IV. NLP Fundamentals: Word Segmentation

The vocabulary files and corpora used by the code in this chapter have to be obtained on your own; they are not provided here.

1. Chinese Word Segmentation - Forward Maximum Matching

Explanation: Forward maximum matching is the most elementary algorithm in traditional Chinese word segmentation. It relies on a vocabulary (dictionary): using the length of the longest word in the vocabulary as the window size, it slides a window over the sentence to be segmented. If the text in the window matches a vocabulary word, that word is recorded and the window start moves to the character right after the match. If there is no match, the window is shortened by one character and matching is tried again; this repeats until either a match is found (the next window again starts at the maximum length) or the window shrinks to a single character, in which case that character is recorded as a single-character word and the window moves on. The process continues until the whole sentence has been consumed. This scheme contains many repeated operations and is slow. To speed it up, a prefix dictionary can be used: every prefix of every vocabulary word is added to the dictionary. While scanning the sentence, as long as the current substring can still be found in the prefix dictionary, we keep extending it; once it cannot be found, we cut at the previous position and continue scanning from there. In this way the sentence only needs to be traversed once.
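As a toy illustration of the prefix dictionary described above (the four-word vocabulary is made up for this example), full words are marked with 1 and strings that are only prefixes with 0:

# Toy prefix dictionary (hypothetical vocabulary)
vocab = ["北京", "北京大学", "大学", "学生"]
prefix_dict = {}
for word in vocab:
    for i in range(1, len(word)):
        prefix_dict.setdefault(word[:i], 0)   # prefix only
for word in vocab:
    prefix_dict[word] = 1                     # complete word
print(prefix_dict)
# {'北': 0, '北京': 1, '北京大': 0, '大': 0, '学': 0, '北京大学': 1, '大学': 1, '学生': 1}

The two full implementations follow.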

import time
"""
功能描述:实现中文分词-正向最大匹配方法一(优化)

参数:
    vocab_path: 词表路径
    word_path: 待分词路径

返回值:
"""
def word_segmentation_month1(vocab_path, word_path):
    result = []
    vocab = {}
    windows_length = 0
    # Load the vocabulary
    with open(vocab_path, 'r', encoding='utf8') as f:
        for line in f:
            word = line.strip().split()[0]
            vocab[word] = 0
            # the window length is the length of the longest word (not of the whole line)
            windows_length = max(windows_length, len(word))
    # Load and segment the sentences
    with open(word_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()   # drop the trailing newline so it is not treated as a word
            result_word = []
            while line != '':
                lens = min(len(line), windows_length)
                temp_word = line[:lens]
                while temp_word not in vocab:
                    if len(temp_word) == 1:
                        break
                    temp_word = temp_word[:len(temp_word)-1]
                result_word.append(temp_word)
                line = line[len(temp_word):]
            result.append('/'.join(result_word).strip())
    return result

if __name__ == '__main__':
    vocab_path = './vocab/dict.txt'
    word_path ='./corpus/corpus.txt'
    start = time.time()
    result = word_segmentation_month1(vocab_path, word_path)
    end =time.time()
    print(f'Elapsed time: {end-start} s')
import time
"""
功能描述:中文分词-正向最大匹配方法二通过前缀字典,实现对字符串的一次遍历分词

参数:
    vocab_path:词表路径
    word_path:待分词语句文件路径

返回值:
"""
def word_segmentation_month2(vocab_path, word_path):
    # Build the prefix dictionary: full words are marked 1, pure prefixes 0
    prefix_vocab = {}
    with open(vocab_path, 'r', encoding='utf8') as f:
        for line in f:
            word = line.strip().split()[0]
            # always mark full words, even if the same string was added earlier as a prefix
            prefix_vocab[word] = 1
            while len(word) > 1:
                word = word[:len(word) - 1]
                if word not in prefix_vocab:
                    prefix_vocab[word] = 0
    # Segment the sentences
    result = []
    with open(word_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            temp_str = ''
            index = 1
            result_word = []
            while line != '':
                temp_str = line[:index]
                if temp_str in prefix_vocab and temp_str != line:
                    index += 1
                else:
                    if index == 1 or temp_str == line:
                        result_word.append(temp_str)
                        line = line[index:]
                    else:
                        result_word.append(temp_str[:len(temp_str) - 1])
                        line = line[index-1:]
                    index = 1
            result.append(result_word)
    return result

if __name__ == '__main__':
    vocab_path = './vocab/dict_temp.txt'
    word_path = './corpus/corpus_temp.txt'
    start = time.time()
    result = word_segmentation_month2(vocab_path, word_path)
    end = time.time()
    print(f'Elapsed time: {end-start} s')

2. Chinese Word Segmentation - Backward Maximum Matching

Explanation: Backward (reverse) maximum matching works essentially the same way as forward maximum matching: a window is slid over the sentence, but from the end of the sentence towards the beginning. It likewise comes in two variants: the first uses the length of the longest vocabulary word as the window size, and the second uses a prefix-dictionary-style lookup (note: because matching runs backwards, the dictionary must store the reversed prefixes, i.e. the suffixes, of every vocabulary word). The two directions can give different results: for example, with a dictionary containing 和尚, 尚未 and 结婚, forward matching segments “结婚的和尚未结婚的” as 结婚/的/和尚/未/结婚/的, while backward matching yields 结婚/的/和/尚未/结婚/的.
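The same toy illustration as before, adapted to the reverse direction (again a made-up vocabulary): because matching runs from right to left, the dictionary stores suffixes instead of prefixes.

# Toy suffix dictionary for backward matching (hypothetical vocabulary)
vocab = ["北京", "北京大学", "大学", "学生"]
suffix_dict = {}
for word in vocab:
    for i in range(1, len(word)):
        suffix_dict.setdefault(word[-i:], 0)  # suffix only
for word in vocab:
    suffix_dict[word] = 1                     # complete word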

import time
"""
功能描述:中文分词-逆向最大匹配(方案一)

参数:
    vocab_path: 词表文件路径
    word_path: 待分词文件路径

返回值:
"""
def word_segmentation_month1(vocab_path, word_path):
    vocab = {}
    windows_length = 0
    # Load the vocabulary
    with open(vocab_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip().split()[0]
            windows_length = max(windows_length, len(line))
            vocab[line] = 0
    # Segment the sentences
    result = []
    with open(word_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            result_word = []
            while line != '':
                lens = min(len(line), windows_length)
                temp_str = line[len(line)-lens:]
                while temp_str not in vocab:
                    if len(temp_str) == 1:
                        break
                    temp_str = temp_str[1:]
                result_word.append(temp_str)
                line = line[:len(line)-len(temp_str)]
            result_word.reverse()
            result.append(result_word)
    return result


if __name__ == '__main__':
    vocab_path = './vocab/dict.txt'
    corpus_path = './corpus/corpus.txt'
    start = time.time()
    print(word_segmentation_month1(vocab_path, corpus_path))
    end = time.time()
    print(f'Elapsed time: {end-start} s')
import time

"""
功能描述:中文分词-逆向最大匹配(使用前缀字典)

参数:
    vocab_path: 词表文件路径
    word_path: 待分词文件路径

返回值:
"""
def word_segmentation_month2(vocab_path, word_path):
    # Build the suffix dictionary: full words are marked 1, pure suffixes 0
    prefix_dict = {}
    with open(vocab_path, 'r', encoding='utf-8') as f:
        for line in f:
            word = line.strip().split()[0]
            # always mark full words, even if the same string was added earlier as a suffix
            prefix_dict[word] = 1
            for i in range(1, len(word)):
                suffix = word[-i:]
                if suffix not in prefix_dict:
                    prefix_dict[suffix] = 0
    # Segment the sentences
    result = []
    with open(word_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            index = 1
            result_word = []
            while line != '':
                temp_str = line[len(line) - index:]
                if temp_str in prefix_dict and len(temp_str) != len(line):
                    index += 1
                else:
                    if len(temp_str) != 1 and len(temp_str) != len(line):
                        result_word.append(temp_str[1:])
                        line = line[:len(line)-len(temp_str[1:])]
                        index = 1
                    else:
                        result_word.append(temp_str)
                        line = line[:len(line)-len(temp_str)]
                        index = 1
            result_word.reverse()
            result.append(result_word)

    return result


if __name__ == '__main__':
    vocab_path = './vocab/dict.txt'
    word_path = './corpus/corpus.txt'
    start = time.time()
    print(word_segmentation_month2(vocab_path, word_path))
    end = time.time()
    print(f'Elapsed time: {end-start} s')

3. Chinese Word Segmentation - Bidirectional Maximum Matching

Explanation: As noted above, forward and backward maximum matching can produce different results on the same sentence, so how can the strengths of both be combined? Bidirectional maximum matching does exactly that: the sentence is segmented once with each method, and the better result is chosen according to the criteria below (a sketch that applies all three criteria follows the example code). Segmentation criteria:

  1. The number of single-character words: the fewer, the better
  2. The number of out-of-vocabulary words: the fewer, the better
  3. The total number of words: the fewer, the better
from reverse_word_segmentation_month2 import word_segmentation_month2 as reverse_months
from straight_word_segmentation_month2 import word_segmentation_month2 as straight_months


if __name__ == '__main__':
    vocab_path = './vocab/dict.txt'
    word_path = './corpus/corpus.txt'
    reverse_result = reverse_months(vocab_path, word_path)
    straight_result = straight_months(vocab_path, word_path)
    result = []
    # Compare the number of single-character words (criterion 1 only)
    for straight_data, reverse_data in zip(straight_result, reverse_result):
        length1 = len([s for s in straight_data if len(s) == 1])
        length2 = len([s for s in reverse_data if len(s) == 1])
        if length1 <= length2:
            result.append(straight_data)
        else:
            result.append(reverse_data)
    print(result)
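The example above applies only criterion 1 (the number of single-character words). Below is a minimal sketch of a chooser that applies all three criteria in priority order; the vocab argument (the loaded word dictionary) is my own addition and is not part of the original code.

# Sketch: pick a segmentation by comparing (single-char words, OOV words, total words)
def choose_segmentation(straight_data, reverse_data, vocab):
    def stats(words):
        single_chars = sum(1 for w in words if len(w) == 1)
        oov = sum(1 for w in words if w not in vocab)
        return (single_chars, oov, len(words))
    # tuples compare element by element, so the criteria are applied in priority order
    return straight_data if stats(straight_data) <= stats(reverse_data) else reverse_data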

4. Chinese Word Segmentation - jieba

Explanation: jieba is the most commonly used Chinese segmenter today. Its segmentation logic is based on full segmentation: it enumerates all the ways a sentence can be cut according to its dictionary, and then uses the word frequencies in the dictionary to decide which of those cuts is the best one. The simple example below implements the full-segmentation step, mainly using dynamic programming (recursion with memoization).

The code below simulates this part of jieba's behaviour:

# Dictionary; the number after each word is its frequency. The frequencies are only illustrative, are not used below, and can be changed freely
Dict = {"经常":0.1,
        "经":0.05,
        "有":0.1,
        "常":0.001,
        "有意见":0.1,
        "歧":0.001,
        "意见":0.2,
        "分歧":0.2,
        "见":0.05,
        "意":0.05,
        "见分歧":0.05,
        "分":0.1}
# Text to be segmented
sentence = "经常有意见分歧"

# Full segmentation: return every way the sentence can be cut according to the dictionary
def all_cut(sentence, Dict):
    memo = {}   # memoization table shared by all recursive calls

    def recursion_cut(text):
        # dynamic programming: recursion with memoization
        if text in memo:
            return memo[text]
        if not text:
            return [[]]

        result = []     # all segmentations of the remaining text
        for i in range(1, len(text)+1):
            word = text[:i]
            if word in Dict:
                for res in recursion_cut(text[i:]):
                    result.append([word] + res)
        memo[text] = result
        return result

    return recursion_cut(sentence)

result_list = all_cut(sentence, Dict)
for res in result_list:
    print(res)

# Expected output; the order does not matter
target = [
    ['经常', '有意见', '分歧'],
    ['经常', '有意见', '分', '歧'],
    ['经常', '有', '意见', '分歧'],
    ['经常', '有', '意见', '分', '歧'],
    ['经常', '有', '意', '见分歧'],
    ['经常', '有', '意', '见', '分歧'],
    ['经常', '有', '意', '见', '分', '歧'],
    ['经', '常', '有意见', '分歧'],
    ['经', '常', '有意见', '分', '歧'],
    ['经', '常', '有', '意见', '分歧'],
    ['经', '常', '有', '意见', '分', '歧'],
    ['经', '常', '有', '意', '见分歧'],
    ['经', '常', '有', '意', '见', '分歧'],
    ['经', '常', '有', '意', '见', '分', '歧']
]
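The prose above mentions that jieba then picks the best segmentation using word frequencies. Below is a minimal sketch of that selection step (not jieba's actual algorithm, just a frequency-based scorer over the full cut produced above):

import math

# Score each candidate cut by the sum of log word frequencies and keep the best one
def best_cut(cuts, Dict):
    return max(cuts, key=lambda cut: sum(math.log(Dict[w]) for w in cut))

print(best_cut(result_list, Dict))   # with the frequencies above this picks ['经常', '有意见', '分歧']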

5. Chinese Word Segmentation with Machine Learning

Explanation: the traditional segmentation approaches above have the following problems:

  1. They depend heavily on a vocabulary
  2. They ignore semantics, i.e. the surrounding context
  3. A typo in the sentence can trigger a whole chain of segmentation errors
  4. Open classes such as person names cannot be enumerated and therefore cannot be handled effectively

To address these shortcomings, segmentation can be implemented with the machine-learning approach that is mainstream today, which avoids the problems listed above (its main drawback is that labelled training data is required, i.e. it is supervised learning). The code below implements such a segmenter; its main parts are:

  1. The training data is produced with jieba (this is only an experiment, so no manually annotated data is needed).
  2. The core idea is to reframe segmentation as deciding, for every character, whether it is the last character of a word; this turns segmentation into a binary classification task that machine learning can handle.
  3. The network consists of an embedding layer that maps characters to vectors, an RNN as the main body, and a linear layer that projects the hidden states to a last dimension of 2 (for the binary classification). The loss function is cross entropy and the optimizer is Adam.
  4. During preprocessing, PyTorch's DataLoader can be used to group the training data into batches and to reshuffle it every epoch with shuffle=True. (Note: setting num_workers=n spawns multiple worker processes, but depending on the machine's hardware this can actually be very slow. It also depends on the operating system: Windows creates processes with spawn (expensive) while Linux uses fork (cheap), so on Windows num_workers is usually left at 0, or persistent_workers=True is set so that the workers are kept alive after each epoch instead of being destroyed and recreated.)
  5. In the RNN, batch_first=True must be set so that the input tensor is laid out as (batch_size, seq_len, features), i.e. the training data is arranged by batch. Batched input lets the GPU exploit parallel computation, so gradients are computed and weights updated faster and the loss drops more quickly. (batch_first defaults to False for historical reasons; a short shape demo follows this list.)
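Below is a minimal, standalone shape demo for the batch_first point above (independent of the training code that follows):

import torch
import torch.nn as nn

# With batch_first=True the RNN expects input shaped (batch_size, seq_len, input_size)
rnn = nn.RNN(input_size=8, hidden_size=16, batch_first=True)
x = torch.randn(4, 20, 8)     # 4 sentences, 20 characters each, 8-dim embeddings
out, h = rnn(x)
print(out.shape)              # torch.Size([4, 20, 16])  -> one hidden state per character
print(h.shape)                # torch.Size([1, 4, 16])   -> final hidden state per sentence

The full implementation follows.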
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import jieba
import numpy as np
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt
import time

"""
    通过机器学习的方式实现分词,该方式的优势就是分词过程中不需要依赖词表、分词过程中会注意到语义、具有很高的容错性(错别字)、
    不会出现对枚举值不能穷尽的情况
    实现方式:
        1、通过jieba构建训练数据
        2、构建神经网络,实际上就是够一个二分类任务判断语句中每个字是不是词的边界
"""


# Build training data - build the character vocabulary
def construct_vocab(vocab_path):
    vocab = {}
    with open(vocab_path, 'r', encoding='utf-8') as f:
        for index, line in enumerate(f):
            char = line.strip()
            vocab[char] = index+1
    vocab['unk'] = len(vocab) + 1
    return vocab


# Build training data - label a sentence (1 marks the last character of each word)
def construct_label(sequences):
    label = [0]*len(sequences)
    word = jieba.lcut(sequences)
    count = 0
    for i in word:
        count = count + len(i)
        label[count-1] = 1
    return label
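# Example (the exact jieba output is not guaranteed): if jieba.lcut("经常有意见分歧") returned
# ['经常', '有意见', '分歧'], construct_label would produce [0, 1, 0, 0, 1, 0, 1],
# i.e. the last character of every word is marked with 1.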


# Build training data - convert characters to index sequences
def word_to_sequence(sentence, vocab):
    sequence = [vocab.get(char, vocab['unk']) for char in sentence]
    return sequence


# Build training data - wrap the data in a dataset with fixed-length samples
class Dataset:
    def __init__(self, vocab, max_length, corpus_path):
        self.vocab = vocab
        self.max_length = max_length
        self.corpus_path = corpus_path
        self.load()

    # Truncate / pad the sequence and its labels to max_length
    def completion_sentence(self, sentence, label):
        sentence = sentence[:self.max_length]
        sentence += [0]*(self.max_length-len(sentence))
        label = label[:self.max_length]
        label += [-100]*(self.max_length-len(label))
        return sentence, label

    # Load the data
    def load(self):
        self.data = []
        with open(self.corpus_path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                label = construct_label(line)
                sentence = word_to_sequence(line, self.vocab)
                sentence, label = self.completion_sentence(sentence, label)
                sentence = torch.LongTensor(sentence)
                label = torch.LongTensor(label)
                self.data.append([sentence, label])
                if len(self.data) > 10000:
                    break

    def __len__(self):
        return len(self.data)

    def __getitem__(self, item):
        return self.data[item]


# Group the training data into batches
def data_loader(max_length, corpus_path, batch_size, vocab):
    dateset = Dataset(vocab, max_length, corpus_path)
    train_loader = DataLoader(dateset, batch_size=batch_size, shuffle=True)
    return train_loader


# Network architecture: embedding -> RNN -> linear (2-way classification)
class RNN_word_segmentation(nn.Module):
    def __init__(self, vocab, embedding_dim, hidden_size, num_layers):
        super(RNN_word_segmentation, self).__init__()
        self.embedding = nn.Embedding(len(vocab)+1, embedding_dim, padding_idx=0)
        self.rnn = nn.RNN(embedding_dim, hidden_size,batch_first=True, num_layers=num_layers)
        self.linear = nn.Linear(hidden_size, 2)
        self.loss = nn.CrossEntropyLoss(ignore_index=-100)

    def forward(self, sentence, label=None):
        sentence = self.embedding(sentence)
        sentence, _ = self.rnn(sentence)
        sentence = self.linear(sentence)
        if label is not None:
            return self.loss(sentence.reshape(-1, 2), label.view(-1))
        else:
            return sentence

# Train the model
def train_main(model, data_loader):
    epochs = 1000           # number of training epochs
    learning_rate = 1e-3    # learning rate
    # define the optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    # record the average loss of each epoch
    log = []
    # Training loop
    for epoch in range(epochs):
        model.train()
        loss_value = []
        start = time.time()
        for sentence, label in data_loader:
            optimizer.zero_grad()       # reset gradients
            loss = model(sentence, label)
            loss.backward()             # compute gradients
            optimizer.step()            # update weights
            loss_value.append(loss.item())
        end = time.time()
        loss_avg = np.mean(loss_value)
        log.append(loss_avg)
        print(f'Epoch {epoch} finished, average loss {loss_avg}, elapsed time {end-start} s')
        if loss_avg <= 1e-1:
            print('Target loss reached, training finished!')
            break
    # Save the model
    torch.save(model.state_dict(), './model/word_segmentation_model.bin')
    # Plot how the loss changes over epochs
    fig, ax = plt.subplots()
    ax.plot(range(len(log)), log, label='loss', color='red')
    plt.legend()
    plt.title('Change in loss')
    plt.show()


# Segment new sentences with the trained model
def prediction(strings):
    vocab_path = './vocab/chars.txt'
    vocab = construct_vocab(vocab_path)
    model = RNN_word_segmentation(vocab, 50, 100, 1)
    model.load_state_dict(torch.load('./model/word_segmentation_model.bin',weights_only=True))
    model.eval()
    for string in strings:
        with torch.no_grad():
            sentence = word_to_sequence(string, vocab)
            sentence = torch.LongTensor(sentence)
            y_pred = model(sentence)
            y_pred = torch.argmax(y_pred, dim=-1)
            for index, p in enumerate(y_pred):
                if p == 1:
                    print(string[index], end='/')
                else:
                    print(string[index], end='')
            print()

if __name__ == '__main__':
    # word_path = './corpus/corpus.txt'
    # vocab_path = './vocab/chars.txt'
    # vocab = construct_vocab(vocab_path)
    # dataloaders = data_loader(20, word_path, 20, vocab)
    # model = RNN_word_segmentation(vocab, 50, 100, 1)
    # train_main(model, dataloaders)
    input_strings = ["同时国内有望出台新汽车刺激方案",
                     "沪胶后市有望延续强势",
                     "经过两个交易日的强势调整后",
                     "昨日上海天然橡胶期货价格再度大幅上扬",
                     "要弘扬好中华民族传统美德"]
    prediction(input_strings)
    print("================jieba分词效果================")
    for sentence in input_strings:
        print(jieba.lcut(sentence))

Model training results: (figure omitted)
Comparison of the model's predictions with jieba: (figure omitted)

6. Chinese Word Segmentation - New Word Discovery

6.1 Collocations: internal cohesion + left/right entropy

Explanation: traditional segmentation of a document relies on a vocabulary, but the vocabulary cannot stay unchanged forever; new words have to be added to it regularly, which leads to the problem of new word discovery. Whether a string is a (new) word is judged mainly from two aspects: its internal cohesion and the entropy of its left and right neighbours. Common formulations of both measures are sketched below, followed by a simulated implementation.
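A common formulation of the two measures (my notation; the code below uses base-10 logarithms and divides the cohesion term by the word length):

$$\text{cohesion}(w) = \frac{1}{|w|}\,\log_{10}\frac{P(w)}{\prod_{c \in w} P(c)},\qquad H_{L}(w) = -\sum_{c} P(c \mid w)\,\log_{10} P(c \mid w)$$

where $P(w)$ is the relative frequency of the string $w$, the product runs over its characters, and $H_L$ (and the analogous right entropy $H_R$) is computed over the distribution of characters appearing immediately to the left (right) of $w$. The final score used below is $\text{cohesion}(w)\cdot\min(H_L(w), H_R(w))$.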

'''
    New word discovery based on internal cohesion and left/right neighbour entropy
'''
import math
from collections import defaultdict


class NewWordsFind:
    def __init__(self, max_length, corpus_path):
        self.word_count = defaultdict(int)
        self.left_neighbour = defaultdict(dict)
        self.right_neighbour = defaultdict(dict)
        self.left_neighbour_entropy = defaultdict(int)
        self.right_neighbour_entropy = defaultdict(int)
        self.max_length = max_length
        self.corpus_path = corpus_path
        self.count_text_words(self.corpus_path)
        self.compute_left_right_entropy()
        self.compute_stability()
        self.compute_word_value()

    # For one sentence, count every substring of the given length and its left/right neighbouring characters
    def count_sequence_words(self, sequence, max_length):
        for i in range(len(sequence)-max_length+1):
            word = sequence[i:i+max_length]
            self.word_count[word] += 1
            if i - 1 >= 0:
                char = sequence[i-1]
                self.left_neighbour[word][char] = self.left_neighbour[word].get(char, 0) + 1
            if i+max_length < len(sequence):
                char = sequence[i+max_length]
                self.right_neighbour[word][char] = self.right_neighbour[word].get(char, 0) + 1

    # Count every candidate word and its neighbours over the whole corpus
    def count_text_words(self, corpus_path):
        with open(corpus_path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                for word_length in range(1, self.max_length+1):
                    self.count_sequence_words(line, word_length)

    # Compute left and right neighbour entropy
    def compute_entropy(self, word_dict):
        total = sum(word_dict.values())
        entropy = sum((-num/total)*math.log(num/total, 10) for num in word_dict.values())
        return entropy

    def compute_left_right_entropy(self):
        for word, char_dict in self.left_neighbour.items():
            self.left_neighbour_entropy[word] = self.compute_entropy(char_dict)
        for word, char_dict in self.right_neighbour.items():
            self.right_neighbour_entropy[word] = self.compute_entropy(char_dict)

    # Compute internal cohesion (stability)
    def count_word_by_length(self):
        self.word_count_by_length = defaultdict(int)
        for word, count in self.word_count.items():
            self.word_count_by_length[len(word)] += count

    def compute_stability(self):
        self.count_word_by_length()
        self.pim = {}
        for word, count in self.word_count.items():
            pad = count/self.word_count_by_length[len(word)]
            p_char = 1
            for char in word:
                p_char *= self.word_count[char]/self.word_count_by_length[1]
            self.pim[word] = math.log(pad/p_char, 10)/len(word)

    def compute_word_value(self):
        self.word_value = {}
        for word in self.pim:
            if len(word) < 2 or "," in word or "。" in word:
                continue
            pim = self.pim.get(word, 1e-3)
            le = self.left_neighbour_entropy.get(word, 1e-3)
            re = self.right_neighbour_entropy.get(word, 1e-3)
            self.word_value[word] = pim * min(le,re)

if __name__ == '__main__':
    text_path = './corpus/诡舍.txt'
    new_word = NewWordsFind(8, text_path)
    sorted_new_word = sorted([(word, count)for word, count in new_word.word_value.items()], key=lambda x:x[1], reverse=True)
    print([word for word, count in sorted_new_word if len(word) == 2][:10])
    print([word for word, count in sorted_new_word if len(word) == 3][:10])
    print([word for word, count in sorted_new_word if len(word) == 4][:10])
    print([word for word, count in sorted_new_word if len(word) == 5][:10])
    print([word for word, count in sorted_new_word if len(word) == 6][:10])
    print([word for word, count in sorted_new_word if len(word) == 7][:10])


6.2 TF-IDF

Explanation: TF (term frequency) is the relative frequency of a word within a document; IDF (inverse document frequency) measures how few of the documents the word appears in; TF*IDF therefore measures how important a word is to a particular document and is used to find key words. Common formulas are sketched below, followed by the main applications:
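A common definition (the exact variant used by the author's compute_tf_idf helper is not shown, so treat this as a sketch):

$$\text{TF}(w, d) = \frac{\text{count}(w, d)}{\sum_{w'} \text{count}(w', d)},\qquad \text{IDF}(w) = \log\frac{N}{\text{df}(w)},\qquad \text{TF-IDF}(w, d) = \text{TF}(w, d)\cdot\text{IDF}(w)$$

where $N$ is the number of documents and $\text{df}(w)$ is the number of documents containing $w$ (smoothing such as $\text{df}(w)+1$ is often added to avoid division by zero).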

  1. Information retrieval: the existing documents (web pages) are segmented and the TF-IDF value of every word in each document is computed; a query is then segmented as well, each document is scored by summing the TF-IDF values of the query words it contains, and pages are returned in order of this score.
import json

import jieba

import compute_tf_idf

'''
    A simple search engine based on tf-idf
'''
# Compute the tf-idf value of every word in the corpus
def get_word_tf_idf(news_path):
    corpus = {}
    with open(news_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
        for news in data:
            corpus[news['title']] = [news['content']]
    word_tf_idf = compute_tf_idf.compute_tf_idf(corpus)
    return word_tf_idf, corpus


def search_engine(query, word_tf_idf, corpus, top):
    word_list = jieba.lcut(query)
    res = []
    for title_id, tfidf in word_tf_idf.items():
        score = 0
        for word in word_list:
            score += tfidf.get(word, 0)
        res.append([title_id, score])
    res = sorted(res, key=lambda x: x[1], reverse=True)
    for i in range(top):
        doc_id = res[i][0]
        doc = corpus[doc_id]
        print(doc)
        print("=================")

if __name__ == '__main__':
    news_path = './corpus/news.json'
    word_tf_idf, corpus = get_word_tf_idf(news_path)
    while True:
        query = input('Enter your search query: ')
        search_engine(query, word_tf_idf, corpus, 3)
  2. Text summarization: each sentence of a document is segmented, every sentence is scored with the TF-IDF values of its words, and the top-N sentences of the document are taken as its summary.
import json

import jieba

import compute_tf_idf
import re
'''
    Text summarization based on tf-idf
'''


# Load the corpus and compute its tf-idf values
def load_data(corpus_path):
    corpus = {}
    with open(corpus_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
        for news in data:
            corpus[news["title"]] = [news["content"]]
    word_tf_idf = compute_tf_idf.compute_tf_idf(corpus)
    return corpus, word_tf_idf


# Summarize a single document
def sentence_abstract(document, document_tf_idf, top):
    if document is None:
        return None
    sentences = re.split("[。！？?!]", document)   # split on sentence-ending punctuation
    result = []
    for index, sentence in enumerate(sentences):
        word_list = jieba.lcut(sentence)
        source = 0
        for word in word_list:
            source += document_tf_idf.get(word, 0)
        source /= (len(word_list)+1)
        result.append([source, index])
    result = sorted(result, key=lambda x: x[0], reverse=True)   # sort by score, not by index
    important_index = sorted([x[1] for x in result[:top]])
    return "。".join([sentences[index] for index in important_index])

def create_abstract(corpus, word_tf_idf, top):
    res = []
    for title_id, document_list in corpus.items():
        for document in document_list:
            abstract = sentence_abstract(document, word_tf_idf.get(title_id), top)
            if abstract is None:
                continue
            res.append({'title':title_id, 'content': document, 'abstract': abstract})

    return res


if __name__ == '__main__':
    corpus, word_tf_idf = load_data('./corpus/news.json')
    res = create_abstract(corpus, word_tf_idf, 3)
    for i in res:
        print(i)

    # word = '编者按:今年冬季让叠穿打造你的完美身材,越穿越瘦是可能。哪怕是不了解流行,也要对时尚又显瘦的叠穿造型吸引,现在就开始行动吧!搭配Tips:亮红色的皮外套给人光彩夺目的感觉,内搭短版的黑色T恤,露出带有线条的腹部是关键,展现你健美的身材。 搭配Tips:简单款型的机车装也是百搭的单品,内搭一条长版的连衣裙打造瘦身的中性装扮。软硬结合的mix风同样备受关注。 搭配Tips:贴身的黑色装最能达到瘦身的效果,即时加上白色的长外套也不会发福。长款的靴子同样很好的修饰了你的小腿线条。 搭配Tips:高腰线的抹胸装很有拉长下身比例的效果,A字形的荷叶摆同时也能掩盖腰部的赘肉。外加一件短款的羽绒服,配上贴腿的仔裤,也很修长。'
    # sentences = re.split("?|!|。", word)
    # print(sentences)
  3. Text similarity: take the top-N TF-IDF words of every document to build a vocabulary; represent each document as a vector of that length whose entries are the relative frequencies of the vocabulary words in the document; then compare two documents by the cosine of the angle between their vectors, where a larger cosine means the documents are more similar.
import json
import math
from collections import defaultdict

import jieba

import compute_tf_idf
import numpy as np

'''
    Document similarity comparison based on tf-idf
'''
# Compute the tf-idf values of the corpus
def compute_document_tf_idf(corpus_path):
    corpus = defaultdict(dict)
    with open(corpus_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
        for doc in data:
            corpus[doc['title']] = [doc['content']]
    # Compute tf-idf for every document
    word_tf_idf = compute_tf_idf.compute_tf_idf(corpus)
    # Collect each document's top-5 keywords into a shared vocabulary
    key_word = set()
    for title, content in word_tf_idf.items():
        content_temp = sorted(content.items(), key=lambda x: x[1], reverse=True)
        top_5_word = content_temp[:5]
        for word in top_5_word:
            key_word.add(word[0])
    return word_tf_idf, key_word, corpus


# Convert a document into a vector over the keyword vocabulary
def doc_to_vector(document, vocab):
    vector = np.zeros(len(vocab))
    document_list = jieba.lcut(document)
    for index, word in enumerate(vocab):
        vector[index] = document_list.count(word)/len(document_list)
    return vector

# Compute the cosine similarity of two vectors
def compute_cosine(vector1, vector2):
    product = np.dot(vector1, vector2)
    vector1_mold_height = math.sqrt(sum(x**2 for x in vector1))
    vector2_mold_height = math.sqrt(sum(x**2 for x in vector2))
    return product/(vector1_mold_height*vector2_mold_height)

# Compute the similarity between the input text and every document
def compute_document_similarity(input_string, corpus, vocab):
    similarity_dict = {}
    vector1 = doc_to_vector(input_string, vocab)
    for title, content in corpus.items():
        vector2 = doc_to_vector(content[0], vocab)
        similarity = compute_cosine(vector1, vector2)
        similarity_dict[title] = similarity
    similarity_dict = sorted(similarity_dict.items(), key=lambda x:x[1], reverse=True)
    similarity_top = similarity_dict[:4]
    return similarity_top


if __name__ == '__main__':
    word_tf_idf, vocab, corpus = compute_document_tf_idf('./corpus/news.json')
    string = '魔兽争霸'
    similarity_top = compute_document_similarity(string, corpus, vocab)
    for title, score in similarity_top:
        print(f'Document: {title}, score: {score}')
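All three examples above import the author's compute_tf_idf module, which is not shown. Below is a minimal sketch of what such a helper might look like, assuming it takes a {title: [content, ...]} mapping and returns {title: {word: tf-idf value}} (the interface is inferred from how it is called above); it is not the author's actual implementation.

import math
import jieba
from collections import defaultdict

# Sketch of compute_tf_idf: per-document term frequency times a smoothed inverse document frequency
def compute_tf_idf(corpus):
    tf = {}                                   # per-document term frequencies
    df = defaultdict(int)                     # document frequency of each word
    for title, documents in corpus.items():
        counts = defaultdict(int)
        total = 0
        for document in documents:
            for word in jieba.lcut(document):
                counts[word] += 1
                total += 1
        tf[title] = {w: c / max(total, 1) for w, c in counts.items()}
        for w in counts:
            df[w] += 1
    n_docs = len(corpus)
    return {title: {w: tf_value * math.log(n_docs / (df[w] + 1))
                    for w, tf_value in tf[title].items()}
            for title in corpus}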