如题所示,两个docx文本,想检测两个的相似性,分词用的是jieba,结果一直是0,求教
import docx
import jieba
import gensim
#检验是否含有中文字符
def is_contains_chinese(strs):
    """Return True if *strs* contains at least one CJK character (U+4E00..U+9FA5)."""
    return any('\u4e00' <= ch <= '\u9fa5' for ch in strs)
#检验是否全是中文字符
def is_all_chinese(strs):
    """Return True if every character of *strs* is a CJK character (U+4E00..U+9FA5).

    Note: an empty string vacuously returns True, matching the original loop.
    """
    return all('\u4e00' <= ch <= '\u9fa5' for ch in strs)
# Load both .docx documents and concatenate the text of all paragraphs,
# in document order, into one string per document.
file = docx.Document(r"C:\Work\Python\应用\jieba分词统计\分词测试.docx")
file_test = docx.Document(r"C:\Work\Python\应用\jieba分词统计\分词测试2.docx")
text = "".join(para.text for para in file.paragraphs)
text_test = "".join(para.text for para in file_test.paragraphs)
# Tokenize both documents with jieba (precise mode).
raw_words = jieba.cut(text, cut_all=False)
raw_words_test = jieba.cut(text_test, cut_all=False)
# Load the stop-word list, one word per line.
stopwords = set()
with open(r"C:\Work\Python\应用\jieba分词统计\stop_word.txt", 'r', encoding='utf-8') as f:
    for line in f:
        stopwords.add(line.strip())
# Stop-word filtering: keep only tokens that contain at least one Chinese
# character and are not stop words.  Keep the results as token LISTS.
# BUGFIX: the original glued the filtered tokens back into a space-joined
# string and ran jieba over it a second time — that pass is redundant and
# yields whitespace tokens that pollute the word-frequency counts below.
words = [w for w in raw_words if w not in stopwords and is_contains_chinese(w)]
words_test = [w for w in raw_words_test if w not in stopwords and is_contains_chinese(w)]
# Build a term-frequency table for each document.
word_freq = {}
for token in words:
    word_freq[token] = word_freq.get(token, 0) + 1
word_freq_test = {}
for token in words_test:
    word_freq_test[token] = word_freq_test.get(token, 0) + 1
# (word, frequency) pairs ordered by descending frequency.
freq_word = sorted(word_freq.items(), key=lambda pair: pair[1], reverse=True)
freq_word_test = sorted(word_freq_test.items(), key=lambda pair: pair[1], reverse=True)
# Build ONE dictionary over BOTH documents so the two bag-of-words vectors
# share the same token-id space.  (The original built a separate dictionary
# per document, so the ids of identical words did not line up.)
dictionary = gensim.corpora.Dictionary([list(word_freq.keys()), list(word_freq_test.keys())])
# Bag-of-words vectors carrying the real term frequencies.
# (The original called doc2bow on .keys(), which gives every term a count of 1.)
doc_vec = [(dictionary.token2id[w], f) for w, f in word_freq.items()]
doc_test_vec = [(dictionary.token2id[w], f) for w, f in word_freq_test.items()]
# WHY sim WAS ALWAYS 0: TfidfModel was trained on a corpus containing a
# single document, so every term had document-frequency == corpus size and
# idf == 0 — both TF-IDF vectors collapsed to all-zeros.  Even with both
# documents in the training corpus, every SHARED term would still get
# idf == 0 (df == N == 2), which again forces the dot product to 0.
# With only two documents, TF-IDF weighting is meaningless; compare the raw
# bag-of-words vectors by cosine similarity instead.
index = gensim.similarities.SparseMatrixSimilarity([doc_vec], num_features=len(dictionary))
sim = index[doc_test_vec]
print(sim)