参考网址
www.jianshu.com/p/f3b92124c…
0/前言
tf-idf指标,是用来衡量一个词word对一篇文档的重要程度,可以用来提取文档的关键词。
该指标,兼顾词频和普遍度
本文主要来介绍4种计算tf-idf指标的方法
分别是:
gensim库计算
sklearn库计算
jieba库计算
自行编写python代码计算
1/gensim库计算tf-idf
import gensim
# Toy corpus: four short English sentences used throughout the examples.
corpus = [
    'this is the first document',
    'this is the second second document',
    'and the third one',
    'is this the first document'
]
# Tokenize each document on single spaces into a list of word lists.
# (The original for-loop had lost its indentation; a comprehension is
# both runnable and idiomatic here.)
word_list = [doc.split(' ') for doc in corpus]
print(word_list)
[['this', 'is', 'the', 'first', 'document'],
['this', 'is', 'the', 'second', 'second', 'document'],
['and', 'the', 'third', 'one'],
['is', 'this', 'the', 'first', 'document']
]
# Build a gensim dictionary that maps every distinct token to an integer id.
dictionary = gensim.corpora.Dictionary(word_list)
# token2id shows the assigned vocabulary mapping.
print(dictionary.token2id)
{'document': 0,
'first': 1,
'is': 2,
'the': 3,
'this': 4,
'second': 5,
'and': 6,
'one': 7,
'third': 8}
# Convert each tokenized document into bag-of-words (token_id, count) pairs.
new_corpus = [dictionary.doc2bow(tokens) for tokens in word_list]
print(new_corpus)
[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)],
[(0, 1), (2, 1), (3, 1), (4, 1), (5, 2)],
[(3, 1), (6, 1), (7, 1), (8, 1)],
[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)]]
# Train a tf-idf model from the bag-of-words corpus.
tfidf = gensim.models.TfidfModel(new_corpus)
# Save the model to disk, then reload it to demonstrate persistence.
tfidf.save("my_model.tfidf")
tfidf_model = gensim.models.TfidfModel.load("my_model.tfidf")
# Score each original document with the reloaded model; the result is a
# list of (token_id, weight) vectors, one per document.
tfidf_vec = [
    tfidf_model[dictionary.doc2bow(doc.lower().split())]
    for doc in corpus
]
print(tfidf_vec)
[
[(0, 0.33699829595119235),
(1, 0.8119707171924228),
(2, 0.33699829595119235),
(4, 0.33699829595119235)],
[(0, 0.10212329019650272),
(2, 0.10212329019650272),
(4, 0.10212329019650272),
(5, 0.9842319344536239)],
[(6, 0.5773502691896258),
(7, 0.5773502691896258),
(8, 0.5773502691896258)],
[(0, 0.33699829595119235),
(1, 0.8119707171924228),
(2, 0.33699829595119235),
(4, 0.33699829595119235)]
]
# Score a sentence that was not in the training corpus. Per the shown
# output, only 'first' and 'second' receive weights: 'i' and 'name' are
# not in the dictionary, and 'the' occurs in every document so its
# tf-idf weight is 0 and it is dropped.
string = 'the i first second name'
string_bow = dictionary.doc2bow(string.lower().split())
# Use the reloaded tfidf_model for consistency with the per-document
# scoring above (the original mixed tfidf and tfidf_model; both hold
# the same trained weights).
string_tfidf = tfidf_model[string_bow]
print(string_tfidf)
[ (1, 0.4472135954999579),
(5, 0.8944271909999159)
]
> gensim训练出来的tf-idf值左边是词的id,右边是词的tfidf值(重要程度)
> gensim有自动去除停用词的功能,比如the
> gensim会自动去除单个字母,比如i
> gensim会去除没有被训练到的词,比如name
> 所以通过gensim并不能计算每个单词的tfidf值
gensim的弊端:在语料库中没有的word,gensim不会给出tf-idf值
2/sklearn库计算tf-idf
from sklearn.feature_extraction.text import TfidfVectorizer
# The same four-sentence toy corpus, re-declared for the sklearn example.
corpus = [
    'this is the first document',
    'this is the second second document',
    'and the third one',
    'is this the first document',
]
# Fit a tf-idf vectorizer on the raw sentences; fit_transform returns a
# sparse (n_docs, n_features) matrix.
tfidf_vec = TfidfVectorizer()
tfidf_matrix = tfidf_vec.fit_transform(corpus)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in
# 1.2; prefer get_feature_names_out(), falling back for old versions.
try:
    print(tfidf_vec.get_feature_names_out().tolist())
except AttributeError:
    print(tfidf_vec.get_feature_names())
['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']
# Mapping of token -> column index in the tf-idf matrix.
print(tfidf_vec.vocabulary_)
{'this': 8, 'is': 3, 'the': 6, 'first': 2, 'document': 1, 'second': 5, 'and': 0, 'third': 7, 'one': 4}
# Dense (n_docs, n_features) tf-idf matrix; per the output below each
# row has unit L2 norm (TfidfVectorizer's default normalization).
print(tfidf_matrix.toarray())
[
[0. 0.43877674 0.54197657 0.43877674 0. 0.
0.35872874 0. 0.43877674]
[0. 0.27230147 0. 0.27230147 0. 0.85322574
0.22262429 0. 0.27230147]
[0.55280532 0. 0. 0. 0.55280532 0.
0.28847675 0.55280532 0. ]
[0. 0.43877674 0.54197657 0.43877674 0. 0.
0.35872874 0. 0.43877674]
]
3/jieba库计算tf-idf(jieba内置了基于tf-idf的关键词提取接口:jieba.analyse.extract_tags(text, topK=20, withWeight=True) 可直接返回带权重的关键词,此处不展开)
4/python计算tf-idf
# Re-declare the toy corpus for the pure-Python tf-idf implementation.
corpus = [
    'this is the first document',
    'this is the second second document',
    'and the third one',
    'is this the first document'
]
# Tokenize each document on single spaces. (The original for-loop had
# lost its indentation; a comprehension is runnable and idiomatic.)
word_list = [doc.split(' ') for doc in corpus]
print(word_list)
[
['this', 'is', 'the', 'first', 'document'],
['this', 'is', 'the', 'second', 'second', 'document'],
['and', 'the', 'third', 'one'],
['is', 'this', 'the', 'first', 'document']
]
from collections import Counter  # Counter was used but never imported anywhere in the file

# One term-frequency table (Counter) per document.
countlist = [Counter(words) for words in word_list]
# The original bare `countlist` expression only echoes in a REPL and is
# a no-op in a script; print() produces the output shown below.
print(countlist)
[Counter({'document': 1, 'first': 1, 'is': 1, 'the': 1, 'this': 1}),
Counter({'document': 1, 'is': 1, 'second': 2, 'the': 1, 'this': 1}),
Counter({'and': 1, 'one': 1, 'the': 1, 'third': 1}),
Counter({'document': 1, 'first': 1, 'is': 1, 'the': 1, 'this': 1})]
def tf(word, count):
    """Term frequency: the share of *word* among all tokens in *count*.

    *count* is a Counter (or dict) of token -> occurrences for one
    document. Returns 0.0 for an empty document instead of raising
    ZeroDivisionError.
    """
    total = sum(count.values())
    return count[word] / total if total else 0.0

def n_containing(word, count_list):
    """Number of documents in *count_list* that contain *word*."""
    return sum(1 for count in count_list if word in count)

def idf(word, count_list):
    """Smoothed inverse document frequency: log(N / (1 + df)).

    The +1 avoids division by zero for unseen words; as the outputs
    below show, the result is negative for a word present in every
    document (e.g. 'the').
    """
    return math.log(len(count_list) / (1 + n_containing(word, count_list)))

def tfidf(word, count, count_list):
    """tf-idf score of *word* in document *count* w.r.t. *count_list*."""
    return tf(word, count) * idf(word, count_list)
import math
# Rank every word of each document by tf-idf score, highest first.
for doc_index, doc_counts in enumerate(countlist):
    print("Top words in document {}".format(doc_index + 1))
    ranked = sorted(
        ((word, tfidf(word, doc_counts, countlist)) for word in doc_counts),
        key=lambda pair: pair[1],
        reverse=True,
    )
    for word, score in ranked:
        print("\tWord: {}, TF-IDF: {}".format(word, round(score, 5)))
Top words in document 1
Word: first, TF-IDF: 0.05754
Word: this, TF-IDF: 0.0
Word: is, TF-IDF: 0.0
Word: document, TF-IDF: 0.0
Word: the, TF-IDF: -0.04463
Top words in document 2
Word: second, TF-IDF: 0.23105
Word: this, TF-IDF: 0.0
Word: is, TF-IDF: 0.0
Word: document, TF-IDF: 0.0
Word: the, TF-IDF: -0.03719
Top words in document 3
Word: and, TF-IDF: 0.17329
Word: third, TF-IDF: 0.17329
Word: one, TF-IDF: 0.17329
Word: the, TF-IDF: -0.05579
Top words in document 4
Word: first, TF-IDF: 0.05754
Word: is, TF-IDF: 0.0
Word: this, TF-IDF: 0.0
Word: document, TF-IDF: 0.0
Word: the, TF-IDF: -0.04463