Corpus
corpus = [ '我在北京天安门', '选择AI,就是选择未来', '要么996要么icu', '我爱加班,加班使我快乐' ]
Word segmentation
import jieba
corpus = [
    '我在北京天安门',
    '选择AI,就是选择未来',
    '要么996要么icu',
    '我爱加班,加班使我快乐'
]
# Read the stop words into the list `stopwords`
# (assumes stopwords.txt is a plain UTF-8 file with one stop word per line)
filepath = r'stopwords.txt'
with open(filepath, 'r', encoding='utf-8') as f:
    stopwords = [line.strip() for line in f]
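For a long stopword list, membership tests against a list are linear-time; converting it to a set is a cheap, optional optimization:

stopwords = set(stopwords)  # O(1) lookups instead of O(n)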
word_list = []
for doc in corpus:
    seg_list = jieba.cut(doc)
    # drop stop words and stray whitespace tokens
    seg_list = [w for w in seg_list if w not in stopwords and w != ' ']
    word_list.append(seg_list)
print(word_list)
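The result is a list of token lists, one per sentence. The exact tokens depend on what stopwords.txt contains, so the output below is only illustrative (assuming a typical Chinese stopword list that removes words like 我/在/就是/要么/使 and punctuation):

# [['北京', '天安门'], ['选择', 'AI', '选择', '未来'], ['996', 'icu'], ['爱', '加班', '加班', '快乐']]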
Bag-of-words model
from gensim import corpora
# build a token -> integer id mapping from the segmented documents
dictionary = corpora.Dictionary(word_list)
print(dictionary.token2id)
# convert each document into a sparse list of (token_id, count) pairs
new_corpus = [dictionary.doc2bow(doc) for doc in word_list]
print(new_corpus)
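To sanity-check the bag-of-words vectors, the integer ids can be mapped back to tokens; this sketch just reuses the dictionary built above:

# turn each (token_id, count) pair into a readable (token, count) pair
readable = [[(dictionary[idx], cnt) for idx, cnt in doc] for doc in new_corpus]
print(readable)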
TF-IDF
from gensim import models
# train a TF-IDF model on the bag-of-words corpus
tfidf = models.TfidfModel(new_corpus)
# the trained model can be persisted and reloaded later
tfidf.save("my_model.tfidf")
tfidf = models.TfidfModel.load("my_model.tfidf")
# apply the model: each document becomes a list of (token_id, weight) pairs
tfidf_vec = []
for bow in new_corpus:
    doc_tfidf = tfidf[bow]
    tfidf_vec.append(doc_tfidf)
print(tfidf_vec)
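A common next step is to use the TF-IDF vectors for similarity search. Below is a minimal sketch using gensim's SparseMatrixSimilarity; the query sentence is made up for illustration and is not part of the original corpus:

from gensim import similarities

# index the TF-IDF vectors for cosine-similarity queries
index = similarities.SparseMatrixSimilarity(tfidf[new_corpus], num_features=len(dictionary))

# segment and vectorize a new query sentence, then rank corpus documents by similarity
query = '我要选择加班'  # hypothetical query sentence
query_bow = dictionary.doc2bow([w for w in jieba.cut(query) if w not in stopwords and w != ' '])
print(index[tfidf[query_bow]])  # one cosine score per corpus document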