jieba+gensim 文本词向量训练word2vec
*安装gensim和jieba
pip install gensim jieba
*准备数据
from gensim import utils
import jieba
class MyCorpus:
    """Restartable iterable that yields one tokenized sentence per corpus line.

    Every line of the corpus file is segmented with ``jieba.lcut`` (HMM
    disabled), the segments are re-joined with single spaces, and the result
    is passed through ``gensim.utils.simple_preprocess``. Each yielded item is
    the list of str tokens produced for one line.

    A new file handle is opened on every ``__iter__`` call, so the object can
    be iterated more than once.

    Args:
        path: Location of the corpus file, one document per line. Defaults to
            the path the original script hard-coded.
        encoding: Text encoding used to read the file. Defaults to UTF-8 so
            that reading does not depend on the platform locale.

    NOTE(review): ``simple_preprocess`` discards tokens shorter than its
    ``min_len`` (documented default 2), which would drop single-character
    Chinese words -- confirm this is intended.
    """

    def __init__(self, path='./raw/medical_all_data.txt', encoding='utf-8'):
        self.path = path
        self.encoding = encoding

    def __iter__(self):
        # The context manager closes the file once the generator is exhausted
        # or discarded; iterating the handle streams lines instead of loading
        # the whole corpus into memory as readlines() did.
        with open(self.path, encoding=self.encoding) as corpus_file:
            for line in corpus_file:
                # jieba performs the word segmentation; the space-join only
                # adapts its output to simple_preprocess, which takes a string.
                segmented = " ".join(jieba.lcut(line, HMM=False))
                yield utils.simple_preprocess(segmented)
*训练模型
import gensim.models

# Build the corpus iterable and hand it to Word2Vec; passing `sentences`
# to the constructor starts training immediately with workers=4.
sentences = MyCorpus()
model = gensim.models.Word2Vec(
    sentences=sentences,
    workers=4,
)
*保存模型
import tempfile

# Persist the trained model. NOTE(review): the ".npy" suffix is only a file
# name here -- the file is written by Word2Vec.save, not by numpy.save.
model_path = "medical_all_data.npy"
model.save(model_path)

# Load the model back from the same path.
new_model = gensim.models.Word2Vec.load(model_path)