jieba+gensim 文本词向量训练word2vec

336 阅读1分钟

jieba+gensim 文本词向量训练word2vec

* 安装 gensim 和 jieba：`pip install gensim jieba`

* 准备数据

from gensim import utils
import jieba

class MyCorpus:
    """Restartable iterable that yields one token list per corpus line.

    Each line of the corpus file is segmented with ``jieba.lcut`` (HMM
    disabled), the segments are re-joined with single spaces, and the
    result is passed through ``gensim.utils.simple_preprocess``.

    Being a class with ``__iter__`` (rather than a one-shot generator)
    lets a consumer iterate over the corpus more than once; every
    iteration re-opens the file and streams it line by line.

    Args:
        path: Text file containing one document per line. Defaults to
            the location that was previously hard-coded.
        encoding: Encoding used to decode the file. Defaults to UTF-8 so
            that reading Chinese text does not depend on the platform's
            default encoding; pass another codec (e.g. ``"gbk"``) if the
            corpus is stored differently.
    """

    def __init__(self, path='./raw/medical_all_data.txt', encoding='utf-8'):
        self.path = path
        self.encoding = encoding

    def __iter__(self):
        """Yield a list of str tokens for each line of the corpus file.

        Raises:
            OSError: If the corpus file cannot be opened.
            UnicodeDecodeError: If the file is not valid in ``encoding``.
        """
        # The context manager closes the file when iteration finishes, and
        # iterating the handle streams lines instead of loading them all.
        with open(self.path, encoding=self.encoding) as corpus_file:
            for line in corpus_file:
                # jieba performs the word segmentation; joining with spaces
                # gives simple_preprocess whitespace-delimited input.
                # NOTE(review): per gensim's documented defaults
                # (min_len=2, max_len=15) simple_preprocess drops
                # single-character tokens - confirm this is intended for
                # Chinese text.
                yield utils.simple_preprocess(" ".join(jieba.lcut(line, HMM=False)))

* 训练模型

import gensim.models

# Build the streaming corpus; it is handed to Word2Vec as an iterable of
# token lists.
sentences = MyCorpus()

# Train a Word2Vec model on the corpus using 4 worker threads; all other
# hyper-parameters are left at the library defaults.
model = gensim.models.Word2Vec(
    sentences=sentences,
    workers=4,
)

* 保存并读取模型

import tempfile

# Single source of truth for the on-disk location of the trained model.
# NOTE(review): despite the ".npy" suffix this file is written by
# Word2Vec.save, not by NumPy - presumably not loadable with numpy.load.
MODEL_PATH = "medical_all_data.npy"

# Persist the trained model to disk.
model.save(MODEL_PATH)

# Load the model back from the same file.
new_model = gensim.models.Word2Vec.load(MODEL_PATH)