使用 PyTorch 实现基于 Skip-Gram 的 Word2vec 模型

698 阅读 · 7 分钟阅读

Word2vec 是用来产生词向量的模型,它可以从大量文本语料中以无监督方式学习语义知识。 在模型训练完成后,Word2vec 模型可用来映射每个词到一个向量, 可用来表示词与词之间的关系。

在学习和实现 Word2vec 过程中,苦于不容易找到直接复制粘贴就能跑起来的代码, 花了不少时间参考网上分享的代码和 Word2vec 的公式推导,才最终理解 Skip-Gram 实现的细节问题。

为了助力后来人更容易学会 Word2vec 模型,以下提供可以直接能跑起来的代码, 在下载数据完成后运行代码训练 Word2vec 模型。

下载数据

点击链接 text8.txt 或在网页 About the Test Data (mattmahoney.net) 下载开源数据集。

完整代码

下述代码使用 Skip-Gram 的负采样方法实现,依赖 PyTorch 以及 scipy 库。 Skip-Gram 模型的负采样方法原理可以查看 Word2vec 中 Skip-Gram 模型负采样方法的原理 - 掘金 (juejin.cn)

import random
import threading
from collections import Counter
from typing import List, Dict, Tuple

import numpy as np
import scipy
# `import scipy` alone does not import subpackages; scipy.spatial.distance
# is used below, so import it explicitly.
import scipy.spatial
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader


# A minimal word tokenizer.
class WordTokenizer(object):
    """Lower-cases text and splits it on single-space characters."""

    def __init__(self):
        super(WordTokenizer, self).__init__()

    def split_words(self, content_str: str) -> List[str]:
        """Return the lower-cased tokens of *content_str*, split on ' '.

        Note: consecutive spaces inside the text yield empty-string tokens,
        matching str.split(' ') semantics.
        """
        normalized_str = content_str.strip().lower()
        return normalized_str.split(' ')


# Encapsulates corpus preprocessing: vocabulary building and word-id encoding.
class CorpusData(object):
    def __init__(self, word_tokenizer: 'WordTokenizer', max_vocab_size: int):
        """
        Args:
          word_tokenizer: tokenizer used to split the raw corpus into words
          max_vocab_size: maximum vocabulary size, including the '<UNK>' token
        """
        super(CorpusData, self).__init__()
        self.word_tokenizer = word_tokenizer
        self.max_vocab_size = max_vocab_size
        # Word sequence before encoding.
        self.original_words_list: List[str] = list()
        # Word sequence after encoding — word ids, parallel to original_words_list.
        # FIX: was mis-annotated as List[str]; load_data() fills it with ints.
        self.encoded_words_list: List[int] = list()
        # Mappings between words and their integer ids.
        self.word2id_dict: Dict[str, int] = dict()
        self.id2word_dict: Dict[int, str] = dict()
        # Vocabulary words, ordered by descending frequency.
        self.vocab_words_list: List[str] = list()
        # Occurrence count of each vocabulary word (parallel to vocab_words_list).
        self.vocab_counts_list: List[int] = list()

    def load_data(self, data_str: str) -> None:
        """Tokenize *data_str*, build the vocabulary and encode the corpus.

        Only the (max_vocab_size - 1) most frequent words are kept; every
        other word occurrence is mapped to the '<UNK>' token, whose count is
        the number of such replaced occurrences.
        """
        original_words_list = self.word_tokenizer.split_words(data_str)
        # Count how often each word occurs.
        word2count_dict = Counter(original_words_list)
        # Keep only the top-N most frequent words; the rest become '<UNK>'.
        word2count_dict = dict(word2count_dict.most_common(self.max_vocab_size - 1))
        word2count_dict['<UNK>'] = len(original_words_list) - sum(word2count_dict.values())

        # Sort by descending count; sorted() is stable, so ties keep their
        # Counter insertion order.
        vocab_words_list = sorted(word2count_dict, key=word2count_dict.get, reverse=True)
        vocab_counts_list = list()
        word2id_dict = dict()
        id2word_dict = dict()

        for index, word in enumerate(vocab_words_list):
            vocab_counts_list.append(word2count_dict[word])
            word2id_dict[word] = index
            id2word_dict[index] = word

        unknown_word_id = word2id_dict['<UNK>']
        # Encode the word sequence into the corresponding id sequence.
        encoded_words_list = [word2id_dict.get(word, unknown_word_id) for word in original_words_list]

        self.original_words_list = original_words_list
        self.encoded_words_list = encoded_words_list
        self.word2id_dict = word2id_dict
        self.id2word_dict = id2word_dict
        self.vocab_words_list = vocab_words_list
        self.vocab_counts_list = vocab_counts_list
        

# Custom dataset producing (center, positive, negative) word-id tensors for
# Skip-Gram training with negative sampling.
class SkipGramDataset(Dataset):
    def __init__(self, corpus_data: CorpusData, max_window_size: int, negative_sample_num: int):
        """
        Args:
          corpus_data: preprocessed corpus (encoded word ids and vocab counts)
          max_window_size: max distance between a context word and the center word
          negative_sample_num: negative samples drawn per positive sample
        """
        super(SkipGramDataset, self).__init__()
        corpus_words_list = corpus_data.encoded_words_list
        corpus_word_num = len(corpus_words_list)
        # Skip the first and last `max_window_size` words so that every item
        # has a full context window on both sides.
        dataset_item_num = corpus_word_num - (2 * max_window_size)
        center_index_offset = max_window_size
        # Negative-sampling distribution: unigram counts raised to the 0.75
        # power, then normalized to sum to 1.
        vocab_counts_list = corpus_data.vocab_counts_list
        vocab_counts_ndarray = np.array(vocab_counts_list, dtype=np.float32)
        negative_probs_ndarray = vocab_counts_ndarray ** 0.75
        negative_probs_ndarray = negative_probs_ndarray / np.sum(negative_probs_ndarray)
        
        self.dataset_item_num = dataset_item_num
        self.center_index_offset = center_index_offset
        self.max_window_size = max_window_size
        self.corpus_words_list = corpus_words_list
        # Tensor copy of the corpus ids (not referenced elsewhere in this class).
        self.corpus_words_tensor = torch.IntTensor(corpus_words_list)
        self.corpus_word_num = corpus_word_num
        # Number of negative samples per positive sample.
        self.negative_sample_num = negative_sample_num
        self.negative_probs_tensor = torch.FloatTensor(negative_probs_ndarray)
        # Thread-local storage for a scratch copy of the sampling distribution,
        # so concurrent __getitem__ calls cannot corrupt each other's state.
        self.current_thread_data = threading.local()
        
    def __len__(self):
        # Number of usable center-word positions.
        return self.dataset_item_num
    
    def __getitem__(self, index) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Return (center_words, positive_words, negative_words) id tensors for one item."""
        center_word_index = index + self.center_index_offset
        # Fetch the center word.
        center_word = self.corpus_words_list[center_word_index]
        center_words_tensor = torch.IntTensor([center_word])
        
        window_size = self.max_window_size
        start_index = center_word_index - window_size
        end_index = center_word_index + window_size + 1
        
        # The context words around the center word are the positive samples.
        positive_words_list = [self.corpus_words_list[curr_index]  for curr_index 
                               in range(start_index, end_index) if curr_index != center_word_index]
        positive_words_tensor = torch.IntTensor(positive_words_list)
        
        # Lazily create this thread's scratch copy of the probability tensor;
        # thread-local state avoids races when __getitem__ runs concurrently.
        try:
            negative_candidates_tensor = self.current_thread_data.negative_candidates_tensor
        except AttributeError:
            negative_candidates_tensor = self.negative_probs_tensor.clone()
            self.current_thread_data.negative_candidates_tensor = negative_candidates_tensor
            
        # Number of negatives is proportional to the number of positives.
        negative_word_num = self.negative_sample_num * len(positive_words_tensor)
        # Temporarily zero the probabilities of every word in the window
        # (center included) so negative sampling never draws a positive word.
        negative_exclude_list = [self.corpus_words_list[curr_index]  for curr_index 
                                    in range(start_index, end_index)]
        negative_candidates_tensor[negative_exclude_list] = 0
        # Draw N negative samples (with replacement).
        negative_words_tensor = torch.multinomial(negative_candidates_tensor, negative_word_num, replacement=True)
        # Restore the zeroed entries from the pristine distribution for the next call.
        negative_candidates_tensor[negative_exclude_list] = self.negative_probs_tensor[negative_exclude_list]
        
        return center_words_tensor, positive_words_tensor, negative_words_tensor
            

# Skip-Gram model trained with the negative-sampling objective.
class SkipGramModel(nn.Module):
    def __init__(self, vocab_size: int, embedding_size: int):
        """
        Args:
          vocab_size: number of words in the vocabulary
          embedding_size: dimensionality of each word vector
        """
        super(SkipGramModel, self).__init__()
        # Vocabulary size. FIX: attribute was misspelled `vocal_size`.
        self.vocab_size = vocab_size
        # Dimensionality of the word vectors.
        self.embedding_size = embedding_size
        # Input ("center word") embeddings — these are the learned word vectors.
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        # Output ("context word") embeddings, used only for the training objective.
        self.embedding_out = nn.Embedding(vocab_size, embedding_size)

    def forward(self, center_in: torch.Tensor, positive_out: torch.Tensor, negative_out: torch.Tensor) -> torch.Tensor:
        """Compute the mean negative-sampling loss for a batch.

        Args:
          center_in: (batch, 1) center-word ids
          positive_out: (batch, P) positive context-word ids
          negative_out: (batch, N) negative-sample word ids
        Returns:
          Scalar tensor: mean loss over the batch.
        """
        center_emb_in = self.embedding(center_in)                    # (batch, 1, emb)
        # Transpose so that bmm computes a dot product per context word.
        center_emb_in = torch.transpose(center_emb_in, dim0=2, dim1=1)  # (batch, emb, 1)
        positive_emb_out = self.embedding_out(positive_out)          # (batch, P, emb)
        negative_emb_out = self.embedding_out(negative_out)          # (batch, N, emb)

        # Positive term: sum over context words of log sigmoid(u_o . v_c).
        positive_prob = torch.bmm(positive_emb_out, center_emb_in)   # (batch, P, 1)
        positive_prob = torch.squeeze(positive_prob, dim=2)          # (batch, P)
        positive_prob = nn.functional.logsigmoid(positive_prob)
        positive_prob = torch.sum(positive_prob, dim=1)              # (batch,)

        # Negative term: sum over negatives of log sigmoid(-u_n . v_c).
        negative_emb_out = torch.neg(negative_emb_out)
        negative_prob = torch.bmm(negative_emb_out, center_emb_in)   # (batch, N, 1)
        negative_prob = torch.squeeze(negative_prob, dim=2)          # (batch, N)
        negative_prob = nn.functional.logsigmoid(negative_prob)
        negative_prob = torch.sum(negative_prob, dim=1)              # (batch,)

        # The objective maximizes both terms, so the loss is their negation.
        loss = positive_prob + negative_prob
        loss = torch.neg(loss)
        loss = torch.mean(loss)

        return loss


# Wraps the model training and evaluation logic.
class Word2vec(object):
    def __init__(self, corpus_data: CorpusData, embedding_size: int):
        super(Word2vec, self).__init__()
        # Preprocessed corpus (vocabulary, id mappings, counts).
        self.corpus_data = corpus_data
        # Dimensionality of the word vectors.
        self.embedding_size = embedding_size
    
    def train_model(self, output_file_path: str, max_window_size: int, negative_sample_num: int,
                    epoch_num = 1, batch_size = 1, learning_rate = 0.1) -> None:
        """
        Train the model.
        
        Args:
          output_file_path: file path where the trained parameters are saved
          max_window_size: maximum window size
          negative_sample_num: number of negative samples per positive sample
          epoch_num: number of epochs
          batch_size: batch size
          learning_rate: learning rate
        """
        
        # Seed every RNG involved so a training run can be reproduced.
        random.seed(1)
        np.random.seed(1)
        torch.manual_seed(1)
        
        if torch.cuda.is_available():
            device = 'cuda'
        else:
            device = 'cpu'
        
        dataset = SkipGramDataset(self.corpus_data, max_window_size=max_window_size, 
                                  negative_sample_num=negative_sample_num)
        dataloader = DataLoader(dataset, batch_size, shuffle=True)
        batch_num = len(dataloader)
        
        vocab_size = len(self.corpus_data.vocab_words_list)
        # Create the Skip-Gram model.
        model = SkipGramModel(vocab_size = vocab_size, embedding_size = self.embedding_size)
        model = model.to(device)
        optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
        model.train()
        
        for epoch_id in range(epoch_num):
            print(f'Epoch {epoch_id+1}/{epoch_num}')
            
            for batch_id, (center_in, positive_out, negative_out) in enumerate(dataloader):
                center_in = center_in.to(device)
                positive_out = positive_out.to(device)
                negative_out = negative_out.to(device)
                
                loss = model(center_in, positive_out, negative_out)
                # Backpropagate the loss and update the weights.
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()
                
                if batch_id % 500 == 0:
                    print(f'loss: {loss.item():>10f}  [{batch_id+1:>5d}/{batch_num:>5d}]')
                    
        # Save all model parameters.
        state_dict = model.state_dict()        
        torch.save(state_dict, output_file_path)
    
    def test_model(self, model_state_path: str, test_words_list: List[str], nearest_word_num = 10) -> None:
        """
        Evaluate the model: print the nearest neighbors of each probe word.
        
        Args:
          model_state_path: file path of the saved model parameters
          test_words_list: words to probe
          nearest_word_num: number of most similar words to report per probe word
        """
        
        vocab_size = len(self.corpus_data.vocab_words_list)
        model = SkipGramModel(vocab_size = vocab_size, embedding_size = self.embedding_size)
        
        # Load the saved parameters.
        state_dict = torch.load(model_state_path)
        model.load_state_dict(state_dict)
        model.eval()
        
        word2id_dict = self.corpus_data.word2id_dict
        id2word_dict = self.corpus_data.id2word_dict
        # embedding.weight holds the learned word vectors, one row per word id.
        weights_tensor = model.embedding.weight.detach()
        weights_ndarray = weights_tensor.numpy()
        
        for word in test_words_list:
            if (word not in word2id_dict):
                print(f'{word}: []')
                continue
            
            word_id = word2id_dict[word]
            word_weight = weights_ndarray[word_id]
            # scipy's cosine() is a *distance* (1 - cosine similarity), so a
            # smaller value means a more similar word; argsort ascending
            # therefore puts the most similar words first.
            similarity_list = [scipy.spatial.distance.cosine(curr_weight, word_weight) 
                              for curr_weight in weights_ndarray]
            similarity_ndarray = np.array(similarity_list)
            indexes_ndarray = similarity_ndarray.argsort()
            # Take the N nearest words, skipping index 0 which is presumably
            # the probe word itself (distance 0) — NOTE(review): an exact tie
            # at distance 0 could break this assumption.
            indexes_ndarray = indexes_ndarray[1:nearest_word_num+1]
            nearest_words_list =  [id2word_dict[curr_index] for curr_index in indexes_ndarray]

            print(f'{word}: {nearest_words_list}')



# Path of the corpus file.
CORPUS_DATA_PATH = './text8.txt'
# Path where the trained model parameters are saved.
MODEL_DICT_PATH = './word2vec-latest.pth'
# Maximum distance between a positive (context) word and the center word.
#MAX_WINDOW_SIZE = 3
MAX_WINDOW_SIZE = 5
# Number of negative samples per positive sample.
NEGATIVE_SAMPLE_NUM = 15
# Maximum vocabulary size.
MAX_VOCAB_SIZE = 10000
# Dimensionality of the word vectors.
EMBEDDING_SIZE = 100
# Number of training epochs.
EPOCH_NUM = 1
# Batch size.
BATCH_SIZE = 32
# Learning rate.
LEARNING_RATE = 0.2


def _build_corpus_data() -> CorpusData:
    # Read the corpus file and build the vocabulary and encoded word sequence.
    # (Shared by the 'train' and 'test' modes, which previously duplicated it.)
    with open(CORPUS_DATA_PATH, 'r', encoding='utf-8') as f:
        file_content = f.read()

    word_tokenizer = WordTokenizer()
    corpus_data = CorpusData(word_tokenizer, MAX_VOCAB_SIZE)
    corpus_data.load_data(file_content)
    return corpus_data


# Switch to 'test' to evaluate a previously trained model.
# FIX: the original assigned run_mode = 'test' and then immediately
# overwrote it with 'train'; the dead assignment is kept as a comment.
# run_mode = 'test'
run_mode = 'train'
test_list = ['two', 'america', 'computer', 'queen', 'king', 'woman', 'man', 'black', 'green', 'java']

if run_mode == 'train':
    word2vec = Word2vec(_build_corpus_data(), EMBEDDING_SIZE)
    # Train the model.
    word2vec.train_model( output_file_path = MODEL_DICT_PATH, 
                        max_window_size = MAX_WINDOW_SIZE, 
                        negative_sample_num = NEGATIVE_SAMPLE_NUM,
                        epoch_num = EPOCH_NUM, 
                        batch_size = BATCH_SIZE, 
                        learning_rate = LEARNING_RATE)
elif run_mode == 'test':
    word2vec = Word2vec(_build_corpus_data(), EMBEDDING_SIZE)
    # Evaluate the model on a few probe words.
    word2vec.test_model(model_state_path = MODEL_DICT_PATH, 
                        test_words_list = test_list,
                        nearest_word_num = 10)

测试截图

使用 3060Ti 显卡花费几十分钟训练,可能在 CPU 上会花费更多时间。

![训练输出截图](image.png)

参考资料