Mapping Words to Word Vectors with torch (word2vec)


1. Skip-Gram Model

The difference from CBOW is that Skip-Gram predicts the context given the center word, while CBOW predicts the center word given the context (i.e., whether you predict the center or the context). With a few changes, the code below can also be turned into a CBOW model. After training, the weights are exported and used as the word vectors.
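As a rough illustration (a toy example, not taken from the code below), the same sliding window produces training pairs in opposite directions for the two models:

# Toy sentence with window size 1, showing only the direction of prediction:
#   Skip-Gram: input = center word,   target = each context word
#   CBOW:      input = context words, target = center word
sentence = ["the", "cat", "sat"]
skip_gram_pairs = [("the", "cat"), ("cat", "the"), ("cat", "sat"), ("sat", "cat")]
cbow_pairs = [(["cat"], "the"), (["the", "sat"], "cat"), (["cat"], "sat")]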

Here the embedding layer and the linear layer share the same weight matrix (weight tying). This is more effective; using two separate weight matrices introduces a coupling problem between the two sets of vectors.
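A minimal sketch of the tying on its own (hypothetical sizes, not taken from the training script below): nn.Linear(embedding_dim, vocab_size) stores its weight as a (vocab_size, embedding_dim) matrix, the same shape as the embedding table, so the two layers can share one Parameter.

import torch.nn as nn

V, D = 10000, 100                # hypothetical vocabulary size and embedding dimension
emb = nn.Embedding(V, D)         # weight shape: (V, D)
lin = nn.Linear(D, V)            # weight shape: (out_features, in_features) = (V, D)
lin.weight = emb.weight          # weight tying: both layers now share the same Parameter
print(lin.weight is emb.weight)  # True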

import os
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.datasets import PennTreebank
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader, Dataset
import matplotlib.pyplot as plt

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = get_tokenizer("basic_english")
train_iter, _, _ = PennTreebank()
vocab = build_vocab_from_iterator(map(tokenizer, train_iter),
                                  min_freq=5,
                                  specials=['<unk>', 'N']
                                  )
vocab.set_default_index(vocab['<unk>'])
vocab.to(device)
EMBEDDING_DIM = 100
VOCAB_SIZE = len(vocab)
EPOCHS = 5
BATCH_SIZE = 16
INTERVAL = 400


class PTBDataset(Dataset):
    def __init__(self, data, vocab, window_size=1):
        self.data = list(data)
        self.vocab = vocab
        self.vocab_size = len(vocab)
        self.window_size = window_size
        self.pairs = self.create_pairs()

    def create_pairs(self):
        pairs = []
        for sentence in self.data:
            words = tokenizer(sentence)
            indices = [self.vocab[word] for word in words]
            for center_word_pos in range(len(indices)):
                center_word_idx = indices[center_word_pos]
                context_word_idx = list()
                for w in range(-self.window_size, self.window_size + 1):
                    if w == 0:
                        continue
                    context_word_pos = center_word_pos + w
                    if context_word_pos >= len(indices) or context_word_pos < 0:
                        continue
                    context_word_idx.append(indices[context_word_pos])
                pairs.append((center_word_idx, context_word_idx))
        return pairs

    def to_tensor(self, i):
        center, context = self.pairs[i]
        center_idx = torch.tensor(center, device=device)
        # multi-hot target over the vocabulary: 1 at every context-word index
        context_tensor = torch.zeros(self.vocab_size, device=device)
        context_tensor[context] = 1
        return center_idx, context_tensor

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        return self.to_tensor(idx)


class Word2Vec(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(Word2Vec, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear = nn.Linear(embedding_dim, vocab_size)
        # weight tying: reuse the embedding matrix as the output projection
        self.linear.weight = self.embeddings.weight

    def forward(self, center):
        c0 = self.embeddings(center)
        c0 = self.linear(c0)
        return c0


dataset = PTBDataset(train_iter, vocab, window_size=1)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

model = Word2Vec(VOCAB_SIZE, EMBEDDING_DIM).to(device)
# CrossEntropyLoss accepts floating-point (soft) targets since PyTorch 1.10,
# so the multi-hot context vector can be used as the target directly
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

loss_list = list()
start_time = time.time()
print('start training...')
for epoch in range(1, EPOCHS + 1):
    total_loss = 0
    iter_loss = 0
    cnt = 0
    for center, context in dataloader:
        optimizer.zero_grad()
        output = model(center)
        loss = criterion(output, context)
        loss.backward()
        optimizer.step()
        iter_loss += loss.item()
        cnt += 1
        if cnt % INTERVAL == 0:
            avg_loss = iter_loss / INTERVAL
            print(f"-epoch: {epoch}\t -loss: {avg_loss:.5f} -"
                  f"time:{time.time() - start_time:.2f}s")
            loss_list.append(avg_loss)
            iter_loss = 0

os.makedirs("pth_file", exist_ok=True)  # make sure the output directory exists
torch.save(model.embeddings.weight.data, r"pth_file/word_embeddings.pth")

fig = plt.figure()
ax = plt.plot(loss_list)
plt.show()

Training loss curve:


2. Load the Embeddings and Compute Similarity

We can simply compute the cosine similarity, $\cos\theta = \frac{a \cdot b}{\parallel a \parallel \, \parallel b \parallel}$, a formula already familiar from high school.
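As a quick sanity check of the formula (a standalone toy example, not part of the script below), two vectors pointing in the same direction have cosine similarity 1 regardless of their lengths:

import torch

a = torch.tensor([1.0, 2.0, 3.0])
b = torch.tensor([2.0, 4.0, 6.0])              # same direction as a, twice the length
cos = torch.dot(a, b) / (a.norm() * b.norm())  # a·b / (‖a‖‖b‖)
print(cos.item())                              # 1.0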

import torch
from torchtext.datasets import PennTreebank
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = get_tokenizer("basic_english")
train_iter, _, _ = PennTreebank()
vocab = build_vocab_from_iterator(map(tokenizer, train_iter),
                                  min_freq=5,
                                  specials=['<unk>', 'N']
                                  )
vocab.set_default_index(vocab['<unk>'])
vocab.to(device)
weight = torch.load(r"pth_file/word_embeddings.pth", map_location=device)
stoi = vocab.get_stoi()
itos = vocab.get_itos()


def get_most_similar_words(word, top_n=10):
    if word not in stoi:
        return None

    word_idx = stoi[word]
    word_vec = weight[word_idx]
    cos = torch.nn.CosineSimilarity(dim=0)

    similarities = []
    for i in range(weight.size(0)):
        if i == word_idx:
            continue
        sim = cos(word_vec, weight[i]).item()
        similarities.append((itos[i], sim))

    similarities.sort(key=lambda x: x[1], reverse=True)
    return similarities[:top_n]


# example: find the most similar words
word = 'what'
similar_words = get_most_similar_words(word)

if similar_words:
    print(f"Most similar words to '{word}':")
    for similar_word, similarity in similar_words:
        print(f"{similar_word}: {similarity:.4f}")
else:
    print(f"Word '{word}' not found in vocabulary.")

Let's try it on a random word.


For some words it can even distinguish singular from plural forms.

Of course, if the vectors you trained yourself are not satisfactory, you can also use someone else's pretrained weights; just copy and paste the earlier function to test them.

import torch
from torchtext.vocab import GloVe

# load pretrained GloVe word vectors
glove = GloVe(name='6B', dim=300)
stoi = glove.stoi
itos = glove.itos
weight = glove.vectors
# paste get_most_similar_words() from the previous script here
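A minimal usage sketch (assuming get_most_similar_words() from the earlier script has been pasted in above; 'king' is just an arbitrary query word):

# assumes get_most_similar_words() from the previous script is defined above
for similar_word, similarity in get_most_similar_words('king', top_n=5):
    print(f"{similar_word}: {similarity:.4f}")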