1. Skip-Gram Model
The difference from CBOW: Skip-Gram predicts the context given the center word, while CBOW predicts the center word given the context (i.e., whether you predict the center or the context). With a few changes, the code below can also be turned into a CBOW model (a minimal CBOW sketch is included at the end of this section). After training, the weights are exported and used as the word vectors.
Here the embedding layer and the linear layer share the same weight matrix (weight tying). This approach works better; using two separate weight matrices introduces a coupling problem between them.
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.datasets import PennTreebank
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader, Dataset
import matplotlib.pyplot as plt
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = get_tokenizer("basic_english")
train_iter, _, _ = PennTreebank()
# build the vocabulary from the PTB training split; 'N' is PTB's placeholder token for numbers
vocab = build_vocab_from_iterator(map(tokenizer, train_iter),
min_freq=5,
specials=['<unk>', 'N']
)
vocab.set_default_index(vocab['<unk>'])
vocab.to(device)
EMBEDDING_DIM = 100
VOCAB_SIZE = len(vocab)
EPOCHS = 5
BATCH_SIZE = 16
INTERVAL = 400
class PTBDataset(Dataset):
def __init__(self, data, vocab, window_size=1):
self.data = list(data)
self.vocab = vocab
self.vocab_size = len(vocab)
self.window_size = window_size
self.pairs = self.create_pairs()
def create_pairs(self):
pairs = []
for sentence in self.data:
words = tokenizer(sentence)
indices = [self.vocab[word] for word in words]
for center_word_pos in range(len(indices)):
center_word_idx = indices[center_word_pos]
context_word_idx = list()
for w in range(-self.window_size, self.window_size + 1):
if w == 0:
continue
context_word_pos = center_word_pos + w
if context_word_pos >= len(indices) or context_word_pos < 0:
continue
context_word_idx.append(indices[context_word_pos])
pairs.append((center_word_idx, context_word_idx))
return pairs
def to_tensor(self, i):
center, context = self.pairs[i]
center_idx = torch.tensor(center, device=device)
        # multi-hot target over the vocabulary: 1 at every context-word index;
        # CrossEntropyLoss below treats it as a (soft) target distribution
        context_tensor = torch.zeros(self.vocab_size, device=device)
context_tensor[context] = 1
return center_idx, context_tensor
def __len__(self):
return len(self.pairs)
def __getitem__(self, idx):
return self.to_tensor(idx)
class Word2Vec(nn.Module):
def __init__(self, vocab_size, embedding_dim):
super(Word2Vec, self).__init__()
self.embeddings = nn.Embedding(vocab_size, embedding_dim)
self.linear = nn.Linear(embedding_dim, vocab_size)
        # weight tying: the output layer reuses the embedding matrix as its weight
        self.linear.weight = self.embeddings.weight
def forward(self, center):
c0 = self.embeddings(center)
c0 = self.linear(c0)
return c0
dataset = PTBDataset(train_iter, vocab, window_size=1)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)
model = Word2Vec(VOCAB_SIZE, EMBEDDING_DIM).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())
loss_list = list()
start_time = time.time()
print('start training...')
for epoch in range(1, EPOCHS + 1):
total_loss = 0
iter_loss = 0
cnt = 0
for center, context in dataloader:
optimizer.zero_grad()
output = model(center)
loss = criterion(output, context)
loss.backward()
optimizer.step()
iter_loss += loss.item()
cnt += 1
if cnt % INTERVAL == 0:
avg_loss = iter_loss / INTERVAL
print(f"-epoch: {epoch}\t -loss: {avg_loss:.5f} -"
f"time:{time.time() - start_time:.2f}s")
loss_list.append(avg_loss)
iter_loss = 0
# export the learned embedding matrix; its rows are the word vectors
torch.save(model.embeddings.weight.data, r"pth_file/word_embeddings.pth")
fig = plt.figure()
ax = plt.plot(loss_list)
plt.show()
Loss curve during training:
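As mentioned at the top, the same pipeline can be adapted to CBOW. Below is a minimal sketch of the two pieces that would change; the CBOW class and the averaging of context embeddings are my own illustration rather than part of the original script, and padding for variable-length contexts near sentence boundaries is not handled here.

import torch
import torch.nn as nn

class CBOW(nn.Module):
    # hypothetical CBOW counterpart of the Word2Vec class above:
    # the input is the context, the prediction target is the center word
    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear = nn.Linear(embedding_dim, vocab_size)
        self.linear.weight = self.embeddings.weight  # same weight tying as above

    def forward(self, context):       # context: (batch, 2 * window_size) word indices
        e = self.embeddings(context)  # (batch, 2 * window_size, embedding_dim)
        e = e.mean(dim=1)             # average the context embeddings
        return self.linear(e)         # logits over the vocabulary

# The dataset would then emit (context indices, center index) pairs, and the
# training loop would call: loss = criterion(model(context_batch), center_batch)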
2. Load the Embeddings and Compute Similarity
Computing cosine similarity directly is enough: cos(a, b) = (a · b) / (‖a‖ ‖b‖), the formula we all learned back in school.
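Just to connect the formula to code, here is a tiny standalone check with made-up vectors; nn.CosineSimilarity, used in the script below, computes the same quantity.

import torch

a = torch.tensor([1.0, 2.0, 3.0])   # made-up example vectors
b = torch.tensor([2.0, 4.0, 6.0])
cos_manual = torch.dot(a, b) / (a.norm() * b.norm())
cos_builtin = torch.nn.functional.cosine_similarity(a, b, dim=0)
print(cos_manual.item(), cos_builtin.item())   # both 1.0, since b is parallel to a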
import torch
from torchtext.datasets import PennTreebank
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = get_tokenizer("basic_english")
train_iter, _, _ = PennTreebank()
vocab = build_vocab_from_iterator(map(tokenizer, train_iter),
min_freq=5,
specials=['<unk>', 'N']
)
vocab.set_default_index(vocab['<unk>'])
vocab.to(device)
weight = torch.load(r"pth_file/word_embeddings.pth").to(device)
stoi = vocab.get_stoi()
itos = vocab.get_itos()
def get_most_similar_words(word, top_n=10):
if word not in stoi:
return None
word_idx = stoi[word]
word_vec = weight[word_idx]
cos = torch.nn.CosineSimilarity(dim=0)
similarities = []
for i in range(weight.size(0)):
if i == word_idx:
continue
sim = cos(word_vec, weight[i]).item()
similarities.append((itos[i], sim))
similarities.sort(key=lambda x: x[1], reverse=True)
return similarities[:top_n]
# example: find the most similar words
word = 'what'
similar_words = get_most_similar_words(word)
if similar_words:
print(f"Most similar words to '{word}':")
for similar_word, similarity in similar_words:
print(f"{similar_word}: {similarity:.4f}")
else:
print(f"Word '{word}' not found in vocabulary.")
Pick any word and give it a run:
For some words, the embeddings can even tell singular from plural forms.
Of course, if your own training does not turn out well, you can also use someone else's pretrained weights; just copy and paste the function above to test them.
import torch
from torchtext.vocab import GloVe
# load pretrained GloVe word vectors
glove = GloVe(name='6B', dim=300)
stoi = glove.stoi
itos = glove.itos
weight = glove.vectors
# paste get_most_similar_words (and the example usage) from above here
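The pasted function works unchanged, since it reads the module-level weight, stoi and itos that the GloVe snippet rebinds. One caveat: GloVe 6B has 400,000 rows, so the Python loop in the function above will be slow. A vectorized variant is sketched below; get_most_similar_words_fast is a name I am introducing here, not part of the original post.

def get_most_similar_words_fast(word, top_n=10):
    # cosine similarity against every row of the embedding matrix in one shot
    if word not in stoi:
        return None
    word_vec = weight[stoi[word]]
    sims = torch.nn.functional.cosine_similarity(word_vec.unsqueeze(0), weight, dim=1)
    sims[stoi[word]] = float('-inf')   # exclude the query word itself
    values, indices = sims.topk(top_n)
    return [(itos[i], v.item()) for i, v in zip(indices.tolist(), values)]

print(get_most_similar_words_fast('what'))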