Word embedding(词嵌入)是指将离散对象(如单词)映射为实数向量的技术,它能把离散的输入对象有效地转换为机器学习模型可以使用的向量。
Word embedding的示例如下所示(每个单词对应一个实数向量):
blue: (0.01359, 0.00075997, 0.24608, ..., -0.2524, 1.0048, 0.06259)
blues: (0.01396, 0.11887, -0.48963, ..., 0.033483, -0.10007, 0.1158)
orange: (-0.24776, -0.12359, 0.20986, ..., 0.079717, 0.23865, -0.014213)
oranges: (-0.35609, 0.21854, 0.080944, ..., -0.35413, 0.38511, -0.070976)
Word2vec
Word2vec是无监督词嵌入最常用的方法。它训练模型的方式是:给定一个输入单词,使用skip-gram(跳字)模型来预测该单词的上下文。
TensorFlow提供了多种实现这类模型的方式,其复杂程度和优化程度逐级递增,并可利用多线程和更高层次的抽象。
import os
import math
import numpy as np
import tensorflow as tf
from tensorflow.contrib.tensorboard.plugins import projector
# Hyperparameters of the skip-gram model.
batch_size = 64            # number of (target, context) pairs per training step
embedding_dimension = 5    # length of each learned word vector
negative_samples = 8       # value passed as num_sampled to the NCE loss

# Directory that receives TensorBoard summaries, checkpoints and metadata.
LOG_DIR = "logs/word2vec_intro"

# Digits 1..9 keyed to their English names, in ascending order.
digit_to_word_map = dict(
    enumerate(
        ("One", "Two", "Three", "Four", "Five", "Six", "Seven", "Eight", "Nine"),
        start=1,
    )
)
# Build a synthetic corpus made of two kinds of sentences: sequences of odd
# digits and sequences of even digits. Each sentence is three digit names
# joined by spaces; digits are drawn with replacement.
sentences = []
for _ in range(10000):
    rand_odd_ints = np.random.choice(range(1, 10, 2), 3)
    sentences.append(" ".join(digit_to_word_map[r] for r in rand_odd_ints))
    rand_even_ints = np.random.choice(range(2, 10, 2), 3)
    sentences.append(" ".join(digit_to_word_map[r] for r in rand_even_ints))
# Map words to indices: every distinct lower-cased word gets the next free
# integer, in order of first appearance in the corpus.
word2index_map = {}
index = 0
for sent in sentences:
    for word in sent.lower().split():
        if word not in word2index_map:
            word2index_map[word] = index
            index += 1

# Inverse lookup (index -> word) and the resulting vocabulary size.
index2word_map = {idx: word for word, idx in word2index_map.items()}
vocabulary_size = len(index2word_map)
# Generate skip-gram pairs. For every word that has both a left and a right
# neighbour, emit two [target_index, context_index] pairs: one for the word
# before it and one for the word after it.
skip_gram_pairs = []
for sent in sentences:
    tokenized_sent = sent.lower().split()
    for i in range(1, len(tokenized_sent) - 1):
        # Layout: [[left_context, right_context], target]
        word_context_pair = [
            [word2index_map[tokenized_sent[i - 1]],
             word2index_map[tokenized_sent[i + 1]]],
            word2index_map[tokenized_sent[i]],
        ]
        skip_gram_pairs.append([word_context_pair[1], word_context_pair[0][0]])
        skip_gram_pairs.append([word_context_pair[1], word_context_pair[0][1]])
def get_skipgram_batch(batch_size):
    """Return a random batch drawn from the module-level ``skip_gram_pairs``.

    Args:
        batch_size: Maximum number of pairs to draw. If fewer pairs exist,
            all of them are returned (in random order).

    Returns:
        A tuple ``(x, y)`` where ``x`` is a list of target word indices and
        ``y`` is a list of single-element lists holding the matching context
        word index (one column per row, as the label placeholder expects).
    """
    # Shuffle all pair positions and keep the first batch_size of them, so
    # a batch never contains the same pair position twice.
    instance_indices = list(range(len(skip_gram_pairs)))
    np.random.shuffle(instance_indices)
    batch = instance_indices[:batch_size]
    x = [skip_gram_pairs[i][0] for i in batch]
    y = [[skip_gram_pairs[i][1]] for i in batch]
    return x, y
# Batch example: draw eight pairs and inspect them, first as indices and
# then as words. The bare expressions below only display a value in an
# interactive session (REPL / notebook); run as a script they are no-ops.
x_batch, y_batch = get_skipgram_batch(8)
x_batch
y_batch
[index2word_map[word] for word in x_batch]
[index2word_map[word[0]] for word in y_batch]
# Input data and labels: one target word index per example, and one context
# word index per example stored as a single column.
train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
# Original author's note: the embedding lookup table is currently only
# implemented on CPU.
with tf.name_scope("embeddings"):
    # One row per vocabulary word, initialised uniformly in [-1, 1).
    embeddings = tf.Variable(
        tf.random_uniform([vocabulary_size, embedding_dimension], -1.0, 1.0),
        name="embedding")
    # This is essentially a lookup table: pick the rows for the input words.
    embed = tf.nn.embedding_lookup(embeddings, train_inputs)
# Create the variables for the NCE loss: one weight row and one bias per
# vocabulary word.
nce_weights = tf.Variable(
    tf.truncated_normal([vocabulary_size, embedding_dimension],
                        stddev=1.0 / math.sqrt(embedding_dimension)))
nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

# Mean noise-contrastive estimation loss over the batch, using
# `negative_samples` sampled classes per batch.
loss = tf.reduce_mean(
    tf.nn.nce_loss(weights=nce_weights, biases=nce_biases, inputs=embed,
                   labels=train_labels, num_sampled=negative_samples,
                   num_classes=vocabulary_size))
# Record the loss so it is picked up by tf.summary.merge_all().
tf.summary.scalar("NCE_loss", loss)
# Learning rate decay: start at 0.1 and multiply by 0.95 every 1000 steps
# (staircase=True makes the decay happen in discrete jumps).
global_step = tf.Variable(0, trainable=False)
learningRate = tf.train.exponential_decay(
    learning_rate=0.1, global_step=global_step,
    decay_steps=1000, decay_rate=0.95, staircase=True)
# Pass global_step to minimize() so the optimizer increments it on every
# update; otherwise it stays at 0 and the learning rate never decays.
train_step = tf.train.GradientDescentOptimizer(learningRate).minimize(
    loss, global_step=global_step)
# Single op that evaluates every summary registered so far.
merged = tf.summary.merge_all()
# Train the model, logging summaries, checkpoints and projector metadata to
# LOG_DIR, then fetch the row-normalised embedding matrix.
with tf.Session() as sess:
    train_writer = tf.summary.FileWriter(LOG_DIR,
                                         graph=tf.get_default_graph())
    saver = tf.train.Saver()

    # Write the word labels for the embedding projector as a
    # tab-separated file: a header row, then one "word<TAB>index" row per
    # vocabulary entry.
    with open(os.path.join(LOG_DIR, "metadata.tsv"), "w") as metadata:
        metadata.write("Name\tClass\n")
        for k, v in index2word_map.items():
            metadata.write("%s\t%d\n" % (v, k))

    config = projector.ProjectorConfig()
    embedding = config.embeddings.add()
    embedding.tensor_name = embeddings.name
    # Link this tensor to its metadata file (e.g. labels).
    embedding.metadata_path = os.path.join(LOG_DIR, "metadata.tsv")
    projector.visualize_embeddings(train_writer, config)

    tf.global_variables_initializer().run()

    for step in range(1000):
        x_batch, y_batch = get_skipgram_batch(batch_size)
        summary, _ = sess.run(
            [merged, train_step],
            feed_dict={train_inputs: x_batch, train_labels: y_batch})
        train_writer.add_summary(summary, step)

        # Every 100 steps: save a checkpoint and report the loss on the
        # batch that was just used for training.
        if step % 100 == 0:
            saver.save(sess, os.path.join(LOG_DIR, "w2v_model.ckpt"), step)
            loss_value = sess.run(
                loss,
                feed_dict={train_inputs: x_batch, train_labels: y_batch})
            print("Loss at %d: %.5f" % (step, loss_value))

    # Flush pending summaries to disk and release the event file.
    train_writer.close()

    # Normalize embeddings before using: divide each row by its L2 norm.
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
    normalized_embeddings = embeddings / norm
    normalized_embeddings_matrix = sess.run(normalized_embeddings)
# Rank the vocabulary by similarity to the word "one". The matrix rows are
# already divided by their L2 norm, so each dot product is a cosine
# similarity (the name `cosine_dists` notwithstanding).
ref_word = normalized_embeddings_matrix[word2index_map["one"]]
cosine_dists = np.dot(normalized_embeddings_matrix, ref_word)
# Sort descending and skip position 0, which is expected to be "one" itself;
# keep up to nine neighbours.
ff = np.argsort(cosine_dists)[::-1][1:10]
for f in ff:
    print(index2word_map[f])
    print(cosine_dists[f])
上面的代码生成以下输出: