1. Basic concepts
(1). N vs N (the input and output sequences must have the same length)
(2). N vs 1 (the input is a sequence and the output is a single value rather than a sequence; we only apply the output transformation to the last hidden state h. This structure is typically used for sequence classification, e.g. classifying a piece of text into a category, or judging the sentiment of a sentence. A minimal sketch of this case follows after the list.)
(3). 1 vs N (e.g. image captioning: the input X is the image features and the output sequence y is a sentence)
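A minimal numpy sketch of the N vs 1 case mentioned above (all sizes and weights are made up for illustration): one RNN cell is applied across the whole input sequence, and only the final hidden state goes through the output transformation.
import numpy as np

T, emb, h_dim, n_classes = 5, 8, 4, 3            # hypothetical sizes
Wh = np.random.randn(h_dim, emb + h_dim) * 0.1   # recurrent weights
bh = np.zeros((h_dim, 1))
Wy = np.random.randn(n_classes, h_dim) * 0.1     # output weights, applied only once

h = np.zeros((h_dim, 1))
for x in np.random.randn(T, emb, 1):             # walk over the T input steps
    h = np.tanh(Wh @ np.concatenate([h, x]) + bh)

logits = Wy @ h                                  # one output for the whole sequence
print(logits.shape)                              # (3, 1)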
References:
- zhuanlan.zhihu.com/p/28054589
- LSTM: colah.github.io/posts/2015-…
- Causes of vanishing and exploding gradients in RNNs: zhuanlan.zhihu.com/p/28687529
- How LSTMs mitigate vanishing gradients: www.zhihu.com/question/44…
Summary: the sigmoid function's value is bounded above by 1, not 1/4, so when the bias is large the sigmoid (gate) value can get very close to 1; the repeated product then decays slowly, and information from distant time steps can still propagate through. For a plain RNN, however, backpropagation multiplies sigmoid derivatives, each at most 1/4, so the product quickly goes to 0. As a result, an RNN effectively only captures short-range effects, and its learned parameters cannot accurately model the influence of distant information.
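The contrast can be made concrete with a quick product calculation (my addition; the numbers are only illustrative):
import numpy as np

# Plain RNN: backprop multiplies sigmoid *derivatives*, each at most 0.25.
print(np.prod(np.full(20, 0.25)))   # ~9.1e-13 over 20 steps -- the gradient vanishes

# LSTM/GRU gate: the sigmoid *value* multiplies the state, and with a large bias
# it can sit close to 1, so the product shrinks much more slowly.
print(np.prod(np.full(20, 0.95)))   # ~0.36 over 20 steps -- distant information survives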
2. Forward propagation code for RNNs and GRUs
import numpy as np
from numpy import random
from time import perf_counter

def sigmoid(x):   # Sigmoid function
    return 1.0 / (1.0 + np.exp(-x))
Forward method for vanilla RNNs and GRUs:
random.seed(10) # Random seed, so your results match ours
emb = 128 # Embedding size
T = 256 # Length of the sequence (number of time steps)
h_dim = 16 # Hidden state dimension
h_0 = np.zeros((h_dim, 1)) # Initial hidden state
# Random initialization of weights and biases
w1 = random.standard_normal((h_dim, emb+h_dim))
w2 = random.standard_normal((h_dim, emb+h_dim))
w3 = random.standard_normal((h_dim, emb+h_dim))
b1 = random.standard_normal((h_dim, 1))
b2 = random.standard_normal((h_dim, 1))
b3 = random.standard_normal((h_dim, 1))
X = random.standard_normal((T, emb, 1))
weights = [w1, w2, w3, b1, b2, b3]
print(X[1].shape)
def forward_V_RNN(inputs, weights): # Forward propagation for a single vanilla RNN cell
    x, h_t = inputs

    # weights
    wh, _, _, bh, _, _ = weights

    # new hidden state
    h_t = np.dot(wh, np.concatenate([h_t, x])) + bh
    h_t = sigmoid(h_t)

    return h_t, h_t
forward_V_RNN([X[1],h_0], weights)[0]
def forward_GRU(inputs, weights): # Forward propagation for a single GRU cell
    x, h_t = inputs

    # weights
    wu, wr, wc, bu, br, bc = weights

    # Update gate
    ### START CODE HERE (1-2 LINES) ###
    u = np.dot(wu, np.concatenate([h_t, x])) + bu
    u = sigmoid(u)
    ### END CODE HERE ###

    # Relevance gate
    ### START CODE HERE (1-2 LINES) ###
    r = np.dot(wr, np.concatenate([h_t, x])) + br
    r = sigmoid(r)
    ### END CODE HERE ###

    # Candidate hidden state
    ### START CODE HERE (1-2 LINES) ###
    c = np.dot(wc, np.concatenate([r * h_t, x])) + bc
    c = np.tanh(c)
    ### END CODE HERE ###

    # New hidden state h_t
    h_t = u * c + (1 - u) * h_t

    return h_t, h_t
forward_GRU([X[1],h_0], weights)[0]
array([[ 9.77779014e-01],
[-9.97986240e-01],
[-5.19958083e-01],
[-9.99999886e-01],
[-9.99707004e-01],
[-3.02197037e-04],
[-9.58733503e-01],
[ 2.10804828e-02],
[ 9.77365398e-05],
[ 9.99833090e-01],
[ 1.63200940e-08],
[ 8.51874303e-01],
[ 5.21399924e-02],
[ 2.15495959e-02],
[ 9.99878828e-01],
[ 9.77165472e-01]])
def scan(fn, elems, weights, h_0=None): # Forward propagation for RNNs
    h_t = h_0
    ys = []

    for x in elems:
        ### START CODE HERE (1 LINE) ###
        y, h_t = fn([x, h_t], weights)
        ### END CODE HERE ###
        ys.append(y)

    return ys, h_t
ys, h_T = scan(forward_V_RNN, X, weights, h_0)
ys, h_T = scan(forward_GRU, X, weights, h_0)
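perf_counter is imported above but never used; a small sketch (my addition) of how it can time both cells over the full 256-step sequence. Absolute numbers depend on the machine; the point is simply that the GRU cell does roughly three times as many matrix products per step as the vanilla cell.
tic = perf_counter()
ys, h_T = scan(forward_V_RNN, X, weights, h_0)
toc = perf_counter()
print(f"Vanilla RNN forward pass: {(toc - tic) * 1000:.2f} ms")

tic = perf_counter()
ys, h_T = scan(forward_GRU, X, weights, h_0)
toc = perf_counter()
print(f"GRU forward pass: {(toc - tic) * 1000:.2f} ms")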
3. Deep and Bi-directional RNNs
First, you compute the hidden states for the current layer. Then you take those activations, pass them to the next hidden layer, and repeat the process. In other words, you first propagate information through time, then go deeper in the network and repeat for each layer until you reach your predictions. In a bidirectional RNN, a second pass additionally runs over the sequence in the reverse direction, and the hidden states of the two directions are combined at each time step. A minimal sketch of the stacked (deep) case follows.
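A minimal sketch of a two-layer (deep) GRU built from the scan and forward_GRU functions of section 2 (the stacking scheme and weight shapes below are my own illustration, not course code): layer 1 is run over all time steps first, then its per-step hidden states become the input sequence for layer 2.
def make_weights(in_dim, h_dim):
    # one weight matrix and one bias per gate (update, relevance, candidate)
    ws = [random.standard_normal((h_dim, in_dim + h_dim)) for _ in range(3)]
    bs = [random.standard_normal((h_dim, 1)) for _ in range(3)]
    return ws + bs

layer1_weights = make_weights(emb, h_dim)     # layer 1 reads the embedded inputs
layer2_weights = make_weights(h_dim, h_dim)   # layer 2 reads layer 1's hidden states

ys1, _ = scan(forward_GRU, X, layer1_weights, h_0)    # propagate through time in layer 1
ys2, _ = scan(forward_GRU, ys1, layer2_weights, h_0)  # then go deeper and repeat for layer 2
print(ys2[-1].shape)                                  # (16, 1)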
4. GRU-based character generation (code example)
(1). Data generator
import os
import trax
import trax.fastmath.numpy as np
import pickle
import numpy
import random as rnd
from trax import fastmath
from trax import layers as tl
def line_to_tensor(line, EOS_int=1):
    """Turns a line of text into a tensor

    Args:
        line (str): A single line of text.
        EOS_int (int, optional): End-of-sentence integer. Defaults to 1.

    Returns:
        list: a list of integers (unicode values) for the characters in the `line`.
    """
    # Initialize the tensor as an empty list
    tensor = []

    # for each character:
    for c in line:
        # convert to unicode int
        c_int = ord(c)
        # append the unicode integer to the tensor list
        tensor.append(c_int)

    # include the end-of-sentence integer
    tensor.append(EOS_int)

    return tensor
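A quick sanity check (my addition): every character maps to its Unicode code point and the EOS integer (1 by default) is appended at the end.
print(line_to_tensor('abc'))   # [97, 98, 99, 1]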
def data_generator(batch_size, max_length, data_lines, line_to_tensor=line_to_tensor, shuffle=True):
    """Generator function that yields batches of data

    Args:
        batch_size (int): number of examples (in this case, sentences) per batch.
        max_length (int): maximum length of the output tensor.
            NOTE: max_length includes the end-of-sentence character that will be added
            to the tensor. Keep in mind that the length of the tensor is always
            1 + the length of the original line of characters.
        data_lines (list): list of the sentences to group into batches.
        line_to_tensor (function, optional): function that converts line to tensor. Defaults to line_to_tensor.
        shuffle (bool, optional): True if the generator should generate random batches of data. Defaults to True.

    Yields:
        tuple: two copies of the batch (jax.interpreters.xla.DeviceArray) and mask (jax.interpreters.xla.DeviceArray).
        NOTE: jax.interpreters.xla.DeviceArray is trax's version of numpy.ndarray.
        The batch is yielded as three parts: inputs, targets, mask. The inputs and targets
        are identical; the targets are used to evaluate the predictions, and the mask is 1
        for non-padding tokens.
    """
    # initialize the index that points to the current position in the lines index array
    index = 0

    # initialize the list that will contain the current batch
    cur_batch = []

    # count the number of lines in data_lines
    num_lines = len(data_lines)

    # create an array with the indexes of data_lines that can be shuffled
    lines_index = [*range(num_lines)]

    # shuffle line indexes if shuffle is set to True
    if shuffle:
        rnd.shuffle(lines_index)

    while True:

        # if the index is greater than or equal to the number of lines in data_lines
        if index >= num_lines:
            # then reset the index to 0
            index = 0
            # shuffle line indexes if shuffle is set to True
            if shuffle:
                rnd.shuffle(lines_index)

        # get a line at the `lines_index[index]` position in data_lines
        line = data_lines[lines_index[index]]

        # if the length of the line is less than max_length, append it to the current batch
        if len(line) < max_length:
            cur_batch.append(line)

        # increment the index by one (lines that are too long are simply skipped)
        index += 1

        # if the current batch is now equal to the desired batch size
        if len(cur_batch) == batch_size:

            batch = []
            mask = []

            # go through each line (li) in cur_batch
            for li in cur_batch:
                # convert the line (li) to a tensor of integers
                tensor = line_to_tensor(li)

                # Create a list of zeros to represent the padding
                # so that the tensor plus padding will have length `max_length`
                pad = [0] * (max_length - len(tensor))

                # combine the tensor plus pad
                tensor_pad = tensor + pad

                # append the padded tensor to the batch
                batch.append(tensor_pad)

                # The mask is 1 wherever tensor_pad is not 0 and 0 wherever it is 0,
                # e.g. if tensor_pad is [1, 2, 3, 0, 0, 0] the mask is [1, 1, 1, 0, 0, 0]
                example_mask = [1 if x != 0 else 0 for x in tensor_pad]
                mask.append(example_mask)

            # convert the batch (data type list) to a trax numpy array
            batch_np_arr = np.array(batch)
            mask_np_arr = np.array(mask)

            # Yield two copies of the batch and the mask.
            yield batch_np_arr, batch_np_arr, mask_np_arr

            # reset the current batch to an empty list
            cur_batch = []
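A minimal usage sketch for the generator (my addition; the toy lines, batch size and max_length are hypothetical). With shuffle=False the first batch is deterministic, so the padding and the mask can be verified by hand:
tmp_lines = ['ab', 'cde']
tmp_gen = data_generator(batch_size=2, max_length=10, data_lines=tmp_lines, shuffle=False)
tmp_batch, tmp_targets, tmp_mask = next(tmp_gen)
print(tmp_batch)    # [[ 97  98   1   0   0   0   0   0   0   0]
                    #  [ 99 100 101   1   0   0   0   0   0   0]]
print(tmp_mask[0])  # [1 1 1 0 0 0 0 0 0 0]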
(2). Model
def GRULM(vocab_size=256, d_model=512, n_layers=2, mode='train'):
    """Returns a GRU language model.

    Args:
        vocab_size (int, optional): Size of the vocabulary. Defaults to 256.
        d_model (int, optional): Depth of embedding (n_units in the GRU cell). Defaults to 512.
        n_layers (int, optional): Number of GRU layers. Defaults to 2.
        mode (str, optional): 'train', 'eval' or 'predict', predict mode is for fast inference. Defaults to "train".

    Returns:
        trax.layers.combinators.Serial: A GRU language model as a layer that maps from a tensor of tokens to activations over a vocab set.
    """
    model = tl.Serial(
        tl.ShiftRight(mode=mode),                                 # Stack the ShiftRight layer
        tl.Embedding(vocab_size=vocab_size, d_feature=d_model),   # Stack the embedding layer
        [tl.GRU(n_units=d_model) for _ in range(n_layers)],       # Stack n_layers GRU layers of d_model units
        tl.Dense(n_units=vocab_size),                             # Dense layer
        tl.LogSoftmax()                                           # Log Softmax
    )
    return model
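A quick way to double-check the architecture (my addition): instantiate the model with the defaults and print it; trax displays the Serial combinator together with its ShiftRight, Embedding, GRU, Dense and LogSoftmax sub-layers.
model = GRULM()
print(model)   # lists the stacked layers inside the Serial combinator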
(3). Training
from trax.supervised import training
def train_model(model, data_generator, batch_size=32, max_length=64, lines=lines, eval_lines=eval_lines, n_steps=1, output_dir='model/'):
    """Function that trains the model

    Args:
        model (trax.layers.combinators.Serial): GRU model.
        data_generator (function): Data generator function.
        batch_size (int, optional): Number of lines per batch. Defaults to 32.
        max_length (int, optional): Maximum length allowed for a line to be processed. Defaults to 64.
        lines (list, optional): List of lines to use for training. Defaults to lines.
        eval_lines (list, optional): List of lines to use for evaluation. Defaults to eval_lines.
        n_steps (int, optional): Number of steps to train. Defaults to 1.
        output_dir (str, optional): Relative path of directory to save model. Defaults to "model/".

    Returns:
        trax.supervised.training.Loop: Training loop for the model.
    """
    bare_train_generator = data_generator(batch_size, max_length, data_lines=lines)
    bare_eval_generator = data_generator(batch_size, max_length, data_lines=eval_lines)

    train_task = training.TrainTask(
        labeled_data=bare_train_generator,
        loss_layer=tl.CrossEntropyLoss(),        # Cross-entropy loss (instantiated object)
        optimizer=trax.optimizers.Adam(0.0005)   # Adam optimizer with learning rate 0.0005
    )

    eval_task = training.EvalTask(
        labeled_data=bare_eval_generator,
        metrics=[tl.CrossEntropyLoss(), tl.Accuracy()],  # Evaluate with cross-entropy loss and accuracy
        n_eval_batches=3                                 # For better evaluation accuracy in reasonable time
    )

    training_loop = training.Loop(model,
                                  train_task,
                                  eval_task=eval_task,
                                  output_dir=output_dir)

    training_loop.run(n_steps=n_steps)

    # We return this because it contains a handle to the model, which has the weights etc.
    return training_loop
training_loop = train_model(GRULM(), data_generator)
(4). Predict
model = GRULM()
model.init_from_file('model/model.pkl.gz')
print (model(numpy.array([[1,2,3]])).shape) #(1, 3, 256)
def gumbel_sample(log_probs, temperature=1.0):
    """Gumbel sampling from a categorical distribution."""
    u = numpy.random.uniform(low=1e-6, high=1.0 - 1e-6, size=log_probs.shape)
    g = -np.log(-np.log(u))
    return np.argmax(log_probs + g * temperature, axis=-1)
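What makes this work is the Gumbel-max trick: if u ~ Uniform(0, 1), then g = -log(-log(u)) is Gumbel-distributed, and argmax(log_probs + g) is an exact sample from the categorical distribution defined by log_probs. In this implementation the temperature scales the noise, so values near 0 behave like plain argmax (greedy decoding). A small empirical check (my addition; the frequencies vary from run to run):
probs = numpy.array([0.1, 0.3, 0.6])
samples = [int(gumbel_sample(numpy.log(probs))) for _ in range(2000)]
print([samples.count(i) / len(samples) for i in range(3)])   # roughly [0.1, 0.3, 0.6]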
def predict(num_chars, prefix):
    inp = [ord(c) for c in prefix]
    result = [c for c in prefix]
    max_len = len(prefix) + num_chars

    for _ in range(num_chars):
        cur_inp = np.array(inp + [0] * (max_len - len(inp)))
        outp = model(cur_inp[None, :])  # Add batch dim.
        next_char = gumbel_sample(outp[0, len(inp)])
        inp += [int(next_char)]

        if inp[-1] == 1:
            break  # EOS
        result.append(chr(int(next_char)))

    return "".join(result)
print(predict(5, "f"))
# Same prediction loop as above, but with argmax decoding instead of Gumbel sampling
# and with the intermediate tensors printed for inspection.
def predict(num_chars, prefix):
    inp = [ord(c) for c in prefix]
    print('begin')
    print(inp)
    result = [c for c in prefix]
    max_len = len(prefix) + num_chars
    print('max len:')
    print(max_len)
    print('\n')

    for _ in range(num_chars):
        cur_inp = np.array(inp + [0] * (max_len - len(inp)))
        print(cur_inp[None, :])
        outp = model(cur_inp[None, :])  # Add batch dim.
        print(outp)
        print('^^^^^^^^^^^^^^^^^^^^^^^^^^^^')
        print(outp[0, len(inp)])
        next_char = np.argmax(outp[0, len(inp)])
        inp += [int(next_char)]

        if inp[-1] == 1:
            break  # EOS
        result.append(chr(int(next_char)))
        print(result)
        print('\n')

    return "".join(result)
print(predict(3, "f"))
------------------------------------------------
Output:
begin
[102]
max len:
4
[[102 0 0 0]]
[[[-5.5498857 -5.539008 -5.5438304 ... -5.5491195 -5.543413 -5.5474577]
[-5.5621333 -5.5365143 -5.5547066 ... -5.5503917 -5.542101 -5.5518985]
[-5.565542 -5.5324373 -5.553304 ... -5.552781 -5.5410833 -5.553952 ]
[-5.565685 -5.5292797 -5.5489216 ... -5.5549684 -5.5403547 -5.5547576]]]
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[-5.5621333 -5.5365143 -5.5547066 ... -5.5503917 -5.542101 -5.5518985]
['f', ' ']
[[102 32 0 0]]
[[[-5.5498857 -5.539008 -5.5438304 ... -5.5491195 -5.543413 -5.5474577]
[-5.5621333 -5.5365143 -5.5547066 ... -5.5503917 -5.542101 -5.5518985]
[-5.569587 -5.538272 -5.55638 ... -5.5401726 -5.5351195 -5.564734 ]
[-5.569918 -5.534585 -5.552242 ... -5.5417237 -5.534379 -5.565476 ]]]
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[-5.569587 -5.538272 -5.55638 ... -5.5401726 -5.5351195 -5.564734 ]
['f', ' ', 'o']
[[102 32 111 0]]
[[[-5.5498857 -5.539008 -5.5438304 ... -5.5491195 -5.543413 -5.5474577]
[-5.5621333 -5.5365143 -5.5547066 ... -5.5503917 -5.542101 -5.5518985]
[-5.569587 -5.538272 -5.55638 ... -5.5401726 -5.5351195 -5.564734 ]
[-5.5647793 -5.5384717 -5.5544705 ... -5.543451 -5.5318937 -5.5628376]]]
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[-5.5647793 -5.5384717 -5.5544705 ... -5.543451 -5.5318937 -5.5628376]
['f', ' ', 'o', ' ']
f o
5. LSTM for named entity recognition (code example)
(1). Data generator
#!pip -q install trax==1.3.1
import trax
from trax import layers as tl
import os
import numpy as np
import pandas as pd
import random as rnd
def data_generator(batch_size, x, y, pad, shuffle=False, verbose=False):
    '''
    Input:
        batch_size - integer describing the batch size
        x - list containing sentences where words are represented as integers
        y - list containing tags associated with the sentences
        shuffle - Shuffle the data order
        pad - an integer representing a pad character
        verbose - Print information during runtime
    Output:
        a tuple containing 2 elements:
        X - np.ndarray of dim (batch_size, max_len) of padded sentences
        Y - np.ndarray of dim (batch_size, max_len) of tags associated with the sentences in X
    '''
    # count the number of lines in data_lines
    num_lines = len(x)

    # create an array with the indexes of data_lines that can be shuffled
    lines_index = [*range(num_lines)]

    # shuffle the indexes if shuffle is set to True
    if shuffle:
        rnd.shuffle(lines_index)

    index = 0   # tracks current location in x, y
    while True:
        buffer_x = [0] * batch_size   # Temporary array to store the raw x data for this batch
        buffer_y = [0] * batch_size   # Temporary array to store the raw y data for this batch

        # Copy into the temporary buffers the sentences in x[index : index + batch_size]
        # along with their corresponding labels y[index : index + batch_size].
        # Find the maximum length of sentences in x[index : index + batch_size] for this batch.
        # Reset the index if we reach the end of the data set, and re-shuffle the indexes if needed.
        max_len = 0
        for i in range(batch_size):
            # if the index is greater than or equal to the number of lines in x
            if index >= num_lines:
                # then reset the index to 0
                index = 0
                # re-shuffle the indexes if shuffle is set to True
                if shuffle:
                    rnd.shuffle(lines_index)

            # The current position is obtained using `lines_index[index]`
            # Store the x value at the current position into buffer_x
            buffer_x[i] = x[lines_index[index]]

            # Store the y value at the current position into buffer_y
            buffer_y[i] = y[lines_index[index]]

            lenx = len(x[lines_index[index]])   # length of current x[]
            if lenx > max_len:
                max_len = lenx                  # max_len tracks the longest x[]

            # increment index by one
            index += 1

        # create X, Y: NumPy arrays of size (batch_size, max_len) 'full' of the pad value
        X = np.full((batch_size, max_len), pad)
        Y = np.full((batch_size, max_len), pad)

        # copy values from the lists to the NumPy arrays, using the buffered values
        for i in range(batch_size):
            # get the example (sentence as a tensor) in `buffer_x` at the `i` index
            x_i = buffer_x[i]

            # similarly, get the example's labels in `buffer_y` at the `i` index
            y_i = buffer_y[i]

            # Walk through each word in x_i
            for j in range(len(x_i)):
                # store the word in x_i at position j into X
                X[i, j] = x_i[j]

                # store the label in y_i at position j into Y
                Y[i, j] = y_i[j]

        if verbose: print("index=", index)
        yield((X, Y))
Sample output:
batch_size = 5
X[0][:]:
[ 0 1 2 3 4 5 6 7 8 9 10 11
12 13 14 9 15 1 16 17 18 19 20 21
35180 35180 35180 35180 35180 35180]
Y[0][:]:
[ 0 0 0 0 0 0 1 0 0 0 0 0
1 0 0 0 0 0 2 0 0 0 0 0
35180 35180 35180 35180 35180 35180]
(2). Model
def NER(vocab_size=35181, d_model=50, tags=tag_map):
    '''
    Input:
        vocab_size - integer containing the size of the vocabulary
        d_model - integer describing the embedding size
    Output:
        model - a trax serial model
    '''
    model = tl.Serial(
        tl.Embedding(vocab_size, d_model),  # Embedding layer
        tl.LSTM(d_model),                   # LSTM layer
        tl.Dense(len(tags)),                # Dense layer with len(tags) units
        tl.LogSoftmax()                     # LogSoftmax layer
    )
    return model
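A small shape check (my addition, assuming tag_map holds the 17 NER tags used later in this section): after initializing the weights with a dummy signature, the model maps a batch of token ids to one row of tag log-probabilities per token.
model = NER()
model.init(trax.shapes.ShapeDtype((1, 1), dtype=np.int32))
fake_batch = np.zeros((2, 6), dtype=np.int32)   # 2 sentences, 6 tokens each (hypothetical)
print(model(fake_batch).shape)                  # (2, 6, 17)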
(3). Training
from trax.supervised import training
batch_size = 64
# Create training data, mask pad id=35180 for training.
train_generator = trax.supervised.inputs.add_loss_weights(
data_generator(batch_size, t_sentences, t_labels, vocab['<PAD>'], True),
id_to_mask=vocab['<PAD>'])
# Create validation data, mask pad id=35180 for evaluation.
eval_generator = trax.supervised.inputs.add_loss_weights(
data_generator(batch_size, v_sentences, v_labels, vocab['<PAD>'], True),
id_to_mask=vocab['<PAD>'])
Sample output:
X1, Y1, Z1 = next(train_generator)
print(X1[0][:], "\n", Y1[0][:], "\n", Z1[0][:])
X1[0][:]:
[ 1641 151 9 934 68 340 1577 3179 13 78 53 9
2332 1 9 1511 312 1053 1054 93 157 575 1578 158
1580 1033 45 1581 19 158 5798 437 1088 324 93 5004
45 1587 21 35180 35180 35180 35180]
Y1[0][:]:
[ 1 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 35180 35180 35180 35180]
Z1[0][:]
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0.]
def train_model(NER, train_generator, eval_generator, train_steps=1, output_dir='model'):
    '''
    Input:
        NER - the model you are building
        train_generator - The data generator for training examples
        eval_generator - The data generator for validation examples
        train_steps - number of training steps
        output_dir - folder to save your model
    Output:
        training_loop - a trax supervised training Loop
    '''
    train_task = training.TrainTask(
        train_generator,                        # A train data generator
        loss_layer=tl.CrossEntropyLoss(),       # A cross-entropy loss function
        optimizer=trax.optimizers.Adam(0.01),   # The Adam optimizer
    )

    eval_task = training.EvalTask(
        labeled_data=eval_generator,                      # A labeled data generator
        metrics=[tl.CrossEntropyLoss(), tl.Accuracy()],   # Evaluate with cross-entropy loss and accuracy
        n_eval_batches=10                                 # Number of batches to use on each evaluation
    )

    training_loop = training.Loop(
        NER,                    # A model to train
        train_task,             # A train task
        eval_task=eval_task,    # The evaluation task
        output_dir=output_dir)  # The output directory

    # Train with train_steps (batches)
    training_loop.run(n_steps=train_steps)
    return training_loop
training_loop = train_model(NER(), train_generator, eval_generator, 100)
(4). Predict
# loading in a pretrained model..
model = NER()
model.init(trax.shapes.ShapeDtype((1, 1), dtype=np.int32))
# Load the pretrained model
model.init_from_file('model/model.pkl.gz', weights_only=True)
def predict(sentence, model, vocab, tag_map):
    s = [vocab[token] if token in vocab else vocab['UNK'] for token in sentence.split(' ')]
    batch_data = np.ones((1, len(s)))
    batch_data[0][:] = s
    sentence = np.array(batch_data).astype(int)
    print(sentence.shape)
    output = model(sentence)
    print(output.shape)
    outputs = np.argmax(output, axis=2)
    print(outputs)
    labels = list(tag_map.keys())
    pred = []
    for i in range(len(outputs[0])):
        idx = outputs[0][i]
        pred_label = labels[idx]
        pred.append(pred_label)
    return pred
sentence = "Peter Navarro, the White House director of trade and manufacturing policy of U.S, said in an interview on Sunday morning that the White House was working to prepare for the possibility of a second wave of the coronavirus in the fall, though he said it wouldn’t necessarily come"
predictions = predict(sentence, model, vocab, tag_map)
for x, y in zip(sentence.split(' '), predictions):
    if y != 'O':
        print(x, y)
Sample output:
(1, 48)
(1, 48, 17)
[[ 3 10 0 5 6 0 0 0 0 0 0 0 0 0 0 0 0 0 7 12 0 0 5 6
0 0 0 0 0 0 0 0 0 0 0 0 0 7 0 0 7 0 0 0 0 0 0 0]]
Peter B-per
Navarro, I-per
White B-org
House I-org
Sunday B-tim
morning I-tim
White B-org
House I-org
coronavirus B-tim
fall, B-tim
# np.argmax with axis=2 picks, for each token position, the index of the highest score:
a = np.array([[[3, 2], [3, 4], [8, 2]]])
np.argmax(a, axis=2)
array([[0, 1, 0]], dtype=int64)
6. Bi-LSTM + CRF for named entity recognition
Reference:
a). confusedcoders.com/data-scienc…