TensorFlow Twitter RNN Tweet Sentiment Analysis (with L2 Regularization)


github: github.com/yangjinghit…

import pandas as pd
import numpy as np
data = pd.read_csv('Tweets.csv')
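The CSV appears to be the public "Twitter US Airline Sentiment" dataset from Kaggle (distributed as Tweets.csv): roughly 14.6k tweets about US airlines, each labeled positive, neutral, or negative.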

data.head(2)
| | tweet_id | airline_sentiment | airline_sentiment_confidence | negativereason | negativereason_confidence | airline | airline_sentiment_gold | name | negativereason_gold | retweet_count | text | tweet_coord | tweet_created | tweet_location | user_timezone |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 0 | 570306133677760513 | neutral | 1.0000 | NaN | NaN | Virgin America | NaN | cairdin | NaN | 0 | @VirginAmerica What @dhepburn said. | NaN | 2015-02-24 11:35:52 -0800 | NaN | Eastern Time (US & Canada) |
| 1 | 570301130888122368 | positive | 0.3486 | NaN | 0.0 | Virgin America | NaN | jnardino | NaN | 0 | @VirginAmerica plus you've added commercials t... | NaN | 2015-02-24 11:15:59 -0800 | NaN | Pacific Time (US & Canada) |
data = data[['airline_sentiment', 'text']]
# write one tweet per line ('w' rather than 'a' so reruns don't append duplicates)
with open('twee', 'w', encoding='utf-8') as f:
    for string in data.text:
        f.write(string + '\n')
from gensim.models import word2vec
sentences = word2vec.Text8Corpus("twee")
# train 300-dimensional word vectors (gensim < 4.0 API; gensim >= 4.0 renames size to vector_size)
model = word2vec.Word2Vec(sentences, size=300)
word_vectors = model.wv
del model
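As a quick sanity check on the learned embeddings (not in the original post, and it assumes the token 'flight' made it into the vocabulary), nearest-neighbour queries should surface aviation-related words:

# hypothetical sanity check -- 'flight' is assumed to be in the vocabulary
print(word_vectors.most_similar('flight', topn=3))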
# map each tweet to a list of word vectors, dropping out-of-vocabulary words
data['vec'] = data.text.apply(lambda x: [word_vectors[w] for w in x.split() if w in word_vectors])
# keep only tweets with more than 5 in-vocabulary tokens
data = data[data['vec'].apply(lambda x: len(x) > 5)]
data.head(3)
| | airline_sentiment | text | vec |
| --- | --- | --- | --- |
| 1 | positive | @VirginAmerica plus you've added commercials t... | [[2.2402475, 0.15890086, -0.082046695, 0.80472... |
| 2 | neutral | @VirginAmerica I didn't today... Must mean I n... | [[2.2402475, 0.15890086, -0.082046695, 0.80472... |
| 3 | negative | @VirginAmerica it's really aggressive to blast... | [[2.2402475, 0.15890086, -0.082046695, 0.80472... |
del data['text']
data.airline_sentiment.unique()
array(['positive', 'neutral', 'negative'], dtype=object)
data.airline_sentiment.value_counts()
negative    9007
neutral     2789
positive    2013
Name: airline_sentiment, dtype: int64
# one-hot encode the three sentiment classes
dic = {'neutral': np.array([1, 0, 0]), 'positive': np.array([0, 1, 0]), 'negative': np.array([0, 0, 1])}
data['cat'] = data.airline_sentiment.map(dic)
del data['airline_sentiment']
data.columns
Index(['vec', 'cat'], dtype='object')
data = data.reset_index()
del data['index']
maxlength = max(len(x) for x in data.vec)
maxlength
36
data.head(2)
| | vec | cat |
| --- | --- | --- |
| 0 | [[2.2402475, 0.15890086, -0.082046695, 0.80472... | [0, 1, 0] |
| 1 | [[2.2402475, 0.15890086, -0.082046695, 0.80472... | [1, 0, 0] |
def pad(x):
    # right-pad every sequence with all-zero 300-d vectors up to maxlength timesteps
    xl = np.zeros((maxlength, 300))
    xl[:len(x)] = x
    return xl
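A toy check of pad (illustration only, not in the original): a two-token "sentence" comes back as a (36, 300) array whose rows beyond the real tokens stay zero.

toy = [np.ones(300), 2 * np.ones(300)]  # pretend tweet with 2 word vectors
padded = pad(toy)
print(padded.shape)      # (36, 300)
print(padded[2:].sum())  # 0.0 -- everything past the two real rows is zero padding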
dataset = data.vec.apply(pad)
dataset.head(2)
0    [[2.2402474880218506, 0.15890085697174072, -0....
1    [[2.2402474880218506, 0.15890085697174072, -0....
Name: vec, dtype: object
len(dataset)
13809
labels = np.concatenate(data.cat).reshape(len(data.cat), -1)

np.shape(labels)
(13809, 3)
data_ = np.concatenate(dataset).reshape(len(dataset), maxlength, 300)
np.shape(data_)
(13809, 36, 300)
# shuffle, then hold out the last 1809 examples as a test set
index = np.random.permutation(int(len(data)))
label = labels[index]
dataset = data_[index]
label_train = label[:12000]
dataset_train = dataset[:12000]
label_test = label[12000:]
dataset_test = dataset[12000:]
import tensorflow as tf
learning_rate = 0.005
batch_size = 300
n_input = 300        # word-vector dimensionality
n_steps = maxlength  # 36 timesteps after padding
n_hidden = 128       # GRU hidden units
n_classes = 3
x = tf.placeholder(tf.float32, [None, n_steps, n_input])
y = tf.placeholder(tf.float32, [None, n_classes])
output_keep_prob = tf.placeholder("float")
reg = tf.contrib.layers.l2_regularizer(scale=0.01)  # the L2 penalty applied below

def length(shuju):
    # number of non-padded timesteps per example (shuju = "data"; see the note below)
    return tf.reduce_sum(tf.sign(tf.reduce_max(tf.abs(shuju), axis=2)), axis=1)
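Why this works: padded timesteps are all-zero vectors, so taking tf.abs and then the max over the feature axis yields 0 exactly at the padding and something positive at real tokens; tf.sign turns that into a 0/1 mask, and summing the mask over time gives the unpadded length. The same computation in NumPy (illustration only):

batch = np.zeros((1, 4, 3))                # 1 example, 4 timesteps, 3 features
batch[0, :2] = 1.0                         # 2 real timesteps, 2 padded
mask = np.sign(np.abs(batch).max(axis=2))  # [[1., 1., 0., 0.]]
print(mask.sum(axis=1))                    # [2.]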
cell = tf.contrib.rnn.DropoutWrapper(tf.contrib.rnn.GRUCell(n_hidden,
                                                           kernel_initializer = tf.truncated_normal_initializer(stddev= 0.0001),
                                                           bias_initializer = tf.truncated_normal_initializer(stddev=0.0001)),
                                    output_keep_prob = output_keep_prob)

output, _ = tf.nn.dynamic_rnn(
            cell,
            x,
            dtype=tf.float32,
            sequence_length= length(x))
output.get_shape()
TensorShape([Dimension(None), Dimension(36), Dimension(128)])
# pick out each example's output at its last real (non-padded) timestep
index = tf.range(0, batch_size) * n_steps + (tf.cast(length(x), tf.int32) - 1)
flat = tf.reshape(output, [-1, int(output.get_shape()[2])])
last = tf.gather(flat, index)
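output has shape (batch_size, n_steps, n_hidden), but for a padded sequence the useful state sits at the last real timestep, not at step n_steps - 1. Flattening to (batch_size * n_steps, n_hidden) puts the output of example i at timestep t in row i * n_steps + t, so row i * n_steps + (length_i - 1) is exactly the last valid output: with n_steps = 36, an example at batch position 2 with 10 real tokens is picked up from row 2 * 36 + 9 = 81. Note that tf.range(0, batch_size) hard-codes the batch size, so every batch fed to this graph must contain exactly 300 examples.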
fc_1 = tf.contrib.layers.fully_connected(
                        last,
                        64,
                        weights_initializer = tf.truncated_normal_initializer(stddev=0.01),
                        activation_fn = tf.nn.relu)
keep_prob = tf.placeholder("float")
fc1_drop = tf.nn.dropout(fc_1, keep_prob)
weight = tf.Variable(tf.truncated_normal([64, n_classes],stddev=0.001))
bias = tf.Variable(tf.constant(0.1, shape=[n_classes]))
prediction = tf.nn.softmax(tf.matmul(fc1_drop, weight) + bias)
cross_entropy = -tf.reduce_sum(y * tf.log(prediction))
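Taking softmax first and log afterwards is numerically fragile: a probability that underflows to 0 makes tf.log return -inf. A more stable variant (a sketch, not what the original code runs; softmax_cross_entropy_with_logits_v2 requires TF >= 1.5) keeps the raw logits and lets TensorFlow fuse both steps:

# stable alternative: feed logits rather than probabilities
logits = tf.matmul(fc1_drop, weight) + bias
cross_entropy_stable = tf.reduce_sum(
    tf.nn.softmax_cross_entropy_with_logits_v2(labels=y, logits=logits))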
# register the 0.01-scaled L2 penalty for every trainable variable
weights = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
tf.contrib.layers.apply_regularization(reg, weights_list=weights)
<tf.Tensor 'get_regularization_penalty:0' shape=() dtype=float32>
reg_ws = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
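apply_regularization returns the summed penalty and also adds it to the GraphKeys.REGULARIZATION_LOSSES collection, which is why it can be read back with get_collection here. And since weights_list was set to all trainable variables, the penalty covers the GRU kernels and biases as well as the output layer, not just the weight matrices.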
optimizer = tf.train.AdamOptimizer(learning_rate, beta1=0.9)
# total loss = cross entropy + the summed L2 penalties
grads = optimizer.compute_gradients(cross_entropy + tf.reduce_sum(reg_ws))
# clip each gradient to norm 5 to keep the RNN update steps stable
for i, (g, v) in enumerate(grads):
    if g is not None:
        grads[i] = (tf.clip_by_norm(g, 5), v)
train_op = optimizer.apply_gradients(grads)
correct_pred = tf.equal(tf.argmax(prediction, 1), tf.argmax(y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

def generatebatch(X, Y, n_examples, batch_size):
    # yield consecutive full batches; a trailing partial batch is dropped, which
    # the graph requires since `index` assumes exactly batch_size examples
    for batch_i in range(n_examples // batch_size):
        start = batch_i * batch_size
        end = start + batch_size
        yield X[start:end], Y[start:end]
sess = tf.Session()

init = tf.global_variables_initializer()
sess.run(init)
saver = tf.train.Saver()
for step in range(18):
    # reshuffle the training set every epoch
    index_ = np.random.permutation(int(len(dataset_train)))
    dataset_train = dataset_train[index_]
    label_train = label_train[index_]
    for batch_x, batch_y in generatebatch(dataset_train, label_train, len(label_train), batch_size):
        sess.run(train_op, feed_dict={x: batch_x, y: batch_y, keep_prob: 0.5, output_keep_prob: 0.5})
    # report loss/accuracy on the last training batch, with dropout switched off
    acc = sess.run(accuracy, feed_dict={x: batch_x, y: batch_y, keep_prob: 1, output_keep_prob: 1})
    loss = sess.run(cross_entropy, feed_dict={x: batch_x, y: batch_y, keep_prob: 1, output_keep_prob: 1})
    saver.save(sess, './lesson0', global_step=step)
    print("Iter " + str(step) + ", Minibatch Loss = " + "{:.6f}".format(loss) + ", Training Accuracy = " + "{:.5f}".format(acc))
print("Optimization Finished!")
Iter 0, Minibatch Loss = 214.256958, Training Accuracy = 0.66667
Iter 1, Minibatch Loss = 173.106171, Training Accuracy = 0.76333
Iter 2, Minibatch Loss = 163.925598, Training Accuracy = 0.80333
Iter 3, Minibatch Loss = 158.836716, Training Accuracy = 0.77667
Iter 4, Minibatch Loss = 155.008820, Training Accuracy = 0.79667
Iter 5, Minibatch Loss = 131.040298, Training Accuracy = 0.83667
Iter 6, Minibatch Loss = 133.507889, Training Accuracy = 0.80667
Iter 7, Minibatch Loss = 114.443909, Training Accuracy = 0.86333
Iter 8, Minibatch Loss = 103.080223, Training Accuracy = 0.86333
Iter 9, Minibatch Loss = 99.932602, Training Accuracy = 0.90000
Iter 10, Minibatch Loss = 93.207428, Training Accuracy = 0.86000
Iter 11, Minibatch Loss = 67.471329, Training Accuracy = 0.93000
Iter 12, Minibatch Loss = 62.449608, Training Accuracy = 0.92333
Iter 13, Minibatch Loss = 50.676277, Training Accuracy = 0.93000
Iter 14, Minibatch Loss = 55.832417, Training Accuracy = 0.92333
Iter 15, Minibatch Loss = 44.194443, Training Accuracy = 0.96333
Iter 16, Minibatch Loss = 30.585236, Training Accuracy = 0.95667
Iter 17, Minibatch Loss = 48.206429, Training Accuracy = 0.94333
Optimization Finished!
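The held-out split (dataset_test, label_test) is built above but never actually scored. A minimal evaluation sketch (not in the original post) reuses generatebatch; because the graph's index tensor hard-codes batch_size = 300, the trailing 9 of the 1809 test examples are dropped:

test_accs = []
for batch_x, batch_y in generatebatch(dataset_test, label_test, len(label_test), batch_size):
    test_accs.append(sess.run(accuracy, feed_dict={x: batch_x, y: batch_y,
                                                   keep_prob: 1, output_keep_prob: 1}))
print("Test Accuracy = {:.5f}".format(np.mean(test_accs)))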