[机器学习读书笔记] - Trax 舆情判别

660 阅读2分钟

1. Sentiment Analysis

(1). Data generator

Build a data generator that takes in the positive/negative tweets and returns a batch of training examples. It returns the model inputs, the targets (positive or negative labels) and the weight for each target (e.g. this allows us to treat some examples as more important to get right than others, but commonly these will all be 1.0)

def data_generator(data_pos, data_neg, batch_size, vocab_dict, shuffle=False):
    '''Yield balanced batches of positive and negative examples, forever.

    Input:
        data_pos - Set of positive examples; each example is a list of
                   tokens, or a whitespace-separated string of tokens
        data_neg - Set of negative examples (same format as data_pos)
        batch_size - number of samples per batch. Must be even
        vocab_dict - The words dictionary (token -> integer id); unknown
                     tokens map to vocab_dict['__UNK__'] when present,
                     otherwise to 0
        shuffle - Shuffle the data order
    Yield:
        inputs - int array of shape (batch_size, max_len): first half
                 positive, second half negative examples, right-padded
                 to the longest example in this batch
        targets - int array of shape (batch_size,): 1 = positive, 0 = negative
        example_weights - array of ones, shape (batch_size,): every example
                          counts equally in the loss
    '''
    # Local imports keep this cell self-contained in the notebook.
    import random
    import numpy as np

    if batch_size % 2 != 0:
        raise ValueError('batch_size must be even')

    # Pad id: the sample output pads with 0; fall back to 0 when the
    # vocabulary has no explicit '__PAD__' entry.
    pad_id = vocab_dict.get('__PAD__', 0)
    unk_id = vocab_dict.get('__UNK__', 0)

    def to_ids(example):
        # Accept either a pre-tokenized list or a raw string of tokens.
        tokens = example.split() if isinstance(example, str) else example
        return [vocab_dict.get(tok, unk_id) for tok in tokens]

    half = batch_size // 2
    pos_idx = list(range(len(data_pos)))
    neg_idx = list(range(len(data_neg)))
    if shuffle:
        random.shuffle(pos_idx)
        random.shuffle(neg_idx)

    pos_ptr = 0
    neg_ptr = 0
    # Infinite generator: trax's training Loop pulls batches indefinitely.
    while True:
        batch = []
        for _ in range(half):
            if pos_ptr >= len(pos_idx):  # wrap around (and reshuffle) at the end
                pos_ptr = 0
                if shuffle:
                    random.shuffle(pos_idx)
            batch.append(to_ids(data_pos[pos_idx[pos_ptr]]))
            pos_ptr += 1
        for _ in range(half):
            if neg_ptr >= len(neg_idx):
                neg_ptr = 0
                if shuffle:
                    random.shuffle(neg_idx)
            batch.append(to_ids(data_neg[neg_idx[neg_ptr]]))
            neg_ptr += 1

        # Right-pad every example to the longest one in this batch.
        max_len = max(len(t) for t in batch)
        inputs = np.array([t + [pad_id] * (max_len - len(t)) for t in batch])
        targets = np.array([1] * half + [0] * half)
        example_weights = np.ones(batch_size, dtype=int)

        yield inputs, targets, example_weights

# Batch generator over the training split (train_pos / train_neg).
def train_generator(batch_size, shuffle = False):
    """Return a data_generator bound to the training tweets and Vocab."""
    return data_generator(data_pos=train_pos, data_neg=train_neg,
                          batch_size=batch_size, vocab_dict=Vocab,
                          shuffle=shuffle)

# Batch generator over the validation split (val_pos / val_neg).
def val_generator(batch_size, shuffle = False):
    """Return a data_generator bound to the validation tweets and Vocab."""
    return data_generator(data_pos=val_pos, data_neg=val_neg,
                          batch_size=batch_size, vocab_dict=Vocab,
                          shuffle=shuffle)

输出示例:
[[1065  136  479 2351  745 8148 1123  745   53    2 2672  791    2    2
   349  601    2 3489 1017  597 4559    9 1065  157    2    2]
 [ 444    2  304  567   56    9    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0]
 [ 127   92 1595 1085 3761    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0]
 [8460    2 3761  335   37    2    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0]] [1 1 0 0] [1 1 1 1]
---------------

[[  60 2992    2   22  236 1292   45 1354  118]
 [3495   17  443 8821  443    2  179   92    9]
 [ 460 1244 2063 2440    2    2 8232 2035 3761]
 [   2 3761    0    0    0    0    0    0    0]] [1 1 0 0] [1 1 1 1]
---------------

(2). Model

from trax import layers as tl

def classifier(vocab_size=len(Vocab), embedding_dim=256, output_dim=2, mode='train'):
    """Build the sentiment classifier: embed -> mean-pool -> dense -> log-softmax.

    Args:
        vocab_size: size of the vocabulary (rows of the embedding table).
        embedding_dim: dimension of each word embedding vector.
        output_dim: number of output classes (2: positive / negative).
        mode: 'train' or 'eval' (kept for API compatibility; unused here).

    Returns:
        A trax.layers.combinators.Serial model mapping a batch of token-id
        sequences to per-class log-probabilities.
    """
    # Serial feeds each layer's output straight into the next layer.
    return tl.Serial(
        tl.Embedding(vocab_size=vocab_size,
                     d_feature=embedding_dim),  # token ids -> embedding vectors
        tl.Mean(axis=1),                # average over the sequence axis:
                                        # one "average word" vector per tweet
        tl.Dense(n_units=output_dim),   # one unit per output class
        tl.LogSoftmax(),                # log-probabilities (no parameters)
    )

(3). Training

from trax.supervised import training

# Samples per batch fed to both the training and evaluation tasks.
batch_size = 16

# Training task: labeled batches from the training generator, cross-entropy
# loss, Adam optimizer, checkpoint/eval every 10 steps.
# NOTE(review): bare `trax` is referenced below but only
# `trax.supervised.training` is imported in this excerpt — presumably
# `import trax` happens earlier in the notebook; confirm.
train_task = training.TrainTask(
    labeled_data=train_generator(batch_size=batch_size, shuffle=True),
    loss_layer=tl.CrossEntropyLoss(),
    optimizer=trax.optimizers.Adam(0.01),
    n_steps_per_checkpoint=10,
)

# Evaluation task: tracks loss and accuracy on the validation generator.
eval_task = training.EvalTask(
    labeled_data=val_generator(batch_size=batch_size, shuffle=True),
    metrics=[tl.CrossEntropyLoss(), tl.Accuracy()],
)

def train_model(classifier, train_task, eval_task, n_steps, output_dir):
    '''Run the trax training loop for n_steps and return the loop object.

    Input:
        classifier - the model you are building
        train_task - Training task
        eval_task - Evaluation task
        n_steps - the evaluation steps
        output_dir - folder to save your files
    Output:
        training_loop - the trax training.Loop (it holds the trained model)
    '''
    # Wire model, training task and evaluation task into one loop that
    # writes its checkpoints under output_dir.
    loop = training.Loop(classifier,
                         train_task,
                         eval_task=eval_task,
                         output_dir=output_dir)
    loop.run(n_steps=n_steps)

    # The loop keeps a reference to the (now trained) model, so return it.
    return loop
    
# Train a fresh classifier for 100 steps, checkpointing under ./model.
training_loop = train_model(classifier(), train_task, eval_task, 100, 'model')

(4). Evaluate & Predict

# Run the trained model on a batch.
# NOTE(review): `inputs` is not defined in this excerpt — presumably a batch
# drawn from val_generator earlier in the notebook; confirm before running.
training_loop.eval_model(inputs)