【MyHelloWorld】Building a Simple Chat Response Script with the Transformers Library


Anyone doing deep learning has probably heard of BERT. A while ago I had a work task that required text classification with BERT, and at first its architecture seemed a bit complicated. Then I found that the Transformers library is extremely powerful and convenient, so I decided to use its BertForSequenceClassification to build a simple chat response script for fun.

The idea is simple: the corpus is split into categories and used as the model's training input. The trained model then predicts the category of an incoming sentence, and the bot's reply is drawn at random from the answer list of that category.

Dataset example: greetings.yaml

categories:
- greetings
questions:
- 你好
- 嗨
- 好久不见
- 早上好
- 晚上好
- 下午好
- 碰巧又遇到你了
answers:
- 你好
- hello
- 真巧呀
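
Each corpus file follows the same pattern: one category, a list of example questions, and a list of candidate answers. As a minimal sketch (not part of the project code, and the file path is just an assumption), this is how such a file can be read with PyYAML and an answer sampled once the category is known:

import random
import yaml

# parse one corpus file (the path is assumed for illustration)
with open('chatterbot_corpus/greetings.yaml', 'r', encoding='utf-8') as fp:
    corpus = yaml.safe_load(fp)

print(corpus['categories'])       # ['greetings']
questions = corpus['questions']   # training sentences for this category
answers = corpus['answers']       # candidate replies for this category

# once the classifier predicts the "greetings" label, answer with a random reply
print(random.choice(answers))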
 

Building the whole pipeline with Transformers:

Hugging Face 🤗 is an open-source community focused on natural language processing (huggingface.co/). It offers a wealth of NLP model APIs, pretrained models, datasets and more, so beginners can quickly put NLP models together.

Install the Transformers library

pip install transformers

Environment

torch==1.11.0
transformers==4.17.0
numpy==1.22.3
yaml (install as PyYAML)
argparse, random, logging (Python standard library)

Choosing a model

Transformers provides many pretrained models. Taking bert-base-uncased (bert-base-uncased · Hugging Face) as an example, the model repository contains:

  • the pretrained weights, pytorch_model.bin
  • the vocabulary, vocab.txt, which contains Chinese characters, English word pieces, numbers, punctuation and so on
  • config.json, which holds the model configuration


config.json

{
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.6.0.dev0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}
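
If you just want to check these values without downloading the full weights, the config and tokenizer can be loaded on their own; a quick sketch (the numbers match the config.json above):

from transformers import AutoConfig, AutoTokenizer

config = AutoConfig.from_pretrained('bert-base-uncased')
print(config.num_hidden_layers, config.hidden_size, config.vocab_size)   # 12 768 30522

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
print(len(tokenizer))   # size of vocab.txt, 30522 for bert-base-uncased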

Data processing

The input data are short Chinese sentences. For the model to understand them, they first have to be tokenized and encoded, which takes just two lines with the Transformers library:

    tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
    encoded_questions = tokenizer(questions, padding=True)
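
The result is a dict-like BatchEncoding whose fields are exactly what BERT expects; a quick sketch with made-up sentences:

    encoded = tokenizer(['你好', '好久不见'], padding=True)
    print(encoded.keys())             # dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
    print(encoded['input_ids'])       # token ids, padded to the length of the longest sentence
    print(encoded['attention_mask'])  # 1 for real tokens, 0 for padding positions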

After tokenizing and encoding the input, we also need a Dataset class so the samples can be read one at a time during training.


class corpus_dataset(Dataset):
    def __init__(self, data_dir):
        # read_questions_from_dir (defined in the full dataset.py below) returns the encoded
        # questions, their labels, the per-category answer lists and a label -> category mapping
        self.questions, self.labels, self.answers, self.label_dict = read_questions_from_dir(data_dir)
        self.token_ids = self.questions['input_ids']
        self.attn_masks = self.questions['attention_mask']
        self.token_type_ids = self.questions['token_type_ids']

    def __len__(self):
        return len(self.token_ids)

    def __getitem__(self, i):
        token_ids = torch.tensor(self.token_ids[i])
        attn_masks = torch.tensor(self.attn_masks[i])          # 1 for real tokens, 0 for padding
        token_type_ids = torch.tensor(self.token_type_ids[i])  # 0 for 1st-sentence tokens, 1 for 2nd-sentence tokens
        labels = torch.tensor(self.labels[i])
        return token_ids, attn_masks, token_type_ids, labels
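
For reference, a minimal sketch of wrapping the dataset in a DataLoader (the directory name is a placeholder); each batch then comes out in the same order as __getitem__:

from torch.utils.data import DataLoader

train_dataset = corpus_dataset('chatterbot_corpus')        # placeholder directory
train_dataloader = DataLoader(train_dataset, batch_size=8)
token_ids, attn_masks, token_type_ids, labels = next(iter(train_dataloader))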

Loading the model

    model = BertForSequenceClassification.from_pretrained(args.pretrained_model, num_labels=args.classes, output_hidden_states=False)
    model.to(device)
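
BertForSequenceClassification is just the BERT encoder with a linear classification head on top; a quick check (using num_labels=9, the default number of classes in the training script below) shows the head mapping the 768-dimensional pooled output to the class logits:

    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=9)
    print(model.classifier)   # Linear(in_features=768, out_features=9, bias=True)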

Model inference

    model.load_state_dict(torch.load('model.pt'))
    model.eval()
    with torch.no_grad():
        # pass the tensors by keyword so attention_mask and token_type_ids don't get swapped
        pred = model(input_ids=torch.tensor(sentence['input_ids']).to(device),
                     token_type_ids=torch.tensor(sentence['token_type_ids']).to(device),
                     attention_mask=torch.tensor(sentence['attention_mask']).to(device))
    pred = pred.logits.argmax(1)[0].item()

    print('回答:', random.choice(answer[pred]))
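
Putting the pieces together, a small helper (hypothetical, not in the original script) that maps a raw sentence to a reply might look like this; tokenizer, model, device and the answer lists are assumed to be set up as above:

def reply(text):
    # hypothetical helper: classify one sentence and sample a reply from its category
    encoded = tokenizer([text])
    with torch.no_grad():
        logits = model(input_ids=torch.tensor(encoded['input_ids']).to(device),
                       token_type_ids=torch.tensor(encoded['token_type_ids']).to(device),
                       attention_mask=torch.tensor(encoded['attention_mask']).to(device)).logits
    category = logits.argmax(1)[0].item()
    return random.choice(answer[category])

print(reply('你好'))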

Complete code

dataset.py

import os
from torch.utils.data import Dataset
import yaml
from transformers import AutoTokenizer
import torch

class corpus_dataset(Dataset):
    def __init__(self, data_dir):
        self.questions, self.labels, self.answers, self.label_dict = read_questions_from_dir(data_dir)
        self.token_ids = self.questions['input_ids']
        self.attn_masks = self.questions['attention_mask']
        self.token_type_ids = self.questions['token_type_ids']

    def __len__(self):
        return len(self.token_ids)

    def __getitem__(self, i):
        token_ids = torch.tensor(self.token_ids[i])
        attn_masks = torch.tensor(self.attn_masks[i])          # 1 for real tokens, 0 for padding
        token_type_ids = torch.tensor(self.token_type_ids[i])  # 0 for 1st-sentence tokens, 1 for 2nd-sentence tokens
        labels = torch.tensor(self.labels[i])
        return token_ids, attn_masks, token_type_ids, labels


def read_questions_from_dir(data_dir):
    """Read every YAML corpus file in data_dir and return the encoded questions, their
    labels, the per-category answer lists and a label -> category mapping."""
    tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
    questions = []
    answers = []
    labels = []
    label_dict = {}
    labelcode = 0
    for file in os.listdir(data_dir):
        yaml_path = os.path.join(data_dir, file)
        with open(yaml_path, 'r', encoding='utf-8') as f:
            corpus = yaml.load(f.read(), Loader=yaml.FullLoader)
        answers.append(corpus.get('answers', []))   # validation files may have no answers section
        questions.append(corpus['questions'])
        labels.append([labelcode] * len(corpus['questions']))
        label_dict[labelcode] = corpus['categories']
        labelcode += 1

    # Flatten the per-file lists; the tokenizer adds [CLS]/[SEP] itself, so the raw
    # sentences are passed in unchanged.
    q = [sentence for group in questions for sentence in group]
    label = [l for group in labels for l in group]

    encoded_questions = tokenizer(q, padding=True)
    return encoded_questions, label, answers, label_dict

train.py

import numpy as np
from transformers import AdamW
from data.dataset import corpus_dataset, read_questions_from_dir
from transformers import BertForSequenceClassification
import argparse
from torch.utils.data import DataLoader
import torch
import logging
import random
from transformers import AutoTokenizer

device = 'cpu'


def train(model, args, train_dataset, val_dataset, optimizer):
    train_dataloader = DataLoader(train_dataset, batch_size=8)
    val_dataloader = DataLoader(val_dataset, batch_size=1)
    for epoch in range(args.epochs):
        model.train()
        for step, batch in enumerate(train_dataloader):
            
            optimizer.zero_grad()
            output = model(batch[0].to(device),
                           attention_mask=batch[1].to(device),   # batch follows __getitem__ order: ids, mask, type ids, labels
                           token_type_ids=batch[2].to(device),
                           labels=batch[3].to(device))

            loss = output.loss       # cross-entropy loss computed by the model when labels are given
            logits = output.logits
            # back-propagate gradients
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # gradient clipping

            # update the parameters
            optimizer.step()
            label = batch[3]  # already a tensor, no need to rewrap it
            acc = (logits.argmax(1) == label).float().mean()
            logging.info(f"Epoch: {epoch}, Batch[{step}/{len(train_dataloader)}], "
                        f"Train loss :{loss.item():.3f}, Train acc: {acc:.3f}")
        model.eval()
        with torch.no_grad():
            n = 0
            acc_sum = 0
            for _, test_data in enumerate(val_dataloader):
                test_output = model(test_data[0].to(device),
                                    attention_mask=test_data[1].to(device),
                                    token_type_ids=test_data[2].to(device),
                                    labels=test_data[3].to(device))
                pred = test_output.logits

                acc_sum += (pred.argmax(1) == test_data[3]).float().sum().item()
                n += 1

            torch.save(model.state_dict(),args.model_save_path)

            logging.info(f"Val Acc: {acc_sum/n}")

def set_seed(args):
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

def main():
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--data_dir",
        default='D:/project/NLP/ChatbotBert/ChatbotBert/chatterbot_corpus',
        type=str,
        help="The input data dir.",
    )
    parser.add_argument(
        "--val_dir",
        default='D:/project/NLP/ChatbotBert/ChatbotBert/val_data',
        type=str,
    )
    parser.add_argument(
        "--pretrained_model",
        default='bert-base-uncased',
        type=str,
        help="pretrained model path",
    )    
    parser.add_argument(
        "--epochs",
        default=20,
        type=int,
        help="pretrained model path",
    )   

    parser.add_argument(
        "--seed", type=int, default=42, help="random seed for initialization"
    )
    parser.add_argument(
        "--output_dir",
        default='outputs',
        type=str,
        help="The output directory where the model predictions and checkpoints will be written.",
    )

    parser.add_argument(
        "--classes",
        default=9,
        type=int,
        help="The number of labels",
    )

    parser.add_argument(
        "--model_save_path",
        type=str,
        default='model.pt',
        help="model save path",
    )
    args = parser.parse_args()
    set_seed(args)
    # Setup logging
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    logger = logging.getLogger("__main__")

    # model preparation
    model = BertForSequenceClassification.from_pretrained(args.pretrained_model, num_labels=args.classes, output_hidden_states=False)
    model.to(device)
    logger.info("Training/evaluation parameters %s", args)

    # dataset preparation and training
    train_dataset = corpus_dataset(args.data_dir)
    val_dataset = corpus_dataset(args.val_dir)
    # answer[i] is the list of candidate replies for label i
    encoded_questions, label, answer, label_dict = read_questions_from_dir(args.data_dir)
    optimizer_params = {'lr': 1e-5, 'eps': 1e-6, 'correct_bias': False}

    optimizer = AdamW(model.parameters(), **optimizer_params)
    train(model, args, train_dataset, val_dataset, optimizer=optimizer)
    
    sentence = ['你平时喜欢做什么']

    print('问题:', sentence)

    # use the same tokenizer that the training corpus was encoded with
    tokenizer = AutoTokenizer.from_pretrained(args.pretrained_model)
    sentence = tokenizer(sentence)
    model.load_state_dict(torch.load('model.pt'))
    model.eval()
    with torch.no_grad():
        pred = model(input_ids=torch.tensor(sentence['input_ids']).to(device),
                     token_type_ids=torch.tensor(sentence['token_type_ids']).to(device),
                     attention_mask=torch.tensor(sentence['attention_mask']).to(device))
    pred = pred.logits.argmax(1)[0].item()

    print('回答:', random.choice(answer[pred]))


if __name__ == "__main__":
    main()
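
With both files in place, training plus the small demo at the end of main() can be launched from the command line, for example (the relative paths here are assumptions, point them at your own corpus and validation directories):

python train.py --data_dir chatterbot_corpus --val_dir val_data --classes 9 --epochs 20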