Anyone working in deep learning has probably heard of BERT. A while ago I had a work task that needed BERT for text classification, and at first its structure felt a bit involved. Then I found that the Transformers library makes things fast and convenient, so I tried using its BertForSequenceClassification to put together a simple chat-reply script for fun.
The idea is simple: take a labelled corpus of questions as the model input, train a classifier on it, let the trained model predict the category of an incoming sentence, and then draw the bot's reply at random from the answer list of the predicted category.
Example dataset file: greetings.yaml
categories:
- greetings
questions:
- 你好
- 嗨
- 好久不见
- 早上好
- 晚上好
- 下午好
- 碰巧又遇到你了
answers:
- 你好
- hello
- 真巧呀
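Just to make the mapping from file to data concrete, here is a minimal sketch of reading one of these corpus files with PyYAML, assuming greetings.yaml from the example above sits in the working directory:

import yaml

# Load one corpus file and inspect its fields.
with open('greetings.yaml', 'r', encoding='utf-8') as f:
    corpus = yaml.load(f.read(), Loader=yaml.FullLoader)

print(corpus['categories'])              # ['greetings'] -> becomes one class label
print(len(corpus['questions']), 'questions')  # training sentences for this class
print(corpus['answers'])                 # candidate replies for this class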
Building the whole pipeline with transformers:
Hugging Face is an open-source community focused on natural language processing (huggingface.co/). It provides a wealth of NLP model APIs, pretrained models, datasets and more, so beginners can get an NLP model running quickly.
Install the Transformers library:
pip install transformers
Environment
torch==1.11.0
transformers==4.17.0
numpy==1.22.3
yaml
argparse
random
logging
Choosing the model
Transformers provides many pretrained models. Taking bert-base-uncased (bert-base-uncased · Hugging Face) as an example, the checkpoint contains:
- the pretrained weights pytorch_model.bin
- the vocabulary vocab.txt, covering Chinese and English tokens, digits, punctuation and so on
- config.json with the model configuration
config.json
{
"architectures": [
"BertForMaskedLM"
],
"attention_probs_dropout_prob": 0.1,
"gradient_checkpointing": false,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 768,
"initializer_range": 0.02,
"intermediate_size": 3072,
"layer_norm_eps": 1e-12,
"max_position_embeddings": 512,
"model_type": "bert",
"num_attention_heads": 12,
"num_hidden_layers": 12,
"pad_token_id": 0,
"position_embedding_type": "absolute",
"transformers_version": "4.6.0.dev0",
"type_vocab_size": 2,
"use_cache": true,
"vocab_size": 30522
}
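If you prefer to check these settings programmatically rather than opening config.json by hand, a quick sketch with AutoConfig (it reads the same file from the downloaded or cached checkpoint):

from transformers import AutoConfig

# Pull the configuration that ships with the pretrained checkpoint.
config = AutoConfig.from_pretrained('bert-base-uncased')
print(config.num_hidden_layers)  # 12
print(config.hidden_size)        # 768
print(config.vocab_size)         # 30522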
Data processing
The inputs are short Chinese sentences. Before the model can use them they have to be tokenized and encoded, which takes just two lines with the transformers library:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
encoded_questions = tokenizer(questions, padding=True)
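The call returns a dict-like object whose fields are exactly what the Dataset class below reads. A small sketch of what it contains (the example sentences are placeholders; the exact ids depend on the vocabulary):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
encoded = tokenizer(['你好', '好久不见'], padding=True)

# Each field is a list of lists, one entry per question, padded to the same length.
print(list(encoded.keys()))           # ['input_ids', 'token_type_ids', 'attention_mask']
print(encoded['input_ids'][0])        # integer ids, with [CLS]/[SEP] added and zero padding
print(encoded['attention_mask'][0])   # 1 for real tokens, 0 for padding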
After tokenizing and encoding the input, we also need a Dataset class so the data can be read sample by sample during training; the read_questions_from_dir helper (defined in dataset.py below) tokenizes every question and also returns the answer lists and a label dictionary for later use.
class corpus_dataset(Dataset):
    def __init__(self, data_dir):
        self.questions, self.labels, self.answers, self.label_dict = read_questions_from_dir(data_dir)
        self.token_ids = self.questions['input_ids']
        self.attn_masks = self.questions['attention_mask']
        self.token_type_ids = self.questions['token_type_ids']

    def __len__(self):
        return len(self.token_ids)

    def __getitem__(self, i):
        token_ids = torch.tensor(self.token_ids[i])
        attn_masks = torch.tensor(self.attn_masks[i])           # "1" for real tokens, "0" for padding
        token_type_ids = torch.tensor(self.token_type_ids[i])   # "0" for 1st-sentence tokens, "1" for 2nd-sentence tokens
        labels = torch.tensor(self.labels[i])
        return token_ids, attn_masks, token_type_ids, labels
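With the Dataset in place, training simply iterates over it with a standard DataLoader. A minimal usage sketch (the directory name here is just a placeholder for wherever the yaml corpus files live):

from torch.utils.data import DataLoader

# corpus_dataset is the class defined above; 'chatterbot_corpus' is a placeholder path.
train_dataset = corpus_dataset('chatterbot_corpus')
train_dataloader = DataLoader(train_dataset, batch_size=8)

# Batches come back in the same order __getitem__ returns them.
for token_ids, attn_masks, token_type_ids, labels in train_dataloader:
    print(token_ids.shape, labels.shape)  # e.g. torch.Size([8, seq_len]) torch.Size([8])
    break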
Loading the model
model = BertForSequenceClassification.from_pretrained(args.pretrained_model, num_labels=args.classes, output_hidden_states=False)
model.to(device)
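Note that num_labels (args.classes) has to match the number of categories in the corpus. When labels are passed in, BertForSequenceClassification computes the cross-entropy loss itself and also returns the logits, which is what the training loop below relies on. A rough sketch of one forward pass, using random dummy tensors just to show the shapes:

import torch

# Dummy batch: 2 already-encoded questions, 6 token ids each (values are illustrative).
input_ids = torch.randint(0, 30522, (2, 6))
attention_mask = torch.ones(2, 6, dtype=torch.long)
labels = torch.tensor([0, 1])

outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
print(outputs.loss)          # scalar cross-entropy loss computed by the model
print(outputs.logits.shape)  # torch.Size([2, num_labels])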
Model inference
model.load_state_dict(torch.load('model.pt'))
# 'sentence' is the tokenizer output for the user's question (see the full script below);
# the arguments are passed by keyword so attention_mask and token_type_ids cannot be swapped
pred = model(input_ids=torch.tensor(sentence['input_ids']).to(device),
             token_type_ids=torch.tensor(sentence['token_type_ids']).to(device),
             attention_mask=torch.tensor(sentence['attention_mask']).to(device))
pred = pred.logits.argmax(1)[0].item()
print('回答:', random.choice(answer[pred]))
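argmax over the logits picks the highest-scoring category directly. If you also want per-class probabilities (for example to see how confident the prediction is), softmax over the logits gives them. A small self-contained sketch, not part of the original snippet, with made-up logit values for three categories:

import torch
import torch.nn.functional as F

# 'logits' stands in for pred.logits above, shape (1, num_labels).
logits = torch.tensor([[2.1, 0.3, -1.0]])
probs = F.softmax(logits, dim=1)
print(probs)                   # per-class probabilities, summing to 1
print(probs.argmax(1).item())  # same index that argmax on the raw logits gives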
Full code
dataset.py
import os
from torch.utils.data import Dataset
import yaml
from transformers import AutoTokenizer
import torch
class corpus_dataset(Dataset):
    def __init__(self, data_dir):
        self.questions, self.labels, self.answers, self.label_dict = read_questions_from_dir(data_dir)
        self.token_ids = self.questions['input_ids']
        self.attn_masks = self.questions['attention_mask']
        self.token_type_ids = self.questions['token_type_ids']

    def __len__(self):
        return len(self.token_ids)

    def __getitem__(self, i):
        token_ids = torch.tensor(self.token_ids[i])
        attn_masks = torch.tensor(self.attn_masks[i])           # "1" for real tokens, "0" for padding
        token_type_ids = torch.tensor(self.token_type_ids[i])   # "0" for 1st-sentence tokens, "1" for 2nd-sentence tokens
        labels = torch.tensor(self.labels[i])
        return token_ids, attn_masks, token_type_ids, labels


def read_questions_from_dir(data_dir):
    tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
    questions = []
    answer = []
    labels = []
    label_dict = {}
    labelcode = 0
    # every yaml file in the directory becomes one category
    for file in os.listdir(data_dir):
        yamlPath = os.path.join(data_dir, file)
        with open(yamlPath, 'r', encoding='utf-8') as f:
            corpus = yaml.load(f.read(), Loader=yaml.FullLoader)
        answer.append(corpus['answers'])
        questions.append(corpus['questions'])
        labels.append([labelcode] * len(corpus['questions']))
        label_dict[labelcode] = corpus['categories']
        labelcode += 1
    # flatten the per-file lists; the tokenizer adds [CLS]/[SEP] itself, so no manual markers are needed
    q = [sentence for file_questions in questions for sentence in file_questions]
    label = [l for file_labels in labels for l in file_labels]
    encoded_questions = tokenizer(q, padding=True)
    return encoded_questions, label, answer, label_dict
train.py
import numpy as np
from transformers import AdamW
from data.dataset import corpus_dataset, read_questions_from_dir
from transformers import BertForSequenceClassification
import argparse
from torch.utils.data import DataLoader
import torch
import logging
import random
import torch.nn.functional as f
from transformers import AutoTokenizer
device = 'cpu'
def train(model, args, train_dataset, val_dataset, optimizer):
    train_dataloader = DataLoader(train_dataset, batch_size=8)
    val_dataloader = DataLoader(val_dataset, batch_size=1)
    for epoch in range(args.epochs):
        model.train()
        for step, batch in enumerate(train_dataloader):
            optimizer.zero_grad()
            # batch order follows __getitem__: token_ids, attention_mask, token_type_ids, labels
            output = model(batch[0].to(device),
                           attention_mask=batch[1].to(device),
                           token_type_ids=batch[2].to(device),
                           labels=batch[3].to(device))
            logits = output[1]
            loss = f.cross_entropy(logits, batch[3].to(device))
            # backpropagate
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # gradient clipping
            # update parameters
            optimizer.step()
            label = batch[3]
            acc = (logits.argmax(1) == label).float().mean()
            logging.info(f"Epoch: {epoch}, Batch[{step}/{len(train_dataloader)}], "
                         f"Train loss :{loss.item():.3f}, Train acc: {acc:.3f}")
        model.eval()
        with torch.no_grad():
            n = 0
            acc_sum = 0
            for _, test_data in enumerate(val_dataloader):
                test_output = model(test_data[0].to(device),
                                    attention_mask=test_data[1].to(device),
                                    token_type_ids=test_data[2].to(device),
                                    labels=test_data[3].to(device))
                _, pred = test_output[0], test_output[1]
                acc_sum += (pred.argmax(1) == test_data[3]).float().sum().item()
                n += 1
        torch.save(model.state_dict(), args.model_save_path)
        logging.info(f"Val Acc: {acc_sum/n}")
def set_seed(args):
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--data_dir",
        default='D:/project/NLP/ChatbotBert/ChatbotBert/chatterbot_corpus',
        type=str,
        help="The input data dir.",
    )
    parser.add_argument(
        "--val_dir",
        default='D:/project/NLP/ChatbotBert/ChatbotBert/val_data',
        type=str,
    )
    parser.add_argument(
        "--pretrained_model",
        default='bert-base-uncased',
        type=str,
        help="pretrained model path",
    )
    parser.add_argument(
        "--epochs",
        default=20,
        type=int,
        help="number of training epochs",
    )
    parser.add_argument(
        "--seed", type=int, default=42, help="random seed for initialization"
    )
    parser.add_argument(
        "--output_dir",
        default='outputs',
        type=str,
        help="The output directory where the model predictions and checkpoints will be written.",
    )
    parser.add_argument(
        "--classes",
        default=9,
        type=int,
        help="The number of labels",
    )
    parser.add_argument(
        "--model_save_path",
        type=str,
        default='model.pt',
        help="model save path",
    )
set_seed(args)
# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger("__main__")
# dataset preparation
model = BertForSequenceClassification.from_pretrained(args.pretrained_model, num_labels=args.classes, output_hidden_states=False)
model.to(device)
logger.info("Training/evaluation parameters %s", args)
# training
train_dataset = corpus_dataset(args.data_dir)
val_dataset = corpus_dataset(args.val_dir)
encoded_questions, label, label_dict, answer = read_questions_from_dir(args.data_dir)
optimizer_params = {'lr': 1e-5, 'eps': 1e-6, 'correct_bias': False}
optimizer = AdamW(model.parameters(), **optimizer_params)
train(model, args, train_dataset, val_dataset, optimizer=optimizer)
sentence = ['你平时喜欢做什么']
print('问题:', sentence)
tokenizer = AutoTokenizer.from_pretrained('bert-base-chinese')
sentence = tokenizer(sentence)
model.load_state_dict(torch.load('model.pt'))
pred = model(torch.tensor(sentence['input_ids']).to(device),torch.tensor(sentence['token_type_ids']).to(device),torch.tensor(sentence['attention_mask']).to(device))
pred = pred.logits.argmax(1)[0].item()
print('回答:', random.choice(answer[pred]))
if __name__ == "__main__":
main()
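Once train.py has run and model.pt exists, you can keep chatting with the model in a small interactive loop. The chat() helper below is not part of the original script, just a hedged sketch that reuses the model, tokenizer and answer lists built in main(); the tokenizer must be the same one used during training:

import random
import torch

def chat(model, tokenizer, answer, device='cpu'):
    # Hypothetical helper: classify each typed question and reply with a random
    # answer from the predicted category; an empty line ends the loop.
    model.eval()
    while True:
        text = input('问题: ')
        if not text:
            break
        enc = tokenizer([text])
        with torch.no_grad():
            out = model(input_ids=torch.tensor(enc['input_ids']).to(device),
                        token_type_ids=torch.tensor(enc['token_type_ids']).to(device),
                        attention_mask=torch.tensor(enc['attention_mask']).to(device))
        pred = out.logits.argmax(1)[0].item()
        print('回答:', random.choice(answer[pred]))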