# 1、导入相关包 — import the required packages
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
import torch
import evaluate
# 2、加载、划分、处理数据集 — load, split and preprocess the dataset
# Load the local sentence-pair dataset (JSON with "sentence1"/"sentence2"/"label"
# fields — inferred from process_function below) and hold out 20% for evaluation.
dataset = load_dataset("json", data_files="./train_pair_1w.json", split="train")
datasets = dataset.train_test_split(test_size=0.2)
# Tokenizer for Chinese MacBERT (base), fetched from the Hugging Face Hub.
tokenizer = AutoTokenizer.from_pretrained("hfl/chinese-macbert-base")
def process_function(examples):
    """Tokenize a batch of sentence pairs and attach float labels.

    Args:
        examples: a batched mapping with "sentence1", "sentence2" and "label"
            columns (as produced by datasets.map with batched=True).

    Returns:
        The tokenizer's encoding dict (input_ids, attention_mask, ...) with a
        "labels" list of floats added — floats because the model is a
        single-logit regression head (num_labels=1, MSE loss).
    """
    encoded = tokenizer(
        examples["sentence1"],
        examples["sentence2"],
        max_length=128,
        truncation=True,
    )
    encoded["labels"] = [float(lbl) for lbl in examples["label"]]
    return encoded
# Batch-tokenize both splits; drop the raw text columns so only model inputs remain.
tokenized_datasets = datasets.map(process_function, batched=True, remove_columns=datasets["train"].column_names)
# 3、创建模型 — create the model
# Regression head (num_labels=1): the model emits a single similarity logit,
# trained with MSE against the float labels set in process_function.
# Use the same checkpoint id as the tokenizer above — the original
# "./hfl/chinese-macbert-base" pointed at a local directory, inconsistent
# with the Hub id used on the tokenizer line.
model = AutoModelForSequenceClassification.from_pretrained("hfl/chinese-macbert-base", num_labels=1)
# 4、创建评估函数 — create the evaluation function
# Local metric scripts — presumably offline copies of the `evaluate` library's
# builtin accuracy/F1 metrics (TODO confirm they match the builtins).
acc_metric = evaluate.load("./metric_accuracy.py")
f1_metric = evaluate.load("./metric_f1.py")  # renamed: was the typo `f1_metirc`
def eval_metric(eval_predict):
    """Binarize the regression outputs at 0.5 and report accuracy + F1.

    Args:
        eval_predict: (predictions, labels) pair from the Trainer; with a
            single-logit head, predictions has shape (n, 1) — flattened below
            so each compared element is a scalar rather than a length-1 array.

    Returns:
        One merged dict, e.g. {"accuracy": ..., "f1": ...}.
    """
    predictions, labels = eval_predict
    predictions = [int(p > 0.5) for p in predictions.reshape(-1)]
    labels = [int(l) for l in labels]
    acc = acc_metric.compute(predictions=predictions, references=labels)
    f1 = f1_metric.compute(predictions=predictions, references=labels)
    acc.update(f1)  # merge the two single-entry dicts into one result
    return acc
# 5、创建TrainingArguments、Trainer — create the TrainingArguments and Trainer
# Fine-tuning configuration: checkpoints go to ./cross_model and the best
# checkpoint (ranked by the "f1" metric from eval_metric) is reloaded at the end.
training_config = dict(
    output_dir="./cross_model",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    logging_steps=10,
    eval_strategy="epoch",       # evaluate once per epoch
    save_strategy="epoch",       # save in lockstep with evaluation
    save_total_limit=3,          # keep at most 3 checkpoints on disk
    learning_rate=2e-5,
    weight_decay=0.01,
    metric_for_best_model="f1",  # rank checkpoints by F1
    load_best_model_at_end=True,
)
train_args = TrainingArguments(**training_config)
from transformers import DataCollatorWithPadding

# Dynamic padding: each batch is padded to its own longest sequence rather
# than a fixed global length (the map step above left sequences unpadded).
collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=train_args,
    tokenizer=tokenizer,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=collator,
    compute_metrics=eval_metric,
)
# 6、模型训练、评估、预测 — train, evaluate and predict
# Fine-tune, then run a final evaluation pass on the held-out 20% split.
trainer.train()
trainer.evaluate()
# 7、模型预测 — model prediction
from transformers import pipeline

# Human-readable names for class ids. With num_labels=1 the pipeline always
# reports id 0, so the real label is decided manually from the raw score below.
model.config.id2label = {0: "不相似", 1: "相似"}
# Fall back to CPU when no GPU is available — a hard-coded device=0 raises on
# CPU-only hosts.
device = 0 if torch.cuda.is_available() else -1
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer, device=device)
# function_to_apply="none" keeps the raw regression score (no sigmoid/softmax).
result = pipe({"text": "我喜欢北京", "text_pair": "天气怎样"}, function_to_apply="none")
# Same 0.5 cutoff used in eval_metric.
result["label"] = "相似" if result["score"] > 0.5 else "不相似"
# print instead of a bare trailing expression, which only displays in a notebook.
print(result)