高复用Bert模型文本分类代码(三)训练部分

1,055 阅读3分钟

「这是我参与11月更文挑战的第10天,活动详情查看:2021最后一次更文挑战」

本篇文章我们对训练部分代码进行讲解,前两期链接:
高复用Bert模型文本分类代码(一)数据读取
高复用Bert模型文本分类代码(二)模型部分

训练部分代码全部在trainer.py这个文件下,该文件下仅有一个Trainer类。

Trainer中除了包含常见的train、evaluate方法外,还有save_model、load_model、save_results等实验需要用到的方法。

image.png

__init__

__init__中基本为对应参数的配置和加载:
self.config_class.from_pretrained()加载BertConfig的相关配置
self.model_class.from_pretrained()加载之前模型部分写的基于Bert的分类模型
self.device会检测电脑中是否有可用的GPU,若没有(或设置了no_cuda)则采用cpu训练

    def __init__(self, args, train_dataset=None, dev_dataset=None, test_dataset=None):
        """Store the run configuration and datasets, then build the model.

        Args:
            args: Run arguments. This method reads ``model_type``,
                ``model_name_or_path``, ``task`` and ``no_cuda`` from it.
            train_dataset: Dataset kept for training (optional).
            dev_dataset: Dataset kept for validation (optional).
            test_dataset: Dataset kept for testing (optional).
        """
        self.args = args
        self.train_dataset, self.dev_dataset, self.test_dataset = (
            train_dataset, dev_dataset, test_dataset)
        # No test metrics are available until a test evaluation has run.
        self.test_results = None

        self.label_lst = get_labels(args)

        # MODEL_CLASSES maps a model type to a 3-tuple; only the config class
        # and the model class are needed here.
        config_cls, model_cls, _ = MODEL_CLASSES[args.model_type]
        self.config_class = config_cls
        self.model_class = model_cls

        self.config = config_cls.from_pretrained(
            args.model_name_or_path,
            finetuning_task=args.task,
        )
        self.model = model_cls.from_pretrained(
            args.model_name_or_path,
            config=self.config,
            args=args,
            label_lst=self.label_lst,
        )

        # Use the GPU only when one is available and args.no_cuda is not set.
        if torch.cuda.is_available() and not args.no_cuda:
            self.device = "cuda"
        else:
            self.device = "cpu"
        self.model.to(self.device)

train

train部分代码比较长,首先当然是加载self.train_dataset训练集数据

# Batches for training are drawn from the training set through a random sampler.
train_sampler = RandomSampler(self.train_dataset)
train_dataloader = DataLoader(
    dataset=self.train_dataset,
    sampler=train_sampler,
    batch_size=self.args.train_batch_size,
)

根据参数控制训练的轮次、梯度累加步长

# Number of optimizer updates per epoch. Clamp to at least 1: when an epoch has
# fewer batches than gradient_accumulation_steps the floor division yields 0,
# which would otherwise raise ZeroDivisionError in the max_steps branch below.
updates_per_epoch = max(1, len(train_dataloader) // self.args.gradient_accumulation_steps)

if self.args.max_steps > 0:
    # An explicit step budget wins; derive how many epochs are needed to reach it.
    t_total = self.args.max_steps
    self.args.num_train_epochs = self.args.max_steps // updates_per_epoch + 1
else:
    # Otherwise the budget follows from the configured number of epochs.
    t_total = updates_per_epoch * self.args.num_train_epochs

设置学习率、优化器、和warm_up、weight_decay

# Print the name of every model parameter (debugging aid).
for param_name, _ in self.model.named_parameters():
    print(param_name)

# Parameters whose name contains one of these substrings get no weight decay.
no_decay = ['bias', 'LayerNorm.weight']

# The BERT encoder uses args.learning_rate (meant to be the lower rate, per the
# original note); the linear classifier head uses args.linear_learning_rate.
bert_params = list(self.model.bert.named_parameters())
linear_params = list(self.model.classifier.named_parameters())

# Prepare optimizer and schedule (linear warmup and decay).
# Each (parameters, learning rate) pair contributes two groups, in this order:
# the weight-decayed parameters, then the ones exempt from weight decay.
optimizer_grouped_parameters = []
for named_params, group_lr in (
    (bert_params, self.args.learning_rate),
    (linear_params, self.args.linear_learning_rate),
):
    decayed, exempt = [], []
    for name, param in named_params:
        if any(marker in name for marker in no_decay):
            exempt.append(param)
        else:
            decayed.append(param)
    optimizer_grouped_parameters.append(
        {'params': decayed, 'weight_decay': self.args.weight_decay, 'lr': group_lr}
    )
    optimizer_grouped_parameters.append(
        {'params': exempt, 'weight_decay': 0.0, 'lr': group_lr}
    )

optimizer = AdamW(
    optimizer_grouped_parameters,
    lr=self.args.learning_rate,
    eps=self.args.adam_epsilon,
)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=self.args.warmup_steps,
    num_training_steps=t_total,
)

打印基本的训练信息

    # Train!
    logger.info("***** Running training *****")
    # Each entry pairs a log format with the run setting it reports.
    summary_rows = (
        ("  Num examples = %d", len(self.train_dataset)),
        ("  Num Epochs = %d", self.args.num_train_epochs),
        ("  Total train batch size = %d", self.args.train_batch_size),
        ("  Gradient Accumulation steps = %d", self.args.gradient_accumulation_steps),
        ("  Total optimization steps = %d", t_total),
        ("  Logging steps = %d", self.args.logging_steps),
        ("  Save steps = %d", self.args.save_steps),
    )
    for summary_fmt, summary_value in summary_rows:
        logger.info(summary_fmt, summary_value)

开始训练,这里和常规torch训练没有区别,注意输入的inputs配置。在optimizer.step()和scheduler.step()更新完后,检测是否到达验证步数(logging_steps),若到达则进入evaluate验证阶段,并在验证集指标提升时保存模型。
代码中默认设置kappa为评价指标,如需使用其他评价指标作为score,可以自行更改。

wait = 0
global_step = 0
tr_loss = 0.0
best_score = 0.0
self.model.zero_grad()

train_iterator = trange(int(self.args.num_train_epochs), desc="Epoch")

for _ in train_iterator:
    epoch_iterator = tqdm(train_dataloader, desc="Iteration")
    for step, batch in enumerate(epoch_iterator):
        self.model.train()
        batch = tuple(t.to(self.device) for t in batch)  # GPU or CPU

        inputs = {'input_ids': batch[0],
                  'attention_mask': batch[1],
                  'label_ids': batch[3],
                  }
        if self.args.model_type != 'distilbert':
            inputs['token_type_ids'] = batch[2]
        outputs = self.model(**inputs)
        loss = outputs[0]

        if self.args.gradient_accumulation_steps > 1:
            loss = loss / self.args.gradient_accumulation_steps

        loss.backward()

        tr_loss += loss.item()
        if (step + 1) % self.args.gradient_accumulation_steps == 0:
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.args.max_grad_norm)

            optimizer.step()
            scheduler.step()  # Update learning rate schedule
            self.model.zero_grad()
            global_step += 1

            if self.args.logging_steps > 0 and global_step % self.args.logging_steps == 0:
                results = self.evaluate("dev")

                if best_score < results["kappa"]:
                    wait = 0
                    best_score = results["kappa"]
                    self.save_model()
                else:
                    wait += 1
                    print("eraly stop {}/{}".format(wait,self.args.wait_patient))

        if wait >= self.args.wait_patient:
            break

            # if self.args.save_steps > 0 and global_step % self.args.save_steps == 0:
            #     self.save_model()

        if 0 < self.args.max_steps < global_step:
            epoch_iterator.close()
            break

    if 0 < self.args.max_steps < global_step:
        train_iterator.close()
        break

return global_step, tr_loss / global_step

训练部分到此就结束了

eval

验证集的代码内容比较简单,不详细展开

def evaluate(self, mode):
    """Evaluate the model on the dev or test dataset and return its metrics.

    Args:
        mode: ``'dev'`` selects ``self.dev_dataset``; ``'test'`` selects
            ``self.test_dataset``.

    Returns:
        dict: ``"loss"`` (mean per-batch loss, rounded to 7 decimals) merged
        with whatever ``compute_metrics(preds, labels)`` returns.

    Raises:
        Exception: If ``mode`` is neither ``'dev'`` nor ``'test'``.
        ZeroDivisionError: If the dataloader yields no batches.

    Side effects:
        Puts the model in eval mode. For ``mode == 'test'`` the results are
        stored in ``self.test_results`` and ``self.save_results()`` is called.
    """
    if mode == 'test':
        dataset = self.test_dataset
    elif mode == 'dev':
        dataset = self.dev_dataset
    else:
        raise Exception("Only dev and test dataset available")

    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=self.args.eval_batch_size)

    # Eval!
    logger.info("***** Running evaluation on %s dataset *****", mode)
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", self.args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    # Per-batch arrays are collected and concatenated once after the loop;
    # calling np.append per batch would re-copy everything gathered so far.
    logit_chunks = []
    label_chunks = []

    self.model.eval()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        batch = tuple(t.to(self.device) for t in batch)
        with torch.no_grad():
            inputs = {'input_ids': batch[0],
                      'attention_mask': batch[1],
                      'label_ids': batch[3],
                      }
            if self.args.model_type != 'distilbert':
                inputs['token_type_ids'] = batch[2]
            outputs = self.model(**inputs)
            # The first two model outputs are unpacked as (loss, logits).
            tmp_eval_loss, logits = outputs[:2]

            eval_loss += tmp_eval_loss.mean().item()
        nb_eval_steps += 1

        logit_chunks.append(logits.detach().cpu().numpy())
        label_chunks.append(inputs['label_ids'].detach().cpu().numpy())

    eval_loss = eval_loss / nb_eval_steps
    results = {
        "loss": round(eval_loss, 7)
    }

    # The predicted class is the index of the largest logit in each row.
    preds = np.argmax(np.concatenate(logit_chunks, axis=0), axis=1)
    out_label_ids = np.concatenate(label_chunks, axis=0)

    total_result = compute_metrics(preds, out_label_ids)
    results.update(total_result)

    if mode == 'test':
        self.test_results = results
        self.save_results()

    logger.info("***** Eval results *****")
    for key in sorted(results.keys()):
        logger.info("  %s = %s", key, str(results[key]))

    return results

save_model

通过save_pretrained保存模型,并用torch.save保存训练参数

def save_model(self):
    """Write the model and the training arguments into ``self.args.model_dir``.

    The directory is created when it does not exist yet. Per the original
    note, an existing checkpoint in that directory is overwritten.
    """
    output_dir = self.args.model_dir
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # If the model is wrapped in an object exposing `.module`, save the inner model.
    model_to_save = getattr(self.model, 'module', self.model)
    model_to_save.save_pretrained(output_dir)

    # Keep the training arguments next to the checkpoint.
    args_path = os.path.join(output_dir, 'training_args.bin')
    torch.save(self.args, args_path)
    logger.info("Saving model checkpoint to %s", output_dir)

load_model

模型加载

def load_model(self):
    """Reload the model from ``self.args.model_dir`` and move it to ``self.device``.

    Raises:
        Exception: If ``self.args.model_dir`` does not exist, or if loading
            fails. In the latter case the underlying error is chained as
            ``__cause__`` so the real reason stays visible in the traceback.
    """
    # Check whether model exists
    if not os.path.exists(self.args.model_dir):
        raise Exception("Model doesn't exists! Train first!")

    try:
        self.model = self.model_class.from_pretrained(self.args.model_dir,
                                                      args=self.args,
                                                      label_lst=self.label_lst)
        self.model.to(self.device)
    except Exception as err:
        # Catch Exception rather than everything, so KeyboardInterrupt and
        # SystemExit still propagate, and keep the original error attached.
        raise Exception("Some model files might be missing...") from err
    else:
        logger.info("***** Model Loaded *****")

save_results

pandas生成csv文件,记录实验结果,保存效果和参数

def save_results(self):
    """Append the latest test metrics and key run settings to ``results.csv``.

    One row is written per call into ``<self.args.results_dir>/results.csv``.
    The row holds the entries of ``self.test_results`` followed by the task,
    learning rate, epoch count, max sequence length and seed taken from
    ``self.args``. The directory and file are created when missing. The full
    table is printed afterwards.
    """
    os.makedirs(self.args.results_dir, exist_ok=True)

    run_settings = {
        'task': self.args.task,
        'lr': self.args.learning_rate,
        'epoch': self.args.num_train_epochs,
        'max_len': self.args.max_seq_len,
        'seed': self.args.seed,
    }
    results = dict(self.test_results, **run_settings)
    table = pd.DataFrame([results])

    results_path = os.path.join(self.args.results_dir, 'results.csv')
    if os.path.exists(results_path):
        # DataFrame.append was removed in pandas 2.0; pd.concat is the
        # supported way to add the new row to the existing history.
        history = pd.read_csv(results_path)
        table = pd.concat([history, table], ignore_index=True)
    table.to_csv(results_path, index=False)

    data_diagram = pd.read_csv(results_path)

    print('test_results', data_diagram)

效果如下:

image.png

NLP萌新,才疏学浅,有错误或者不完善的地方,请批评指正!!