PreTrainedTokenizerFast.from_pretrained


The SFT (supervised fine-tuning) setup below starts from the imports, the data and model paths, and loading the dataset and the tokenizer via `PreTrainedTokenizerFast.from_pretrained`:

```python
import time
import pandas as pd
import numpy as np
import torch
from datasets import load_dataset
from transformers import (PreTrainedTokenizerFast, PhiForCausalLM,
                          TrainingArguments, Trainer, TrainerCallback)
from trl import DataCollatorForCompletionOnlyLM

# 1. Define the paths of the training data, the tokenizer and the pre-trained model,
#    and the maximum sequence length
sft_file = './data/sft_train_data.parquet'
tokenizer_dir = './model_save/tokenizer/'
sft_from_checkpoint_file = './model_save/pre/'
model_save_dir = './model_save/sft/'
max_seq_len = 512

# 2. Load the training dataset
dataset = load_dataset(path='parquet', data_files=sft_file, split='train', cache_dir='.cache')
tokenizer = PreTrainedTokenizerFast.from_pretrained(tokenizer_dir)
print(f"vocab size: {len(tokenizer)}")

# 2.1 Define the instruction strings used by the SFT data_collator.
# Alternatively, instruction_template_ids and response_template_ids could be added to
# input_ids manually, because a byte-level tokenizer may merge ':' with the character
# that follows it, so instruction_template_ids / response_template_ids would not be
# found inside input_ids.
# As done below, this can also be solved by manually adding '\n' before and after '#' and ':'.
instruction_template = "##提问:"
response_template = "##回答:"
```
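To make the merge issue concrete, here is a small illustrative check (not from the original post), assuming the `tokenizer` and the two templates defined above: tokenize the response template on its own and inside a formatted sample, and test whether its ids appear as a contiguous run.

```python
# Illustrative check (not in the original script): with a byte-level BPE tokenizer,
# ':' can be merged with the character that follows it, so the template's token ids
# may not appear verbatim inside a tokenized sample.
template_ids = tokenizer.encode(response_template, add_special_tokens=False)
sample = f"{instruction_template}\n你好\n{response_template}\n你好"
sample_ids = tokenizer.encode(sample, add_special_tokens=False)

def contains(haystack: list[int], needle: list[int]) -> bool:
    return any(haystack[i:i + len(needle)] == needle
               for i in range(len(haystack) - len(needle) + 1))

# With the '\n' separators used in this post the template should be found.
print(template_ids, contains(sample_ids, template_ids))
```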

Each sample is formatted as instruction text plus response text and tokenized in batches, storing the ids in the smallest integer dtype that fits the vocabulary:

```python
map_dtype = np.uint16 if len(tokenizer) < 65535 else np.uint32

def batched_formatting_prompts_func(example: dict) -> dict:
    batch_txt = []
    for i in range(len(example['instruction'])):
        text = f"{instruction_template}\n{example['instruction'][i]}\n{response_template}\n{example['output'][i]}[EOS]"
        batch_txt.append(text)
    outputs = tokenizer(batch_txt, return_attention_mask=False)
    input_ids = [np.array(item, dtype=map_dtype) for item in outputs["input_ids"]]
    return {"input_ids": input_ids}
```
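The excerpt never shows the formatting function being applied to the dataset. A minimal sketch of that step, assuming the standard batched `Dataset.map` call; dropping the original columns so only `input_ids` remains is an assumption:

```python
# Sketch: tokenize the whole dataset with the batched formatting function.
dataset = dataset.map(
    batched_formatting_prompts_func,
    batched=True,
    remove_columns=dataset.column_names,  # keep only the new 'input_ids' column
)
print(dataset)
```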

```python
# 2.2 Define the data_collator
data_collator = DataCollatorForCompletionOnlyLM(
    instruction_template=instruction_template,
    response_template=response_template,
    tokenizer=tokenizer,
    mlm=False
)

empty_cuda_cahce = EmptyCudaCacheCallback()  # callback used during training
my_datasets = dataset.train_test_split(test_size=4096)
```
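`EmptyCudaCacheCallback` is instantiated above but never defined in this excerpt. A minimal sketch of what such a callback could look like, assuming it simply frees cached CUDA memory every few logging steps; the class name comes from the post, the body is an assumption:

```python
class EmptyCudaCacheCallback(TrainerCallback):
    """Assumed implementation: periodically release cached CUDA memory during training."""
    log_cnt = 0

    def on_log(self, args, state, control, logs=None, **kwargs):
        self.log_cnt += 1
        if self.log_cnt % 5 == 0:
            # Free cached blocks to reduce the chance of fragmentation-related OOM.
            torch.cuda.empty_cache()
```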

```python
# 5. Define the training arguments
model = PhiForCausalLM.from_pretrained(sft_from_checkpoint_file)

args = TrainingArguments(
    output_dir=model_save_dir,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=8,
    num_train_epochs=3,
    weight_decay=0.1,
    warmup_steps=1000,
    learning_rate=5e-5,
)
```
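The `TrainingArguments` call is cut off in the source after `learning_rate=5e-5`; evaluation, saving and logging options would typically follow. The final step of wiring everything into a `Trainer` is also missing from the excerpt, so here is a minimal sketch assuming the names defined above (the eval split key and the closing save call are assumptions):

```python
# Sketch: assemble the Trainer from the pieces defined above and run SFT.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=my_datasets['train'],
    eval_dataset=my_datasets['test'],
    data_collator=data_collator,
    callbacks=[empty_cuda_cahce],
)

trainer.train()
trainer.save_model(model_save_dir)
```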