Transformers Source Code Analysis (136)
.\trainer_callback.py
"""
Callbacks to use with the Trainer class and customize the training loop.
"""
import dataclasses
import json
from dataclasses import dataclass
from typing import Dict, List, Optional, Union
import numpy as np
from tqdm.auto import tqdm
from .trainer_utils import IntervalStrategy, has_length
from .training_args import TrainingArguments
from .utils import logging
logger = logging.get_logger(__name__)
@dataclass
class TrainerState:
"""
A class containing the [`Trainer`] inner state that will be saved along the model and optimizer when checkpointing
and passed to the [`TrainerCallback`].
<Tip>
In all this class, one step is to be understood as one update step. When using gradient accumulation, one update
step may require several forward and backward passes: if you use `gradient_accumulation_steps=n`, then one update
step requires going through *n* batches.
</Tip>
"""
epoch: Optional[float] = None
global_step: int = 0
max_steps: int = 0
logging_steps: int = 500
eval_steps: int = 500
save_steps: int = 500
train_batch_size: int = None
num_input_tokens_seen: int = 0
total_flos: float = 0
log_history: List[Dict[str, float]] = None
best_metric: Optional[float] = None
best_model_checkpoint: Optional[str] = None
is_local_process_zero: bool = True
is_world_process_zero: bool = True
is_hyper_param_search: bool = False
trial_name: str = None
trial_params: Dict[str, Union[str, float, int, bool]] = None
def __post_init__(self):
if self.log_history is None:
self.log_history = []
def save_to_json(self, json_path: str):
"""将实例内容以 JSON 格式保存到指定的 `json_path` 文件中。"""
json_string = json.dumps(dataclasses.asdict(self), indent=2, sort_keys=True) + "\n"
with open(json_path, "w", encoding="utf-8") as f:
f.write(json_string)
@classmethod
def load_from_json(cls, json_path: str):
"""从指定的 `json_path` 文件加载内容并创建一个类实例。"""
with open(json_path, "r", encoding="utf-8") as f:
text = f.read()
return cls(**json.loads(text))
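Not part of the original file: a minimal sketch of how the two helpers above round-trip a `TrainerState` through JSON (the file name is arbitrary).
```
from transformers.trainer_callback import TrainerState

state = TrainerState(global_step=10, max_steps=100)
state.log_history.append({"loss": 0.5, "step": 10})
state.save_to_json("trainer_state.json")            # dump the dataclass as JSON

restored = TrainerState.load_from_json("trainer_state.json")
assert restored.global_step == 10
assert restored.log_history[0]["loss"] == 0.5
```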
@dataclass
class TrainerControl:
"""
A class that handles the [`Trainer`] control flow. This class is used by the [`TrainerCallback`] to activate some
switches in the training loop.
Args:
should_training_stop (`bool`, *optional*, defaults to `False`):
Whether or not the training should be interrupted.
If `True`, this variable will not be set back to `False`. The training will just stop.
should_epoch_stop (`bool`, *optional*, defaults to `False`):
Whether or not the current epoch should be interrupted.
If `True`, this variable will be set back to `False` at the beginning of the next epoch.
should_save (`bool`, *optional*, defaults to `False`):
Whether or not the model should be saved at this step.
If `True`, this variable will be set back to `False` at the beginning of the next step.
should_evaluate (`bool`, *optional*, defaults to `False`):
Whether or not the model should be evaluated at this step.
If `True`, this variable will be set back to `False` at the beginning of the next step.
should_log (`bool`, *optional*, defaults to `False`):
Whether or not the logs should be reported at this step.
If `True`, this variable will be set back to `False` at the beginning of the next step.
"""
should_training_stop: bool = False
should_epoch_stop: bool = False
should_save: bool = False
should_evaluate: bool = False
should_log: bool = False
def _new_training(self):
"""Internal method that resets the variable for a new training."""
self.should_training_stop = False
def _new_epoch(self):
"""Internal method that resets the variable for a new epoch."""
self.should_epoch_stop = False
def _new_step(self):
"""Internal method that resets the variable for a new step."""
self.should_save = False
self.should_evaluate = False
self.should_log = False
class TrainerCallback:
"""
A class for objects that will inspect the state of the training loop at some events and take some decisions. At
each of those events the following arguments are available:
"""
Args:
args ([`TrainingArguments`]):
用于实例化 [`Trainer`] 的训练参数。
state ([`TrainerState`]):
[`Trainer`] 的当前状态。
control ([`TrainerControl`]):
返回给 [`Trainer`] 的对象,用于做出某些决策。
model ([`PreTrainedModel`] or `torch.nn.Module`):
正在训练的模型。
tokenizer ([`PreTrainedTokenizer`]):
用于对数据进行编码的分词器。
optimizer (`torch.optim.Optimizer`):
训练步骤中使用的优化器。
lr_scheduler (`torch.optim.lr_scheduler.LambdaLR`):
用于设置学习率的调度器。
train_dataloader (`torch.utils.data.DataLoader`, *optional*):
用于训练的当前数据加载器。
eval_dataloader (`torch.utils.data.DataLoader`, *optional*):
用于评估的当前数据加载器。
metrics (`Dict[str, float]`):
上次评估阶段计算的指标。
只能在 `on_evaluate` 事件中访问。
logs (`Dict[str, float]`):
要记录的值。
只能在 `on_log` 事件中访问。
`control` 对象是唯一可以被回调函数更改的对象,在更改它的事件中应返回修改后的版本。
参数 `args`, `state` 和 `control` 对于所有事件都是位置参数,其余的参数在 `kwargs` 中分组。
您可以在事件的签名中解包需要使用的参数。例如,查看简单 [`~transformers.PrinterCallback`] 的代码示例。
示例:
```
class PrinterCallback(TrainerCallback):
def on_log(self, args, state, control, logs=None, **kwargs):
_ = logs.pop("total_flos", None)
if state.is_local_process_zero:
print(logs)
```
def on_epoch_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
"""
Event called at the end of an epoch.
"""
pass
def on_step_begin(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
"""
Event called at the beginning of a training step. If using gradient accumulation, one training step might take
several inputs.
"""
pass
def on_substep_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
"""
Event called at the end of a substep during gradient accumulation.
"""
pass
def on_step_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
"""
Event called at the end of a training step. If using gradient accumulation, one training step might take
several inputs.
"""
pass
def on_evaluate(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
"""
Event called after an evaluation phase.
"""
pass
def on_predict(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, metrics, **kwargs):
"""
Event called after a successful prediction.
"""
pass
def on_save(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
"""
Event called after a checkpoint save.
"""
pass
def on_log(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
"""
Event called after logging the last logs.
"""
pass
def on_prediction_step(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
"""
Event called after a prediction step.
"""
pass
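To make the interplay between `TrainerCallback` and `TrainerControl` concrete, here is a small illustrative callback (not from the library) that flips `control.should_save` whenever the logged loss falls below a hypothetical threshold; the `Trainer` resets the flag at the next step, as described in the `TrainerControl` docstring.
```
from transformers import TrainerCallback

class SaveOnLowLossCallback(TrainerCallback):
    """Hypothetical example: ask the Trainer to checkpoint when the logged loss is low enough."""

    def __init__(self, threshold=0.1):
        self.threshold = threshold

    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs is not None and logs.get("loss", float("inf")) < self.threshold:
            control.should_save = True  # reset to False by the Trainer at the next step
        return control  # returning the (possibly modified) control object is how changes propagate
```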
class CallbackHandler(TrainerCallback):
"""Internal class that just calls the list of callbacks in order."""
def __init__(self, callbacks, model, tokenizer, optimizer, lr_scheduler):
self.callbacks = []
for cb in callbacks:
self.add_callback(cb)
self.model = model
self.tokenizer = tokenizer
self.optimizer = optimizer
self.lr_scheduler = lr_scheduler
self.train_dataloader = None
self.eval_dataloader = None
if not any(isinstance(cb, DefaultFlowCallback) for cb in self.callbacks):
logger.warning(
"The Trainer will not work properly if you don't have a `DefaultFlowCallback` in its callbacks. You\n"
+ "should add one before training with `trainer.add_callback(DefaultFlowCallback). The current list of"
+ "callbacks is\n:"
+ self.callback_list
)
def add_callback(self, callback):
cb = callback() if isinstance(callback, type) else callback
cb_class = callback if isinstance(callback, type) else callback.__class__
if cb_class in [c.__class__ for c in self.callbacks]:
logger.warning(
f"You are adding a {cb_class} to the callbacks of this Trainer, but there is already one. The current"
+ "list of callbacks is\n:"
+ self.callback_list
)
self.callbacks.append(cb)
def pop_callback(self, callback):
if isinstance(callback, type):
for cb in self.callbacks:
if isinstance(cb, callback):
self.callbacks.remove(cb)
return cb
else:
for cb in self.callbacks:
if cb == callback:
self.callbacks.remove(cb)
return cb
def remove_callback(self, callback):
if isinstance(callback, type):
for cb in self.callbacks:
if isinstance(cb, callback):
self.callbacks.remove(cb)
return
else:
self.callbacks.remove(callback)
@property
def callback_list(self):
return "\n".join(cb.__class__.__name__ for cb in self.callbacks)
def on_init_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl):
return self.call_event("on_init_end", args, state, control)
def on_train_begin(self, args: TrainingArguments, state: TrainerState, control: TrainerControl):
control.should_training_stop = False
return self.call_event("on_train_begin", args, state, control)
def on_train_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl):
return self.call_event("on_train_end", args, state, control)
def on_epoch_begin(self, args: TrainingArguments, state: TrainerState, control: TrainerControl):
control.should_epoch_stop = False
return self.call_event("on_epoch_begin", args, state, control)
def on_epoch_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl):
return self.call_event("on_epoch_end", args, state, control)
def on_step_begin(self, args: TrainingArguments, state: TrainerState, control: TrainerControl):
control.should_log = False
control.should_evaluate = False
control.should_save = False
return self.call_event("on_step_begin", args, state, control)
def on_substep_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl):
return self.call_event("on_substep_end", args, state, control)
def on_step_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl):
return self.call_event("on_step_end", args, state, control)
def on_evaluate(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, metrics):
control.should_evaluate = False
return self.call_event("on_evaluate", args, state, control, metrics=metrics)
def on_predict(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, metrics):
return self.call_event("on_predict", args, state, control, metrics=metrics)
def on_save(self, args: TrainingArguments, state: TrainerState, control: TrainerControl):
control.should_save = False
return self.call_event("on_save", args, state, control)
def on_log(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, logs):
control.should_log = False
return self.call_event("on_log", args, state, control, logs=logs)
def on_prediction_step(self, args: TrainingArguments, state: TrainerState, control: TrainerControl):
return self.call_event("on_prediction_step", args, state, control)
def call_event(self, event, args, state, control, **kwargs):
for callback in self.callbacks:
result = getattr(callback, event)(
args,
state,
control,
model=self.model,
tokenizer=self.tokenizer,
optimizer=self.optimizer,
lr_scheduler=self.lr_scheduler,
train_dataloader=self.train_dataloader,
eval_dataloader=self.eval_dataloader,
**kwargs,
)
if result is not None:
control = result
return control
"""
A [`TrainerCallback`] that handles the default flow of the training loop for logs, evaluation and checkpoints.
"""
def on_step_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
if state.global_step == 1 and args.logging_first_step:
control.should_log = True
if args.logging_strategy == IntervalStrategy.STEPS and state.global_step % state.logging_steps == 0:
control.should_log = True
if (
args.evaluation_strategy == IntervalStrategy.STEPS
and state.global_step % state.eval_steps == 0
and args.eval_delay <= state.global_step
):
control.should_evaluate = True
if (
args.save_strategy == IntervalStrategy.STEPS
and state.save_steps > 0
and state.global_step % state.save_steps == 0
):
control.should_save = True
if state.global_step >= state.max_steps:
control.should_training_stop = True
return control
def on_epoch_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
if args.logging_strategy == IntervalStrategy.EPOCH:
control.should_log = True
if args.evaluation_strategy == IntervalStrategy.EPOCH and args.eval_delay <= state.epoch:
control.should_evaluate = True
if args.save_strategy == IntervalStrategy.EPOCH:
control.should_save = True
return control
class ProgressCallback(TrainerCallback):
"""
A [`TrainerCallback`] that displays the progress of training or evaluation.
"""
def __init__(self):
self.training_bar = None
self.prediction_bar = None
def on_train_begin(self, args, state, control, **kwargs):
if state.is_world_process_zero:
self.training_bar = tqdm(total=state.max_steps, dynamic_ncols=True)
self.current_step = 0
def on_step_end(self, args, state, control, **kwargs):
if state.is_world_process_zero:
self.training_bar.update(state.global_step - self.current_step)
self.current_step = state.global_step
def on_prediction_step(self, args, state, control, eval_dataloader=None, **kwargs):
if state.is_world_process_zero and has_length(eval_dataloader):
if self.prediction_bar is None:
self.prediction_bar = tqdm(
total=len(eval_dataloader), leave=self.training_bar is None, dynamic_ncols=True
)
self.prediction_bar.update(1)
def on_evaluate(self, args, state, control, **kwargs):
if state.is_world_process_zero:
if self.prediction_bar is not None:
self.prediction_bar.close()
self.prediction_bar = None
def on_predict(self, args, state, control, **kwargs):
if state.is_world_process_zero:
if self.prediction_bar is not None:
self.prediction_bar.close()
self.prediction_bar = None
def on_log(self, args, state, control, logs=None, **kwargs):
if state.is_world_process_zero and self.training_bar is not None:
_ = logs.pop("total_flos", None)
self.training_bar.write(str(logs))
def on_train_end(self, args, state, control, **kwargs):
if state.is_world_process_zero:
self.training_bar.close()
self.training_bar = None
class PrinterCallback(TrainerCallback):
"""
A bare [`TrainerCallback`] that just prints the logs.
"""
def on_log(self, args, state, control, logs=None, **kwargs):
_ = logs.pop("total_flos", None)
if state.is_local_process_zero:
print(logs)
class EarlyStoppingCallback(TrainerCallback):
"""
A [`TrainerCallback`] that handles early stopping.
Args:
early_stopping_patience (`int`):
Use with `metric_for_best_model` to stop training when the specified metric worsens for
`early_stopping_patience` evaluation calls.
early_stopping_threshold (`float`, *optional*):
Use with TrainingArguments `metric_for_best_model` and `early_stopping_patience` to denote how much the
specified metric must improve to satisfy early stopping conditions.
This callback depends on [`TrainingArguments`] argument *load_best_model_at_end* functionality to set best_metric
in [`TrainerState`]. Note that if the [`TrainingArguments`] argument *save_steps* differs from *eval_steps*, the
early stopping will not occur until the next save step.
"""
def __init__(self, early_stopping_patience: int = 1, early_stopping_threshold: Optional[float] = 0.0):
self.early_stopping_patience = early_stopping_patience
self.early_stopping_threshold = early_stopping_threshold
self.early_stopping_patience_counter = 0
def check_metric_value(self, args, state, control, metric_value):
operator = np.greater if args.greater_is_better else np.less
if state.best_metric is None or (
operator(metric_value, state.best_metric)
and abs(metric_value - state.best_metric) > self.early_stopping_threshold
):
self.early_stopping_patience_counter = 0
else:
self.early_stopping_patience_counter += 1
def on_train_begin(self, args, state, control, **kwargs):
assert args.load_best_model_at_end, "EarlyStoppingCallback requires load_best_model_at_end = True"
assert args.metric_for_best_model is not None, "EarlyStoppingCallback requires metric_for_best_model is defined"
assert args.evaluation_strategy != IntervalStrategy.NO, "EarlyStoppingCallback requires IntervalStrategy of steps or epoch"
def on_evaluate(self, args, state, control, metrics, **kwargs):
metric_to_check = args.metric_for_best_model
if not metric_to_check.startswith("eval_"):
metric_to_check = f"eval_{metric_to_check}"
metric_value = metrics.get(metric_to_check)
if metric_value is None:
logger.warning(
f"early stopping required metric_for_best_model, but did not find {metric_to_check} so early stopping"
" is disabled"
)
return
self.check_metric_value(args, state, control, metric_value)
if self.early_stopping_patience_counter >= self.early_stopping_patience:
control.should_training_stop = True
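An illustrative way to wire `EarlyStoppingCallback` into a `Trainer`; `model`, `train_dataset` and `eval_dataset` are assumed to exist elsewhere, and all argument values are only examples.
```
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback

args = TrainingArguments(
    output_dir="out",
    evaluation_strategy="steps",
    eval_steps=500,
    save_steps=500,                      # if this differs from eval_steps, early stopping waits for the next save step
    load_best_model_at_end=True,         # required by EarlyStoppingCallback.on_train_begin
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3, early_stopping_threshold=0.0)],
)
trainer.train()
```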
.\trainer_pt_utils.py
"""
Torch utilities for the Trainer class.
"""
import copy
import datetime
import io
import json
import math
import os
import sys
import warnings
from collections.abc import Mapping
from contextlib import contextmanager
from dataclasses import dataclass, field
from logging import StreamHandler
from typing import Any, Dict, Iterator, List, Optional, Union
import numpy as np
import torch
import torch.distributed as dist
from torch import nn
from torch.optim.lr_scheduler import LRScheduler
from torch.utils.data import Dataset, IterableDataset, RandomSampler, Sampler
from torch.utils.data.distributed import DistributedSampler
from .integrations.deepspeed import is_deepspeed_zero3_enabled
from .tokenization_utils_base import BatchEncoding
from .utils import is_sagemaker_mp_enabled, is_torch_xla_available, is_training_run_on_sagemaker, logging
if is_training_run_on_sagemaker():
logging.add_handler(StreamHandler(sys.stdout))
if is_torch_xla_available():
import torch_xla.core.xla_model as xm
try:
from torch.optim.lr_scheduler import SAVE_STATE_WARNING
except ImportError:
SAVE_STATE_WARNING = ""
logger = logging.get_logger(__name__)
def get_dataloader_sampler(dataloader):
if hasattr(dataloader, "batch_sampler") and dataloader.batch_sampler is not None:
return get_dataloader_sampler(dataloader.batch_sampler)
elif hasattr(dataloader, "sampler"):
return dataloader.sampler
def atleast_1d(tensor_or_array: Union[torch.Tensor, np.ndarray]):
if isinstance(tensor_or_array, torch.Tensor):
if hasattr(torch, "atleast_1d"):
tensor_or_array = torch.atleast_1d(tensor_or_array)
elif tensor_or_array.ndim < 1:
tensor_or_array = tensor_or_array[None]
else:
tensor_or_array = np.atleast_1d(tensor_or_array)
return tensor_or_array
def torch_pad_and_concatenate(tensor1, tensor2, padding_index=-100):
"""连接 `tensor1` 和 `tensor2` 在第一轴上,如果需要在第二轴上进行填充。"""
tensor1 = atleast_1d(tensor1)
tensor2 = atleast_1d(tensor2)
if len(tensor1.shape) == 1 or tensor1.shape[1] == tensor2.shape[1]:
return torch.cat((tensor1, tensor2), dim=0)
new_shape = (tensor1.shape[0] + tensor2.shape[0], max(tensor1.shape[1], tensor2.shape[1])) + tensor1.shape[2:]
result = tensor1.new_full(new_shape, padding_index)
result[: tensor1.shape[0], : tensor1.shape[1]] = tensor1
result[tensor1.shape[0] :, : tensor2.shape[1]] = tensor2
return result
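A quick check (not from the file) of what `torch_pad_and_concatenate` does when the two tensors disagree on the second dimension.
```
import torch
from transformers.trainer_pt_utils import torch_pad_and_concatenate

a = torch.ones(2, 3)      # batch of 2, sequence length 3
b = torch.zeros(3, 5)     # batch of 3, sequence length 5
out = torch_pad_and_concatenate(a, b, padding_index=-100)
print(out.shape)          # torch.Size([5, 5]); rows from `a` are padded with -100 beyond column 3
```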
def numpy_pad_and_concatenate(array1, array2, padding_index=-100):
"""Concatenates `array1` and `array2` on first axis, applying padding on the second if necessary."""
array1 = atleast_1d(array1)
array2 = atleast_1d(array2)
if len(array1.shape) == 1 or array1.shape[1] == array2.shape[1]:
return np.concatenate((array1, array2), axis=0)
new_shape = (array1.shape[0] + array2.shape[0], max(array1.shape[1], array2.shape[1])) + array1.shape[2:]
result = np.full_like(array1, padding_index, shape=new_shape)
result[: array1.shape[0], : array1.shape[1]] = array1
result[array1.shape[0] :, : array2.shape[1]] = array2
return result
def nested_concat(tensors, new_tensors, padding_index=-100):
"""
Concat the `new_tensors` to `tensors` on the first dim and pad them on the second if needed. Works for tensors or
nested list/tuples/dict of tensors.
"""
assert type(tensors) == type(
new_tensors
), f"Expected `tensors` and `new_tensors` to have the same type but found {type(tensors)} and {type(new_tensors)}."
if isinstance(tensors, (list, tuple)):
return type(tensors)(nested_concat(t, n, padding_index=padding_index) for t, n in zip(tensors, new_tensors))
elif isinstance(tensors, torch.Tensor):
return torch_pad_and_concatenate(tensors, new_tensors, padding_index=padding_index)
elif isinstance(tensors, Mapping):
return type(tensors)({k: nested_concat(t, new_tensors[k], padding_index=padding_index) for k, t in tensors.items()})
elif isinstance(tensors, np.ndarray):
return numpy_pad_and_concatenate(tensors, new_tensors, padding_index=padding_index)
else:
raise TypeError(f"Unsupported type for concatenation: got {type(tensors)}")
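A small sketch showing `nested_concat` on a nested dict of tensors, the typical shape of per-step model outputs in the evaluation loop.
```
import torch
from transformers.trainer_pt_utils import nested_concat

batch1 = {"logits": torch.ones(2, 3), "labels": torch.zeros(2)}
batch2 = {"logits": torch.ones(1, 5), "labels": torch.zeros(1)}
merged = nested_concat(batch1, batch2, padding_index=-100)
print(merged["logits"].shape)  # torch.Size([3, 5]) -- padded on the second axis
print(merged["labels"].shape)  # torch.Size([3])
```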
def find_batch_size(tensors):
"""
Find the first dimension of a tensor in a nested list/tuple/dict of tensors.
"""
if isinstance(tensors, (list, tuple)):
for t in tensors:
result = find_batch_size(t)
if result is not None:
return result
elif isinstance(tensors, Mapping):
for key, value in tensors.items():
result = find_batch_size(value)
if result is not None:
return result
elif isinstance(tensors, torch.Tensor):
return tensors.shape[0] if len(tensors.shape) >= 1 else None
elif isinstance(tensors, np.ndarray):
return tensors.shape[0] if len(tensors.shape) >= 1 else None
def nested_numpify(tensors):
"Numpify `tensors` (even if it's a nested list/tuple/dict of tensors)."
if isinstance(tensors, (list, tuple)):
return type(tensors)(nested_numpify(t) for t in tensors)
if isinstance(tensors, Mapping):
return type(tensors)({k: nested_numpify(t) for k, t in tensors.items()})
t = tensors.cpu()
if t.dtype == torch.bfloat16:
t = t.to(torch.float32)
return t.numpy()
def nested_detach(tensors):
if isinstance(tensors, (list, tuple)):
return type(tensors)(nested_detach(t) for t in tensors)
elif isinstance(tensors, Mapping):
return type(tensors)({k: nested_detach(t) for k, t in tensors.items()})
return tensors.detach()
def nested_xla_mesh_reduce(tensors, name):
if is_torch_xla_available():
import torch_xla.core.xla_model as xm
if isinstance(tensors, (list, tuple)):
return type(tensors)(nested_xla_mesh_reduce(t, f"{name}_{i}") for i, t in enumerate(tensors))
if isinstance(tensors, Mapping):
return type(tensors)(
{k: nested_xla_mesh_reduce(t, f"{name}_{i}") for i, (k, t) in enumerate(tensors.items())}
)
tensors = atleast_1d(tensors)
return xm.mesh_reduce(name, tensors, torch.cat)
else:
raise ImportError("Torch xla must be installed to use `nested_xla_mesh_reduce`")
def distributed_concat(tensor: Any, num_total_examples: Optional[int] = None) -> Any:
try:
if isinstance(tensor, (tuple, list)):
return type(tensor)(distributed_concat(t, num_total_examples) for t in tensor)
if isinstance(tensor, Mapping):
return type(tensor)({k: distributed_concat(t, num_total_examples) for k, t in tensor.items()})
tensor = atleast_1d(tensor).contiguous()
output_tensors = [tensor.clone() for _ in range(dist.get_world_size())]
dist.all_gather(output_tensors, tensor)
concat = torch.cat(output_tensors, dim=0)
if num_total_examples is not None:
concat = concat[:num_total_examples]
return concat
except AssertionError:
raise AssertionError("Not currently using distributed training")
def distributed_broadcast_scalars(
scalars: List[Union[int, float]],
num_total_examples: Optional[int] = None,
device: Optional[torch.device] = torch.device("cuda"),
) -> torch.Tensor:
try:
tensorized_scalar = torch.tensor(scalars).to(device)
output_tensors = [tensorized_scalar.clone() for _ in range(dist.get_world_size())]
dist.all_gather(output_tensors, tensorized_scalar)
concat = torch.cat(output_tensors, dim=0)
if num_total_examples is not None:
concat = concat[:num_total_examples]
return concat
except AssertionError:
raise AssertionError("Not currently using distributed training")
def reissue_pt_warnings(caught_warnings):
if len(caught_warnings) > 1:
for w in caught_warnings:
if w.category != UserWarning or w.message != SAVE_STATE_WARNING:
warnings.warn(w.message, w.category)
@contextmanager
def torch_distributed_zero_first(local_rank: int):
"""
Decorator to make all processes in distributed training wait for each local master to do something.
Args:
local_rank (`int`): The rank of the local process.
"""
if local_rank not in [-1, 0]:
dist.barrier()
yield
if local_rank == 0:
dist.barrier()
class DistributedSamplerWithLoop(DistributedSampler):
"""
Like a `torch.utils.data.distributed.DistributedSampler` but loops at the end back to the beginning of the
shuffled samples to make each process have a round multiple of batch_size samples.
Args:
dataset (`torch.utils.data.Dataset`):
Dataset used for sampling.
batch_size (`int`):
The batch size used with this sampler.
kwargs (`Dict[str, Any]`, *optional*):
All other keyword arguments passed to `DistributedSampler`.
"""
def __init__(self, dataset, batch_size, **kwargs):
super().__init__(dataset, **kwargs)
self.batch_size = batch_size
def __iter__(self):
indices = list(super().__iter__())
remainder = 0 if len(indices) % self.batch_size == 0 else self.batch_size - len(indices) % self.batch_size
start_remainder = 1 if self.rank < len(self.dataset) % self.num_replicas else 0
indices += indices[start_remainder : start_remainder + remainder]
return iter(indices)
class SequentialDistributedSampler(Sampler):
"""
Distributed Sampler that subsamples indices sequentially, making it easier to collate all results at the end.
Even though we only use this sampler for eval and predict (no training), which means that the model params won't
have to be synced (i.e. will not hang for synchronization even if varied number of forward passes), we still add
extra samples to the sampler to make it evenly divisible, so the resulting tensors can easily be `gather`ed or
`reduce`d at the end of the loop.
"""
def __init__(self, dataset, num_replicas=None, rank=None, batch_size=None):
warnings.warn(
"SequentialDistributedSampler is deprecated and will be removed in v5 of Transformers.",
FutureWarning,
)
if num_replicas is None:
if not dist.is_available():
raise RuntimeError("Requires distributed package to be available")
num_replicas = dist.get_world_size()
if rank is None:
if not dist.is_available():
raise RuntimeError("Requires distributed package to be available")
rank = dist.get_rank()
self.dataset = dataset
self.num_replicas = num_replicas
self.rank = rank
num_samples = len(self.dataset)
if batch_size is not None:
self.num_samples = int(math.ceil(num_samples / (batch_size * num_replicas))) * batch_size
else:
self.num_samples = int(math.ceil(num_samples / num_replicas))
self.total_size = self.num_samples * self.num_replicas
self.batch_size = batch_size
def __iter__(self):
indices = list(range(len(self.dataset)))
indices += indices[: (self.total_size - len(indices))]
assert (
len(indices) == self.total_size
), f"Indices length {len(indices)} and total size {self.total_size} mismatched"
indices = indices[self.rank * self.num_samples : (self.rank + 1) * self.num_samples]
assert (
len(indices) == self.num_samples
), f"Indices length {len(indices)} and sample number {self.num_samples} mismatched"
return iter(indices)
def __len__(self):
return self.num_samples
def get_tpu_sampler(dataset: torch.utils.data.Dataset, batch_size: int):
if xm.xrt_world_size() <= 1:
return RandomSampler(dataset)
return DistributedSampler(dataset, num_replicas=xm.xrt_world_size(), rank=xm.get_ordinal())
def nested_new_like(arrays, num_samples, padding_index=-100):
if isinstance(arrays, (list, tuple)):
return type(arrays)(nested_new_like(x, num_samples) for x in arrays)
return np.full_like(arrays, padding_index, shape=(num_samples, *arrays.shape[1:]))
def expand_like(arrays, new_seq_length, padding_index=-100):
result = np.full_like(arrays, padding_index, shape=(arrays.shape[0], new_seq_length) + arrays.shape[2:])
result[:, : arrays.shape[1]] = arrays
return result
def nested_truncate(tensors, limit):
if isinstance(tensors, (list, tuple)):
return type(tensors)(nested_truncate(t, limit) for t in tensors)
if isinstance(tensors, Mapping):
return type(tensors)({k: nested_truncate(t, limit) for k, t in tensors.items()})
return tensors[:limit]
class DistributedTensorGatherer:
"""
A class responsible for properly gathering tensors (or nested list/tuple of tensors) on the CPU by chunks.
If our dataset has 16 samples with a batch size of 2 on 3 processes and we gather then transfer on CPU at every
step, our sampler will generate the following indices:
`[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1]`
to get something of a size that is a multiple of 3 (so that each process gets the same dataset length). Then
process 0, 1 and 2 will be responsible for making predictions on the following samples:
- P0: `[0, 1, 2, 3, 4, 5]`
- P1: `[6, 7, 8, 9, 10, 11]`
- P2: `[12, 13, 14, 15, 0, 1]`
The first batch treated on each process will be:
- P0: `[0, 1]`
- P1: `[6, 7]`
- P2: `[12, 13]`
So if we gather at the end of the first batch, we will get a tensor (or nested list/tuple of tensors) corresponding
to the following indices:
`[0, 1, 6, 7, 12, 13]`
If we directly concatenate our results without taking any precautions, the user will then get predictions for the
indices in this order at the end of the prediction loop:
`[0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1]`
which is not what they expect. This class is there to solve that problem.
Args:
world_size (`int`):
The number of processes used in the distributed training.
num_samples (`int`):
The number of samples in our dataset.
make_multiple_of (`int`, *optional*):
If passed, the size of the dataset handled by each process should be a multiple of this argument (by adding
samples).
padding_index (`int`, *optional*, defaults to -100):
The padding index to use if the arrays don't all have the same sequence length.
"""
# Constructor: sets up the distributed tensor gatherer object.
def __init__(self, world_size, num_samples, make_multiple_of=None, padding_index=-100):
warnings.warn(
"DistributedTensorGatherer is deprecated and will be removed in v5 of Transformers.",
FutureWarning,
)
self.world_size = world_size
self.num_samples = num_samples
total_size = world_size if make_multiple_of is None else world_size * make_multiple_of
self.total_samples = int(np.ceil(num_samples / total_size)) * total_size
self.process_length = self.total_samples // world_size
self._storage = None
self._offsets = None
self.padding_index = padding_index
"""
添加数组到内部存储,如果是第一次添加数组,则初始化存储到完整大小,以便在开始时发生内存溢出。
"""
def add_arrays(self, arrays):
if arrays is None:
return
if self._storage is None:
self._storage = nested_new_like(arrays, self.total_samples, padding_index=self.padding_index)
self._offsets = list(range(0, self.total_samples, self.process_length))
slice_len, self._storage = self._nested_set_tensors(self._storage, arrays)
for i in range(self.world_size):
self._offsets[i] += slice_len
def _nested_set_tensors(self, storage, arrays):
if isinstance(arrays, (list, tuple)):
result = [self._nested_set_tensors(x, y) for x, y in zip(storage, arrays)]
return result[0][0], type(arrays)(r[1] for r in result)
assert (
arrays.shape[0] % self.world_size == 0
), f"Arrays passed should all have a first dimension multiple of {self.world_size}, found {arrays.shape[0]}."
slice_len = arrays.shape[0] // self.world_size
for i in range(self.world_size):
if len(arrays.shape) == 1:
storage[self._offsets[i] : self._offsets[i] + slice_len] = arrays[i * slice_len : (i + 1) * slice_len]
else:
if len(storage.shape) > 1 and storage.shape[1] < arrays.shape[1]:
storage = expand_like(storage, arrays.shape[1], padding_index=self.padding_index)
storage[self._offsets[i] : self._offsets[i] + slice_len, : arrays.shape[1]] = arrays[
i * slice_len : (i + 1) * slice_len
]
return slice_len, storage
def finalize(self):
"""
Return the properly gathered arrays and truncate to the number of samples (since the sampler added some extras
to get each process a dataset of the same length).
"""
if self._storage is None:
return
if self._offsets[0] != self.process_length:
logger.warning("Not all data has been set. Are you sure you passed all values?")
return nested_truncate(self._storage, self.num_samples)
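A single-process sketch that mirrors the docstring's 3-process / 16-sample example: per-step gathered arrays of 6 rows are fed in, and `finalize` truncates the extra samples added by the sampler. The array contents are dummies.
```
import numpy as np
from transformers.trainer_pt_utils import DistributedTensorGatherer

gatherer = DistributedTensorGatherer(world_size=3, num_samples=16)
for _ in range(3):                       # three eval steps, each gathering 2 rows from each of the 3 processes
    gatherer.add_arrays(np.zeros((6, 4)))
predictions = gatherer.finalize()
print(predictions.shape)                 # (16, 4): the samples added for divisibility are dropped
```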
@dataclass
class LabelSmoother:
"""
Adds label-smoothing on a pre-computed output from a Transformers model.
Args:
epsilon (`float`, *optional*, defaults to 0.1):
The label smoothing factor.
ignore_index (`int`, *optional*, defaults to -100):
The index in the labels to ignore when computing the loss.
"""
epsilon: float = 0.1
ignore_index: int = -100
def __call__(self, model_output, labels, shift_labels=False):
logits = model_output["logits"] if isinstance(model_output, dict) else model_output[0]
if shift_labels:
logits = logits[..., :-1, :].contiguous()
labels = labels[..., 1:].contiguous()
log_probs = -nn.functional.log_softmax(logits, dim=-1)
if labels.dim() == log_probs.dim() - 1:
labels = labels.unsqueeze(-1)
padding_mask = labels.eq(self.ignore_index)
labels = torch.clamp(labels, min=0)
nll_loss = log_probs.gather(dim=-1, index=labels)
smoothed_loss = log_probs.sum(dim=-1, keepdim=True, dtype=torch.float32)
nll_loss.masked_fill_(padding_mask, 0.0)
smoothed_loss.masked_fill_(padding_mask, 0.0)
num_active_elements = padding_mask.numel() - padding_mask.long().sum()
nll_loss = nll_loss.sum() / num_active_elements
smoothed_loss = smoothed_loss.sum() / (num_active_elements * log_probs.shape[-1])
return (1 - self.epsilon) * nll_loss + self.epsilon * smoothed_loss
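A minimal sketch of `LabelSmoother` on fake logits and labels (nothing here comes from a real model); positions equal to `ignore_index` are masked out of both loss terms.
```
import torch
from transformers.trainer_pt_utils import LabelSmoother

smoother = LabelSmoother(epsilon=0.1)
logits = torch.randn(2, 5, 10)            # (batch, seq, vocab)
labels = torch.randint(0, 10, (2, 5))
labels[0, -1] = -100                      # ignored position
loss = smoother({"logits": logits}, labels)
print(loss)                               # scalar: 0.9 * NLL + 0.1 * uniform smoothing term
```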
def get_length_grouped_indices(lengths, batch_size, mega_batch_mult=None, generator=None):
"""
Return a list of indices so that each slice of `batch_size` consecutive indices correspond to elements of similar
lengths. To do this, the indices are:
- randomly permuted
- grouped in mega-batches of size `mega_batch_mult * batch_size`
- sorted by length in each mega-batch
The result is the concatenation of all mega-batches, with the batch of `batch_size` containing the element of
maximum length placed first, so that an OOM happens sooner rather than later.
"""
if mega_batch_mult is None:
mega_batch_mult = min(len(lengths) // (batch_size * 4), 50)
if mega_batch_mult == 0:
mega_batch_mult = 1
indices = torch.randperm(len(lengths), generator=generator)
megabatch_size = mega_batch_mult * batch_size
megabatches = [indices[i : i + megabatch_size].tolist() for i in range(0, len(lengths), megabatch_size)]
megabatches = [sorted(megabatch, key=lambda i: lengths[i], reverse=True) for megabatch in megabatches]
megabatch_maximums = [lengths[megabatch[0]] for megabatch in megabatches]
max_idx = torch.argmax(torch.tensor(megabatch_maximums)).item()
megabatches[0][0], megabatches[max_idx][0] = megabatches[max_idx][0], megabatches[0][0]
return [i for megabatch in megabatches for i in megabatch]
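A toy run of `get_length_grouped_indices` (values are arbitrary): with `mega_batch_mult=2` and `batch_size=2`, indices are shuffled, split into mega-batches of 4, sorted by length inside each mega-batch, and the overall longest element is swapped into the first batch.
```
import torch
from transformers.trainer_pt_utils import get_length_grouped_indices

lengths = [5, 1, 9, 3, 7, 2, 8, 4]
g = torch.Generator().manual_seed(0)
indices = get_length_grouped_indices(lengths, batch_size=2, mega_batch_mult=2, generator=g)
print([lengths[i] for i in indices])  # consecutive pairs have roughly similar lengths; 9 lands in the first batch
```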
class LengthGroupedSampler(Sampler):
r"""
Sampler that samples indices in a way that groups together features of the dataset of roughly the same length while
keeping a bit of randomness.
"""
def __init__(
self,
batch_size: int,
dataset: Optional[Dataset] = None,
lengths: Optional[List[int]] = None,
model_input_name: Optional[str] = None,
generator=None,
):
if dataset is None and lengths is None:
raise ValueError("One of dataset and lengths must be provided.")
self.batch_size = batch_size
if lengths is None:
model_input_name = model_input_name if model_input_name is not None else "input_ids"
if (
not (isinstance(dataset[0], dict) or isinstance(dataset[0], BatchEncoding))
or model_input_name not in dataset[0]
):
raise ValueError(
"Can only automatically infer lengths for datasets whose items are dictionaries with an "
f"'{model_input_name}' key."
)
lengths = [len(feature[model_input_name]) for feature in dataset]
elif isinstance(lengths, torch.Tensor):
logger.info(
"If lengths is a torch.Tensor, LengthGroupedSampler will be slow. Converting lengths to List[int]..."
)
lengths = lengths.tolist()
self.lengths = lengths
self.generator = generator
def __len__(self):
return len(self.lengths)
def __iter__(self):
indices = get_length_grouped_indices(self.lengths, self.batch_size, generator=self.generator)
return iter(indices)
class DistributedLengthGroupedSampler(DistributedSampler):
r"""
Distributed Sampler that samples indices in a way that groups together features of the dataset of roughly the same
length while keeping a bit of randomness.
"""
def __init__(
self,
batch_size: int,
dataset: Optional[Dataset] = None,
num_replicas: Optional[int] = None,
rank: Optional[int] = None,
seed: int = 0,
drop_last: bool = False,
lengths: Optional[List[int]] = None,
model_input_name: Optional[str] = None,
generator=None,
):
if dataset is None and lengths is None:
raise ValueError("One of dataset and lengths must be provided.")
if num_replicas is None:
if not dist.is_available():
raise RuntimeError("Requires distributed package to be available")
num_replicas = dist.get_world_size()
if rank is None:
if not dist.is_available():
raise RuntimeError("Requires distributed package to be available")
rank = dist.get_rank()
self.batch_size = batch_size
self.num_replicas = num_replicas
self.rank = rank
self.epoch = 0
self.drop_last = drop_last
if lengths is None:
model_input_name = model_input_name if model_input_name is not None else "input_ids"
if (
not (isinstance(dataset[0], dict) or isinstance(dataset[0], BatchEncoding))
or model_input_name not in dataset[0]
):
raise ValueError(
"Can only automatically infer lengths for datasets whose items are dictionaries with an "
f"'{model_input_name}' key."
)
lengths = [len(feature[model_input_name]) for feature in dataset]
elif isinstance(lengths, torch.Tensor):
logger.info(
"If lengths is a torch.Tensor, DistributedLengthGroupedSampler will be slow. Converting lengths to"
" List[int]..."
)
lengths = lengths.tolist()
self.lengths = lengths
if self.drop_last and len(self.lengths) % self.num_replicas != 0:
self.num_samples = math.ceil((len(self.lengths) - self.num_replicas) / self.num_replicas)
else:
self.num_samples = math.ceil(len(self.lengths) / self.num_replicas)
self.total_size = self.num_samples * self.num_replicas
self.seed = seed
def __iter__(self) -> Iterator:
g = torch.Generator()
g.manual_seed(self.seed + self.epoch)
indices = get_length_grouped_indices(self.lengths, self.batch_size, generator=g)
if not self.drop_last:
indices += indices[: (self.total_size - len(indices))]
else:
indices = indices[: self.total_size]
assert len(indices) == self.total_size
indices = indices[self.rank : self.total_size : self.num_replicas]
assert len(indices) == self.num_samples
return iter(indices)
class ShardSampler(Sampler):
"""
Sampler that shards batches between several processes. Dispatches indices batch by batch: on 2 processes with batch
size 4, the first two batches are `[0, 1, 2, 3, 4, 5, 6, 7]` and `[8, 9, 10, 11, 12, 13, 14, 15]`, which shard into
`[0, 1, 2, 3]` and `[8, 9, 10, 11]` for GPU-0 and `[4, 5, 6, 7]` and `[12, 13, 14, 15]` for GPU-1.
The sampler thus yields `[0, 1, 2, 3, 8, 9, 10, 11]` on GPU-0 and `[4, 5, 6, 7, 12, 13, 14, 15]` on GPU-1.
"""
def __init__(
self,
dataset: Dataset,
batch_size: int = 1,
drop_last: bool = False,
num_processes: int = 1,
process_index: int = 0,
):
self.dataset = dataset
self.batch_size = batch_size
self.drop_last = drop_last
self.num_processes = num_processes
self.process_index = process_index
self.total_batch_size = total_batch_size = batch_size * num_processes
num_batches = len(dataset) // total_batch_size if drop_last else math.ceil(len(dataset) / total_batch_size)
self.total_num_samples = num_batches * total_batch_size
def __iter__(self):
indices = list(range(len(self.dataset)))
while len(indices) < self.total_num_samples:
indices += indices[: (self.total_num_samples - len(indices))]
result = []
for batch_start in range(self.batch_size * self.process_index, self.total_num_samples, self.total_batch_size):
result += indices[batch_start : batch_start + self.batch_size]
return iter(result)
def __len__(self):
return self.total_num_samples // self.num_processes
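A sketch reproducing the docstring's 2-process example with a plain list as the dataset (only `len()` is used by the sampler).
```
from transformers.trainer_pt_utils import ShardSampler

dataset = list(range(16))
shard0 = ShardSampler(dataset, batch_size=4, num_processes=2, process_index=0)
shard1 = ShardSampler(dataset, batch_size=4, num_processes=2, process_index=1)
print(list(shard0))  # [0, 1, 2, 3, 8, 9, 10, 11]
print(list(shard1))  # [4, 5, 6, 7, 12, 13, 14, 15]
```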
class IterableDatasetShard(IterableDataset):
"""
Wraps a PyTorch `IterableDataset` to generate samples for one of the processes only. Instances of this class will
always yield a number of samples that is a round multiple of the actual batch size (which is `batch_size x
num_processes`). Depending on the value of the `drop_last` attribute, it will either stop the iteration at the
first batch that would be too small or loop with indices from the beginning.
On two processes with an iterable dataset yielding of `[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]` with a batch size of
2:
- the shard on process 0 will yield `[0, 1, 4, 5, 8, 9]` so will see batches `[0, 1]`, `[4, 5]`, `[8, 9]`
- the shard on process 1 will yield `[2, 3, 6, 7, 10, 11]` so will see batches `[2, 3]`, `[6, 7]`, `[10, 11]`
"""
def __init__(
self,
dataset: IterableDataset,
batch_size: int = 1,
drop_last: bool = False,
num_processes: int = 1,
process_index: int = 0,
seed: int = 0,
):
self.dataset = dataset
self.batch_size = batch_size
self.drop_last = drop_last
self.num_processes = num_processes
self.process_index = process_index
self.seed = seed
self.epoch = 0
self.num_examples = 0
def set_epoch(self, epoch):
self.epoch = epoch
if hasattr(self.dataset, "set_epoch"):
self.dataset.set_epoch(epoch)
def __iter__(self):
self.num_examples = 0
if (
not hasattr(self.dataset, "set_epoch")
and hasattr(self.dataset, "generator")
and isinstance(self.dataset.generator, torch.Generator)
):
self.dataset.generator.manual_seed(self.seed + self.epoch)
real_batch_size = self.batch_size * self.num_processes
process_slice = range(self.process_index * self.batch_size, (self.process_index + 1) * self.batch_size)
first_batch = None
current_batch = []
for element in self.dataset:
self.num_examples += 1
current_batch.append(element)
if len(current_batch) == real_batch_size:
for i in process_slice:
yield current_batch[i]
if first_batch is None:
first_batch = current_batch.copy()
current_batch = []
if not self.drop_last and len(current_batch) > 0:
if first_batch is None:
first_batch = current_batch.copy()
while len(current_batch) < real_batch_size:
current_batch += first_batch
for i in process_slice:
yield current_batch[i]
def __len__(self):
if self.drop_last:
return (len(self.dataset) // (self.batch_size * self.num_processes)) * self.batch_size
else:
return math.ceil(len(self.dataset) / (self.batch_size * self.num_processes)) * self.batch_size
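A sketch of `IterableDatasetShard` matching the class docstring, using a toy iterable dataset defined only for this illustration.
```
from torch.utils.data import IterableDataset
from transformers.trainer_pt_utils import IterableDatasetShard

class RangeDataset(IterableDataset):
    """Toy dataset yielding 0..11, used only for this example."""
    def __iter__(self):
        return iter(range(12))

shard0 = IterableDatasetShard(RangeDataset(), batch_size=2, num_processes=2, process_index=0)
shard1 = IterableDatasetShard(RangeDataset(), batch_size=2, num_processes=2, process_index=1)
print(list(shard0))  # [0, 1, 4, 5, 8, 9]
print(list(shard1))  # [2, 3, 6, 7, 10, 11]
```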
def _get_learning_rate(self):
if self.is_deepspeed_enabled:
try:
last_lr = self.lr_scheduler.get_last_lr()[0]
except AssertionError as e:
if "need to call step" in str(e):
logger.warning("tried to get lr value before scheduler/optimizer started stepping, returning lr=0")
last_lr = 0
else:
raise
else:
if isinstance(self.lr_scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
last_lr = self.optimizer.param_groups[0]["lr"]
else:
last_lr = self.lr_scheduler.get_last_lr()[0]
if torch.is_tensor(last_lr):
last_lr = last_lr.item()
return last_lr
def _secs2timedelta(secs):
msec = int(abs(secs - int(secs)) * 100)
return f"{datetime.timedelta(seconds=int(secs))}.{msec:02d}"
def metrics_format(self, metrics: Dict[str, float]) -> Dict[str, float]:
"""
Reformat Trainer metrics values to a human-readable format.
Args:
metrics (`Dict[str, float]`):
The metrics returned from train/evaluate/predict
Returns:
metrics (`Dict[str, float]`): The reformatted metrics
"""
metrics_copy = metrics.copy()
for k, v in metrics_copy.items():
if "_mem_" in k:
metrics_copy[k] = f"{ v >> 20 }MB"
elif "_runtime" in k:
metrics_copy[k] = _secs2timedelta(v)
elif k == "total_flos":
metrics_copy[k] = f"{ int(v) >> 30 }GF"
elif isinstance(metrics_copy[k], float):
metrics_copy[k] = round(v, 4)
return metrics_copy
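`metrics_format` is written to be bound to a `Trainer` instance (the `self` argument is unused), so this sketch passes `None` for it; the metric values are made up.
```
from transformers.trainer_pt_utils import metrics_format

metrics = {
    "train_runtime": 123.4,
    "train_mem_gpu_alloc_delta": 693 << 20,   # bytes
    "total_flos": 3 << 30,
    "eval_loss": 0.123456,
}
print(metrics_format(None, metrics))
# {'train_runtime': '0:02:03.40', 'train_mem_gpu_alloc_delta': '693MB', 'total_flos': '3GF', 'eval_loss': 0.1235}
```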
def log_metrics(self, split, metrics):
"""
Log metrics in a specially formatted way.
Under distributed environment this is done only for a process with rank 0.
Args:
split (`str`):
Mode/split name: one of `train`, `eval`, `test`
metrics (`Dict[str, float]`):
The metrics returned from train/evaluate/predict
Notes on memory reports:
In order to get memory usage report you need to install `psutil`. You can do that with `pip install psutil`.
When this method is run, you will see a report that includes:
```
init_mem_cpu_alloc_delta = 1301MB
init_mem_cpu_peaked_delta = 154MB
init_mem_gpu_alloc_delta = 230MB
init_mem_gpu_peaked_delta = 0MB
train_mem_cpu_alloc_delta = 1345MB
train_mem_cpu_peaked_delta = 0MB
train_mem_gpu_alloc_delta = 693MB
train_mem_gpu_peaked_delta = 7MB
```
The metrics are printed with keys and values aligned to the widest entry and sorted by key, under a
`***** {split} metrics *****` header.
"""
if not self.is_world_process_zero():
return
print(f"***** {split} metrics *****")
metrics_formatted = self.metrics_format(metrics)
k_width = max(len(str(x)) for x in metrics_formatted.keys())
v_width = max(len(str(x)) for x in metrics_formatted.values())
for key in sorted(metrics_formatted.keys()):
print(f" {key: <{k_width}} = {metrics_formatted[key]:>{v_width}}")
def save_metrics(self, split, metrics, combined=True):
"""
Save metrics into a json file for that split, e.g. `train_results.json`.
Under distributed environment this is done only for a process with rank 0.
Args:
split (`str`):
Mode/split name: one of `train`, `eval`, `test`, `all`
metrics (`Dict[str, float]`):
The metrics returned from train/evaluate/predict
combined (`bool`, *optional*, defaults to `True`):
Creates combined metrics by updating `all_results.json` with metrics of this call
To understand the metrics please read the docstring of [`~Trainer.log_metrics`]. The only difference is that raw
unformatted numbers are saved in the current method.
"""
if not self.is_world_process_zero():
return
path = os.path.join(self.args.output_dir, f"{split}_results.json")
with open(path, "w") as f:
json.dump(metrics, f, indent=4, sort_keys=True)
if combined:
path = os.path.join(self.args.output_dir, "all_results.json")
if os.path.exists(path):
with open(path, "r") as f:
all_metrics = json.load(f)
else:
all_metrics = {}
all_metrics.update(metrics)
with open(path, "w") as f:
json.dump(all_metrics, f, indent=4, sort_keys=True)
def save_state(self):
"""
Saves the Trainer state, since Trainer.save_model saves only the tokenizer with the model
Under distributed environment this is done only for a process with rank 0.
"""
if not self.is_world_process_zero():
return
path = os.path.join(self.args.output_dir, "trainer_state.json")
self.state.save_to_json(path)
def get_model_param_count(model, trainable_only=False):
"""
Calculate model's total param count. If trainable_only is True then count only those requiring grads
"""
if is_deepspeed_zero3_enabled():
def numel(p):
return p.ds_numel if hasattr(p, "ds_numel") else p.numel()
else:
def numel(p):
return p.numel()
return sum(numel(p) for p in model.parameters() if not trainable_only or p.requires_grad)
def get_parameter_names(model, forbidden_layer_types):
"""
Returns the names of the model parameters that are not inside a forbidden layer.
"""
result = []
for name, child in model.named_children():
result += [
f"{name}.{n}"
for n in get_parameter_names(child, forbidden_layer_types)
if not isinstance(child, tuple(forbidden_layer_types))
]
result += list(model._parameters.keys())
return result
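This is how the Trainer typically uses `get_parameter_names` to build the no-weight-decay split; the toy model here is only for illustration.
```
import torch.nn as nn
from transformers.trainer_pt_utils import get_parameter_names

model = nn.Sequential(nn.Linear(4, 4), nn.LayerNorm(4))
decay_parameters = get_parameter_names(model, [nn.LayerNorm])
decay_parameters = [name for name in decay_parameters if "bias" not in name]
print(decay_parameters)  # ['0.weight'] -- LayerNorm parameters and biases are excluded from weight decay
```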
def get_module_class_from_name(module, name):
"""
Gets a class from a module by its name.
Args:
module (`torch.nn.Module`): The module to get the class from.
name (`str`): The name of the class.
"""
获取给定模块中特定类名的类对象。
参数:
module: 给定的模块对象
name: 要查找的类名
返回值:
如果找到与给定名称匹配的类对象,则返回该类对象;如果找不到,返回 None。
"""
modules_children = list(module.children()) # 获取模块的所有子模块列表
if module.__class__.__name__ == name: # 如果当前模块的类名与目标名称相同
return module.__class__ # 返回当前模块的类对象
elif len(modules_children) == 0: # 如果当前模块没有子模块
return # 返回空,表示未找到目标类
else:
for child_module in modules_children: # 遍历所有子模块
module_class = get_module_class_from_name(child_module, name) # 递归调用获取目标类对象
if module_class is not None: # 如果找到了目标类对象
return module_class # 返回目标类对象
# If the current process is the main process, delete the given files under the output directory.
def remove_dummy_checkpoint(is_main_process, output_dir, filenames):
if is_main_process:
for filename in filenames:
file = os.path.join(output_dir, filename)
# Delete the file if it exists.
if os.path.isfile(file):
os.remove(file)
# Check whether SageMaker model parallelism is enabled
if is_sagemaker_mp_enabled():
# Import the SageMaker model-parallel Torch extension
import smdistributed.modelparallel.torch as smp
# Forward + backward step wrapped with SMP's step decorator
@smp.step()
def smp_forward_backward(model, inputs, gradient_accumulation_steps=1):
# Run the forward pass
outputs = model(**inputs)
# Extract the loss: the "loss" key for dict outputs, the first element otherwise
loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0]
# Average the loss over the gradient accumulation steps
loss /= gradient_accumulation_steps
# Run the backward pass
model.backward(loss)
return loss
# Forward-only step wrapped with SMP's step decorator
@smp.step()
def smp_forward_only(model, inputs):
return model(**inputs)
# Gather function under SMP: recursively gathers tensors in nested lists/tuples/dicts
def smp_gather(tensor):
if isinstance(tensor, (list, tuple)):
return type(tensor)(smp_gather(t) for t in tensor)
elif isinstance(tensor, dict):
return type(tensor)({k: smp_gather(v) for k, v in tensor.items()})
# Raise a TypeError for anything that is not a tensor
elif not isinstance(tensor, torch.Tensor):
raise TypeError(
f"Can't gather the values of type {type(tensor)}, only of nested list/tuple/dicts of tensors."
)
# Use SMP's allgather to collect the tensor across the DP_GROUP
all_tensors = smp.allgather(tensor, smp.CommGroup.DP_GROUP)
# Make every tensor at least 1d and concatenate them
all_tensors = [atleast_1d(t) for t in all_tensors]
return torch.cat([t.cpu() for t in all_tensors], dim=0)
# Nested concatenation under SMP: recursively concatenates tensors in nested lists/tuples/dicts
def smp_nested_concat(tensor):
if isinstance(tensor, (list, tuple)):
return type(tensor)(smp_nested_concat(t) for t in tensor)
elif isinstance(tensor, dict):
return type(tensor)({k: smp_nested_concat(v) for k, v in tensor.items()})
# Otherwise this is a StepOutput: concatenate it, detach from the graph and move to CPU
# Note: `StepOutput` lives in `smp.step`, which shares its name with the decorator, so Python may get confused
return tensor.concat().detach().cpu()
# Accelerator configuration class, used to customize accelerator-related arguments
@dataclass
class AcceleratorConfig:
"""
A subset of arguments relating to the underlying [`accelerate.Accelerator`]
implementation utilized in the `Trainer` that can be customized.
Mostly relating to data.
"""
# Parameters:
# split_batches (`bool`, *optional*, defaults to `False`):
# Whether or not the accelerator should split the batches yielded by the dataloaders across the devices. If
# `True` the actual batch size used will be the same on any kind of distributed processes, but it must be a
# round multiple of the `num_processes` you are using. If `False`, actual batch size used will be the one set
# in your script multiplied by the number of processes.
# dispatch_batches (`bool`, *optional*):
# If set to `True`, the dataloader prepared by the Accelerator is only iterated through on the main process
# and then the batches are split and broadcast to each process. Will default to `True` for `DataLoader` whose
# underlying dataset is an `IterableDataset`, `False` otherwise.
# even_batches (`bool`, *optional*, defaults to `True`):
# If set to `True`, in cases where the total batch size across all processes does not exactly divide the
# dataset, samples at the start of the dataset will be duplicated so the batch can be divided equally among
# all workers.
# use_seedable_sampler (`bool`, *optional*, defaults to `True`):
# Whether or not use a fully seedable random sampler ([`accelerate.data_loader.SeedableRandomSampler`]). Ensures
# training results are fully reproducable using a different sampling technique. While seed-to-seed results
# may differ, on average the differences are neglible when using multiple different seeds to compare. Should
# also be ran with [`~utils.set_seed`] for the best results.
even_batches: bool = field(
default=True,
metadata={
"help": "If set to `True`, in cases where the total batch size across all processes does not exactly divide the"
" dataset, samples at the start of the dataset will be duplicated so the batch can be divided equally among"
" all workers."
},
)
use_seedable_sampler: bool = field(
default=True,
metadata={
"help": "Whether or not use a fully seedable random sampler ([`accelerate.data_loader.SeedableRandomSampler`])."
"Ensures training results are fully reproducable using a different sampling technique. "
"While seed-to-seed results may differ, on average the differences are neglible when using"
"multiple different seeds to compare. Should also be ran with [`~utils.set_seed`] for the best results."
},
)
@classmethod
def from_json_file(cls, json_file):
# Check whether the file exists and pick the appropriate opener
open_file = io.open if os.path.exists(json_file) else open
with open_file(json_file, "r", encoding="utf-8") as f:
# Load the JSON file content into a dict
config_dict = json.load(f)
# Check for unknown keys in the dict (sensible defaults are used for missing ones)
extra_keys = sorted(key for key in config_dict.keys() if key not in cls.__dataclass_fields__.keys())
if len(extra_keys) > 0:
# Raise a ValueError if the config file contains unknown keys
raise ValueError(
f"The config file at {json_file} had unknown keys ({extra_keys}), please try upgrading your `transformers`"
" version or fix (and potentially remove these keys) from your config file."
)
# Build an instance of this class from the loaded config dict
return cls(**config_dict)
# Convert the object into a dict
def to_dict(self):
# Return a deep copy of the instance's attribute dict
return copy.deepcopy(self.__dict__)
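A sketch of the JSON round-trip that `from_json_file` supports; the file name and key values are arbitrary, and only keys that are fields of the dataclass are accepted.
```
import json
from transformers.trainer_pt_utils import AcceleratorConfig

with open("accelerator_config.json", "w") as f:
    json.dump({"even_batches": False, "use_seedable_sampler": True}, f)

config = AcceleratorConfig.from_json_file("accelerator_config.json")
print(config.even_batches)   # False
print(config.to_dict())      # every field of the dataclass; unknown keys would have raised a ValueError
```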
# A custom dummy optimizer class, LayerWiseDummyOptimizer, inheriting from torch.optim.Optimizer
class LayerWiseDummyOptimizer(torch.optim.Optimizer):
"""
For layer-wise optimizers such as the GaLoRE optimizer, the optimization step is already done through the
post-gradient hooks. The trick is therefore to create a dummy optimizer that is a no-op during training.
Initial idea from @hiyouga in LLaMA-Factory:
https://github.com/hiyouga/LLaMA-Factory/commit/8664262cde3919e10eaecbd66e8c5d356856362e
"""
# Constructor accepting an optimizer_dict plus arbitrary args and kwargs
def __init__(self, optimizer_dict=None, *args, **kwargs):
# Create a dummy tensor
dummy_tensor = torch.randn(1, 1)
self.optimizer_dict = optimizer_dict
# Call the parent constructor with a list containing the dummy tensor and a learning-rate dict
super().__init__([dummy_tensor], {"lr": 1e-03})
# zero_grad is a no-op
def zero_grad(self, set_to_none: bool = True) -> None:
pass
# step is a no-op that returns nothing
def step(self, closure=None) -> Optional[float]:
pass
# A custom dummy scheduler class, LayerWiseDummyScheduler, inheriting from LRScheduler
class LayerWiseDummyScheduler(LRScheduler):
"""
For layer-wise optimizers such as the GaLoRE optimizer, the optimization and scheduling steps are already done
through the post-gradient hooks. The trick is therefore to create a dummy scheduler that is a no-op during training.
"""
# Constructor accepting arbitrary args and kwargs
def __init__(self, *args, **kwargs):
# Use a LayerWiseDummyOptimizer instance as the optimizer
optimizer = LayerWiseDummyOptimizer()
last_epoch = -1
verbose = False
# Call the parent LRScheduler constructor with the dummy optimizer, last epoch and verbosity
super().__init__(optimizer, last_epoch, verbose)
# get_lr returns the learning rate of each of the optimizer's param groups
def get_lr(self):
return [group["lr"] for group in self.optimizer.param_groups]
# _get_closed_form_lr returns the list of base learning rates
def _get_closed_form_lr(self):
return self.base_lrs
.\trainer_seq2seq.py
import warnings
from copy import deepcopy
from pathlib import Path
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
import torch
from torch import nn
from torch.utils.data import Dataset
from .generation.configuration_utils import GenerationConfig
from .integrations.deepspeed import is_deepspeed_zero3_enabled
from .trainer import Trainer
from .utils import logging
if TYPE_CHECKING:
from .data.data_collator import DataCollator
from .modeling_utils import PreTrainedModel
from .tokenization_utils_base import PreTrainedTokenizerBase
from .trainer_callback import TrainerCallback
from .trainer_utils import EvalPrediction, PredictionOutput
from .training_args import TrainingArguments
logger = logging.get_logger(__name__)
class Seq2SeqTrainer(Trainer):
def __init__(
self,
model: Union["PreTrainedModel", nn.Module] = None,
args: "TrainingArguments" = None,
data_collator: Optional["DataCollator"] = None,
train_dataset: Optional[Dataset] = None,
eval_dataset: Optional[Union[Dataset, Dict[str, Dataset]]] = None,
tokenizer: Optional["PreTrainedTokenizerBase"] = None,
model_init: Optional[Callable[[], "PreTrainedModel"]] = None,
compute_metrics: Optional[Callable[["EvalPrediction"], Dict]] = None,
callbacks: Optional[List["TrainerCallback"]] = None,
optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None),
preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None,
):
super().__init__(
model=model,
args=args,
data_collator=data_collator,
train_dataset=train_dataset,
eval_dataset=eval_dataset,
tokenizer=tokenizer,
model_init=model_init,
compute_metrics=compute_metrics,
callbacks=callbacks,
optimizers=optimizers,
preprocess_logits_for_metrics=preprocess_logits_for_metrics,
)
if self.args.generation_config is not None:
gen_config = self.load_generation_config(self.args.generation_config)
self.model.generation_config = gen_config
@staticmethod
def load_generation_config(gen_config_arg: Union[str, GenerationConfig]) -> GenerationConfig:
"""
Loads a `~generation.GenerationConfig` from the `Seq2SeqTrainingArguments.generation_config` arguments.
Args:
gen_config_arg (`str` or [`~generation.GenerationConfig`]):
`Seq2SeqTrainingArguments.generation_config` argument.
Returns:
A `~generation.GenerationConfig`.
"""
if isinstance(gen_config_arg, GenerationConfig):
gen_config = deepcopy(gen_config_arg)
else:
pretrained_model_name = Path(gen_config_arg) if isinstance(gen_config_arg, str) else gen_config_arg
config_file_name = None
if pretrained_model_name.is_file():
config_file_name = pretrained_model_name.name
pretrained_model_name = pretrained_model_name.parent
elif pretrained_model_name.is_dir():
pass
else:
pretrained_model_name = gen_config_arg
gen_config = GenerationConfig.from_pretrained(pretrained_model_name, config_file_name)
try:
with warnings.catch_warnings(record=True) as caught_warnings:
gen_config.validate()
if len(caught_warnings) > 0:
raise ValueError(str([w.message for w in caught_warnings]))
except ValueError as exc:
raise ValueError(
"The loaded generation config instance is invalid -- `GenerationConfig.validate()` throws warnings "
"and/or exceptions. Fix these issues to train your model.\n\nThrown during validation:\n" + str(exc)
)
return gen_config
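A small usage sketch (paths and values here are illustrative, not from the source): the `generation_config` training argument may be a `GenerationConfig` object, a local json file, a directory, or a Hub repo id, and this helper resolves all of them the same way before validating the result.
```
from transformers import GenerationConfig

# From an in-memory object ...
gen_cfg = Seq2SeqTrainer.load_generation_config(GenerationConfig(max_new_tokens=128, num_beams=4))
# ... or from anything `GenerationConfig.from_pretrained` understands, e.g. a local json file
gen_cfg = Seq2SeqTrainer.load_generation_config("my_model_dir/generation_config.json")
```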
def evaluate(
self,
eval_dataset: Optional[Dataset] = None,
ignore_keys: Optional[List[str]] = None,
metric_key_prefix: str = "eval",
**gen_kwargs,
) -> Dict[str, float]:
"""
Run evaluation and returns metrics.
The calling script will be responsible for providing a method to compute metrics, as they are task-dependent
(pass it to the init `compute_metrics` argument).
You can also subclass and override this method to inject custom behavior.
Args:
eval_dataset (`Dataset`, *optional*):
Pass a dataset if you wish to override `self.eval_dataset`. If it is an [`~datasets.Dataset`], columns
not accepted by the `model.forward()` method are automatically removed. It must implement the `__len__`
method.
ignore_keys (`List[str]`, *optional*):
A list of keys in the output of your model (if it is a dictionary) that should be ignored when
gathering predictions.
metric_key_prefix (`str`, *optional*, defaults to `"eval"`):
An optional prefix to be used as the metrics key prefix. For example the metrics "bleu" will be named
"eval_bleu" if the prefix is `"eval"` (default)
max_length (`int`, *optional*):
The maximum target length to use when predicting with the generate method.
num_beams (`int`, *optional*):
Number of beams for beam search that will be used when predicting with the generate method. 1 means no
beam search.
gen_kwargs:
Additional `generate` specific kwargs.
Returns:
A dictionary containing the evaluation loss and the potential metrics computed from the predictions. The
dictionary also contains the epoch number which comes from the training state.
"""
gen_kwargs = gen_kwargs.copy()
if (
gen_kwargs.get("max_length") is None
and gen_kwargs.get("max_new_tokens") is None
and self.args.generation_max_length is not None
):
gen_kwargs["max_length"] = self.args.generation_max_length
if gen_kwargs.get("num_beams") is None and self.args.generation_num_beams is not None:
gen_kwargs["num_beams"] = self.args.generation_num_beams
self.gather_function = self.accelerator.gather
self._gen_kwargs = gen_kwargs
return super().evaluate(eval_dataset, ignore_keys=ignore_keys, metric_key_prefix=metric_key_prefix)
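A hedged usage sketch (the trainer object and metric names are illustrative): extra `gen_kwargs` given to `evaluate` are stashed in `self._gen_kwargs` and forwarded to `generate` inside the prediction loop, while `generation_max_length` / `generation_num_beams` from the training arguments act only as fallbacks.
```
# `trainer` is an already built Seq2SeqTrainer with `predict_with_generate=True`
metrics = trainer.evaluate(max_new_tokens=64, num_beams=4)
# e.g. {"eval_loss": ..., "eval_bleu": ..., "eval_runtime": ..., "epoch": ...}
```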
) -> "PredictionOutput":
"""
Run prediction and returns predictions and potential metrics.
Depending on the dataset and your use case, your test dataset may contain labels. In that case, this method
will also return metrics, like in `evaluate()`.
Args:
test_dataset (`Dataset`):
Dataset to run the predictions on. If it is a [`~datasets.Dataset`], columns not accepted by the
`model.forward()` method are automatically removed. Has to implement the method `__len__`
ignore_keys (`List[str]`, *optional*):
A list of keys in the output of your model (if it is a dictionary) that should be ignored when
gathering predictions.
metric_key_prefix (`str`, *optional*, defaults to `"eval"`):
An optional prefix to be used as the metrics key prefix. For example the metrics "bleu" will be named
"eval_bleu" if the prefix is `"eval"` (default)
max_length (`int`, *optional*):
The maximum target length to use when predicting with the generate method.
num_beams (`int`, *optional*):
Number of beams for beam search that will be used when predicting with the generate method. 1 means no
beam search.
gen_kwargs:
Additional `generate` specific kwargs.
<Tip>
If your predictions or labels have different sequence lengths (for instance because you're doing dynamic
padding in a token classification task) the predictions will be padded (on the right) to allow for
concatenation into one array. The padding index is -100.
</Tip>
Returns: *NamedTuple* A namedtuple with the following keys:
- predictions (`np.ndarray`): The predictions on `test_dataset`.
- label_ids (`np.ndarray`, *optional*): The labels (if the dataset contained some).
- metrics (`Dict[str, float]`, *optional*): The potential dictionary of metrics (if the dataset contained
labels).
"""
gen_kwargs = gen_kwargs.copy()
if (
gen_kwargs.get("max_length") is None
and gen_kwargs.get("max_new_tokens") is None
and self.args.generation_max_length is not None
):
gen_kwargs["max_length"] = self.args.generation_max_length
if gen_kwargs.get("num_beams") is None and self.args.generation_num_beams is not None:
gen_kwargs["num_beams"] = self.args.generation_num_beams
self.gather_function = self.accelerator.gather
self._gen_kwargs = gen_kwargs
return super().predict(test_dataset, ignore_keys=ignore_keys, metric_key_prefix=metric_key_prefix)
def prediction_step(
self,
model: nn.Module,
inputs: Dict[str, Union[torch.Tensor, Any]],
prediction_loss_only: bool,
ignore_keys: Optional[List[str]] = None,
**gen_kwargs,
):
def _pad_tensors_to_max_len(self, tensor, max_length):
if self.tokenizer is not None and hasattr(self.tokenizer, "pad_token_id"):
pad_token_id = (
self.tokenizer.pad_token_id if self.tokenizer.pad_token_id is not None else self.tokenizer.eos_token_id
)
else:
if self.model.config.pad_token_id is not None:
pad_token_id = self.model.config.pad_token_id
else:
raise ValueError("Pad_token_id must be set in the configuration of the model, in order to pad tensors")
padded_tensor = pad_token_id * torch.ones(
(tensor.shape[0], max_length), dtype=tensor.dtype, device=tensor.device
)
padded_tensor[:, : tensor.shape[-1]] = tensor
return padded_tensor
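A tiny toy run of the padding logic above (values are made up): shorter generated sequences are right-padded with the pad token so predictions from different batches can be concatenated into one array.
```
import torch

tensor, max_length, pad_token_id = torch.tensor([[5, 6, 7]]), 6, 0
padded_tensor = pad_token_id * torch.ones((tensor.shape[0], max_length), dtype=tensor.dtype)
padded_tensor[:, : tensor.shape[-1]] = tensor
print(padded_tensor)  # tensor([[5, 6, 7, 0, 0, 0]])
```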
.\trainer_utils.py
"""
Utilities that are independent of PyTorch and provide support for the Trainer class.
"""
import copy
import functools
import gc
import inspect
import os
import random
import re
import threading
import time
from typing import Any, Dict, List, NamedTuple, Optional, Tuple, Union
import numpy as np
from .utils import (
ExplicitEnum,
is_psutil_available,
is_tf_available,
is_torch_available,
is_torch_cuda_available,
is_torch_mps_available,
is_torch_npu_available,
is_torch_xla_available,
is_torch_xpu_available,
requires_backends,
)
if is_torch_available():
import torch
def seed_worker(_: Any):
"""
Helper function to set worker seed during Dataloader initialization.
"""
worker_seed = torch.initial_seed() % 2**32
set_seed(worker_seed)
def enable_full_determinism(seed: int, warn_only: bool = False):
"""
Helper function for reproducible behavior during distributed training. See
- https://pytorch.org/docs/stable/notes/randomness.html for pytorch
- https://www.tensorflow.org/api_docs/python/tf/config/experimental/enable_op_determinism for tensorflow
"""
set_seed(seed)
if is_torch_available():
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":16:8"
torch.use_deterministic_algorithms(True, warn_only=warn_only)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
if is_tf_available():
import tensorflow as tf
tf.config.experimental.enable_op_determinism()
def set_seed(seed: int):
"""
Helper function for reproducible behavior to set the seed in `random`, `numpy`, `torch` and/or `tf` (if installed).
Args:
seed (`int`): The seed to set.
"""
random.seed(seed)
np.random.seed(seed)
if is_torch_available():
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
if is_torch_npu_available():
torch.npu.manual_seed_all(seed)
if is_torch_xpu_available():
torch.xpu.manual_seed_all(seed)
if is_tf_available():
import tensorflow as tf
tf.random.set_seed(seed)
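A short sketch of how these helpers are typically combined: `set_seed` (or `enable_full_determinism`) fixes the global RNGs, and `seed_worker` is passed as `worker_init_fn` so every DataLoader worker re-seeds python/numpy from the torch seed it was assigned.
```
import torch
from torch.utils.data import DataLoader, TensorDataset

set_seed(42)  # seeds python, numpy and torch (and tf if installed)
dataset = TensorDataset(torch.arange(10))
loader = DataLoader(dataset, batch_size=2, shuffle=True, num_workers=2, worker_init_fn=seed_worker)
```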
def neftune_post_forward_hook(module, input, output):
"""
Implements the NEFTune forward pass for the model using forward hooks. Note this works only for torch.nn.Embedding
layers. This method is slightly adapted from the original source code that can be found here:
https://github.com/neelsjain/NEFTune Simply add it to your model as follows:
```
model = ...
model.embed_tokens.neftune_noise_alpha = 0.1
model.embed_tokens.register_forward_hook(neftune_post_forward_hook)
```
Args:
module (`torch.nn.Module`):
The embedding module where the hook is attached. Note that you need to set `module.neftune_noise_alpha` to
the desired noise alpha value.
input (`torch.Tensor`):
The input tensor to the model.
output (`torch.Tensor`):
The output tensor of the model (i.e. the embeddings).
"""
if module.training:
dims = torch.tensor(output.size(1) * output.size(2))
mag_norm = module.neftune_noise_alpha / torch.sqrt(dims)
output = output + torch.zeros_like(output).uniform_(-mag_norm, mag_norm)
return output
class EvalPrediction:
"""
Evaluation output (always contains labels), to be used to compute metrics.
Parameters:
predictions (`np.ndarray`): Predictions of the model.
label_ids (`np.ndarray`): Targets to be matched.
inputs (`np.ndarray`, *optional*):
"""
def __init__(
self,
predictions: Union[np.ndarray, Tuple[np.ndarray]],
label_ids: Union[np.ndarray, Tuple[np.ndarray]],
inputs: Optional[Union[np.ndarray, Tuple[np.ndarray]]] = None,
):
self.predictions = predictions
self.label_ids = label_ids
self.inputs = inputs
def __iter__(self):
if self.inputs is not None:
return iter((self.predictions, self.label_ids, self.inputs))
else:
return iter((self.predictions, self.label_ids))
def __getitem__(self, idx):
if idx < 0 or idx > 2:
raise IndexError("tuple index out of range")
if idx == 2 and self.inputs is None:
raise IndexError("tuple index out of range")
if idx == 0:
return self.predictions
elif idx == 1:
return self.label_ids
elif idx == 2:
return self.inputs
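Because `__iter__` and `__getitem__` make `EvalPrediction` behave like a tuple, a `compute_metrics` function can simply unpack it; a minimal sketch with made-up numbers:
```
import numpy as np

def compute_metrics(eval_pred: EvalPrediction):
    predictions, label_ids = eval_pred  # unpacked via __iter__ (inputs would be a third element if present)
    preds = np.argmax(predictions, axis=-1)
    return {"accuracy": float((preds == label_ids).mean())}

compute_metrics(EvalPrediction(predictions=np.array([[0.1, 0.9], [0.8, 0.2]]), label_ids=np.array([1, 0])))
# {'accuracy': 1.0}
```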
class EvalLoopOutput(NamedTuple):
"""
NamedTuple for evaluation loop output, containing predictions, label_ids, metrics, and num_samples.
Attributes:
predictions (Union[np.ndarray, Tuple[np.ndarray]]): Predictions from the model.
label_ids (Optional[Union[np.ndarray, Tuple[np.ndarray]]]): Target labels.
metrics (Optional[Dict[str, float]]): Metrics computed during evaluation.
num_samples (Optional[int]): Number of samples evaluated.
"""
predictions: Union[np.ndarray, Tuple[np.ndarray]]
label_ids: Optional[Union[np.ndarray, Tuple[np.ndarray]]]
metrics: Optional[Dict[str, float]]
num_samples: Optional[int]
class PredictionOutput(NamedTuple):
"""
NamedTuple for prediction output, containing predictions, label_ids, and metrics.
Attributes:
predictions (Union[np.ndarray, Tuple[np.ndarray]]): Predictions from the model.
label_ids (Optional[Union[np.ndarray, Tuple[np.ndarray]]]): Target labels.
metrics (Optional[Dict[str, float]]): Metrics computed during prediction.
"""
predictions: Union[np.ndarray, Tuple[np.ndarray]]
label_ids: Optional[Union[np.ndarray, Tuple[np.ndarray]]]
metrics: Optional[Dict[str, float]]
class TrainOutput(NamedTuple):
"""
NamedTuple for training output, containing global_step, training_loss, and metrics.
Attributes:
global_step (int): Current global step of training.
training_loss (float): Loss computed during training.
metrics (Dict[str, float]): Metrics computed during training.
"""
global_step: int
training_loss: float
metrics: Dict[str, float]
PREFIX_CHECKPOINT_DIR = "checkpoint"
_re_checkpoint = re.compile(r"^" + PREFIX_CHECKPOINT_DIR + r"\-(\d+)$")
def get_last_checkpoint(folder):
content = os.listdir(folder)
checkpoints = [
path
for path in content
if _re_checkpoint.search(path) is not None and os.path.isdir(os.path.join(folder, path))
]
if len(checkpoints) == 0:
return
return os.path.join(folder, max(checkpoints, key=lambda x: int(_re_checkpoint.search(x).groups()[0])))
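For example, with an output folder that contains `checkpoint-500` and `checkpoint-1500` sub-folders (names illustrative), the helper returns the numerically largest one:
```
last_ckpt = get_last_checkpoint("output_dir")
# e.g. "output_dir/checkpoint-1500", or None if no checkpoint-* sub-folder exists
```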
class IntervalStrategy(ExplicitEnum):
NO = "no"
STEPS = "steps"
EPOCH = "epoch"
class EvaluationStrategy(ExplicitEnum):
NO = "no"
STEPS = "steps"
EPOCH = "epoch"
class HubStrategy(ExplicitEnum):
END = "end"
EVERY_SAVE = "every_save"
CHECKPOINT = "checkpoint"
ALL_CHECKPOINTS = "all_checkpoints"
class BestRun(NamedTuple):
"""
Named tuple holding the best run found by a hyperparameter search.
Parameters:
run_id (`str`):
The id of the best run (if models were saved, the corresponding checkpoint will be in the folder ending with run-{run_id}).
objective (`float`):
The objective that was obtained for this run.
hyperparameters (`Dict[str, Any]`):
The hyperparameters used for this run.
run_summary (`Optional[Any]`):
A summary of the tuning experiment. For the Ray backend, this is a `ray.tune.ExperimentAnalysis` object.
"""
run_id: str
objective: Union[float, List[float]]
hyperparameters: Dict[str, Any]
run_summary: Optional[Any] = None
def default_compute_objective(metrics: Dict[str, float]) -> float:
"""
The default objective to maximize/minimize when doing a hyperparameter search. It is the evaluation loss if no
other metrics are provided, and the sum of all metrics otherwise.
Args:
metrics (`Dict[str, float]`): The metrics returned by the evaluate method.
Return:
`float`: The objective to minimize or maximize.
"""
metrics = copy.deepcopy(metrics)
loss = metrics.pop("eval_loss", None)
_ = metrics.pop("epoch", None)
speed_metrics = [
m
for m in metrics.keys()
if m.endswith("_runtime") or m.endswith("_per_second") or m.endswith("_compilation_time")
]
for sm in speed_metrics:
_ = metrics.pop(sm, None)
return loss if len(metrics) == 0 else sum(metrics.values())
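Two toy calls illustrating the rule: the loss, the epoch and the speed metrics are stripped first, then the loss is returned if nothing else remains, otherwise the sum of the remaining metrics:
```
default_compute_objective({"eval_loss": 0.7, "epoch": 3.0, "eval_runtime": 12.3})
# -> 0.7 (only the loss is left to optimize)
default_compute_objective({"eval_loss": 0.7, "eval_bleu": 28.4, "epoch": 3.0})
# -> 28.4 (sum of the remaining metrics, here just eval_bleu)
```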
def default_hp_space_optuna(trial) -> Dict[str, float]:
from .integrations import is_optuna_available
assert is_optuna_available(), "This function needs Optuna installed: `pip install optuna`"
return {
"learning_rate": trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True),
"num_train_epochs": trial.suggest_int("num_train_epochs", 1, 5),
"seed": trial.suggest_int("seed", 1, 40),
"per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [4, 8, 16, 32, 64]),
}
def default_hp_space_ray(trial) -> Dict[str, float]:
from .integrations import is_ray_tune_available
assert is_ray_tune_available(), "This function needs ray installed: `pip install ray[tune]`"
from ray import tune
return {
"learning_rate": tune.loguniform(1e-6, 1e-4),
"num_train_epochs": tune.choice(list(range(1, 6))),
"seed": tune.uniform(1, 40),
"per_device_train_batch_size": tune.choice([4, 8, 16, 32, 64]),
}
def default_hp_space_sigopt(trial):
return [
{"bounds": {"min": 1e-6, "max": 1e-4}, "name": "learning_rate", "type": "double", "transformamtion": "log"},
{"bounds": {"min": 1, "max": 6}, "name": "num_train_epochs", "type": "int"},
{"bounds": {"min": 1, "max": 40}, "name": "seed", "type": "int"},
{
"categorical_values": ["4", "8", "16", "32", "64"],
"name": "per_device_train_batch_size",
"type": "categorical",
},
]
def default_hp_space_wandb(trial) -> Dict[str, float]:
from .integrations import is_wandb_available
if not is_wandb_available():
raise ImportError("This function needs wandb installed: `pip install wandb`")
return {
"method": "random",
"metric": {"name": "objective", "goal": "minimize"},
"parameters": {
"learning_rate": {"distribution": "uniform", "min": 1e-6, "max": 1e-4},
"num_train_epochs": {"distribution": "int_uniform", "min": 1, "max": 6},
"seed": {"distribution": "int_uniform", "min": 1, "max": 40},
"per_device_train_batch_size": {"values": [4, 8, 16, 32, 64]},
},
}
class HPSearchBackend(ExplicitEnum):
OPTUNA = "optuna"
RAY = "ray"
SIGOPT = "sigopt"
WANDB = "wandb"
def is_main_process(local_rank):
"""
Whether or not the current process is the local process, based on `xm.get_ordinal()` (for TPUs) first, then on
`local_rank`.
"""
if is_torch_xla_available():
import torch_xla.core.xla_model as xm
return xm.get_ordinal() == 0
return local_rank in [-1, 0]
def total_processes_number(local_rank):
"""
Return the number of processes launched in parallel. Works with `torch.distributed` and TPUs.
"""
if is_torch_xla_available():
import torch_xla.core.xla_model as xm
return xm.xrt_world_size()
elif local_rank != -1 and is_torch_available():
import torch
return torch.distributed.get_world_size()
return 1
def speed_metrics(split, start_time, num_samples=None, num_steps=None, num_tokens=None):
"""
Measure and return speed performance metrics.
This function requires a time snapshot `start_time` before the operation to be measured starts and this function
should be run immediately after the operation to be measured has completed.
Args:
- split: name to prefix metric (like train, eval, test...)
- start_time: operation start time
"""
runtime = time.time() - start_time
result = {f"{split}_runtime": round(runtime, 4)}
if runtime == 0:
return result
if num_samples is not None:
samples_per_second = num_samples / runtime
result[f"{split}_samples_per_second"] = round(samples_per_second, 3)
if num_steps is not None:
steps_per_second = num_steps / runtime
result[f"{split}_steps_per_second"] = round(steps_per_second, 3)
if num_tokens is not None:
tokens_per_second = num_tokens / runtime
result[f"{split}_tokens_per_second"] = round(tokens_per_second, 3)
return result
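A usage sketch (numbers are illustrative): snapshot the time before the measured operation and call the helper right after it finishes:
```
import time

start_time = time.time()
# ... run evaluation ...
metrics = speed_metrics("eval", start_time, num_samples=1000, num_steps=125)
# e.g. {"eval_runtime": 8.2, "eval_samples_per_second": 121.951, "eval_steps_per_second": 15.244}
```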
class SchedulerType(ExplicitEnum):
LINEAR = "linear"
COSINE = "cosine"
COSINE_WITH_RESTARTS = "cosine_with_restarts"
POLYNOMIAL = "polynomial"
CONSTANT = "constant"
CONSTANT_WITH_WARMUP = "constant_with_warmup"
INVERSE_SQRT = "inverse_sqrt"
REDUCE_ON_PLATEAU = "reduce_lr_on_plateau"
class TrainerMemoryTracker:
"""
A helper class that tracks cpu and gpu memory.
This class will silently skip unless `psutil` is available. Install with `pip install psutil`.
When a stage completes, it can pass metrics dict to update with the memory metrics gathered during this stage.
Example :
```
self._memory_tracker = TrainerMemoryTracker(self.args.skip_memory_metrics)
self._memory_tracker.start()
# code ...
metrics = {"train_runtime": 10.5}
self._memory_tracker.stop_and_update_metrics(metrics)
```
At the moment GPU tracking is only for `pytorch`, but can be extended to support `tensorflow`.
To understand this class' intricacies please read the documentation of [`~Trainer.log_metrics`].
"""
stages = {
"__init__": "init",
"train": "train",
"_inner_training_loop": "train",
"evaluate": "eval",
"predict": "test",
}
def __init__(self, skip_memory_metrics=False):
self.skip_memory_metrics = skip_memory_metrics
if not is_psutil_available():
self.skip_memory_metrics = True
if self.skip_memory_metrics:
return
import psutil
if is_torch_cuda_available():
import torch
self.torch = torch
self.gpu = {}
elif is_torch_mps_available():
import torch
self.torch = torch
self.gpu = {}
elif is_torch_xpu_available():
import torch
self.torch = torch
self.gpu = {}
elif is_torch_npu_available():
import torch
self.torch = torch
self.gpu = {}
else:
self.torch = None
self.process = psutil.Process()
self.cur_stage = None
self.cpu = {}
self.init_reported = False
def derive_stage(self):
"""自动推断阶段/调用者名称"""
caller = inspect.currentframe().f_back.f_back.f_code.co_name
if caller in self.stages:
return self.stages[caller]
else:
raise ValueError(
f"was called from {caller}, but only expect to be called from one of {self.stages.keys()}"
)
def cpu_mem_used(self):
"""获取当前进程的驻留集大小内存"""
return self.process.memory_info().rss
def peak_monitor_func(self):
self.cpu_mem_used_peak = -1
while True:
self.cpu_mem_used_peak = max(self.cpu_mem_used(), self.cpu_mem_used_peak)
if not self.peak_monitoring:
break
def start(self):
"""开始跟踪调用者的阶段"""
if self.skip_memory_metrics:
return
stage = self.derive_stage()
if self.cur_stage is not None and self.cur_stage != stage:
return
self.cur_stage = stage
gc.collect()
if self.torch is not None:
if torch.cuda.is_available():
self.torch.cuda.reset_peak_memory_stats()
self.torch.cuda.empty_cache()
elif is_torch_xpu_available():
self.torch.xpu.reset_peak_memory_stats()
self.torch.xpu.empty_cache()
elif is_torch_npu_available():
self.torch.npu.reset_peak_memory_stats()
self.torch.npu.empty_cache()
elif is_torch_mps_available():
self.torch.mps.empty_cache()
if self.torch is not None:
if torch.cuda.is_available():
self.gpu_mem_used_at_start = self.torch.cuda.memory_allocated()
elif is_torch_xpu_available():
self.gpu_mem_used_at_start = self.torch.xpu.memory_allocated()
elif is_torch_npu_available():
self.gpu_mem_used_at_start = self.torch.npu.memory_allocated()
elif is_torch_mps_available():
self.gpu_mem_used_at_start = self.torch.mps.current_allocated_memory()
self.cpu_mem_used_at_start = self.cpu_mem_used()
self.peak_monitoring = True
peak_monitor_thread = threading.Thread(target=self.peak_monitor_func)
peak_monitor_thread.daemon = True
peak_monitor_thread.start()
def update_metrics(self, stage, metrics):
"""updates the metrics"""
if self.skip_memory_metrics:
return
if self.cur_stage is not None and self.cur_stage != stage:
return
stages = [stage]
if not self.init_reported:
stages.insert(0, "init")
self.init_reported = True
for stage in stages:
for t in ["alloc", "peaked"]:
if stage in self.cpu and t in self.cpu[stage]:
metrics[f"{stage}_mem_cpu_{t}_delta"] = self.cpu[stage][t]
if self.torch is not None and stage in self.gpu and t in self.gpu[stage]:
metrics[f"{stage}_mem_gpu_{t}_delta"] = self.gpu[stage][t]
if stages[0] == "init":
metrics["before_init_mem_cpu"] = self.cpu["init"]["begin"]
if self.torch is not None:
metrics["before_init_mem_gpu"] = self.gpu["init"]["begin"]
def stop_and_update_metrics(self, metrics=None):
"""combine stop and metrics update in one call for simpler code"""
if self.skip_memory_metrics:
return
stage = self.derive_stage()
self.stop(stage)
if metrics is not None:
self.update_metrics(stage, metrics)
def has_length(dataset):
try:
return len(dataset) is not None
except TypeError:
return False
def denumpify_detensorize(metrics):
if isinstance(metrics, (list, tuple)):
return type(metrics)(denumpify_detensorize(m) for m in metrics)
elif isinstance(metrics, dict):
return type(metrics)({k: denumpify_detensorize(v) for k, v in metrics.items()})
elif isinstance(metrics, np.generic):
return metrics.item()
elif is_torch_available() and isinstance(metrics, torch.Tensor) and metrics.numel() == 1:
return metrics.item()
return metrics
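A tiny illustration of `denumpify_detensorize`: numpy scalars (and 1-element torch tensors) are recursively converted back to plain Python numbers so the metrics dict can be JSON-serialized:
```
import numpy as np

denumpify_detensorize({"accuracy": np.float64(0.91), "counts": [np.int64(3), np.int64(5)]})
# -> {'accuracy': 0.91, 'counts': [3, 5]}
```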
def number_of_arguments(func):
if isinstance(func, functools.partial):
total_args = len(inspect.signature(func.func).parameters)
return total_args - len(func.args) - len(func.keywords)
return len(inspect.signature(func).parameters)
def find_executable_batch_size(
function: callable = None, starting_batch_size: int = 128, auto_find_batch_size: bool = False
):
"""
A basic decorator that will try to execute `function`. If it fails from exceptions related to out-of-memory or
CUDNN, the batch size is cut in half and passed to `function`. `function` must take in a `batch_size` parameter as
its first argument.
Args:
function (`callable`, *optional*)
A function to wrap
starting_batch_size (`int`, *optional*)
The batch size to try and fit into memory
auto_find_batch_size (`bool`, *optional*)
If False, will just execute `function`
"""
if function is None:
return functools.partial(
find_executable_batch_size,
starting_batch_size=starting_batch_size,
auto_find_batch_size=auto_find_batch_size,
)
if auto_find_batch_size:
requires_backends(find_executable_batch_size, "accelerate")
from accelerate.utils import find_executable_batch_size as accelerate_find_executable_batch_size
return accelerate_find_executable_batch_size(function=function, starting_batch_size=starting_batch_size)
return functools.partial(function, batch_size=starting_batch_size)
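A hedged sketch of how the decorator is meant to be used (in the Trainer it wraps `_inner_training_loop`; the function below is illustrative and assumes `accelerate` is installed): the wrapped function must take `batch_size` as its first argument, and with `auto_find_batch_size=True` the batch size is halved and the call retried on CUDA OOM.
```
@find_executable_batch_size(starting_batch_size=64, auto_find_batch_size=True)
def run_training(batch_size):
    # build dataloaders and run the training loop with this batch size;
    # on CUDA OOM the decorator retries with batch_size // 2
    print(f"trying batch_size={batch_size}")

run_training()
```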
class FSDPOption(ExplicitEnum):
FULL_SHARD = "full_shard"
SHARD_GRAD_OP = "shard_grad_op"
NO_SHARD = "no_shard"
HYBRID_SHARD = "hybrid_shard"
HYBRID_SHARD_ZERO2 = "hybrid_shard_zero2"
OFFLOAD = "offload"
AUTO_WRAP = "auto_wrap"
class RemoveColumnsCollator:
"""Wrap the data collator to remove unused columns before they are passed to the collator."""
def __init__(
self,
data_collator,
signature_columns,
logger=None,
model_name: Optional[str] = None,
description: Optional[str] = None,
):
self.data_collator = data_collator
self.signature_columns = signature_columns
self.logger = logger
self.description = description
self.model_name = model_name
self.message_logged = False
def _remove_columns(self, feature: dict) -> dict:
if not isinstance(feature, dict):
return feature
if not self.message_logged and self.logger and self.model_name:
ignored_columns = list(set(feature.keys()) - set(self.signature_columns))
if len(ignored_columns) > 0:
dset_description = "" if self.description is None else f"in the {self.description} set"
self.logger.info(
f"The following columns {dset_description} don't have a corresponding argument in "
f"`{self.model_name}.forward` and have been ignored: {', '.join(ignored_columns)}."
f" If {', '.join(ignored_columns)} are not expected by `{self.model_name}.forward`, "
" you can safely ignore this message."
)
self.message_logged = True
return {k: v for k, v in feature.items() if k in self.signature_columns}
def __call__(self, features: List[dict]):
features = [self._remove_columns(feature) for feature in features]
return self.data_collator(features)
def check_target_module_exists(optim_target_modules, key: str, return_is_regex: bool = False):
"""A helper method to check if the passed module's key name matches any of the target modules in the optim_target_modules.
Args:
optim_target_modules (`Union[str, List[str]]`):
A list of strings to try to match. Can be also a full string.
key (`str`):
A key to search any matches in optim_target_modules
return_is_regex (`bool`):
If set to `True`, the method will return whether the passed `optim_target_modules`
is a regex or not.
Returns:
`bool` : True or a match object if key matches any target modules from config, False or
None if no match found
`bool` : If the matched target module is a regex to silence out the warnings in Trainer
for extra modules being found (only if `target_module_found=True` for an array of regex).
"""
target_module_found = False
is_regex = False
if isinstance(optim_target_modules, str):
target_module_found = bool(re.fullmatch(optim_target_modules, key))
is_regex = True if not optim_target_modules == key else False
elif key in optim_target_modules:
target_module_found = True
elif any(target_key in key for target_key in optim_target_modules):
target_module_found = True
elif any(bool(re.fullmatch(optim_target_module, key)) for optim_target_module in optim_target_modules):
target_module_found = True
is_regex = True
if return_is_regex:
return target_module_found, is_regex
return target_module_found
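Two quick checks (module names are illustrative): with a list, plain substring containment is enough; with a single string, the key must fully match it as a regex and the second return value tells the Trainer that a regex was used:
```
check_target_module_exists(["attn", "mlp"], "model.layers.0.attn.q_proj")
# -> True (substring match against the list)
check_target_module_exists(r"model\.layers\.\d+\.attn\.q_proj", "model.layers.0.attn.q_proj", return_is_regex=True)
# -> (True, True)
```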
.\training_args.py
import contextlib
import io
import json
import math
import os
import warnings
from dataclasses import asdict, dataclass, field, fields
from datetime import timedelta
from enum import Enum
from pathlib import Path
from typing import Any, Dict, List, Optional, Union
from huggingface_hub import get_full_repo_name
from packaging import version
from .debug_utils import DebugOption
from .trainer_utils import (
EvaluationStrategy,
FSDPOption,
HubStrategy,
IntervalStrategy,
SchedulerType,
)
from .utils import (
ACCELERATE_MIN_VERSION,
ExplicitEnum,
cached_property,
is_accelerate_available,
is_safetensors_available,
is_sagemaker_dp_enabled,
is_sagemaker_mp_enabled,
is_torch_available,
is_torch_bf16_cpu_available,
is_torch_bf16_gpu_available,
is_torch_neuroncore_available,
is_torch_npu_available,
is_torch_tf32_available,
is_torch_xla_available,
is_torch_xpu_available,
logging,
requires_backends,
)
from .utils.generic import strtobool
from .utils.import_utils import is_optimum_neuron_available
logger = logging.get_logger(__name__)
log_levels = logging.get_log_levels_dict().copy()
trainer_log_levels = dict(**log_levels, passive=-1)
if is_torch_available():
import torch
import torch.distributed as dist
from .pytorch_utils import is_torch_greater_or_equal_than_2_0
if is_accelerate_available():
from accelerate.state import AcceleratorState, PartialState
from accelerate.utils import DistributedType
from .trainer_pt_utils import AcceleratorConfig
if is_torch_xla_available():
import torch_xla.core.xla_model as xm
if is_torch_neuroncore_available(check_device=False):
pass
if os.environ.get("TORCHELASTIC_RUN_ID"):
if is_optimum_neuron_available():
logger.info(
"Make sure that you are performing the training with the TrainiumTrainer from optimum[neuron], this "
"will fail otherwise."
)
else:
logger.warning(
"Please use the TrainiumTrainer from optimum[neuron] instead of the Transformers library to perform "
"training on AWS Trainium instances. More information here: "
"https://github.com/huggingface/optimum-neuron"
)
import torch_xla.distributed.xla_backend as xbn
if not isinstance(dist.group.WORLD, xbn.ProcessGroupXla):
dist.init_process_group(backend="xla")
if not isinstance(dist.group.WORLD, xbn.ProcessGroupXla):
raise AssertionError("Failed to initialize torch.distributed process group using XLA backend.")
if is_sagemaker_mp_enabled():
import smdistributed.modelparallel.torch as smp
smp.init()
def default_logdir() -> str:
"""
Same default as PyTorch
"""
import socket
from datetime import datetime
current_time = datetime.now().strftime("%b%d_%H-%M-%S")
return os.path.join("runs", current_time + "_" + socket.gethostname())
def get_int_from_env(env_keys, default):
"""Returns the first positive env value found in the `env_keys` list or the default."""
for e in env_keys:
val = int(os.environ.get(e, -1))
if val >= 0:
return val
return default
def get_xla_device_type(device: "torch.device") -> Optional[str]:
"""
Returns the xla device type (CPU|GPU|TPU) or None if the device is a non-xla device.
"""
if is_torch_xla_available():
if device.type == "cpu":
return "CPU"
return xm.xla_real_devices([device])[0].split(":")[0]
return None
class OptimizerNames(ExplicitEnum):
"""
Stores the acceptable string identifiers for optimizers.
"""
ADAMW_HF = "adamw_hf"
ADAMW_TORCH = "adamw_torch"
ADAMW_TORCH_FUSED = "adamw_torch_fused"
ADAMW_TORCH_XLA = "adamw_torch_xla"
ADAMW_TORCH_NPU_FUSED = "adamw_torch_npu_fused"
ADAMW_APEX_FUSED = "adamw_apex_fused"
ADAFACTOR = "adafactor"
ADAMW_ANYPRECISION = "adamw_anyprecision"
SGD = "sgd"
ADAGRAD = "adagrad"
ADAMW_BNB = "adamw_bnb_8bit"
ADAMW_8BIT = "adamw_8bit"
LION_8BIT = "lion_8bit"
LION = "lion_32bit"
PAGED_ADAMW = "paged_adamw_32bit"
PAGED_ADAMW_8BIT = "paged_adamw_8bit"
PAGED_LION = "paged_lion_32bit"
PAGED_LION_8BIT = "paged_lion_8bit"
RMSPROP = "rmsprop"
RMSPROP_BNB = "rmsprop_bnb"
RMSPROP_8BIT = "rmsprop_bnb_8bit"
RMSPROP_32BIT = "rmsprop_bnb_32bit"
GALORE_ADAMW = "galore_adamw"
GALORE_ADAMW_8BIT = "galore_adamw_8bit"
GALORE_ADAFACTOR = "galore_adafactor"
GALORE_ADAMW_LAYERWISE = "galore_adamw_layerwise"
GALORE_ADAMW_8BIT_LAYERWISE = "galore_adamw_8bit_layerwise"
GALORE_ADAFACTOR_LAYERWISE = "galore_adafactor_layerwise"
@dataclass
class TrainingArguments:
"""
TrainingArguments is the subset of the arguments we use in our example scripts **which relate to the training loop
itself**.
Using [`HfArgumentParser`] we can turn this class into
[argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the
command line.
"""
framework = "pt"
output_dir: str = field(
metadata={"help": "The output directory where the model predictions and checkpoints will be written."},
)
overwrite_output_dir: bool = field(
default=False,
metadata={
"help": (
"Overwrite the content of the output directory. "
"Use this to continue training if output_dir points to a checkpoint directory."
)
},
)
do_train: bool = field(default=False, metadata={"help": "Whether to run training."})
do_eval: bool = field(default=False, metadata={"help": "Whether to run eval on the dev set."})
do_predict: bool = field(default=False, metadata={"help": "Whether to run predictions on the test set."})
evaluation_strategy: Union[IntervalStrategy, str] = field(
default="no",
metadata={"help": "The evaluation strategy to use."},
)
prediction_loss_only: bool = field(
default=False,
metadata={"help": "When performing evaluation and predictions, only returns the loss."},
)
per_device_train_batch_size: int = field(
default=8, metadata={"help": "Batch size per GPU/TPU/MPS/NPU core/CPU for training."}
)
per_device_eval_batch_size: int = field(
default=8, metadata={"help": "Batch size per GPU/TPU/MPS/NPU core/CPU for evaluation."}
)
per_gpu_train_batch_size: Optional[int] = field(
default=None,
metadata={
"help": (
"Deprecated, the use of `--per_device_train_batch_size` is preferred. "
"Batch size per GPU/TPU core/CPU for training."
)
},
)
per_gpu_eval_batch_size: Optional[int] = field(
default=None,
metadata={
"help": (
"Deprecated, the use of `--per_device_eval_batch_size` is preferred. "
"Batch size per GPU/TPU core/CPU for evaluation."
)
},
)
gradient_accumulation_steps: int = field(
default=1,
metadata={"help": "Number of updates steps to accumulate before performing a backward/update pass."},
)
eval_accumulation_steps: Optional[int] = field(
default=None,
metadata={"help": "Number of predictions steps to accumulate before moving the tensors to the CPU."},
)
eval_delay: Optional[float] = field(
default=0,
metadata={
"help": (
"Number of epochs or steps to wait for before the first evaluation can be performed, depending on the"
" evaluation_strategy."
)
},
)
learning_rate: float = field(default=5e-5, metadata={"help": "The initial learning rate for AdamW."})
weight_decay: float = field(default=0.0, metadata={"help": "Weight decay for AdamW if we apply some."})
adam_beta1: float = field(default=0.9, metadata={"help": "Beta1 for AdamW optimizer"})
adam_beta2: float = field(default=0.999, metadata={"help": "Beta2 for AdamW optimizer"})
adam_epsilon: float = field(default=1e-8, metadata={"help": "Epsilon for AdamW optimizer."})
max_grad_norm: float = field(default=1.0, metadata={"help": "Max gradient norm."})
num_train_epochs: float = field(default=3.0, metadata={"help": "Total number of training epochs to perform."})
max_steps: int = field(
default=-1,
metadata={"help": "If > 0: set total number of training steps to perform. Override num_train_epochs."},
)
lr_scheduler_type: Union[SchedulerType, str] = field(
default="linear",
metadata={"help": "The scheduler type to use."},
)
lr_scheduler_kwargs: Optional[Dict] = field(
default_factory=dict,
metadata={
"help": (
"Extra parameters for the lr_scheduler such as {'num_cycles': 1} for the cosine with hard restarts"
)
},
)
warmup_ratio: float = field(
default=0.0, metadata={"help": "Linear warmup over warmup_ratio fraction of total steps."}
)
warmup_steps: int = field(default=0, metadata={"help": "Linear warmup over warmup_steps."})
log_level: Optional[str] = field(
default="passive",
metadata={
"help": (
"Logger log level to use on the main node. Possible choices are the log levels as strings: 'debug',"
" 'info', 'warning', 'error' and 'critical', plus a 'passive' level which doesn't set anything and"
" lets the application set the level. Defaults to 'passive'."
),
"choices": trainer_log_levels.keys(),
},
)
log_level_replica: Optional[str] = field(
default="warning",
metadata={
"help": "Logger log level to use on replica nodes. Same choices and defaults as ``log_level``",
"choices": trainer_log_levels.keys(),
},
)
log_on_each_node: bool = field(
default=True,
metadata={
"help": (
"When doing a multinode distributed training, whether to log once per node or just once on the main"
" node."
)
},
)
logging_dir: Optional[str] = field(default=None, metadata={"help": "Tensorboard log dir."})
logging_strategy: Union[IntervalStrategy, str] = field(
default="steps",
metadata={"help": "The logging strategy to use."},
)
logging_first_step: bool = field(default=False, metadata={"help": "Log the first global_step"})
logging_steps: float = field(
default=500,
metadata={
"help": (
"Log every X updates steps. Should be an integer or a float in range `[0,1)`. "
"If smaller than 1, will be interpreted as ratio of total training steps."
)
},
)
logging_nan_inf_filter: bool = field(default=True, metadata={"help": "Filter nan and inf losses for logging."})
save_strategy: Union[IntervalStrategy, str] = field(
default="steps",
metadata={"help": "The checkpoint save strategy to use."},
)
save_steps: float = field(
default=500,
metadata={
"help": (
"Save checkpoint every X updates steps. Should be an integer or a float in range `[0,1)`. "
"If smaller than 1, will be interpreted as ratio of total training steps."
)
},
)
save_total_limit: Optional[int] = field(
default=None,
metadata={
"help": (
"If a value is passed, will limit the total amount of checkpoints. Deletes the older checkpoints in"
" `output_dir`. When `load_best_model_at_end` is enabled, the 'best' checkpoint according to"
" `metric_for_best_model` will always be retained in addition to the most recent ones. For example,"
" for `save_total_limit=5` and `load_best_model_at_end=True`, the four last checkpoints will always be"
" retained alongside the best model. When `save_total_limit=1` and `load_best_model_at_end=True`,"
" it is possible that two checkpoints are saved: the last one and the best one (if they are different)."
" Default is unlimited checkpoints"
)
},
)
save_safetensors: Optional[bool] = field(
default=True,
metadata={
"help": "Use safetensors saving and loading for state dicts instead of default torch.load and torch.save."
},
)
save_on_each_node: bool = field(
default=False,
metadata={
"help": (
"When doing multi-node distributed training, whether to save models and checkpoints on each node, or"
" only on the main one"
)
},
)
save_only_model: bool = field(
default=False,
metadata={
"help": (
"When checkpointing, whether to only save the model, or also the optimizer, scheduler & rng state."
"Note that when this is true, you won't be able to resume training from checkpoint."
"This enables you to save storage by not storing the optimizer, scheduler & rng state."
"You can only load the model using from_pretrained with this option set to True."
)
},
)
no_cuda: bool = field(
default=False,
metadata={"help": "This argument is deprecated. It will be removed in version 5.0 of 🤗 Transformers."},
)
use_cpu: bool = field(
default=False,
metadata={
"help": " Whether or not to use cpu. If set to False, we will use cuda/tpu/mps/npu device if available."
},
)
use_mps_device: bool = field(
default=False,
metadata={
"help": "This argument is deprecated. `mps` device will be used if available similar to `cuda` device."
" It will be removed in version 5.0 of 🤗 Transformers"
},
)
seed: int = field(default=42, metadata={"help": "Random seed that will be set at the beginning of training."})
data_seed: Optional[int] = field(default=None, metadata={"help": "Random seed to be used with data samplers."})
jit_mode_eval: bool = field(
default=False, metadata={"help": "Whether or not to use PyTorch jit trace for inference"}
)
use_ipex: bool = field(
default=False,
metadata={
"help": (
"Use Intel extension for PyTorch when it is available, installation:"
" 'https://github.com/intel/intel-extension-for-pytorch'"
)
},
)
bf16: bool = field(
default=False,
metadata={
"help": (
"Whether to use bf16 (mixed) precision instead of 32-bit. Requires Ampere or higher NVIDIA"
" architecture or using CPU (use_cpu) or Ascend NPU. This is an experimental API and it may change."
)
},
)
fp16: bool = field(
default=False,
metadata={"help": "Whether to use fp16 (mixed) precision instead of 32-bit"},
)
fp16_opt_level: str = field(
default="O1",
metadata={
"help": (
"For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. "
"See details at https://nvidia.github.io/apex/amp.html"
)
},
)
half_precision_backend: str = field(
default="auto",
metadata={
"help": "The backend to be used for half precision.",
"choices": ["auto", "apex", "cpu_amp"],
},
)
bf16_full_eval: bool = field(
default=False,
metadata={
"help": (
"Whether to use full bfloat16 evaluation instead of 32-bit. This is an experimental API and it may"
" change."
)
},
)
fp16_full_eval: bool = field(
default=False,
metadata={"help": "Whether to use full float16 evaluation instead of 32-bit"},
)
tf32: Optional[bool] = field(
default=None,
metadata={
"help": (
"Whether to enable tf32 mode, available in Ampere and newer GPU architectures. This is an experimental"
" API and it may change."
)
},
)
local_rank: int = field(default=-1, metadata={"help": "For distributed training: local_rank"})
ddp_backend: Optional[str] = field(
default=None,
metadata={
"help": "The backend to be used for distributed training",
"choices": ["nccl", "gloo", "mpi", "ccl", "hccl"],
},
)
tpu_num_cores: Optional[int] = field(
default=None, metadata={"help": "TPU: Number of TPU cores (automatically passed by launcher script)"}
)
tpu_metrics_debug: bool = field(
default=False,
metadata={
"help": (
"已弃用,推荐使用 `--debug tpu_metrics_debug`。TPU:是否打印调试指标"
)
},
)
debug: Union[str, List[DebugOption]] = field(
default="",
metadata={
"help": (
"是否启用调试模式。当前选项:"
"`underflow_overflow`(检测激活和权重中的下溢和上溢),"
"`tpu_metrics_debug`(在TPU上打印调试指标)。"
)
},
)
dataloader_drop_last: bool = field(
default=False, metadata={"help": "如果不是批量大小的整数倍,丢弃最后不完整的批次。"}
)
eval_steps: Optional[float] = field(
default=None,
metadata={
"help": (
"每隔X步运行一次评估。应为整数或范围为`[0,1)`的浮点数。"
"如果小于1,将解释为总训练步数的比例。"
)
},
)
dataloader_num_workers: int = field(
default=0,
metadata={
"help": (
"用于数据加载的子进程数(仅适用于PyTorch)。"
"0表示数据将在主进程中加载。"
)
},
)
dataloader_prefetch_factor: Optional[int] = field(
default=None if not is_torch_available() or is_torch_greater_or_equal_than_2_0 else 2,
metadata={
"help": (
"每个工作进程预加载的批次数。"
"2表示每个工作进程预加载2 * num_workers批次。"
"对于PyTorch < 2.0.0,默认为2,否则为None。"
)
},
)
past_index: int = field(
default=-1,
metadata={"help": "如果 >= 0,则使用输出的相应部分作为下一步的过去状态。"},
)
run_name: Optional[str] = field(
default=None, metadata={"help": "运行的可选描述符。主要用于wandb日志记录。"}
)
disable_tqdm: Optional[bool] = field(
default=None, metadata={"help": "是否禁用tqdm进度条。"}
)
remove_unused_columns: Optional[bool] = field(
default=True, metadata={"help": "在使用nlp.Dataset时,移除模型不需要的列。"}
)
label_names: Optional[List[str]] = field(
default=None, metadata={"help": "输入字典中与标签对应的键列表。"}
)
load_best_model_at_end: Optional[bool] = field(
default=False,
metadata={
"help": (
"Whether or not to load the best model found during training at the end of training. When this option"
" is enabled, the best checkpoint will always be saved. See `save_total_limit` for more."
)
},
)
metric_for_best_model: Optional[str] = field(
default=None, metadata={"help": "The metric to use to compare two different models."}
)
greater_is_better: Optional[bool] = field(
default=None, metadata={"help": "Whether the `metric_for_best_model` should be maximized or not."}
)
ignore_data_skip: bool = field(
default=False,
metadata={
"help": (
"When resuming training, whether or not to skip the first epochs and batches to get to the same"
" training data."
)
},
)
fsdp: Optional[Union[List[FSDPOption], str]] = field(
default="",
metadata={
"help": (
"Whether or not to use PyTorch Fully Sharded Data Parallel (FSDP) training (in distributed training"
" only). The base option should be `full_shard`, `shard_grad_op` or `no_shard` and you can add"
" CPU-offload to `full_shard` or `shard_grad_op` like this: full_shard offload` or `shard_grad_op"
" offload`. You can add auto-wrap to `full_shard` or `shard_grad_op` with the same syntax: full_shard"
" auto_wrap` or `shard_grad_op auto_wrap`."
),
},
)
fsdp_min_num_params: int = field(
default=0,
metadata={
"help": (
"This parameter is deprecated. FSDP's minimum number of parameters for Default Auto Wrapping. (useful"
" only when `fsdp` field is passed)."
)
},
)
fsdp_config: Optional[Union[dict, str]] = field(
default=None,
metadata={
"help": (
"Config to be used with FSDP (Pytorch Fully Sharded Data Parallel). The value is either a "
"fsdp json config file (e.g., `fsdp_config.json`) or an already loaded json file as `dict`."
)
},
)
fsdp_transformer_layer_cls_to_wrap: Optional[str] = field(
default=None,
metadata={
"help": (
"This parameter is deprecated. Transformer layer class name (case-sensitive) to wrap, e.g,"
" `BertLayer`, `GPTJBlock`, `T5Block` .... (useful only when `fsdp` flag is passed)."
)
},
)
accelerator_config: Optional[str] = field(
default=None,
metadata={
"help": (
"Config to be used with the internal Accelerator object initializtion. The value is either a "
"accelerator json config file (e.g., `accelerator_config.json`) or an already loaded json file as `dict`."
)
},
)
deepspeed: Optional[str] = field(
default=None,
metadata={
"help": (
"Enable deepspeed and pass the path to deepspeed json config file (e.g. `ds_config.json`) or an already"
" loaded json file as a dict"
)
},
)
label_smoothing_factor: float = field(
default=0.0, metadata={"help": "The label smoothing epsilon to apply (zero means no label smoothing)."}
)
default_optim = "adamw_torch"
optim: Union[OptimizerNames, str] = field(
default=default_optim,
metadata={"help": "The optimizer to use."},
)
optim_args: Optional[str] = field(default=None, metadata={"help": "Optional arguments to supply to optimizer."})
adafactor: bool = field(default=False, metadata={"help": "Whether or not to replace AdamW by Adafactor."})
group_by_length: bool = field(
default=False,
metadata={"help": "Whether or not to group samples of roughly the same length together when batching."},
)
length_column_name: Optional[str] = field(
default="length",
metadata={"help": "Column name with precomputed lengths to use when grouping by length."},
)
report_to: Optional[List[str]] = field(
default=None, metadata={"help": "The list of integrations to report the results and logs to."}
)
ddp_find_unused_parameters: Optional[bool] = field(
default=None,
metadata={
"help": (
"When using distributed training, the value of the flag `find_unused_parameters` passed to "
"`DistributedDataParallel`."
)
},
)
ddp_bucket_cap_mb: Optional[int] = field(
default=None,
metadata={
"help": (
"When using distributed training, the value of the flag `bucket_cap_mb` passed to "
"`DistributedDataParallel`."
)
},
)
ddp_broadcast_buffers: Optional[bool] = field(
default=None,
metadata={
"help": (
"When using distributed training, the value of the flag `broadcast_buffers` passed to "
"`DistributedDataParallel`."
)
},
)
dataloader_pin_memory: bool = field(
default=True, metadata={"help": "Whether or not to pin memory for DataLoader."}
)
dataloader_persistent_workers: bool = field(
default=False,
metadata={
"help": "If True, the data loader will not shut down the worker processes after a dataset has been consumed once. This allows to maintain the workers Dataset instances alive. Can potentially speed up training, but will increase RAM usage."
},
)
skip_memory_metrics: bool = field(
default=True, metadata={"help": "Whether or not to skip adding of memory profiler reports to metrics."}
)
use_legacy_prediction_loop: bool = field(
default=False, metadata={"help": "Whether or not to use the legacy prediction_loop in the Trainer."}
)
push_to_hub: bool = field(
default=False, metadata={"help": "Whether or not to upload the trained model to the model hub after training."}
)
resume_from_checkpoint: Optional[str] = field(
default=None,
metadata={"help": "The path to a folder with a valid checkpoint for your model."},
)
hub_model_id: Optional[str] = field(
default=None, metadata={"help": "The name of the repository to keep in sync with the local `output_dir`."}
)
hub_strategy: Union[HubStrategy, str] = field(
default="every_save",
metadata={"help": "The hub strategy to use when `--push_to_hub` is activated."},
)
hub_token: Optional[str] = field(default=None, metadata={"help": "The token to use to push to the Model Hub."})
hub_private_repo: bool = field(default=False, metadata={"help": "Whether the model repository is private or not."})
hub_always_push: bool = field(
default=False,
metadata={"help": "Unless `True`, the Trainer will skip pushes if the previous one wasn't finished yet."},
)
gradient_checkpointing: bool = field(
default=False,
metadata={
"help": "If True, use gradient checkpointing to save memory at the expense of slower backward pass."
},
)
gradient_checkpointing_kwargs: Optional[dict] = field(
default=None,
metadata={
"help": "Gradient checkpointing key word arguments such as `use_reentrant`. Will be passed to `torch.utils.checkpoint.checkpoint` through `model.gradient_checkpointing_enable`."
},
)
include_inputs_for_metrics: bool = field(
default=False, metadata={"help": "Whether or not the inputs will be passed to the `compute_metrics` function."}
)
fp16_backend: str = field(
default="auto",
metadata={
"help": "Deprecated. Use half_precision_backend instead",
"choices": ["auto", "apex", "cpu_amp"],
},
)
push_to_hub_model_id: Optional[str] = field(
default=None, metadata={"help": "The name of the repository to which push the `Trainer`."}
)
push_to_hub_organization: Optional[str] = field(
default=None, metadata={"help": "The name of the organization in with to which push the `Trainer`."}
)
push_to_hub_token: Optional[str] = field(
default=None, metadata={"help": "The token to use to push to the Model Hub."}
)
_n_gpu: int = field(init=False, repr=False, default=-1)
mp_parameters: str = field(
default="",
metadata={"help": "Used by the SageMaker launcher to send mp-specific args. Ignored in Trainer"},
)
auto_find_batch_size: bool = field(
default=False,
metadata={
"help": (
"Whether to automatically decrease the batch size in half and rerun the training loop again each time"
" a CUDA Out-of-Memory was reached"
)
},
)
full_determinism: bool = field(
default=False,
metadata={
"help": (
"Whether to call enable_full_determinism instead of set_seed for reproducibility in distributed"
" training. Important: this will negatively impact the performance, so only use it for debugging."
)
},
)
torchdynamo: Optional[str] = field(
default=None,
metadata={
"help": "This argument is deprecated, use `--torch_compile_backend` instead.",
},
)
ray_scope: Optional[str] = field(
default="last",
metadata={
"help": (
'The scope to use when doing hyperparameter search with Ray. By default, `"last"` will be used. Ray'
" will then use the last checkpoint of all trials, compare those, and select the best one. However,"
" other options are also available. See the Ray documentation"
" (https://docs.ray.io/en/latest/tune/api_docs/analysis.html"
"#ray.tune.ExperimentAnalysis.get_best_trial)"
" for more options."
)
},
)
ddp_timeout: Optional[int] = field(
default=1800,
metadata={
"help": "Overrides the default timeout for distributed training (value should be given in seconds)."
},
)
torch_compile: bool = field(
default=False, metadata={"help": "If set to `True`, the model will be wrapped in `torch.compile`."}
)
torch_compile_backend: Optional[str] = field(
default=None,
metadata={
"help": "Which backend to use with `torch.compile`, passing one will trigger a model compilation.",
},
)
torch_compile_mode: Optional[str] = field(
default=None,
metadata={
"help": "Which mode to use with `torch.compile`, passing one will trigger a model compilation.",
},
)
dispatch_batches: Optional[bool] = field(
default=None,
metadata={"help": "Deprecated. Pass {'dispatch_batches':VALUE} to `accelerator_config`."},
)
split_batches: Optional[bool] = field(
default=None,
metadata={"help": "Deprecated. Pass {'split_batches':True} to `accelerator_config`."},
)
include_tokens_per_second: Optional[bool] = field(
default=False,
metadata={"help": "If set to `True`, the speed metrics will include `tgs` (tokens per second per device)."},
)
include_num_input_tokens_seen: Optional[bool] = field(
default=False,
metadata={
"help": "If set to `True`, will track the number of input tokens seen throughout training. (May be slower in distributed training)"
},
)
neftune_noise_alpha: Optional[float] = field(
default=None,
metadata={
"help": "Activates neftune noise embeddings into the model. NEFTune has been proven to drastically improve model performances for instrcution fine-tuning. Check out the original paper here: https://arxiv.org/abs/2310.05914 and the original code here: https://github.com/neelsjain/NEFTune. Only supported for `PreTrainedModel` and `PeftModel` classes."
},
)
optim_target_modules: Union[None, str, List[str]] = field(
default=None,
metadata={
"help": "Target modules for the optimizer defined in the `optim` argument. Only used for the GaLore optimizer at the moment."
},
)
def __str__(self):
self_as_dict = asdict(self)
del self_as_dict["per_gpu_train_batch_size"]
del self_as_dict["per_gpu_eval_batch_size"]
self_as_dict = {k: f"<{k.upper()}>" if k.endswith("_token") else v for k, v in self_as_dict.items()}
attrs_as_str = [f"{k}={v},\n" for k, v in sorted(self_as_dict.items())]
return f"{self.__class__.__name__}(\n{''.join(attrs_as_str)})"
__repr__ = __str__
@property
def train_batch_size(self) -> int:
"""
The actual batch size for training (may differ from `per_gpu_train_batch_size` in distributed training).
"""
if self.per_gpu_train_batch_size:
logger.warning(
"Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future "
"version. Using `--per_device_train_batch_size` is preferred."
)
per_device_batch_size = self.per_gpu_train_batch_size or self.per_device_train_batch_size
train_batch_size = per_device_batch_size * max(1, self.n_gpu)
return train_batch_size
@property
def eval_batch_size(self) -> int:
"""
The actual batch size for evaluation (may differ from `per_gpu_eval_batch_size` in distributed training).
"""
if self.per_gpu_eval_batch_size:
logger.warning(
"Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future "
"version. Using `--per_device_eval_batch_size` is preferred."
)
per_device_batch_size = self.per_gpu_eval_batch_size or self.per_device_eval_batch_size
eval_batch_size = per_device_batch_size * max(1, self.n_gpu)
return eval_batch_size
@property
def ddp_timeout_delta(self) -> timedelta:
"""
The actual timeout for torch.distributed.init_process_group since it expects a timedelta variable.
"""
return timedelta(seconds=self.ddp_timeout)
@property
def device(self) -> "torch.device":
"""
The device used by this process.
"""
requires_backends(self, ["torch"])
return self._setup_devices
@property
def n_gpu(self):
"""
The number of GPUs used by this process.
Note:
This will only be greater than one when you have multiple GPUs available but are not using distributed
training. For distributed training, it will always be 1.
"""
requires_backends(self, ["torch"])
if not hasattr(self, "_n_gpu"):
_ = self._setup_devices
return self._n_gpu
@property
def parallel_mode(self):
"""
The current mode used for parallelism if multiple GPUs/TPU cores are available. One of:
- `ParallelMode.NOT_PARALLEL`: no parallelism (CPU or one GPU).
- `ParallelMode.NOT_DISTRIBUTED`: several GPUs in one single process (uses `torch.nn.DataParallel`).
- `ParallelMode.DISTRIBUTED`: several GPUs, each having its own process (uses
`torch.nn.DistributedDataParallel`).
- `ParallelMode.TPU`: several TPU cores.
"""
requires_backends(self, ["torch"])
if is_torch_xla_available():
return ParallelMode.TPU
elif is_sagemaker_mp_enabled():
return ParallelMode.SAGEMAKER_MODEL_PARALLEL
elif is_sagemaker_dp_enabled():
return ParallelMode.SAGEMAKER_DATA_PARALLEL
elif (
self.distributed_state is not None and self.distributed_state.distributed_type != DistributedType.NO
) or (self.distributed_state is None and self.local_rank != -1):
return ParallelMode.DISTRIBUTED
elif self.n_gpu > 1:
return ParallelMode.NOT_DISTRIBUTED
else:
return ParallelMode.NOT_PARALLEL
@property
def world_size(self):
"""
The number of processes used in parallel.
"""
requires_backends(self, ["torch"])
if self.distributed_state is not None:
return self.distributed_state.num_processes
elif is_sagemaker_mp_enabled():
return smp.dp_size() if not smp.state.cfg.prescaled_batch else smp.rdp_size()
return 1
@property
def process_index(self):
"""
The index of the current process used.
"""
requires_backends(self, ["torch"])
if self.distributed_state is not None:
return self.distributed_state.process_index
elif is_sagemaker_mp_enabled():
return smp.dp_rank() if not smp.state.cfg.prescaled_batch else smp.rdp_rank()
return 0
@property
def local_process_index(self):
"""
The index of the local process used.
"""
requires_backends(self, ["torch"])
if self.distributed_state is not None:
return self.distributed_state.local_process_index
elif is_sagemaker_mp_enabled():
return smp.local_rank()
return 0
@property
def should_log(self):
"""
Whether or not the current process should produce log.
"""
if self.log_on_each_node:
return self.local_process_index == 0
else:
if is_sagemaker_mp_enabled():
return smp.rank() == 0
else:
return self.process_index == 0
@property
def should_save(self):
"""
Whether or not the current process should write to disk, e.g., to save models and checkpoints.
"""
if self.save_on_each_node:
return self.local_process_index == 0
else:
if is_sagemaker_mp_enabled():
return smp.rank() == 0
else:
return self.process_index == 0
def get_process_log_level(self):
"""
Returns the log level to be used depending on whether this process is the main process of node 0, main process
of node non-0, or a non-main process.
For the main process the log level defaults to the logging level set (`logging.WARNING` if you didn't do
anything) unless overridden by `log_level` argument.
For the replica processes the log level defaults to `logging.WARNING` unless overridden by `log_level_replica`
argument.
The choice between the main and replica process settings is made according to the return value of `should_log`.
"""
log_level = trainer_log_levels[self.log_level]
log_level_replica = trainer_log_levels[self.log_level_replica]
log_level_main_node = logging.get_verbosity() if log_level == -1 else log_level
log_level_replica_node = logging.get_verbosity() if log_level_replica == -1 else log_level_replica
return log_level_main_node if self.should_log else log_level_replica_node
@property
def place_model_on_device(self):
"""
Can be subclassed and overridden for some specific integrations.
"""
return not is_sagemaker_mp_enabled()
@property
def _no_sync_in_gradient_accumulation(self):
"""
Whether or not to use no_sync for the gradients when doing gradient accumulation.
"""
return not (
self.deepspeed or is_sagemaker_dp_enabled() or is_sagemaker_mp_enabled() or is_torch_neuroncore_available()
)
@contextlib.contextmanager
def main_process_first(self, local=True, desc="work"):
"""
A context manager for torch distributed environments where one needs to do something on the main process, while
blocking replicas, and when it's finished releasing the replicas.
One such use is for `datasets`'s `map` feature which, to be efficient, should be run once on the main process,
which upon completion saves a cached version of the results and which then automatically gets loaded by the
replicas.
Args:
local (`bool`, *optional*, defaults to `True`):
if `True` first means process of rank 0 of each node if `False` first means process of rank 0 of node
rank 0 In multi-node environment with a shared filesystem you most likely will want to use
`local=False` so that only the main process of the first node will do the processing. If however, the
filesystem is not shared, then the main process of each node will need to do the processing, which is
the default behavior.
desc (`str`, *optional*, defaults to `"work"`):
a work description to be used in debug logs
"""
if is_torch_available() and self.world_size > 1:
main_process_desc = "main local process" if local else "main process"
if self.distributed_state is not None:
is_main_process = (
self.distributed_state.is_local_main_process if local else self.distributed_state.is_main_process
)
elif is_sagemaker_mp_enabled():
is_main_process = smp.rank() == 0
try:
if not is_main_process:
logger.debug(f"{self.process_index}: waiting for the {main_process_desc} to perform {desc}")
if is_torch_xla_available():
xm.rendezvous(desc)
else:
dist.barrier()
yield
finally:
if is_main_process:
logger.debug(f"{self.process_index}: {main_process_desc} completed {desc}, releasing all replicas")
if is_torch_xla_available():
xm.rendezvous(desc)
else:
dist.barrier()
else:
yield
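Usage sketch for `main_process_first`; the `raw_dataset` / `tokenize_fn` names in the comment are hypothetical stand-ins for whatever per-dataset work only needs to run once per node (or once per job with `local=False`):
```
from transformers import TrainingArguments

args = TrainingArguments(output_dir="tmp_dir")

# With world_size == 1 this is a no-op; under torchrun the main process runs the
# body first while the replicas wait at a barrier, then the replicas run it
# (typically hitting a cache written by the main process).
with args.main_process_first(desc="dataset map pre-processing"):
    # tokenized = raw_dataset.map(tokenize_fn)  # hypothetical dataset / function
    pass
```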
def get_warmup_steps(self, num_training_steps: int):
"""
Get number of steps used for a linear warmup.
"""
warmup_steps = (
self.warmup_steps if self.warmup_steps > 0 else math.ceil(num_training_steps * self.warmup_ratio)
)
return warmup_steps
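A small worked example of the fallback logic: with `warmup_steps` left at 0 the ratio is used, so 1,000 training steps and `warmup_ratio=0.1` give `ceil(0.1 * 1000) = 100` warmup steps, while any positive `warmup_steps` takes precedence over the ratio:
```
from transformers import TrainingArguments

args = TrainingArguments(output_dir="tmp_dir", warmup_ratio=0.1)
print(args.get_warmup_steps(1000))   # 100 == ceil(0.1 * 1000)

args = TrainingArguments(output_dir="tmp_dir", warmup_steps=50, warmup_ratio=0.1)
print(args.get_warmup_steps(1000))   # 50 -- an explicit warmup_steps wins
```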
def to_dict(self):
"""
Serializes this instance while replacing `Enum` members by their values (for JSON serialization support). It
obfuscates the token values by replacing them with a placeholder.
"""
d = {field.name: getattr(self, field.name) for field in fields(self) if field.init}
for k, v in d.items():
if isinstance(v, Enum):
d[k] = v.value
if isinstance(v, list) and len(v) > 0 and isinstance(v[0], Enum):
d[k] = [x.value for x in v]
if k.endswith("_token"):
d[k] = f"<{k.upper()}>"
if is_accelerate_available() and isinstance(v, AcceleratorConfig):
d[k] = v.to_dict()
return d
def to_json_string(self):
"""
Serializes this instance to a JSON string.
"""
return json.dumps(self.to_dict(), indent=2)
def to_sanitized_dict(self) -> Dict[str, Any]:
"""
Sanitized serialization to use with TensorBoard's hparams.
"""
d = self.to_dict()
d = {**d, **{"train_batch_size": self.train_batch_size, "eval_batch_size": self.eval_batch_size}}
valid_types = [bool, int, float, str]
if is_torch_available():
valid_types.append(torch.Tensor)
return {k: v if type(v) in valid_types else str(v) for k, v in d.items()}
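The interplay of these three serializers is easiest to see on a concrete instance; the `hub_token` value below is a dummy string used only to demonstrate the `*_token` obfuscation:
```
from transformers import TrainingArguments

# hub_token is a dummy secret, present only to show the placeholder substitution.
args = TrainingArguments(output_dir="tmp_dir", hub_token="dummy-secret")

d = args.to_dict()
print(d["hub_token"])          # '<HUB_TOKEN>' -- the secret is never serialized
print(d["lr_scheduler_type"])  # 'linear' -- the Enum has been replaced by its value

hparams = args.to_sanitized_dict()
print(hparams["train_batch_size"], hparams["eval_batch_size"])  # resolved batch sizes for hparams logging
```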
def set_training(
self,
learning_rate: float = 5e-5,
batch_size: int = 8,
weight_decay: float = 0,
num_epochs: float = 3,
max_steps: int = -1,
gradient_accumulation_steps: int = 1,
seed: int = 42,
gradient_checkpointing: bool = False,
):
"""
A method that regroups all basic arguments linked to the training.
<Tip>
Calling this method will automatically set `self.do_train` to `True`.
</Tip>
Args:
learning_rate (`float`, *optional*, defaults to 5e-5):
The initial learning rate for the optimizer.
batch_size (`int`, *optional*, defaults to 8):
The batch size per device (GPU/TPU core/CPU...) used for training.
weight_decay (`float`, *optional*, defaults to 0):
The weight decay to apply (if not zero) to all layers except all bias and LayerNorm weights in the
optimizer.
num_epochs (`float`, *optional*, defaults to 3):
Total number of training epochs to perform (if not an integer, will perform the decimal-part fraction of
the last epoch before stopping training).
max_steps (`int`, *optional*, defaults to -1):
If set to a positive number, the total number of training steps to perform. Overrides `num_train_epochs`.
For a finite dataset, training is reiterated through the dataset (if all data is exhausted) until
`max_steps` is reached.
gradient_accumulation_steps (`int`, *optional*, defaults to 1):
Number of update steps to accumulate the gradients for, before performing a backward/update pass.
<Tip warning={true}>
When using gradient accumulation, one step is counted as one step with a backward pass. Therefore,
logging, evaluation and saving will be conducted every `gradient_accumulation_steps * xxx_step` training
examples.
</Tip>
seed (`int`, *optional*, defaults to 42):
Random seed that will be set at the beginning of training. To ensure reproducibility across runs, use
the [`~Trainer.model_init`] function to instantiate the model if it has some randomly initialized
parameters.
gradient_checkpointing (`bool`, *optional*, defaults to `False`):
If `True`, use gradient checkpointing to save memory at the expense of a slower backward pass.
Example:
```
>>> from transformers import TrainingArguments
>>> args = TrainingArguments("working_dir")
>>> args = args.set_training(learning_rate=1e-4, batch_size=32)
>>> args.learning_rate
1e-4
```
"""
self.do_train = True
self.learning_rate = learning_rate
self.per_device_train_batch_size = batch_size
self.weight_decay = weight_decay
self.num_train_epochs = num_epochs
self.max_steps = max_steps
self.gradient_accumulation_steps = gradient_accumulation_steps
self.seed = seed
self.gradient_checkpointing = gradient_checkpointing
return self
def set_evaluate(
self,
strategy: Union[str, IntervalStrategy] = "no",
steps: int = 500,
batch_size: int = 8,
accumulation_steps: Optional[int] = None,
delay: Optional[float] = None,
loss_only: bool = False,
jit_mode: bool = False,
):
"""
A method that regroups all arguments linked to evaluation.
Args:
strategy (`str` or [`~trainer_utils.IntervalStrategy`], *optional*, defaults to `"no"`):
The evaluation strategy to adopt during training. Possible values are:
- `"no"`: No evaluation is done during training.
- `"steps"`: Evaluation is done (and logged) every `steps`.
- `"epoch"`: Evaluation is done at the end of each epoch.
Setting a `strategy` different from `"no"` will set `self.do_eval` to `True`.
steps (`int`, *optional*, defaults to 500):
Number of update steps between two evaluations if `strategy="steps"`.
batch_size (`int`, *optional*, defaults to 8):
The batch size per device (GPU/TPU core/CPU...) used for evaluation.
accumulation_steps (`int`, *optional*):
Number of prediction steps to accumulate the output tensors for, before moving the results to the CPU.
If left unset, the whole predictions are accumulated on GPU/TPU before being moved to the CPU (faster
but requires more memory).
delay (`float`, *optional*):
Number of epochs or steps to wait for before the first evaluation can be performed, depending on the
evaluation_strategy.
loss_only (`bool`, *optional*, defaults to `False`):
Ignores all outputs except the loss.
jit_mode (`bool`, *optional*):
Whether or not to use PyTorch jit trace for inference.
Example:
```
>>> from transformers import TrainingArguments
>>> args = TrainingArguments("working_dir")
>>> args = args.set_evaluate(strategy="steps", steps=100)
>>> args.eval_steps
100
```
"""
self.evaluation_strategy = IntervalStrategy(strategy)
if self.evaluation_strategy == IntervalStrategy.STEPS and steps == 0:
raise ValueError("Setting `strategy` as 'steps' requires a positive value for `steps`.")
self.do_eval = self.evaluation_strategy != IntervalStrategy.NO
self.eval_steps = steps
self.per_device_eval_batch_size = batch_size
self.eval_accumulation_steps = accumulation_steps
self.eval_delay = delay
self.prediction_loss_only = loss_only
self.jit_mode_eval = jit_mode
return self
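Because every `set_*` helper returns `self`, the configuration methods compose into a fluent chain; a short sketch with purely illustrative values:
```
from transformers import TrainingArguments

args = (
    TrainingArguments(output_dir="tmp_dir")
    .set_training(learning_rate=1e-4, batch_size=32, num_epochs=5)
    .set_evaluate(strategy="steps", steps=200, batch_size=64)
)
print(args.do_train, args.do_eval)          # True True
print(args.learning_rate, args.eval_steps)  # 0.0001 200
```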
def set_testing(
self,
batch_size: int = 8,
loss_only: bool = False,
jit_mode: bool = False,
):
"""
A method that regroups all basic arguments linked to testing on a held-out dataset.
<Tip>
Calling this method will automatically set `self.do_predict` to `True`.
</Tip>
Args:
batch_size (`int`, *optional*, defaults to 8):
The batch size per device (GPU/TPU core/CPU...) used for testing.
loss_only (`bool`, *optional*, defaults to `False`):
Ignores all outputs except the loss.
jit_mode (`bool`, *optional*):
Whether or not to use PyTorch jit trace for inference.
Example:
```
>>> from transformers import TrainingArguments
>>> args = TrainingArguments("working_dir")
>>> args = args.set_testing(batch_size=32)
>>> args.per_device_eval_batch_size
32
```
"""
self.do_predict = True
self.per_device_eval_batch_size = batch_size
self.prediction_loss_only = loss_only
self.jit_mode_eval = jit_mode
return self
def set_save(
self,
strategy: Union[str, IntervalStrategy] = "steps",
steps: int = 500,
total_limit: Optional[int] = None,
on_each_node: bool = False,
):
"""
A method that regroups all arguments linked to checkpoint saving.
Args:
strategy (`str` or [`~trainer_utils.IntervalStrategy`], *optional*, defaults to `"steps"`):
The checkpoint save strategy to adopt during training. Possible values are:
- `"no"`: No save is done during training.
- `"epoch"`: Save is done at the end of each epoch.
- `"steps"`: Save is done every `save_steps`.
steps (`int`, *optional*, defaults to 500):
Number of update steps between two checkpoint saves if `strategy="steps"`.
total_limit (`int`, *optional*):
If a value is passed, will limit the total number of checkpoints. Deletes the older checkpoints in
`output_dir`.
on_each_node (`bool`, *optional*, defaults to `False`):
When doing multi-node distributed training, whether to save models and checkpoints on each node, or
only on the main one.
This should not be activated when the different nodes use the same storage as the files will be saved
with the same names for each node.
Example:
```
>>> from transformers import TrainingArguments
>>> args = TrainingArguments("working_dir")
>>> args = args.set_save(strategy="steps", steps=100)
>>> args.save_steps
100
```
"""
self.save_strategy = IntervalStrategy(strategy)
if self.save_strategy == IntervalStrategy.STEPS and steps == 0:
raise ValueError("Setting `strategy` as 'steps' requires a positive value for `steps`.")
self.save_steps = steps
self.save_total_limit = total_limit
self.save_on_each_node = on_each_node
return self
def set_logging(
self,
strategy: Union[str, IntervalStrategy] = "steps",
steps: int = 500,
report_to: Union[str, List[str]] = "none",
level: str = "passive",
first_step: bool = False,
nan_inf_filter: bool = False,
on_each_node: bool = False,
replica_level: str = "passive",
def set_push_to_hub(
self,
model_id: str,
strategy: Union[str, HubStrategy] = "every_save",
token: Optional[str] = None,
private_repo: bool = False,
always_push: bool = False,
):
...  # docstring and body omitted in this excerpt
def set_optimizer(
self,
name: Union[str, OptimizerNames] = "adamw_torch",
learning_rate: float = 5e-5,
weight_decay: float = 0,
beta1: float = 0.9,
beta2: float = 0.999,
epsilon: float = 1e-8,
args: Optional[str] = None,
):
"""
A method that regroups all arguments linked to the optimizer and its hyperparameters.
Args:
name (`str` or [`training_args.OptimizerNames`], *optional*, defaults to `"adamw_torch"`):
The optimizer to use: `"adamw_hf"`, `"adamw_torch"`, `"adamw_torch_fused"`, `"adamw_apex_fused"`,
`"adamw_anyprecision"` or `"adafactor"`.
learning_rate (`float`, *optional*, defaults to 5e-5):
The initial learning rate.
weight_decay (`float`, *optional*, defaults to 0):
The weight decay to apply (if not zero) to all layers except all bias and LayerNorm weights.
beta1 (`float`, *optional*, defaults to 0.9):
The beta1 hyperparameter for the adam optimizer or its variants.
beta2 (`float`, *optional*, defaults to 0.999):
The beta2 hyperparameter for the adam optimizer or its variants.
epsilon (`float`, *optional*, defaults to 1e-8):
The epsilon hyperparameter for the adam optimizer or its variants.
args (`str`, *optional*):
Optional arguments that are supplied to AnyPrecisionAdamW (only useful when
`optim="adamw_anyprecision"`).
Example:
```
>>> from transformers import TrainingArguments
>>> args = TrainingArguments("working_dir")
>>> args = args.set_optimizer(name="adamw_torch", beta1=0.8)
>>> args.optim
'adamw_torch'
```
"""
self.optim = OptimizerNames(name)
self.learning_rate = learning_rate
self.weight_decay = weight_decay
self.adam_beta1 = beta1
self.adam_beta2 = beta2
self.adam_epsilon = epsilon
self.optim_args = args
return self
def set_lr_scheduler(
self,
name: Union[str, SchedulerType] = "linear",
num_epochs: float = 3.0,
max_steps: int = -1,
warmup_ratio: float = 0,
warmup_steps: int = 0,
):
"""
A method that regroups all arguments linked to the learning rate scheduler and its hyperparameters.
Args:
name (`str` or [`SchedulerType`], *optional*, defaults to `"linear"`):
The scheduler type to use. See the documentation of [`SchedulerType`] for all possible values.
num_epochs (`float`, *optional*, defaults to 3.0):
Total number of training epochs to perform (if not an integer, will perform the decimal-part fraction
of the last epoch before stopping training).
max_steps (`int`, *optional*, defaults to -1):
If set to a positive number, the total number of training steps to perform. Overrides `num_train_epochs`.
For a finite dataset, training is reiterated through the dataset (if all data is exhausted) until
`max_steps` is reached.
warmup_ratio (`float`, *optional*, defaults to 0.0):
Ratio of total training steps used for a linear warmup from 0 to `learning_rate`.
warmup_steps (`int`, *optional*, defaults to 0):
Number of steps used for a linear warmup from 0 to `learning_rate`. Overrides any effect of
`warmup_ratio`.
Example:
```
>>> from transformers import TrainingArguments
>>> args = TrainingArguments("working_dir")
>>> args = args.set_lr_scheduler(name="cosine", warmup_ratio=0.05)
>>> args.warmup_ratio
0.05
```
"""
self.lr_scheduler_type = SchedulerType(name)
self.num_train_epochs = num_epochs
self.max_steps = max_steps
self.warmup_ratio = warmup_ratio
self.warmup_steps = warmup_steps
return self
def set_dataloader(
self,
train_batch_size: int = 8,
eval_batch_size: int = 8,
drop_last: bool = False,
num_workers: int = 0,
pin_memory: bool = True,
persistent_workers: bool = False,
prefetch_factor: Optional[int] = None,
auto_find_batch_size: bool = False,
ignore_data_skip: bool = False,
sampler_seed: Optional[int] = None,
):
...  # docstring and body omitted in this excerpt
class ParallelMode(Enum):
NOT_PARALLEL = "not_parallel"
NOT_DISTRIBUTED = "not_distributed"
DISTRIBUTED = "distributed"
SAGEMAKER_MODEL_PARALLEL = "sagemaker_model_parallel"
SAGEMAKER_DATA_PARALLEL = "sagemaker_data_parallel"
TPU = "tpu"