Transformers Source Code Analysis (136)
.\trainer_callback.py
"""
Callbacks to use with the Trainer class and customize the training loop.
"""
import dataclasses
import json
from dataclasses import dataclass
from typing import Dict, List, Optional, Union
import numpy as np
from tqdm.auto import tqdm
from .trainer_utils import IntervalStrategy, has_length
from .training_args import TrainingArguments
from .utils import logging
logger = logging.get_logger(__name__)
@dataclass
class TrainerState:
"""
A class containing the [`Trainer`] inner state that will be saved along the model and optimizer when checkpointing
and passed to the [`TrainerCallback`].
<Tip>
In all this class, one step is to be understood as one update step. When using gradient accumulation, one update
step may require several forward and backward passes: if you use `gradient_accumulation_steps=n`, then one update
step requires going through *n* batches.
</Tip>
"""
epoch: Optional[float] = None
global_step: int = 0
max_steps: int = 0
logging_steps: int = 500
eval_steps: int = 500
save_steps: int = 500
train_batch_size: int = None
num_input_tokens_seen: int = 0
total_flos: float = 0
log_history: List[Dict[str, float]] = None
best_metric: Optional[float] = None
best_model_checkpoint: Optional[str] = None
is_local_process_zero: bool = True
is_world_process_zero: bool = True
is_hyper_param_search: bool = False
trial_name: str = None
trial_params: Dict[str, Union[str, float, int, bool]] = None
def __post_init__(self):
if self.log_history is None:
self.log_history = []
def save_to_json(self, json_path: str):
"""将实例内容以 JSON 格式保存到指定的 `json_path` 文件中。"""
json_string = json.dumps(dataclasses.asdict(self), indent=2, sort_keys=True) + "\n"
with open(json_path, "w", encoding="utf-8") as f:
f.write(json_string)
@classmethod
def load_from_json(cls, json_path: str):
"""从指定的 `json_path` 文件加载内容并创建一个类实例。"""
with open(json_path, "r", encoding="utf-8") as f:
text = f.read()
return cls(**json.loads(text))
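Not part of the original file: a minimal sketch of how the two helpers above round-trip a `TrainerState` through JSON (the file name is arbitrary).
```
from transformers.trainer_callback import TrainerState

state = TrainerState(global_step=10, max_steps=100)
state.log_history.append({"loss": 0.5, "step": 10})
state.save_to_json("trainer_state.json")            # dump the dataclass as JSON

restored = TrainerState.load_from_json("trainer_state.json")
assert restored.global_step == 10
assert restored.log_history[0]["loss"] == 0.5
```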
@dataclass
class TrainerControl:
"""
A class that handles the [`Trainer`] control flow. This class is used by the [`TrainerCallback`] to activate some
switches in the training loop.
Args:
should_training_stop (`bool`, *optional*, defaults to `False`):
Whether or not the training should be interrupted.
If `True`, this variable will not be set back to `False`. The training will just stop.
should_epoch_stop (`bool`, *optional*, defaults to `False`):
Whether or not the current epoch should be interrupted.
If `True`, this variable will be set back to `False` at the beginning of the next epoch.
should_save (`bool`, *optional*, defaults to `False`):
Whether or not the model should be saved at this step.
If `True`, this variable will be set back to `False` at the beginning of the next step.
should_evaluate (`bool`, *optional*, defaults to `False`):
Whether or not the model should be evaluated at this step.
If `True`, this variable will be set back to `False` at the beginning of the next step.
should_log (`bool`, *optional*, defaults to `False`):
Whether or not the logs should be reported at this step.
If `True`, this variable will be set back to `False` at the beginning of the next step.
"""
should_training_stop: bool = False
should_epoch_stop: bool = False
should_save: bool = False
should_evaluate: bool = False
should_log: bool = False
def _new_training(self):
"""Internal method that resets the variable for a new training."""
self.should_training_stop = False
def _new_epoch(self):
"""Internal method that resets the variable for a new epoch."""
self.should_epoch_stop = False
def _new_step(self):
"""Internal method that resets the variable for a new step."""
self.should_save = False
self.should_evaluate = False
self.should_log = False
class TrainerCallback:
"""
A class for objects that will inspect the state of the training loop at some events and take some decisions. At
each of those events the following arguments are available:
"""
Args:
args ([`TrainingArguments`]):
用于实例化 [`Trainer`] 的训练参数。
state ([`TrainerState`]):
[`Trainer`] 的当前状态。
control ([`TrainerControl`]):
返回给 [`Trainer`] 的对象,用于做出某些决策。
model ([`PreTrainedModel`] or `torch.nn.Module`):
正在训练的模型。
tokenizer ([`PreTrainedTokenizer`]):
用于对数据进行编码的分词器。
optimizer (`torch.optim.Optimizer`):
训练步骤中使用的优化器。
lr_scheduler (`torch.optim.lr_scheduler.LambdaLR`):
用于设置学习率的调度器。
train_dataloader (`torch.utils.data.DataLoader`, *optional*):
用于训练的当前数据加载器。
eval_dataloader (`torch.utils.data.DataLoader`, *optional*):
用于评估的当前数据加载器。
metrics (`Dict[str, float]`):
上次评估阶段计算的指标。
只能在 `on_evaluate` 事件中访问。
logs (`Dict[str, float]`):
要记录的值。
只能在 `on_log` 事件中访问。
`control` 对象是唯一可以被回调函数更改的对象,在更改它的事件中应返回修改后的版本。
参数 `args`, `state` 和 `control` 对于所有事件都是位置参数,其余的参数在 `kwargs` 中分组。
您可以在事件的签名中解包需要使用的参数。例如,查看简单 [`~transformers.PrinterCallback`] 的代码示例。
示例:
```
class PrinterCallback(TrainerCallback):
def on_log(self, args, state, control, logs=None, **kwargs):
_ = logs.pop("total_flos", None)
if state.is_local_process_zero:
print(logs)
```
def on_epoch_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
"""
Event called at the end of an epoch.
"""
pass
def on_step_begin(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
"""
Event called at the beginning of a training step. If using gradient accumulation, one training step might take
several inputs.
"""
pass
def on_substep_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
"""
Event called at the end of a substep during gradient accumulation.
"""
pass
def on_step_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
"""
Event called at the end of a training step. If using gradient accumulation, one training step might take
several inputs.
"""
pass
def on_evaluate(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
"""
Event called after an evaluation phase.
"""
pass
def on_predict(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, metrics, **kwargs):
"""
Event called after a successful prediction.
"""
pass
def on_save(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
"""
Event called after a checkpoint save.
"""
pass
def on_log(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
"""
Event called after logging the last logs.
"""
pass
def on_prediction_step(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
"""
Event called after a prediction step.
"""
pass
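To make the interplay between `TrainerCallback` and `TrainerControl` concrete, here is a small illustrative callback (not from the library) that flips `control.should_save` whenever the logged loss falls below a hypothetical threshold; the `Trainer` resets the flag at the next step, as described in the `TrainerControl` docstring.
```
from transformers import TrainerCallback

class SaveOnLowLossCallback(TrainerCallback):
    """Hypothetical example: ask the Trainer to checkpoint when the logged loss is low enough."""

    def __init__(self, threshold=0.1):
        self.threshold = threshold

    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs is not None and logs.get("loss", float("inf")) < self.threshold:
            control.should_save = True  # reset to False by the Trainer at the next step
        return control  # returning the (possibly modified) control object is how changes propagate
```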
class CallbackHandler(TrainerCallback):
"""Internal class that just calls the list of callbacks in order."""
def __init__(self, callbacks, model, tokenizer, optimizer, lr_scheduler):
self.callbacks = []
for cb in callbacks:
self.add_callback(cb)
self.model = model
self.tokenizer = tokenizer
self.optimizer = optimizer
self.lr_scheduler = lr_scheduler
self.train_dataloader = None
self.eval_dataloader = None
if not any(isinstance(cb, DefaultFlowCallback) for cb in self.callbacks):
logger.warning(
"The Trainer will not work properly if you don't have a `DefaultFlowCallback` in its callbacks. You\n"
+ "should add one before training with `trainer.add_callback(DefaultFlowCallback). The current list of"
+ "callbacks is\n:"
+ self.callback_list
)
def add_callback(self, callback):
cb = callback() if isinstance(callback, type) else callback
cb_class = callback if isinstance(callback, type) else callback.__class__
if cb_class in [c.__class__ for c in self.callbacks]:
logger.warning(
f"You are adding a {cb_class} to the callbacks of this Trainer, but there is already one. The current"
+ "list of callbacks is\n:"
+ self.callback_list
)
self.callbacks.append(cb)
def pop_callback(self, callback):
if isinstance(callback, type):
for cb in self.callbacks:
if isinstance(cb, callback):
self.callbacks.remove(cb)
return cb
else:
for cb in self.callbacks:
if cb == callback:
self.callbacks.remove(cb)
return cb
def remove_callback(self, callback):
if isinstance(callback, type):
for cb in self.callbacks:
if isinstance(cb, callback):
self.callbacks.remove(cb)
return
else:
self.callbacks.remove(callback)
@property
def callback_list(self):
return "\n".join(cb.__class__.__name__ for cb in self.callbacks)
def on_init_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl):
return self.call_event("on_init_end", args, state, control)
def on_train_begin(self, args: TrainingArguments, state: TrainerState, control: TrainerControl):
control.should_training_stop = False
return self.call_event("on_train_begin", args, state, control)
def on_train_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl):
return self.call_event("on_train_end", args, state, control)
def on_epoch_begin(self, args: TrainingArguments, state: TrainerState, control: TrainerControl):
control.should_epoch_stop = False
return self.call_event("on_epoch_begin", args, state, control)
def on_epoch_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl):
return self.call_event("on_epoch_end", args, state, control)
def on_step_begin(self, args: TrainingArguments, state: TrainerState, control: TrainerControl):
control.should_log = False
control.should_evaluate = False
control.should_save = False
return self.call_event("on_step_begin", args, state, control)
def on_substep_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl):
return self.call_event("on_substep_end", args, state, control)
def on_step_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl):
return self.call_event("on_step_end", args, state, control)
def on_evaluate(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, metrics):
control.should_evaluate = False
return self.call_event("on_evaluate", args, state, control, metrics=metrics)
def on_predict(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, metrics):
return self.call_event("on_predict", args, state, control, metrics=metrics)
def on_save(self, args: TrainingArguments, state: TrainerState, control: TrainerControl):
control.should_save = False
return self.call_event("on_save", args, state, control)
def on_log(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, logs):
control.should_log = False
return self.call_event("on_log", args, state, control, logs=logs)
def on_prediction_step(self, args: TrainingArguments, state: TrainerState, control: TrainerControl):
return self.call_event("on_prediction_step", args, state, control)
def call_event(self, event, args, state, control, **kwargs):
for callback in self.callbacks:
result = getattr(callback, event)(
args,
state,
control,
model=self.model,
tokenizer=self.tokenizer,
optimizer=self.optimizer,
lr_scheduler=self.lr_scheduler,
train_dataloader=self.train_dataloader,
eval_dataloader=self.eval_dataloader,
**kwargs,
)
if result is not None:
control = result
return control
"""
A [`TrainerCallback`] that handles the default flow of the training loop for logs, evaluation and checkpoints.
"""
def on_step_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
if state.global_step == 1 and args.logging_first_step:
control.should_log = True
if args.logging_strategy == IntervalStrategy.STEPS and state.global_step % state.logging_steps == 0:
control.should_log = True
if (
args.evaluation_strategy == IntervalStrategy.STEPS
and state.global_step % state.eval_steps == 0
and args.eval_delay <= state.global_step
):
control.should_evaluate = True
if (
args.save_strategy == IntervalStrategy.STEPS
and state.save_steps > 0
and state.global_step % state.save_steps == 0
):
control.should_save = True
if state.global_step >= state.max_steps:
control.should_training_stop = True
return control
def on_epoch_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
if args.logging_strategy == IntervalStrategy.EPOCH:
control.should_log = True
if args.evaluation_strategy == IntervalStrategy.EPOCH and args.eval_delay <= state.epoch:
control.should_evaluate = True
if args.save_strategy == IntervalStrategy.EPOCH:
control.should_save = True
return control
class ProgressCallback(TrainerCallback):
"""
A [`TrainerCallback`] that displays the progress of training or evaluation.
"""
def __init__(self):
self.training_bar = None
self.prediction_bar = None
def on_train_begin(self, args, state, control, **kwargs):
if state.is_world_process_zero:
self.training_bar = tqdm(total=state.max_steps, dynamic_ncols=True)
self.current_step = 0
def on_step_end(self, args, state, control, **kwargs):
if state.is_world_process_zero:
self.training_bar.update(state.global_step - self.current_step)
self.current_step = state.global_step
def on_prediction_step(self, args, state, control, eval_dataloader=None, **kwargs):
if state.is_world_process_zero and has_length(eval_dataloader):
if self.prediction_bar is None:
self.prediction_bar = tqdm(
total=len(eval_dataloader), leave=self.training_bar is None, dynamic_ncols=True
)
self.prediction_bar.update(1)
def on_evaluate(self, args, state, control, **kwargs):
if state.is_world_process_zero:
if self.prediction_bar is not None:
self.prediction_bar.close()
self.prediction_bar = None
def on_predict(self, args, state, control, **kwargs):
if state.is_world_process_zero:
if self.prediction_bar is not None:
self.prediction_bar.close()
self.prediction_bar = None
def on_log(self, args, state, control, logs=None, **kwargs):
if state.is_world_process_zero and self.training_bar is not None:
_ = logs.pop("total_flos", None)
self.training_bar.write(str(logs))
def on_train_end(self, args, state, control, **kwargs):
if state.is_world_process_zero:
self.training_bar.close()
self.training_bar = None
class PrinterCallback(TrainerCallback):
"""
A bare [`TrainerCallback`] that just prints the logs.
"""
def on_log(self, args, state, control, logs=None, **kwargs):
_ = logs.pop("total_flos", None)
if state.is_local_process_zero:
print(logs)
class EarlyStoppingCallback(TrainerCallback):
"""
A [`TrainerCallback`] that handles early stopping.
Args:
early_stopping_patience (`int`):
Use with `metric_for_best_model` to stop training when the specified metric worsens for
`early_stopping_patience` evaluation calls.
early_stopping_threshold (`float`, *optional*):
Use with TrainingArguments `metric_for_best_model` and `early_stopping_patience` to denote how much the
specified metric must improve to satisfy early stopping conditions.
This callback depends on [`TrainingArguments`] argument *load_best_model_at_end* functionality to set best_metric
in [`TrainerState`]. Note that if the [`TrainingArguments`] argument *save_steps* differs from *eval_steps*, the
early stopping will not occur until the next save step.
"""
def __init__(self, early_stopping_patience: int = 1, early_stopping_threshold: Optional[float] = 0.0):
self.early_stopping_patience = early_stopping_patience
self.early_stopping_threshold = early_stopping_threshold
self.early_stopping_patience_counter = 0
def check_metric_value(self, args, state, control, metric_value):
operator = np.greater if args.greater_is_better else np.less
if state.best_metric is None or (
operator(metric_value, state.best_metric)
and abs(metric_value - state.best_metric) > self.early_stopping_threshold
):
self.early_stopping_patience_counter = 0
else:
self.early_stopping_patience_counter += 1
def on_train_begin(self, args, state, control, **kwargs):
assert args.load_best_model_at_end, "EarlyStoppingCallback requires load_best_model_at_end = True"
assert args.metric_for_best_model is not None, "EarlyStoppingCallback requires metric_for_best_model is defined"
assert args.evaluation_strategy != IntervalStrategy.NO, "EarlyStoppingCallback requires IntervalStrategy of steps or epoch"
def on_evaluate(self, args, state, control, metrics, **kwargs):
metric_to_check = args.metric_for_best_model
if not metric_to_check.startswith("eval_"):
metric_to_check = f"eval_{metric_to_check}"
metric_value = metrics.get(metric_to_check)
if metric_value is None:
logger.warning(
f"early stopping required metric_for_best_model, but did not find {metric_to_check} so early stopping"
" is disabled"
)
return
self.check_metric_value(args, state, control, metric_value)
if self.early_stopping_patience_counter >= self.early_stopping_patience:
control.should_training_stop = True
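An illustrative way to wire `EarlyStoppingCallback` into a `Trainer`; `model`, `train_dataset` and `eval_dataset` are assumed to exist elsewhere, and all argument values are only examples.
```
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback

args = TrainingArguments(
    output_dir="out",
    evaluation_strategy="steps",
    eval_steps=500,
    save_steps=500,                      # if this differs from eval_steps, early stopping waits for the next save step
    load_best_model_at_end=True,         # required by EarlyStoppingCallback.on_train_begin
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3, early_stopping_threshold=0.0)],
)
trainer.train()
```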
.\trainer_pt_utils.py
"""
Torch utilities for the Trainer class.
"""
import copy
import datetime
import io
import json
import math
import os
import sys
import warnings
from collections.abc import Mapping
from contextlib import contextmanager
from dataclasses import dataclass, field
from logging import StreamHandler
from typing import Any, Dict, Iterator, List, Optional, Union
import numpy as np
import torch
import torch.distributed as dist
from torch import nn
from torch.optim.lr_scheduler import LRScheduler
from torch.utils.data import Dataset, IterableDataset, RandomSampler, Sampler
from torch.utils.data.distributed import DistributedSampler
from .integrations.deepspeed import is_deepspeed_zero3_enabled
from .tokenization_utils_base import BatchEncoding
from .utils import is_sagemaker_mp_enabled, is_torch_xla_available, is_training_run_on_sagemaker, logging
if is_training_run_on_sagemaker():
logging.add_handler(StreamHandler(sys.stdout))
if is_torch_xla_available():
import torch_xla.core.xla_model as xm
try:
from torch.optim.lr_scheduler import SAVE_STATE_WARNING
except ImportError:
SAVE_STATE_WARNING = ""
logger = logging.get_logger(__name__)
def get_dataloader_sampler(dataloader):
if hasattr(dataloader, "batch_sampler") and dataloader.batch_sampler is not None:
return get_dataloader_sampler(dataloader.batch_sampler)
elif hasattr(dataloader, "sampler"):
return dataloader.sampler
def atleast_1d(tensor_or_array: Union[torch.Tensor, np.ndarray]):
if isinstance(tensor_or_array, torch.Tensor):
if hasattr(torch, "atleast_1d"):
tensor_or_array = torch.atleast_1d(tensor_or_array)
elif tensor_or_array.ndim < 1:
tensor_or_array = tensor_or_array[None]
else:
tensor_or_array = np.atleast_1d(tensor_or_array)
return tensor_or_array
def torch_pad_and_concatenate(tensor1, tensor2, padding_index=-100):
"""连接 `tensor1` 和 `tensor2` 在第一轴上,如果需要在第二轴上进行填充。"""
tensor1 = atleast_1d(tensor1)
tensor2 = atleast_1d(tensor2)
if len(tensor1.shape) == 1 or tensor1.shape[1] == tensor2.shape[1]:
return torch.cat((tensor1, tensor2), dim=0)
new_shape = (tensor1.shape[0] + tensor2.shape[0], max(tensor1.shape[1], tensor2.shape[1])) + tensor1.shape[2:]
result = tensor1.new_full(new_shape, padding_index)
result[: tensor1.shape[0], : tensor1.shape[1]] = tensor1
result[tensor1.shape[0] :, : tensor2.shape[1]] = tensor2
return result
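A quick check (not from the file) of what `torch_pad_and_concatenate` does when the two tensors disagree on the second dimension.
```
import torch
from transformers.trainer_pt_utils import torch_pad_and_concatenate

a = torch.ones(2, 3)      # batch of 2, sequence length 3
b = torch.zeros(3, 5)     # batch of 3, sequence length 5
out = torch_pad_and_concatenate(a, b, padding_index=-100)
print(out.shape)          # torch.Size([5, 5]); rows from `a` are padded with -100 beyond column 3
```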
def numpy_pad_and_concatenate(array1, array2, padding_index=-100):
"""Concatenates `array1` and `array2` on first axis, applying padding on the second if necessary."""
array1 = atleast_1d(array1)
array2 = atleast_1d(array2)
if len(array1.shape) == 1 or array1.shape[1] == array2.shape[1]:
return np.concatenate((array1, array2), axis=0)
new_shape = (array1.shape[0] + array2.shape[0], max(array1.shape[1], array2.shape[1])) + array1.shape[2:]
result = np.full_like(array1, padding_index, shape=new_shape)
result[: array1.shape[0], : array1.shape[1]] = array1
result[array1.shape[0] :, : array2.shape[1]] = array2
return result
def nested_concat(tensors, new_tensors, padding_index=-100):
"""
Concat the `new_tensors` to `tensors` on the first dim and pad them on the second if needed. Works for tensors or
nested list/tuples/dict of tensors.
"""
assert type(tensors) == type(
new_tensors
), f"Expected `tensors` and `new_tensors` to have the same type but found {type(tensors)} and {type(new_tensors)}."
if isinstance(tensors, (list, tuple)):
return type(tensors)(nested_concat(t, n, padding_index=padding_index) for t, n in zip(tensors, new_tensors))
elif isinstance(tensors, torch.Tensor):
return torch_pad_and_concatenate(tensors, new_tensors, padding_index=padding_index)
elif isinstance(tensors, Mapping):
return type(tensors)({k: nested_concat(t, new_tensors[k], padding_index=padding_index) for k, t in tensors.items()})
elif isinstance(tensors, np.ndarray):
return numpy_pad_and_concatenate(tensors, new_tensors, padding_index=padding_index)
else:
raise TypeError(f"Unsupported type for concatenation: got {type(tensors)}")
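A small sketch showing `nested_concat` on a nested dict of tensors, the typical shape of per-step model outputs in the evaluation loop.
```
import torch
from transformers.trainer_pt_utils import nested_concat

batch1 = {"logits": torch.ones(2, 3), "labels": torch.zeros(2)}
batch2 = {"logits": torch.ones(1, 5), "labels": torch.zeros(1)}
merged = nested_concat(batch1, batch2, padding_index=-100)
print(merged["logits"].shape)  # torch.Size([3, 5]) -- padded on the second axis
print(merged["labels"].shape)  # torch.Size([3])
```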
def find_batch_size(tensors):
"""
Find the first dimension of a tensor in a nested list/tuple/dict of tensors.
"""
if isinstance(tensors, (list, tuple)):
for t in tensors:
result = find_batch_size(t)
if result is not None:
return result
elif isinstance(tensors, Mapping):
for key, value in tensors.items():
result = find_batch_size(value)
if result is not None:
return result
elif isinstance(tensors, torch.Tensor):
return tensors.shape[0] if len(tensors.shape) >= 1 else None
elif isinstance(tensors, np.ndarray):
return tensors.shape[0] if len(tensors.shape) >= 1 else None
def nested_numpify(tensors):
"Numpify `tensors` (even if it's a nested list/tuple/dict of tensors)."
if isinstance(tensors, (list, tuple)):
return type(tensors)(nested_numpify(t) for t in tensors)
if isinstance(tensors, Mapping):
return type(tensors)({k: nested_numpify(t) for k, t in tensors.items()})
t = tensors.cpu()
if t.dtype == torch.bfloat16:
t = t.to(torch.float32)
return t.numpy()
def nested_detach(tensors):
if isinstance(tensors, (list, tuple)):
return type(tensors)(nested_detach(t) for t in tensors)
elif isinstance(tensors, Mapping):
return type(tensors)({k: nested_detach(t) for k, t in tensors.items()})
return tensors.detach()
def nested_xla_mesh_reduce(tensors, name):
if is_torch_xla_available():
import torch_xla.core.xla_model as xm
if isinstance(tensors, (list, tuple)):
return type(tensors)(nested_xla_mesh_reduce(t, f"{name}_{i}") for i, t in enumerate(tensors))
if isinstance(tensors, Mapping):
return type(tensors)(
{k: nested_xla_mesh_reduce(t, f"{name}_{i}") for i, (k, t) in enumerate(tensors.items())}
)
tensors = atleast_1d(tensors)
return xm.mesh_reduce(name, tensors, torch.cat)
else:
raise ImportError("Torch xla must be installed to use `nested_xla_mesh_reduce`")
def distributed_concat(tensor: Any, num_total_examples: Optional[int] = None) -> Any:
try:
if isinstance(tensor, (tuple, list)):
return type(tensor)(distributed_concat(t, num_total_examples) for t in tensor)
if isinstance(tensor, Mapping):
return type(tensor)({k: distributed_concat(t, num_total_examples) for k, t in tensor.items()})
tensor = atleast_1d(tensor).contiguous()
output_tensors = [tensor.clone() for _ in range(dist.get_world_size())]
dist.all_gather(output_tensors, tensor)
concat = torch.cat(output_tensors, dim=0)
if num_total_examples is not None:
concat = concat[:num_total_examples]
return concat
except AssertionError:
raise AssertionError("Not currently using distributed training")
def distributed_broadcast_scalars(
scalars: List[Union[int, float]],
num_total_examples: Optional[int] = None,
device: Optional[torch.device] = torch.device("cuda"),
) -> torch.Tensor:
try:
tensorized_scalar = torch.tensor(scalars).to(device)
output_tensors = [tensorized_scalar.clone() for _ in range(dist.get_world_size())]
dist.all_gather(output_tensors, tensorized_scalar)
concat = torch.cat(output_tensors, dim=0)
if num_total_examples is not None:
concat = concat[:num_total_examples]
return concat
except AssertionError:
raise AssertionError("Not currently using distributed training")
def reissue_pt_warnings(caught_warnings):
if len(caught_warnings) > 1:
for w in caught_warnings:
if w.category != UserWarning or w.message != SAVE_STATE_WARNING:
warnings.warn(w.message, w.category)
@contextmanager
def torch_distributed_zero_first(local_rank: int):
"""
Decorator to make all processes in distributed training wait for each local master to do something.
Args:
local_rank (`int`): The rank of the local process.
"""
if local_rank not in [-1, 0]:
dist.barrier()
yield
if local_rank == 0:
dist.barrier()
class DistributedSamplerWithLoop(DistributedSampler):
"""
Like a `torch.utils.data.distributed.DistributedSampler` but loops at the end back to the beginning of the
shuffled samples to make each process have a round multiple of batch_size samples.
Args:
dataset (`torch.utils.data.Dataset`):
Dataset used for sampling.
batch_size (`int`):
The batch size used with this sampler.
kwargs (`Dict[str, Any]`, *optional*):
All other keyword arguments passed to `DistributedSampler`.
"""
def __init__(self, dataset, batch_size, **kwargs):
super().__init__(dataset, **kwargs)
self.batch_size = batch_size
def __iter__(self):
indices = list(super().__iter__())
remainder = 0 if len(indices) % self.batch_size == 0 else self.batch_size - len(indices) % self.batch_size
start_remainder = 1 if self.rank < len(self.dataset) % self.num_replicas else 0
indices += indices[start_remainder : start_remainder + remainder]
return iter(indices)
class SequentialDistributedSampler(Sampler):
"""
Distributed Sampler that subsamples indices sequentially, making it easier to collate all results at the end.
Even though we only use this sampler for eval and predict (no training), which means that the model params won't
have to be synced (i.e. will not hang for synchronization even if varied number of forward passes), we still add
extra samples to the sampler to make it evenly divisible, so the resulting tensors can easily be `gather`ed or
`reduce`d at the end of the loop.
"""
def __init__(self, dataset, num_replicas=None, rank=None, batch_size=None):
warnings.warn(
"SequentialDistributedSampler is deprecated and will be removed in v5 of Transformers.",
FutureWarning,
)
if num_replicas is None:
if not dist.is_available():
raise RuntimeError("Requires distributed package to be available")
num_replicas = dist.get_world_size()
if rank is None:
if not dist.is_available():
raise RuntimeError("Requires distributed package to be available")
rank = dist.get_rank()
self.dataset = dataset
self.num_replicas = num_replicas
self.rank = rank
num_samples = len(self.dataset)
if batch_size is not None:
self.num_samples = int(math.ceil(num_samples / (batch_size * num_replicas))) * batch_size
else:
self.num_samples = int(math.ceil(num_samples / num_replicas))
self.total_size = self.num_samples * self.num_replicas
self.batch_size = batch_size
def __iter__(self):
indices = list(range(len(self.dataset)))
indices += indices[: (self.total_size - len(indices))]
assert (
len(indices) == self.total_size
), f"Indices length {len(indices)} and total size {self.total_size} mismatched"
indices = indices[self.rank * self.num_samples : (self.rank + 1) * self.num_samples]
assert (
len(indices) == self.num_samples
), f"Indices length {len(indices)} and sample number {self.num_samples} mismatched"
return iter(indices)
def __len__(self):
return self.num_samples
def get_tpu_sampler(dataset: torch.utils.data.Dataset, batch_size: int):
if xm.xrt_world_size() <= 1:
return RandomSampler(dataset)
return DistributedSampler(dataset, num_replicas=xm.xrt_world_size(), rank=xm.get_ordinal())
def nested_new_like(arrays, num_samples, padding_index=-100):
if isinstance(arrays, (list, tuple)):
return type(arrays)(nested_new_like(x, num_samples) for x in arrays)
return np.full_like(arrays, padding_index, shape=(num_samples, *arrays.shape[1:]))
def expand_like(arrays, new_seq_length, padding_index=-100):
result = np.full_like(arrays, padding_index, shape=(arrays.shape[0], new_seq_length) + arrays.shape[2:])
result[:, : arrays.shape[1]] = arrays
return result
def nested_truncate(tensors, limit):
if isinstance(tensors, (list, tuple)):
return type(tensors)(nested_truncate(t, limit) for t in tensors)
if isinstance(tensors, Mapping):
return type(tensors)({k: nested_truncate(t, limit) for k, t in tensors.items()})
return tensors[:limit]
class DistributedTensorGatherer:
"""
A class responsible for properly gathering tensors (or nested list/tuple of tensors) on the CPU by chunks.
If our dataset has 16 samples with a batch size of 2 on 3 processes and we gather then transfer on CPU at every
step, our sampler will generate the following indices:
`[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1]`
to get something of a size that is a multiple of 3 (so that each process gets the same dataset length). Then
process 0, 1 and 2 will be responsible for making predictions on the following samples:
- P0: `[0, 1, 2, 3, 4, 5]`
- P1: `[6, 7, 8, 9, 10, 11]`
- P2: `[12, 13, 14, 15, 0, 1]`
The first batch treated on each process will be:
- P0: `[0, 1]`
- P1: `[6, 7]`
- P2: `[12, 13]`
So if we gather at the end of the first batch, we will get a tensor (or nested list/tuple of tensors) corresponding
to the following indices:
`[0, 1, 6, 7, 12, 13]`
If we directly concatenate our results without taking any precautions, the user will then get predictions for the
indices in this order at the end of the prediction loop:
`[0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1]`
which is not what they expect. This class is there to solve that problem.
Args:
world_size (`int`):
The number of processes used in the distributed training.
num_samples (`int`):
The number of samples in our dataset.
make_multiple_of (`int`, *optional*):
If passed, the size of the dataset handled by each process should be a multiple of this argument (by adding
samples).
padding_index (`int`, *optional*, defaults to -100):
The padding index to use if the arrays don't all have the same sequence length.
"""
# Constructor: sets up the distributed tensor gatherer object.
def __init__(self, world_size, num_samples, make_multiple_of=None, padding_index=-100):
warnings.warn(
"DistributedTensorGatherer is deprecated and will be removed in v5 of Transformers.",
FutureWarning,
)
self.world_size = world_size
self.num_samples = num_samples
total_size = world_size if make_multiple_of is None else world_size * make_multiple_of
self.total_samples = int(np.ceil(num_samples / total_size)) * total_size
self.process_length = self.total_samples // world_size
self._storage = None
self._offsets = None
self.padding_index = padding_index
"""
添加数组到内部存储,如果是第一次添加数组,则初始化存储到完整大小,以便在开始时发生内存溢出。
"""
def add_arrays(self, arrays):
if arrays is None:
return
if self._storage is None:
self._storage = nested_new_like(arrays, self.total_samples, padding_index=self.padding_index)
self._offsets = list(range(0, self.total_samples, self.process_length))
slice_len, self._storage = self._nested_set_tensors(self._storage, arrays)
for i in range(self.world_size):
self._offsets[i] += slice_len
def _nested_set_tensors(self, storage, arrays):
if isinstance(arrays, (list, tuple)):
result = [self._nested_set_tensors(x, y) for x, y in zip(storage, arrays)]
return result[0][0], type(arrays)(r[1] for r in result)
assert (
arrays.shape[0] % self.world_size == 0
), f"Arrays passed should all have a first dimension multiple of {self.world_size}, found {arrays.shape[0]}."
slice_len = arrays.shape[0] // self.world_size
for i in range(self.world_size):
if len(arrays.shape) == 1:
storage[self._offsets[i] : self._offsets[i] + slice_len] = arrays[i * slice_len : (i + 1) * slice_len]
else:
if len(storage.shape) > 1 and storage.shape[1] < arrays.shape[1]:
storage = expand_like(storage, arrays.shape[1], padding_index=self.padding_index)
storage[self._offsets[i] : self._offsets[i] + slice_len, : arrays.shape[1]] = arrays[
i * slice_len : (i + 1) * slice_len
]
return slice_len, storage
def finalize(self):
"""
Return the properly gathered arrays and truncate to the number of samples (since the sampler added some extras
to get each process a dataset of the same length).
"""
if self._storage is None:
return
if self._offsets[0] != self.process_length:
logger.warning("Not all data has been set. Are you sure you passed all values?")
return nested_truncate(self._storage, self.num_samples)
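A single-process sketch that mirrors the docstring's 3-process / 16-sample example: per-step gathered arrays of 6 rows are fed in, and `finalize` truncates the extra samples added by the sampler. The array contents are dummies.
```
import numpy as np
from transformers.trainer_pt_utils import DistributedTensorGatherer

gatherer = DistributedTensorGatherer(world_size=3, num_samples=16)
for _ in range(3):                       # three eval steps, each gathering 2 rows from each of the 3 processes
    gatherer.add_arrays(np.zeros((6, 4)))
predictions = gatherer.finalize()
print(predictions.shape)                 # (16, 4): the samples added for divisibility are dropped
```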
@dataclass
class LabelSmoother:
"""
Adds label-smoothing on a pre-computed output from a Transformers model.
Args:
epsilon (`float`, *optional*, defaults to 0.1):
The label smoothing factor.
ignore_index (`int`, *optional*, defaults to -100):
The index in the labels to ignore when computing the loss.
"""
epsilon: float = 0.1
ignore_index: int = -100
def __call__(self, model_output, labels, shift_labels=False):
logits = model_output["logits"] if isinstance(model_output, dict) else model_output[0]
if shift_labels:
logits = logits[..., :-1, :].contiguous()
labels = labels[..., 1:].contiguous()
log_probs = -nn.functional.log_softmax(logits, dim=-1)
if labels.dim() == log_probs.dim() - 1:
labels = labels.unsqueeze(-1)
padding_mask = labels.eq(self.ignore_index)
labels = torch.clamp(labels, min=0)
nll_loss = log_probs.gather(dim=-1, index=labels)
smoothed_loss = log_probs.sum(dim=-1, keepdim=True, dtype=torch.float32)
nll_loss.masked_fill_(padding_mask, 0.0)
smoothed_loss.masked_fill_(padding_mask, 0.0)
num_active_elements = padding_mask.numel() - padding_mask.long().sum()
nll_loss = nll_loss.sum() / num_active_elements
smoothed_loss = smoothed_loss.sum() / (num_active_elements * log_probs.shape[-1])
return (1 - self.epsilon) * nll_loss + self.epsilon * smoothed_loss
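A minimal sketch of `LabelSmoother` on fake logits and labels (nothing here comes from a real model); positions equal to `ignore_index` are masked out of both loss terms.
```
import torch
from transformers.trainer_pt_utils import LabelSmoother

smoother = LabelSmoother(epsilon=0.1)
logits = torch.randn(2, 5, 10)            # (batch, seq, vocab)
labels = torch.randint(0, 10, (2, 5))
labels[0, -1] = -100                      # ignored position
loss = smoother({"logits": logits}, labels)
print(loss)                               # scalar: 0.9 * NLL + 0.1 * uniform smoothing term
```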
def get_length_grouped_indices(lengths, batch_size, mega_batch_mult=None, generator=None):
"""
Return a list of indices so that each slice of `batch_size` consecutive indices correspond to elements of similar
lengths. To do this, the indices are:
- randomly permuted
- grouped in mega-batches of size `mega_batch_mult * batch_size`
- sorted by length in each mega-batch
The result is the concatenation of all mega-batches, with the batch of `batch_size` containing the element of
maximum length placed first, so that an OOM happens sooner rather than later.
"""
if mega_batch_mult is None:
mega_batch_mult = min(len(lengths) // (batch_size * 4), 50)
if mega_batch_mult == 0:
mega_batch_mult = 1
indices = torch.randperm(len(lengths), generator=generator)
megabatch_size = mega_batch_mult * batch_size
megabatches = [indices[i : i + megabatch_size].tolist() for i in range(0, len(lengths), megabatch_size)]
megabatches = [sorted(megabatch, key=lambda i: lengths[i], reverse=True) for megabatch in megabatches]
megabatch_maximums = [lengths[megabatch[0]] for megabatch in megabatches]
max_idx = torch.argmax(torch.tensor(megabatch_maximums)).item()
megabatches[0][0], megabatches[max_idx][0] = megabatches[max_idx][0], megabatches[0][0]
return [i for megabatch in megabatches for i in megabatch]
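A toy run of `get_length_grouped_indices` (values are arbitrary): with `mega_batch_mult=2` and `batch_size=2`, indices are shuffled, split into mega-batches of 4, sorted by length inside each mega-batch, and the overall longest element is swapped into the first batch.
```
import torch
from transformers.trainer_pt_utils import get_length_grouped_indices

lengths = [5, 1, 9, 3, 7, 2, 8, 4]
g = torch.Generator().manual_seed(0)
indices = get_length_grouped_indices(lengths, batch_size=2, mega_batch_mult=2, generator=g)
print([lengths[i] for i in indices])  # consecutive pairs have roughly similar lengths; 9 lands in the first batch
```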
class LengthGroupedSampler(Sampler):
r"""
Sampler that samples indices in a way that groups together features of the dataset of roughly the same length while
keeping a bit of randomness.
"""
def __init__(
self,
batch_size: int,
dataset: Optional[Dataset] = None,
lengths: Optional[List[int]] = None,
model_input_name: Optional[str] = None,
generator=None,
):
if dataset is None and lengths is None:
raise ValueError("One of dataset and lengths must be provided.")
self.batch_size = batch_size
if lengths is None:
model_input_name = model_input_name if model_input_name is not None else "input_ids"
if (
not (isinstance(dataset[0], dict) or isinstance(dataset[0], BatchEncoding))
or model_input_name not in dataset[0]
):
raise ValueError(
"Can only automatically infer lengths for datasets whose items are dictionaries with an "
f"'{model_input_name}' key."
)
lengths = [len(feature[model_input_name]) for feature in dataset]
elif isinstance(lengths, torch.Tensor):
logger.info(
"If lengths is a torch.Tensor, LengthGroupedSampler will be slow. Converting lengths to List[int]..."
)
lengths = lengths.tolist()
self.lengths = lengths
self.generator = generator
def __len__(self):
return len(self.lengths)
def __iter__(self):
indices = get_length_grouped_indices(self.lengths, self.batch_size, generator=self.generator)
return iter(indices)
class DistributedLengthGroupedSampler(DistributedSampler):
r"""
Distributed Sampler that samples indices in a way that groups together features of the dataset of roughly the same
length while keeping a bit of randomness.
"""
def __init__(
self,
batch_size: int,
dataset: Optional[Dataset] = None,
num_replicas: Optional[int] = None,
rank: Optional[int] = None,
seed: int = 0,
drop_last: bool = False,
lengths: Optional[List[int]] = None,
model_input_name: Optional[str] = None,
generator=None,
):
if dataset is None and lengths is None:
raise ValueError("One of dataset and lengths must be provided.")
if num_replicas is None:
if not dist.is_available():
raise RuntimeError("Requires distributed package to be available")
num_replicas = dist.get_world_size()
if rank is None:
if not dist.is_available():
raise RuntimeError("Requires distributed package to be available")
rank = dist.get_rank()
self.batch_size = batch_size
self.num_replicas = num_replicas
self.rank = rank
self.epoch = 0
self.drop_last = drop_last
if lengths is None:
model_input_name = model_input_name if model_input_name is not None else "input_ids"
if (
not (isinstance(dataset[0], dict) or isinstance(dataset[0], BatchEncoding))
or model_input_name not in dataset[0]
):
raise ValueError(
"Can only automatically infer lengths for datasets whose items are dictionaries with an "
f"'{model_input_name}' key."
)
lengths = [len(feature[model_input_name]) for feature in dataset]
elif isinstance(lengths, torch.Tensor):
logger.info(
"If lengths is a torch.Tensor, DistributedLengthGroupedSampler will be slow. Converting lengths to"
" List[int]..."
)
lengths = lengths.tolist()
self.lengths = lengths
if self.drop_last and len(self.lengths) % self.num_replicas != 0:
self.num_samples = math.ceil((len(self.lengths) - self.num_replicas) / self.num_replicas)
else:
self.num_samples = math.ceil(len(self.lengths) / self.num_replicas)
self.total_size = self.num_samples * self.num_replicas
self.seed = seed
def __iter__(self) -> Iterator:
g = torch.Generator()
g.manual_seed(self.seed + self.epoch)
indices = get_length_grouped_indices(self.lengths, self.batch_size, generator=g)
if not self.drop_last:
indices += indices[: (self.total_size - len(indices))]
else:
indices = indices[: self.total_size]
assert len(indices) == self.total_size
indices = indices[self.rank : self.total_size : self.num_replicas]
assert len(indices) == self.num_samples
return iter(indices)
class ShardSampler(Sampler):
"""
Sampler that shards batches between several processes. Dispatches indices batch by batch: on 2 processes with batch
size 4, the first two batches are `[0, 1, 2, 3, 4, 5, 6, 7]` and `[8, 9, 10, 11, 12, 13, 14, 15]`, which shard into
`[0, 1, 2, 3]` and `[8, 9, 10, 11]` for GPU-0 and `[4, 5, 6, 7]` and `[12, 13, 14, 15]` for GPU-1.
The sampler thus yields `[0, 1, 2, 3, 8, 9, 10, 11]` on GPU-0 and `[4, 5, 6, 7, 12, 13, 14, 15]` on GPU-1.
"""
def __init__(
self,
dataset: Dataset,
batch_size: int = 1,
drop_last: bool = False,
num_processes: int = 1,
process_index: int = 0,
):
self.dataset = dataset
self.batch_size = batch_size
self.drop_last = drop_last
self.num_processes = num_processes
self.process_index = process_index
self.total_batch_size = total_batch_size = batch_size * num_processes
num_batches = len(dataset) // total_batch_size if drop_last else math.ceil(len(dataset) / total_batch_size)
self.total_num_samples = num_batches * total_batch_size
def __iter__(self):
indices = list(range(len(self.dataset)))
while len(indices) < self.total_num_samples:
indices += indices[: (self.total_num_samples - len(indices))]
result = []
for batch_start in range(self.batch_size * self.process_index, self.total_num_samples, self.total_batch_size):
result += indices[batch_start : batch_start + self.batch_size]
return iter(result)
def __len__(self):
return self.total_num_samples // self.num_processes
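A sketch reproducing the docstring's 2-process example with a plain list as the dataset (only `len()` is used by the sampler).
```
from transformers.trainer_pt_utils import ShardSampler

dataset = list(range(16))
shard0 = ShardSampler(dataset, batch_size=4, num_processes=2, process_index=0)
shard1 = ShardSampler(dataset, batch_size=4, num_processes=2, process_index=1)
print(list(shard0))  # [0, 1, 2, 3, 8, 9, 10, 11]
print(list(shard1))  # [4, 5, 6, 7, 12, 13, 14, 15]
```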
class IterableDatasetShard(IterableDataset):
"""
Wraps a PyTorch `IterableDataset` to generate samples for one of the processes only. Instances of this class will
always yield a number of samples that is a round multiple of the actual batch size (which is `batch_size x
num_processes`). Depending on the value of the `drop_last` attribute, it will either stop the iteration at the
first batch that would be too small or loop with indices from the beginning.
On two processes with an iterable dataset yielding of `[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]` with a batch size of
2:
- the shard on process 0 will yield `[0, 1, 4, 5, 8, 9]` so will see batches `[0, 1]`, `[4, 5]`, `[8, 9]`
- the shard on process 1 will yield `[2, 3, 6, 7, 10, 11]` so will see batches `[2, 3]`, `[6, 7]`, `[10, 11]`
"""
def __init__(
self,
dataset: IterableDataset,
batch_size: int = 1,
drop_last: bool = False,
num_processes: int = 1,
process_index: int = 0,
seed: int = 0,
):
self.dataset = dataset
self.batch_size = batch_size
self.drop_last = drop_last
self.num_processes = num_processes
self.process_index = process_index
self.seed = seed
self.epoch = 0
self.num_examples = 0
def set_epoch(self, epoch):
self.epoch = epoch
if hasattr(self.dataset, "set_epoch"):
self.dataset.set_epoch(epoch)
def __iter__(self):
self.num_examples = 0
if (
not hasattr(self.dataset, "set_epoch")
and hasattr(self.dataset, "generator")
and isinstance(self.dataset.generator, torch.Generator)
):
self.dataset.generator.manual_seed(self.seed + self.epoch)
real_batch_size = self.batch_size * self.num_processes
process_slice = range(self.process_index * self.batch_size, (self.process_index + 1) * self.batch_size)
first_batch = None
current_batch = []
for element in self.dataset:
self.num_examples += 1
current_batch.append(element)
if len(current_batch) == real_batch_size:
for i in process_slice:
yield current_batch[i]
if first_batch is None:
first_batch = current_batch.copy()
current_batch = []
if not self.drop_last and len(current_batch) > 0:
if first_batch is None:
first_batch = current_batch.copy()
while len(current_batch) < real_batch_size:
current_batch += first_batch
for i in process_slice:
yield current_batch[i]
def __len__(self):
if self.drop_last:
return (len(self.dataset) // (self.batch_size * self.num_processes)) * self.batch_size
else:
return math.ceil(len(self.dataset) / (self.batch_size * self.num_processes)) * self.batch_size
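A sketch of `IterableDatasetShard` matching the class docstring, using a toy iterable dataset defined only for this illustration.
```
from torch.utils.data import IterableDataset
from transformers.trainer_pt_utils import IterableDatasetShard

class RangeDataset(IterableDataset):
    """Toy dataset yielding 0..11, used only for this example."""
    def __iter__(self):
        return iter(range(12))

shard0 = IterableDatasetShard(RangeDataset(), batch_size=2, num_processes=2, process_index=0)
shard1 = IterableDatasetShard(RangeDataset(), batch_size=2, num_processes=2, process_index=1)
print(list(shard0))  # [0, 1, 4, 5, 8, 9]
print(list(shard1))  # [2, 3, 6, 7, 10, 11]
```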
def _get_learning_rate(self):
if self.is_deepspeed_enabled:
try:
last_lr = self.lr_scheduler.get_last_lr()[0]
except AssertionError as e:
if "need to call step" in str(e):
logger.warning("tried to get lr value before scheduler/optimizer started stepping, returning lr=0")
last_lr = 0
else:
raise
else:
if isinstance(self.lr_scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
last_lr = self.optimizer.param_groups[0]["lr"]
else:
last_lr = self.lr_scheduler.get_last_lr()[0]
if torch.is_tensor(last_lr):
last_lr = last_lr.item()
return last_lr
def _secs2timedelta(secs):
msec = int(abs(secs - int(secs)) * 100)
return f"{datetime.timedelta(seconds=int(secs))}.{msec:02d}"
def metrics_format(self, metrics: Dict[str, float]) -> Dict[str, float]:
"""
Reformat Trainer metrics values to a human-readable format.
Args:
metrics (`Dict[str, float]`):
The metrics returned from train/evaluate/predict
Returns:
metrics (`Dict[str, float]`): The reformatted metrics
"""
metrics_copy = metrics.copy()
for k, v in metrics_copy.items():
if "_mem_" in k:
metrics_copy[k] = f"{ v >> 20 }MB"
elif "_runtime" in k:
metrics_copy[k] = _secs2timedelta(v)
elif k == "total_flos":
metrics_copy[k] = f"{ int(v) >> 30 }GF"
elif isinstance(metrics_copy[k], float):
metrics_copy[k] = round(v, 4)
return metrics_copy
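`metrics_format` is written to be bound to a `Trainer` instance (the `self` argument is unused), so this sketch passes `None` for it; the metric values are made up.
```
from transformers.trainer_pt_utils import metrics_format

metrics = {
    "train_runtime": 123.4,
    "train_mem_gpu_alloc_delta": 693 << 20,   # bytes
    "total_flos": 3 << 30,
    "eval_loss": 0.123456,
}
print(metrics_format(None, metrics))
# {'train_runtime': '0:02:03.40', 'train_mem_gpu_alloc_delta': '693MB', 'total_flos': '3GF', 'eval_loss': 0.1235}
```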
def log_metrics(self, split, metrics):
"""
Log metrics in a specially formatted way.
Under distributed environment this is done only for a process with rank 0.
Args:
split (`str`):
Mode/split name: one of `train`, `eval`, `test`
metrics (`Dict[str, float]`):
The metrics returned from train/evaluate/predict
Notes on memory reports:
In order to get memory usage report you need to install `psutil`. You can do that with `pip install psutil`.
When this method is run, you will see a report that includes:
```
init_mem_cpu_alloc_delta = 1301MB
init_mem_cpu_peaked_delta = 154MB
init_mem_gpu_alloc_delta = 230MB
init_mem_gpu_peaked_delta = 0MB
train_mem_cpu_alloc_delta = 1345MB
train_mem_cpu_peaked_delta = 0MB
train_mem_gpu_alloc_delta = 693MB
train_mem_gpu_peaked_delta = 7MB
```
The metrics are printed with keys and values aligned to the widest entry and sorted by key, under a
`***** {split} metrics *****` header.
"""
if not self.is_world_process_zero():
return
print(f"***** {split} metrics *****")
metrics_formatted = self.metrics_format(metrics)
k_width = max(len(str(x)) for x in metrics_formatted.keys())
v_width = max(len(str(x)) for x in metrics_formatted.values())
for key in sorted(metrics_formatted.keys()):
print(f" {key: <{k_width}} = {metrics_formatted[key]:>{v_width}}")
def save_metrics(self, split, metrics, combined=True):
"""
Save metrics into a json file for that split, e.g. `train_results.json`.
Under distributed environment this is done only for a process with rank 0.
Args:
split (`str`):
Mode/split name: one of `train`, `eval`, `test`, `all`
metrics (`Dict[str, float]`):
The metrics returned from train/evaluate/predict
combined (`bool`, *optional*, defaults to `True`):
Creates combined metrics by updating `all_results.json` with metrics of this call
To understand the metrics please read the docstring of [`~Trainer.log_metrics`]. The only difference is that raw
unformatted numbers are saved in the current method.
"""
if not self.is_world_process_zero():
return
path = os.path.join(self.args.output_dir, f"{split}_results.json")
with open(path, "w") as f:
json.dump(metrics, f, indent=4, sort_keys=True)
if combined:
path = os.path.join(self.args.output_dir, "all_results.json")
if os.path.exists(path):
with open(path, "r") as f:
all_metrics = json.load(f)
else:
all_metrics = {}
all_metrics.update(metrics)
with open(path, "w") as f:
json.dump(all_metrics, f, indent=4, sort_keys=True)
def save_state(self):
"""
Saves the Trainer state, since Trainer.save_model saves only the tokenizer with the model
Under distributed environment this is done only for a process with rank 0.
"""
if not self.is_world_process_zero():
return
path = os.path.join(self.args.output_dir, "trainer_state.json")
self.state.save_to_json(path)
def get_model_param_count(model, trainable_only=False):
"""
Calculate model's total param count. If trainable_only is True then count only those requiring grads
"""
if is_deepspeed_zero3_enabled():
def numel(p):
return p.ds_numel if hasattr(p, "ds_numel") else p.numel()
else:
def numel(p):
return p.numel()
return sum(numel(p) for p in model.parameters() if not trainable_only or p.requires_grad)
def get_parameter_names(model, forbidden_layer_types):
"""
Returns the names of the model parameters that are not inside a forbidden layer.
"""
result = []
for name, child in model.named_children():
result += [
f"{name}.{n}"
for n in get_parameter_names(child, forbidden_layer_types)
if not isinstance(child, tuple(forbidden_layer_types))
]
result += list(model._parameters.keys())
return result
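This is how the Trainer typically uses `get_parameter_names` to build the no-weight-decay split; the toy model here is only for illustration.
```
import torch.nn as nn
from transformers.trainer_pt_utils import get_parameter_names

model = nn.Sequential(nn.Linear(4, 4), nn.LayerNorm(4))
decay_parameters = get_parameter_names(model, [nn.LayerNorm])
decay_parameters = [name for name in decay_parameters if "bias" not in name]
print(decay_parameters)  # ['0.weight'] -- LayerNorm parameters and biases are excluded from weight decay
```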
def get_module_class_from_name(module, name):
"""
Gets a class from a module by its name.
Args:
module (`torch.nn.Module`): The module to get the class from.
name (`str`): The name of the class.
"""
获取给定模块中特定类名的类对象。
参数:
module: 给定的模块对象
name: 要查找的类名
返回值:
如果找到与给定名称匹配的类对象,则返回该类对象;如果找不到,返回 None。
"""
modules_children = list(module.children()) # 获取模块的所有子模块列表
if module.__class__.__name__ == name: # 如果当前模块的类名与目标名称相同
return module.__class__ # 返回当前模块的类对象
elif len(modules_children) == 0: # 如果当前模块没有子模块
return # 返回空,表示未找到目标类
else:
for child_module in modules_children: # 遍历所有子模块
module_class = get_module_class_from_name(child_module, name) # 递归调用获取目标类对象
if module_class is not None: # 如果找到了目标类对象
return module_class # 返回目标类对象
# If the current process is the main process, delete the given files under the output directory.
def remove_dummy_checkpoint(is_main_process, output_dir, filenames):
if is_main_process:
for filename in filenames:
file = os.path.join(output_dir, filename)
# Delete the file if it exists.
if os.path.isfile(file):
os.remove(file)
# Check whether SageMaker model parallelism is enabled
if is_sagemaker_mp_enabled():
# Import the SageMaker model-parallel Torch extension
import smdistributed.modelparallel.torch as smp
# Forward + backward step wrapped with SMP's step decorator
@smp.step()
def smp_forward_backward(model, inputs, gradient_accumulation_steps=1):
# Run the forward pass
outputs = model(**inputs)
# Extract the loss: the "loss" key for dict outputs, the first element otherwise
loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0]
# Average the loss over the gradient accumulation steps
loss /= gradient_accumulation_steps
# Run the backward pass
model.backward(loss)
return loss
# Forward-only step wrapped with SMP's step decorator
@smp.step()
def smp_forward_only(model, inputs):
return model(**inputs)
# Gather function under SMP: recursively gathers tensors in nested lists/tuples/dicts
def smp_gather(tensor):
if isinstance(tensor, (list, tuple)):
return type(tensor)(smp_gather(t) for t in tensor)
elif isinstance(tensor, dict):
return type(tensor)({k: smp_gather(v) for k, v in tensor.items()})
# Raise a TypeError for anything that is not a tensor
elif not isinstance(tensor, torch.Tensor):
raise TypeError(
f"Can't gather the values of type {type(tensor)}, only of nested list/tuple/dicts of tensors."
)
# Use SMP's allgather to collect the tensor across the DP_GROUP
all_tensors = smp.allgather(tensor, smp.CommGroup.DP_GROUP)
# Make every tensor at least 1d and concatenate them
all_tensors = [atleast_1d(t) for t in all_tensors]
return torch.cat([t.cpu() for t in all_tensors], dim=0)
# Nested concatenation under SMP: recursively concatenates tensors in nested lists/tuples/dicts
def smp_nested_concat(tensor):
if isinstance(tensor, (list, tuple)):
return type(tensor)(smp_nested_concat(t) for t in tensor)
elif isinstance(tensor, dict):
return type(tensor)({k: smp_nested_concat(v) for k, v in tensor.items()})
# Otherwise this is a StepOutput: concatenate it, detach from the graph and move to CPU
# Note: `StepOutput` lives in `smp.step`, which shares its name with the decorator, so Python may get confused
return tensor.concat().detach().cpu()
# Accelerator configuration class, used to customize accelerator-related arguments
@dataclass
class AcceleratorConfig:
"""
A subset of arguments relating to the underlying [`accelerate.Accelerator`]
implementation utilized in the `Trainer` that can be customized.
Mostly relating to data.
"""
# Parameters:
# split_batches (`bool`, *optional*, defaults to `False`):
# Whether or not the accelerator should split the batches yielded by the dataloaders across the devices. If
# `True` the actual batch size used will be the same on any kind of distributed processes, but it must be a
# round multiple of the `num_processes` you are using. If `False`, actual batch size used will be the one set
# in your script multiplied by the number of processes.
# dispatch_batches (`bool`, *optional*):
# If set to `True`, the dataloader prepared by the Accelerator is only iterated through on the main process
# and then the batches are split and broadcast to each process. Will default to `True` for `DataLoader` whose
# underlying dataset is an `IterableDataset`, `False` otherwise.
# even_batches (`bool`, *optional*, defaults to `True`):
# If set to `True`, in cases where the total batch size across all processes does not exactly divide the
# dataset, samples at the start of the dataset will be duplicated so the batch can be divided equally among
# all workers.
# use_seedable_sampler (`bool`, *optional*, defaults to `True`):
# Whether or not use a fully seedable random sampler ([`accelerate.data_loader.SeedableRandomSampler`]). Ensures
# training results are fully reproducable using a different sampling technique. While seed-to-seed results
# may differ, on average the differences are neglible when using multiple different seeds to compare. Should
# also be ran with [`~utils.set_seed`] for the best results.
even_batches: bool = field(
default=True,
metadata={
"help": "If set to `True`, in cases where the total batch size across all processes does not exactly divide the"
" dataset, samples at the start of the dataset will be duplicated so the batch can be divided equally among"
" all workers."
},
)
use_seedable_sampler: bool = field(
default=True,
metadata={
"help": "Whether or not use a fully seedable random sampler ([`accelerate.data_loader.SeedableRandomSampler`])."
"Ensures training results are fully reproducable using a different sampling technique. "
"While seed-to-seed results may differ, on average the differences are neglible when using"
"multiple different seeds to compare. Should also be ran with [`~utils.set_seed`] for the best results."
},
)
@classmethod
def from_json_file(cls, json_file):
# Check whether the file exists and pick the appropriate opener
open_file = io.open if os.path.exists(json_file) else open
with open_file(json_file, "r", encoding="utf-8") as f:
# Load the JSON file content into a dict
config_dict = json.load(f)
# Check for unknown keys in the dict (sensible defaults are used for missing ones)
extra_keys = sorted(key for key in config_dict.keys() if key not in cls.__dataclass_fields__.keys())
if len(extra_keys) > 0:
# Raise a ValueError if the config file contains unknown keys
raise ValueError(
f"The config file at {json_file} had unknown keys ({extra_keys}), please try upgrading your `transformers`"
" version or fix (and potentially remove these keys) from your config file."
)
# Build an instance of this class from the loaded config dict
return cls(**config_dict)
# Convert the object into a dict
def to_dict(self):
# Return a deep copy of the instance's attribute dict
return copy.deepcopy(self.__dict__)
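A sketch of the JSON round-trip that `from_json_file` supports; the file name and key values are arbitrary, and only keys that are fields of the dataclass are accepted.
```
import json
from transformers.trainer_pt_utils import AcceleratorConfig

with open("accelerator_config.json", "w") as f:
    json.dump({"even_batches": False, "use_seedable_sampler": True}, f)

config = AcceleratorConfig.from_json_file("accelerator_config.json")
print(config.even_batches)   # False
print(config.to_dict())      # every field of the dataclass; unknown keys would have raised a ValueError
```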
# A custom dummy optimizer class, LayerWiseDummyOptimizer, inheriting from torch.optim.Optimizer
class LayerWiseDummyOptimizer(torch.optim.Optimizer):
"""
For layer-wise optimizers such as the GaLoRE optimizer, the optimization step is already done through the
post-gradient hooks. The trick is therefore to create a dummy optimizer that is a no-op during training.
Initial idea from @hiyouga in LLaMA-Factory:
https://github.com/hiyouga/LLaMA-Factory/commit/8664262cde3919e10eaecbd66e8c5d356856362e
"""
# Constructor accepting an optimizer_dict plus arbitrary args and kwargs
def __init__(self, optimizer_dict=None, *args, **kwargs):
# Create a dummy tensor
dummy_tensor = torch.randn(1, 1)
self.optimizer_dict = optimizer_dict
# Call the parent constructor with a list containing the dummy tensor and a learning-rate dict
super().__init__([dummy_tensor], {"lr": 1e-03})
# zero_grad is a no-op
def zero_grad(self, set_to_none: bool = True) -> None:
pass
# step is a no-op that returns nothing
def step(self, closure=None) -> Optional[float]:
pass
# A custom dummy scheduler class, LayerWiseDummyScheduler, inheriting from LRScheduler
class LayerWiseDummyScheduler(LRScheduler):
"""
For layer-wise optimizers such as the GaLoRE optimizer, the optimization and scheduling steps are already done
through the post-gradient hooks. The trick is therefore to create a dummy scheduler that is a no-op during training.
"""
# Constructor accepting arbitrary args and kwargs
def __init__(self, *args, **kwargs):
# Use a LayerWiseDummyOptimizer instance as the optimizer
optimizer = LayerWiseDummyOptimizer()
last_epoch = -1
verbose = False
# Call the parent LRScheduler constructor with the dummy optimizer, last epoch and verbosity
super().__init__(optimizer, last_epoch, verbose)
# get_lr returns the learning rate of each of the optimizer's param groups
def get_lr(self):
return [group["lr"] for group in self.optimizer.param_groups]
# _get_closed_form_lr returns the list of base learning rates
def _get_closed_form_lr(self):
return self.base_lrs
.\trainer_seq2seq.py
import warnings
from copy import deepcopy
from pathlib import Path
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
import torch
from torch import nn
from torch.utils.data import Dataset
from .generation.configuration_utils import GenerationConfig
from .integrations.deepspeed import is_deepspeed_zero3_enabled
from .trainer import Trainer
from .utils import logging
if TYPE_CHECKING:
from .data.data_collator import DataCollator
from .modeling_utils import PreTrainedModel
from .tokenization_utils_base import PreTrainedTokenizerBase
from .trainer_callback import TrainerCallback
from .trainer_utils import EvalPrediction, PredictionOutput
from .training_args import TrainingArguments
logger = logging.get_logger(__name__)
class Seq2SeqTrainer(Trainer):
def __init__(
self,
model: Union["PreTrainedModel", nn.Module] = None,
args: "TrainingArguments" = None,
data_collator: Optional["DataCollator"] = None,
train_dataset: Optional[Dataset] = None,
eval_dataset: Optional[Union[Dataset, Dict[str, Dataset]]] = None,
tokenizer: Optional["PreTrainedTokenizerBase"] = None,
model_init: Optional[Callable[[], "PreTrainedModel"]] = None,
compute_metrics: Optional[Callable[["EvalPrediction"], Dict]] = None,
callbacks: Optional[List["TrainerCallback"]] = None,
optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None),
preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None,
):
super().__init__(
model=model,
args=args,
data_collator=data_collator,
train_dataset=train_dataset,
eval_dataset=eval_dataset,
tokenizer=tokenizer,
model_init=model_init,
compute_metrics=compute_metrics,
callbacks=callbacks,
optimizers=optimizers,
preprocess_logits_for_metrics=preprocess_logits_for_metrics,
)
if self.args.generation_config is not None:
gen_config = self.load_generation_config(self.args.generation_config)
self.model.generation_config = gen_config
@staticmethod
def load_generation_config(gen_config_arg: Union[str, GenerationConfig]) -> GenerationConfig:
"""
Loads a `~generation.GenerationConfig` from the `Seq2SeqTrainingArguments.generation_config` arguments.
Args:
gen_config_arg (`str` or [`~generation.GenerationConfig`]):
`Seq2SeqTrainingArguments.generation_config` argument.
Returns:
A `~generation.GenerationConfig`.
"""
if isinstance(gen_config_arg, GenerationConfig):
gen_config = deepcopy(gen_config_arg)
else:
pretrained_model_name = Path(gen_config_arg) if isinstance(gen_config_arg, str) else gen_config_arg
config_file_name = None
if pretrained_model_name.is_file():
config_file_name = pretrained_model_name.name
pretrained_model_name = pretrained_model_name.parent
elif pretrained_model_name.is_dir():
pass
else:
pretrained_model_name = gen_config_arg
gen_config = GenerationConfig.from_pretrained(pretrained_model_name, config_file_name)
try:
with warnings.catch_warnings(record=True) as caught_warnings:
gen_config.validate()
if len(caught_warnings) > 0:
raise ValueError(str([w.message for w in caught_warnings]))
except ValueError as exc:
raise ValueError(
"The loaded generation config instance is invalid -- `GenerationConfig.validate()` throws warnings "
"and/or exceptions. Fix these issues to train your model.\n\nThrown during validation:\n" + str(exc)
)
return gen_config
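A small usage sketch (paths and values here are illustrative, not from the source): the `generation_config` training argument may be a `GenerationConfig` object, a local json file, a directory, or a Hub repo id, and this helper resolves all of them the same way before validating the result.
```
from transformers import GenerationConfig

# From an in-memory object ...
gen_cfg = Seq2SeqTrainer.load_generation_config(GenerationConfig(max_new_tokens=128, num_beams=4))
# ... or from anything `GenerationConfig.from_pretrained` understands, e.g. a local json file
gen_cfg = Seq2SeqTrainer.load_generation_config("my_model_dir/generation_config.json")
```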
def evaluate(
self,
eval_dataset: Optional[Dataset] = None,
ignore_keys: Optional[List[str]] = None,
metric_key_prefix: str = "eval",
**gen_kwargs,
) -> Dict[str, float]:
"""
Run evaluation and returns metrics.
The calling script will be responsible for providing a method to compute metrics, as they are task-dependent
(pass it to the init `compute_metrics` argument).
You can also subclass and override this method to inject custom behavior.
Args:
eval_dataset (`Dataset`, *optional*):
Pass a dataset if you wish to override `self.eval_dataset`. If it is an [`~datasets.Dataset`], columns
not accepted by the `model.forward()` method are automatically removed. It must implement the `__len__`
method.
ignore_keys (`List[str]`, *optional*):
A list of keys in the output of your model (if it is a dictionary) that should be ignored when
gathering predictions.
metric_key_prefix (`str`, *optional*, defaults to `"eval"`):
An optional prefix to be used as the metrics key prefix. For example the metrics "bleu" will be named
"eval_bleu" if the prefix is `"eval"` (default)
max_length (`int`, *optional*):
The maximum target length to use when predicting with the generate method.
num_beams (`int`, *optional*):
Number of beams for beam search that will be used when predicting with the generate method. 1 means no
beam search.
gen_kwargs:
Additional `generate` specific kwargs.
Returns:
A dictionary containing the evaluation loss and the potential metrics computed from the predictions. The
dictionary also contains the epoch number which comes from the training state.
"""
gen_kwargs = gen_kwargs.copy()
if (
gen_kwargs.get("max_length") is None
and gen_kwargs.get("max_new_tokens") is None
and self.args.generation_max_length is not None
):
gen_kwargs["max_length"] = self.args.generation_max_length
if gen_kwargs.get("num_beams") is None and self.args.generation_num_beams is not None:
gen_kwargs["num_beams"] = self.args.generation_num_beams
self.gather_function = self.accelerator.gather
self._gen_kwargs = gen_kwargs
return super().evaluate(eval_dataset, ignore_keys=ignore_keys, metric_key_prefix=metric_key_prefix)
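A hedged usage sketch (the trainer object and metric names are illustrative): extra `gen_kwargs` given to `evaluate` are stashed in `self._gen_kwargs` and forwarded to `generate` inside the prediction loop, while `generation_max_length` / `generation_num_beams` from the training arguments act only as fallbacks.
```
# `trainer` is an already built Seq2SeqTrainer with `predict_with_generate=True`
metrics = trainer.evaluate(max_new_tokens=64, num_beams=4)
# e.g. {"eval_loss": ..., "eval_bleu": ..., "eval_runtime": ..., "epoch": ...}
```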
) -> "PredictionOutput":
"""
Run prediction and returns predictions and potential metrics.
Depending on the dataset and your use case, your test dataset may contain labels. In that case, this method
will also return metrics, like in `evaluate()`.
Args:
test_dataset (`Dataset`):
Dataset to run the predictions on. If it is a [`~datasets.Dataset`], columns not accepted by the
`model.forward()` method are automatically removed. Has to implement the method `__len__`
ignore_keys (`List[str]`, *optional*):
A list of keys in the output of your model (if it is a dictionary) that should be ignored when
gathering predictions.
metric_key_prefix (`str`, *optional*, defaults to `"eval"`):
An optional prefix to be used as the metrics key prefix. For example the metrics "bleu" will be named
"eval_bleu" if the prefix is `"eval"` (default)
max_length (`int`, *optional*):
The maximum target length to use when predicting with the generate method.
num_beams (`int`, *optional*):
Number of beams for beam search that will be used when predicting with the generate method. 1 means no
beam search.
gen_kwargs:
Additional `generate` specific kwargs.
<Tip>
If your predictions or labels have different sequence lengths (for instance because you're doing dynamic
padding in a token classification task) the predictions will be padded (on the right) to allow for
concatenation into one array. The padding index is -100.
</Tip>
Returns: *NamedTuple* A namedtuple with the following keys:
- predictions (`np.ndarray`): The predictions on `test_dataset`.
- label_ids (`np.ndarray`, *optional*): The labels (if the dataset contained some).
- metrics (`Dict[str, float]`, *optional*): The potential dictionary of metrics (if the dataset contained
labels).
"""
gen_kwargs = gen_kwargs.copy()
if (
gen_kwargs.get("max_length") is None
and gen_kwargs.get("max_new_tokens") is None
and self.args.generation_max_length is not None
):
gen_kwargs["max_length"] = self.args.generation_max_length
if gen_kwargs.get("num_beams") is None and self.args.generation_num_beams is not None:
gen_kwargs["num_beams"] = self.args.generation_num_beams
self.gather_function = self.accelerator.gather
self._gen_kwargs = gen_kwargs
return super().predict(test_dataset, ignore_keys=ignore_keys, metric_key_prefix=metric_key_prefix)
def prediction_step(
self,
model: nn.Module,
inputs: Dict[str, Union[torch.Tensor, Any]],
prediction_loss_only: bool,
ignore_keys: Optional[List[str]] = None,
**gen_kwargs,
):
def _pad_tensors_to_max_len(self, tensor, max_length):
if self.tokenizer is not None and hasattr(self.tokenizer, "pad_token_id"):
pad_token_id = (
self.tokenizer.pad_token_id if self.tokenizer.pad_token_id is not None else self.tokenizer.eos_token_id
)
else:
if self.model.config.pad_token_id is not None:
pad_token_id = self.model.config.pad_token_id
else:
raise ValueError("Pad_token_id must be set in the configuration of the model, in order to pad tensors")
padded_tensor = pad_token_id * torch.ones(
(tensor.shape[0], max_length), dtype=tensor.dtype, device=tensor.device
)
padded_tensor[:, : tensor.shape[-1]] = tensor
return padded_tensor
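A tiny toy run of the padding logic above (values are made up): shorter generated sequences are right-padded with the pad token so predictions from different batches can be concatenated into one array.
```
import torch

tensor, max_length, pad_token_id = torch.tensor([[5, 6, 7]]), 6, 0
padded_tensor = pad_token_id * torch.ones((tensor.shape[0], max_length), dtype=tensor.dtype)
padded_tensor[:, : tensor.shape[-1]] = tensor
print(padded_tensor)  # tensor([[5, 6, 7, 0, 0, 0]])
```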
.\trainer_utils.py
"""
Utilities that are independent of PyTorch and provide support for the Trainer class.
"""
import copy
import functools
import gc
import inspect
import os
import random
import re
import threading
import time
from typing import Any, Dict, List, NamedTuple, Optional, Tuple, Union
import numpy as np
from .utils import (
ExplicitEnum,
is_psutil_available,
is_tf_available,
is_torch_available,
is_torch_cuda_available,
is_torch_mps_available,
is_torch_npu_available,
is_torch_xla_available,
is_torch_xpu_available,
requires_backends,
)
if is_torch_available():
import torch
def seed_worker(_: Any):
"""
Helper function to set worker seed during Dataloader initialization.
"""
worker_seed = torch.initial_seed() % 2**32
set_seed(worker_seed)
def enable_full_determinism(seed: int, warn_only: bool = False):
"""
Helper function for reproducible behavior during distributed training. See
- https://pytorch.org/docs/stable/notes/randomness.html for pytorch
- https://www.tensorflow.org/api_docs/python/tf/config/experimental/enable_op_determinism for tensorflow
"""
set_seed(seed)
if is_torch_available():
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":16:8"
torch.use_deterministic_algorithms(True, warn_only=warn_only)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
if is_tf_available():
import tensorflow as tf
tf.config.experimental.enable_op_determinism()
def set_seed(seed: int):
"""
Helper function for reproducible behavior to set the seed in `random`, `numpy`, `torch` and/or `tf` (if installed).
Args:
seed (`int`): The seed to set.
"""
random.seed(seed)
np.random.seed(seed)
if is_torch_available():
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
if is_torch_npu_available():
torch.npu.manual_seed_all(seed)
if is_torch_xpu_available():
torch.xpu.manual_seed_all(seed)
if is_tf_available():
import tensorflow as tf
tf.random.set_seed(seed)
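A short sketch of how these helpers are typically combined: `set_seed` (or `enable_full_determinism`) fixes the global RNGs, and `seed_worker` is passed as `worker_init_fn` so every DataLoader worker re-seeds python/numpy from the torch seed it was assigned.
```
import torch
from torch.utils.data import DataLoader, TensorDataset

set_seed(42)  # seeds python, numpy and torch (and tf if installed)
dataset = TensorDataset(torch.arange(10))
loader = DataLoader(dataset, batch_size=2, shuffle=True, num_workers=2, worker_init_fn=seed_worker)
```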
def neftune_post_forward_hook(module, input, output):
"""
Implements the NEFTune forward pass for the model using forward hooks. Note this works only for torch.nn.Embedding
layers. This method is slightly adapted from the original source code that can be found here:
https://github.com/neelsjain/NEFTune Simply add it to your model as follows:
```
model = ...
model.embed_tokens.neftune_noise_alpha = 0.1
model.embed_tokens.register_forward_hook(neftune_post_forward_hook)
```
Args:
module (`torch.nn.Module`):
The embedding module where the hook is attached. Note that you need to set `module.neftune_noise_alpha` to
the desired noise alpha value.
input (`torch.Tensor`):
The input tensor to the model.
output (`torch.Tensor`):
The output tensor of the model (i.e. the embeddings).
"""
if module.training:
dims = torch.tensor(output.size(1) * output.size(2))
mag_norm = module.neftune_noise_alpha / torch.sqrt(dims)
output = output + torch.zeros_like(output).uniform_(-mag_norm, mag_norm)
return output
class EvalPrediction:
"""
Evaluation output (always contains labels), to be used to compute metrics.
Parameters:
predictions (`np.ndarray`): Predictions of the model.
label_ids (`np.ndarray`): Targets to be matched.
inputs (`np.ndarray`, *optional*):
"""
def __init__(
self,
predictions: Union[np.ndarray, Tuple[np.ndarray]],
label_ids: Union[np.ndarray, Tuple[np.ndarray]],
inputs: Optional[Union[np.ndarray, Tuple[np.ndarray]]] = None,
):
self.predictions = predictions
self.label_ids = label_ids
self.inputs = inputs
def __iter__(self):
if self.inputs is not None:
return iter((self.predictions, self.label_ids, self.inputs))
else:
return iter((self.predictions, self.label_ids))
def __getitem__(self, idx):
if idx < 0 or idx > 2:
raise IndexError("tuple index out of range")
if idx == 2 and self.inputs is None:
raise IndexError("tuple index out of range")
if idx == 0:
return self.predictions
elif idx == 1:
return self.label_ids
elif idx == 2:
return self.inputs
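Because `__iter__` and `__getitem__` make `EvalPrediction` behave like a tuple, a `compute_metrics` function can simply unpack it; a minimal sketch with made-up numbers:
```
import numpy as np

def compute_metrics(eval_pred: EvalPrediction):
    predictions, label_ids = eval_pred  # unpacked via __iter__ (inputs would be a third element if present)
    preds = np.argmax(predictions, axis=-1)
    return {"accuracy": float((preds == label_ids).mean())}

compute_metrics(EvalPrediction(predictions=np.array([[0.1, 0.9], [0.8, 0.2]]), label_ids=np.array([1, 0])))
# {'accuracy': 1.0}
```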
class EvalLoopOutput(NamedTuple):
"""
NamedTuple for evaluation loop output, containing predictions, label_ids, metrics, and num_samples.
Attributes:
predictions (Union[np.ndarray, Tuple[np.ndarray]]): Predictions from the model.
label_ids (Optional[Union[np.ndarray, Tuple[np.ndarray]]]): Target labels.
metrics (Optional[Dict[str, float]]): Metrics computed during evaluation.
num_samples (Optional[int]): Number of samples evaluated.
"""
predictions: Union[np.ndarray, Tuple[np.ndarray]]
label_ids: Optional[Union[np.ndarray, Tuple[np.ndarray]]]
metrics: Optional[Dict[str, float]]
num_samples: Optional[int]
class PredictionOutput(NamedTuple):
"""
NamedTuple for prediction output, containing predictions, label_ids, and metrics.
Attributes:
predictions (Union[np.ndarray, Tuple[np.ndarray]]): Predictions from the model.
label_ids (Optional[Union[np.ndarray, Tuple[np.ndarray]]]): Target labels.
metrics (Optional[Dict[str, float]]): Metrics computed during prediction.
"""
predictions: Union[np.ndarray, Tuple[np.ndarray]]
label_ids: Optional[Union[np.ndarray, Tuple[np.ndarray]]]
metrics: Optional[Dict[str, float]]
class TrainOutput(NamedTuple):
"""
NamedTuple for training output, containing global_step, training_loss, and metrics.
Attributes:
global_step (int): Current global step of training.
training_loss (float): Loss computed during training.
metrics (Dict[str, float]): Metrics computed during training.
"""
global_step: int
training_loss: float
metrics: Dict[str, float]
PREFIX_CHECKPOINT_DIR = "checkpoint"
_re_checkpoint = re.compile(r"^" + PREFIX_CHECKPOINT_DIR + r"\-(\d+)$")
def get_last_checkpoint(folder):
content = os.listdir(folder)
checkpoints = [
path
for path in content
if _re_checkpoint.search(path) is not None and os.path.isdir(os.path.join(folder, path))
]
if len(checkpoints) == 0:
return
return os.path.join(folder, max(checkpoints, key=lambda x: int(_re_checkpoint.search(x).groups()[0])))
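For example, with an output folder that contains `checkpoint-500` and `checkpoint-1500` sub-folders (names illustrative), the helper returns the numerically largest one:
```
last_ckpt = get_last_checkpoint("output_dir")
# e.g. "output_dir/checkpoint-1500", or None if no checkpoint-* sub-folder exists
```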
class IntervalStrategy(ExplicitEnum):
NO = "no"
STEPS = "steps"
EPOCH = "epoch"
class EvaluationStrategy(ExplicitEnum):
NO = "no"
STEPS = "steps"
EPOCH = "epoch"
class HubStrategy(ExplicitEnum):
END = "end"
EVERY_SAVE = "every_save"
CHECKPOINT = "checkpoint"
ALL_CHECKPOINTS = "all_checkpoints"
class BestRun(NamedTuple):
"""
Named tuple holding the best run found by a hyperparameter search.
Parameters:
run_id (`str`):
The id of the best run (if models were saved, the corresponding checkpoint will be in the folder ending with run-{run_id}).
objective (`float`):
The objective that was obtained for this run.
hyperparameters (`Dict[str, Any]`):
The hyperparameters used for this run.
run_summary (`Optional[Any]`):
A summary of the tuning experiment. For the Ray backend, this is a `ray.tune.ExperimentAnalysis` object.
"""
run_id: str
objective: Union[float, List[float]]
hyperparameters: Dict[str, Any]
run_summary: Optional[Any] = None
def default_compute_objective(metrics: Dict[str, float]) -> float:
"""
The default objective to maximize/minimize when doing a hyperparameter search. It is the evaluation loss if no
other metrics are provided, and the sum of all metrics otherwise.
Args:
metrics (`Dict[str, float]`): The metrics returned by the evaluate method.
Return:
`float`: The objective to minimize or maximize.
"""
metrics = copy.deepcopy(metrics)
loss = metrics.pop("eval_loss", None)
_ = metrics.pop("epoch", None)
speed_metrics = [
m
for m in metrics.keys()
if m.endswith("_runtime") or m.endswith("_per_second") or m.endswith("_compilation_time")
]
for sm in speed_metrics:
_ = metrics.pop(sm, None)
return loss if len(metrics) == 0 else sum(metrics.values())
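Two toy calls illustrating the rule: the loss, the epoch and the speed metrics are stripped first, then the loss is returned if nothing else remains, otherwise the sum of the remaining metrics:
```
default_compute_objective({"eval_loss": 0.7, "epoch": 3.0, "eval_runtime": 12.3})
# -> 0.7 (only the loss is left to optimize)
default_compute_objective({"eval_loss": 0.7, "eval_bleu": 28.4, "epoch": 3.0})
# -> 28.4 (sum of the remaining metrics, here just eval_bleu)
```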
def default_hp_space_optuna(trial) -> Dict[str, float]:
from .integrations import is_optuna_available
assert is_optuna_available(), "This function needs Optuna installed: `pip install optuna`"
return {
"learning_rate": trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True),
"num_train_epochs": trial.suggest_int("num_train_epochs", 1, 5),
"seed": trial.suggest_int("seed", 1, 40),
"per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [4, 8, 16, 32, 64]),
}
def default_hp_space_ray(trial) -> Dict[str, float]:
from .integrations import is_ray_tune_available
assert is_ray_tune_available(), "This function needs ray installed: `pip install ray[tune]`"
from ray import tune
return {
"learning_rate": tune.loguniform(1e-6, 1e-4),
"num_train_epochs": tune.choice(list(range(1, 6))),
"seed": tune.uniform(1, 40),
"per_device_train_batch_size": tune.choice([4, 8, 16, 32, 64]),
}
def default_hp_space_sigopt(trial):
return [
{"bounds": {"min": 1e-6, "max": 1e-4}, "name": "learning_rate", "type": "double", "transformamtion": "log"},
{"bounds": {"min": 1, "max": 6}, "name": "num_train_epochs", "type": "int"},
{"bounds": {"min": 1, "max": 40}, "name": "seed", "type": "int"},
{
"categorical_values": ["4", "8", "16", "32", "64"],
"name": "per_device_train_batch_size",
"type": "categorical",
},
]
def default_hp_space_wandb(trial) -> Dict[str, float]:
from .integrations import is_wandb_available
if not is_wandb_available():
raise ImportError("This function needs wandb installed: `pip install wandb`")
return {
"method": "random",
"metric": {"name": "objective", "goal": "minimize"},
"parameters": {
"learning_rate": {"distribution": "uniform", "min": 1e-6, "max": 1e-4},
"num_train_epochs": {"distribution": "int_uniform", "min": 1, "max": 6},
"seed": {"distribution": "int_uniform", "min": 1, "max": 40},
"per_device_train_batch_size": {"values": [4, 8, 16, 32, 64]},
},
}
class HPSearchBackend(ExplicitEnum):
OPTUNA = "optuna"
RAY = "ray"
SIGOPT = "sigopt"
WANDB = "wandb"
def is_main_process(local_rank):
"""
Whether or not the current process is the local process, based on `xm.get_ordinal()` (for TPUs) first, then on
`local_rank`.
"""
if is_torch_xla_available():
import torch_xla.core.xla_model as xm
return xm.get_ordinal() == 0
return local_rank in [-1, 0]
def total_processes_number(local_rank):
"""
Return the number of processes launched in parallel. Works with `torch.distributed` and TPUs.
"""
if is_torch_xla_available():
import torch_xla.core.xla_model as xm
return xm.xrt_world_size()
elif local_rank != -1 and is_torch_available():
import torch
return torch.distributed.get_world_size()
return 1
def speed_metrics(split, start_time, num_samples=None, num_steps=None, num_tokens=None):
"""
Measure and return speed performance metrics.
This function requires a time snapshot `start_time` before the operation to be measured starts and this function
should be run immediately after the operation to be measured has completed.
Args:
- split: name to prefix metric (like train, eval, test...)
- start_time: operation start time
"""
runtime = time.time() - start_time
result = {f"{split}_runtime": round(runtime, 4)}
if runtime == 0:
return result
if num_samples is not None:
samples_per_second = num_samples / runtime
result[f"{split}_samples_per_second"] = round(samples_per_second, 3)
if num_steps is not None:
steps_per_second = num_steps / runtime
result[f"{split}_steps_per_second"] = round(steps_per_second, 3)
if num_tokens is not None:
tokens_per_second = num_tokens / runtime
result[f"{split}_tokens_per_second"] = round(tokens_per_second, 3)
return result
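A usage sketch (numbers are illustrative): snapshot the time before the measured operation and call the helper right after it finishes:
```
import time

start_time = time.time()
# ... run evaluation ...
metrics = speed_metrics("eval", start_time, num_samples=1000, num_steps=125)
# e.g. {"eval_runtime": 8.2, "eval_samples_per_second": 121.951, "eval_steps_per_second": 15.244}
```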
class SchedulerType(ExplicitEnum):
LINEAR = "linear"
COSINE = "cosine"
COSINE_WITH_RESTARTS = "cosine_with_restarts"
POLYNOMIAL = "polynomial"
CONSTANT = "constant"
CONSTANT_WITH_WARMUP = "constant_with_warmup"
INVERSE_SQRT = "inverse_sqrt"
REDUCE_ON_PLATEAU = "reduce_lr_on_plateau"
class TrainerMemoryTracker:
"""
A helper class that tracks cpu and gpu memory.
This class will silently skip unless `psutil` is available. Install with `pip install psutil`.
When a stage completes, it can pass metrics dict to update with the memory metrics gathered during this stage.
Example :
```
self._memory_tracker = TrainerMemoryTracker(self.args.skip_memory_metrics)
self._memory_tracker.start()
# code ...
metrics = {"train_runtime": 10.5}
self._memory_tracker.stop_and_update_metrics(metrics)
```
At the moment GPU tracking is only for `pytorch`, but can be extended to support `tensorflow`.
To understand this class' intricacies please read the documentation of [`~Trainer.log_metrics`].
"""
stages = {
"__init__": "init",
"train": "train",
"_inner_training_loop": "train",
"evaluate": "eval",
"predict": "test",
}
def __init__(self, skip_memory_metrics=False):
self.skip_memory_metrics = skip_memory_metrics
if not is_psutil_available():
self.skip_memory_metrics = True
if self.skip_memory_metrics:
return
import psutil
if is_torch_cuda_available():
import torch
self.torch = torch
self.gpu = {}
elif is_torch_mps_available():
import torch
self.torch = torch
self.gpu = {}
elif is_torch_xpu_available():
import torch
self.torch = torch
self.gpu = {}
elif is_torch_npu_available():
import torch
self.torch = torch
self.gpu = {}
else:
self.torch = None
self.process = psutil.Process()
self.cur_stage = None
self.cpu = {}
self.init_reported = False
def derive_stage(self):
"""自动推断阶段/调用者名称"""
caller = inspect.currentframe().f_back.f_back.f_code.co_name
if caller in self.stages:
return self.stages[caller]
else:
raise ValueError(
f"was called from {caller}, but only expect to be called from one of {self.stages.keys()}"
)
def cpu_mem_used(self):
"""获取当前进程的驻留集大小内存"""
return self.process.memory_info().rss
def peak_monitor_func(self):
self.cpu_mem_used_peak = -1
while True:
self.cpu_mem_used_peak = max(self.cpu_mem_used(), self.cpu_mem_used_peak)
if not self.peak_monitoring:
break
def start(self):
"""开始跟踪调用者的阶段"""
if self.skip_memory_metrics:
return
stage = self.derive_stage()
if self.cur_stage is not None and self.cur_stage != stage:
return
self.cur_stage = stage
gc.collect()
if self.torch is not None:
if torch.cuda.is_available():
self.torch.cuda.reset_peak_memory_stats()
self.torch.cuda.empty_cache()
elif is_torch_xpu_available():
self.torch.xpu.reset_peak_memory_stats()
self.torch.xpu.empty_cache()
elif is_torch_npu_available():
self.torch.npu.reset_peak_memory_stats()
self.torch.npu.empty_cache()
elif is_torch_mps_available():
self.torch.mps.empty_cache()
if self.torch is not None:
if torch.cuda.is_available():
self.gpu_mem_used_at_start = self.torch.cuda.memory_allocated()
elif is_torch_xpu_available():
self.gpu_mem_used_at_start = self.torch.xpu.memory_allocated()
elif is_torch_npu_available():
self.gpu_mem_used_at_start = self.torch.npu.memory_allocated()
elif is_torch_mps_available():
self.gpu_mem_used_at_start = self.torch.mps.current_allocated_memory()
self.cpu_mem_used_at_start = self.cpu_mem_used()
self.peak_monitoring = True
peak_monitor_thread = threading.Thread(target=self.peak_monitor_func)
peak_monitor_thread.daemon = True
peak_monitor_thread.start()
def update_metrics(self, stage, metrics):
"""updates the metrics"""
if self.skip_memory_metrics:
return
if self.cur_stage is not None and self.cur_stage != stage:
return
stages = [stage]
if not self.init_reported:
stages.insert(0, "init")
self.init_reported = True
for stage in stages:
for t in ["alloc", "peaked"]:
if stage in self.cpu and t in self.cpu[stage]:
metrics[f"{stage}_mem_cpu_{t}_delta"] = self.cpu[stage][t]
if self.torch is not None and stage in self.gpu and t in self.gpu[stage]:
metrics[f"{stage}_mem_gpu_{t}_delta"] = self.gpu[stage][t]
if stages[0] == "init":
metrics["before_init_mem_cpu"] = self.cpu["init"]["begin"]
if self.torch is not None:
metrics["before_init_mem_gpu"] = self.gpu["init"]["begin"]
def stop_and_update_metrics(self, metrics=None):
"""combine stop and metrics update in one call for simpler code"""
if self.skip_memory_metrics:
return
stage = self.derive_stage()
self.stop(stage)
if metrics is not None:
self.update_metrics(stage, metrics)
def has_length(dataset):
try:
return len(dataset) is not None
except TypeError:
return False
def denumpify_detensorize(metrics):
if isinstance(metrics, (list, tuple)):
return type(metrics)(denumpify_detensorize(m) for m in metrics)
elif isinstance(metrics, dict):
return type(metrics)({k: denumpify_detensorize(v) for k, v in metrics.items()})
elif isinstance(metrics, np.generic):
return metrics.item()
elif is_torch_available() and isinstance(metrics, torch.Tensor) and metrics.numel() == 1:
return metrics.item()
return metrics
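A tiny illustration of `denumpify_detensorize`: numpy scalars (and 1-element torch tensors) are recursively converted back to plain Python numbers so the metrics dict can be JSON-serialized:
```
import numpy as np

denumpify_detensorize({"accuracy": np.float64(0.91), "counts": [np.int64(3), np.int64(5)]})
# -> {'accuracy': 0.91, 'counts': [3, 5]}
```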
def number_of_arguments(func):
if isinstance(func, functools.partial):
total_args = len(inspect.signature(func.func).parameters)
return total_args - len(func.args) - len(func.keywords)
return len(inspect.signature(func).parameters)
def find_executable_batch_size(
function: callable = None, starting_batch_size: int = 128, auto_find_batch_size: bool = False
):
"""
A basic decorator that will try to execute `function`. If it fails from exceptions related to out-of-memory or
CUDNN, the batch size is cut in half and passed to `function`. `function` must take in a `batch_size` parameter as
its first argument.
Args:
function (`callable`, *optional*)
A function to wrap
starting_batch_size (`int`, *optional*)
The batch size to try and fit into memory
auto_find_batch_size (`bool`, *optional*)
If False, will just execute `function`
"""
if function is None:
return functools.partial(
find_executable_batch_size,
starting_batch_size=starting_batch_size,
auto_find_batch_size=auto_find_batch_size,
)
if auto_find_batch_size:
requires_backends(find_executable_batch_size, "accelerate")
from accelerate.utils import find_executable_batch_size as accelerate_find_executable_batch_size
return accelerate_find_executable_batch_size(function=function, starting_batch_size=starting_batch_size)
return functools.partial(function, batch_size=starting_batch_size)
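A hedged sketch of how the decorator is meant to be used (in the Trainer it wraps `_inner_training_loop`; the function below is illustrative and assumes `accelerate` is installed): the wrapped function must take `batch_size` as its first argument, and with `auto_find_batch_size=True` the batch size is halved and the call retried on CUDA OOM.
```
@find_executable_batch_size(starting_batch_size=64, auto_find_batch_size=True)
def run_training(batch_size):
    # build dataloaders and run the training loop with this batch size;
    # on CUDA OOM the decorator retries with batch_size // 2
    print(f"trying batch_size={batch_size}")

run_training()
```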
class FSDPOption(ExplicitEnum):
FULL_SHARD = "full_shard"
SHARD_GRAD_OP = "shard_grad_op"
NO_SHARD = "no_shard"
HYBRID_SHARD = "hybrid_shard"
HYBRID_SHARD_ZERO2 = "hybrid_shard_zero2"
OFFLOAD = "offload"
AUTO_WRAP = "auto_wrap"
class RemoveColumnsCollator:
"""Wrap the data collator to remove unused columns before they are passed to the collator."""
def __init__(
self,
data_collator,
signature_columns,
logger=None,
model_name: Optional[str] = None,
description: Optional[str] = None,
):
self.data_collator = data_collator
self.signature_columns = signature_columns
self.logger = logger
self.description = description
self.model_name = model_name
self.message_logged = False
def _remove_columns(self, feature: dict) -> dict:
if not isinstance(feature, dict):
return feature
if not self.message_logged and self.logger and self.model_name:
ignored_columns = list(set(feature.keys()) - set(self.signature_columns))
if len(ignored_columns) > 0:
dset_description = "" if self.description is None else f"in the {self.description} set"
self.logger.info(
f"The following columns {dset_description} don't have a corresponding argument in "
f"`{self.model_name}.forward` and have been ignored: {', '.join(ignored_columns)}."
f" If {', '.join(ignored_columns)} are not expected by `{self.model_name}.forward`, "
" you can safely ignore this message."
)
self.message_logged = True
return {k: v for k, v in feature.items() if k in self.signature_columns}
def __call__(self, features: List[dict]):
features = [self._remove_columns(feature) for feature in features]
return self.data_collator(features)
def check_target_module_exists(optim_target_modules, key: str, return_is_regex: bool = False):
"""A helper method to check if the passed module's key name matches any of the target modules in the optim_target_modules.
Args:
optim_target_modules (`Union[str, List[str]]`):
A list of strings to try to match. Can be also a full string.
key (`str`):
A key to search any matches in optim_target_modules
return_is_regex (`bool`):
If set to `True`, the method will return whether the passed `optim_target_modules`
is a regex or not.
Returns:
`bool` : True or a match object if key matches any target modules from config, False or
None if no match found
`bool` : If the matched target module is a regex to silence out the warnings in Trainer
for extra modules being found (only if `target_module_found=True` for an array of regex).
"""
target_module_found = False
is_regex = False
if isinstance(optim_target_modules, str):
target_module_found = bool(re.fullmatch(optim_target_modules, key))
is_regex = True if not optim_target_modules == key else False
elif key in optim_target_modules:
target_module_found = True
elif any(target_key in key for target_key in optim_target_modules):
target_module_found = True
elif any(bool(re.fullmatch(optim_target_module, key)) for optim_target_module in optim_target_modules):
target_module_found = True
is_regex = True
if return_is_regex:
return target_module_found, is_regex
return target_module_found
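Two quick checks (module names are illustrative): with a list, plain substring containment is enough; with a single string, the key must fully match it as a regex and the second return value tells the Trainer that a regex was used:
```
check_target_module_exists(["attn", "mlp"], "model.layers.0.attn.q_proj")
# -> True (substring match against the list)
check_target_module_exists(r"model\.layers\.\d+\.attn\.q_proj", "model.layers.0.attn.q_proj", return_is_regex=True)
# -> (True, True)
```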
.\training_args.py
import contextlib
import io
import json
import math
import os
import warnings
from dataclasses import asdict, dataclass, field, fields
from datetime import timedelta
from enum import Enum
from pathlib import Path
from typing import Any, Dict, List, Optional, Union
from huggingface_hub import get_full_repo_name
from packaging import version
from .debug_utils import DebugOption
from .trainer_utils import (
EvaluationStrategy,
FSDPOption,
HubStrategy,
IntervalStrategy,
SchedulerType,
)
from .utils import (
ACCELERATE_MIN_VERSION,
ExplicitEnum,
cached_property,
is_accelerate_available,
is_safetensors_available,
is_sagemaker_dp_enabled,
is_sagemaker_mp_enabled,
is_torch_available,
is_torch_bf16_cpu_available,
is_torch_bf16_gpu_available,
is_torch_neuroncore_available,
is_torch_npu_available,
is_torch_tf32_available,
is_torch_xla_available,
is_torch_xpu_available,
logging,
requires_backends,
)
from .utils.generic import strtobool
from .utils.import_utils import is_optimum_neuron_available
logger = logging.get_logger(__name__)
log_levels = logging.get_log_levels_dict().copy()
trainer_log_levels = dict(**log_levels, passive=-1)
if is_torch_available():
import torch
import torch.distributed as dist
from .pytorch_utils import is_torch_greater_or_equal_than_2_0
if is_accelerate_available():
from accelerate.state import AcceleratorState, PartialState
from accelerate.utils import DistributedType
from .trainer_pt_utils import AcceleratorConfig
if is_torch_xla_available():
import torch_xla.core.xla_model as xm
if is_torch_neuroncore_available(check_device=False):
pass
if os.environ.get("TORCHELASTIC_RUN_ID"):
if is_optimum_neuron_available():
logger.info(
"Make sure that you are performing the training with the TrainiumTrainer from optimum[neuron], this "
"will fail otherwise."
)
else:
logger.warning(
"Please use the TrainiumTrainer from optimum[neuron] instead of the Transformers library to perform "
"training on AWS Trainium instances. More information here: "
"https://github.com/huggingface/optimum-neuron"
)
import torch_xla.distributed.xla_backend as xbn
if not isinstance(dist.group.WORLD, xbn.ProcessGroupXla):
dist.init_process_group(backend="xla")
if not isinstance(dist.group.WORLD, xbn.ProcessGroupXla):
raise AssertionError("Failed to initialize torch.distributed process group using XLA backend.")
if is_sagemaker_mp_enabled():
import smdistributed.modelparallel.torch as smp
smp.init()
def default_logdir() -> str:
"""
Same default as PyTorch
"""
import socket
from datetime import datetime
current_time = datetime.now().strftime("%b%d_%H-%M-%S")
return os.path.join("runs", current_time + "_" + socket.gethostname())
def get_int_from_env(env_keys, default):
"""Returns the first positive env value found in the `env_keys` list or the default."""
for e in env_keys:
val = int(os.environ.get(e, -1))
if val >= 0:
return val
return default
def get_xla_device_type(device: "torch.device") -> Optional[str]:
"""
Returns the xla device type (CPU|GPU|TPU) or None if the device is a non-xla device.
"""
if is_torch_xla_available():
if device.type == "cpu":
return "CPU"
return xm.xla_real_devices([device])[0].split(":")[0]
return None
class OptimizerNames(ExplicitEnum):
"""
Stores the acceptable string identifiers for optimizers.
"""
ADAMW_HF = "adamw_hf"
ADAMW_TORCH = "adamw_torch"
ADAMW_TORCH_FUSED = "adamw_torch_fused"
ADAMW_TORCH_XLA = "adamw_torch_xla"
ADAMW_TORCH_NPU_FUSED = "adamw_torch_npu_fused"
ADAMW_APEX_FUSED = "adamw_apex_fused"
ADAFACTOR = "adafactor"
ADAMW_ANYPRECISION = "adamw_anyprecision"
SGD = "sgd"
ADAGRAD = "adagrad"
ADAMW_BNB = "adamw_bnb_8bit"
ADAMW_8BIT = "adamw_8bit"
LION_8BIT = "lion_8bit"
LION = "lion_32bit"
PAGED_ADAMW = "paged_adamw_32bit"
PAGED_ADAMW_8BIT = "paged_adamw_8bit"
PAGED_LION = "paged_lion_32bit"
PAGED_LION_8BIT = "paged_lion_8bit"
RMSPROP = "rmsprop"
RMSPROP_BNB = "rmsprop_bnb"
RMSPROP_8BIT = "rmsprop_bnb_8bit"
RMSPROP_32BIT = "rmsprop_bnb_32bit"
GALORE_ADAMW = "galore_adamw"
GALORE_ADAMW_8BIT = "galore_adamw_8bit"
GALORE_ADAFACTOR = "galore_adafactor"
GALORE_ADAMW_LAYERWISE = "galore_adamw_layerwise"
GALORE_ADAMW_8BIT_LAYERWISE = "galore_adamw_8bit_layerwise"
GALORE_ADAFACTOR_LAYERWISE = "galore_adafactor_layerwise"
@dataclass
class TrainingArguments:
"""
TrainingArguments is the subset of the arguments we use in our example scripts **which relate to the training loop
itself**.
Using [`HfArgumentParser`] we can turn this class into
[argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the
command line.
"""
framework = "pt"
output_dir: str = field(
metadata={"help": "The output directory where the model predictions and checkpoints will be written."},
)
overwrite_output_dir: bool = field(
default=False,
metadata={
"help": (
"Overwrite the content of the output directory. "
"Use this to continue training if output_dir points to a checkpoint directory."
)
},
)
do_train: bool = field(default=False, metadata={"help": "Whether to run training."})
do_eval: bool = field(default=False, metadata={"help": "Whether to run eval on the dev set."})
do_predict: bool = field(default=False, metadata={"help": "Whether to run predictions on the test set."})
evaluation_strategy: Union[IntervalStrategy, str] = field(
default="no",
metadata={"help": "The evaluation strategy to use."},
)
prediction_loss_only: bool = field(
default=False,
metadata={"help": "When performing evaluation and predictions, only returns the loss."},
)
per_device_train_batch_size: int = field(
default=8, metadata={"help": "Batch size per GPU/TPU/MPS/NPU core/CPU for training."}
)
per_device_eval_batch_size: int = field(
default=8, metadata={"help": "Batch size per GPU/TPU/MPS/NPU core/CPU for evaluation."}
)
per_gpu_train_batch_size: Optional[int] = field(
default=None,
metadata={
"help": (
"Deprecated, the use of `--per_device_train_batch_size` is preferred. "
"Batch size per GPU/TPU core/CPU for training."
)
},
)
per_gpu_eval_batch_size: Optional[int] = field(
default=None,
metadata={
"help": (
"Deprecated, the use of `--per_device_eval_batch_size` is preferred. "
"Batch size per GPU/TPU core/CPU for evaluation."
)
},
)
gradient_accumulation_steps: int = field(
default=1,
metadata={"help": "Number of updates steps to accumulate before performing a backward/update pass."},
)
eval_accumulation_steps: Optional[int] = field(
default=None,
metadata={"help": "Number of predictions steps to accumulate before moving the tensors to the CPU."},
)
eval_delay: Optional[float] = field(
default=0,
metadata={
"help": (
"Number of epochs or steps to wait for before the first evaluation can be performed, depending on the"
" evaluation_strategy."
)
},
)
learning_rate: float = field(default=5e-5, metadata={"help": "The initial learning rate for AdamW."})
weight_decay: float = field(default=0.0, metadata={"help": "Weight decay for AdamW if we apply some."})
adam_beta1: float = field(default=0.9, metadata={"help": "Beta1 for AdamW optimizer"})
adam_beta2: float = field(default=0.999, metadata={"help": "Beta2 for AdamW optimizer"})
adam_epsilon: float = field(default=1e-8, metadata={"help": "Epsilon for AdamW optimizer."})
max_grad_norm: float = field(default=1.0, metadata={"help": "Max gradient norm."})
num_train_epochs: float = field(default=3.0, metadata={"help": "Total number of training epochs to perform."})
max_steps: int = field(
default=-1,
metadata={"help": "If > 0: set total number of training steps to perform. Override num_train_epochs."},
)
lr_scheduler_type: Union[SchedulerType, str] = field(
default="linear",
metadata={"help": "The scheduler type to use."},
)
lr_scheduler_kwargs: Optional[Dict] = field(
default_factory=dict,
metadata={
"help": (
"Extra parameters for the lr_scheduler such as {'num_cycles': 1} for the cosine with hard restarts"
)
},
)
warmup_ratio: float = field(
default=0.0, metadata={"help": "Linear warmup over warmup_ratio fraction of total steps."}
)
warmup_steps: int = field(default=0, metadata={"help": "Linear warmup over warmup_steps."})
log_level: Optional[str] = field(
default="passive",
metadata={
"help": (
"Logger log level to use on the main node. Possible choices are the log levels as strings: 'debug',"
" 'info', 'warning', 'error' and 'critical', plus a 'passive' level which doesn't set anything and"
" lets the application set the level. Defaults to 'passive'."
),
"choices": trainer_log_levels.keys(),
},
)
log_level_replica: Optional[str] = field(
default="warning",
metadata={
"help": "Logger log level to use on replica nodes. Same choices and defaults as ``log_level``",
"choices": trainer_log_levels.keys(),
},
)
log_on_each_node: bool = field(
default=True,
metadata={
"help": (
"When doing a multinode distributed training, whether to log once per node or just once on the main"
" node."
)
},
)
logging_dir: Optional[str] = field(default=None, metadata={"help": "Tensorboard log dir."})
logging_strategy: Union[IntervalStrategy, str] = field(
default="steps",
metadata={"help": "The logging strategy to use."},
)
logging_first_step: bool = field(default=False, metadata={"help": "Log the first global_step"})
logging_steps: float = field(
default=500,
metadata={
"help": (
"Log every X updates steps. Should be an integer or a float in range `[0,1)`. "
"If smaller than 1, will be interpreted as ratio of total training steps."
)
},
)
logging_nan_inf_filter: bool = field(default=True, metadata={"help": "Filter nan and inf losses for logging."})
save_strategy: Union[IntervalStrategy, str] = field(
default="steps",
metadata={"help": "The checkpoint save strategy to use."},
)
save_steps: float = field(
default=500,
metadata={
"help": (
"Save checkpoint every X updates steps. Should be an integer or a float in range `[0,1)`. "
"If smaller than 1, will be interpreted as ratio of total training steps."
)
},
)
save_total_limit: Optional[int] = field(
default=None,
metadata={
"help": (
"If a value is passed, will limit the total amount of checkpoints. Deletes the older checkpoints in"
" `output_dir`. When `load_best_model_at_end` is enabled, the 'best' checkpoint according to"
" `metric_for_best_model` will always be retained in addition to the most recent ones. For example,"
" for `save_total_limit=5` and `load_best_model_at_end=True`, the four last checkpoints will always be"
" retained alongside the best model. When `save_total_limit=1` and `load_best_model_at_end=True`,"
" it is possible that two checkpoints are saved: the last one and the best one (if they are different)."
" Default is unlimited checkpoints"
)
},
)
save_safetensors: Optional[bool] = field(
default=True,
metadata={
"help": "Use safetensors saving and loading for state dicts instead of default torch.load and torch.save."
},
)
save_on_each_node: bool = field(
default=False,
metadata={
"help": (
"When doing multi-node distributed training, whether to save models and checkpoints on each node, or"
" only on the main one"
)
},
)
save_only_model: bool = field(
default=False,
metadata={
"help": (
"When checkpointing, whether to only save the model, or also the optimizer, scheduler & rng state."
"Note that when this is true, you won't be able to resume training from checkpoint."
"This enables you to save storage by not storing the optimizer, scheduler & rng state."
"You can only load the model using from_pretrained with this option set to True."
)
},
)
no_cuda: bool = field(
default=False,
metadata={"help": "This argument is deprecated. It will be removed in version 5.0 of 🤗 Transformers."},
)
use_cpu: bool = field(
default=False,
metadata={
"help": " Whether or not to use cpu. If set to False, we will use cuda/tpu/mps/npu device if available."
},
)
use_mps_device: bool = field(
default=False,
metadata={
"help": "This argument is deprecated. `mps` device will be used if available similar to `cuda` device."
" It will be removed in version 5.0 of 🤗 Transformers"
},
)
seed: int = field(default=42, metadata={"help": "Random seed that will be set at the beginning of training."})
data_seed: Optional[int] = field(default=None, metadata={"help": "Random seed to be used with data samplers."})
jit_mode_eval: bool = field(
default=False, metadata={"help": "Whether or not to use PyTorch jit trace for inference"}
)
use_ipex: bool = field(
default=False,
metadata={
"help": (
"Use Intel extension for PyTorch when it is available, installation:"
" 'https://github.com/intel/intel-extension-for-pytorch'"
)
},
)
bf16: bool = field(
default=False,
metadata={
"help": (
"Whether to use bf16 (mixed) precision instead of 32-bit. Requires Ampere or higher NVIDIA"
" architecture or using CPU (use_cpu) or Ascend NPU. This is an experimental API and it may change."
)
},
)
fp16: bool = field(
default=False,
metadata={"help": "Whether to use fp16 (mixed) precision instead of 32-bit"},
)
fp16_opt_level: str = field(
default="O1",
metadata={
"help": (
"For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. "
"See details at https://nvidia.github.io/apex/amp.html"
)
},
)
half_precision_backend: str = field(
default="auto",
metadata={
"help": "The backend to be used for half precision.",
"choices": ["auto", "apex", "cpu_amp"],
},
)
bf16_full_eval: bool = field(
default=False,
metadata={
"help": (
"Whether to use full bfloat16 evaluation instead of 32-bit. This is an experimental API and it may"
" change."
)
},
)
fp16_full_eval: bool = field(
default=False,
metadata={"help": "Whether to use full float16 evaluation instead of 32-bit"},
)
tf32: Optional[bool] = field(
default=None,
metadata={
"help": (
"Whether to enable tf32 mode, available in Ampere and newer GPU architectures. This is an experimental"
" API and it may change."
)
},
)
local_rank: int = field(default=-1, metadata={"help": "For distributed training: local_rank"})
ddp_backend: Optional[str] = field(
default=None,
metadata={
"help": "The backend to be used for distributed training",
"choices": ["nccl", "gloo", "mpi", "ccl", "hccl"],
},
)
tpu_num_cores: Optional[int] = field(
default=None, metadata={"help": "TPU: Number of TPU cores (automatically passed by launcher script)"}
)
tpu_metrics_debug: bool = field(
default=False,
metadata={
"help": (
"已弃用,推荐使用 `--debug tpu_metrics_debug`。TPU:是否打印调试指标"
)
},
)
debug: Union[str, List[DebugOption]] = field(
default="",
metadata={
"help": (
"是否启用调试模式。当前选项:"
"`underflow_overflow`(检测激活和权重中的下溢和上溢),"
"`tpu_metrics_debug`(在TPU上打印调试指标)。"
)
},
)
dataloader_drop_last: bool = field(
default=False, metadata={"help": "如果不是批量大小的整数倍,丢弃最后不完整的批次。"}
)
eval_steps: Optional[float] = field(
default=None,
metadata={
"help": (
"每隔X步运行一次评估。应为整数或范围为`[0,1)`的浮点数。"
"如果小于1,将解释为总训练步数的比例。"
)
},
)
dataloader_num_workers: int = field(
default=0,
metadata={
"help": (
"用于数据加载的子进程数(仅适用于PyTorch)。"
"0表示数据将在主进程中加载。"
)
},
)
dataloader_prefetch_factor: Optional[int] = field(
default=None if not is_torch_available() or is_torch_greater_or_equal_than_2_0 else 2,
metadata={
"help": (
"每个工作进程预加载的批次数。"
"2表示每个工作进程预加载2 * num_workers批次。"
"对于PyTorch < 2.0.0,默认为2,否则为None。"
)
},
)
past_index: int = field(
default=-1,
metadata={"help": "如果 >= 0,则使用输出的相应部分作为下一步的过去状态。"},
)
run_name: Optional[str] = field(
default=None, metadata={"help": "运行的可选描述符。主要用于wandb日志记录。"}
)
disable_tqdm: Optional[bool] = field(
default=None, metadata={"help": "是否禁用tqdm进度条。"}
)
remove_unused_columns: Optional[bool] = field(
default=True, metadata={"help": "在使用nlp.Dataset时,移除模型不需要的列。"}
)
label_names: Optional[List[str]] = field(
default=None, metadata={"help": "输入字典中与标签对应的键列表。"}
)
load_best_model_at_end: Optional[bool] = field(
default=False,
metadata={
"help": (
"Whether or not to load the best model found during training at the end of training. When this option"
" is enabled, the best checkpoint will always be saved. See `save_total_limit` for more."
)
},
)
metric_for_best_model: Optional[str] = field(
default=None, metadata={"help": "The metric to use to compare two different models."}
)
greater_is_better: Optional[bool] = field(
default=None, metadata={"help": "Whether the `metric_for_best_model` should be maximized or not."}
)
ignore_data_skip: bool = field(
default=False,
metadata={
"help": (
"When resuming training, whether or not to skip the first epochs and batches to get to the same"
" training data."
)
},
)
fsdp: Optional[Union[List[FSDPOption], str]] = field(
default="",
metadata={
"help": (
"Whether or not to use PyTorch Fully Sharded Data Parallel (FSDP) training (in distributed training"
" only). The base option should be `full_shard`, `shard_grad_op` or `no_shard` and you can add"
" CPU-offload to `full_shard` or `shard_grad_op` like this: full_shard offload` or `shard_grad_op"
" offload`. You can add auto-wrap to `full_shard` or `shard_grad_op` with the same syntax: full_shard"
" auto_wrap` or `shard_grad_op auto_wrap`."
),
},
)
fsdp_min_num_params: int = field(
default=0,
metadata={
"help": (
"This parameter is deprecated. FSDP's minimum number of parameters for Default Auto Wrapping. (useful"
" only when `fsdp` field is passed)."
)
},
)
fsdp_config: Optional[Union[dict, str]] = field(
default=None,
metadata={
"help": (
"Config to be used with FSDP (Pytorch Fully Sharded Data Parallel). The value is either a "
"fsdp json config file (e.g., `fsdp_config.json`) or an already loaded json file as `dict`."
)
},
)
fsdp_transformer_layer_cls_to_wrap: Optional[str] = field(
default=None,
metadata={
"help": (
"This parameter is deprecated. Transformer layer class name (case-sensitive) to wrap, e.g,"
" `BertLayer`, `GPTJBlock`, `T5Block` .... (useful only when `fsdp` flag is passed)."
)
},
)
accelerator_config: Optional[str] = field(
default=None,
metadata={
"help": (
"Config to be used with the internal Accelerator object initializtion. The value is either a "
"accelerator json config file (e.g., `accelerator_config.json`) or an already loaded json file as `dict`."
)
},
)
deepspeed: Optional[str] = field(
default=None,
metadata={
"help": (
"Enable deepspeed and pass the path to deepspeed json config file (e.g. `ds_config.json`) or an already"
" loaded json file as a dict"
)
},
)
label_smoothing_factor: float = field(
default=0.0, metadata={"help": "The label smoothing epsilon to apply (zero means no label smoothing)."}
)
default_optim = "adamw_torch"
optim: Union[OptimizerNames, str] = field(
default=default_optim,
metadata={"help": "The optimizer to use."},
)
optim_args: Optional[str] = field(default=None, metadata={"help": "Optional arguments to supply to optimizer."})
adafactor: bool = field(default=False, metadata={"help": "Whether or not to replace AdamW by Adafactor."})
group_by_length: bool = field(
default=False,
metadata={"help": "Whether or not to group samples of roughly the same length together when batching."},
)
length_column_name: Optional[str] = field(
default="length",
metadata={"help": "Column name with precomputed lengths to use when grouping by length."},
)
report_to: Optional[List[str]] = field(
default=None, metadata={"help": "The list of integrations to report the results and logs to."}
)
ddp_find_unused_parameters: Optional[bool] = field(
default=None,
metadata={
"help": (
"When using distributed training, the value of the flag `find_unused_parameters` passed to "
"`DistributedDataParallel`."
)
},
)
ddp_bucket_cap_mb: Optional[int] = field(
default=None,
metadata={
"help": (
"When using distributed training, the value of the flag `bucket_cap_mb` passed to "
"`DistributedDataParallel`."
)
},
)
ddp_broadcast_buffers: Optional[bool] = field(
default=None,
metadata={
"help": (
"When using distributed training, the value of the flag `broadcast_buffers` passed to "
"`DistributedDataParallel`."
)
},
)
dataloader_pin_memory: bool = field(
default=True, metadata={"help": "Whether or not to pin memory for DataLoader."}
)
dataloader_persistent_workers: bool = field(
default=False,
metadata={
"help": "If True, the data loader will not shut down the worker processes after a dataset has been consumed once. This allows to maintain the workers Dataset instances alive. Can potentially speed up training, but will increase RAM usage."
},
)
skip_memory_metrics: bool = field(
default=True, metadata={"help": "Whether or not to skip adding of memory profiler reports to metrics."}
)
use_legacy_prediction_loop: bool = field(
default=False, metadata={"help": "Whether or not to use the legacy prediction_loop in the Trainer."}
)
push_to_hub: bool = field(
default=False, metadata={"help": "Whether or not to upload the trained model to the model hub after training."}
)
resume_from_checkpoint: Optional[str] = field(
default=None,
metadata={"help": "The path to a folder with a valid checkpoint for your model."},
)
hub_model_id: Optional[str] = field(
default=None, metadata={"help": "The name of the repository to keep in sync with the local `output_dir`."}
)
hub_strategy: Union[HubStrategy, str] = field(
default="every_save",
metadata={"help": "The hub strategy to use when `--push_to_hub` is activated."},
)
hub_token: Optional[str] = field(default=None, metadata={"help": "The token to use to push to the Model Hub."})
hub_private_repo: bool = field(default=False, metadata={"help": "Whether the model repository is private or not."})
hub_always_push: bool = field(
default=False,
metadata={"help": "Unless `True`, the Trainer will skip pushes if the previous one wasn't finished yet."},
)
gradient_checkpointing: bool = field(
default=False,
metadata={
"help": "If True, use gradient checkpointing to save memory at the expense of slower backward pass."
},
)
gradient_checkpointing_kwargs: Optional[dict] = field(
default=None,
metadata={
"help": "Gradient checkpointing key word arguments such as `use_reentrant`. Will be passed to `torch.utils.checkpoint.checkpoint` through `model.gradient_checkpointing_enable`."
},
)
include_inputs_for_metrics: bool = field(
default=False, metadata={"help": "Whether or not the inputs will be passed to the `compute_metrics` function."}
)
fp16_backend: str = field(
default="auto",
metadata={
"help": "Deprecated. Use half_precision_backend instead",
"choices": ["auto", "apex", "cpu_amp"],
},
)
push_to_hub_model_id: Optional[str] = field(
default=None, metadata={"help": "The name of the repository to which push the `Trainer`."}
)
push_to_hub_organization: Optional[str] = field(
default=None, metadata={"help": "The name of the organization in with to which push the `Trainer`."}
)
push_to_hub_token: Optional[str] = field(
default=None, metadata={"help": "The token to use to push to the Model Hub."}
)
_n_gpu: int = field(init=False, repr=False, default=-1)
mp_parameters: str = field(
default="",
metadata={"help": "Used by the SageMaker launcher to send mp-specific args. Ignored in Trainer"},
)
auto_find_batch_size: bool = field(
default=False,
metadata={
"help": (
"Whether to automatically decrease the batch size in half and rerun the training loop again each time"
" a CUDA Out-of-Memory was reached"
)
},
)
full_determinism: bool = field(
default=False,
metadata={
"help": (
"Whether to call enable_full_determinism instead of set_seed for reproducibility in distributed"
" training. Important: this will negatively impact the performance, so only use it for debugging."
)
},
)
torchdynamo: Optional[str] = field(
default=None,
metadata={
"help": "This argument is deprecated, use `--torch_compile_backend` instead.",
},
)
ray_scope: Optional[str] = field(
default="last",
metadata={
"help": (
'The scope to use when doing hyperparameter search with Ray. By default, `"last"` will be used. Ray'
" will then use the last checkpoint of all trials, compare those, and select the best one. However,"
" other options are also available. See the Ray documentation"
" (https://docs.ray.io/en/latest/tune/api_docs/analysis.html"
"#ray.tune.ExperimentAnalysis.get_best_trial)"
" for more options."
)
},
)
ddp_timeout: Optional[int] = field(
default=1800,
metadata={
"help": "Overrides the default timeout for distributed training (value should be given in seconds)."
},
)
torch_compile: bool = field(
default=False, metadata={"help": "If set to `True`, the model will be wrapped in `torch.compile`."}
)
torch_compile_backend: Optional[str] = field(
default=None,
metadata={
"help": "Which backend to use with `torch.compile`, passing one will trigger a model compilation.",
},
)
torch_compile_mode: Optional[str] = field(
default=None,
metadata={
"help": "Which mode to use with `torch.compile`, passing one will trigger a model compilation.",
},
)
dispatch_batches: Optional[bool] = field(
default=None,
metadata={"help": "Deprecated. Pass {'dispatch_batches':VALUE} to `accelerator_config`."},
)
split_batches: Optional[bool] = field(
default=None,
metadata={"help": "Deprecated. Pass {'split_batches':True} to `accelerator_config`."},
)
include_tokens_per_second: Optional[bool] = field(
default=False,
metadata={"help": "If set to `True`, the speed metrics will include `tgs` (tokens per second per device)."},
)
include_num_input_tokens_seen: Optional[bool] = field(
default=False,
metadata={
"help": "If set to `True`, will track the number of input tokens seen throughout training. (May be slower in distributed training)"
},
)
neftune_noise_alpha: Optional[float] = field(
default=None,
metadata={
"help": "Activates neftune noise embeddings into the model. NEFTune has been proven to drastically improve model performances for instrcution fine-tuning. Check out the original paper here: https://arxiv.org/abs/2310.05914 and the original code here: https://github.com/neelsjain/NEFTune. Only supported for `PreTrainedModel` and `PeftModel` classes."
},
)
optim_target_modules: Union[None, str, List[str]] = field(
default=None,
metadata={
"help": "Target modules for the optimizer defined in the `optim` argument. Only used for the GaLore optimizer at the moment."
},
)
def __str__(self):
self_as_dict = asdict(self)
del self_as_dict["per_gpu_train_batch_size"]
del self_as_dict["per_gpu_eval_batch_size"]
self_as_dict = {k: f"<{k.upper()}>" if k.endswith("_token") else v for k, v in self_as_dict.items()}
attrs_as_str = [f"{k}={v},\n" for k, v in sorted(self_as_dict.items())]
return f"{self.__class__.__name__}(\n{''.join(attrs_as_str)})"
__repr__ = __str__
@property
def train_batch_size(self) -> int:
"""
The actual batch size for training (may differ from `per_gpu_train_batch_size` in distributed training).
"""
if self.per_gpu_train_batch_size:
logger.warning(
"Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future "
"version. Using `--per_device_train_batch_size` is preferred."
)
per_device_batch_size = self.per_gpu_train_batch_size or self.per_device_train_batch_size
train_batch_size = per_device_batch_size * max(1, self.n_gpu)
return train_batch_size
@property
def eval_batch_size(self) -> int:
"""
The actual batch size for evaluation (may differ from `per_gpu_eval_batch_size` in distributed training).
"""
if self.per_gpu_eval_batch_size:
logger.warning(
"Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future "
"version. Using `--per_device_eval_batch_size` is preferred."
)
per_device_batch_size = self.per_gpu_eval_batch_size or self.per_device_eval_batch_size
eval_batch_size = per_device_batch_size * max(1, self.n_gpu)
return eval_batch_size
@property
def ddp_timeout_delta(self) -> timedelta:
"""
The actual timeout for torch.distributed.init_process_group since it expects a timedelta variable.
"""
return timedelta(seconds=self.ddp_timeout)
@property
def device(self) -> "torch.device":
"""
The device used by this process.
"""
requires_backends(self, ["torch"])
return self._setup_devices
@property
def n_gpu(self):
"""
The number of GPUs used by this process.
Note:
This will only be greater than one when you have multiple GPUs available but are not using distributed
training. For distributed training, it will always be 1.
"""
requires_backends(self, ["torch"])
if not hasattr(self, "_n_gpu"):
_ = self._setup_devices
return self._n_gpu
@property
def parallel_mode(self):
"""
The current mode used for parallelism if multiple GPUs/TPU cores are available. One of:
- `ParallelMode.NOT_PARALLEL`: no parallelism (CPU or one GPU).
- `ParallelMode.NOT_DISTRIBUTED`: several GPUs in one single process (uses `torch.nn.DataParallel`).
- `ParallelMode.DISTRIBUTED`: several GPUs, each having its own process (uses
`torch.nn.DistributedDataParallel`).
- `ParallelMode.TPU`: several TPU cores.
"""
requires_backends(self, ["torch"])
if is_torch_xla_available():
return ParallelMode.TPU
elif is_sagemaker_mp_enabled():
return ParallelMode.SAGEMAKER_MODEL_PARALLEL
elif is_sagemaker_dp_enabled():
return ParallelMode.SAGEMAKER_DATA_PARALLEL
elif (
self.distributed_state is not None and self.distributed_state.distributed_type != DistributedType.NO
) or (self.distributed_state is None and self.local_rank != -1):
return ParallelMode.DISTRIBUTED
elif self.n_gpu > 1:
return ParallelMode.NOT_DISTRIBUTED
else:
return ParallelMode.NOT_PARALLEL
@property
def world_size(self):
"""
The number of processes used in parallel.
"""
requires_backends(self, ["torch"])
if self.distributed_state is not None:
return self.distributed_state.num_processes
elif is_sagemaker_mp_enabled():
return smp.dp_size() if not smp.state.cfg.prescaled_batch else smp.rdp_size()
return 1
@property
def process_index(self):
"""
The index of the current process used.
"""
requires_backends(self, ["torch"])
if self.distributed_state is not None:
return self.distributed_state.process_index
elif is_sagemaker_mp_enabled():
return smp.dp_rank() if not smp.state.cfg.prescaled_batch else smp.rdp_rank()
return 0
@property
def local_process_index(self):
"""
The index of the local process used.
"""
requires_backends(self, ["torch"])
if self.distributed_state is not None:
return self.distributed_state.local_process_index
elif is_sagemaker_mp_enabled():
return smp.local_rank()
return 0
@property
def should_log(self):
"""
Whether or not the current process should produce log.
"""
if self.log_on_each_node:
return self.local_process_index == 0
else:
if is_sagemaker_mp_enabled():
return smp.rank() == 0
else:
return self.process_index == 0
@property
def should_save(self):
"""
Whether or not the current process should write to disk, e.g., to save models and checkpoints.
"""
if self.save_on_each_node:
return self.local_process_index == 0
else:
if is_sagemaker_mp_enabled():
return smp.rank() == 0
else:
return self.process_index == 0
def get_process_log_level(self):
"""
Returns the log level to be used depending on whether this process is the main process of node 0, main process
of node non-0, or a non-main process.
For the main process the log level defaults to the logging level set (`logging.WARNING` if you didn't do
anything) unless overridden by `log_level` argument.
For the replica processes the log level defaults to `logging.WARNING` unless overridden by `log_level_replica`
argument.
The choice between the main and replica process settings is made according to the return value of `should_log`.
"""
log_level = trainer_log_levels[self.log_level]
log_level_replica = trainer_log_levels[self.log_level_replica]
log_level_main_node = logging.get_verbosity() if log_level == -1 else log_level
log_level_replica_node = logging.get_verbosity() if log_level_replica == -1 else log_level_replica
return log_level_main_node if self.should_log else log_level_replica_node
@property
def place_model_on_device(self):
"""
Can be subclassed and overridden for some specific integrations.
"""
return not is_sagemaker_mp_enabled()
@property
def _no_sync_in_gradient_accumulation(self):
"""
Whether or not to use no_sync for the gradients when doing gradient accumulation.
"""
return not (
self.deepspeed or is_sagemaker_dp_enabled() or is_sagemaker_mp_enabled() or is_torch_neuroncore_available()
)
@contextlib.contextmanager
def main_process_first(self, local=True, desc="work"):
"""
A context manager for torch distributed environments where one needs to do something on the main process, while
blocking replicas, and when it's finished releasing the replicas.
One such use is for `datasets`'s `map` feature which, to be efficient, should be run once on the main process,
which upon completion saves a cached version of the results and which then automatically gets loaded by the
replicas.
Args:
local (`bool`, *optional*, defaults to `True`):
if `True` first means process of rank 0 of each node if `False` first means process of rank 0 of node
rank 0 In multi-node environment with a shared filesystem you most likely will want to use
`local=False` so that only the main process of the first node will do the processing. If however, the
filesystem is not shared, then the main process of each node will need to do the processing, which is
the default behavior.
desc (`str`, *optional*, defaults to `"work"`):
a work description to be used in debug logs
"""
if is_torch_available() and self.world_size > 1:
main_process_desc = "main local process" if local else "main process"
if self.distributed_state is not None:
is_main_process = (
self.distributed_state.is_local_main_process if local else self.distributed_state.is_main_process
)
elif is_sagemaker_mp_enabled():
is_main_process = smp.rank() == 0
try:
if not is_main_process:
logger.debug(f"{self.process_index}: waiting for the {main_process_desc} to perform {desc}")
if is_torch_xla_available():
xm.rendezvous(desc)
else:
dist.barrier()
yield
finally:
if is_main_process:
logger.debug(f"{self.process_index}: {main_process_desc} completed {desc}, releasing all replicas")
if is_torch_xla_available():
xm.rendezvous(desc)
else:
dist.barrier()
else:
yield
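Usage sketch for `main_process_first`; the `raw_dataset` / `tokenize_fn` names in the comment are hypothetical stand-ins for whatever per-dataset work only needs to run once per node (or once per job with `local=False`):
```
from transformers import TrainingArguments

args = TrainingArguments(output_dir="tmp_dir")

# With world_size == 1 this is a no-op; under torchrun the main process runs the
# body first while the replicas wait at a barrier, then the replicas run it
# (typically hitting a cache written by the main process).
with args.main_process_first(desc="dataset map pre-processing"):
    # tokenized = raw_dataset.map(tokenize_fn)  # hypothetical dataset / function
    pass
```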
def get_warmup_steps(self, num_training_steps: int):
"""
Get number of steps used for a linear warmup.
"""
warmup_steps = (
self.warmup_steps if self.warmup_steps > 0 else math.ceil(num_training_steps * self.warmup_ratio)
)
return warmup_steps
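A small worked example of the fallback logic: with `warmup_steps` left at 0 the ratio is used, so 1,000 training steps and `warmup_ratio=0.1` give `ceil(0.1 * 1000) = 100` warmup steps, while any positive `warmup_steps` takes precedence over the ratio:
```
from transformers import TrainingArguments

args = TrainingArguments(output_dir="tmp_dir", warmup_ratio=0.1)
print(args.get_warmup_steps(1000))   # 100 == ceil(0.1 * 1000)

args = TrainingArguments(output_dir="tmp_dir", warmup_steps=50, warmup_ratio=0.1)
print(args.get_warmup_steps(1000))   # 50 -- an explicit warmup_steps wins
```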
def to_dict(self):
"""
Serializes this instance while replacing `Enum` members by their values (for JSON serialization support). It
obfuscates the token values by replacing them with a placeholder.
"""
d = {field.name: getattr(self, field.name) for field in fields(self) if field.init}
for k, v in d.items():
if isinstance(v, Enum):
d[k] = v.value
if isinstance(v, list) and len(v) > 0 and isinstance(v[0], Enum):
d[k] = [x.value for x in v]
if k.endswith("_token"):
d[k] = f"<{k.upper()}>"
if is_accelerate_available() and isinstance(v, AcceleratorConfig):
d[k] = v.to_dict()
return d
def to_json_string(self):
"""
Serializes this instance to a JSON string.
"""
return json.dumps(self.to_dict(), indent=2)
def to_sanitized_dict(self) -> Dict[str, Any]:
"""
Sanitized serialization to use with TensorBoard's hparams.
"""
d = self.to_dict()
d = {**d, **{"train_batch_size": self.train_batch_size, "eval_batch_size": self.eval_batch_size}}
valid_types = [bool, int, float, str]
if is_torch_available():
valid_types.append(torch.Tensor)
return {k: v if type(v) in valid_types else str(v) for k, v in d.items()}
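The interplay of these three serializers is easiest to see on a concrete instance; the `hub_token` value below is a dummy string used only to demonstrate the `*_token` obfuscation:
```
from transformers import TrainingArguments

# hub_token is a dummy secret, present only to show the placeholder substitution.
args = TrainingArguments(output_dir="tmp_dir", hub_token="dummy-secret")

d = args.to_dict()
print(d["hub_token"])          # '<HUB_TOKEN>' -- the secret is never serialized
print(d["lr_scheduler_type"])  # 'linear' -- the Enum has been replaced by its value

hparams = args.to_sanitized_dict()
print(hparams["train_batch_size"], hparams["eval_batch_size"])  # resolved batch sizes for hparams logging
```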
def set_training(
self,
learning_rate: float = 5e-5,
batch_size: int = 8,
weight_decay: float = 0,
num_epochs: float = 3,
max_steps: int = -1,
gradient_accumulation_steps: int = 1,
seed: int = 42,
gradient_checkpointing: bool = False,
):
"""
A method that regroups all basic arguments linked to the training.
<Tip>
Calling this method will automatically set `self.do_train` to `True`.
</Tip>
Args:
learning_rate (`float`, *optional*, defaults to 5e-5):
The initial learning rate for the optimizer.
batch_size (`int`, *optional*, defaults to 8):
The batch size per device (GPU/TPU core/CPU...) used for training.
weight_decay (`float`, *optional*, defaults to 0):
The weight decay to apply (if not zero) to all layers except all bias and LayerNorm weights in the
optimizer.
num_epochs (`float`, *optional*, defaults to 3):
Total number of training epochs to perform (if not an integer, will perform the decimal-part fraction of
the last epoch before stopping training).
max_steps (`int`, *optional*, defaults to -1):
If set to a positive number, the total number of training steps to perform. Overrides `num_train_epochs`.
For a finite dataset, training is reiterated through the dataset (if all data is exhausted) until
`max_steps` is reached.
gradient_accumulation_steps (`int`, *optional*, defaults to 1):
Number of update steps to accumulate the gradients for, before performing a backward/update pass.
<Tip warning={true}>
When using gradient accumulation, one step is counted as one step with a backward pass. Therefore,
logging, evaluation and saving will be conducted every `gradient_accumulation_steps * xxx_step` training
examples.
</Tip>
seed (`int`, *optional*, defaults to 42):
Random seed that will be set at the beginning of training. To ensure reproducibility across runs, use
the [`~Trainer.model_init`] function to instantiate the model if it has some randomly initialized
parameters.
gradient_checkpointing (`bool`, *optional*, defaults to `False`):
If `True`, use gradient checkpointing to save memory at the expense of a slower backward pass.
Example:
```
>>> from transformers import TrainingArguments
>>> args = TrainingArguments("working_dir")
>>> args = args.set_training(learning_rate=1e-4, batch_size=32)
>>> args.learning_rate
1e-4
```
"""
self.do_train = True
self.learning_rate = learning_rate
self.per_device_train_batch_size = batch_size
self.weight_decay = weight_decay
self.num_train_epochs = num_epochs
self.max_steps = max_steps
self.gradient_accumulation_steps = gradient_accumulation_steps
self.seed = seed
self.gradient_checkpointing = gradient_checkpointing
return self
def set_evaluate(
self,
strategy: Union[str, IntervalStrategy] = "no",
steps: int = 500,
batch_size: int = 8,
accumulation_steps: Optional[int] = None,
delay: Optional[float] = None,
loss_only: bool = False,
jit_mode: bool = False,
):
"""
A method that regroups all arguments linked to evaluation.
Args:
strategy (`str` or [`~trainer_utils.IntervalStrategy`], *optional*, defaults to `"no"`):
The evaluation strategy to adopt during training. Possible values are:
- `"no"`: No evaluation is done during training.
- `"steps"`: Evaluation is done (and logged) every `steps`.
- `"epoch"`: Evaluation is done at the end of each epoch.
Setting a `strategy` different from `"no"` will set `self.do_eval` to `True`.
steps (`int`, *optional*, defaults to 500):
Number of update steps between two evaluations if `strategy="steps"`.
batch_size (`int`, *optional*, defaults to 8):
The batch size per device (GPU/TPU core/CPU...) used for evaluation.
accumulation_steps (`int`, *optional*):
Number of prediction steps to accumulate the output tensors for, before moving the results to the CPU.
If left unset, the whole predictions are accumulated on GPU/TPU before being moved to the CPU (faster
but requires more memory).
delay (`float`, *optional*):
Number of epochs or steps to wait for before the first evaluation can be performed, depending on the
evaluation_strategy.
loss_only (`bool`, *optional*, defaults to `False`):
Ignores all outputs except the loss.
jit_mode (`bool`, *optional*):
Whether or not to use PyTorch jit trace for inference.
Example:
```
>>> from transformers import TrainingArguments
>>> args = TrainingArguments("working_dir")
>>> args = args.set_evaluate(strategy="steps", steps=100)
>>> args.eval_steps
100
```
"""
self.evaluation_strategy = IntervalStrategy(strategy)
if self.evaluation_strategy == IntervalStrategy.STEPS and steps == 0:
raise ValueError("Setting `strategy` as 'steps' requires a positive value for `steps`.")
self.do_eval = self.evaluation_strategy != IntervalStrategy.NO
self.eval_steps = steps
self.per_device_eval_batch_size = batch_size
self.eval_accumulation_steps = accumulation_steps
self.eval_delay = delay
self.prediction_loss_only = loss_only
self.jit_mode_eval = jit_mode
return self
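Because every `set_*` helper returns `self`, the configuration methods compose into a fluent chain; a short sketch with purely illustrative values:
```
from transformers import TrainingArguments

args = (
    TrainingArguments(output_dir="tmp_dir")
    .set_training(learning_rate=1e-4, batch_size=32, num_epochs=5)
    .set_evaluate(strategy="steps", steps=200, batch_size=64)
)
print(args.do_train, args.do_eval)          # True True
print(args.learning_rate, args.eval_steps)  # 0.0001 200
```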
def set_testing(
self,
batch_size: int = 8,
loss_only: bool = False,
jit_mode: bool = False,
):
"""
A method that regroups all basic arguments linked to testing on a held-out dataset.
<Tip>
Calling this method will automatically set `self.do_predict` to `True`.
</Tip>
Args:
batch_size (`int`, *optional*, defaults to 8):
The batch size per device (GPU/TPU core/CPU...) used for testing.
loss_only (`bool`, *optional*, defaults to `False`):
Ignores all outputs except the loss.
jit_mode (`bool`, *optional*):
Whether or not to use PyTorch jit trace for inference.
Example:
```
>>> from transformers import TrainingArguments
>>> args = TrainingArguments("working_dir")
>>> args = args.set_testing(batch_size=32)
>>> args.per_device_eval_batch_size
32
```
"""
self.do_predict = True
self.per_device_eval_batch_size = batch_size
self.prediction_loss_only = loss_only
self.jit_mode_eval = jit_mode
return self
def set_save(
self,
strategy: Union[str, IntervalStrategy] = "steps",
steps: int = 500,
total_limit: Optional[int] = None,
on_each_node: bool = False,
):
"""
A method that regroups all arguments linked to checkpoint saving.
Args:
strategy (`str` or [`~trainer_utils.IntervalStrategy`], *optional*, defaults to `"steps"`):
The checkpoint save strategy to adopt during training. Possible values are:
- `"no"`: No save is done during training.
- `"epoch"`: Save is done at the end of each epoch.
- `"steps"`: Save is done every `save_steps`.
steps (`int`, *optional*, defaults to 500):
Number of update steps between two checkpoint saves if `strategy="steps"`.
total_limit (`int`, *optional*):
If a value is passed, will limit the total number of checkpoints. Deletes the older checkpoints in
`output_dir`.
on_each_node (`bool`, *optional*, defaults to `False`):
When doing multi-node distributed training, whether to save models and checkpoints on each node, or
only on the main one.
This should not be activated when the different nodes use the same storage as the files will be saved
with the same names for each node.
Example:
```
>>> from transformers import TrainingArguments
>>> args = TrainingArguments("working_dir")
>>> args = args.set_save(strategy="steps", steps=100)
>>> args.save_steps
100
```
"""
self.save_strategy = IntervalStrategy(strategy)
if self.save_strategy == IntervalStrategy.STEPS and steps == 0:
raise ValueError("Setting `strategy` as 'steps' requires a positive value for `steps`.")
self.save_steps = steps
self.save_total_limit = total_limit
self.save_on_each_node = on_each_node
return self
def set_logging(
self,
strategy: Union[str, IntervalStrategy] = "steps",
steps: int = 500,
report_to: Union[str, List[str]] = "none",
level: str = "passive",
first_step: bool = False,
nan_inf_filter: bool = False,
on_each_node: bool = False,
replica_level: str = "passive",
def set_push_to_hub(
self,
model_id: str,
strategy: Union[str, HubStrategy] = "every_save",
token: Optional[str] = None,
private_repo: bool = False,
always_push: bool = False,
):
...  # docstring and body omitted in this excerpt
def set_optimizer(
self,
name: Union[str, OptimizerNames] = "adamw_torch",
learning_rate: float = 5e-5,
weight_decay: float = 0,
beta1: float = 0.9,
beta2: float = 0.999,
epsilon: float = 1e-8,
args: Optional[str] = None,
):
"""
A method that regroups all arguments linked to the optimizer and its hyperparameters.
Args:
name (`str` or [`training_args.OptimizerNames`], *optional*, defaults to `"adamw_torch"`):
The optimizer to use: `"adamw_hf"`, `"adamw_torch"`, `"adamw_torch_fused"`, `"adamw_apex_fused"`,
`"adamw_anyprecision"` or `"adafactor"`.
learning_rate (`float`, *optional*, defaults to 5e-5):
The initial learning rate.
weight_decay (`float`, *optional*, defaults to 0):
The weight decay to apply (if not zero) to all layers except all bias and LayerNorm weights.
beta1 (`float`, *optional*, defaults to 0.9):
The beta1 hyperparameter for the adam optimizer or its variants.
beta2 (`float`, *optional*, defaults to 0.999):
The beta2 hyperparameter for the adam optimizer or its variants.
epsilon (`float`, *optional*, defaults to 1e-8):
The epsilon hyperparameter for the adam optimizer or its variants.
args (`str`, *optional*):
Optional arguments that are supplied to AnyPrecisionAdamW (only useful when
`optim="adamw_anyprecision"`).
Example:
```
>>> from transformers import TrainingArguments
>>> args = TrainingArguments("working_dir")
>>> args = args.set_optimizer(name="adamw_torch", beta1=0.8)
>>> args.optim
'adamw_torch'
```
"""
self.optim = OptimizerNames(name)
self.learning_rate = learning_rate
self.weight_decay = weight_decay
self.adam_beta1 = beta1
self.adam_beta2 = beta2
self.adam_epsilon = epsilon
self.optim_args = args
return self
def set_lr_scheduler(
self,
name: Union[str, SchedulerType] = "linear",
num_epochs: float = 3.0,
max_steps: int = -1,
warmup_ratio: float = 0,
warmup_steps: int = 0,
):
"""
A method that regroups all arguments linked to the learning rate scheduler and its hyperparameters.
Args:
name (`str` or [`SchedulerType`], *optional*, defaults to `"linear"`):
The scheduler type to use. See the documentation of [`SchedulerType`] for all possible values.
num_epochs (`float`, *optional*, defaults to 3.0):
Total number of training epochs to perform (if not an integer, will perform the decimal-part fraction
of the last epoch before stopping training).
max_steps (`int`, *optional*, defaults to -1):
If set to a positive number, the total number of training steps to perform. Overrides `num_train_epochs`.
For a finite dataset, training is reiterated through the dataset (if all data is exhausted) until
`max_steps` is reached.
warmup_ratio (`float`, *optional*, defaults to 0.0):
Ratio of total training steps used for a linear warmup from 0 to `learning_rate`.
warmup_steps (`int`, *optional*, defaults to 0):
Number of steps used for a linear warmup from 0 to `learning_rate`. Overrides any effect of
`warmup_ratio`.
Example:
```
>>> from transformers import TrainingArguments
>>> args = TrainingArguments("working_dir")
>>> args = args.set_lr_scheduler(name="cosine", warmup_ratio=0.05)
>>> args.warmup_ratio
0.05
```
"""
self.lr_scheduler_type = SchedulerType(name)
self.num_train_epochs = num_epochs
self.max_steps = max_steps
self.warmup_ratio = warmup_ratio
self.warmup_steps = warmup_steps
return self
def set_dataloader(
self,
train_batch_size: int = 8,
eval_batch_size: int = 8,
drop_last: bool = False,
num_workers: int = 0,
pin_memory: bool = True,
persistent_workers: bool = False,
prefetch_factor: Optional[int] = None,
auto_find_batch_size: bool = False,
ignore_data_skip: bool = False,
sampler_seed: Optional[int] = None,
):
...  # docstring and body omitted in this excerpt
class ParallelMode(Enum):
NOT_PARALLEL = "not_parallel"
NOT_DISTRIBUTED = "not_distributed"
DISTRIBUTED = "distributed"
SAGEMAKER_MODEL_PARALLEL = "sagemaker_model_parallel"
SAGEMAKER_DATA_PARALLEL = "sagemaker_data_parallel"
TPU = "tpu"