Transformers 源码解析（十二）

`.\models\auto\init.py`

# 引入类型检查标记，用于条件检查时的类型提示
from typing import TYPE_CHECKING

# 从 utils 模块中导入所需内容
from ...utils import (
    OptionalDependencyNotAvailable,
    _LazyModule,
    is_flax_available,
    is_tf_available,
    is_torch_available,
)

# Maps each submodule name to the list of public names it exports. The entries
# below are registered unconditionally; backend-specific submodules are added
# further down, guarded by availability checks.
_import_structure = {
    "auto_factory": ["get_values"],
    "configuration_auto": ["ALL_PRETRAINED_CONFIG_ARCHIVE_MAP", "CONFIG_MAPPING", "MODEL_NAMES_MAPPING", "AutoConfig"],
    "feature_extraction_auto": ["FEATURE_EXTRACTOR_MAPPING", "AutoFeatureExtractor"],
    "image_processing_auto": ["IMAGE_PROCESSOR_MAPPING", "AutoImageProcessor"],
    "processing_auto": ["PROCESSOR_MAPPING", "AutoProcessor"],
    "tokenization_auto": ["TOKENIZER_MAPPING", "AutoTokenizer"],
}

# Torch guard: raise OptionalDependencyNotAvailable when torch is missing and
# swallow it immediately, so a missing backend is simply skipped.
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass  # torch is not installed: register nothing
else:
    # NOTE(review): this branch is a no-op in this excerpt. Presumably the torch
    # entries for `_import_structure` were elided — confirm against upstream.
    pass

# TensorFlow guard: same pattern as above. When TensorFlow is available, the
# names exported by `modeling_tf_auto` are registered in `_import_structure`.
try:
    if not is_tf_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass  # TensorFlow is not installed: register nothing
else:
    # NOTE(review): this `pass` is redundant because the assignment below
    # already forms the body of the `else` branch.
    pass
    # The list holds the `TF_MODEL_*_MAPPING` names followed by the
    # `TFAutoModel*` class names.
    _import_structure["modeling_tf_auto"] = [
        "TF_MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING",
        "TF_MODEL_FOR_CAUSAL_LM_MAPPING",
        "TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING",
        "TF_MODEL_FOR_MASK_GENERATION_MAPPING",
        "TF_MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING",
        "TF_MODEL_FOR_MASKED_LM_MAPPING",
        "TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING",
        "TF_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING",
        "TF_MODEL_FOR_PRETRAINING_MAPPING",
        "TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING",
        "TF_MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING",
        "TF_MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING",
        "TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING",
        "TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING",
        "TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING",
        "TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING",
        "TF_MODEL_FOR_TEXT_ENCODING_MAPPING",
        "TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING",
        "TF_MODEL_FOR_VISION_2_SEQ_MAPPING",
        "TF_MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING",
        "TF_MODEL_MAPPING",
        "TF_MODEL_WITH_LM_HEAD_MAPPING",
        "TFAutoModel",
        "TFAutoModelForAudioClassification",
        "TFAutoModelForCausalLM",
        "TFAutoModelForImageClassification",
        "TFAutoModelForMaskedImageModeling",
        "TFAutoModelForMaskedLM",
        "TFAutoModelForMaskGeneration",
        "TFAutoModelForMultipleChoice",
        "TFAutoModelForNextSentencePrediction",
        "TFAutoModelForPreTraining",
        "TFAutoModelForDocumentQuestionAnswering",
        "TFAutoModelForQuestionAnswering",
        "TFAutoModelForSemanticSegmentation",
        "TFAutoModelForSeq2SeqLM",
        "TFAutoModelForSequenceClassification",
        "TFAutoModelForSpeechSeq2Seq",
        "TFAutoModelForTableQuestionAnswering",
        "TFAutoModelForTextEncoding",
        "TFAutoModelForTokenClassification",
        "TFAutoModelForVision2Seq",
        "TFAutoModelForZeroShotImageClassification",
        "TFAutoModelWithLMHead",
    ]
# Flax guard: when Flax is available, register the names exported by
# `modeling_flax_auto`; otherwise register nothing.
try:
    # Signal a missing Flax install through OptionalDependencyNotAvailable.
    if not is_flax_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    # Flax is not installed: skip the Flax entries.
    pass
else:
    # The list holds the `FLAX_MODEL_*_MAPPING` names followed by the
    # `FlaxAutoModel*` class names.
    _import_structure["modeling_flax_auto"] = [
        "FLAX_MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING",
        "FLAX_MODEL_FOR_CAUSAL_LM_MAPPING",
        "FLAX_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING",
        "FLAX_MODEL_FOR_MASKED_LM_MAPPING",
        "FLAX_MODEL_FOR_MULTIPLE_CHOICE_MAPPING",
        "FLAX_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING",
        "FLAX_MODEL_FOR_PRETRAINING_MAPPING",
        "FLAX_MODEL_FOR_QUESTION_ANSWERING_MAPPING",
        "FLAX_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING",
        "FLAX_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING",
        "FLAX_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING",
        "FLAX_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING",
        "FLAX_MODEL_FOR_VISION_2_SEQ_MAPPING",
        "FLAX_MODEL_MAPPING",
        "FlaxAutoModel",
        "FlaxAutoModelForCausalLM",
        "FlaxAutoModelForImageClassification",
        "FlaxAutoModelForMaskedLM",
        "FlaxAutoModelForMultipleChoice",
        "FlaxAutoModelForNextSentencePrediction",
        "FlaxAutoModelForPreTraining",
        "FlaxAutoModelForQuestionAnswering",
        "FlaxAutoModelForSeq2SeqLM",
        "FlaxAutoModelForSequenceClassification",
        "FlaxAutoModelForSpeechSeq2Seq",
        "FlaxAutoModelForTokenClassification",
        "FlaxAutoModelForVision2Seq",
    ]

# Under TYPE_CHECKING the names are imported eagerly, mirroring
# `_import_structure`. Otherwise the module object registered in `sys.modules`
# is replaced by a `_LazyModule` built from that structure.
if TYPE_CHECKING:
    # Backend-independent names.
    from .auto_factory import get_values
    from .configuration_auto import ALL_PRETRAINED_CONFIG_ARCHIVE_MAP, CONFIG_MAPPING, MODEL_NAMES_MAPPING, AutoConfig
    from .feature_extraction_auto import FEATURE_EXTRACTOR_MAPPING, AutoFeatureExtractor
    from .image_processing_auto import IMAGE_PROCESSOR_MAPPING, AutoImageProcessor
    from .processing_auto import PROCESSOR_MAPPING, AutoProcessor
    from .tokenization_auto import TOKENIZER_MAPPING, AutoTokenizer

    try:
        # Torch guard.
        # NOTE(review): there is no `else` branch here, so nothing torch-specific
        # is imported. Presumably the imports were elided from this excerpt —
        # confirm against upstream.
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        # torch is not installed: nothing to import.
        pass

    try:
        # TensorFlow guard.
        if not is_tf_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        # TensorFlow is not installed: skip the TF imports.
        pass
    # TensorFlow is available: import the TF mappings and auto classes.
    else:
        # These names match the "modeling_tf_auto" entry of `_import_structure`.
        from .modeling_tf_auto import (
            TF_MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING,
            TF_MODEL_FOR_CAUSAL_LM_MAPPING,
            TF_MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING,
            TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
            TF_MODEL_FOR_MASK_GENERATION_MAPPING,
            TF_MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING,
            TF_MODEL_FOR_MASKED_LM_MAPPING,
            TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING,
            TF_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING,
            TF_MODEL_FOR_PRETRAINING_MAPPING,
            TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING,
            TF_MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING,
            TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
            TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
            TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING,
            TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING,
            TF_MODEL_FOR_TEXT_ENCODING_MAPPING,
            TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
            TF_MODEL_FOR_VISION_2_SEQ_MAPPING,
            TF_MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING,
            TF_MODEL_MAPPING,
            TF_MODEL_WITH_LM_HEAD_MAPPING,
            TFAutoModel,
            TFAutoModelForAudioClassification,
            TFAutoModelForCausalLM,
            TFAutoModelForDocumentQuestionAnswering,
            TFAutoModelForImageClassification,
            TFAutoModelForMaskedImageModeling,
            TFAutoModelForMaskedLM,
            TFAutoModelForMaskGeneration,
            TFAutoModelForMultipleChoice,
            TFAutoModelForNextSentencePrediction,
            TFAutoModelForPreTraining,
            TFAutoModelForQuestionAnswering,
            TFAutoModelForSemanticSegmentation,
            TFAutoModelForSeq2SeqLM,
            TFAutoModelForSequenceClassification,
            TFAutoModelForSpeechSeq2Seq,
            TFAutoModelForTableQuestionAnswering,
            TFAutoModelForTextEncoding,
            TFAutoModelForTokenClassification,
            TFAutoModelForVision2Seq,
            TFAutoModelForZeroShotImageClassification,
            TFAutoModelWithLMHead,
        )

    # Flax guard.
    try:
        if not is_flax_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    # Flax is available: import the Flax mappings and auto classes.
    else:
        # These names match the "modeling_flax_auto" entry of `_import_structure`.
        from .modeling_flax_auto import (
            FLAX_MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING,
            FLAX_MODEL_FOR_CAUSAL_LM_MAPPING,
            FLAX_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
            FLAX_MODEL_FOR_MASKED_LM_MAPPING,
            FLAX_MODEL_FOR_MULTIPLE_CHOICE_MAPPING,
            FLAX_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING,
            FLAX_MODEL_FOR_PRETRAINING_MAPPING,
            FLAX_MODEL_FOR_QUESTION_ANSWERING_MAPPING,
            FLAX_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
            FLAX_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
            FLAX_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING,
            FLAX_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
            FLAX_MODEL_FOR_VISION_2_SEQ_MAPPING,
            FLAX_MODEL_MAPPING,
            FlaxAutoModel,
            FlaxAutoModelForCausalLM,
            FlaxAutoModelForImageClassification,
            FlaxAutoModelForMaskedLM,
            FlaxAutoModelForMultipleChoice,
            FlaxAutoModelForNextSentencePrediction,
            FlaxAutoModelForPreTraining,
            FlaxAutoModelForQuestionAnswering,
            FlaxAutoModelForSeq2SeqLM,
            FlaxAutoModelForSequenceClassification,
            FlaxAutoModelForSpeechSeq2Seq,
            FlaxAutoModelForTokenClassification,
            FlaxAutoModelForVision2Seq,
        )
else:
    # `sys` is only needed on this branch, hence the local import.
    import sys

    # Replace this module's entry in `sys.modules` with a `_LazyModule` built from
    # the module name, its file path, `_import_structure` and the module spec.
    # NOTE(review): presumably this defers the submodule imports until first
    # attribute access — confirm against `_LazyModule`.
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)

`.\models\autoformer\configuration_autoformer.py`

# 设置编码格式为 UTF-8

# 版权声明，声明此代码的版权归 HuggingFace Inc. 团队所有，保留所有权利。
# 根据 Apache License, Version 2.0 许可证使用本文件。您可以在符合许可证的情况下使用此文件，
# 您可以获取许可证的副本，具体网址在 http://www.apache.org/licenses/LICENSE-2.0
# 除非适用法律要求或书面同意，否则本软件根据“原样”分发，无任何明示或暗示的担保或条件。
# 有关更多信息，请参见许可证文档。

""" Autoformer model configuration"""

# 引入必要的模块
from typing import List, Optional
from ...configuration_utils import PretrainedConfig
from ...utils import logging

# 获取日志记录器
logger = logging.get_logger(__name__)

# Maps a pretrained checkpoint identifier to the URL of its `config.json`.
AUTOFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "huggingface/autoformer-tourism-monthly": "https://huggingface.co/huggingface/autoformer-tourism-monthly/resolve/main/config.json",
}

# Configuration class for Autoformer models; derives from `PretrainedConfig`.
class AutoformerConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of an [`AutoformerModel`]. It is used to instantiate an
    Autoformer model according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the Autoformer
    [huggingface/autoformer-tourism-monthly](https://huggingface.co/huggingface/autoformer-tourism-monthly)
    architecture.

    Configuration objects inherit from [`PretrainedConfig`] can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        prediction_length (`int`, *optional*):
            The prediction length.
        context_length (`int`, *optional*):
            The context length. Falls back to `prediction_length` when `None`.
        distribution_output (`str`, *optional*, defaults to `"student_t"`):
            The distribution output type.
        loss (`str`, *optional*, defaults to `"nll"`):
            The loss function type.
        input_size (`int`, *optional*, defaults to 1):
            The dimension of the input data.
        lags_sequence (`List[int]`, *optional*, defaults to `[1, 2, 3, 4, 5, 6, 7]`):
            The lag sequence. A copy is stored on the instance.
        scaling (`bool`, *optional*, defaults to `True`):
            Whether to scale the data.
        num_time_features (`int`, *optional*, defaults to 0):
            The number of time features.
        num_dynamic_real_features (`int`, *optional*, defaults to 0):
            The number of dynamic real-valued features.
        num_static_categorical_features (`int`, *optional*, defaults to 0):
            The number of static categorical features.
        num_static_real_features (`int`, *optional*, defaults to 0):
            The number of static real-valued features.
        cardinality (`List[int]`, *optional*):
            The cardinality of each static categorical feature. Must have length `num_static_categorical_features`
            when given together with a positive `num_static_categorical_features`; otherwise `[0]` is stored.
        embedding_dimension (`List[int]`, *optional*):
            The embedding dimension of each static categorical feature. Must have length
            `num_static_categorical_features` when given together with a positive `num_static_categorical_features`;
            otherwise `min(50, (cat + 1) // 2)` is used for every entry `cat` of the stored cardinality.
        d_model (`int`, *optional*, defaults to 64):
            The dimension of the model.
        encoder_attention_heads (`int`, *optional*, defaults to 2):
            The number of encoder attention heads.
        decoder_attention_heads (`int`, *optional*, defaults to 2):
            The number of decoder attention heads.
        encoder_layers (`int`, *optional*, defaults to 2):
            The number of encoder layers.
        decoder_layers (`int`, *optional*, defaults to 2):
            The number of decoder layers.
        encoder_ffn_dim (`int`, *optional*, defaults to 32):
            The dimension of the encoder feed-forward layer.
        decoder_ffn_dim (`int`, *optional*, defaults to 32):
            The dimension of the decoder feed-forward layer.
        activation_function (`str`, *optional*, defaults to `"gelu"`):
            The activation function type.
        dropout (`float`, *optional*, defaults to 0.1):
            The general dropout ratio.
        encoder_layerdrop (`float`, *optional*, defaults to 0.1):
            The encoder layer-drop ratio.
        decoder_layerdrop (`float`, *optional*, defaults to 0.1):
            The decoder layer-drop ratio.
        attention_dropout (`float`, *optional*, defaults to 0.1):
            The dropout ratio of the attention mechanism.
        activation_dropout (`float`, *optional*, defaults to 0.1):
            The dropout ratio applied with the activation function.
        num_parallel_samples (`int`, *optional*, defaults to 100):
            The number of parallel samples.
        init_std (`float`, *optional*, defaults to 0.02):
            The initialization standard deviation.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether to use the cache.
        is_encoder_decoder (`bool`, *optional*, defaults to `True`):
            Whether the model is an encoder-decoder; forwarded to `PretrainedConfig.__init__`.
        label_length (`int`, *optional*, defaults to 10):
            The label length.
        moving_average (`int`, *optional*, defaults to 25):
            The moving-average window size.
        autocorrelation_factor (`int`, *optional*, defaults to 3):
            The autocorrelation factor.
        **kwargs:
            Any remaining keyword arguments; forwarded to `PretrainedConfig.__init__`.

    Raises:
        ValueError: If `cardinality` or `embedding_dimension` is given with a positive
            `num_static_categorical_features` but has a different length.

    Example:

    ```python
    >>> from transformers import AutoformerConfig, AutoformerModel

    >>> # Initializing a default Autoformer configuration
    >>> configuration = AutoformerConfig()

    >>> # Randomly initializing a model (with random weights) from the configuration
    >>> model = AutoformerModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    """

    model_type = "autoformer"

    # Maps generic attribute names to the Autoformer-specific names stored on the config.
    attribute_map = {
        "hidden_size": "d_model",
        "num_attention_heads": "encoder_attention_heads",
        "num_hidden_layers": "encoder_layers",
    }

    def __init__(
        self,
        prediction_length: Optional[int] = None,
        context_length: Optional[int] = None,
        distribution_output: str = "student_t",
        loss: str = "nll",
        input_size: int = 1,
        lags_sequence: List[int] = [1, 2, 3, 4, 5, 6, 7],
        scaling: bool = True,
        num_time_features: int = 0,
        num_dynamic_real_features: int = 0,
        num_static_categorical_features: int = 0,
        num_static_real_features: int = 0,
        cardinality: Optional[List[int]] = None,
        embedding_dimension: Optional[List[int]] = None,
        d_model: int = 64,
        encoder_attention_heads: int = 2,
        decoder_attention_heads: int = 2,
        encoder_layers: int = 2,
        decoder_layers: int = 2,
        encoder_ffn_dim: int = 32,
        decoder_ffn_dim: int = 32,
        activation_function: str = "gelu",
        dropout: float = 0.1,
        encoder_layerdrop: float = 0.1,
        decoder_layerdrop: float = 0.1,
        attention_dropout: float = 0.1,
        activation_dropout: float = 0.1,
        num_parallel_samples: int = 100,
        init_std: float = 0.02,
        use_cache: bool = True,
        is_encoder_decoder=True,
        # Autoformer arguments
        label_length: int = 10,
        moving_average: int = 25,
        autocorrelation_factor: int = 3,
        **kwargs,
    ):
        # Time-series specific configuration
        self.prediction_length = prediction_length
        self.context_length = context_length if context_length is not None else prediction_length
        self.distribution_output = distribution_output
        self.loss = loss
        self.input_size = input_size
        self.num_time_features = num_time_features
        # Copy so that instances never share (and cannot mutate) the default list object.
        self.lags_sequence = list(lags_sequence)
        self.scaling = scaling
        self.num_dynamic_real_features = num_dynamic_real_features
        self.num_static_real_features = num_static_real_features
        self.num_static_categorical_features = num_static_categorical_features

        if cardinality is not None and num_static_categorical_features > 0:
            if len(cardinality) != num_static_categorical_features:
                raise ValueError(
                    "The cardinality should be a list of the same length as `num_static_categorical_features`"
                )
            self.cardinality = cardinality
        else:
            # Placeholder used when there are no static categorical features.
            self.cardinality = [0]

        if embedding_dimension is not None and num_static_categorical_features > 0:
            if len(embedding_dimension) != num_static_categorical_features:
                raise ValueError(
                    "The embedding dimension should be a list of the same length as `num_static_categorical_features`"
                )
            self.embedding_dimension = embedding_dimension
        else:
            # Default: half the (cardinality + 1), capped at 50, per categorical feature.
            self.embedding_dimension = [min(50, (cat + 1) // 2) for cat in self.cardinality]
        self.num_parallel_samples = num_parallel_samples

        # Transformer architecture configuration.
        # `_number_of_features` is a property; it reads attributes assigned above.
        self.feature_size = input_size * len(self.lags_sequence) + self._number_of_features
        self.d_model = d_model
        self.encoder_attention_heads = encoder_attention_heads
        self.decoder_attention_heads = decoder_attention_heads
        self.encoder_ffn_dim = encoder_ffn_dim
        self.decoder_ffn_dim = decoder_ffn_dim
        self.encoder_layers = encoder_layers
        self.decoder_layers = decoder_layers

        self.dropout = dropout
        self.attention_dropout = attention_dropout
        self.activation_dropout = activation_dropout
        self.encoder_layerdrop = encoder_layerdrop
        self.decoder_layerdrop = decoder_layerdrop

        self.activation_function = activation_function
        self.init_std = init_std

        self.use_cache = use_cache

        # Autoformer
        self.label_length = label_length
        self.moving_average = moving_average
        self.autocorrelation_factor = autocorrelation_factor

        super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs)

    @property
    def _number_of_features(self) -> int:
        """Return the number of extra features added on top of the lagged inputs."""
        return (
            sum(self.embedding_dimension)
            + self.num_dynamic_real_features
            + self.num_time_features
            + self.num_static_real_features
            + self.input_size * 2  # the log1p(abs(loc)) and log(scale) features
        )

`.\models\autoformer\modeling_autoformer.py`

# 设置编码格式为 UTF-8，确保代码中可以正确处理各种字符
# 版权声明，这些代码的版权归清华大学 THUML、亚马逊公司及其关联公司以及HuggingFace团队所有
# 根据 Apache 许可证 2.0 版本，你可以在遵守许可证的情况下使用这些代码
# 访问 http://www.apache.org/licenses/LICENSE-2.0 查看许可证的详细信息

""" PyTorch Autoformer model. """

# 导入必要的库和模块
import math  # 导入数学库
from dataclasses import dataclass  # 导入数据类
from typing import List, Optional, Tuple, Union  # 导入类型提示相关模块

import numpy as np  # 导入 NumPy 库
import torch  # 导入 PyTorch 库
import torch.utils.checkpoint  # 导入 PyTorch 中的 checkpoint 功能
from torch import nn  # 从 PyTorch 中导入神经网络模块

# 导入额外的自定义模块和函数
from ...activations import ACT2FN  # 从模型中导入激活函数 ACT2FN
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask  # 导入注意力掩码相关函数
from ...modeling_outputs import (  # 导入模型输出相关类
    BaseModelOutput,
    ModelOutput,
    SampleTSPredictionOutput,
    Seq2SeqTSPredictionOutput,
)
from ...modeling_utils import PreTrainedModel  # 导入预训练模型相关工具函数
from ...time_series_utils import (  # 导入时间序列相关输出
    NegativeBinomialOutput,
    NormalOutput,
    StudentTOutput,
)
from ...utils import (  # 导入通用工具函数
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
from .configuration_autoformer import AutoformerConfig  # 导入 Autoformer 的配置文件

# 获取 logger 对象用于记录日志
logger = logging.get_logger(__name__)

# 用于文档的配置名称
_CONFIG_FOR_DOC = "AutoformerConfig"


@dataclass
class AutoFormerDecoderOutput(ModelOutput):
    """
    Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Hidden states at the output of the last layer of the model.
        trend (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Trend tensor for each time series.
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Pre-computed keys and values: a tuple of length `config.n_layers`, each entry being a tuple of 2 tensors
            of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)` and, if
            `config.is_encoder_decoder=True`, 2 additional tensors of shape
            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of tensors of shape `(batch_size, sequence_length, hidden_size)`: one for the output of each layer,
            plus one for the embedding output if the model has an embedding layer.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple with one attention-weights tensor per layer, of shape
            `(batch_size, num_heads, sequence_length, sequence_length)`.
        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`):
            Tuple with one cross-attention-weights tensor per layer, of shape
            `(batch_size, num_heads, sequence_length, sequence_length)`.
    """

    # Each field is declared exactly once; the declaration order defines the dataclass field order.
    last_hidden_state: torch.FloatTensor = None
    trend: torch.FloatTensor = None
    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
    cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
# Dataclass holding the outputs of the Autoformer model: decoder-side tensors
# (including the extra trend output), encoder-side tensors, the loc/scale
# values and the static features.
@dataclass
class AutoformerModelOutput(ModelOutput):
    """
    Autoformer model output that contains the additional trend output.
    """

    last_hidden_state: torch.FloatTensor = None  # last hidden state
    trend: torch.FloatTensor = None  # trend output
    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None  # past key/value tensors
    decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None  # decoder hidden states
    decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None  # decoder attention weights
    cross_attentions: Optional[Tuple[torch.FloatTensor]] = None  # cross-attention weights
    encoder_last_hidden_state: Optional[torch.FloatTensor] = None  # last encoder hidden state
    encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None  # encoder hidden states
    encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None  # encoder attention weights
    loc: Optional[torch.FloatTensor] = None  # location ("loc") values
    scale: Optional[torch.FloatTensor] = None  # scale values
    static_features: Optional[torch.FloatTensor] = None  # static features


AUTOFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "huggingface/autoformer-tourism-monthly",
    # See all Autoformer models at https://huggingface.co/models?filter=autoformer
]


# 从transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesFeatureEmbedder复制而来，
# 更名为AutoformerFeatureEmbedder，用于嵌入序列的分类特征
class AutoformerFeatureEmbedder(nn.Module):
    """
    Embed a sequence of categorical features.

    Args:
        cardinalities (`list[int]`):
            List of cardinalities of the categorical features.
        embedding_dims (`list[int]`):
            List of embedding dimensions of the categorical features.
    """

    def __init__(self, cardinalities: List[int], embedding_dims: List[int]) -> None:
        super().__init__()

        self.num_features = len(cardinalities)

        # One embedding table per categorical feature, created in input order.
        tables = []
        for cardinality, embedding_dim in zip(cardinalities, embedding_dims):
            tables.append(nn.Embedding(cardinality, embedding_dim))
        self.embedders = nn.ModuleList(tables)

    def forward(self, features: torch.Tensor) -> torch.Tensor:
        """Embed each categorical feature and concatenate the results along the last dimension."""
        # With a single feature the tensor is used as is; otherwise it is split
        # into `num_features` chunks along the last dimension.
        if self.num_features <= 1:
            feature_chunks = [features]
        else:
            feature_chunks = torch.chunk(features, self.num_features, dim=-1)

        embedded_chunks = []
        for embedder, chunk in zip(self.embedders, feature_chunks):
            # Drop the trailing size-1 dimension before the embedding lookup.
            embedded_chunks.append(embedder(chunk.squeeze(-1)))

        return torch.cat(embedded_chunks, dim=-1)


# 从transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesStdScaler复制而来，
# 更名为AutoformerStdScaler，用于标准化特征
class AutoformerStdScaler(nn.Module):
    """
    Standardize features: compute the mean and standard deviation over the observed values along `dim`, then
    subtract the mean from the data and divide by the standard deviation.
    """

    def __init__(self, config: AutoformerConfig):
        super().__init__()
        # Optional config attributes; fixed defaults apply when they are absent.
        self.dim = getattr(config, "scaling_dim", 1)
        self.keepdim = getattr(config, "keepdim", True)
        self.minimum_scale = getattr(config, "minimum_scale", 1e-5)

    def forward(
        self, data: torch.Tensor, observed_indicator: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Parameters:
            data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`):
                input for Batch norm calculation
            observed_indicator (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`):
                Calculating the scale on the observed indicator.
        Returns:
            tuple of `torch.Tensor` of shapes
                (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`,
                `(batch_size, 1, num_input_channels)`)
        """
        # Number of observed entries along `dim`, floored at 1 to avoid dividing by zero.
        num_observed = observed_indicator.sum(self.dim, keepdim=self.keepdim).clamp_min(1.0)

        # Mean over the observed entries.
        observed_total = (data * observed_indicator).sum(self.dim, keepdim=self.keepdim)
        loc = observed_total / num_observed

        # Variance over the observed entries; `minimum_scale` is added before the square root.
        squared_deviation = ((data - loc) * observed_indicator) ** 2
        variance = squared_deviation.sum(self.dim, keepdim=self.keepdim) / num_observed
        scale = torch.sqrt(variance + self.minimum_scale)

        standardized = (data - loc) / scale
        return standardized, loc, scale
# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesMeanScaler with TimeSeriesTransformer->Autoformer,TimeSeries->Autoformer
class AutoformerMeanScaler(nn.Module):
    """
    Compute a scaling factor as the weighted average absolute value along `dim` and scale the data by it.
    """

    def __init__(self, config: AutoformerConfig):
        super().__init__()
        # Optional config attributes; fixed defaults apply when they are absent.
        self.dim = getattr(config, "scaling_dim", 1)
        self.keepdim = getattr(config, "keepdim", True)
        self.minimum_scale = getattr(config, "minimum_scale", 1e-10)
        self.default_scale = getattr(config, "default_scale", None)

    def forward(
        self, data: torch.Tensor, observed_indicator: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Parameters:
            data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`):
                input for Batch norm calculation
            observed_indicator (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`):
                Calculating the scale on the observed indicator.
        Returns:
            tuple of `torch.Tensor` of shapes
                (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`,
                `(batch_size, 1, num_input_channels)`)
        """
        # Sum of absolute observed values and count of observed entries along `dim`.
        abs_total = (data * observed_indicator).abs().sum(self.dim, keepdim=True)
        observed_count = observed_indicator.sum(self.dim, keepdim=True)

        # Mean absolute value; the count is floored at 1 to avoid dividing by zero.
        scale = abs_total / torch.clamp(observed_count, min=1)

        # Fallback scale for positions with no observations: the configured
        # `default_scale` when set, otherwise a value aggregated over dimension 0.
        if self.default_scale is not None:
            fallback_scale = self.default_scale * torch.ones_like(scale)
        else:
            batch_total = abs_total.sum(dim=0)
            batch_count = torch.clamp(observed_count.sum(0), min=1)
            fallback_scale = torch.squeeze(batch_total / batch_count)

        scale = torch.where(observed_count > 0, scale, fallback_scale)

        # Enforce the lower bound on the scale before dividing.
        scale = torch.clamp(scale, min=self.minimum_scale)
        scaled_data = data / scale

        if not self.keepdim:
            scale = scale.squeeze(dim=self.dim)

        # This scaler applies no shift, so the returned loc is all zeros.
        return scaled_data, torch.zeros_like(scale), scale


# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesNOPScaler with TimeSeriesTransformer->Autoformer,TimeSeries->Autoformer
class AutoformerNOPScaler(nn.Module):
    """
    Identity scaler: returns the data unchanged together with a loc of zeros and a scale of ones.
    """

    def __init__(self, config: AutoformerConfig):
        super().__init__()
        # Optional config attributes; fixed defaults apply when they are absent.
        self.dim = getattr(config, "scaling_dim", 1)
        self.keepdim = getattr(config, "keepdim", True)

    def forward(
        self, data: torch.Tensor, observed_indicator: torch.Tensor = None
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Return `data` untouched plus matching `loc` and `scale` tensors.

        `observed_indicator` is accepted but not used. Reducing the zeros/ones tensors with `mean` along `dim`
        gives `loc` and `scale` the same reduced shape.
        """
        reduction = {"dim": self.dim, "keepdim": self.keepdim}
        loc = torch.zeros_like(data, requires_grad=False).mean(**reduction)
        scale = torch.ones_like(data, requires_grad=False).mean(**reduction)
        return data, loc, scale
# 从transformers.models.time_series_transformer.modeling_time_series_transformer.weighted_average中复制过来的函数
def weighted_average(input_tensor: torch.Tensor, weights: Optional[torch.Tensor] = None, dim=None) -> torch.Tensor:
    """
    Computes the weighted average of a given tensor across a given `dim`, masking values associated with weight zero,
    meaning instead of `nan * 0 = nan` you will get `0 * 0 = 0`.

    Args:
        input_tensor (`torch.FloatTensor`):
            Input tensor, of which the average must be computed.
        weights (`torch.FloatTensor`, *optional*):
            Weights tensor, of the same shape as `input_tensor`. When omitted, the plain mean is returned.
        dim (`int`, *optional*):
            The dim along which to average `input_tensor`. `None` reduces over all elements.

    Returns:
        `torch.FloatTensor`: The tensor with values averaged along the specified `dim`. The sum of the weights is
        clamped to a minimum of 1.0 before dividing, so a slice whose weights are all zero yields 0.
    """
    if weights is None:
        return input_tensor.mean(dim=dim)

    # `torch.where` (rather than a plain product) keeps NaN/inf entries with zero weight out of the sum.
    weighted_tensor = torch.where(weights != 0, input_tensor * weights, torch.zeros_like(input_tensor))

    # Compare against `None` explicitly: `dim=0` is a valid axis but is falsy.
    if dim is None:
        weighted_sum = weighted_tensor.sum()
        sum_weights = weights.sum()
    else:
        weighted_sum = weighted_tensor.sum(dim=dim)
        sum_weights = weights.sum(dim=dim)

    return weighted_sum / torch.clamp(sum_weights, min=1.0)


# 从transformers.models.time_series_transformer.modeling_time_series_transformer.nll中复制过来的函数
def nll(input: torch.distributions.Distribution, target: torch.Tensor) -> torch.Tensor:
    """
    Negative log-likelihood of `target` under the distribution `input`.
    """
    log_likelihood = input.log_prob(target)
    return -log_likelihood


# 从transformers.models.marian.modeling_marian.MarianSinusoidalPositionalEmbedding复制到Autoformer
class AutoformerSinusoidalPositionalEmbedding(nn.Embedding):
    """This module produces sinusoidal positional embeddings of any length."""

    def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None) -> None:
        # NOTE: `padding_idx` is accepted but not forwarded to `nn.Embedding`.
        super().__init__(num_positions, embedding_dim)
        self.weight = self._init_weight(self.weight)

    @staticmethod
    def _init_weight(out: nn.Parameter) -> nn.Parameter:
        """
        Fill `out` in place with the sinusoidal table and return it.

        Identical to the XLM create_sinusoidal_embeddings except features are not interleaved. The cos features are
        in the 2nd half of the vector. [dim // 2:]
        """
        n_pos, dim = out.shape
        # One denominator per feature index; indices 2k and 2k + 1 share the same value.
        denominators = [np.power(10000, 2 * (j // 2) / dim) for j in range(dim)]
        position_enc = np.array([[pos / denominator for denominator in denominators] for pos in range(n_pos)])
        out.requires_grad = False  # set early to avoid an error in pytorch-1.8+
        # Number of sine columns; when `dim` is odd the extra column goes to the sine half.
        sentinel = (dim + 1) // 2
        sine_features = torch.FloatTensor(np.sin(position_enc[:, 0::2]))
        cosine_features = torch.FloatTensor(np.cos(position_enc[:, 1::2]))
        out[:, :sentinel] = sine_features
        out[:, sentinel:] = cosine_features
        out.detach_()
        return out

    @torch.no_grad()
    def forward(self, input_ids_shape: torch.Size, past_key_values_length: int = 0) -> torch.Tensor:
        """`input_ids_shape` is expected to be [bsz x seqlen]."""
        # Only the sequence length is used; positions start right after the cached steps.
        _, seq_len = input_ids_shape[:2]
        first_position = past_key_values_length
        positions = torch.arange(
            first_position, first_position + seq_len, dtype=torch.long, device=self.weight.device
        )
        # Regular embedding lookup of the position indices in the sinusoidal table.
        return super().forward(positions)
# 从transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesValueEmbedding复制到Autoformer
class AutoformerValueEmbedding(nn.Module):
    """Bias-free linear projection of the last dimension from `feature_size` to `d_model`."""

    def __init__(self, feature_size, d_model):
        super().__init__()
        # Single linear map without a bias term.
        self.value_projection = nn.Linear(feature_size, d_model, bias=False)

    def forward(self, x):
        """Apply the projection to `x` and return the result."""
        projected = self.value_projection(x)
        return projected


# 基于以下链接的类
# https://github.com/thuml/Autoformer/blob/c6a0694ff484753f2d986cc0bb1f99ee850fc1a8/layers/Autoformer_EncDec.py#L39
# 其中AutoformerSeriesDecompositionLayer是series_decomp + moving_average
class AutoformerSeriesDecompositionLayer(nn.Module):
    """
    Returns the trend and the seasonal parts of the time series. Calculated as:

        x_trend = AvgPool(Padding(X)) and x_seasonal = X - x_trend
    """

    def __init__(self, config: AutoformerConfig):
        super().__init__()
        # Window length of the moving average.
        self.kernel_size = config.moving_average
        # Stride-1 average pooling without built-in padding; `forward` pads the series itself.
        self.avg = nn.AvgPool1d(kernel_size=self.kernel_size, stride=1, padding=0)

    def forward(self, x):
        """
        Input shape: Batch x Time x EMBED_DIM

        Returns:
            Tuple `(x_seasonal, x_trend)`, both with the same shape as `x`.
        """
        # A stride-1 pooling window of size k shortens the series by k - 1 steps, so exactly k - 1 padding steps
        # are needed to keep the length. For odd k this is the symmetric (k - 1) // 2 on each side; for even k
        # the extra step goes to the front (symmetric padding would leave the trend one step short of `x`).
        total_pads = self.kernel_size - 1
        num_end_pads = total_pads // 2
        num_front_pads = total_pads - num_end_pads

        # Pad by repeating the first / last time step.
        front = x[:, 0:1, :].repeat(1, num_front_pads, 1)
        end = x[:, -1:, :].repeat(1, num_end_pads, 1)
        x_padded = torch.cat([front, x, end], dim=1)

        # AvgPool1d pools over the last dimension, hence the permutes around it.
        x_trend = self.avg(x_padded.permute(0, 2, 1)).permute(0, 2, 1)
        x_seasonal = x - x_trend
        return x_seasonal, x_trend


# 基于以下链接的类
# https://github.com/thuml/Autoformer/blob/c6a0694ff484753f2d986cc0bb1f99ee850fc1a8/layers/Autoformer_EncDec.py#L6
# 其中AutoformerLayernorm是my_Layernorm
class AutoformerLayernorm(nn.Module):
    """
    Special designed layer normalization for the seasonal part, calculated as: AutoformerLayernorm(x) = nn.LayerNorm(x)
    - torch.mean(nn.LayerNorm(x))
    """

    def __init__(self, config: AutoformerConfig):
        super().__init__()
        # Standard LayerNorm over the last dimension of size `d_model`.
        self.layernorm = nn.LayerNorm(config.d_model)

    def forward(self, x):
        """Layer-normalise `x`, then subtract the mean of the result taken over dim 1."""
        normalized = self.layernorm(x)
        seq_len = x.shape[1]
        # Mean over dim 1, tiled back along that dimension so it lines up with `normalized`.
        mean_over_time = torch.mean(normalized, dim=1)
        bias = mean_over_time.unsqueeze(1).repeat(1, seq_len, 1)
        return normalized - bias


class AutoformerAttention(nn.Module):
    """
    自相关机制，包含以下两个阶段:
        (1) 基于周期的依赖发现 (2) 时间延迟聚合
    该模块替代了传统的自注意力机制。
    """

    def __init__(
        self,
        embed_dim: int,
        num_heads: int,
        dropout: float = 0.0,
        is_decoder: bool = False,
        bias: bool = True,
        autocorrelation_factor: int = 3,
        # 省略了后续的初始化参数说明
    ):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads

        if (self.head_dim * num_heads) != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
                f" and `num_heads`: {num_heads})."
            )
        self.scaling = self.head_dim**-0.5
        self.is_decoder = is_decoder

        # 初始化线性层，用于查询、键、值和输出的投影
        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)

        # 自动相关因子，用于注意力计算
        self.autocorrelation_factor = autocorrelation_factor

    # 重新塑造张量形状，用于多头注意力的计算
    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()

    # 前向传播函数，实现注意力机制的计算
    def forward(
        self,
        hidden_states: torch.Tensor,
        key_value_states: Optional[torch.Tensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
# AutoformerEncoderLayer 类定义，继承自 nn.Module，表示这是一个 PyTorch 模型层
class AutoformerEncoderLayer(nn.Module):
    """
    Encoder block: `AutoformerAttention` self-attention with residual and LayerNorm, a series decomposition, a
    two-layer feed-forward network with residual, a second series decomposition and a final `AutoformerLayernorm`.
    Only the first output of each decomposition is carried forward.
    """

    def __init__(self, config: AutoformerConfig):
        super().__init__()
        d_model = config.d_model
        ffn_dim = config.encoder_ffn_dim

        self.embed_dim = d_model
        self.self_attn = AutoformerAttention(
            embed_dim=d_model,
            num_heads=config.encoder_attention_heads,
            dropout=config.attention_dropout,
            autocorrelation_factor=config.autocorrelation_factor,
        )
        self.self_attn_layer_norm = nn.LayerNorm(d_model)

        # Dropout probabilities and the activation looked up by name in `ACT2FN`.
        self.dropout = config.dropout
        self.activation_fn = ACT2FN[config.activation_function]
        self.activation_dropout = config.activation_dropout

        # Feed-forward network: d_model -> encoder_ffn_dim -> d_model.
        self.fc1 = nn.Linear(d_model, ffn_dim)
        self.fc2 = nn.Linear(ffn_dim, d_model)
        self.final_layer_norm = AutoformerLayernorm(config)

        # One decomposition after the attention sub-layer, one after the feed-forward sub-layer.
        self.decomp1 = AutoformerSeriesDecompositionLayer(config)
        self.decomp2 = AutoformerSeriesDecompositionLayer(config)

    def forward(
        self,
        hidden_states: torch.FloatTensor,
        attention_mask: torch.FloatTensor,
        layer_head_mask: torch.FloatTensor,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.FloatTensor, Optional[torch.FloatTensor]]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
                `(encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.

        Returns:
            `(hidden_states,)`, or `(hidden_states, attn_weights)` when `output_attentions` is truthy.
        """
        dropout = nn.functional.dropout

        # --- self-attention sub-layer: attention -> dropout -> residual -> LayerNorm -> decomposition ---
        attn_input = hidden_states
        attn_output, attn_weights, _ = self.self_attn(
            hidden_states=attn_input,
            attention_mask=attention_mask,
            layer_head_mask=layer_head_mask,
            output_attentions=output_attentions,
        )
        attn_output = dropout(attn_output, p=self.dropout, training=self.training)
        attn_block = self.self_attn_layer_norm(attn_input + attn_output)
        ffn_input, _ = self.decomp1(attn_block)

        # --- feed-forward sub-layer: fc1 -> activation -> dropout -> fc2 -> dropout -> residual -> decomposition ---
        ffn_output = self.activation_fn(self.fc1(ffn_input))
        ffn_output = dropout(ffn_output, p=self.activation_dropout, training=self.training)
        ffn_output = self.fc2(ffn_output)
        ffn_output = dropout(ffn_output, p=self.dropout, training=self.training)
        ffn_block, _ = self.decomp2(ffn_input + ffn_output)
        hidden_states = self.final_layer_norm(ffn_block)

        # In float16 only: if any value is inf or NaN, clamp to just inside the representable range.
        if hidden_states.dtype == torch.float16:
            if torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any():
                clamp_value = torch.finfo(hidden_states.dtype).max - 1000
                hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)

        if output_attentions:
            return (hidden_states, attn_weights)
        return (hidden_states,)
# 定义 AutoformerDecoderLayer 类，继承自 nn.Module
class AutoformerDecoderLayer(nn.Module):
    """
    Decoder block. Builds the self-attention and encoder-attention modules, their LayerNorms, the feed-forward
    layers, three series-decomposition layers and the convolutional trend projection.
    """

    def __init__(self, config: AutoformerConfig):
        super().__init__()
        d_model = config.d_model
        self.embed_dim = d_model

        # Settings shared by both attention modules of this layer.
        attention_kwargs = {
            "num_heads": config.decoder_attention_heads,
            "dropout": config.attention_dropout,
            "is_decoder": True,
            "autocorrelation_factor": config.autocorrelation_factor,
        }

        self.self_attn = AutoformerAttention(embed_dim=d_model, **attention_kwargs)

        # Dropout probabilities and the activation looked up by name in `ACT2FN`.
        self.dropout = config.dropout
        self.activation_fn = ACT2FN[config.activation_function]
        self.activation_dropout = config.activation_dropout

        self.self_attn_layer_norm = nn.LayerNorm(d_model)

        self.encoder_attn = AutoformerAttention(d_model, **attention_kwargs)
        self.encoder_attn_layer_norm = nn.LayerNorm(d_model)

        # Feed-forward network: d_model -> decoder_ffn_dim -> d_model.
        ffn_dim = config.decoder_ffn_dim
        self.fc1 = nn.Linear(d_model, ffn_dim)
        self.fc2 = nn.Linear(ffn_dim, d_model)

        self.final_layer_norm = AutoformerLayernorm(config)

        self.decomp1 = AutoformerSeriesDecompositionLayer(config)
        self.decomp2 = AutoformerSeriesDecompositionLayer(config)
        self.decomp3 = AutoformerSeriesDecompositionLayer(config)

        # Bias-free width-3 convolution with circular padding, mapping d_model channels to feature_size channels.
        self.trend_projection = nn.Conv1d(
            in_channels=d_model,
            out_channels=config.feature_size,
            kernel_size=3,
            stride=1,
            padding=1,
            padding_mode="circular",
            bias=False,
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        cross_attn_layer_head_mask: Optional[torch.Tensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = True,
    ):
        """
        NOTE(review): the forward logic is not present in this excerpt; as written the method performs no
        computation and returns `None`.
        """
        return None

# 定义 AutoformerPreTrainedModel 类，继承自 PreTrainedModel
class AutoformerPreTrainedModel(PreTrainedModel):
    """Base class of the Autoformer models: fixes the config class and defines the weight initialisation."""

    config_class = AutoformerConfig
    base_model_prefix = "model"
    main_input_name = "past_values"
    supports_gradient_checkpointing = True

    def _init_weights(self, module):
        """
        Initialise the weights of `module` in place.

        Linear and Conv1d weights, and Embedding weights, are drawn from a normal distribution with mean 0 and
        standard deviation `config.init_std`; biases and the `padding_idx` row are zeroed. Sinusoidal positional
        embeddings are left untouched.
        """
        std = self.config.init_std

        # The sinusoidal table derives from nn.Embedding, so it must be excluded before the Embedding branch.
        if isinstance(module, AutoformerSinusoidalPositionalEmbedding):
            return

        if isinstance(module, (nn.Linear, nn.Conv1d)):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
            return

        if isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
# Docstring fragment for the Autoformer model classes: describes the `PreTrainedModel` / `torch.nn.Module`
# inheritance and documents the `config` parameter.
AUTOFORMER_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`AutoformerConfig`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

# Docstring fragment for the `forward` inputs; in this excerpt it contains only a newline.
AUTOFORMER_INPUTS_DOCSTRING = r"""
"""


# AutoformerEncoder 类定义，继承自 AutoformerPreTrainedModel，代表了 Autoformer 模型的编码器部分
class AutoformerEncoder(AutoformerPreTrainedModel):
    """
    Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
    [`AutoformerEncoderLayer`].

    Args:
        config: AutoformerConfig
    """

    def __init__(self, config: AutoformerConfig):
        """
        Build the value embedding, the sinusoidal positional embedding, `config.encoder_layers` encoder layers and
        the embedding LayerNorm.

        Raises:
            ValueError: If `config.prediction_length` is `None`.
        """
        super().__init__(config)

        self.dropout = config.dropout
        self.layerdrop = config.encoder_layerdrop
        if config.prediction_length is None:
            raise ValueError("The `prediction_length` config needs to be specified.")

        d_model = config.d_model
        # The positional table has one row per step of context plus prediction window.
        num_positions = config.context_length + config.prediction_length

        self.value_embedding = AutoformerValueEmbedding(feature_size=config.feature_size, d_model=d_model)
        self.embed_positions = AutoformerSinusoidalPositionalEmbedding(num_positions, d_model)

        encoder_layers = [AutoformerEncoderLayer(config) for _ in range(config.encoder_layers)]
        self.layers = nn.ModuleList(encoder_layers)

        self.layernorm_embedding = nn.LayerNorm(d_model)

        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()

    # 前向传播函数，接受多个可选的输入参数，并返回模型的输出
    def forward(
        self,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    # 初始化函数，接受一个 AutoformerConfig 类型的参数 config
    def __init__(self, config: AutoformerConfig):
        """
        Build the value embedding, the sinusoidal positional embedding, `config.decoder_layers` decoder layers, the
        embedding LayerNorm and the linear seasonality projection.

        Raises:
            ValueError: If `config.prediction_length` is `None`.
        """
        super().__init__(config)

        self.dropout = config.dropout
        self.layerdrop = config.decoder_layerdrop
        if config.prediction_length is None:
            raise ValueError("The `prediction_length` config needs to be specified.")

        d_model = config.d_model
        # The positional table has one row per step of context plus prediction window.
        num_positions = config.context_length + config.prediction_length

        self.value_embedding = AutoformerValueEmbedding(feature_size=config.feature_size, d_model=d_model)
        self.embed_positions = AutoformerSinusoidalPositionalEmbedding(num_positions, d_model)

        decoder_layers = [AutoformerDecoderLayer(config) for _ in range(config.decoder_layers)]
        self.layers = nn.ModuleList(decoder_layers)

        self.layernorm_embedding = nn.LayerNorm(d_model)

        # Linear map from d_model to feature_size.
        self.seasonality_projection = nn.Linear(d_model, config.feature_size)

        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()

    # 前向传播函数，接受多个可选参数并返回结果
    def forward(
        self,
        trend: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
# 使用自动形态编码器的基类进行模型定义，输出原始隐藏状态，没有特定的顶部头部。
# 继承自AutoformerPreTrainedModel类
class AutoformerModel(AutoformerPreTrainedModel):
    
    def __init__(self, config: AutoformerConfig):
        """
        Build the scaler selected by `config.scaling`, the optional static-categorical embedder, the encoder, the
        decoder and the series-decomposition layer.
        """
        super().__init__(config)

        # "mean" or `True` -> mean scaler, "std" -> std scaler, anything else -> no-op scaler.
        scaling = config.scaling
        if scaling == "mean" or scaling is True:
            scaler_class = AutoformerMeanScaler
        elif scaling == "std":
            scaler_class = AutoformerStdScaler
        else:
            scaler_class = AutoformerNOPScaler
        self.scaler = scaler_class(config)

        # The embedder only exists when the config declares static categorical features.
        if config.num_static_categorical_features > 0:
            self.embedder = AutoformerFeatureEmbedder(
                cardinalities=config.cardinality, embedding_dims=config.embedding_dimension
            )

        # transformer encoder-decoder and mask initializer
        self.encoder = AutoformerEncoder(config)
        self.decoder = AutoformerDecoder(config)

        # used for decoder seasonal and trend initialization
        self.decomposition_layer = AutoformerSeriesDecompositionLayer(config)

        # Initialize weights and apply final processing
        self.post_init()

    @property
    def _past_length(self) -> int:
        """`config.context_length` plus the largest entry of `config.lags_sequence`."""
        largest_lag = max(self.config.lags_sequence)
        return self.config.context_length + largest_lag

    def get_lagged_subsequences(
        self, sequence: torch.Tensor, subsequences_length: int, shift: int = 0
    ) -> torch.Tensor:
        """
        Returns lagged subsequences of a given sequence. Returns a tensor of shape (batch_size, subsequences_length,
        feature_size, indices_length), containing lagged subsequences. Specifically, lagged[i, j, :, k] = sequence[i,
        -indices[k]-subsequences_length+j, :].

        Args:
            sequence (`torch.Tensor` or shape `(batch_size, context_length,
                feature_size)`): The sequence from which lagged subsequences should be extracted.
            subsequences_length (`int`):
                Length of the subsequences to be extracted.
            shift (`int`, *optional* defaults to 0):
                Shift the lags by this amount back in the time index.

        Raises:
            ValueError: If the largest shifted lag plus `subsequences_length` exceeds the length of `sequence`
                along dim 1.
        """
        # Effective lags: each configured lag reduced by `shift`.
        indices = [lag - shift for lag in self.config.lags_sequence]

        # The furthest window must still fit inside the available history.
        sequence_length = sequence.shape[1]
        if max(indices) + subsequences_length > sequence_length:
            raise ValueError(
                f"lags cannot go further than history length, found lag {max(indices)} "
                f"while history length is only {sequence_length}"
            )

        def lagged_window(lag_index):
            # Window of `subsequences_length` steps ending `lag_index` steps before the end of the sequence;
            # a non-positive lag means the window runs to the very end.
            start = -lag_index - subsequences_length
            stop = -lag_index if lag_index > 0 else None
            return sequence[:, start:stop, ...]

        # One window per lag, stacked along a new trailing dimension.
        return torch.stack([lagged_window(lag_index) for lag_index in indices], dim=-1)

    def create_network_inputs(
        self,
        past_values: torch.Tensor,
        past_time_features: torch.Tensor,
        static_categorical_features: Optional[torch.Tensor] = None,
        static_real_features: Optional[torch.Tensor] = None,
        past_observed_mask: Optional[torch.Tensor] = None,
        future_values: Optional[torch.Tensor] = None,
        future_time_features: Optional[torch.Tensor] = None,
    ):
        """
        Creates inputs for the network by combining past values, time features, and optional static features.

        NOTE(review): the implementation is not present in this excerpt, so calling the method fails loudly
        instead of handing the `NotImplementedError` class back to the caller as a return value.

        Args:
            past_values (`torch.Tensor`): Tensor containing past values of shape (batch_size, context_length, feature_size).
            past_time_features (`torch.Tensor`): Tensor containing time features for the past values.
            static_categorical_features (`Optional[torch.Tensor]`, *optional*):
                Tensor containing static categorical features.
            static_real_features (`Optional[torch.Tensor]`, *optional*):
                Tensor containing static real-valued features.
            past_observed_mask (`Optional[torch.Tensor]`, *optional*):
                Mask indicating which past values are observed.
            future_values (`Optional[torch.Tensor]`, *optional*):
                Tensor containing future values if available.
            future_time_features (`Optional[torch.Tensor]`, *optional*):
                Tensor containing time features for the future values.

        Raises:
            NotImplementedError: Always, until the body is restored.
        """
        raise NotImplementedError("`create_network_inputs` is not implemented in this excerpt.")

    def get_encoder(self):
        """
        Return the encoder sub-module stored in `self.encoder`.
        """
        return self.encoder

    def get_decoder(self):
        """
        Return the decoder sub-module stored in `self.decoder`.
        """
        return self.decoder

    @add_start_docstrings_to_model_forward(AUTOFORMER_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=AutoformerModelOutput, config_class=_CONFIG_FOR_DOC)
    # 定义一个方法 `forward`，用于执行模型的前向传播过程，接受多个参数作为输入
    def forward(
        self,
        # 过去的值，作为模型的输入之一，是一个 Tensor
        past_values: torch.Tensor,
        # 过去的时间特征，也是模型输入的一部分，是一个 Tensor
        past_time_features: torch.Tensor,
        # 过去观测的遮罩，用于指示哪些观测值在过去是可见的，是一个 Tensor
        past_observed_mask: torch.Tensor,
        # 静态的分类特征，可选输入，如果有的话是一个 Tensor
        static_categorical_features: Optional[torch.Tensor] = None,
        # 静态的实数特征，可选输入，如果有的话是一个 Tensor
        static_real_features: Optional[torch.Tensor] = None,
        # 未来的值，可选输入，如果有的话是一个 Tensor
        future_values: Optional[torch.Tensor] = None,
        # 未来的时间特征，可选输入，如果有的话是一个 Tensor
        future_time_features: Optional[torch.Tensor] = None,
        # 解码器注意力遮罩，可选输入，如果有的话是一个 LongTensor
        decoder_attention_mask: Optional[torch.LongTensor] = None,
        # 头部遮罩，可选输入，如果有的话是一个 Tensor
        head_mask: Optional[torch.Tensor] = None,
        # 解码器头部遮罩，可选输入，如果有的话是一个 Tensor
        decoder_head_mask: Optional[torch.Tensor] = None,
        # 交叉注意力头部遮罩，可选输入，如果有的话是一个 Tensor
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        # 编码器输出，可选输入，如果有的话是一个浮点数列表
        encoder_outputs: Optional[List[torch.FloatTensor]] = None,
        # 过去的键值对，可选输入，如果有的话是一个浮点数列表
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        # 输出隐藏状态，可选参数，如果设置为 True 则输出隐藏状态
        output_hidden_states: Optional[bool] = None,
        # 输出注意力权重，可选参数，如果设置为 True 则输出注意力权重
        output_attentions: Optional[bool] = None,
        # 使用缓存，可选参数，如果设置为 True 则使用缓存
        use_cache: Optional[bool] = None,
        # 返回字典，可选参数，如果设置为 True 则返回字典
        return_dict: Optional[bool] = None,
# 使用装饰器为该类添加文档字符串，描述了该类是基于 Autoformer 模型的时间序列预测模型，带有一个分布输出头部
# 以用于时间序列预测。
@add_start_docstrings(
    "The Autoformer Model with a distribution head on top for time-series forecasting.",
    AUTOFORMER_START_DOCSTRING,
)
class AutoformerForPrediction(AutoformerPreTrainedModel):
    def __init__(self, config: AutoformerConfig):
        """
        Wrap an `AutoformerModel` with a distribution output head and a loss function.

        Raises:
            ValueError: If `config.distribution_output` is not one of "student_t", "normal" or
                "negative_binomial", or if `config.loss` is not "nll".
        """
        super().__init__(config)
        self.model = AutoformerModel(config)

        # Pick the distribution head by name; every head is built with `dim=config.input_size`.
        output_name = config.distribution_output
        if output_name == "student_t":
            output_class = StudentTOutput
        elif output_name == "normal":
            output_class = NormalOutput
        elif output_name == "negative_binomial":
            output_class = NegativeBinomialOutput
        else:
            raise ValueError(f"Unknown distribution output {config.distribution_output}")
        self.distribution_output = output_class(dim=config.input_size)

        # Projection obtained from the distribution head for an input of size `feature_size`.
        self.parameter_projection = self.distribution_output.get_parameter_projection(self.model.config.feature_size)
        self.target_shape = self.distribution_output.event_shape

        # Only the negative log-likelihood loss is available.
        if config.loss != "nll":
            raise ValueError(f"Unknown loss function {config.loss}")
        self.loss = nll

        # Initialize weights of distribution_output and apply final processing
        self.post_init()

    # 返回解码器输出的参数投影
    def output_params(self, decoder_output):
        """Apply `parameter_projection` to the last `config.prediction_length` steps (dim 1) of `decoder_output`."""
        horizon = self.config.prediction_length
        prediction_window = decoder_output[:, -horizon:, :]
        return self.parameter_projection(prediction_window)

    # 获取编码器部分
    def get_encoder(self):
        """Return the encoder by delegating to the wrapped model's `get_encoder()`."""
        return self.model.get_encoder()

    # 获取解码器部分
    def get_decoder(self):
        """Return the decoder by delegating to the wrapped model's `get_decoder()`."""
        return self.model.get_decoder()

    # 使用 torch.jit.ignore 装饰器，指示编译时忽略该方法，该方法用于生成分布对象
    @torch.jit.ignore
    def output_distribution(self, params, loc=None, scale=None, trailing_n=None) -> torch.distributions.Distribution:
        """
        Build the output distribution from `params` via `self.distribution_output.distribution`.

        When `trailing_n` is given, every parameter tensor is first cut down to its last `trailing_n` entries
        along dim 1; otherwise `params` is passed through unchanged. `loc` and `scale` are forwarded as-is.
        """
        if trailing_n is None:
            sliced_params = params
        else:
            sliced_params = [p[:, -trailing_n:] for p in params]
        return self.distribution_output.distribution(sliced_params, loc=loc, scale=scale)

    # 使用装饰器为模型的前向传播方法添加文档字符串，文档字符串包含了输入的详细说明
    @add_start_docstrings_to_model_forward(AUTOFORMER_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=Seq2SeqTSPredictionOutput, config_class=_CONFIG_FOR_DOC)
    # 定义一个方法 `forward`，用于模型的前向传播
    def forward(
        self,
        # 过去的值作为输入，类型为 torch.Tensor
        past_values: torch.Tensor,
        # 过去的时间特征作为输入，类型为 torch.Tensor
        past_time_features: torch.Tensor,
        # 过去观察到的掩码，类型为 torch.Tensor
        past_observed_mask: torch.Tensor,
        # 可选的静态分类特征，类型为 Optional[torch.Tensor]
        static_categorical_features: Optional[torch.Tensor] = None,
        # 可选的静态实数特征，类型为 Optional[torch.Tensor]
        static_real_features: Optional[torch.Tensor] = None,
        # 可选的未来的值，类型为 Optional[torch.Tensor]
        future_values: Optional[torch.Tensor] = None,
        # 可选的未来时间特征，类型为 Optional[torch.Tensor]
        future_time_features: Optional[torch.Tensor] = None,
        # 可选的未来观察到的掩码，类型为 Optional[torch.Tensor]
        future_observed_mask: Optional[torch.Tensor] = None,
        # 可选的解码器注意力掩码，类型为 Optional[torch.LongTensor]
        decoder_attention_mask: Optional[torch.LongTensor] = None,
        # 可选的头掩码，类型为 Optional[torch.Tensor]
        head_mask: Optional[torch.Tensor] = None,
        # 可选的解码器头部掩码，类型为 Optional[torch.Tensor]
        decoder_head_mask: Optional[torch.Tensor] = None,
        # 可选的交叉注意力头部掩码，类型为 Optional[torch.Tensor]
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        # 可选的编码器输出列表，类型为 Optional[List[torch.FloatTensor]]
        encoder_outputs: Optional[List[torch.FloatTensor]] = None,
        # 可选的过去关键值列表，类型为 Optional[List[torch.FloatTensor]]
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        # 是否输出隐藏状态的标志，类型为 Optional[bool]
        output_hidden_states: Optional[bool] = None,
        # 是否输出注意力权重的标志，类型为 Optional[bool]
        output_attentions: Optional[bool] = None,
        # 是否使用缓存的标志，类型为 Optional[bool]
        use_cache: Optional[bool] = None,
        # 是否返回字典格式的结果，类型为 Optional[bool]
        return_dict: Optional[bool] = None,
    # 使用 @torch.no_grad() 装饰器，确保在生成过程中不计算梯度
    @torch.no_grad()
    # 定义一个方法 `generate`，用于生成过程
    def generate(
        # 过去的值作为输入，类型为 torch.Tensor
        self,
        past_values: torch.Tensor,
        # 过去的时间特征作为输入，类型为 torch.Tensor
        past_time_features: torch.Tensor,
        # 未来的时间特征作为输入，类型为 torch.Tensor
        future_time_features: torch.Tensor,
        # 可选的过去观察到的掩码，类型为 Optional[torch.Tensor]
        past_observed_mask: Optional[torch.Tensor] = None,
        # 可选的静态分类特征，类型为 Optional[torch.Tensor]
        static_categorical_features: Optional[torch.Tensor] = None,
        # 可选的静态实数特征，类型为 Optional[torch.Tensor]
        static_real_features: Optional[torch.Tensor] = None,
        # 是否输出注意力权重的标志，类型为 Optional[bool]
        output_attentions: Optional[bool] = None,
        # 是否输出隐藏状态的标志，类型为 Optional[bool]
        output_hidden_states: Optional[bool] = None,

`.\models\autoformer\__init__.py`

# 版权声明和许可信息
# 该模块是 HuggingFace 团队的代码，版权归其所有
# 根据 Apache 许可证 2.0 版本进行许可
# 如果不遵循许可证，除非适用法律要求或书面同意，否则不得使用该文件
# 可以在 http://www.apache.org/licenses/LICENSE-2.0 获取许可证的副本

# 引入类型检查模块
from typing import TYPE_CHECKING

# 从工具模块中引入异常和懒加载模块
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available

# Map each submodule name to the public names it exports; consumed by `_LazyModule` at the bottom of the file
_import_structure = {
    "configuration_autoformer": [
        "AUTOFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP",  # archive-map constant exported by the configuration module
        "AutoformerConfig",  # Autoformer configuration class
    ],
}

# Register the modeling names only when torch is available
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # torch is available: also expose the names defined in the modeling module
    _import_structure["modeling_autoformer"] = [
        "AUTOFORMER_PRETRAINED_MODEL_ARCHIVE_LIST",  # archive-list constant exported by the modeling module
        "AutoformerForPrediction",  # Autoformer model class for prediction
        "AutoformerModel",  # Autoformer model class
        "AutoformerPreTrainedModel",  # Autoformer pre-trained model class
    ]

# Under static type checking, import the names eagerly so that type checkers can resolve them
if TYPE_CHECKING:
    # Names exported by the configuration module
    from .configuration_autoformer import (
        AUTOFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP,  # archive-map constant
        AutoformerConfig,  # Autoformer configuration class
    )

    # Same torch availability guard as above
    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # Names exported by the modeling module
        from .modeling_autoformer import (
            AUTOFORMER_PRETRAINED_MODEL_ARCHIVE_LIST,  # archive-list constant
            AutoformerForPrediction,  # Autoformer model class for prediction
            AutoformerModel,  # Autoformer model class
            AutoformerPreTrainedModel,  # Autoformer pre-trained model class
        )

# At runtime (not type checking), defer the imports
else:
    import sys

    # Replace this module in `sys.modules` with a `_LazyModule` built from `_import_structure`
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)

`.\models\bark\configuration_bark.py`

# coding=utf-8
# Copyright 2023 The Suno AI Authors and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" BARK model configuration"""

import os
from typing import Dict, Optional, Union

from ...configuration_utils import PretrainedConfig
from ...utils import add_start_docstrings, logging
from ..auto import CONFIG_MAPPING

# Module-level logger obtained from the library's `logging` utilities
logger = logging.get_logger(__name__)

# Maps each Bark checkpoint name to the URL of its hosted `config.json`
BARK_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "suno/bark-small": "https://huggingface.co/suno/bark-small/resolve/main/config.json",
    "suno/bark": "https://huggingface.co/suno/bark/resolve/main/config.json",
}

# Shared docstring template for the Bark sub-model configuration classes. The `{model}` placeholder is filled in
# with `str.format` before the text is handed to `add_start_docstrings`, so the literal must not contain any other
# brace pairs.
BARK_SUBMODELCONFIG_START_DOCSTRING = """
    This is the configuration class to store the configuration of a [`{model}`]. It is used to instantiate the model
    according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the Bark [suno/bark](https://huggingface.co/suno/bark)
    architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        block_size (`int`, *optional*, defaults to 1024):
            The maximum sequence length that this model might ever be used with. Typically set this to something large
            just in case (e.g., 512 or 1024 or 2048).
        input_vocab_size (`int`, *optional*, defaults to 10_048):
            Input vocabulary size of a Bark sub-model. Defines the number of different tokens that can be represented
            when calling [`{model}`]. Defaults to 10_048 but should be carefully thought with regards to the chosen
            sub-model.
        output_vocab_size (`int`, *optional*, defaults to 10_048):
            Output vocabulary size of a Bark sub-model. Defines the number of different tokens that can be represented
            when passing forward a [`{model}`]. Defaults to 10_048 but should be carefully thought with regards to the
            chosen sub-model.
        num_layers (`int`, *optional*, defaults to 12):
            Number of hidden layers in the given sub-model.
        num_heads (`int`, *optional*, defaults to 12):
            Number of attention heads for each attention layer in the Transformer architecture.
        hidden_size (`int`, *optional*, defaults to 768):
            Dimensionality of the "intermediate" (often named feed-forward) layer in the architecture.
        dropout (`float`, *optional*, defaults to 0.0):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        bias (`bool`, *optional*, defaults to `True`):
            Whether or not to use bias in the linear layers and layer norm layers.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models).
"""
定义了一个名为 BarkSubModelConfig 的类，继承自 PretrainedConfig。

model_type 属性指定为 "bark_module"，用于标识模型类型为 Bark 模块。
keys_to_ignore_at_inference 属性指定在推断时要忽略的键，这里包括 "past_key_values"。

attribute_map 属性是一个映射字典，将类内部属性名映射到外部使用的名称，例如将 num_attention_heads 映射为 num_heads。

__init__ 方法用于初始化类的实例，接受多个参数来设置模型配置的各个属性，如 block_size、input_vocab_size 等。

from_pretrained 方法是一个类方法，用于从预训练模型加载配置。它接受预训练模型的名称或路径，并支持设置缓存目录、强制下载等参数。

在方法内部，通过调用 cls.get_config_dict 方法获取预训练模型的配置字典。如果配置字典中的 model_type 为 "bark"，则从中提取对应的 Bark 配置。

警告日志用于提示用户，如果加载的预训练模型类型与当前类定义的模型类型不匹配，可能会导致错误。
"""
    # 获取模型的配置信息
    configuration = model.config
# Sub-model configuration derived from `BarkSubModelConfig`; it only overrides `model_type`
class BarkSemanticConfig(BarkSubModelConfig):
    # Identifier string of this configuration class
    model_type = "semantic"

# Sub-model configuration derived from `BarkSubModelConfig`; it only overrides `model_type`
@add_start_docstrings(
    # First docstring part: the shared template with the `model` placeholder filled in for this class
    BARK_SUBMODELCONFIG_START_DOCSTRING.format(config="BarkCoarseConfig", model="BarkCoarseModel"),
    """
    Example:

    ```
    >>> from transformers import BarkCoarseConfig, BarkCoarseModel

    >>> # Initializing a Bark sub-module style configuration
    >>> configuration = BarkCoarseConfig()

    >>> # Initializing a model (with random weights) from the suno/bark style configuration
    >>> model = BarkCoarseModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```""",
)
class BarkCoarseConfig(BarkSubModelConfig):
    # Identifier string of this configuration class
    model_type = "coarse_acoustics"

# Sub-model configuration derived from `BarkSubModelConfig`; adds the two codebook-count settings
@add_start_docstrings(
    # First docstring part: the shared template with the `model` placeholder filled in for this class
    BARK_SUBMODELCONFIG_START_DOCSTRING.format(config="BarkFineConfig", model="BarkFineModel"),
    """
        n_codes_total (`int`, *optional*, defaults to 8):
            The total number of audio codebooks predicted. Used in the fine acoustics sub-model.
        n_codes_given (`int`, *optional*, defaults to 1):
            The number of audio codebooks predicted in the coarse acoustics sub-model. Used in the acoustics
            sub-models.
    Example:

    ```
    >>> from transformers import BarkFineConfig, BarkFineModel

    >>> # Initializing a Bark sub-module style configuration
    >>> configuration = BarkFineConfig()

    >>> # Initializing a model (with random weights) from the suno/bark style configuration
    >>> model = BarkFineModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```""",
)
class BarkFineConfig(BarkSubModelConfig):
    # Identifier string of this configuration class
    model_type = "fine_acoustics"

    def __init__(self, tie_word_embeddings=True, n_codes_total=8, n_codes_given=1, **kwargs):
        # Store the codebook counts before delegating to the parent initializer
        self.n_codes_total = n_codes_total  # total number of audio codebooks predicted (default 8)
        self.n_codes_given = n_codes_given  # codebooks predicted by the coarse acoustics sub-model (default 1)

        # Forward `tie_word_embeddings` and all remaining keyword arguments to the parent class
        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)

# Composite configuration that bundles the three Bark sub-model configurations and the codec configuration
class BarkConfig(PretrainedConfig):
    """
    This is the configuration class to store the configuration of a [`BarkModel`]. It is used to instantiate a Bark
    model according to the specified sub-models configurations, defining the model architecture.

    Instantiating a configuration with the defaults will yield a similar configuration to that of the Bark
    [suno/bark](https://huggingface.co/suno/bark) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        semantic_config ([`BarkSemanticConfig`], *optional*):
            Configuration of the underlying semantic sub-model.
        coarse_acoustics_config ([`BarkCoarseConfig`], *optional*):
            Configuration of the underlying coarse acoustics sub-model.
        fine_acoustics_config ([`BarkFineConfig`], *optional*):
            Configuration of the underlying fine acoustics sub-model.
        codec_config ([`AutoConfig`], *optional*):
            Configuration of the underlying codec sub-model.
        initializer_range (`float`, *optional*, defaults to 0.02):
            Stored unchanged on the configuration as `initializer_range`.
        kwargs (*optional*):
            Remaining keyword arguments, forwarded to `PretrainedConfig.__init__`.
    """

    model_type = "bark"

    def __init__(
        self,
        semantic_config: Optional[Dict] = None,
        coarse_acoustics_config: Optional[Dict] = None,
        fine_acoustics_config: Optional[Dict] = None,
        codec_config: Optional[Dict] = None,
        initializer_range=0.02,
        **kwargs,
    ):
        # A missing sub-configuration falls back to an empty dict, i.e. the defaults of its configuration class
        if semantic_config is None:
            semantic_config = {}
            logger.info("semantic_config is None. initializing the semantic model with default values.")

        if coarse_acoustics_config is None:
            coarse_acoustics_config = {}
            logger.info("coarse_acoustics_config is None. initializing the coarse model with default values.")

        if fine_acoustics_config is None:
            fine_acoustics_config = {}
            logger.info("fine_acoustics_config is None. initializing the fine model with default values.")

        if codec_config is None:
            codec_config = {}
            logger.info("codec_config is None. initializing the codec model with default values.")

        # Turn the plain dicts into configuration objects
        self.semantic_config = BarkSemanticConfig(**semantic_config)
        self.coarse_acoustics_config = BarkCoarseConfig(**coarse_acoustics_config)
        self.fine_acoustics_config = BarkFineConfig(**fine_acoustics_config)

        # The codec configuration class is looked up by its `model_type`; "encodec" is used when none is given
        codec_model_type = codec_config.get("model_type", "encodec")
        self.codec_config = CONFIG_MAPPING[codec_model_type](**codec_config)

        self.initializer_range = initializer_range

        super().__init__(**kwargs)

    @classmethod
    def from_sub_model_configs(
        cls,
        semantic_config: BarkSemanticConfig,
        coarse_acoustics_config: BarkCoarseConfig,
        fine_acoustics_config: BarkFineConfig,
        codec_config: PretrainedConfig,
        **kwargs,
    ):
        r"""
        Instantiate a [`BarkConfig`] (or a derived class) from bark sub-models configuration.

        Each sub-configuration is serialized with `to_dict()` and passed to the class constructor together with
        `**kwargs`.

        Returns:
            [`BarkConfig`]: An instance of a configuration object
        """
        return cls(
            semantic_config=semantic_config.to_dict(),
            coarse_acoustics_config=coarse_acoustics_config.to_dict(),
            fine_acoustics_config=fine_acoustics_config.to_dict(),
            codec_config=codec_config.to_dict(),
            **kwargs,
        )

`.\models\bark\convert_suno_to_hf.py`

"""Convert Bark checkpoint."""
# 导入所需的库和模块
import argparse
import os
from pathlib import Path

import torch
from bark.generation import _load_model as _bark_load_model
from huggingface_hub import hf_hub_download

# 导入 Transformers 库中相关的类和函数
from transformers import EncodecConfig, EncodecModel, set_seed
from transformers.models.bark.configuration_bark import (
    BarkCoarseConfig,
    BarkConfig,
    BarkFineConfig,
    BarkSemanticConfig,
)
from transformers.models.bark.generation_configuration_bark import (
    BarkCoarseGenerationConfig,
    BarkFineGenerationConfig,
    BarkGenerationConfig,
    BarkSemanticGenerationConfig,
)
from transformers.models.bark.modeling_bark import BarkCoarseModel, BarkFineModel, BarkModel, BarkSemanticModel
from transformers.utils import logging


# Emit log messages at info level for the whole conversion run
logging.set_verbosity_info()
logger = logging.get_logger(__name__)

# Fix the random seed
set_seed(770)

# Substring replacements applied to checkpoint parameter names: original Bark name -> Hugging Face name
new_layer_name_dict = {
    "c_attn": "att_proj",
    "c_proj": "out_proj",
    "c_fc": "in_proj",
    "transformer.": "",
    "h.": "layers.",
    "ln_1": "layernorm_1",
    "ln_2": "layernorm_2",
    "ln_f": "layernorm_final",
    "wpe": "position_embeds_layer",
    "wte": "input_embeds_layer",
}

# Hub repository and file name of each original checkpoint, keyed by `<model_type>` or `<model_type>_small`
REMOTE_MODEL_PATHS = {
    "text_small": {
        "repo_id": "suno/bark",
        "file_name": "text.pt",
    },
    "coarse_small": {
        "repo_id": "suno/bark",
        "file_name": "coarse.pt",
    },
    "fine_small": {
        "repo_id": "suno/bark",
        "file_name": "fine.pt",
    },
    "text": {
        "repo_id": "suno/bark",
        "file_name": "text_2.pt",
    },
    "coarse": {
        "repo_id": "suno/bark",
        "file_name": "coarse_2.pt",
    },
    "fine": {
        "repo_id": "suno/bark",
        "file_name": "fine_2.pt",
    },
}

# Directory containing this script
CUR_PATH = os.path.dirname(os.path.abspath(__file__))
# Fallback cache root used when `XDG_CACHE_HOME` is not set: `~/.cache`
default_cache_dir = os.path.join(os.path.expanduser("~"), ".cache")
# Directory the original checkpoints are stored in: `<cache root>/suno/bark_v0`
CACHE_DIR = os.path.join(os.getenv("XDG_CACHE_HOME", default_cache_dir), "suno", "bark_v0")


# Resolve the local cache path of the checkpoint file for a model type and size
def _get_ckpt_path(model_type, use_small=False):
    """
    Return the path inside `CACHE_DIR` of the checkpoint registered in `REMOTE_MODEL_PATHS`.

    Args:
        model_type: Key prefix in `REMOTE_MODEL_PATHS` (e.g. `"text"`, `"coarse"`, `"fine"`).
        use_small: When true, the `"_small"` suffix is appended to the key.

    Raises:
        KeyError: If the resulting key is not registered in `REMOTE_MODEL_PATHS`.
    """
    variant = model_type + "_small" if use_small else model_type
    file_name = REMOTE_MODEL_PATHS[variant]["file_name"]
    return os.path.join(CACHE_DIR, file_name)


# Fetch one file from a Hub repository into the local cache directory
def _download(from_hf_path, file_name):
    """
    Download `file_name` from the Hub repository `from_hf_path` into `CACHE_DIR`.

    The cache directory (including missing parents) is created first.
    """
    Path(CACHE_DIR).mkdir(parents=True, exist_ok=True)
    hf_hub_download(local_dir=CACHE_DIR, filename=file_name, repo_id=from_hf_path)


# Instantiate the Hugging Face Bark sub-model for `model_type` and fill it with the original checkpoint weights
def _load_model(ckpt_path, device, use_small=False, model_type="text"):
    """
    Load an original Bark checkpoint into the matching Hugging Face sub-model.

    Args:
        ckpt_path: Local path of the checkpoint; the file is downloaded first when it does not exist.
        device: Passed as `map_location` to `torch.load` and as the target of `model.to`.
        use_small: Select the `"<model_type>_small"` entry of `REMOTE_MODEL_PATHS`.
        model_type: One of `"text"`, `"coarse"` or `"fine"`.

    Returns:
        The populated model, switched to eval mode and moved to `device`.

    Raises:
        NotImplementedError: If `model_type` is not one of the three supported values.
        ValueError: If the renamed checkpoint keys and the model keys differ (keys ending in `.attn.bias` are
            ignored on both sides).
    """
    # (model class, config class, generation config class) for every supported model type
    classes_by_type = {
        "text": (BarkSemanticModel, BarkSemanticConfig, BarkSemanticGenerationConfig),
        "coarse": (BarkCoarseModel, BarkCoarseConfig, BarkCoarseGenerationConfig),
        "fine": (BarkFineModel, BarkFineConfig, BarkFineGenerationConfig),
    }
    if model_type not in classes_by_type:
        raise NotImplementedError()
    ModelClass, ConfigClass, GenerationConfigClass = classes_by_type[model_type]

    model_key = f"{model_type}_small" if use_small else model_type
    model_info = REMOTE_MODEL_PATHS[model_key]

    # Download the checkpoint on first use
    if not os.path.exists(ckpt_path):
        logger.info(f"{model_type} model not found, downloading into `{CACHE_DIR}`.")
        _download(model_info["repo_id"], model_info["file_name"])

    checkpoint = torch.load(ckpt_path, map_location=device)

    # Temporary workaround: checkpoints that only carry `vocab_size` use it for both input and output vocabularies
    model_args = checkpoint["model_args"]
    if "input_vocab_size" not in model_args:
        vocab_size = model_args.pop("vocab_size")
        model_args["input_vocab_size"] = vocab_size
        model_args["output_vocab_size"] = vocab_size

    # Rename the original hyper-parameter names to the ones the HF configuration classes expect
    for hf_name, bark_name in (("num_heads", "n_head"), ("hidden_size", "n_embd"), ("num_layers", "n_layer")):
        model_args[hf_name] = model_args.pop(bark_name)

    model = ModelClass(config=ConfigClass(**model_args))
    model.generation_config = GenerationConfigClass()

    state_dict = checkpoint["model"]

    # Strip the `_orig_mod.` prefix and map the layer names onto the HF implementation.
    # The keys are snapshotted first because the dict is modified inside the loop.
    unwanted_prefix = "_orig_mod."
    for old_key in list(state_dict):
        if not old_key.startswith(unwanted_prefix):
            continue
        renamed = old_key[len(unwanted_prefix) :]
        for old_layer_name, new_layer_name in new_layer_name_dict.items():
            renamed = renamed.replace(old_layer_name, new_layer_name)
        state_dict[renamed] = state_dict.pop(old_key)

    # Compare the key sets in both directions, ignoring the `.attn.bias` entries
    checkpoint_keys = set(state_dict.keys())
    model_keys = set(model.state_dict().keys())
    extra_keys = {key for key in checkpoint_keys - model_keys if not key.endswith(".attn.bias")}
    missing_keys = {key for key in model_keys - checkpoint_keys if not key.endswith(".attn.bias")}
    if extra_keys:
        raise ValueError(f"extra keys found: {extra_keys}")
    if missing_keys:
        raise ValueError(f"missing keys: {missing_keys}")

    # Non-strict so that the ignored `.attn.bias` entries do not make the load fail
    model.load_state_dict(state_dict, strict=False)

    n_params = model.num_parameters(exclude_embeddings=True)
    val_loss = checkpoint["best_val_loss"].item()
    logger.info(f"model loaded: {round(n_params/1e6,1)}M params, {round(val_loss,3)} loss")

    model.eval()
    model.to(device)

    # Drop the references to the loaded checkpoint data before returning
    del checkpoint, state_dict

    return model
# Convert one original Bark sub-model, check it against the original implementation and save it
def load_model(pytorch_dump_folder_path, use_small=False, model_type="text"):
    """
    Convert a single Bark sub-model checkpoint and save it in Hugging Face format.

    The converted model is compared with the model returned by the original `bark` loader: both must report the
    same parameter count and produce outputs of the same shape that differ by at most 1e-3 on a random input.

    Args:
        pytorch_dump_folder_path: Output directory; created (with missing parents) when it does not exist.
        use_small: Convert the small checkpoint variant instead of the large one.
        model_type: One of `"text"`, `"coarse"` or `"fine"`.

    Raises:
        NotImplementedError: If `model_type` is not supported.
        ValueError: If parameter counts, output shapes or output values of the two models disagree.
    """
    if model_type not in ("text", "coarse", "fine"):
        raise NotImplementedError(f"unsupported model_type {model_type!r}; expected 'text', 'coarse' or 'fine'")

    device = "cpu"  # do conversion on cpu

    ckpt_path = _get_ckpt_path(model_type, use_small=use_small)
    model = _load_model(ckpt_path, device, model_type=model_type, use_small=use_small)

    # Load the same checkpoint through the original bark implementation, on the same device
    bark_model = _bark_load_model(ckpt_path, device, model_type=model_type, use_small=use_small)

    # For the text model the original loader returns a dict; the network is stored under "model"
    if model_type == "text":
        bark_model = bark_model["model"]

    if model.num_parameters(exclude_embeddings=True) != bark_model.get_num_params():
        raise ValueError("initial and new models don't have the same number of parameters")

    # Compare the outputs of both implementations on a random input
    batch_size = 5
    sequence_length = 10

    if model_type in ["text", "coarse"]:
        vec = torch.randint(256, (batch_size, sequence_length), dtype=torch.int)
        output_old_model = bark_model(vec)[0]
        output_new_model_total = model(vec)
        # Keep only the logits of the last position before comparing with the original output
        output_new_model = output_new_model_total.logits[:, [-1], :]
    else:
        prediction_codebook_channel = 3
        n_codes_total = 8
        vec = torch.randint(256, (batch_size, sequence_length, n_codes_total), dtype=torch.int)
        output_new_model_total = model(prediction_codebook_channel, vec)
        output_old_model = bark_model(prediction_codebook_channel, vec)
        output_new_model = output_new_model_total.logits

    if output_new_model.shape != output_old_model.shape:
        raise ValueError("initial and new outputs don't have the same shape")
    if (output_new_model - output_old_model).abs().max().item() > 1e-3:
        raise ValueError("initial and new outputs are not equal")

    # `parents=True` so that a nested output path does not fail on a missing parent directory
    Path(pytorch_dump_folder_path).mkdir(parents=True, exist_ok=True)
    model.save_pretrained(pytorch_dump_folder_path)


# Assemble a full `BarkModel` from already converted sub-models, save it locally and push it to the Hub
def load_whole_bark_model(
    semantic_path,
    coarse_path,
    fine_path,
    append_text,
    hub_path,
    folder_path,
):
    """
    Build a `BarkModel` from three converted sub-models plus the `facebook/encodec_24khz` codec and publish it.

    Args:
        semantic_path: Directory of the converted semantic sub-model (must contain `config.json`).
        coarse_path: Directory of the converted coarse acoustics sub-model (must contain `config.json`).
        fine_path: Directory of the converted fine acoustics sub-model (must contain `config.json`).
        append_text: Sub-directory name appended to `folder_path` to form the output directory.
        hub_path: Passed as `repo_id` to `save_pretrained` when pushing to the Hub.
        folder_path: Parent of the output directory.
    """
    pytorch_dump_folder_path = os.path.join(folder_path, append_text)

    # Configurations of the three sub-models and of the codec
    semantic_config = BarkSemanticConfig.from_pretrained(os.path.join(semantic_path, "config.json"))
    coarse_acoustics_config = BarkCoarseConfig.from_pretrained(os.path.join(coarse_path, "config.json"))
    fine_acoustics_config = BarkFineConfig.from_pretrained(os.path.join(fine_path, "config.json"))
    codec_config = EncodecConfig.from_pretrained("facebook/encodec_24khz")

    # The sub-models and the codec themselves
    semantic = BarkSemanticModel.from_pretrained(semantic_path)
    coarse_acoustics = BarkCoarseModel.from_pretrained(coarse_path)
    fine_acoustics = BarkFineModel.from_pretrained(fine_path)
    codec = EncodecModel.from_pretrained("facebook/encodec_24khz")

    bark_config = BarkConfig.from_sub_model_configs(
        semantic_config, coarse_acoustics_config, fine_acoustics_config, codec_config
    )
    bark_generation_config = BarkGenerationConfig.from_sub_model_configs(
        semantic.generation_config, coarse_acoustics.generation_config, fine_acoustics.generation_config
    )

    # Create the composite model, then swap in the loaded sub-models
    bark = BarkModel(bark_config)

    bark.semantic = semantic
    bark.coarse_acoustics = coarse_acoustics
    bark.fine_acoustics = fine_acoustics
    bark.codec_model = codec

    bark.generation_config = bark_generation_config

    # The output path is nested (`folder_path/append_text`), so missing parents must be created as well
    Path(pytorch_dump_folder_path).mkdir(parents=True, exist_ok=True)

    bark.save_pretrained(pytorch_dump_folder_path, repo_id=hub_path, push_to_hub=True)
if __name__ == "__main__":
    # Command-line entry point: convert a single Bark sub-model checkpoint
    cli = argparse.ArgumentParser()

    # Positional arguments: the model type and the output directory
    cli.add_argument("model_type", type=str, help="text, coarse or fine.")
    cli.add_argument("pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.")

    # Optional flag selecting the small checkpoint variant
    cli.add_argument("--is_small", action="store_true", help="convert the small version instead of the large.")

    options = cli.parse_args()

    load_model(
        options.pytorch_dump_folder_path,
        use_small=options.is_small,
        model_type=options.model_type,
    )

`.\models\bark\generation_configuration_bark.py`

# coding=utf-8
# Copyright 2023 The Suno AI Authors and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" BARK model generation configuration"""

import copy
from typing import Dict

from ...generation.configuration_utils import GenerationConfig
from ...utils import logging

# 获取日志记录器对象
logger = logging.get_logger(__name__)

# Generation configuration whose `model_type` is "semantic"
class BarkSemanticGenerationConfig(GenerationConfig):
    """
    Generation configuration derived from `GenerationConfig`.

    `eos_token_id`, `renormalize_logits`, `max_new_tokens`, `output_scores`, `return_dict_in_generate`,
    `output_hidden_states`, `output_attentions`, `temperature`, `do_sample` and any extra `**kwargs` are forwarded
    to `GenerationConfig.__init__`. The remaining parameters (`text_encoding_offset`, `text_pad_token`,
    `semantic_infer_token`, `semantic_vocab_size`, `max_input_semantic_length`, `semantic_rate_hz`, `min_eos_p`) are
    stored unchanged as instance attributes of the same name.
    """

    # Identifier string of this generation configuration class
    model_type = "semantic"

    def __init__(
        self,
        eos_token_id=10_000,
        renormalize_logits=True,
        max_new_tokens=768,
        output_scores=False,
        return_dict_in_generate=False,
        output_hidden_states=False,
        output_attentions=False,
        temperature=1.0,
        do_sample=False,
        text_encoding_offset=10_048,
        text_pad_token=129_595,
        semantic_infer_token=129_599,
        semantic_vocab_size=10_000,
        max_input_semantic_length=256,
        semantic_rate_hz=49.9,
        min_eos_p=None,
        **kwargs,
    ):
        # Let the base class handle the standard generation options
        super().__init__(
            eos_token_id=eos_token_id,
            renormalize_logits=renormalize_logits,
            max_new_tokens=max_new_tokens,
            output_scores=output_scores,
            return_dict_in_generate=return_dict_in_generate,
            output_hidden_states=output_hidden_states,
            output_attentions=output_attentions,
            temperature=temperature,
            do_sample=do_sample,
            **kwargs,
        )
        # Settings specific to this class, stored as plain attributes
        self.text_encoding_offset = text_encoding_offset
        self.text_pad_token = text_pad_token
        self.semantic_infer_token = semantic_infer_token
        self.semantic_vocab_size = semantic_vocab_size
        self.max_input_semantic_length = max_input_semantic_length
        self.semantic_rate_hz = semantic_rate_hz
        self.min_eos_p = min_eos_p

# Generation configuration whose `model_type` is "coarse_acoustics"
class BarkCoarseGenerationConfig(GenerationConfig):
    """
    Generation configuration derived from `GenerationConfig`.

    `renormalize_logits`, `output_scores`, `return_dict_in_generate`, `output_hidden_states`, `output_attentions`,
    `temperature`, `do_sample` and any extra `**kwargs` are forwarded to `GenerationConfig.__init__`. The remaining
    parameters (`coarse_semantic_pad_token`, `coarse_rate_hz`, `n_coarse_codebooks`, `coarse_infer_token`,
    `max_coarse_input_length`, `max_coarse_history`, `sliding_window_len`) are stored unchanged as instance
    attributes of the same name.
    """

    # Identifier string of this generation configuration class
    model_type = "coarse_acoustics"

    def __init__(
        self,
        renormalize_logits=True,
        output_scores=False,
        return_dict_in_generate=False,
        output_hidden_states=False,
        output_attentions=False,
        temperature=1.0,
        do_sample=False,
        coarse_semantic_pad_token=12_048,
        coarse_rate_hz=75,
        n_coarse_codebooks=2,
        coarse_infer_token=12_050,
        max_coarse_input_length=256,
        max_coarse_history: int = 630,
        sliding_window_len: int = 60,
        **kwargs,
    ):
        # Let the base class handle the standard generation options
        super().__init__(
            renormalize_logits=renormalize_logits,
            output_scores=output_scores,
            return_dict_in_generate=return_dict_in_generate,
            output_hidden_states=output_hidden_states,
            output_attentions=output_attentions,
            temperature=temperature,
            do_sample=do_sample,
            **kwargs,
        )
        # Settings specific to this class, stored as plain attributes
        self.coarse_semantic_pad_token = coarse_semantic_pad_token
        self.coarse_rate_hz = coarse_rate_hz
        self.n_coarse_codebooks = n_coarse_codebooks
        self.coarse_infer_token = coarse_infer_token
        self.max_coarse_input_length = max_coarse_input_length
        self.max_coarse_history = max_coarse_history
        self.sliding_window_len = sliding_window_len

# Generation configuration whose `model_type` is "fine_acoustics"
class BarkFineGenerationConfig(GenerationConfig):
    # Identifier string of this generation configuration class
    model_type = "fine_acoustics"

    def __init__(
        self,
        temperature=1.0,
        max_fine_history_length=512,
        max_fine_input_length=1024,
        n_fine_codebooks=8,
        **kwargs,
    ):
        """
        Class that holds a generation configuration for `BarkFineModel`.

        `BarkFineModel` is an autoencoder model, so should not usually be used for generation. However, under the
        hood, it uses `temperature` when used by `BarkModel`.

        This configuration inherits from `GenerationConfig` and can be used to control the model generation. Read the
        documentation from `GenerationConfig` for more information.

        Args:
            temperature (`float`, *optional*):
                The value used to modulate the next token probabilities.
            max_fine_history_length (`int`, *optional*, defaults to 512):
                Max length of the fine history vector.
            max_fine_input_length (`int`, *optional*, defaults to 1024):
                Max length of fine input vector.
            n_fine_codebooks (`int`, *optional*, defaults to 8):
                Number of codebooks used.
            kwargs (*optional*):
                Accepted so that extra keyword arguments do not raise; they are not forwarded to the base class.
        """
        # Only `temperature` is handed to the base class: this configuration uses no other generation parameter
        super().__init__(temperature=temperature)

        # Settings specific to this class, stored as plain attributes
        self.max_fine_history_length = max_fine_history_length
        self.max_fine_input_length = max_fine_input_length
        self.n_fine_codebooks = n_fine_codebooks

    def validate(self, **kwargs):
        """
        Overrides GenerationConfig.validate because BarkFineGenerationConfig don't use any parameters outside
        temperature.
        """
        pass
# Composite generation configuration that nests the three Bark sub-model generation configurations
class BarkGenerationConfig(GenerationConfig):
    model_type = "bark"
    is_composition = True

    # TODO (joao): nested from_dict

    def __init__(
        self,
        semantic_config: Dict = None,
        coarse_acoustics_config: Dict = None,
        fine_acoustics_config: Dict = None,
        sample_rate=24_000,
        codebook_size=1024,
        **kwargs,
    ):
        """
        Class that holds a generation configuration for `BarkModel`.

        Args:
            semantic_config (`Dict`, *optional*):
                Keyword arguments used to build the nested `BarkSemanticGenerationConfig`. An empty dict (class
                defaults) is used when `None`.
            coarse_acoustics_config (`Dict`, *optional*):
                Keyword arguments used to build the nested `BarkCoarseGenerationConfig`. An empty dict (class
                defaults) is used when `None`.
            fine_acoustics_config (`Dict`, *optional*):
                Keyword arguments used to build the nested `BarkFineGenerationConfig`. An empty dict (class
                defaults) is used when `None`.
            sample_rate (`int`, *optional*, defaults to 24_000):
                Stored unchanged as `sample_rate`.
            codebook_size (`int`, *optional*, defaults to 1024):
                Stored unchanged as `codebook_size`.
            kwargs (*optional*):
                Accepted so that `from_sub_model_configs` can pass extra keyword arguments; not used here.
        """
        # NOTE(review): this initializer body was missing from the file and has been rebuilt on the pattern of
        # `BarkConfig.__init__`; `sample_rate`, `codebook_size` and the absence of a `super().__init__` call are
        # assumed to match the upstream implementation - confirm before relying on them.
        if semantic_config is None:
            semantic_config = {}
            logger.info("semantic_config is None. initializing the semantic model with default values.")

        if coarse_acoustics_config is None:
            coarse_acoustics_config = {}
            logger.info("coarse_acoustics_config is None. initializing the coarse model with default values.")

        if fine_acoustics_config is None:
            fine_acoustics_config = {}
            logger.info("fine_acoustics_config is None. initializing the fine model with default values.")

        # Turn the plain dicts into generation configuration objects; `to_dict` relies on them being objects
        self.semantic_config = BarkSemanticGenerationConfig(**semantic_config)
        self.coarse_acoustics_config = BarkCoarseGenerationConfig(**coarse_acoustics_config)
        self.fine_acoustics_config = BarkFineGenerationConfig(**fine_acoustics_config)

        self.sample_rate = sample_rate
        self.codebook_size = codebook_size

    @classmethod
    def from_sub_model_configs(
        cls,
        semantic_config: BarkSemanticGenerationConfig,
        coarse_acoustics_config: BarkCoarseGenerationConfig,
        fine_acoustics_config: BarkFineGenerationConfig,
        **kwargs,
    ):
        r"""
        Instantiate a [`BarkGenerationConfig`] (or a derived class) from bark sub-models generation configuration.

        Returns:
            [`BarkGenerationConfig`]: An instance of a configuration object
        """
        return cls(
            semantic_config=semantic_config.to_dict(),
            coarse_acoustics_config=coarse_acoustics_config.to_dict(),
            fine_acoustics_config=fine_acoustics_config.to_dict(),
            **kwargs,
        )

    def to_dict(self):
        """
        Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].

        Returns:
            `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
        """
        # Deep copy the internal dictionary representation of the object
        output = copy.deepcopy(self.__dict__)

        # Convert nested `semantic_config`, `coarse_acoustics_config`, and `fine_acoustics_config` objects to dictionaries
        output["semantic_config"] = self.semantic_config.to_dict()
        output["coarse_acoustics_config"] = self.coarse_acoustics_config.to_dict()
        output["fine_acoustics_config"] = self.fine_acoustics_config.to_dict()

        # Add the class-level attribute `model_type` to the output dictionary
        output["model_type"] = self.__class__.model_type
        return output

`.\models\bark\modeling_bark.py`

class BarkSelfAttention(nn.Module):
    """
    Multi-head self-attention layer, adapted from GPTNeoSelfAttention and the Bark code base.

    The layer attends over the whole sequence by default. With `is_causal=True`, a lower-triangular boolean mask of
    side `config.block_size` is registered as the `bias` buffer and applied to the attention scores.
    """

    def __init__(self, config, is_causal=False):
        super().__init__()

        # regularization: one dropout probability for both the attention weights and the residual branch
        self.dropout = config.dropout
        self.attn_dropout = nn.Dropout(config.dropout)
        self.resid_dropout = nn.Dropout(config.dropout)

        self.embed_dim = config.hidden_size
        self.num_heads = config.num_heads
        self.head_dim, leftover = divmod(self.embed_dim, self.num_heads)

        if leftover != 0:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )

        # a single linear layer yields query, key and value for all heads at once
        self.att_proj = nn.Linear(config.hidden_size, 3 * config.hidden_size, bias=config.bias)
        # output projection
        self.out_proj = nn.Linear(config.hidden_size, config.hidden_size, bias=config.bias)

        self.is_causal = is_causal
        if is_causal:
            size = config.block_size
            # lower-triangular mask, stored as a buffer so it follows the module across devices
            causal_mask = torch.tril(torch.ones((size, size), dtype=bool))
            self.register_buffer("bias", causal_mask.view(1, 1, size, size))

    # Adapted from transformers.models.gpt_neo.modeling_gpt_neo.GPTNeoSelfAttention._split_heads
    def _split_heads(self, tensor, num_heads, attn_head_size):
        """
        Splits hidden_size dim into attn_head_size and num_heads
        """
        leading_dims = tensor.size()[:-1]
        per_head = tensor.view(leading_dims + (num_heads, attn_head_size))
        # (batch, seq_length, head, head_features) -> (batch, head, seq_length, head_features)
        return per_head.permute(0, 2, 1, 3)

    def _merge_heads(self, tensor, num_heads, attn_head_size):
        """
        Merges attn_head_size dim and num_attn_heads dim into hidden_size
        """
        # (batch, num_heads, seq_len, attn_head_size) -> (batch, seq_len, num_heads, attn_head_size)
        swapped = tensor.transpose(1, 2).contiguous()
        # place the head outputs side by side: -> (batch, seq_len, num_heads*attn_head_size)
        merged_shape = swapped.size()[:-2] + (num_heads * attn_head_size,)
        return swapped.view(merged_shape)

    def _attn(self, query, key, value, attention_mask=None, head_mask=None):
        """
        Scaled dot-product attention.

        Returns:
            A tuple `(attn_output, attn_weights)`; the weights are returned after dropout and head masking.
        """
        scale = 1.0 / math.sqrt(self.head_dim)
        scores = torch.matmul(query, key.transpose(-1, -2)) * scale

        if self.is_causal:
            q_len = query.size(-2)
            k_len = key.size(-2)
            # rows of the mask are offset by the number of key positions that precede the queries
            blocked = self.bias[:, :, k_len - q_len : k_len, :k_len] == 0
            # blocked positions get the most negative finite value of the dtype
            scores = scores.masked_fill(blocked, torch.finfo(scores.dtype).min)

        if attention_mask is not None:
            # the mask is added to the raw scores
            scores = scores + attention_mask

        probs = nn.functional.softmax(scores, dim=-1).to(value.dtype)
        probs = self.attn_dropout(probs)

        if head_mask is not None:
            probs = probs * head_mask

        return torch.matmul(probs, value), probs

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        past_key_values=None,
        head_mask=None,
        use_cache=False,
        output_attentions=False,
    ):
        """
        Run self-attention over `hidden_states`.

        Returns:
            `(attn_output, present)` followed by the attention weights when `output_attentions` is truthy.
            `present` is the `(key, value)` pair including any cached past when `use_cache is True`, else `None`.
        """
        # project once, then cut the last dimension into query / key / value
        projected = self.att_proj(hidden_states)
        query, key, value = (
            self._split_heads(chunk, self.num_heads, self.head_dim)
            for chunk in projected.split(self.embed_dim, dim=2)
        )

        if past_key_values is not None:
            # prepend the cached keys / values along the sequence axis
            cached_key, cached_value = past_key_values[0], past_key_values[1]
            key = torch.cat((cached_key, key), dim=-2)
            value = torch.cat((cached_value, value), dim=-2)

        present = (key, value) if use_cache is True else None

        context, attn_weights = self._attn(query, key, value, attention_mask, head_mask)

        context = self._merge_heads(context, self.num_heads, self.head_dim)
        context = self.resid_dropout(self.out_proj(context))

        result = (context, present)
        if output_attentions:
            result = result + (attn_weights,)

        return result
    class BarkSelfFlashAttention2(BarkSelfAttention):
        """
        Bark flash attention module. This module inherits from `BarkSelfAttention` as the weights of the module stays
        untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
        flash attention and deal with padding tokens in case the input contains any of them.
        """

        # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__
        def __init__(self, *args, **kwargs):
            # 调用父类的初始化函数
            super().__init__(*args, **kwargs)

            # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
            # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
            # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
            # 设置一个标志来表示是否使用顶部左对齐的掩码
            self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()

        def _split_heads(self, tensor, num_heads, attn_head_size):
            """
            Splits hidden_size dim into attn_head_size and num_heads
            """
            # 重新调整张量的形状，将隐藏尺寸分成头数和注意力头大小
            new_shape = tensor.size()[:-1] + (num_heads, attn_head_size)
            tensor = tensor.view(new_shape)
            # Flash attention 要求输入具有以下形状
            # batch_size x seq_length x head_dim x hidden_dim - (batch, seq_length, head, head_features)
            return tensor

        def _merge_heads(self, tensor, num_heads, attn_head_size):
            """
            Merges attn_head_size dim and num_attn_heads dim into hidden_size
            """
            # 重新组合所有头部的输出并排放在一起
            # (batch, seq_len, num_heads, attn_head_size) -> (batch, seq_len, num_heads*attn_head_size)
            tensor = tensor.view(tensor.size()[:-2] + (num_heads * attn_head_size,))
            return tensor

        def forward(
            self,
            hidden_states,
            attention_mask=None,
            past_key_values=None,
            head_mask=None,
            use_cache=False,
            output_attentions=False,
        ):
            # 获取隐藏状态张量的批量大小、查询长度和最后一个维度的大小
            batch_size, query_len, _ = hidden_states.size()

            # 使用注意力投影层分别计算查询、键、值，并按照 embed_dim 维度进行切分
            query, key, value = self.att_proj(hidden_states).split(self.embed_dim, dim=2)

            # 将切分后的查询、键、值张量按照 num_heads 和 head_dim 进行重新组合
            query = self._split_heads(query, self.num_heads, self.head_dim)
            key = self._split_heads(key, self.num_heads, self.head_dim)
            value = self._split_heads(value, self.num_heads, self.head_dim)

            if past_key_values is not None:
                # 如果过去的键值对不为 None，则进行维度转置操作
                # (batch, head, seq_length, head_features) -> (batch, seq_length, head, head_features)
                past_key = past_key_values[0].transpose(1, 2)
                past_value = past_key_values[1].transpose(1, 2)
                # 在 seq_length 维度上合并当前和过去的键值对
                key = torch.cat((past_key, key), dim=1)
                value = torch.cat((past_value, value), dim=1)

            if use_cache is True:
                # 如果使用缓存，则将键和值张量的 head 维度与 seq_length 维度交换位置
                # (batch, head, seq_length, head_features)
                present = (key.transpose(1, 2), value.transpose(1, 2))
            else:
                present = None

            # 执行闪电注意力机制的前向传播，得到注意力输出
            attn_output = self._flash_attention_forward(query, key, value, attention_mask, query_len, dropout=self.dropout)

            # 将多头注意力的输出张量按 head 维度进行合并
            attn_output = self._merge_heads(attn_output, self.num_heads, self.head_dim)
            # 使用输出投影层处理合并后的注意力输出
            attn_output = self.out_proj(attn_output)
            # 对输出应用残差连接的 dropout
            attn_output = self.resid_dropout(attn_output)

            # 组装最终的输出元组
            outputs = (attn_output, present)
            if output_attentions:
                # 如果需要输出注意力权重，则设置 attn_weights 为 None 并加入到 outputs 中
                attn_weights = None
                outputs += (attn_weights,)

            return outputs

        # 从 transformers.models.llama.modeling_llama.LlamaFlashAttention2._flash_attention_forward 复制而来
        def _flash_attention_forward(
            self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None
    ):
        """
        Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
        first unpad the input, then computes the attention scores and pad the final attention scores.

        Args:
            query_states (`torch.Tensor`):
                Input query states to be passed to Flash Attention API
            key_states (`torch.Tensor`):
                Input key states to be passed to Flash Attention API
            value_states (`torch.Tensor`):
                Input value states to be passed to Flash Attention API
            attention_mask (`torch.Tensor`):
                The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
                position of padding tokens and 1 for the position of non-padding tokens.
            dropout (`float`):
                Attention dropout
            softmax_scale (`float`, *optional*):
                The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
        """
        # 如果未使用 Flash Attention 中的 top-left mask，则确定是否是因果关系（causal）
        if not self._flash_attn_uses_top_left_mask:
            causal = self.is_causal
        else:
            # TODO: 一旦 Flash Attention for RoCm 升级到 2.1 版本，可以移除 `query_length != 1` 的检查。详细信息请参考 LlamaFlashAttention2 __init__ 中的注释。
            # 否则，需要同时满足因果关系（causal）和 query_length 不等于 1
            causal = self.is_causal and query_length != 1

        # 如果存在至少一个填充标记的情况
        if attention_mask is not None:
            # 获取批次大小
            batch_size = query_states.shape[0]
            # 调用 _upad_input 方法，用于处理输入数据的填充问题
            query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
                query_states, key_states, value_states, attention_mask, query_length
            )

            # 获取当前序列长度
            cu_seqlens_q, cu_seqlens_k = cu_seq_lens
            # 获取批次中最大序列长度
            max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens

            # 调用 flash_attn_varlen_func 方法计算注意力输出（未经填充）
            attn_output_unpad = flash_attn_varlen_func(
                query_states,
                key_states,
                value_states,
                cu_seqlens_q=cu_seqlens_q,
                cu_seqlens_k=cu_seqlens_k,
                max_seqlen_q=max_seqlen_in_batch_q,
                max_seqlen_k=max_seqlen_in_batch_k,
                dropout_p=dropout,
                softmax_scale=softmax_scale,
                causal=causal,
            )

            # 调用 pad_input 方法对注意力输出进行填充处理
            attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
        else:
            # 如果不存在填充标记，则直接调用 flash_attn_func 方法计算注意力输出
            attn_output = flash_attn_func(
                query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal
            )

        # 返回注意力输出
        return attn_output

    # 从 transformers.models.llama.modeling_llama.LlamaFlashAttention2._upad_input 复制而来
    # 定义一个私有方法，用于处理注意力机制的输入数据，根据输入的参数进行数据处理和重组
    def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
        # 调用 _get_unpad_data 函数获取未填充数据的索引、当前序列长度及批次内最大序列长度信息
        indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
        
        # 获取 key_layer 的形状信息，包括批次大小、键-值序列长度、键值头数和头维度
        batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape

        # 根据 indices_k 重新索引 key_layer，重塑形状以便与未填充数据匹配
        key_layer = index_first_axis(
            key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
        )

        # 根据 indices_k 重新索引 value_layer，重塑形状以便与未填充数据匹配
        value_layer = index_first_axis(
            value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
        )

        # 根据 query_length 的不同情况处理 query_layer
        if query_length == kv_seq_len:
            # 如果 query_length 等于键-值序列长度，则根据 indices_k 重新索引 query_layer
            query_layer = index_first_axis(
                query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k
            )
            cu_seqlens_q = cu_seqlens_k
            max_seqlen_in_batch_q = max_seqlen_in_batch_k
            indices_q = indices_k
        elif query_length == 1:
            # 如果 query_length 等于 1，则使用简化的处理方式
            max_seqlen_in_batch_q = 1
            cu_seqlens_q = torch.arange(
                batch_size + 1, dtype=torch.int32, device=query_layer.device
            )  # 这里有一个内存拷贝，效率较差。
            indices_q = cu_seqlens_q[:-1]
            query_layer = query_layer.squeeze(1)
        else:
            # 否则，假设存在左填充，根据 query_length 和 attention_mask 处理 query_layer
            attention_mask = attention_mask[:, -query_length:]
            query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)

        # 返回处理后的 query_layer、key_layer、value_layer、查询索引 indices_q、
        # 以及 cu_seqlens 和 max_seqlen_in_batch 的元组
        return (
            query_layer,
            key_layer,
            value_layer,
            indices_q,
            (cu_seqlens_q, cu_seqlens_k),
            (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
        )
# Maps an attention implementation name to the attention class that implements it.
BARK_ATTENTION_CLASSES = dict(
    eager=BarkSelfAttention,
    flash_attention_2=BarkSelfFlashAttention2,
)


class BarkLayerNorm(nn.Module):
    """LayerNorm but with an optional bias. PyTorch doesn't support simply bias=False."""

    def __init__(self, hidden_size, bias=True):
        super().__init__()
        # learnable scale, initialised to ones
        self.weight = nn.Parameter(torch.ones(hidden_size))
        # learnable shift initialised to zeros, or no shift at all when `bias` is falsy
        if bias:
            self.bias = nn.Parameter(torch.zeros(hidden_size))
        else:
            self.bias = None

    def forward(self, input):
        # normalise over the trailing dimension(s) described by the weight's shape
        normalized_shape = self.weight.shape
        return F.layer_norm(input, normalized_shape, self.weight, self.bias, eps=1e-5)


class BarkMLP(nn.Module):
    """Feed-forward block: linear expansion to 4x the hidden size, GELU, linear projection back, dropout."""

    def __init__(self, config):
        super().__init__()
        width = config.hidden_size
        expanded = 4 * config.hidden_size
        # both projections use a bias only when `config.bias` says so
        self.in_proj = nn.Linear(width, expanded, bias=config.bias)
        self.out_proj = nn.Linear(expanded, width, bias=config.bias)
        self.dropout = nn.Dropout(config.dropout)
        self.gelu = nn.GELU()

    def forward(self, hidden_states):
        activated = self.gelu(self.in_proj(hidden_states))
        return self.dropout(self.out_proj(activated))


class BarkBlock(nn.Module):
    """
    One transformer layer: layer norm -> self-attention -> residual add, then layer norm -> MLP -> residual add.
    """

    def __init__(self, config, is_causal=False):
        super().__init__()

        if is_causal:
            # if causal, uses handmade LayerNorm, so that the layerNorm bias is optional
            # this handmade layerNorm is used to stick with Bark choice of leaving optional bias in
            # AutoRegressive models (corresponding to the "Text" and the "Coarse" modules)
            self.layernorm_1 = BarkLayerNorm(config.hidden_size, bias=config.bias)
            self.layernorm_2 = BarkLayerNorm(config.hidden_size, bias=config.bias)
        else:
            self.layernorm_1 = nn.LayerNorm(config.hidden_size)
            self.layernorm_2 = nn.LayerNorm(config.hidden_size)

        # the attention class is selected by the configured attention implementation
        self.attn = BARK_ATTENTION_CLASSES[config._attn_implementation](config, is_causal=is_causal)

        self.mlp = BarkMLP(config)

    def forward(
        self,
        hidden_states,
        past_key_values=None,
        attention_mask=None,
        head_mask=None,
        use_cache=False,
        output_attentions=False,
    ):
        """
        Apply the layer to `hidden_states`.

        Returns:
            A tuple starting with the new hidden states. When `use_cache` is truthy it is followed by the attention
            module's present key/values; the attention module's remaining outputs (the attention weights, if
            requested) come last.
        """
        intermediary_hidden_states = self.layernorm_1(hidden_states)

        attn_outputs = self.attn(
            intermediary_hidden_states,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            head_mask=head_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
        )

        attn_output = attn_outputs[0]  # output_attn: output, present_key_values, (attn_weights)
        outputs = attn_outputs[1:]

        # first residual connection, around the attention
        intermediary_hidden_states = hidden_states + attn_output
        # second residual connection, around the MLP
        intermediary_hidden_states = intermediary_hidden_states + self.mlp(
            self.layernorm_2(intermediary_hidden_states)
        )

        if use_cache:
            outputs = (intermediary_hidden_states,) + outputs
        else:
            # drop the (unused) present key/values slot
            outputs = (intermediary_hidden_states,) + outputs[1:]

        return outputs  # hidden_states, ((present), attentions)
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""

class BarkPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = BarkConfig
    supports_gradient_checkpointing = False
    _supports_flash_attn_2 = True

    def _init_weights(self, module):
        """Initialize the weights."""
        if isinstance(module, (nn.Linear, nn.Embedding)):
            # both layer types draw their weights from N(0, initializer_range)
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if isinstance(module, nn.Linear):
                if module.bias is not None:
                    module.bias.data.zero_()
            elif module.padding_idx is not None:
                # the embedding row of the padding index is zeroed
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def __init__(self, *inputs, **kwargs):
        super().__init__(*inputs, **kwargs)

    @property
    def device(self) -> torch.device:
        """
        `torch.device`: The device on which the module is (assuming that all the module parameters are on the same
        device).
        """
        # When this model carries a `_hf_hook`, report the first non-empty `execution_device` found on any
        # submodule's hook; in every other case fall back to the device of the parameters.
        if hasattr(self, "_hf_hook"):
            for submodule in self.modules():
                hook = getattr(submodule, "_hf_hook", None)
                execution_device = getattr(hook, "execution_device", None)
                if execution_device is not None:
                    return torch.device(execution_device)

        return get_parameter_device(self)
# Shared introduction for the Bark sub-model docstrings. The `{config}` placeholder is filled in with
# the name of the sub-model's configuration class through `str.format`.
BARK_MODEL_START_DOCSTRING = """
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)

This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.

Parameters:
    config ([`{config}`]):
        Model configuration class with all the parameters of the model. Initializing with a config file does not
        load the weights associated with the model, only the configuration. Check out the
        [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
BARK_MODEL_START_DOCSTRING = BARK_MODEL_START_DOCSTRING.strip()

# Same introduction with the configuration class fixed to `BarkConfig`.
BARK_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`BarkConfig`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
# Argument documentation for the Bark fine model inputs, kept as a raw string.
# NOTE(review): the place where this constant is consumed is outside this excerpt.
BARK_FINE_INPUTS_DOCSTRING = r"""
    Args:
        codebook_idx (`int`):
            Index of the codebook that will be predicted.
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length, number_of_codebooks)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it. Initially, indices of the first two codebooks are obtained from the `coarse` sub-model. The rest is
            predicted recursively by attending the previously predicted channels. The model predicts on windows of
            length 1024.
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): NOT IMPLEMENTED YET.
        input_embeds (`torch.FloatTensor` of shape `(batch_size, input_sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. If
            `past_key_values` is used, optionally only the last `input_embeds` have to be input (see
            `past_key_values`). This is useful if you want more control over how to convert `input_ids` indices into
            associated vectors than the model's internal embedding lookup matrix.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""

# Argument documentation passed to `add_start_docstrings_to_model_forward` on `BarkCausalModel.forward`.
# NOTE(review): the string only contains a newline in this copy of the file; its content appears to
# have been elided and should be restored from the upstream source.
BARK_CAUSAL_MODEL_INPUTS_DOCSTRING = r"""
"""

# GPT2-like autoregressive model
class BarkCausalModel(BarkPreTrainedModel):
    """
    GPT2-like autoregressive model: token and position embeddings, a stack of causal `BarkBlock` layers, a final
    layer norm and a bias-free linear language-modeling head.
    """

    config_class = BarkSubModelConfig

    def __init__(self, config):
        super().__init__(config)
        self.config = config

        # initialize as an autoregressive GPT-like model
        self.input_embeds_layer = nn.Embedding(config.input_vocab_size, config.hidden_size)
        self.position_embeds_layer = nn.Embedding(config.block_size, config.hidden_size)

        self.drop = nn.Dropout(config.dropout)

        self.layers = nn.ModuleList([BarkBlock(config, is_causal=True) for _ in range(config.num_layers)])
        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"

        self.layernorm_final = BarkLayerNorm(config.hidden_size, bias=config.bias)

        self.lm_head = nn.Linear(config.hidden_size, config.output_vocab_size, bias=False)
        self.gradient_checkpointing = False

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """Return the token embedding layer."""
        return self.input_embeds_layer

    def set_input_embeddings(self, new_embeddings):
        """Replace the token embedding layer with `new_embeddings`."""
        self.input_embeds_layer = new_embeddings

    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwargs):
        """
        Build the keyword arguments for one generation step.

        Reads `input_embeds`, `attention_mask`, `position_ids` and `use_cache` from `kwargs`.

        Returns:
            A dict with `input_ids`, `past_key_values`, `use_cache`, `position_ids` and `attention_mask`. When
            `input_embeds` is still in use (no cache yet and `use_cache` truthy), `input_ids` is `None` and
            `input_embeds` is included as well.
        """
        input_embeds = kwargs.get("input_embeds", None)

        attention_mask = kwargs.get("attention_mask", None)
        position_ids = kwargs.get("position_ids", None)

        if past_key_values is not None:
            # Omit tokens covered by past_key_values
            seq_len = input_ids.shape[1]
            past_length = past_key_values[0][0].shape[2]

            # Some generation methods already pass only the last input ID
            if input_ids.shape[1] > past_length:
                remove_prefix_length = past_length
            else:
                # Default to old behavior: keep only final ID
                remove_prefix_length = input_ids.shape[1] - 1

            input_ids = input_ids[:, remove_prefix_length:]

            # input_embeds have already been used and is not required anymore
            input_embeds = None
        else:
            if input_embeds is not None and kwargs.get("use_cache"):
                seq_len = input_embeds.shape[1]
            else:
                seq_len = input_ids.shape[1]

        # ensure that attention_mask and position_ids shapes are aligned with the weird Bark hack of reducing
        # sequence length on the first forward pass
        if attention_mask is not None:
            attention_mask = attention_mask[:, :seq_len]
        if position_ids is not None:
            position_ids = position_ids[:, :seq_len]

        if attention_mask is not None and position_ids is None:
            # create position_ids on the fly for batch generation
            position_ids = attention_mask.long().cumsum(-1) - 1
            position_ids.masked_fill_(attention_mask == 0, 1)
            if past_key_values:
                position_ids = position_ids[:, -input_ids.shape[1] :]
        else:
            position_ids = None

        if input_embeds is not None and kwargs.get("use_cache"):
            return {
                "input_ids": None,
                "input_embeds": input_embeds,
                "past_key_values": past_key_values,
                "use_cache": kwargs.get("use_cache"),
                "position_ids": position_ids,
                "attention_mask": attention_mask,
            }
        return {
            "input_ids": input_ids,
            "past_key_values": past_key_values,
            "use_cache": kwargs.get("use_cache"),
            "position_ids": position_ids,
            "attention_mask": attention_mask,
        }

    @add_start_docstrings_to_model_forward(BARK_CAUSAL_MODEL_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        past_key_values: Optional[Tuple[torch.FloatTensor]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.LongTensor] = None,
        input_embeds: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        # NOTE(review): the body of `forward` is missing from this copy of the file (only the signature
        # survived, so the method used to return `None` silently). Fail loudly until the real
        # implementation is restored from the upstream `modeling_bark.py`.
        raise NotImplementedError(
            "BarkCausalModel.forward: the forward-pass implementation is missing from this copy of the file."
        )

    @staticmethod
    def _reorder_cache(
        past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor
    ) -> Tuple[Tuple[torch.Tensor]]:
        """
        This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or
        [`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct
        beam_idx at every generation step.
        """
        # Necessary for beam_search: select the rows named by `beam_idx` along dim 0 of every cached tensor,
        # moving the index to each tensor's device first
        return tuple(
            tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past)
            for layer_past in past_key_values
        )
# 定义 BarkSemanticModel 类，继承自 BarkCausalModel 类
@add_start_docstrings(
    """Bark semantic (or text) model. It shares the same architecture as the coarse model.
    It is a GPT-2 like autoregressive model with a language modeling head on top.""",
    BARK_MODEL_START_DOCSTRING.format(config="BarkSemanticConfig"),
)
class BarkSemanticModel(BarkCausalModel):
    # Prefix under which this sub-model is registered: "semantic".
    base_model_prefix = "semantic"
    # Configuration class associated with this model.
    config_class = BarkSemanticConfig

    # Generation entry point.
    # NOTE(review): the signature is cut off after `**kwargs,` in this excerpt (no closing parenthesis, no
    # body), so nothing about its behaviour can be stated from here.
    def generate(
        self,
        input_ids: torch.Tensor,
        semantic_generation_config: BarkSemanticGenerationConfig = None,
        history_prompt: Optional[Dict[str, torch.Tensor]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        **kwargs,



# 定义 BarkCoarseModel 类，继承自 BarkCausalModel 类
@add_start_docstrings(
    """Bark coarse acoustics model.
    It shares the same architecture as the semantic (or text) model. It is a GPT-2 like autoregressive model with a
    language modeling head on top.""",
    BARK_MODEL_START_DOCSTRING.format(config="BarkCoarseConfig"),
)
class BarkCoarseModel(BarkCausalModel):
    # Prefix under which this sub-model is registered: "coarse_acoustics".
    base_model_prefix = "coarse_acoustics"
    # Configuration class associated with this model.
    config_class = BarkCoarseConfig

    # History pre-processing helper.
    # NOTE(review): the signature is cut off in this excerpt (no closing parenthesis, no body).
    # NOTE(review): `semantic_generation_config` is annotated `int` here while the `generate` methods annotate
    # the same name as `BarkSemanticGenerationConfig` — confirm which is intended.
    def preprocess_histories(
        self,
        max_coarse_history: int,
        semantic_to_coarse_ratio: int,
        batch_size: int,
        semantic_generation_config: int,
        codebook_size: int,
        history_prompt: Optional[Dict[str, torch.Tensor]] = None,
    # Generation entry point.
    # NOTE(review): the signature is cut off after `**kwargs,` in this excerpt (no closing parenthesis, no
    # body).
    def generate(
        self,
        semantic_output: torch.Tensor,
        semantic_generation_config: BarkSemanticGenerationConfig = None,
        coarse_generation_config: BarkCoarseGenerationConfig = None,
        codebook_size: int = 1024,
        history_prompt: Optional[Dict[str, torch.Tensor]] = None,
        return_output_lengths: Optional[bool] = None,
        **kwargs,



# 定义 BarkFineModel 类，继承自 BarkPreTrainedModel 类
@add_start_docstrings(
    """Bark fine acoustics model. It is a non-causal GPT-like model with `config.n_codes_total` embedding layers and
    language modeling heads, one for each codebook.""",
    BARK_MODEL_START_DOCSTRING.format(config="BarkFineConfig"),
)
class BarkFineModel(BarkPreTrainedModel):
    # 指定模型的前缀名称为 'fine_acoustics'
    base_model_prefix = "fine_acoustics"
    # 指定配置类为 BarkFineConfig
    config_class = BarkFineConfig
    # 主输入名称为 'codebook_idx'
    main_input_name = "codebook_idx"
    def __init__(self, config):
        """
        Assemble the fine-acoustics network from `config`.

        The network is built from blocks created with `is_causal=False`, one token-embedding table per codebook
        (`config.n_codes_total` of them) and one bias-free output projection for every codebook index in
        `range(config.n_codes_given, config.n_codes_total)`.

        Args:
            config: configuration object providing `input_vocab_size`, `output_vocab_size`, `hidden_size`,
                `block_size`, `dropout`, `num_layers`, `n_codes_given`, `n_codes_total` and
                `_attn_implementation`.
        """
        super().__init__(config)
        self.config = config

        hidden_size = config.hidden_size
        total_codebooks = config.n_codes_total

        # One token-embedding table per codebook.
        self.input_embeds_layers = nn.ModuleList(
            [nn.Embedding(config.input_vocab_size, hidden_size) for _codebook in range(total_codebooks)]
        )

        # A single position table with `config.block_size` entries.
        self.position_embeds_layer = nn.Embedding(config.block_size, hidden_size)

        self.drop = nn.Dropout(config.dropout)

        # Stack of blocks, all created with is_causal=False.
        self.layers = nn.ModuleList([BarkBlock(config, is_causal=False) for _layer in range(config.num_layers)])

        # Remember whether the configuration selected the "flash_attention_2" implementation.
        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"

        self.layernorm_final = nn.LayerNorm(hidden_size)

        # Output projections exist only for the codebooks after the first `n_codes_given`.
        predicted_codebooks = range(config.n_codes_given, total_codebooks)
        self.lm_heads = nn.ModuleList(
            [nn.Linear(hidden_size, config.output_vocab_size, bias=False) for _codebook in predicted_codebooks]
        )

        self.gradient_checkpointing = False
        self.n_codes_total = total_codebooks

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """Return the container of input embedding layers (one entry per codebook)."""
        embedding_tables = self.input_embeds_layers
        return embedding_tables

    def set_input_embeddings(self, new_embeddings):
        """Replace the whole container of input embedding layers (one entry per codebook)."""
        self.input_embeds_layers = new_embeddings

    def get_output_embeddings(self):
        """Return the container of output heads (one `lm_head` per predicted codebook)."""
        heads = self.lm_heads
        return heads

    def set_output_embeddings(self, new_output_embeddings):
        """Replace the whole container of output heads (one `lm_head` per predicted codebook)."""
        self.lm_heads = new_output_embeddings

    def _resize_token_embeddings(self, new_num_tokens, pad_to_multiple_of=None):
        old_embeddings_list = self.get_input_embeddings()
        # 获取当前的输入嵌入层列表

        new_embeddings_list = nn.ModuleList(
            [
                self._get_resized_embeddings(old_embeddings, new_num_tokens, pad_to_multiple_of)
                for old_embeddings in old_embeddings_list
            ]
        )
        # 生成新的调整大小后的输入嵌入层列表

        self.set_input_embeddings(new_embeddings_list)
        # 设置模型的新输入嵌入层列表

        new_num_tokens = new_embeddings_list[0].weight.shape[0]
        # 更新新的嵌入层中的标记数量

        # if word embeddings are not tied, make sure that lm head is resized as well
        # 如果单词嵌入未绑定，则确保也调整lm头的大小
        if self.get_output_embeddings() is not None and not self.config.tie_word_embeddings:
            old_lm_head_list = self.get_output_embeddings()
            # 获取当前的语言模型头列表

            new_lm_head_list = nn.ModuleList(
                [self._get_resized_lm_head(old_lm_head, new_num_tokens) for old_lm_head in old_lm_head_list]
            )
            # 生成新的调整大小后的语言模型头列表

            self.set_output_embeddings(new_lm_head_list)
            # 设置模型的新语言模型头列表

        return self.get_input_embeddings()
    def resize_token_embeddings(
        self, new_num_tokens: Optional[int] = None, pad_to_multiple_of: Optional[int] = None
    ) -> nn.Embedding:
        """
        Resize the input token embeddings of the model and record the resulting vocabulary size.

        The actual resizing is delegated to `_resize_token_embeddings`. When at least one argument is given, the
        row count of the first returned layer is written to `output_vocab_size` and `vocab_size` on both
        `self.config` and `self`, and `tie_weights()` is called afterwards.

        Arguments:
            new_num_tokens (`int`, *optional*):
                Requested number of tokens in the embedding matrices, forwarded to `_resize_token_embeddings`.
            pad_to_multiple_of (`int`, *optional*):
                Forwarded to `_resize_token_embeddings`.

        Return:
            The object returned by `_resize_token_embeddings`. It is indexed with `[0]` here, i.e. it is treated as
            a sequence of embedding layers rather than a single layer.
        """
        model_embeds = self._resize_token_embeddings(new_num_tokens, pad_to_multiple_of)

        nothing_requested = new_num_tokens is None and pad_to_multiple_of is None
        if nothing_requested:
            return model_embeds

        # Propagate the effective size: first to the configuration, then to the model object itself.
        resized_vocab = model_embeds[0].weight.shape[0]
        for owner in (self.config, self):
            owner.output_vocab_size = resized_vocab
            owner.vocab_size = resized_vocab

        # Tie weights again if needed
        self.tie_weights()

        return model_embeds
    def tie_weights(self):
        """
        Tie (or clone) the weights between the output heads and the input embedding layers.

        When `config.tie_word_embeddings` is truthy (it defaults to True when the attribute is missing), output
        head `i` is paired with input embedding layer `i + 1` through `_tie_or_clone_weights`, for
        `i` in `range(config.n_codes_total - config.n_codes_given)`, and the key `"lm_heads.{i}.weight"` is
        recorded in `self._tied_weights_keys`.

        Independently of that flag, every sub-module exposing a `_tie_weights` method has it called.
        """
        if getattr(self.config, "tie_word_embeddings", True):
            # Start from a fresh list so repeated calls do not accumulate duplicate keys.
            self._tied_weights_keys = []
            output_embeddings = self.get_output_embeddings()
            input_embeddings = self.get_input_embeddings()

            for i in range(self.config.n_codes_total - self.config.n_codes_given):
                # Head i is paired with embedding layer i + 1.
                # NOTE(review): per the original annotation, `_tie_or_clone_weights` clones instead of sharing
                # when the torchscript flag is set; that helper is not visible here.
                self._tie_or_clone_weights(output_embeddings[i], input_embeddings[i + 1])
                self._tied_weights_keys.append(f"lm_heads.{i}.weight")

        # Let every sub-module that defines its own `_tie_weights` run it as well.
        for module in self.modules():
            if hasattr(module, "_tie_weights"):
                module._tie_weights()



    # 生成模型的前向传播方法，接受一系列输入和可选的配置参数作为输入。

    @add_start_docstrings_to_model_forward(BARK_FINE_INPUTS_DOCSTRING)
    def forward(
        self,
        codebook_idx: int,  # index of the codebook to predict (per the original annotation)
        input_ids: Optional[torch.Tensor] = None,         # optional tensor of token ids
        attention_mask: Optional[torch.Tensor] = None,    # optional attention mask tensor
        position_ids: Optional[torch.Tensor] = None,      # optional position id tensor
        head_mask: Optional[torch.Tensor] = None,         # optional head mask tensor
        labels: Optional[torch.LongTensor] = None,        # optional label tensor
        input_embeds: Optional[torch.Tensor] = None,      # optional embedded inputs
        output_attentions: Optional[bool] = None,         # optional flag: return attention weights
        output_hidden_states: Optional[bool] = None,      # optional flag: return hidden states
        return_dict: Optional[bool] = None,               # optional flag: return a dict-style result
    ):
        # NOTE(review): the method body is not present in this excerpt.



    # 生成模型的生成方法，接受粗粒度输出、语义生成配置、粗粒度生成配置、细粒度生成配置、代码本尺寸等参数以及历史提示的可选字典输入。

    def generate(
        self,
        coarse_output: torch.Tensor,                            # tensor holding the coarse-stage output
        semantic_generation_config: BarkSemanticGenerationConfig = None,  # semantic generation settings
        coarse_generation_config: BarkCoarseGenerationConfig = None,      # coarse generation settings
        fine_generation_config: BarkFineGenerationConfig = None,          # fine generation settings
        codebook_size: int = 1024,                           # codebook size, 1024 by default
        history_prompt: Optional[Dict[str, torch.Tensor]] = None,  # optional dict of history prompt tensors
        **kwargs,                                            # any further keyword arguments
    ):
        # NOTE(review): the method body is not present in this excerpt.
"""
The full Bark model, a text-to-speech model composed of 4 sub-models:
- `BarkSemanticModel` (also referred to as the 'text' model): a causal auto-regressive transformer model that
  takes as input tokenized text, and predicts semantic text tokens that capture the meaning of the text.
- `BarkCoarseModel` (also referred to as the 'coarse acoustics' model), also a causal autoregressive transformer,
  that takes as input the results of the last model. It aims at regressing the first two audio codebooks necessary
  to `encodec`.
- `BarkFineModel` (the 'fine acoustics' model), this time a non-causal autoencoder transformer, which iteratively
  predicts the last codebooks based on the sum of the previous codebooks embeddings.
- After having predicted all the codebook channels from the `EncodecModel`, Bark uses it to decode the output audio
  array.

It should be noted that each of the first three modules can support conditional speaker embeddings to condition the
output sound according to a specific predefined voice.
"""
@add_start_docstrings(
    """
    The full Bark model, a text-to-speech model composed of 4 sub-models:
    - [`BarkSemanticModel`] (also referred to as the 'text' model): a causal auto-regressive transformer model that
      takes
    as input tokenized text, and predicts semantic text tokens that capture the meaning of the text.
    - [`BarkCoarseModel`] (also refered to as the 'coarse acoustics' model), also a causal autoregressive transformer,
    that takes into input the results of the last model. It aims at regressing the first two audio codebooks necessary
    to `encodec`.
    - [`BarkFineModel`] (the 'fine acoustics' model), this time a non-causal autoencoder transformer, which iteratively
    predicts the last codebooks based on the sum of the previous codebooks embeddings.
    - having predicted all the codebook channels from the [`EncodecModel`], Bark uses it to decode the output audio
      array.

    It should be noted that each of the first three modules can support conditional speaker embeddings to condition the
    output sound according to specific predefined voice.
    """,
    BARK_START_DOCSTRING,
)
class BarkModel(BarkPreTrainedModel):
    config_class = BarkConfig

    def __init__(self, config):
        """
        Create the three Bark sub-models and the codec model from the matching sub-configurations.

        Args:
            config: composite configuration exposing `semantic_config`, `coarse_acoustics_config`,
                `fine_acoustics_config` and `codec_config`.
        """
        super().__init__(config)

        # Text/semantic stage.
        self.semantic = BarkSemanticModel(config.semantic_config)

        # Coarse acoustics stage.
        self.coarse_acoustics = BarkCoarseModel(config.coarse_acoustics_config)

        # Fine acoustics stage.
        self.fine_acoustics = BarkFineModel(config.fine_acoustics_config)

        # The codec class is resolved by `AutoModel` from the codec sub-configuration.
        self.codec_model = AutoModel.from_config(config.codec_config)

        # Keep the composite configuration on the instance.
        self.config = config

    @property
    def device(self) -> torch.device:
        """
        `torch.device`: The device on which the module is (assuming that all the module parameters are on the same
        device).

        If `self.semantic` carries an `_hf_hook` attribute, the first non-None `execution_device` found on a hook
        of one of its sub-modules is returned. In every other case, including when hooks exist but none of them
        specifies an execution device, the device reported by `get_parameter_device(self)` is returned, so the
        property never yields `None`.
        """
        if not hasattr(self.semantic, "_hf_hook"):
            return get_parameter_device(self)

        for module in self.semantic.modules():
            hook = getattr(module, "_hf_hook", None)
            execution_device = getattr(hook, "execution_device", None)
            if execution_device is not None:
                return torch.device(execution_device)

        # No hook specified an execution device: fall back instead of implicitly returning None.
        return get_parameter_device(self)
    def enable_cpu_offload(self, gpu_id: Optional[int] = 0):
        r"""
        Offloads all sub-models to CPU using accelerate, reducing memory usage with a low impact on performance. This
        method moves one whole sub-model at a time to the GPU when it is used, and the sub-model remains in GPU until
        the next sub-model runs.

        Args:
            gpu_id (`int`, *optional*, defaults to 0):
                GPU id on which the sub-models will be loaded and offloaded.

        Raises:
            ImportError: if `accelerate` is not available.
        """
        if not is_accelerate_available():
            raise ImportError("`enable_cpu_offload` requires `accelerate`.")
        from accelerate import cpu_offload_with_hook

        device = torch.device(f"cuda:{gpu_id}")

        if self.device.type != "cpu":
            # Start from a model that lives entirely on the CPU and release cached GPU memory.
            self.to("cpu")
            torch.cuda.empty_cache()

        # The semantic input embedding layer is wrapped on its own; the hook returned for it is discarded.
        self.semantic.input_embeds_layer, _ = cpu_offload_with_hook(self.semantic.input_embeds_layer, device)

        # Chain the three sub-models: each call receives the hook of the previous one.
        hook = None
        for cpu_offloaded_model in [
            self.semantic,
            self.coarse_acoustics,
            self.fine_acoustics,
        ]:
            _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)

        self.fine_acoustics_hook = hook

        _, hook = cpu_offload_with_hook(self.codec_model, device, prev_module_hook=hook)

        # We'll offload the last model manually.
        self.codec_model_hook = hook

    def codec_decode(self, fine_output, output_lengths=None):
        """
        Turn quantized audio codes into audio array using encodec.

        Args:
            fine_output: tensor of codes; its first two dimensions are swapped before it is handed to
                `self.codec_model.quantizer.decode`.
            output_lengths: optional iterable of lengths, one per decoded sample. When given, each sample is cut
                to `sample[:, :length]` and decoded on its own.

        Returns:
            Without `output_lengths`: the decoder output with dimension 1 squeezed.
            With `output_lengths`: a list with one fully squeezed decoder output per sample.
        """
        codes = fine_output.transpose(0, 1)
        emb = self.codec_model.quantizer.decode(codes)
        decoder = self.codec_model.decoder

        if output_lengths is None:
            # Single batched decode.
            return decoder(emb).squeeze(1)

        # Samples have different valid lengths, so trim and decode them one by one.
        audio_arr = []
        for sample, length in zip(emb, output_lengths):
            trimmed = sample[:, :length].unsqueeze(0)
            audio_arr.append(decoder(trimmed).squeeze())
        return audio_arr

    @torch.no_grad()
    def generate(
        self,
        input_ids: Optional[torch.Tensor] = None,
        history_prompt: Optional[Dict[str, torch.Tensor]] = None,
        return_output_lengths: Optional[bool] = None,
        **kwargs,
    ):
        """
        Entry point for generating outputs from `input_ids` and optional prompts.

        NOTE(review): no generation logic is present in this excerpt. As written, the method ignores all of its
        arguments and returns `None`.
        """
        return None

    @classmethod
    def _check_and_enable_flash_attn_2(
        cls,
        config,
        torch_dtype: Optional[torch.dtype] = None,
        device_map: Optional[Union[str, Dict[str, int]]] = None,
        hard_check_only: bool = False,
        check_device_map: bool = False,
    ):
        """
        Check and potentially enable Flash Attention 2, then mirror the decision onto the Bark sub-configs.

        The parent implementation does not extend the attention setting to a model's sub-configurations. This
        override calls it and then copies the resulting `config._attn_implementation` to `semantic_config`,
        `coarse_acoustics_config` and `fine_acoustics_config`, so the sub-models are built with the same
        attention implementation as the top-level model.

        According to the original documentation of this method, the parent check verifies that the current setup
        is compatible with Flash Attention (half precision, not running on CPU) and, when all checks pass and
        `hard_check_only` is False, sets `_attn_implementation` to "flash_attention_2". The parent is not visible
        here; see https://github.com/Dao-AILab/flash-attention for background on Flash Attention.

        Args:
            config: top-level configuration carrying the three sub-configurations.
            torch_dtype: forwarded positionally to the parent check.
            device_map: forwarded positionally to the parent check.
            hard_check_only: forwarded to the parent check.
            check_device_map: forwarded to the parent check.

        Returns:
            The configuration returned by the parent check, with the sub-configurations updated in place.
        """
        config = super()._check_and_enable_flash_attn_2(
            config, torch_dtype, device_map, hard_check_only=hard_check_only, check_device_map=check_device_map
        )

        # Keep every sub-config on the same attention implementation as the top-level config.
        config.semantic_config._attn_implementation = config._attn_implementation
        config.coarse_acoustics_config._attn_implementation = config._attn_implementation
        config.fine_acoustics_config._attn_implementation = config._attn_implementation

        return config

`.\models\bark\processing_bark.py`

"""
Processor class for Bark
"""
# 引入必要的库和模块
import json  # 导入处理 JSON 的模块
import os  # 导入操作系统相关功能的模块
from typing import Optional  # 导入类型提示中的 Optional 类型

import numpy as np  # 导入 NumPy 库

# 导入所需的自定义模块和函数
from ...feature_extraction_utils import BatchFeature
from ...processing_utils import ProcessorMixin
from ...utils import logging
from ...utils.hub import get_file_from_repo
from ..auto import AutoTokenizer  # 导入自动化的 Tokenizer

# 获取日志记录器
logger = logging.get_logger(__name__)


class BarkProcessor(ProcessorMixin):
    r"""
    Constructs a Bark processor which wraps a text tokenizer and optional Bark voice presets into a single processor.

    Args:
        tokenizer ([`PreTrainedTokenizer`]):
            An instance of [`PreTrainedTokenizer`].
        speaker_embeddings (`Dict[Dict[str]]`, *optional*):
            Optional nested speaker embeddings dictionary. The first level contains voice preset names (e.g
            `"en_speaker_4"`). The second level contains `"semantic_prompt"`, `"coarse_prompt"` and `"fine_prompt"`
            embeddings. The values correspond to the path of the corresponding `np.ndarray`. See
            [here](https://suno-ai.notion.site/8b8e8749ed514b0cbf3f699013548683?v=bc67cff786b04b50b3ceb756fd05f68c) for
            a list of `voice_preset_names`.

    """

    # 类属性：指定使用的 Tokenizer 类型
    tokenizer_class = "AutoTokenizer"
    # 类属性：指定必须具备的属性列表
    attributes = ["tokenizer"]

    # 类属性：定义不同类型的预设形状
    preset_shape = {
        "semantic_prompt": 1,
        "coarse_prompt": 2,
        "fine_prompt": 2,
    }

    def __init__(self, tokenizer, speaker_embeddings=None):
        """
        Initialize the processor.

        Args:
            tokenizer: tokenizer instance handed to the parent class initializer.
            speaker_embeddings: optional speaker embeddings dictionary, stored unchanged on the instance.
        """
        super().__init__(tokenizer)
        # Kept as provided; `None` means no speaker embeddings are available.
        self.speaker_embeddings = speaker_embeddings

    @classmethod
    def from_pretrained(
        cls, pretrained_processor_name_or_path, speaker_embeddings_dict_path="speaker_embeddings_path.json", **kwargs
    ):
        r"""
        Instantiate a Bark processor associated with a pretrained model.

        Args:
            pretrained_processor_name_or_path (`str` or `os.PathLike`):
                This can be either:

                - a string, the *model id* of a pretrained [`BarkProcessor`] hosted inside a model repo on
                  huggingface.co.
                - a path to a *directory* containing a processor saved using the [`~BarkProcessor.save_pretrained`]
                  method, e.g., `./my_model_directory/`.
            speaker_embeddings_dict_path (`str`, *optional*, defaults to `"speaker_embeddings_path.json"`):
                The name of the `.json` file containing the speaker_embeddings dictionary located in
                `pretrained_processor_name_or_path`. If `None`, no speaker_embeddings is loaded.
            **kwargs
                Additional keyword arguments. When a speaker embeddings file is requested, `subfolder`,
                `cache_dir`, `force_download`, `proxies`, `resume_download`, `local_files_only`, `use_auth_token`
                and `revision` are consumed for fetching it; whatever remains is passed to
                `AutoTokenizer.from_pretrained`.

        Returns:
            A new instance of `cls` built from the loaded tokenizer and the speaker embeddings dictionary (or
            `None` when no dictionary was requested or found).
        """
        if speaker_embeddings_dict_path is not None:
            speaker_embeddings_path = get_file_from_repo(
                pretrained_processor_name_or_path,
                speaker_embeddings_dict_path,
                subfolder=kwargs.pop("subfolder", None),
                cache_dir=kwargs.pop("cache_dir", None),
                force_download=kwargs.pop("force_download", False),
                proxies=kwargs.pop("proxies", None),
                resume_download=kwargs.pop("resume_download", False),
                local_files_only=kwargs.pop("local_files_only", False),
                token=kwargs.pop("use_auth_token", None),
                revision=kwargs.pop("revision", None),
            )
            if speaker_embeddings_path is None:
                # Best effort: warn and continue without speaker embeddings.
                # The message text below (including its line breaks and indentation) is kept byte-for-byte.
                logger.warning(
                    f"""`{os.path.join(pretrained_processor_name_or_path,speaker_embeddings_dict_path)}` does not exists
                        , no preloaded speaker embeddings will be used - Make sure to provide a correct path to the json
                        dictionnary if wanted, otherwise set `speaker_embeddings_dict_path=None`."""
                )
                speaker_embeddings = None
            else:
                # JSON is UTF-8 by specification, so do not depend on the platform's default encoding.
                with open(speaker_embeddings_path, encoding="utf-8") as speaker_embeddings_json:
                    speaker_embeddings = json.load(speaker_embeddings_json)
        else:
            speaker_embeddings = None

        tokenizer = AutoTokenizer.from_pretrained(pretrained_processor_name_or_path, **kwargs)

        return cls(tokenizer=tokenizer, speaker_embeddings=speaker_embeddings)

    def save_pretrained(
        self,
        save_directory,
        speaker_embeddings_dict_path="speaker_embeddings_path.json",
        speaker_embeddings_directory="speaker_embeddings",
        push_to_hub: bool = False,
        **kwargs,
    ):
        """
        Saves the attributes of this processor (tokenizer...) in the specified directory so that it can be reloaded
        using the [`~BarkProcessor.from_pretrained`] method.

        Args:
            save_directory (`str` or `os.PathLike`):
                Directory where the tokenizer files and the speaker embeddings will be saved (directory will be created
                if it does not exist).
            speaker_embeddings_dict_path (`str`, *optional*, defaults to `"speaker_embeddings_path.json"`):
                The name of the `.json` file that will contain the speaker_embeddings nested path dictionary, if it
                exists. It is written directly inside `save_directory`.
            speaker_embeddings_directory (`str`, *optional*, defaults to `"speaker_embeddings"`):
                The name of the folder, inside `save_directory`, in which the speaker_embeddings arrays will be
                saved.
            push_to_hub (`bool`, *optional*, defaults to `False`):
                Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the
                repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
                namespace).
            kwargs:
                Additional key word arguments passed along to the parent `save_pretrained` method.
        """
        if self.speaker_embeddings is not None:
            # The "v2" sub-folder is created up front.
            # NOTE(review): presumably because preset names look like "v2/<name>" — confirm.
            os.makedirs(os.path.join(save_directory, speaker_embeddings_directory, "v2"), exist_ok=True)

            # The saved dictionary points at the directory it is saved in.
            embeddings_dict = {}
            embeddings_dict["repo_or_path"] = save_directory

            for prompt_key in self.speaker_embeddings:
                # "repo_or_path" is bookkeeping, not a voice preset.
                if prompt_key == "repo_or_path":
                    continue

                voice_preset = self._load_voice_preset(prompt_key)

                tmp_dict = {}
                for key in self.speaker_embeddings[prompt_key]:
                    # `np.save` is given a name without extension; the stored relative path uses ".npy".
                    np.save(
                        os.path.join(
                            embeddings_dict["repo_or_path"], speaker_embeddings_directory, f"{prompt_key}_{key}"
                        ),
                        voice_preset[key],
                        allow_pickle=False,
                    )
                    tmp_dict[key] = os.path.join(speaker_embeddings_directory, f"{prompt_key}_{key}.npy")

                embeddings_dict[prompt_key] = tmp_dict

            with open(os.path.join(save_directory, speaker_embeddings_dict_path), "w", encoding="utf-8") as fp:
                json.dump(embeddings_dict, fp)

        super().save_pretrained(save_directory, push_to_hub, **kwargs)
    # 加载指定的语音预设数据，支持的关键字包括语义提示、粗略提示和细致提示
    def _load_voice_preset(self, voice_preset: str = None, **kwargs):
        """
        Load the three prompt arrays of one voice preset.

        Args:
            voice_preset: key into `self.speaker_embeddings` naming the preset to load.
            **kwargs: `subfolder`, `cache_dir`, `force_download`, `proxies`, `resume_download`,
                `local_files_only`, `use_auth_token` and `revision` are read from here and applied to the lookup
                of every prompt file.

        Returns:
            A dict mapping `"semantic_prompt"`, `"coarse_prompt"` and `"fine_prompt"` to the arrays returned by
            `np.load`.

        Raises:
            ValueError: if one of the three keys is missing from the preset entry, or if a file cannot be
                resolved (`get_file_from_repo` returned `None`).
        """
        voice_preset_paths = self.speaker_embeddings[voice_preset]

        # Consume the lookup options once, up front. Popping them inside the loop would hand them to the first
        # file only and silently fall back to the defaults for the remaining ones.
        lookup_options = {
            "subfolder": kwargs.pop("subfolder", None),
            "cache_dir": kwargs.pop("cache_dir", None),
            "force_download": kwargs.pop("force_download", False),
            "proxies": kwargs.pop("proxies", None),
            "resume_download": kwargs.pop("resume_download", False),
            "local_files_only": kwargs.pop("local_files_only", False),
            "token": kwargs.pop("use_auth_token", None),
            "revision": kwargs.pop("revision", None),
        }

        voice_preset_dict = {}
        for key in ["semantic_prompt", "coarse_prompt", "fine_prompt"]:
            if key not in voice_preset_paths:
                raise ValueError(
                    f"Voice preset unrecognized, missing {key} as a key in self.speaker_embeddings[{voice_preset}]."
                )

            # Resolve the file relative to the recorded repo/path ("/" when none was recorded).
            path = get_file_from_repo(
                self.speaker_embeddings.get("repo_or_path", "/"),
                voice_preset_paths[key],
                **lookup_options,
            )
            if path is None:
                raise ValueError(
                    f"""`{os.path.join(self.speaker_embeddings.get("repo_or_path", "/"), voice_preset_paths[key])}` does not exists
                    , no preloaded voice preset will be used - Make sure to provide correct paths to the {voice_preset}
                    embeddings."""
                )

            voice_preset_dict[key] = np.load(path)

        return voice_preset_dict

    # 验证语音预设字典的有效性，确保包含必须的关键字和正确的数据类型和形状
    def _validate_voice_preset_dict(self, voice_preset: Optional[dict] = None):
        # 遍历语音预设的三个关键字：语义提示、粗略提示、细致提示
        for key in ["semantic_prompt", "coarse_prompt", "fine_prompt"]:
            # 检查语音预设字典是否缺少当前关键字，若是则抛出数值错误异常
            if key not in voice_preset:
                raise ValueError(f"Voice preset unrecognized, missing {key} as a key.")

            # 检查当前关键字的值是否为numpy数组类型，若不是则抛出数值错误异常
            if not isinstance(voice_preset[key], np.ndarray):
                raise ValueError(f"{key} voice preset must be a {str(self.preset_shape[key])}D ndarray.")

            # 检查当前关键字的值的维度是否与预期维度一致，若不是则抛出数值错误异常
            if len(voice_preset[key].shape) != self.preset_shape[key]:
                raise ValueError(f"{key} voice preset must be a {str(self.preset_shape[key])}D ndarray.")

    # 对象的可调用方法，用于执行模型的推理或生成任务
    def __call__(
        # NOTE(review): this signature is cut off after `**kwargs,` in this excerpt (no closing parenthesis, no
        # body), so only the parameter names and defaults below can be read from here.
        self,
        text=None,
        voice_preset=None,
        return_tensors="pt",
        max_length=256,
        add_special_tokens=False,
        return_attention_mask=True,
        return_token_type_ids=False,
        **kwargs,

`.\models\bark\init.py`

# 导入类型检查模块
from typing import TYPE_CHECKING

# 导入必要的异常和模块
from ...utils import (
    OptionalDependencyNotAvailable,
    _LazyModule,
    is_torch_available,
)

# Map each submodule name to the public names it provides; this dict is handed to
# `_LazyModule` at the bottom of the file.
_import_structure = {
    "configuration_bark": [
        "BARK_PRETRAINED_CONFIG_ARCHIVE_MAP",
        "BarkCoarseConfig",
        "BarkConfig",
        "BarkFineConfig",
        "BarkSemanticConfig",
    ],
    "processing_bark": ["BarkProcessor"],
}

# Register the modeling submodule only when `is_torch_available()` is true.
# The raise/except pair is just the way the "not available" case is skipped.
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # torch is available: expose the model classes as well.
    _import_structure["modeling_bark"] = [
        "BARK_PRETRAINED_MODEL_ARCHIVE_LIST",
        "BarkFineModel",
        "BarkSemanticModel",
        "BarkCoarseModel",
        "BarkModel",
        "BarkPreTrainedModel",
        "BarkCausalModel",
    ]

# Under static type checking, import everything for real so the names resolve.
if TYPE_CHECKING:
    # Configuration classes and the processor are imported unconditionally.
    from .configuration_bark import (
        BARK_PRETRAINED_CONFIG_ARCHIVE_MAP,
        BarkCoarseConfig,
        BarkConfig,
        BarkFineConfig,
        BarkSemanticConfig,
    )
    from .processing_bark import BarkProcessor

    # Same torch availability guard as above, this time for the real imports.
    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # torch is available: import the model classes from modeling_bark.
        from .modeling_bark import (
            BARK_PRETRAINED_MODEL_ARCHIVE_LIST,
            BarkCausalModel,
            BarkCoarseModel,
            BarkFineModel,
            BarkModel,
            BarkPreTrainedModel,
            BarkSemanticModel,
        )

# At runtime (not type checking), install the `_LazyModule` instead.
else:
    # `sys` is only needed for the module swap below.
    import sys

    # Replace this module's entry in `sys.modules` with a `_LazyModule` built from
    # `_import_structure` (presumably deferring submodule imports until first use — TODO confirm).
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)

`.\models\bart\configuration_bart.py`

# 导入警告模块，用于在需要时发出警告
import warnings
# OrderedDict 是一个有序字典，可以记录元素插入的顺序
from collections import OrderedDict
# Any、Mapping 和 Optional 是用于类型提示的特定类型
from typing import Any, Mapping, Optional

# 导入预训练分词器 PreTrainedTokenizer
from ... import PreTrainedTokenizer
# 导入预训练配置类 PretrainedConfig
from ...configuration_utils import PretrainedConfig
# 导入 OnnxConfig、OnnxConfigWithPast 和 OnnxSeq2SeqConfigWithPast 用于 ONNX 模型配置
from ...onnx import OnnxConfig, OnnxConfigWithPast, OnnxSeq2SeqConfigWithPast
# 导入计算轴维度的工具函数
from ...onnx.utils import compute_effective_axis_dimension
# 导入 TensorType 用于处理张量类型，is_torch_available 用于检查是否有 torch 库，logging 用于日志记录
from ...utils import TensorType, is_torch_available, logging

# Module-level logger named after this module.
logger = logging.get_logger(__name__)

# Maps a pretrained BART checkpoint name to the URL of its config.json.
BART_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "facebook/bart-large": "https://huggingface.co/facebook/bart-large/resolve/main/config.json",
    # See all BART models at https://huggingface.co/models?filter=bart
}

# BartConfig 是用于存储 BART 模型配置的类，继承自 PretrainedConfig
class BartConfig(PretrainedConfig):
    r"""
    Configuration container for a [`BartModel`].

    The values stored here define the architecture of the BART model that is built from them. Per the original
    documentation, instantiating the class with its defaults yields a configuration similar to that of
    [facebook/bart-large](https://huggingface.co/facebook/bart-large).

    The class derives from [`PretrainedConfig`]; read that class's documentation for the options it handles.

    Args:
        vocab_size, max_position_embeddings, d_model, encoder_layers, encoder_ffn_dim, encoder_attention_heads,
        decoder_layers, decoder_ffn_dim, decoder_attention_heads, encoder_layerdrop, decoder_layerdrop,
        activation_function, dropout, attention_dropout, activation_dropout, init_std, classifier_dropout,
        scale_embedding, use_cache:
            Stored unchanged as attributes of the same name.
        num_labels, pad_token_id, bos_token_id, eos_token_id, is_encoder_decoder, decoder_start_token_id,
        forced_eos_token_id:
            Forwarded to `PretrainedConfig.__init__`.
        **kwargs:
            Forwarded to `PretrainedConfig.__init__`. The legacy key `force_bos_token_to_be_generated` is also
            inspected afterwards (see the end of `__init__`).

    Example:

    ```
    >>> from transformers import BartConfig, BartModel

    >>> # Build a configuration in the facebook/bart-large style
    >>> configuration = BartConfig()

    >>> # Build a model (with random weights) from that configuration
    >>> model = BartModel(configuration)

    >>> # Read the configuration back from the model
    >>> configuration = model.config
    ```
    """

    model_type = "bart"
    # Output keys listed here are skipped at inference time.
    keys_to_ignore_at_inference = ["past_key_values"]
    # Generic attribute names on the left are aliases for the BART-specific names on the right.
    attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}

    def __init__(
        self,
        vocab_size=50265,
        max_position_embeddings=1024,
        encoder_layers=12,
        encoder_ffn_dim=4096,
        encoder_attention_heads=16,
        decoder_layers=12,
        decoder_ffn_dim=4096,
        decoder_attention_heads=16,
        encoder_layerdrop=0.0,
        decoder_layerdrop=0.0,
        activation_function="gelu",
        d_model=1024,
        dropout=0.1,
        attention_dropout=0.0,
        activation_dropout=0.0,
        init_std=0.02,
        classifier_dropout=0.0,
        scale_embedding=False,
        use_cache=True,
        num_labels=3,
        pad_token_id=1,
        bos_token_id=0,
        eos_token_id=2,
        is_encoder_decoder=True,
        decoder_start_token_id=2,
        forced_eos_token_id=2,
        **kwargs,
    ):
        # Vocabulary / embedding sizes.
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.d_model = d_model

        # Encoder stack.
        self.encoder_ffn_dim = encoder_ffn_dim
        self.encoder_layers = encoder_layers
        self.encoder_attention_heads = encoder_attention_heads

        # Decoder stack.
        self.decoder_ffn_dim = decoder_ffn_dim
        self.decoder_layers = decoder_layers
        self.decoder_attention_heads = decoder_attention_heads

        # Regularisation, activation and initialisation settings.
        self.dropout = dropout
        self.attention_dropout = attention_dropout
        self.activation_dropout = activation_dropout
        self.activation_function = activation_function
        self.init_std = init_std
        self.encoder_layerdrop = encoder_layerdrop
        self.decoder_layerdrop = decoder_layerdrop
        self.classifier_dropout = classifier_dropout

        self.use_cache = use_cache
        # The generic "number of hidden layers" is defined as the encoder depth.
        self.num_hidden_layers = encoder_layers
        self.scale_embedding = scale_embedding

        # Everything the base class owns is handed over in one call.
        base_settings = {
            "num_labels": num_labels,
            "pad_token_id": pad_token_id,
            "bos_token_id": bos_token_id,
            "eos_token_id": eos_token_id,
            "is_encoder_decoder": is_encoder_decoder,
            "decoder_start_token_id": decoder_start_token_id,
            "forced_eos_token_id": forced_eos_token_id,
        }
        super().__init__(**base_settings, **kwargs)

        # Backward compatibility (the original comment attributes this to BART CNN models):
        # a legacy flag in kwargs stands in for an explicit `forced_bos_token_id`.
        legacy_force_bos = kwargs.get("force_bos_token_to_be_generated", False)
        if self.forced_bos_token_id is None and legacy_force_bos:
            self.forced_bos_token_id = self.bos_token_id
            warnings.warn(
                f"Please make sure the config includes `forced_bos_token_id={self.bos_token_id}` in future versions. "
                "The config can simply be saved and uploaded again to be fixed."
            )
class BartOnnxConfig(OnnxSeq2SeqConfigWithPast):
    """ONNX export description for BART.

    Declares the dynamic axes of the model inputs/outputs and builds dummy inputs,
    with a separate code path for each of three task groups: ``"default"`` /
    ``"seq2seq-lm"``, ``"causal-lm"``, and everything else.
    """

    @property
    def inputs(self) -> Mapping[str, Mapping[int, str]]:
        """Return, per input name, a mapping from axis index to dynamic-axis label."""
        if self.task in ["default", "seq2seq-lm"]:
            common_inputs = OrderedDict(
                [
                    ("input_ids", {0: "batch", 1: "encoder_sequence"}),
                    ("attention_mask", {0: "batch", 1: "encoder_sequence"}),
                ]
            )

            if self.use_past:
                # With a cache, only the batch axis of the decoder ids is dynamic,
                # while the mask spans past + current positions.
                common_inputs["decoder_input_ids"] = {0: "batch"}
                common_inputs["decoder_attention_mask"] = {0: "batch", 1: "past_decoder_sequence + sequence"}
            else:
                common_inputs["decoder_input_ids"] = {0: "batch", 1: "decoder_sequence"}
                common_inputs["decoder_attention_mask"] = {0: "batch", 1: "decoder_sequence"}

            if self.use_past:
                # Inherited helper adds the past_key_values entries in place.
                self.fill_with_past_key_values_(common_inputs, direction="inputs")
        elif self.task == "causal-lm":
            common_inputs = OrderedDict(
                [
                    ("input_ids", {0: "batch", 1: "encoder_sequence"}),
                    ("attention_mask", {0: "batch", 1: "encoder_sequence"}),
                ]
            )
            if self.use_past:
                # One key/value pair per layer, counted with the first element of `self.num_layers`.
                num_encoder_layers, _ = self.num_layers
                for i in range(num_encoder_layers):
                    common_inputs[f"past_key_values.{i}.key"] = {0: "batch", 2: "past_sequence + sequence"}
                    common_inputs[f"past_key_values.{i}.value"] = {0: "batch", 2: "past_sequence + sequence"}
        else:
            # All remaining tasks: encoder and decoder inputs, no cache entries.
            common_inputs = OrderedDict(
                [
                    ("input_ids", {0: "batch", 1: "encoder_sequence"}),
                    ("attention_mask", {0: "batch", 1: "encoder_sequence"}),
                    ("decoder_input_ids", {0: "batch", 1: "decoder_sequence"}),
                    ("decoder_attention_mask", {0: "batch", 1: "decoder_sequence"}),
                ]
            )

        return common_inputs

    @property
    def outputs(self) -> Mapping[str, Mapping[int, str]]:
        """Return, per output name, a mapping from axis index to dynamic-axis label."""
        if self.task in ["default", "seq2seq-lm"]:
            common_outputs = super().outputs
        else:
            # Look `outputs` up past `OnnxConfigWithPast` in the MRO, then add the
            # per-layer "present" entries here when a cache is used.
            common_outputs = super(OnnxConfigWithPast, self).outputs
            if self.use_past:
                num_encoder_layers, _ = self.num_layers
                for i in range(num_encoder_layers):
                    common_outputs[f"present.{i}.key"] = {0: "batch", 2: "past_sequence + sequence"}
                    common_outputs[f"present.{i}.value"] = {0: "batch", 2: "past_sequence + sequence"}
        return common_outputs

    def _generate_dummy_inputs_for_default_and_seq2seq_lm(
        self,
        tokenizer: PreTrainedTokenizer,
        batch_size: int = -1,
        seq_length: int = -1,
        is_pair: bool = False,
        framework: Optional[TensorType] = None,
    ) -> Mapping[str, Any]:
        """Build dummy encoder/decoder inputs (plus a dummy cache when `use_past` is set).

        Raises:
            ValueError: If `use_past` is set and PyTorch is not available.
        """
        encoder_inputs = self._generate_dummy_inputs_for_sequence_classification_and_question_answering(
            tokenizer, batch_size, seq_length, is_pair, framework
        )

        # With a cache the decoder is fed a single new position.
        decoder_seq_length = seq_length if not self.use_past else 1
        decoder_inputs = self._generate_dummy_inputs_for_sequence_classification_and_question_answering(
            tokenizer, batch_size, decoder_seq_length, is_pair, framework
        )
        # Prefix every decoder tensor name with "decoder_" and merge with the encoder inputs.
        decoder_inputs = {f"decoder_{name}": tensor for name, tensor in decoder_inputs.items()}
        common_inputs = dict(**encoder_inputs, **decoder_inputs)

        if self.use_past:
            if not is_torch_available():
                raise ValueError("Cannot generate dummy past_keys inputs without PyTorch installed.")
            else:
                import torch

            batch, encoder_seq_length = common_inputs["input_ids"].shape
            decoder_seq_length = common_inputs["decoder_input_ids"].shape[1]
            num_encoder_attention_heads, num_decoder_attention_heads = self.num_attention_heads

            # Cache tensor shapes: (batch, heads, sequence, hidden_size // heads).
            encoder_shape = (
                batch,
                num_encoder_attention_heads,
                encoder_seq_length,
                self._config.hidden_size // num_encoder_attention_heads,
            )
            # The dummy past is 3 positions longer than the decoder input
            # (the offset is taken as-is from the original code).
            decoder_past_length = decoder_seq_length + 3
            decoder_shape = (
                batch,
                num_decoder_attention_heads,
                decoder_past_length,
                self._config.hidden_size // num_decoder_attention_heads,
            )

            # Extend the decoder mask with ones covering the dummy past positions.
            common_inputs["decoder_attention_mask"] = torch.cat(
                [common_inputs["decoder_attention_mask"], torch.ones(batch, decoder_past_length)], dim=1
            )

            common_inputs["past_key_values"] = []

            num_encoder_layers, num_decoder_layers = self.num_layers
            min_num_layers = min(num_encoder_layers, num_decoder_layers)
            # FIX: `max_num_layers` was read below but never assigned (NameError whenever
            # `use_past` is set). NOTE(review): defined as in the upstream transformers
            # implementation so the dummy cache stays in step with the layer arithmetic of
            # the inherited `fill_with_past_key_values_` — confirm against the installed version.
            max_num_layers = max(num_encoder_layers, num_decoder_layers) - min_num_layers
            remaining_side_name = "encoder" if num_encoder_layers > num_decoder_layers else "decoder"

            # Layers present on both sides get four zero tensors:
            # two with the decoder shape followed by two with the encoder shape.
            for _ in range(min_num_layers):
                common_inputs["past_key_values"].append(
                    (
                        torch.zeros(decoder_shape),
                        torch.zeros(decoder_shape),
                        torch.zeros(encoder_shape),
                        torch.zeros(encoder_shape),
                    )
                )

            # TODO: test this.
            # Any remaining layers of the deeper side get a two-tensor entry of that side's shape.
            shape = encoder_shape if remaining_side_name == "encoder" else decoder_shape
            for _ in range(min_num_layers, max_num_layers):
                common_inputs["past_key_values"].append((torch.zeros(shape), torch.zeros(shape)))

        return common_inputs

    def _generate_dummy_inputs_for_causal_lm(
        self,
        tokenizer: PreTrainedTokenizer,
        batch_size: int = -1,
        seq_length: int = -1,
        is_pair: bool = False,
        framework: Optional[TensorType] = None,
    ) -> Mapping[str, Any]:
        """Build dummy inputs for the "causal-lm" task (plus a dummy cache when `use_past` is set).

        Raises:
            ValueError: If `use_past` is set and PyTorch is not available.
        """
        common_inputs = self._generate_dummy_inputs_for_sequence_classification_and_question_answering(
            tokenizer, batch_size, seq_length, is_pair, framework
        )

        if self.use_past:
            if not is_torch_available():
                raise ValueError("Cannot generate dummy past_keys inputs without PyTorch installed.")
            else:
                import torch

            batch, seqlen = common_inputs["input_ids"].shape
            # The dummy past is 2 positions longer than the input sequence.
            past_key_values_length = seqlen + 2
            num_encoder_layers, _ = self.num_layers
            num_encoder_attention_heads, _ = self.num_attention_heads
            past_shape = (
                batch,
                num_encoder_attention_heads,
                past_key_values_length,
                self._config.hidden_size // num_encoder_attention_heads,
            )

            # Extend the mask with ones for the past positions, keeping its dtype.
            mask_dtype = common_inputs["attention_mask"].dtype
            common_inputs["attention_mask"] = torch.cat(
                [common_inputs["attention_mask"], torch.ones(batch, past_key_values_length, dtype=mask_dtype)], dim=1
            )
            # One (key, value) pair of zeros per layer.
            common_inputs["past_key_values"] = [
                (torch.zeros(past_shape), torch.zeros(past_shape)) for _ in range(num_encoder_layers)
            ]

        return common_inputs

    def _generate_dummy_inputs_for_sequence_classification_and_question_answering(
        self,
        tokenizer: PreTrainedTokenizer,
        batch_size: int = -1,
        seq_length: int = -1,
        is_pair: bool = False,
        framework: Optional[TensorType] = None,
    ) -> Mapping[str, Any]:
        """Tokenize a batch of unknown-token strings and return the result as a plain dict."""
        # Copied from OnnxConfig.generate_dummy_inputs
        # Did not use super(OnnxConfigWithPast, self).generate_dummy_inputs for code clarity.
        # If dynamic axis (-1) we forward with a fixed dimension of 2 samples to avoid optimizations made by ONNX
        batch_size = compute_effective_axis_dimension(
            batch_size, fixed_dimension=OnnxConfig.default_fixed_batch, num_token_to_add=0
        )

        # If dynamic axis (-1) we forward with a fixed dimension of 8 tokens to avoid optimizations made by ONNX
        token_to_add = tokenizer.num_special_tokens_to_add(is_pair)
        seq_length = compute_effective_axis_dimension(
            seq_length, fixed_dimension=OnnxConfig.default_fixed_sequence, num_token_to_add=token_to_add
        )

        # Joining a one-element list yields the unk token itself, so each sample is
        # the unk token repeated `seq_length` times with no separator.
        dummy_input = [" ".join([tokenizer.unk_token]) * seq_length] * batch_size
        common_inputs = dict(tokenizer(dummy_input, return_tensors=framework))
        return common_inputs

    def generate_dummy_inputs(
        self,
        tokenizer: PreTrainedTokenizer,
        batch_size: int = -1,
        seq_length: int = -1,
        is_pair: bool = False,
        framework: Optional[TensorType] = None,
    ) -> Mapping[str, Any]:
        """Dispatch to the dummy-input builder that matches `self.task`."""
        if self.task in ["default", "seq2seq-lm"]:
            common_inputs = self._generate_dummy_inputs_for_default_and_seq2seq_lm(
                tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair, framework=framework
            )

        elif self.task == "causal-lm":
            common_inputs = self._generate_dummy_inputs_for_causal_lm(
                tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair, framework=framework
            )
        else:
            common_inputs = self._generate_dummy_inputs_for_sequence_classification_and_question_answering(
                tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair, framework=framework
            )

        return common_inputs

    def _flatten_past_key_values_(self, flattened_output, name, idx, t):
        """Delegate cache flattening to the parent implementation that matches `self.task`.

        Nothing is returned, exactly as in the original, which only rebound a local name;
        the helpers are presumably expected to update `flattened_output` in place — TODO confirm.
        """
        if self.task in ["default", "seq2seq-lm"]:
            super()._flatten_past_key_values_(flattened_output, name, idx, t)
        else:
            # Skip the seq2seq override and use the next implementation in the MRO.
            super(OnnxSeq2SeqConfigWithPast, self)._flatten_past_key_values_(flattened_output, name, idx, t)

`.\models\bart\convert_bart_original_pytorch_checkpoint_to_pytorch.py`

# 设置文件编码为 UTF-8
# 版权声明，指明代码版权归 HuggingFace Inc. 团队所有，遵循 Apache License 2.0
# 引入必要的库和模块
import argparse  # 引入命令行参数解析模块
import os  # 引入操作系统相关功能模块
from pathlib import Path  # 引入处理文件路径的模块

import fairseq  # 引入 fairseq 库，用于处理 BART 模型
import torch  # 引入 PyTorch 深度学习框架
from packaging import version  # 引入版本管理模块

from torch import nn  # 引入 PyTorch 的神经网络模块

from transformers import (  # 引入 Transformers 库中的相关模块和类
    BartConfig,
    BartForConditionalGeneration,
    BartForSequenceClassification,
    BartModel,
    BartTokenizer,
)
from transformers.utils import logging  # 引入 Transformers 的日志记录模块

# fairseq BART checkpoints known to this script (hub names or a local model.pt path).
FAIRSEQ_MODELS = ["bart.large", "bart.large.mnli", "bart.large.cnn", "bart_xsum/model.pt"]
# Maps a fairseq checkpoint name to a specific HF model class.
extra_arch = {"bart.large": BartModel, "bart.large.mnli": BartForSequenceClassification}

# Refuse to run with fairseq older than 0.9.0.
if version.parse(fairseq.__version__) < version.parse("0.9.0"):
    raise Exception("requires fairseq >= 0.9.0")

# Set logging verbosity to the info level.
logging.set_verbosity_info()
# Module-level logger named after this module.
logger = logging.get_logger(__name__)

# Sample sentence encoded by both tokenizers during conversion to compare their output.
SAMPLE_TEXT = " Hello world! cécé herlolip"

# (source key, destination key) pairs applied to the MNLI state dict.
mnli_rename_keys = [
    ("model.classification_heads.mnli.dense.weight", "classification_head.dense.weight"),
    ("model.classification_heads.mnli.dense.bias", "classification_head.dense.bias"),
    ("model.classification_heads.mnli.out_proj.weight", "classification_head.out_proj.weight"),
    ("model.classification_heads.mnli.out_proj.bias", "classification_head.out_proj.bias"),
]

# 移除状态字典中的特定键，这些键不被需要
def remove_ignore_keys_(state_dict):
    """Drop the keys the conversion discards from `state_dict`, in place.

    Keys that are absent are skipped silently. Returns None.
    """
    unwanted = (
        "encoder.version",
        "decoder.version",
        "model.encoder.version",
        "model.decoder.version",
        "_float_tensor",
    )
    for name in unwanted:
        # A default of None makes the pop a no-op for missing keys.
        state_dict.pop(name, None)

# 将字典中的旧键重命名为新键
def rename_key(dct, old, new):
    """Move the value stored under `old` to `new` in `dct`, in place.

    Raises:
        KeyError: If `old` is not present in `dct`.
    """
    # The right-hand side is evaluated first, so the pop happens before the store.
    dct[new] = dct.pop(old)

# 从给定的检查点路径加载 XSum 模型
def load_xsum_checkpoint(checkpoint_path):
    """Load a local checkpoint into a `bart.large.cnn` hub interface and return it.

    Checkpoint path should end in model.pt
    """
    # NOTE(review): torch.load unpickles the file — only use it on trusted checkpoints.
    checkpoint = torch.load(checkpoint_path, map_location="cpu")

    # Fetch the bart.large.cnn interface from the fairseq hub repo and switch it to eval mode.
    interface = torch.hub.load("pytorch/fairseq", "bart.large.cnn")
    interface = interface.eval()

    # Overwrite the hub model's weights with the ones stored under "model" in the checkpoint.
    interface.model.load_state_dict(checkpoint["model"])
    return interface

# 从给定的嵌入层创建线性层
def make_linear_from_emb(emb):
    """Build a bias-free linear layer that shares its weight data with an embedding.

    Args:
        emb: Module whose `weight` has shape `(vocab_size, emb_size)`.

    Returns:
        An `nn.Linear` mapping `emb_size` features to `vocab_size` outputs whose
        weight data is the embedding's weight data (no copy is made).
    """
    vocab_size, emb_size = emb.weight.shape
    # `nn.Linear(in_features, out_features)` stores a weight of shape (out_features, in_features).
    # The original passed (vocab_size, emb_size), leaving `in_features`/`out_features` swapped
    # relative to the (vocab_size, emb_size) weight installed on the next line.
    lin_layer = nn.Linear(emb_size, vocab_size, bias=False)
    lin_layer.weight.data = emb.weight.data
    return lin_layer

# 转换 BART 模型检查点到 Hugging Face 的格式
@torch.no_grad()
def convert_bart_checkpoint(checkpoint_path, pytorch_dump_folder_path, hf_checkpoint_name=None):
    """
    Copy/paste/tweak model's weights to our BART structure.

    Args:
        checkpoint_path: A fairseq hub model name, or a path to a local checkpoint. If the
            path exists on disk it is loaded through `load_xsum_checkpoint`, otherwise the
            value is passed to `torch.hub.load` as a model name.
        pytorch_dump_folder_path: Folder the converted model is saved to.
        hf_checkpoint_name: Name used to load the HF config and tokenizer. Defaults to
            `checkpoint_path` with every "." replaced by "-".

    Raises:
        ValueError: If the two tokenizers disagree on `SAMPLE_TEXT`, or if the converted
            model's output differs from the fairseq output in shape or in any value.
    """
    if not os.path.exists(checkpoint_path):
        # Not a local file: treat the value as a fairseq hub model name.
        bart = torch.hub.load("pytorch/fairseq", checkpoint_path).eval()
    else:
        bart = load_xsum_checkpoint(checkpoint_path)

    bart.model.upgrade_state_dict(bart.model.state_dict())
    if hf_checkpoint_name is None:
        hf_checkpoint_name = checkpoint_path.replace(".", "-")
    config = BartConfig.from_pretrained(hf_checkpoint_name)

    # Encode the same sample with both tokenizers; they must agree before weights are compared.
    tokens = bart.encode(SAMPLE_TEXT).unsqueeze(0)
    tokens2 = BartTokenizer.from_pretrained(hf_checkpoint_name).encode(SAMPLE_TEXT, return_tensors="pt").unsqueeze(0)
    if not torch.eq(tokens, tokens2).all():
        raise ValueError(
            f"converted tokenizer and pretrained tokenizer returned different output: {tokens} != {tokens2}"
        )

    if checkpoint_path == "bart.large.mnli":
        state_dict = bart.state_dict()
        remove_ignore_keys_(state_dict)
        # Provide the shared embedding under the key the HF model expects,
        # then move the classification-head weights to their HF names.
        state_dict["model.shared.weight"] = state_dict["model.decoder.embed_tokens.weight"]
        for src, dest in mnli_rename_keys:
            rename_key(state_dict, src, dest)
        model = BartForSequenceClassification(config).eval()
        model.load_state_dict(state_dict)
        fairseq_output = bart.predict("mnli", tokens, return_logits=True)
        new_model_outputs = model(tokens)[0]  # logits
    else:  # no classification heads to worry about
        state_dict = bart.model.state_dict()
        remove_ignore_keys_(state_dict)
        state_dict["shared.weight"] = state_dict["decoder.embed_tokens.weight"]
        fairseq_output = bart.extract_features(tokens)
        if hf_checkpoint_name == "facebook/bart-large":
            model = BartModel(config).eval()
            model.load_state_dict(state_dict)
            # FIX: the original read `model(tokens).model[0]`; the other two branches take
            # element 0 of the model output directly, and this branch now does the same.
            new_model_outputs = model(tokens)[0]
        else:
            model = BartForConditionalGeneration(config).eval()  # an existing summarization ckpt
            model.model.load_state_dict(state_dict)
            if hasattr(model, "lm_head"):
                # Rebuild the LM head from the shared embedding.
                model.lm_head = make_linear_from_emb(model.model.shared)
            new_model_outputs = model.model(tokens)[0]

    # The converted model must reproduce the fairseq output exactly.
    if fairseq_output.shape != new_model_outputs.shape:
        raise ValueError(
            f"`fairseq_output` shape and `new_model_output` shape are different: {fairseq_output.shape=}, {new_model_outputs.shape=}"
        )
    if (fairseq_output != new_model_outputs).any().item():
        raise ValueError("Some values in `fairseq_output` are different from `new_model_outputs`")

    Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
    model.save_pretrained(pytorch_dump_folder_path)
if __name__ == "__main__":
    # Command-line entry point: two positional arguments plus an optional HF config name.
    cli = argparse.ArgumentParser()

    # Required: the fairseq model name or local checkpoint path.
    cli.add_argument(
        "fairseq_path",
        type=str,
        help="bart.large, bart.large.cnn or a path to a model.pt on local filesystem.",
    )
    # Required: the output folder for the converted model.
    cli.add_argument(
        "pytorch_dump_folder_path",
        default=None,
        type=str,
        help="Path to the output PyTorch model.",
    )
    # Optional: the Hugging Face checkpoint name used for config and tokenizer.
    cli.add_argument(
        "--hf_config",
        default=None,
        type=str,
        help="Which huggingface architecture to use: bart-large-xsum",
    )

    options = cli.parse_args()

    convert_bart_checkpoint(
        options.fairseq_path,
        options.pytorch_dump_folder_path,
        hf_checkpoint_name=options.hf_config,
    )

`.\models\bart\modeling_bart.py`

# 设置文件编码格式为UTF-8

# 版权声明和许可信息

# 导入所需的库和模块
""" PyTorch BART模型."""
import copy  # 导入深拷贝函数
import math  # 导入数学函数
import warnings  # 导入警告模块
from typing import List, Optional, Tuple, Union  # 导入类型提示相关的模块

import torch  # 导入PyTorch库
import torch.nn.functional as F  # 导入PyTorch的函数库
import torch.utils.checkpoint  # 导入PyTorch的checkpoint模块
from torch import nn  # 导入PyTorch的神经网络模块
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss  # 导入损失函数

from ...activations import ACT2FN  # 导入激活函数
from ...modeling_attn_mask_utils import (  # 导入注意力掩码的辅助函数
    _prepare_4d_attention_mask,
    _prepare_4d_attention_mask_for_sdpa,
    _prepare_4d_causal_attention_mask,
    _prepare_4d_causal_attention_mask_for_sdpa,
)
from ...modeling_outputs import (  # 导入模型输出相关的类和函数
    BaseModelOutput,
    BaseModelOutputWithPastAndCrossAttentions,
    CausalLMOutputWithCrossAttentions,
    Seq2SeqLMOutput,
    Seq2SeqModelOutput,
    Seq2SeqQuestionAnsweringModelOutput,
    Seq2SeqSequenceClassifierOutput,
)
from ...modeling_utils import PreTrainedModel  # 导入预训练模型的工具函数
from ...utils import (  # 导入工具函数
    add_code_sample_docstrings,
    add_end_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    is_flash_attn_2_available,
    is_flash_attn_greater_or_equal_2_10,
    logging,
    replace_return_docstrings,
)

from .configuration_bart import BartConfig  # 导入BART配置文件

if is_flash_attn_2_available():
    from flash_attn import flash_attn_func, flash_attn_varlen_func  # only imported when flash-attn 2 is available
    from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input  # padding helpers from flash-attn  # noqa

logger = logging.get_logger(__name__)  # module-level logger

_CHECKPOINT_FOR_DOC = "facebook/bart-base"  # checkpoint named in the docs
_CONFIG_FOR_DOC = "BartConfig"  # config class named in the docs

# Base model docstring
_EXPECTED_OUTPUT_SHAPE = [1, 8, 768]  # expected output shape

# SequenceClassification docstring
_CHECKPOINT_FOR_SEQUENCE_CLASSIFICATION = "valhalla/bart-large-sst2"  # checkpoint for the sequence-classification example
_SEQ_CLASS_EXPECTED_LOSS = 0.0  # expected loss for the sequence-classification example
_SEQ_CLASS_EXPECTED_OUTPUT = "'POSITIVE'"  # expected output for the sequence-classification example

# QuestionAnswering docstring
_CHECKPOINT_FOR_QA = "valhalla/bart-large-finetuned-squadv1"  # checkpoint for the question-answering example
_QA_EXPECTED_LOSS = 0.59  # expected loss for the question-answering example
_QA_EXPECTED_OUTPUT = "' nice puppet'"  # expected output for the question-answering example

BART_PRETRAINED_MODEL_ARCHIVE_LIST = [  # list of pretrained model names
    "facebook/bart-large",
    # See all BART models at https://huggingface.co/models?filter=bart
]


# 从transformers.models.llama.modeling_llama._get_unpad_data复制过来的函数
def _get_unpad_data(attention_mask):
    seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)  # 计算批次中每个序列的长度
    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()  # 找出非零位置的索引
    max_seqlen_in_batch = seqlens_in_batch.max().item()  # 获取批次中的最大序列长度
    # 对输入的序列长度进行累积求和，并在最前面填充一个零，以形成累计序列长度
    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
    # 返回三个值作为元组的形式
    return (
        indices,                # 返回的第一个元素是索引数组
        cu_seqlens,             # 返回的第二个元素是填充后的累计序列长度数组
        max_seqlen_in_batch,    # 返回的第三个元素是批次中最大的序列长度
    )
def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
    """
    Shift input ids one token to the right.

    Column 0 of the result holds `decoder_start_token_id`, the remaining columns hold the
    input shifted by one position, and every -100 in the result is replaced by `pad_token_id`.

    Raises:
        ValueError: If `pad_token_id` is None.
    """
    # Same shape, dtype and device as the input, filled with zeros.
    shifted = input_ids.new_zeros(input_ids.shape)
    shifted[:, 0] = decoder_start_token_id
    shifted[:, 1:] = input_ids[:, :-1].clone()

    if pad_token_id is None:
        raise ValueError("self.model.config.pad_token_id has to be defined.")

    # Replace any -100 values with the pad token id.
    ignore_positions = shifted == -100
    shifted.masked_fill_(ignore_positions, pad_token_id)

    return shifted


class BartLearnedPositionalEmbedding(nn.Embedding):
    """
    This module learns positional embeddings up to a fixed maximum size.
    """

    def __init__(self, num_embeddings: int, embedding_dim: int):
        """Create the table with `num_embeddings + 2` rows of size `embedding_dim`."""
        # Bart is set up so that if padding_idx is specified then offset the embedding ids by 2
        # and adjust num_embeddings appropriately. Other models don't have this hack
        self.offset = 2
        table_rows = num_embeddings + self.offset
        super().__init__(table_rows, embedding_dim)

    def forward(self, input_ids: torch.Tensor, past_key_values_length: int = 0):
        """`input_ids' shape is expected to be [bsz x seqlen]."""
        batch_size, length = input_ids.shape[:2]

        # Positions continue from the cached length and cover the current sequence.
        first = past_key_values_length
        last = past_key_values_length + length
        position_ids = torch.arange(first, last, dtype=torch.long, device=self.weight.device)
        position_ids = position_ids.expand(batch_size, -1)

        # Look the positions up in the table, shifted by the fixed offset.
        return super().forward(position_ids + self.offset)


class BartAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(
        self,
        embed_dim: int,
        num_heads: int,
        dropout: float = 0.0,
        is_decoder: bool = False,
        bias: bool = True,
        is_causal: bool = False,
        config: Optional[BartConfig] = None,
    ):
        """
        Store the attention hyper-parameters and create the query/key/value/output projections.

        Args:
            embed_dim: Width of the projections; must be divisible by `num_heads`.
            num_heads: Number of attention heads; `head_dim` is `embed_dim // num_heads`.
            dropout: Stored as `self.dropout`.
            is_decoder: Stored as `self.is_decoder`.
            bias: Whether the four linear projections carry a bias term.
            is_causal: Stored as `self.is_causal`.
            config: Stored as `self.config`.

        Raises:
            ValueError: If `embed_dim` is not divisible by `num_heads`.
        """
        super().__init__()
        # Record the attention hyper-parameters
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads
        self.config = config

        if (self.head_dim * num_heads) != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
                f" and `num_heads`: {num_heads})."
            )
        # 1 / sqrt(head_dim); presumably applied to the attention scores (its use is not visible in this excerpt)
        self.scaling = self.head_dim**-0.5
        self.is_decoder = is_decoder
        self.is_causal = is_causal

        # Linear projections for the key, value and query vectors
        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        # Linear projection applied to the attention output
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)

    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        """View `tensor` as (bsz, seq_len, num_heads, head_dim), swap dims 1 and 2, and return a contiguous copy."""
        # Resulting layout is (bsz, num_heads, seq_len, head_dim)
        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
    # Forward pass of the attention module.
    # NOTE(review): this excerpt stops inside the signature (no closing parenthesis and no body), and the
    # `key_value_states` parameter declared by the `forward` overrides further down (e.g.
    # `BartSdpaAttention.forward`) is absent here - confirm against the complete source.
    def forward(
        self,
        # Hidden states: the input tensor
        hidden_states: torch.Tensor,
        # Past key/value: optional tuple of previously computed key/value states
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        # Attention mask: optional tensor masking positions that should not be attended to
        attention_mask: Optional[torch.Tensor] = None,
        # Layer head mask: optional per-head mask for this layer
        layer_head_mask: Optional[torch.Tensor] = None,
        # Output attentions: whether the attention weights should be returned as well
        output_attentions: bool = False,
    """
    Bart flash attention module. This module inherits from `BartAttention` as the weights of the module stays
    untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
    flash attention and deal with padding tokens in case the input contains any of them.
    """
    
    # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__
    def __init__(self, *args, **kwargs):
        """Forward every argument to the parent constructor, then record which causal-mask alignment is in use."""
        super().__init__(*args, **kwargs)

        # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
        # flash_attn<2.1 generates a top-left aligned causal mask, while what is needed here is bottom-right
        # alignment, which became the default for flash_attn>=2.1. This attribute is used to handle the difference.
        # Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
        # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a
        # wrong mask (top-left).
        has_bottom_right_mask = is_flash_attn_greater_or_equal_2_10()
        self._flash_attn_uses_top_left_mask = not has_bottom_right_mask

    def _reshape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        """
        View `tensor` with the head dimension split out.

        Args:
            tensor (torch.Tensor): Tensor to view; its element count must match the target shape.
            seq_len (int): Size of the second dimension of the result.
            bsz (int): Size of the first dimension of the result.

        Returns:
            torch.Tensor: A view of `tensor` with shape (bsz, seq_len, num_heads, head_dim).
        """
        head_split_shape = (bsz, seq_len, self.num_heads, self.head_dim)
        return tensor.view(head_split_shape)

    def forward(
        self,
        hidden_states: torch.Tensor,
        key_value_states: Optional[torch.Tensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ):
        """
        Forward pass of the attention module (placeholder).

        The implementation is elided in this excerpt: none of the arguments are read and the method returns `None`.

        Args:
            hidden_states (torch.Tensor): Input hidden states.
            key_value_states (Optional[torch.Tensor]): Key/value states when they are supplied separately.
            past_key_value (Optional[Tuple[torch.Tensor]]): Past key and value tensors.
            attention_mask (Optional[torch.Tensor]): Mask for the attention computation.
            layer_head_mask (Optional[torch.Tensor]): Mask for the heads of this layer.
            output_attentions (bool): Whether attentions are requested.
        """
        # Forward pass logic is not included in this excerpt.
        return None
    
    # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2._flash_attention_forward
    def _flash_attention_forward(
        self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None
    ):
        """
        Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
        first unpad the input, then computes the attention scores and pad the final attention scores.

        Args:
            query_states (`torch.Tensor`):
                Input query states to be passed to Flash Attention API
            key_states (`torch.Tensor`):
                Input key states to be passed to Flash Attention API
            value_states (`torch.Tensor`):
                Input value states to be passed to Flash Attention API
            attention_mask (`torch.Tensor`):
                The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
                position of padding tokens and 1 for the position of non-padding tokens.
            query_length (`int`):
                Length of the query sequence. It is used to unpad/re-pad the queries and, when the installed
                flash-attn produces top-left aligned masks, to switch causal masking off for a length of 1.
            dropout (`float`):
                Attention dropout
            softmax_scale (`float`, *optional*):
                The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)

        Returns:
            The attention output: re-padded through `pad_input` when `attention_mask` is given, otherwise the direct
            result of `flash_attn_func`.
        """
        # Decide whether causal masking is requested, based on `_flash_attn_uses_top_left_mask` and `query_length`.
        if not self._flash_attn_uses_top_left_mask:
            causal = self.is_causal
        else:
            # Temporary check for RoCm compatibility; remove once Flash Attention for RoCm is bumped to 2.1.
            causal = self.is_causal and query_length != 1

        if attention_mask is not None:
            batch_size = query_states.shape[0]
            # Drop the padded positions so that the variable-length kernel only sees real tokens.
            query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
                query_states, key_states, value_states, attention_mask, query_length
            )

            cu_seqlens_q, cu_seqlens_k = cu_seq_lens
            max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens

            attn_output_unpad = flash_attn_varlen_func(
                query_states,
                key_states,
                value_states,
                cu_seqlens_q=cu_seqlens_q,
                cu_seqlens_k=cu_seqlens_k,
                max_seqlen_q=max_seqlen_in_batch_q,
                max_seqlen_k=max_seqlen_in_batch_k,
                dropout_p=dropout,
                softmax_scale=softmax_scale,
                causal=causal,
            )

            # Scatter the unpadded result back to (batch_size, query_length, ...).
            attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
        else:
            # No padding mask: call the dense kernel directly.
            attn_output = flash_attn_func(
                query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal
            )

        return attn_output

    # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2._upad_input
    # 在内部方法中，根据给定的注意力掩码处理输入数据，返回处理后的查询、键、值、索引等。
    def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
        """
        Strip padded positions from the query/key/value tensors according to `attention_mask`.

        Returns:
            A 6-tuple `(query_layer, key_layer, value_layer, indices_q, (cu_seqlens_q, cu_seqlens_k),
            (max_seqlen_in_batch_q, max_seqlen_in_batch_k))`.
        """
        # Indices of the kept positions, cumulative sequence lengths and the longest sequence for keys/values.
        indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
        batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape

        # Flatten (batch, seq) into one axis, then keep only the rows selected by `indices_k`.
        flat_kv_shape = (batch_size * kv_seq_len, num_key_value_heads, head_dim)
        key_layer = index_first_axis(key_layer.reshape(flat_kv_shape), indices_k)
        value_layer = index_first_axis(value_layer.reshape(flat_kv_shape), indices_k)

        if query_length == kv_seq_len:
            # Queries cover the same positions as keys: reuse the key-side bookkeeping.
            flat_q_shape = (batch_size * kv_seq_len, self.num_heads, head_dim)
            query_layer = index_first_axis(query_layer.reshape(flat_q_shape), indices_k)
            indices_q, cu_seqlens_q, max_seqlen_in_batch_q = indices_k, cu_seqlens_k, max_seqlen_in_batch_k
        elif query_length == 1:
            # One query token per batch row: every query "sequence" has length 1.
            max_seqlen_in_batch_q = 1
            # The original comment flags this arange as involving a memory copy (inefficient).
            cu_seqlens_q = torch.arange(batch_size + 1, dtype=torch.int32, device=query_layer.device)
            indices_q = cu_seqlens_q[:-1]
            query_layer = query_layer.squeeze(1)
        else:
            # Keep only the last `query_length` mask columns; this slice assumes left padding.
            query_mask = attention_mask[:, -query_length:]
            query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, query_mask)

        cumulative_lengths = (cu_seqlens_q, cu_seqlens_k)
        longest_lengths = (max_seqlen_in_batch_q, max_seqlen_in_batch_k)
        return (query_layer, key_layer, value_layer, indices_q, cumulative_lengths, longest_lengths)
class BartSdpaAttention(BartAttention):
    """`BartAttention` subclass registered under the "sdpa" key of `BART_ATTENTION_CLASSES`."""

    def forward(
        self,
        hidden_states: torch.Tensor,
        key_value_states: Optional[torch.Tensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ):
        """
        Override of `BartAttention.forward` (placeholder).

        The implementation is elided in this excerpt: no argument is read and the method returns `None`.

        Args:
            hidden_states: Input hidden-state tensor.
            key_value_states: Optional key/value state tensor (defaults to `None`).
            past_key_value: Optional tuple of past key/value tensors (defaults to `None`).
            attention_mask: Optional attention-mask tensor (defaults to `None`).
            layer_head_mask: Optional per-head mask tensor for this layer (defaults to `None`).
            output_attentions: Whether attention weights are requested (defaults to `False`).
        """
        return None

# Maps an attention-implementation name (looked up via `config._attn_implementation` by the encoder and decoder
# layers) to the attention class that implements it.
BART_ATTENTION_CLASSES = {
    "eager": BartAttention,
    "sdpa": BartSdpaAttention,
    "flash_attention_2": BartFlashAttention2,
}


class BartEncoderLayer(nn.Module):
    """One BART encoder layer: self-attention followed by a two-layer feed-forward block, each post-normalized."""

    def __init__(self, config: BartConfig):
        """Build the sub-modules from `config`; the attention class is chosen by `config._attn_implementation`."""
        super().__init__()
        self.embed_dim = config.d_model

        # Pick the attention implementation registered for the configured backend.
        self.self_attn = BART_ATTENTION_CLASSES[config._attn_implementation](
            embed_dim=self.embed_dim,
            num_heads=config.encoder_attention_heads,
            dropout=config.attention_dropout,
            config=config,
        )
        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
        self.dropout = config.dropout
        self.activation_fn = ACT2FN[config.activation_function]
        self.activation_dropout = config.activation_dropout
        self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
        self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
        self.final_layer_norm = nn.LayerNorm(self.embed_dim)

    def forward(
        self,
        hidden_states: torch.FloatTensor,
        attention_mask: torch.FloatTensor,
        layer_head_mask: torch.FloatTensor,
        output_attentions: Optional[bool] = False,
    ):
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
                `(encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.

        Returns:
            A tuple `(hidden_states,)`, extended with the attention weights when `output_attentions` is truthy.
        """
        # Self-attention sub-block: attention -> dropout -> residual add -> layer norm.
        residual = hidden_states
        hidden_states, attn_weights, _ = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            layer_head_mask=layer_head_mask,
            output_attentions=output_attentions,
        )
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states
        hidden_states = self.self_attn_layer_norm(hidden_states)

        # Feed-forward sub-block: fc1 -> activation -> dropout -> fc2 -> dropout -> residual add -> layer norm.
        residual = hidden_states
        hidden_states = self.activation_fn(self.fc1(hidden_states))
        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
        hidden_states = self.fc2(hidden_states)
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states
        hidden_states = self.final_layer_norm(hidden_states)

        # In float16, clamp to just inside the representable range when inf/NaN values appear.
        if hidden_states.dtype == torch.float16 and (
            torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any()
        ):
            clamp_value = torch.finfo(hidden_states.dtype).max - 1000
            hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (attn_weights,)

        return outputs
class BartDecoderLayer(nn.Module):
    """One BART decoder layer: causal self-attention, a second `encoder_attn` attention module, and a feed-forward block."""

    def __init__(self, config: BartConfig):
        """Build the sub-modules from `config`; both attention modules use the class chosen by `config._attn_implementation`."""
        super().__init__()
        self.embed_dim = config.d_model
        attention_cls = BART_ATTENTION_CLASSES[config._attn_implementation]

        # Self-attention, created with `is_causal=True`.
        self.self_attn = attention_cls(
            embed_dim=self.embed_dim,
            num_heads=config.decoder_attention_heads,
            dropout=config.attention_dropout,
            is_decoder=True,
            is_causal=True,
            config=config,
        )
        # Dropout probability, activation function and the dropout probability stored for the activation.
        self.dropout = config.dropout
        self.activation_fn = ACT2FN[config.activation_function]
        self.activation_dropout = config.activation_dropout
        # Layer norm paired with the self-attention module.
        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)

        # Second attention module (`encoder_attn`), created without `is_causal`.
        self.encoder_attn = attention_cls(
            embed_dim=self.embed_dim,
            num_heads=config.decoder_attention_heads,
            dropout=config.attention_dropout,
            is_decoder=True,
            config=config,
        )
        # Layer norm paired with `encoder_attn`.
        self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)

        # Feed-forward block: embed_dim -> decoder_ffn_dim -> embed_dim, plus its layer norm.
        self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
        self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)
        self.final_layer_norm = nn.LayerNorm(self.embed_dim)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        cross_attn_layer_head_mask: Optional[torch.Tensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = True,
    ):
        """
        Forward pass of the decoder layer (placeholder).

        The computation is elided in this excerpt: no argument is read and the method returns `None`.

        Args:
            hidden_states: Input hidden-state tensor.
            attention_mask: Optional attention mask.
            encoder_hidden_states: Optional encoder hidden-state tensor.
            encoder_attention_mask: Optional encoder attention mask.
            layer_head_mask: Optional per-head mask for this layer.
            cross_attn_layer_head_mask: Optional per-head mask for the cross-attention of this layer.
            past_key_value: Optional cached key/value tensors.
            output_attentions: Whether attention weights are requested (defaults to `False`).
            use_cache: Whether caching is requested (defaults to `True`).
        """
        return None


class BartClassificationHead(nn.Module):
    """Head for sentence-level classification tasks."""

    def __init__(self, input_dim: int, inner_dim: int, num_classes: int, pooler_dropout: float):
        """
        Args:
            input_dim: Size of the incoming feature vectors.
            inner_dim: Output size of the intermediate `dense` projection.
            num_classes: Output size of the final `out_proj` projection.
            pooler_dropout: Probability used by the shared dropout module.
        """
        super().__init__()
        # input_dim -> inner_dim
        self.dense = nn.Linear(input_dim, inner_dim)
        # The same dropout module is applied before each of the two projections.
        self.dropout = nn.Dropout(p=pooler_dropout)
        # inner_dim -> num_classes
        self.out_proj = nn.Linear(inner_dim, num_classes)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Apply dropout -> dense -> tanh -> dropout -> out_proj and return the result."""
        projected = self.dense(self.dropout(hidden_states))
        activated = torch.tanh(projected)
        return self.out_proj(self.dropout(activated))


class BartPreTrainedModel(PreTrainedModel):
    """Base class for the BART models: class-level settings, weight initialization and dummy inputs."""

    # Class-level flags and settings; no computation happens here.
    config_class = BartConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _keys_to_ignore_on_load_unexpected = ["encoder.version", "decoder.version"]
    _no_split_modules = [r"BartEncoderLayer", r"BartDecoderLayer"]
    _skip_keys_device_placement = "past_key_values"
    _supports_flash_attn_2 = True
    _supports_sdpa = True

    def _init_weights(self, module):
        """
        Initialize `module` in place when it is an `nn.Linear` or an `nn.Embedding`; other modules are left alone.

        Weights are drawn from a normal distribution with mean 0 and standard deviation `config.init_std`. A linear
        bias is zeroed, and so is the embedding row at `padding_idx` when one is set.
        """
        std = self.config.init_std
        if not isinstance(module, (nn.Linear, nn.Embedding)):
            return

        module.weight.data.normal_(mean=0.0, std=std)
        if isinstance(module, nn.Linear):
            if module.bias is not None:
                module.bias.data.zero_()
        elif module.padding_idx is not None:
            module.weight.data[module.padding_idx].zero_()

    @property
    def dummy_inputs(self):
        """Return a small fixed batch: `input_ids` (two rows of five ids) and the matching `attention_mask`."""
        pad_token = self.config.pad_token_id
        token_rows = [[0, 6, 10, 4, 2], [0, 8, 12, 2, pad_token]]
        input_ids = torch.tensor(token_rows, device=self.device)
        # The mask is True wherever the id differs from the pad token.
        return {
            "attention_mask": input_ids.ne(pad_token),
            "input_ids": input_ids,
        }
class PretrainedBartModel(BartPreTrainedModel):
    """Subclass of `BartPreTrainedModel` kept under an old name; subclassing it emits a `FutureWarning`."""

    def __init_subclass__(cls, **kwargs):
        # Chain to the parent hook so that cooperative `__init_subclass__` implementations keep working.
        super().__init_subclass__(**kwargs)
        # stacklevel=2 attributes the warning to the code that defines the subclass rather than to this file.
        warnings.warn(
            "The class `PretrainedBartModel` has been depreciated, please use `BartPreTrainedModel` instead.",
            FutureWarning,
            stacklevel=2,
        )


class BartPretrainedModel(BartPreTrainedModel):
    """Subclass of `BartPreTrainedModel` kept under an old name; subclassing it emits a `FutureWarning`."""

    def __init_subclass__(cls, **kwargs):
        # Chain to the parent hook so that cooperative `__init_subclass__` implementations keep working.
        super().__init_subclass__(**kwargs)
        # The message names this class (`BartPretrainedModel`); it previously named `PretrainedBartModel`.
        # stacklevel=2 attributes the warning to the code that defines the subclass rather than to this file.
        warnings.warn(
            "The class `BartPretrainedModel` has been depreciated, please use `BartPreTrainedModel` instead.",
            FutureWarning,
            stacklevel=2,
        )


# Docstring preamble for the BART model classes; it is passed to `add_start_docstrings` on `BartModel`.
BART_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`BartConfig`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

# Usage examples (summarization and mask filling), kept as one raw documentation string.
BART_GENERATION_EXAMPLE = r"""
    Summarization example:

    ```
    >>> from transformers import AutoTokenizer, BartForConditionalGeneration

    >>> model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")
    >>> tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")

    >>> ARTICLE_TO_SUMMARIZE = (
    ...     "PG&E stated it scheduled the blackouts in response to forecasts for high winds "
    ...     "amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were "
    ...     "scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow."
    ... )
    >>> inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors="pt")

    >>> # Generate Summary
    >>> summary_ids = model.generate(inputs["input_ids"], num_beams=2, min_length=0, max_length=20)
    >>> tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
    'PG&E scheduled the blackouts in response to forecasts for high winds amid dry conditions'
    ```

    Mask filling example:

    ```
    >>> from transformers import AutoTokenizer, BartForConditionalGeneration

    >>> tokenizer = AutoTokenizer.from_pretrained("facebook/bart-base")
    >>> model = BartForConditionalGeneration.from_pretrained("facebook/bart-base")

    >>> TXT = "My friends are <mask> but they eat too many carbs."
    >>> input_ids = tokenizer([TXT], return_tensors="pt")["input_ids"]
    >>> logits = model(input_ids).logits

    >>> # Locate the (single) mask token in the input sequence
    >>> masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item()
    >>> # Turn the logits at the masked position into a probability distribution
    >>> probs = logits[0, masked_index].softmax(dim=0)
    >>> # Keep the five most probable token ids together with their probabilities
    >>> values, predictions = probs.topk(5)

    >>> # Decode the predicted ids and split them into a list of words
    >>> tokenizer.decode(predictions).split()
    ['not', 'good', 'healthy', 'great', 'very']
    ```
"""
# Docstring describing the BART model inputs; in this excerpt its text is empty (a single newline).
BART_INPUTS_DOCSTRING = r"""
"""


class BartEncoder(BartPreTrainedModel):
    """
    BART encoder: a Transformer encoder made of *config.encoder_layers* [`BartEncoderLayer`] modules.

    Args:
        config: BartConfig
        embed_tokens (nn.Embedding): optional embedding whose weight is reused for this encoder's token embedding
    """

    def __init__(self, config: BartConfig, embed_tokens: Optional[nn.Embedding] = None):
        """Build the token/position embeddings, the stack of encoder layers and the embedding layer norm."""
        super().__init__(config)

        self.dropout = config.dropout  # dropout probability from the config
        self.layerdrop = config.encoder_layerdrop  # value of `config.encoder_layerdrop`

        embed_dim = config.d_model  # embedding width
        self.padding_idx = config.pad_token_id  # index of the padding token
        self.max_source_positions = config.max_position_embeddings  # maximum number of source positions
        self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0  # embedding scale factor

        self.embed_tokens = nn.Embedding(config.vocab_size, embed_dim, self.padding_idx)  # token embedding

        if embed_tokens is not None:
            self.embed_tokens.weight = embed_tokens.weight  # reuse the weight of the embedding that was passed in

        self.embed_positions = BartLearnedPositionalEmbedding(
            config.max_position_embeddings,
            embed_dim,
        )  # learned positional embedding

        self.layers = nn.ModuleList([BartEncoderLayer(config) for _ in range(config.encoder_layers)])  # encoder layers
        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"  # Flash Attention 2 selected?
        self._use_sdpa = config._attn_implementation == "sdpa"  # SDPA (scaled dot-product attention) selected?
        self.layernorm_embedding = nn.LayerNorm(embed_dim)  # layer norm stored as `layernorm_embedding`

        self.gradient_checkpointing = False  # gradient checkpointing is off by default
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """Return the module stored in `self.embed_tokens`."""
        return self.embed_tokens  # the input embedding module

    def set_input_embeddings(self, value):
        """Replace `self.embed_tokens` with `value`."""
        self.embed_tokens = value  # install the new input embedding module

    # NOTE(review): this excerpt stops inside the signature below (no closing parenthesis and no body); confirm
    # the remainder against the complete source.
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    # 初始化方法，接收一个BartConfig对象和一个可选的嵌入词表的参数
    def __init__(self, config: BartConfig, embed_tokens: Optional[nn.Embedding] = None):
        """
        Build the token/position embeddings, the stack of decoder layers and the embedding layer norm.

        Args:
            config: Configuration object; the sizes and flags used below are read from it.
            embed_tokens: Optional embedding whose weight replaces the weight of the newly created token embedding.
        """
        super().__init__(config)

        d_model = config.d_model
        attn_implementation = config._attn_implementation

        # Plain settings copied from the configuration.
        self.dropout = config.dropout
        self.layerdrop = config.decoder_layerdrop
        self.padding_idx = config.pad_token_id
        self.max_target_positions = config.max_position_embeddings
        # sqrt(d_model) when `scale_embedding` is set, otherwise 1.0.
        self.embed_scale = math.sqrt(d_model) if config.scale_embedding else 1.0

        # Token embedding; when an embedding is passed in, its weight is used instead of the fresh one.
        self.embed_tokens = nn.Embedding(config.vocab_size, d_model, self.padding_idx)
        if embed_tokens is not None:
            self.embed_tokens.weight = embed_tokens.weight

        # Learned positional embedding.
        self.embed_positions = BartLearnedPositionalEmbedding(config.max_position_embeddings, d_model)

        # One `BartDecoderLayer` per configured decoder layer.
        decoder_layers = [BartDecoderLayer(config) for _ in range(config.decoder_layers)]
        self.layers = nn.ModuleList(decoder_layers)

        # Flags recording which attention implementation was selected.
        self._use_flash_attention_2 = attn_implementation == "flash_attention_2"
        self._use_sdpa = attn_implementation == "sdpa"

        self.layernorm_embedding = nn.LayerNorm(d_model)

        self.gradient_checkpointing = False

        # Initialize weights and apply final processing
        self.post_init()

    # 获取输入嵌入的方法，返回当前的embed_tokens对象
    def get_input_embeddings(self):
        """Return the module currently stored in `self.embed_tokens`."""
        return self.embed_tokens

    # 设置输入嵌入的方法，将传入的value赋值给embed_tokens
    def set_input_embeddings(self, value):
        """Replace `self.embed_tokens` with `value`."""
        self.embed_tokens = value

    # 前向传播方法
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
@add_start_docstrings(
    "The BART Model outputting raw hidden-states without any specific head on top.",
    BART_START_DOCSTRING,
)
# 定义 BART 模型类，继承自 BartPreTrainedModel
class BartModel(BartPreTrainedModel):
    # 被绑定权重的键名列表，用于共享编码和解码器的嵌入权重
    _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]

    # 初始化函数，接收一个 BartConfig 对象作为参数
    def __init__(self, config: BartConfig):
        """Create the shared embedding table and the encoder/decoder that both receive it.

        Args:
            config: Model configuration; ``vocab_size``, ``d_model`` and ``pad_token_id``
                are read here, and the whole object is forwarded to the submodules.
        """
        super().__init__(config)

        # A single embedding table (vocab_size x d_model, pad_token_id as padding index)
        # is built once and handed to both the encoder and the decoder.
        self.shared = nn.Embedding(config.vocab_size, config.d_model, config.pad_token_id)

        self.encoder = BartEncoder(config, self.shared)
        self.decoder = BartDecoder(config, self.shared)

        # Post-construction hook (weight initialisation and final processing).
        self.post_init()

    # 绑定权重函数
    def _tie_weights(self):
        # 如果配置要求绑定词嵌入权重，则将编码器和解码器的 embed_tokens 与共享的词嵌入层绑定或克隆
        if self.config.tie_word_embeddings:
            self._tie_or_clone_weights(self.encoder.embed_tokens, self.shared)
            self._tie_or_clone_weights(self.decoder.embed_tokens, self.shared)

    # 获取输入词嵌入函数
    def get_input_embeddings(self):
        """Return the shared embedding table (``self.shared``)."""
        shared_embedding = self.shared
        return shared_embedding

    # 设置输入词嵌入函数
    def set_input_embeddings(self, value):
        """Install ``value`` as the shared embedding and point both submodules at it.

        ``self.shared`` is assigned first; the encoder's and then the decoder's
        ``embed_tokens`` are set to whatever ``self.shared`` holds afterwards.
        """
        self.shared = value
        for submodule in (self.encoder, self.decoder):
            submodule.embed_tokens = self.shared

    # 获取编码器对象函数
    def get_encoder(self):
        """Return the encoder submodule (``self.encoder``)."""
        encoder = self.encoder
        return encoder

    # 获取解码器对象函数
    def get_decoder(self):
        """Return the decoder submodule (``self.decoder``)."""
        decoder = self.decoder
        return decoder

    @add_start_docstrings_to_model_forward(BART_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=Seq2SeqModelOutput,
        config_class=_CONFIG_FOR_DOC,
        expected_output=_EXPECTED_OUTPUT_SHAPE,
    )
    # 前向传播函数，接收多种输入和掩码，返回预测输出
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        decoder_head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[List[torch.FloatTensor]] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    # 初始化方法，接受一个 BartConfig 对象作为参数
    def __init__(self, config: BartConfig):
        """Wrap a ``BartModel`` with a logits bias buffer and a bias-free LM head.

        Args:
            config: Model configuration; ``d_model`` is read here and the object is
                forwarded to ``BartModel``.
        """
        super().__init__(config)
        self.model = BartModel(config)

        # Both the bias buffer and the output projection are sized from the number of
        # rows of the model's shared embedding table.
        vocab_rows = self.model.shared.num_embeddings
        self.register_buffer("final_logits_bias", torch.zeros((1, vocab_rows)))
        self.lm_head = nn.Linear(config.d_model, self.model.shared.num_embeddings, bias=False)

        # Post-construction hook.
        self.post_init()

    # 获取编码器部分的方法
    def get_encoder(self):
        """Return whatever the wrapped model's ``get_encoder()`` returns."""
        encoder = self.model.get_encoder()
        return encoder

    # 获取解码器部分的方法
    def get_decoder(self):
        """Return whatever the wrapped model's ``get_decoder()`` returns."""
        decoder = self.model.get_decoder()
        return decoder

    # 调整 token embeddings 的方法，返回调整后的新 embeddings
    def resize_token_embeddings(self, new_num_tokens: int, pad_to_multiple_of: Optional[int] = None) -> nn.Embedding:
        """Resize the token embeddings and keep ``final_logits_bias`` the same width.

        Args:
            new_num_tokens: Requested number of tokens, forwarded to the parent implementation.
            pad_to_multiple_of: Forwarded unchanged to the parent implementation.

        Returns:
            The embedding object returned by the parent ``resize_token_embeddings``.
        """
        resized = super().resize_token_embeddings(new_num_tokens, pad_to_multiple_of)
        # The bias is sized from the row count of the returned embedding rather than
        # from the requested ``new_num_tokens``.
        vocab_rows = resized.weight.shape[0]
        self._resize_final_logits_bias(vocab_rows)
        return resized

    # 调整 final_logits_bias 的私有方法，根据新的 token 数量调整 bias 的大小
    def _resize_final_logits_bias(self, new_num_tokens: int) -> None:
        # 获取当前 final_logits_bias 的 token 数量
        old_num_tokens = self.final_logits_bias.shape[-1]
        # 如果新的 token 数量小于等于当前的 token 数量，直接截取对应的部分作为新的 bias
        if new_num_tokens <= old_num_tokens:
            new_bias = self.final_logits_bias[:, :new_num_tokens]
        # 如果新的 token 数量大于当前的 token 数量，则扩展新的 bias，并将扩展部分填充为零
        else:
            extra_bias = torch.zeros((1, new_num_tokens - old_num_tokens), device=self.final_logits_bias.device)
            new_bias = torch.cat([self.final_logits_bias, extra_bias], dim=1)
        # 注册更新后的 final_logits_bias 属性
        self.register_buffer("final_logits_bias", new_bias)

    # 获取输出 embeddings 的方法
    def get_output_embeddings(self):
        """Return the output projection stored in ``self.lm_head``."""
        output_projection = self.lm_head
        return output_projection

    # 设置输出 embeddings 的方法
    def set_output_embeddings(self, new_embeddings):
        """Store ``new_embeddings`` as the output projection (``self.lm_head``)."""
        setattr(self, "lm_head", new_embeddings)

    # 前向传播方法，接受多个输入参数，详细说明见装饰器内的文档字符串
    @add_start_docstrings_to_model_forward(BART_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
    @add_end_docstrings(BART_GENERATION_EXAMPLE)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        decoder_head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[List[torch.FloatTensor]] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, Seq2SeqLMOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size - 1]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are
            ignored (masked), the loss is only computed for the tokens with labels in `[0, ...,
            config.vocab_size - 1]`.

        Returns:
            Returns either a tuple or `Seq2SeqLMOutput` depending on `return_dict`.

        """
        # Fall back to the config default when the caller did not choose a return format.
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # When labels are given, caching is disabled and decoder inputs may be derived from the labels.
        if labels is not None:
            if use_cache:
                # Only warn when the caller explicitly asked for caching.
                logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.")
            # `use_cache` is forced to `False` whenever `labels` are provided.
            use_cache = False
            # Only build decoder inputs from the labels when the caller supplied neither ids nor embeddings.
            if decoder_input_ids is None and decoder_inputs_embeds is None:
                decoder_input_ids = shift_tokens_right(
                    labels, self.config.pad_token_id, self.config.decoder_start_token_id
                )

        # Run the wrapped model; every argument is forwarded by keyword except `input_ids`.
        outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            encoder_outputs=encoder_outputs,
            decoder_attention_mask=decoder_attention_mask,
            head_mask=head_mask,
            decoder_head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            decoder_inputs_embeds=decoder_inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Project the first model output through the LM head, then add the bias buffer
        # (moved to the logits' device first).
        lm_logits = self.lm_head(outputs[0])
        lm_logits = lm_logits + self.final_logits_bias.to(lm_logits.device)

        masked_lm_loss = None
        # The loss is computed only when labels are provided.
        if labels is not None:
            labels = labels.to(lm_logits.device)
            # `CrossEntropyLoss` with default arguments skips targets equal to -100 (its default `ignore_index`).
            loss_fct = CrossEntropyLoss()
            # Flatten logits to (-1, vocab_size) and labels to (-1,) before computing the loss.
            masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1))

        # Tuple form: logits replace the first model output; the loss is prepended only when it was computed.
        if not return_dict:
            output = (lm_logits,) + outputs[1:]
            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

        # Structured form: the remaining fields are copied from the wrapped model's output.
        return Seq2SeqLMOutput(
            loss=masked_lm_loss,
            logits=lm_logits,
            past_key_values=outputs.past_key_values,
            decoder_hidden_states=outputs.decoder_hidden_states,
            decoder_attentions=outputs.decoder_attentions,
            cross_attentions=outputs.cross_attentions,
            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
            encoder_hidden_states=outputs.encoder_hidden_states,
            encoder_attentions=outputs.encoder_attentions,
        )
    def prepare_inputs_for_generation(
        self,
        decoder_input_ids,
        past_key_values=None,
        attention_mask=None,
        decoder_attention_mask=None,
        head_mask=None,
        decoder_head_mask=None,
        cross_attn_head_mask=None,
        use_cache=None,
        encoder_outputs=None,
        **kwargs,
    ):
        """Assemble the keyword arguments for a single generation step.

        When ``past_key_values`` is given, ``decoder_input_ids`` is trimmed so that only
        the ids not covered by the cache remain. Every other argument is passed through
        unchanged, and extra ``kwargs`` are accepted but not used.

        Returns:
            A dict of model inputs; ``"input_ids"`` is always ``None``.
        """
        if past_key_values is not None:
            # Cached length is read from dimension 2 of the first tensor of the first layer.
            cached_len = past_key_values[0][0].shape[2]
            given_len = decoder_input_ids.shape[1]
            # More ids than cached: drop the cached prefix. Otherwise keep only the last id.
            drop = cached_len if given_len > cached_len else given_len - 1
            decoder_input_ids = decoder_input_ids[:, drop:]

        step_inputs = {
            "input_ids": None,  # not needed because encoder_outputs is passed instead
            "encoder_outputs": encoder_outputs,
            "past_key_values": past_key_values,
            "decoder_input_ids": decoder_input_ids,
            "attention_mask": attention_mask,
            "decoder_attention_mask": decoder_attention_mask,
            "head_mask": head_mask,
            "decoder_head_mask": decoder_head_mask,
            "cross_attn_head_mask": cross_attn_head_mask,
            "use_cache": use_cache,
        }
        return step_inputs

    def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
        """Build decoder input ids from ``labels`` by calling ``shift_tokens_right``.

        The pad token id and decoder start token id are taken from ``self.config``.
        """
        cfg = self.config
        return shift_tokens_right(labels, cfg.pad_token_id, cfg.decoder_start_token_id)

    @staticmethod
    def _reorder_cache(past_key_values, beam_idx):
        reordered_past = ()
        for layer_past in past_key_values:
            # 缓存的交叉注意力状态不需要重新排序 -> 它们总是相同的
            reordered_past += (
                tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past[:2])
                + layer_past[2:],
            )
        return reordered_past
@add_start_docstrings(
    """
    Bart model with a sequence classification/head on top (a linear layer on top of the pooled output) e.g. for GLUE
    tasks.
    """,
    BART_START_DOCSTRING,
)
class BartForSequenceClassification(BartPreTrainedModel):
    # Weight names declared as tied.
    _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]

    def __init__(self, config: BartConfig, **kwargs):
        """Build the BART backbone and the classification head.

        Args:
            config: Model configuration; `d_model`, `num_labels` and `classifier_dropout` are read here.
            **kwargs: Forwarded unchanged to the parent constructor.
        """
        super().__init__(config, **kwargs)
        self.model = BartModel(config)  # BART backbone built from the same config
        self.classification_head = BartClassificationHead(  # head receiving the four values below, in this order
            config.d_model,
            config.d_model,
            config.num_labels,
            config.classifier_dropout,
        )

        # Initialize weights and apply final processing
        self.post_init()  # post-construction hook

    @add_start_docstrings_to_model_forward(BART_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_SEQUENCE_CLASSIFICATION,
        output_type=Seq2SeqSequenceClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
        expected_output=_SEQ_CLASS_EXPECTED_OUTPUT,
        expected_loss=_SEQ_CLASS_EXPECTED_LOSS,
    )
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        decoder_head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        """
        Forward pass of the BartForSequenceClassification model.

        NOTE(review): the parameter descriptions below follow the parameter names; the implementation is not
        part of this excerpt, so confirm them against the full source.

        Args:
            input_ids: Indices of input sequence tokens in the vocabulary.
            attention_mask: Mask to avoid performing attention on padding tokens.
            decoder_input_ids: Indices of decoder input sequence tokens in the vocabulary.
            decoder_attention_mask: Mask to avoid performing attention on padding tokens for decoder.
            head_mask: Mask to nullify selected heads of the attention modules (presumably in the encoder).
            decoder_head_mask: Mask to nullify selected heads of the attention modules (presumably in the
                decoder).
            cross_attn_head_mask: Mask to nullify selected heads of the cross-attention modules.
            encoder_outputs: Hidden states of the encoder at each layer.
            inputs_embeds: Optional tensor of embeddings to be used instead of input_ids.
            decoder_inputs_embeds: Optional tensor of embeddings to be used instead of decoder_input_ids.
            labels: Labels for computing the sequence classification/regression loss.
            use_cache: Whether or not to use the pre-computed hidden states cache.
            output_attentions: Whether or not to return the attentions tensors.
            output_hidden_states: Whether or not to return the hidden states tensors.
            return_dict: Whether or not to return a dictionary as output.

        Returns:
            Depending on `return_dict`, either a dictionary (`Seq2SeqSequenceClassifierOutput`) or
            a tuple with sequence classifier output and optional hidden states and attentions.
        """
        # NOTE(review): the implementation of the forward pass is not included in this excerpt; only the
        # docstring is present, so the method as shown returns None.

@add_start_docstrings(
    """
    BART Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
    layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
    """,
    BART_START_DOCSTRING,
)
class BartForQuestionAnswering(BartPreTrainedModel):
    _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]

    def __init__(self, config):
        """Build the BART backbone and the two-output linear span head.

        Args:
            config: Model configuration. Its `num_labels` attribute is overwritten with 2 on the
                object that was passed in; `hidden_size` is read for the linear layer.
        """
        super().__init__(config)

        # Two outputs per position (span start and span end logits). The value is written to the
        # config first and then read back from it.
        config.num_labels = 2
        self.num_labels = config.num_labels

        self.model = BartModel(config)  # BART backbone built from the same (now modified) config
        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)  # linear layer: hidden_size -> num_labels

        # Initialize weights and apply final processing
        self.post_init()  # post-construction hook

    @add_start_docstrings_to_model_forward(BART_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_QA,  # 使用指定的检查点用于问答模型
        output_type=Seq2SeqQuestionAnsweringModelOutput,  # 指定输出类型为Seq2SeqQuestionAnsweringModelOutput
        config_class=_CONFIG_FOR_DOC,  # 使用指定的配置类来配置模型
        expected_loss=_QA_EXPECTED_LOSS,  # 预期的损失值用于模型评估
        expected_output=_QA_EXPECTED_OUTPUT,  # 预期的输出用于模型评估
    )
    def forward(
        self,
        input_ids: torch.Tensor = None,  # 输入的token IDs张量，可选
        attention_mask: Optional[torch.Tensor] = None,  # 注意力掩码张量，可选
        decoder_input_ids: Optional[torch.LongTensor] = None,  # 解码器的token IDs张量，可选
        decoder_attention_mask: Optional[torch.LongTensor] = None,  # 解码器的注意力掩码张量，可选
        head_mask: Optional[torch.Tensor] = None,  # 头部掩码张量，可选
        decoder_head_mask: Optional[torch.Tensor] = None,  # 解码器的头部掩码张量，可选
        cross_attn_head_mask: Optional[torch.Tensor] = None,  # 交叉注意力头部掩码张量，可选
        encoder_outputs: Optional[List[torch.FloatTensor]] = None,  # 编码器输出列表，每个元素是张量，可选
        start_positions: Optional[torch.LongTensor] = None,  # 答案起始位置的张量，可选
        end_positions: Optional[torch.LongTensor] = None,  # 答案结束位置的张量，可选
        inputs_embeds: Optional[torch.FloatTensor] = None,  # 输入的嵌入张量，可选
        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,  # 解码器输入的嵌入张量，可选
        use_cache: Optional[bool] = None,  # 是否使用缓存，可选
        output_attentions: Optional[bool] = None,  # 是否输出注意力权重，可选
        output_hidden_states: Optional[bool] = None,  # 是否输出隐藏状态，可选
        return_dict: Optional[bool] = None,  # 是否返回字典形式的输出，可选
class BartDecoderWrapper(BartPreTrainedModel):
    """
    Thin helper that holds a single `BartDecoder` under the attribute `decoder`. It exists so that pretrained
    checkpoints load correctly when the causal language model is combined with the [`EncoderDecoderModel`]
    framework.
    """

    def __init__(self, config):
        """Initialise the base class with ``config`` and create the wrapped decoder from it."""
        super().__init__(config)
        self.decoder = BartDecoder(config)

    def forward(self, *args, **kwargs):
        """Call the wrapped decoder with exactly the given arguments and return its result."""
        decoder_output = self.decoder(*args, **kwargs)
        return decoder_output


@add_start_docstrings(
    """
    BART decoder with with a language modeling head on top (linear layer with weights tied to the input embeddings).
    """,
    BART_START_DOCSTRING,
)
class BartForCausalLM(BartPreTrainedModel):
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config):
        """Set up a decoder-only BART with a bias-free language-modeling head.

        The incoming ``config`` is deep-copied first, so the flag changes made here do
        not touch the caller's object.
        """
        decoder_config = copy.deepcopy(config)
        decoder_config.is_decoder = True
        decoder_config.is_encoder_decoder = False
        super().__init__(decoder_config)

        # The decoder lives inside a wrapper module, i.e. at ``self.model.decoder``.
        self.model = BartDecoderWrapper(decoder_config)

        # Output projection: hidden_size -> vocab_size, without bias.
        self.lm_head = nn.Linear(decoder_config.hidden_size, decoder_config.vocab_size, bias=False)

        # Post-construction hook.
        self.post_init()

    def get_input_embeddings(self):
        """Return the token embedding of the wrapped decoder (``self.model.decoder.embed_tokens``)."""
        decoder = self.model.decoder
        return decoder.embed_tokens

    def set_input_embeddings(self, value):
        """Store ``value`` as the token embedding of the wrapped decoder."""
        decoder = self.model.decoder
        decoder.embed_tokens = value

    def get_output_embeddings(self):
        """Return the language-modeling head (``self.lm_head``)."""
        head = self.lm_head
        return head

    def set_output_embeddings(self, new_embeddings):
        """Store ``new_embeddings`` as the language-modeling head (``self.lm_head``)."""
        setattr(self, "lm_head", new_embeddings)

    def set_decoder(self, decoder):
        """Replace the decoder held by the wrapper (``self.model.decoder``)."""
        setattr(self.model, "decoder", decoder)

    def get_decoder(self):
        """Return the decoder held by the wrapper (``self.model.decoder``)."""
        wrapper = self.model
        return wrapper.decoder

    @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        # Forward pass of the causal language model.
        # NOTE(review): the body is elided (`...`) in this excerpt, so as shown nothing is computed
        # and the method returns None. The decorator presumably expects a docstring with a
        # `Returns:` section, which is also missing here - confirm against the full source.
        ...

    def prepare_inputs_for_generation(
        self, input_ids, past_key_values=None, attention_mask=None, use_cache=None, **kwargs
    ):
        """Assemble the keyword arguments for a single decoder-only generation step.

        Args:
            input_ids: Token ids generated so far; indexed as ``[:, start:]`` and read via
                ``.shape`` and ``.new_ones``.
            past_key_values: Optional cache. When truthy, the cached length is read from
                ``past_key_values[0][0].shape[2]``.
            attention_mask: Optional mask. When ``None``, an all-ones mask with the shape
                of the untrimmed ``input_ids`` is created.
            use_cache: Passed through unchanged.
            **kwargs: Accepted for interface compatibility and not used.

        Returns:
            A dict with the keys ``"input_ids"``, ``"attention_mask"``,
            ``"past_key_values"`` and ``"use_cache"``.
        """
        # If the model is used as the decoder of an encoder-decoder model, the decoder
        # attention mask is created on the fly. This happens before any trimming, so
        # the mask covers the full sequence.
        if attention_mask is None:
            attention_mask = input_ids.new_ones(input_ids.shape)

        if past_key_values:
            past_length = past_key_values[0][0].shape[2]

            # Some generation methods already pass only the last input id.
            if input_ids.shape[1] > past_length:
                remove_prefix_length = past_length
            else:
                # Default behaviour: keep only the final id.
                remove_prefix_length = input_ids.shape[1] - 1

            input_ids = input_ids[:, remove_prefix_length:]

        return {
            "input_ids": input_ids,  # trimmed to the ids not yet covered by the cache
            "attention_mask": attention_mask,
            "past_key_values": past_key_values,
            "use_cache": use_cache,
        }

    @staticmethod
    def _reorder_cache(past_key_values, beam_idx):
        # 重新排序缓存中的过去键值对
        reordered_past = ()
        for layer_past in past_key_values:
            # 对每一层的过去状态按beam_idx重新排序，并添加到重新排序过的过去状态中
            reordered_past += (
                tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
            )
        # 返回重新排序后的过去状态
        return reordered_past

Transformers-源码解析-十二-

Transformers 源码解析（十二）

.\models\auto\__init__.py

.\models\autoformer\configuration_autoformer.py

.\models\autoformer\modeling_autoformer.py

.\models\autoformer\__init__.py

.\models\bark\configuration_bark.py

.\models\bark\convert_suno_to_hf.py

.\models\bark\generation_configuration_bark.py

.\models\bark\modeling_bark.py

.\models\bark\processing_bark.py

.\models\bark\__init__.py

.\models\bart\configuration_bart.py

.\models\bart\convert_bart_original_pytorch_checkpoint_to_pytorch.py

.\models\bart\modeling_bart.py

`.\models\auto\__init__.py`

`.\models\autoformer\configuration_autoformer.py`

`.\models\autoformer\modeling_autoformer.py`

`.\models\autoformer\__init__.py`

`.\models\bark\configuration_bark.py`

`.\models\bark\convert_suno_to_hf.py`

`.\models\bark\generation_configuration_bark.py`

`.\models\bark\modeling_bark.py`

`.\models\bark\processing_bark.py`

`.\models\bark\__init__.py`

`.\models\bart\configuration_bart.py`

`.\models\bart\convert_bart_original_pytorch_checkpoint_to_pytorch.py`

`.\models\bart\modeling_bart.py`