Transformers Source Code Analysis (73)
.\models\mbart\modeling_tf_mbart.py
""" TF 2.0 MBart model."""
from __future__ import annotations
import random
from typing import Optional, Tuple, Union
import tensorflow as tf
from ...activations_tf import get_tf_activation
from ...modeling_tf_outputs import (
TFBaseModelOutput,
TFBaseModelOutputWithPastAndCrossAttentions,
TFSeq2SeqLMOutput,
TFSeq2SeqModelOutput,
)
from ...modeling_tf_utils import (
TFCausalLanguageModelingLoss,
TFModelInputType,
TFPreTrainedModel,
keras,
keras_serializable,
unpack_inputs,
)
from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax
from ...utils import (
add_code_sample_docstrings,
add_end_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_mbart import MBartConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "facebook/mbart-large-cc25"
_CONFIG_FOR_DOC = "MBartConfig"
LARGE_NEGATIVE = -1e8
def shift_tokens_right(input_ids: tf.Tensor, pad_token_id: int):
"""
Shift input ids one token to the right, and wrap the last non-pad token (the <LID> token). Note that, unlike other Bart-like models, MBart does not have a single `decoder_start_token_id`.
"""
if pad_token_id is None:
raise ValueError("self.model.config.pad_token_id has to be defined.")
input_ids = tf.where(
input_ids == -100, tf.fill(shape_list(input_ids), tf.cast(pad_token_id, input_ids.dtype)), input_ids
)
language_id_index = (
tf.reduce_sum(tf.cast(tf.math.not_equal(input_ids, pad_token_id), dtype=input_ids.dtype), axis=-1) - 1
)
language_id_index = tf.stack(
[tf.range(shape_list(input_ids)[0], dtype=input_ids.dtype), language_id_index], axis=-1
)
languages_ids = tf.gather_nd(input_ids, language_id_index)
shifted_input_ids = tf.concat([tf.expand_dims(languages_ids, axis=-1), input_ids[:, :-1]], axis=-1)
return shifted_input_ids
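To make the MBart-specific shift concrete, here is a small usage sketch (not part of the original file). The token IDs are made up: 250004 stands in for a language-id token such as en_XX, and 1 plays the role of the pad token; it assumes a TensorFlow-enabled transformers install.
```
import tensorflow as tf
from transformers.models.mbart.modeling_tf_mbart import shift_tokens_right

# One sequence whose last non-pad token is the language id (<LID>).
input_ids = tf.constant([[57, 821, 9, 250004, 1, 1]], dtype=tf.int32)

shifted = shift_tokens_right(input_ids, pad_token_id=1)
print(shifted.numpy())  # [[250004 57 821 9 250004 1]]
# The <LID> token is wrapped around to position 0 and everything else moves one step to the right.
```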
def _make_causal_mask(input_ids_shape: tf.TensorShape, past_key_values_length: int = 0):
"""
Make a causal (look-ahead) mask for decoder self-attention.
"""
bsz = input_ids_shape[0]
tgt_len = input_ids_shape[1]
mask = tf.ones((tgt_len, tgt_len)) * LARGE_NEGATIVE
mask_cond = tf.range(shape_list(mask)[-1])
mask = tf.where(mask_cond < tf.reshape(mask_cond + 1, (shape_list(mask)[-1], 1)), 0.0, mask)
if past_key_values_length > 0:
mask = tf.concat([tf.zeros((tgt_len, past_key_values_length)), mask], axis=-1)
return tf.tile(mask[None, None, :, :], (bsz, 1, 1, 1))
def _expand_mask(mask: tf.Tensor, tgt_len: Optional[int] = None):
"""
Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
"""
src_len = shape_list(mask)[1]
tgt_len = tgt_len if tgt_len is not None else src_len
one_cst = tf.constant(1.0)
mask = tf.cast(mask, dtype=one_cst.dtype)
expanded_mask = tf.tile(mask[:, None, None, :], (1, 1, tgt_len, 1))
return (one_cst - expanded_mask) * LARGE_NEGATIVE
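Both helpers return additive masks: 0.0 where attention is allowed and LARGE_NEGATIVE where it is blocked. A quick shape check, assuming the two private helpers are imported from this module:
```
import tensorflow as tf
from transformers.models.mbart.modeling_tf_mbart import _expand_mask, _make_causal_mask

causal = _make_causal_mask(tf.TensorShape([2, 4]))  # batch of 2, target length 4
print(causal.shape)                                 # (2, 1, 4, 4)

padding = tf.constant([[1, 1, 1, 0]])               # 1 = keep, 0 = pad
print(_expand_mask(padding).shape)                  # (1, 1, 4, 4)
```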
class TFMBartLearnedPositionalEmbedding(keras.layers.Embedding):
"""
This module learns positional embeddings up to a fixed maximum size.
"""
def __init__(self, num_embeddings: int, embedding_dim: int, **kwargs):
self.offset = 2
super().__init__(num_embeddings + self.offset, embedding_dim, **kwargs)
def call(
self,
input_shape: Optional[tf.TensorShape] = None,
past_key_values_length: int = 0,
position_ids: tf.Tensor | None = None,
):
"""Input is expected to be of size [bsz x seqlen]."""
if position_ids is None:
seq_len = input_shape[1]
position_ids = tf.range(seq_len, delta=1, name="range")
position_ids += past_key_values_length
offset_dtype = position_ids.dtype if isinstance(position_ids, tf.Tensor) else tf.int32
return super().call(position_ids + tf.constant(self.offset, dtype=offset_dtype))
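The `offset = 2` mirrors fairseq's learned positional embeddings, which reserve the first two rows of the table. A tiny sketch of the index arithmetic (values are illustrative, not taken from the module):
```
import tensorflow as tf

past_key_values_length = 3                                    # tokens already in the cache
seq_len = 2                                                   # new tokens in this step
position_ids = tf.range(seq_len) + past_key_values_length    # [3, 4]
rows_looked_up = position_ids + 2                             # [5, 6]: position i lives at row i + offset
```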
class TFMBartAttention(keras.layers.Layer):
"""Multi-headed attention from "Attention Is All You Need"""
def __init__(
self,
embed_dim: int,
num_heads: int,
dropout: float = 0.0,
is_decoder: bool = False,
bias: bool = True,
**kwargs,
):
super().__init__(**kwargs)
self.embed_dim = embed_dim
self.num_heads = num_heads
self.dropout = keras.layers.Dropout(dropout)
self.head_dim = embed_dim // num_heads
if (self.head_dim * num_heads) != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
f" and `num_heads`: {num_heads})."
)
self.scaling = self.head_dim**-0.5
self.is_decoder = is_decoder
self.k_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj")
self.q_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="q_proj")
self.v_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="v_proj")
self.out_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="out_proj")
def _shape(self, tensor: tf.Tensor, seq_len: int, bsz: int):
return tf.transpose(tf.reshape(tensor, (bsz, seq_len, self.num_heads, self.head_dim)), (0, 2, 1, 3))
def call(
self,
hidden_states: tf.Tensor,
key_value_states: tf.Tensor | None = None,
past_key_value: Tuple[Tuple[tf.Tensor]] | None = None,
attention_mask: tf.Tensor | None = None,
layer_head_mask: tf.Tensor | None = None,
training: Optional[bool] = False,
):
pass
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "k_proj", None) is not None:
with tf.name_scope(self.k_proj.name):
self.k_proj.build([None, None, self.embed_dim])
if getattr(self, "q_proj", None) is not None:
with tf.name_scope(self.q_proj.name):
self.q_proj.build([None, None, self.embed_dim])
if getattr(self, "v_proj", None) is not None:
with tf.name_scope(self.v_proj.name):
self.v_proj.build([None, None, self.embed_dim])
if getattr(self, "out_proj", None) is not None:
with tf.name_scope(self.out_proj.name):
self.out_proj.build([None, None, self.embed_dim])
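The body of `TFMBartAttention.call` is not reproduced above. As a rough orientation only, the core per-head computation is standard scaled dot-product attention; the sketch below is a simplification under that assumption (the real method also reshapes across heads, applies `layer_head_mask`, reuses `past_key_value`, and uses `stable_softmax`):
```
import tensorflow as tf

def sdpa_sketch(query, key, value, additive_mask=None):
    # query/key/value: (..., seq_len, head_dim); additive_mask uses LARGE_NEGATIVE at blocked positions
    head_dim = tf.cast(tf.shape(query)[-1], query.dtype)
    scores = tf.matmul(query, key, transpose_b=True) / tf.math.sqrt(head_dim)
    if additive_mask is not None:
        scores = scores + additive_mask
    weights = tf.nn.softmax(scores, axis=-1)
    return tf.matmul(weights, value), weights
```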
class TFMBartEncoderLayer(keras.layers.Layer):
def __init__(self, config: MBartConfig, **kwargs):
super().__init__(**kwargs)
self.embed_dim = config.d_model
self.self_attn = TFMBartAttention(
self.embed_dim, config.encoder_attention_heads, dropout=config.attention_dropout, name="self_attn"
)
self.self_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm")
self.dropout = keras.layers.Dropout(config.dropout)
self.activation_fn = get_tf_activation(config.activation_function)
self.activation_dropout = keras.layers.Dropout(config.activation_dropout)
self.fc1 = keras.layers.Dense(config.encoder_ffn_dim, name="fc1")
self.fc2 = keras.layers.Dense(self.embed_dim, name="fc2")
self.final_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")
self.config = config
def call(
self,
hidden_states: tf.Tensor,
attention_mask: tf.Tensor,
layer_head_mask: tf.Tensor,
training: Optional[bool] = False,
):
"""
Args:
hidden_states (`tf.Tensor`): input to the layer, of shape *(batch, seq_len, embed_dim)*
attention_mask (`tf.Tensor`): attention mask of size *(batch, 1, tgt_len, src_len)*,
where padding elements are indicated by very large negative values.
layer_head_mask (`tf.Tensor`): mask for the attention heads in a given layer, of size *(encoder_attention_heads,)*
"""
residual = hidden_states
hidden_states = self.self_attn_layer_norm(hidden_states)
hidden_states, self_attn_weights, _ = self.self_attn(
hidden_states=hidden_states, attention_mask=attention_mask, layer_head_mask=layer_head_mask
)
tf.debugging.assert_equal(
shape_list(hidden_states),
shape_list(residual),
message=f"Self attn modified the shape of query {shape_list(residual)} to {shape_list(hidden_states)}",
)
hidden_states = self.dropout(hidden_states, training=training)
hidden_states = residual + hidden_states
residual = hidden_states
hidden_states = self.final_layer_norm(hidden_states)
hidden_states = self.activation_fn(self.fc1(hidden_states))
hidden_states = self.activation_dropout(hidden_states, training=training)
hidden_states = self.fc2(hidden_states)
hidden_states = self.dropout(hidden_states, training=training)
hidden_states = residual + hidden_states
return hidden_states, self_attn_weights
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "self_attn", None) is not None:
with tf.name_scope(self.self_attn.name):
self.self_attn.build(None)
if getattr(self, "self_attn_layer_norm", None) is not None:
with tf.name_scope(self.self_attn_layer_norm.name):
self.self_attn_layer_norm.build([None, None, self.embed_dim])
if getattr(self, "fc1", None) is not None:
with tf.name_scope(self.fc1.name):
self.fc1.build([None, None, self.embed_dim])
if getattr(self, "fc2", None) is not None:
with tf.name_scope(self.fc2.name):
self.fc2.build([None, None, self.config.encoder_ffn_dim])
if getattr(self, "final_layer_norm", None) is not None:
with tf.name_scope(self.final_layer_norm.name):
self.final_layer_norm.build([None, None, self.embed_dim])
class TFMBartDecoderLayer(keras.layers.Layer):
def __init__(self, config: MBartConfig, **kwargs):
super().__init__(**kwargs)
self.embed_dim = config.d_model
self.self_attn = TFMBartAttention(
embed_dim=self.embed_dim,
num_heads=config.decoder_attention_heads,
dropout=config.attention_dropout,
name="self_attn",
is_decoder=True,
)
self.dropout = keras.layers.Dropout(config.dropout)
self.activation_fn = get_tf_activation(config.activation_function)
self.activation_dropout = keras.layers.Dropout(config.activation_dropout)
self.self_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm")
self.encoder_attn = TFMBartAttention(
self.embed_dim,
config.decoder_attention_heads,
dropout=config.attention_dropout,
name="encoder_attn",
is_decoder=True,
)
self.encoder_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="encoder_attn_layer_norm")
self.fc1 = keras.layers.Dense(config.decoder_ffn_dim, name="fc1")
self.fc2 = keras.layers.Dense(self.embed_dim, name="fc2")
self.final_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")
self.config = config
def call(
self,
hidden_states: tf.Tensor,
attention_mask: tf.Tensor | None = None,
encoder_hidden_states: tf.Tensor | None = None,
encoder_attention_mask: tf.Tensor | None = None,
layer_head_mask: tf.Tensor | None = None,
cross_attn_layer_head_mask: tf.Tensor | None = None,
past_key_value: Tuple[tf.Tensor] | None = None,
training: Optional[bool] = False,
):
pass
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "self_attn", None) is not None:
with tf.name_scope(self.self_attn.name):
self.self_attn.build(None)
if getattr(self, "self_attn_layer_norm", None) is not None:
with tf.name_scope(self.self_attn_layer_norm.name):
self.self_attn_layer_norm.build([None, None, self.embed_dim])
if getattr(self, "encoder_attn", None) is not None:
with tf.name_scope(self.encoder_attn.name):
self.encoder_attn.build(None)
if getattr(self, "encoder_attn_layer_norm", None) is not None:
with tf.name_scope(self.encoder_attn_layer_norm.name):
self.encoder_attn_layer_norm.build([None, None, self.embed_dim])
if getattr(self, "fc1", None) is not None:
with tf.name_scope(self.fc1.name):
self.fc1.build([None, None, self.embed_dim])
if getattr(self, "fc2", None) is not None:
with tf.name_scope(self.fc2.name):
self.fc2.build([None, None, self.config.decoder_ffn_dim])
if getattr(self, "final_layer_norm", None) is not None:
with tf.name_scope(self.final_layer_norm.name):
self.final_layer_norm.build([None, None, self.embed_dim])
class TFMBartPreTrainedModel(TFPreTrainedModel):
config_class = MBartConfig
base_model_prefix = "model"
MBART_START_DOCSTRING = r"""
This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
behavior.
<Tip>
TensorFlow models and layers in `transformers` accept two formats as input:
- having all inputs as keyword arguments (like PyTorch models), or
- having all inputs as a list, tuple or dict in the first positional argument.
The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
positional argument:
- a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
- a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
`model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
- a dictionary with one or several input Tensors associated to the input names given in the docstring:
`model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
Note that when creating models and layers with
[subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
about any of this, as you can just pass inputs like you would to any other Python function!
</Tip>
Args:
config ([`MBartConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights.
"""
MBART_INPUTS_DOCSTRING = r"""
"""
MBART_GENERATION_EXAMPLE = r"""
Translation example:
```
>>> from transformers import AutoTokenizer, TFMBartForConditionalGeneration
>>> model = TFMBartForConditionalGeneration.from_pretrained("facebook/mbart-large-en-ro")
>>> tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-en-ro")
>>> example_english_phrase = "42 is the answer"
>>> inputs = tokenizer(example_english_phrase, return_tensors="tf")  # encode the English phrase into model-ready tensors

>>> # Translate: beam search with 4 beams, capped at 5 generated tokens
>>> generated_ids = model.generate(**inputs, num_beams=4, max_length=5)

>>> # Decode the generated ids, skipping special tokens, and take the first result
>>> tokenizer.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
'42 este răspuns'
```
Mask filling example:
```
>>> from transformers import AutoTokenizer, TFMBartForConditionalGeneration
>>> import tensorflow as tf
>>> # Load the pretrained MBart conditional generation model and its tokenizer
>>> model = TFMBartForConditionalGeneration.from_pretrained("facebook/mbart-large-cc25")
>>> tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-cc25")

>>> # de_DE is the language symbol id <LID> for German
>>> TXT = "</s> Meine Freunde sind <mask> nett aber sie essen zu viel Kuchen. </s> de_DE"

>>> # Encode the masked German sentence without adding special tokens, then get the logits
>>> input_ids = tokenizer([TXT], add_special_tokens=False, return_tensors="tf")["input_ids"]
>>> logits = model(input_ids).logits

>>> # Locate the <mask> position and softmax the vocabulary logits at that position
>>> masked_index = tf.where(input_ids[0] == tokenizer.mask_token_id)[0, 0]
>>> probs = tf.nn.softmax(logits[0, masked_index], axis=0)

>>> # Take the five most likely tokens and decode them
>>> values, predictions = tf.math.top_k(probs, 5)
>>> tokenizer.decode(predictions).split()
['nett', 'sehr', 'ganz', 'nicht', 'so']
```
"""
@keras_serializable
class TFMBartEncoder(keras.layers.Layer):
config_class = MBartConfig
"""
Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
[`TFMBartEncoderLayer`].
Args:
config: MBartConfig
"""
def __init__(self, config: MBartConfig, embed_tokens: Optional[keras.layers.Embedding] = None, **kwargs):
super().__init__(**kwargs)
self.config = config
self.dropout = keras.layers.Dropout(config.dropout)
self.layerdrop = config.encoder_layerdrop
self.padding_idx = config.pad_token_id
self.max_source_positions = config.max_position_embeddings
self.embed_scale = tf.math.sqrt(float(config.d_model)) if config.scale_embedding else 1.0
self.embed_tokens = embed_tokens
self.embed_positions = TFMBartLearnedPositionalEmbedding(
config.max_position_embeddings,
config.d_model,
name="embed_positions",
)
self.layers = [TFMBartEncoderLayer(config, name=f"layers.{i}") for i in range(config.encoder_layers)]
self.layernorm_embedding = keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding")
self.layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="layer_norm")
self.embed_dim = config.d_model
def get_embed_tokens(self):
return self.embed_tokens
def set_embed_tokens(self, embed_tokens):
self.embed_tokens = embed_tokens
@unpack_inputs
def call(
self,
input_ids: TFModelInputType | None = None,
inputs_embeds: tf.Tensor | None = None,
attention_mask: tf.Tensor | None = None,
head_mask: tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: Optional[bool] = False,
):
pass
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embed_positions", None) is not None:
with tf.name_scope(self.embed_positions.name):
self.embed_positions.build(None)
if getattr(self, "layernorm_embedding", None) is not None:
with tf.name_scope(self.layernorm_embedding.name):
self.layernorm_embedding.build([None, None, self.embed_dim])
if getattr(self, "layer_norm", None) is not None:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build([None, None, self.config.d_model])
if getattr(self, "layers", None) is not None:
for layer in self.layers:
with tf.name_scope(layer.name):
layer.build(None)
@keras_serializable
class TFMBartDecoder(keras.layers.Layer):
config_class = MBartConfig
"""
Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`TFMBartDecoderLayer`]
Args:
config: MBartConfig
The configuration object of the MBart model, containing all of its settings and hyperparameters.
embed_tokens: output embedding
An optional embedding layer used to map input tokens to vector representations.
"""
def __init__(self, config: MBartConfig, embed_tokens: Optional[keras.layers.Embedding] = None, **kwargs):
super().__init__(**kwargs)
self.config = config
self.padding_idx = config.pad_token_id
self.embed_tokens = embed_tokens
self.layerdrop = config.decoder_layerdrop
self.embed_positions = TFMBartLearnedPositionalEmbedding(
config.max_position_embeddings,
config.d_model,
name="embed_positions",
)
self.embed_scale = tf.math.sqrt(float(config.d_model)) if config.scale_embedding else 1.0
self.layers = [TFMBartDecoderLayer(config, name=f"layers.{i}") for i in range(config.decoder_layers)]
self.layernorm_embedding = keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding")
self.layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="layer_norm")
self.dropout = keras.layers.Dropout(config.dropout)
def get_embed_tokens(self):
return self.embed_tokens
def set_embed_tokens(self, embed_tokens):
self.embed_tokens = embed_tokens
@unpack_inputs
def call(
self,
input_ids: TFModelInputType = None,
inputs_embeds: tf.Tensor | None = None,
attention_mask: tf.Tensor | None = None,
position_ids: tf.Tensor | None = None,
encoder_hidden_states: tf.Tensor | None = None,
encoder_attention_mask: tf.Tensor | None = None,
head_mask: tf.Tensor | None = None,
cross_attn_head_mask: tf.Tensor | None = None,
past_key_values: Tuple[Tuple[tf.Tensor]] | None = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: Optional[bool] = False,
) -> Union[
TFBaseModelOutputWithPastAndCrossAttentions, Tuple[tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor]
]:
pass
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embed_positions", None) is not None:
with tf.name_scope(self.embed_positions.name):
self.embed_positions.build(None)
if getattr(self, "layernorm_embedding", None) is not None:
with tf.name_scope(self.layernorm_embedding.name):
self.layernorm_embedding.build([None, None, self.config.d_model])
if getattr(self, "layer_norm", None) is not None:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build([None, None, self.config.d_model])
if getattr(self, "layers", None) is not None:
for layer in self.layers:
with tf.name_scope(layer.name):
layer.build(None)
@keras_serializable
class TFMBartMainLayer(keras.layers.Layer):
config_class = MBartConfig
def __init__(self, config: MBartConfig, **kwargs):
super().__init__(**kwargs)
self.config = config
self.shared = keras.layers.Embedding(
input_dim=config.vocab_size,
output_dim=config.d_model,
embeddings_initializer=keras.initializers.TruncatedNormal(stddev=self.config.init_std),
name="model.shared",
)
self.shared.load_weight_prefix = "model.shared"
self.encoder = TFMBartEncoder(config, self.shared, name="encoder")
self.decoder = TFMBartDecoder(config, self.shared, name="decoder")
def get_input_embeddings(self):
return self.shared
def set_input_embeddings(self, new_embeddings):
self.shared = new_embeddings
self.encoder.embed_tokens = self.shared
self.decoder.embed_tokens = self.shared
@unpack_inputs
def call(
self,
input_ids: TFModelInputType = None,
attention_mask: tf.Tensor | None = None,
decoder_input_ids: tf.Tensor | None = None,
decoder_attention_mask: tf.Tensor | None = None,
decoder_position_ids: tf.Tensor | None = None,
head_mask: tf.Tensor | None = None,
decoder_head_mask: tf.Tensor | None = None,
cross_attn_head_mask: tf.Tensor | None = None,
encoder_outputs: Optional[Union[Tuple, TFBaseModelOutput]] = None,
past_key_values: Tuple[Tuple[tf.Tensor]] | None = None,
inputs_embeds: tf.Tensor | None = None,
decoder_inputs_embeds: tf.Tensor | None = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: Optional[bool] = False,
**kwargs,
) -> Union[TFSeq2SeqModelOutput, tf.Tensor]:
if decoder_input_ids is None and decoder_inputs_embeds is None:
use_cache = False
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
if decoder_input_ids is None and input_ids is not None:
decoder_input_ids = shift_tokens_right(input_ids, self.config.pad_token_id)
if encoder_outputs is None:
encoder_outputs = self.encoder(
input_ids=input_ids,
attention_mask=attention_mask,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
elif return_dict and not isinstance(encoder_outputs, TFBaseModelOutput):
encoder_outputs = TFBaseModelOutput(
last_hidden_state=encoder_outputs[0],
hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
)
elif not return_dict and not isinstance(encoder_outputs, tuple):
encoder_outputs = encoder_outputs.to_tuple()
decoder_outputs = self.decoder(
decoder_input_ids,
attention_mask=decoder_attention_mask,
position_ids=decoder_position_ids,
encoder_hidden_states=encoder_outputs[0],
encoder_attention_mask=attention_mask,
head_mask=decoder_head_mask,
cross_attn_head_mask=cross_attn_head_mask,
past_key_values=past_key_values,
inputs_embeds=decoder_inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
if not return_dict:
return decoder_outputs + encoder_outputs
return TFSeq2SeqModelOutput(
last_hidden_state=decoder_outputs.last_hidden_state,
past_key_values=decoder_outputs.past_key_values,
decoder_hidden_states=decoder_outputs.hidden_states,
decoder_attentions=decoder_outputs.attentions,
cross_attentions=decoder_outputs.cross_attentions,
encoder_last_hidden_state=encoder_outputs.last_hidden_state,
encoder_hidden_states=encoder_outputs.hidden_states,
encoder_attentions=encoder_outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
with tf.name_scope(self.shared.load_weight_prefix + "/" + self.shared.name + "/"):
self.shared.build(None)
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
if getattr(self, "decoder", None) is not None:
with tf.name_scope(self.decoder.name):
self.decoder.build(None)
@add_start_docstrings(
"The bare MBART Model outputting raw hidden-states without any specific head on top.",
MBART_START_DOCSTRING,
)
class TFMBartModel(TFMBartPreTrainedModel):
def __init__(self, config: MBartConfig, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.model = TFMBartMainLayer(config, name="model")
def get_encoder(self):
return self.model.encoder
def get_decoder(self):
return self.model.decoder
@unpack_inputs
@add_start_docstrings_to_model_forward(MBART_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFSeq2SeqModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: TFModelInputType = None,
attention_mask: tf.Tensor | None = None,
decoder_input_ids: tf.Tensor | None = None,
decoder_attention_mask: tf.Tensor | None = None,
decoder_position_ids: tf.Tensor | None = None,
head_mask: tf.Tensor | None = None,
decoder_head_mask: tf.Tensor | None = None,
cross_attn_head_mask: tf.Tensor | None = None,
encoder_outputs: Optional[Union[Tuple, TFBaseModelOutput]] = None,
past_key_values: Tuple[Tuple[tf.Tensor]] | None = None,
inputs_embeds: tf.Tensor | None = None,
decoder_inputs_embeds: tf.Tensor | None = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: Optional[bool] = False,
**kwargs,
) -> Union[TFSeq2SeqModelOutput, Tuple[tf.Tensor]]:
outputs = self.model(
input_ids=input_ids,
attention_mask=attention_mask,
decoder_input_ids=decoder_input_ids,
decoder_attention_mask=decoder_attention_mask,
decoder_position_ids=decoder_position_ids,
head_mask=head_mask,
decoder_head_mask=decoder_head_mask,
cross_attn_head_mask=cross_attn_head_mask,
encoder_outputs=encoder_outputs,
past_key_values=past_key_values,
inputs_embeds=inputs_embeds,
decoder_inputs_embeds=decoder_inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
return outputs
def serving_output(self, output):
pkv = tf.tuple(output.past_key_values)[1] if self.config.use_cache else None
dec_hs = tf.convert_to_tensor(output.decoder_hidden_states) if self.config.output_hidden_states else None
dec_attns = tf.convert_to_tensor(output.decoder_attentions) if self.config.output_attentions else None
cross_attns = tf.convert_to_tensor(output.cross_attentions) if self.config.output_attentions else None
enc_hs = tf.convert_to_tensor(output.encoder_hidden_states) if self.config.output_hidden_states else None
enc_attns = tf.convert_to_tensor(output.encoder_attentions) if self.config.output_attentions else None
return TFSeq2SeqModelOutput(
last_hidden_state=output.last_hidden_state,
past_key_values=pkv,
decoder_hidden_states=dec_hs,
decoder_attentions=dec_attns,
cross_attentions=cross_attns,
encoder_last_hidden_state=output.encoder_last_hidden_state,
encoder_hidden_states=enc_hs,
encoder_attentions=enc_attns,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "model", None) is not None:
with tf.name_scope(self.model.name):
self.model.build(None)
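A minimal usage sketch for the bare model, matching the checkpoint used in the docstrings above (downloads the weights on first run and assumes TensorFlow is installed):
```
from transformers import AutoTokenizer, TFMBartModel

tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-cc25")
model = TFMBartModel.from_pretrained("facebook/mbart-large-cc25")

inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
outputs = model(**inputs)
print(outputs.last_hidden_state.shape)  # (1, sequence_length, 1024) for the large checkpoint
```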
class BiasLayer(keras.layers.Layer):
"""
Bias as a layer. It is used for serialization purposes: `keras.Model.save_weights` stores on a per-layer basis,
so all weights have to be registered in a layer.
"""
def __init__(self, shape, initializer, trainable, name, **kwargs):
super().__init__(name=name, **kwargs)
self.bias = self.add_weight(name=name, shape=shape, initializer=initializer, trainable=trainable)
def call(self, x):
return x + self.bias
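A quick sketch of what `BiasLayer` does: it registers a single weight (so `save_weights` can serialize it) and simply adds it to its input. The shapes here are illustrative:
```
import tensorflow as tf
from transformers.models.mbart.modeling_tf_mbart import BiasLayer

bias = BiasLayer(name="final_logits_bias", shape=[1, 4], initializer="zeros", trainable=False)
logits = tf.zeros((2, 3, 4))   # (batch, seq_len, vocab_size=4)
print(bias(logits).shape)      # (2, 3, 4): the bias broadcasts over batch and sequence
```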
@add_start_docstrings(
"The MBART Model with a language modeling head. Can be used for summarization, after fine-tuning the pretrained models.",
MBART_START_DOCSTRING,
)
class TFMBartForConditionalGeneration(TFMBartPreTrainedModel, TFCausalLanguageModelingLoss):
_keys_to_ignore_on_load_unexpected = [
r"model.encoder.embed_tokens.weight",
r"model.decoder.embed_tokens.weight",
]
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.model = TFMBartMainLayer(config, name="model")
self.use_cache = config.use_cache
self.bias_layer = BiasLayer(
name="final_logits_bias", shape=[1, config.vocab_size], initializer="zeros", trainable=False
)
def get_decoder(self):
return self.model.decoder
def get_encoder(self):
return self.model.encoder
def get_output_embeddings(self):
return self.get_input_embeddings()
def set_output_embeddings(self, value):
self.set_input_embeddings(value)
def get_bias(self):
return {"final_logits_bias": self.bias_layer.bias}
def set_bias(self, value):
vocab_size = value["final_logits_bias"].shape[-1]
self.bias_layer = BiasLayer(
name="final_logits_bias", shape=[1, vocab_size], initializer="zeros", trainable=False
)
self.bias_layer.bias.assign(value["final_logits_bias"])
@unpack_inputs
@add_start_docstrings_to_model_forward(MBART_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=TFSeq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
@add_end_docstrings(MBART_GENERATION_EXAMPLE)
def call(
self,
input_ids: TFModelInputType = None,
attention_mask: tf.Tensor | None = None,
decoder_input_ids: tf.Tensor | None = None,
decoder_attention_mask: tf.Tensor | None = None,
decoder_position_ids: tf.Tensor | None = None,
head_mask: tf.Tensor | None = None,
decoder_head_mask: tf.Tensor | None = None,
cross_attn_head_mask: tf.Tensor | None = None,
encoder_outputs: Optional[TFBaseModelOutput] = None,
past_key_values: Tuple[Tuple[tf.Tensor]] = None,
inputs_embeds: tf.Tensor | None = None,
decoder_inputs_embeds: tf.Tensor | None = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: tf.Tensor | None = None,
training: Optional[bool] = False,
) -> Union[TFSeq2SeqLMOutput, Tuple[tf.Tensor]]:
"""
labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Returns:
Returns either a TFSeq2SeqLMOutput object or a tuple of tf.Tensor depending on `return_dict`.
"""
if labels is not None:
labels = tf.where(
labels == self.config.pad_token_id,
tf.cast(tf.fill(shape_list(labels), -100), labels.dtype),
labels,
)
use_cache = False
if decoder_input_ids is None and decoder_inputs_embeds is None:
decoder_input_ids = shift_tokens_right(labels, self.config.pad_token_id)
outputs = self.model(
input_ids,
attention_mask=attention_mask,
decoder_input_ids=decoder_input_ids,
encoder_outputs=encoder_outputs,
decoder_attention_mask=decoder_attention_mask,
decoder_position_ids=decoder_position_ids,
head_mask=head_mask,
decoder_head_mask=decoder_head_mask,
cross_attn_head_mask=cross_attn_head_mask,
past_key_values=past_key_values,
inputs_embeds=inputs_embeds,
decoder_inputs_embeds=decoder_inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
lm_logits = tf.matmul(outputs[0], self.model.shared.weights, transpose_b=True)
lm_logits = self.bias_layer(lm_logits)
masked_lm_loss = None if labels is None else self.hf_compute_loss(labels, lm_logits)
if not return_dict:
output = (lm_logits,) + outputs[1:]
return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
else:
return TFSeq2SeqLMOutput(
loss=masked_lm_loss,
logits=lm_logits,
past_key_values=outputs.past_key_values,
decoder_hidden_states=outputs.decoder_hidden_states,
decoder_attentions=outputs.decoder_attentions,
cross_attentions=outputs.cross_attentions,
encoder_last_hidden_state=outputs.encoder_last_hidden_state,
encoder_hidden_states=outputs.encoder_hidden_states,
encoder_attentions=outputs.encoder_attentions,
)
def serving_output(self, output):
pkv = tf.tuple(output.past_key_values)[1] if self.config.use_cache else None
dec_hs = tf.convert_to_tensor(output.decoder_hidden_states) if self.config.output_hidden_states else None
dec_attns = tf.convert_to_tensor(output.decoder_attentions) if self.config.output_attentions else None
cross_attns = tf.convert_to_tensor(output.cross_attentions) if self.config.output_attentions else None
enc_hs = tf.convert_to_tensor(output.encoder_hidden_states) if self.config.output_hidden_states else None
enc_attns = tf.convert_to_tensor(output.encoder_attentions) if self.config.output_attentions else None
return TFSeq2SeqLMOutput(
logits=output.logits,
past_key_values=pkv,
decoder_hidden_states=dec_hs,
decoder_attentions=dec_attns,
cross_attentions=cross_attns,
encoder_last_hidden_state=output.encoder_last_hidden_state,
encoder_hidden_states=enc_hs,
encoder_attentions=enc_attns,
)
def prepare_inputs_for_generation(
self,
decoder_input_ids,
past_key_values=None,
attention_mask=None,
decoder_attention_mask=None,
head_mask=None,
decoder_head_mask=None,
cross_attn_head_mask=None,
use_cache=None,
encoder_outputs=None,
**kwargs,
):
if past_key_values is not None:
decoder_input_ids = decoder_input_ids[:, -1:]
if decoder_attention_mask is not None:
decoder_position_ids = tf.math.cumsum(decoder_attention_mask, axis=-1, exclusive=True)[:, -1:]
elif past_key_values is not None:
decoder_position_ids = past_key_values[0][0].shape[2]
else:
decoder_position_ids = tf.range(decoder_input_ids.shape[1])
return {
"input_ids": None,
"encoder_outputs": encoder_outputs,
"past_key_values": past_key_values,
"decoder_input_ids": decoder_input_ids,
"attention_mask": attention_mask,
"decoder_attention_mask": decoder_attention_mask,
"decoder_position_ids": decoder_position_ids,
"head_mask": head_mask,
"decoder_head_mask": decoder_head_mask,
"cross_attn_head_mask": cross_attn_head_mask,
"use_cache": use_cache,
}
def prepare_decoder_input_ids_from_labels(self, labels: tf.Tensor):
return shift_tokens_right(labels, self.config.pad_token_id)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "model", None) is not None:
with tf.name_scope(self.model.name):
self.model.build(None)
if getattr(self, "bias_layer", None) is not None:
with tf.name_scope(self.bias_layer.name):
self.bias_layer.build(None)
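To summarize the training-time label handling in `call` above: pad positions in `labels` are rewritten to -100 so the loss ignores them, and when no decoder inputs are given they are derived from the labels with the MBart-style shift. A hedged sketch with made-up IDs (1 = pad, 250004 = a language id):
```
import tensorflow as tf
from transformers.models.mbart.modeling_tf_mbart import shift_tokens_right

labels = tf.constant([[57, 821, 9, 250004, 1]], dtype=tf.int32)
masked_labels = tf.where(labels == 1, tf.fill(tf.shape(labels), -100), labels)
decoder_input_ids = shift_tokens_right(labels, pad_token_id=1)

print(masked_labels.numpy())      # [[57 821 9 250004 -100]]
print(decoder_input_ids.numpy())  # [[250004 57 821 9 250004]]
```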
.\models\mbart\tokenization_mbart.py
import os
from shutil import copyfile
from typing import Any, Dict, List, Optional, Tuple
import sentencepiece as spm
from ...tokenization_utils import AddedToken, BatchEncoding, PreTrainedTokenizer
from ...utils import logging
logger = logging.get_logger(__name__)
SPIECE_UNDERLINE = "▁"
VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"facebook/mbart-large-en-ro": (
"https://huggingface.co/facebook/mbart-large-en-ro/resolve/main/sentencepiece.bpe.model"
),
"facebook/mbart-large-cc25": (
"https://huggingface.co/facebook/mbart-large-cc25/resolve/main/sentencepiece.bpe.model"
),
}
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"facebook/mbart-large-en-ro": 1024,
"facebook/mbart-large-cc25": 1024,
}
FAIRSEQ_LANGUAGE_CODES = [
"ar_AR", "cs_CZ", "de_DE", "en_XX", "es_XX", "et_EE", "fi_FI", "fr_XX", "gu_IN", "hi_IN",
"it_IT", "ja_XX", "kk_KZ", "ko_KR", "lt_LT", "lv_LV", "my_MM", "ne_NP", "nl_XX", "ro_RO",
"ru_RU", "si_LK", "tr_TR", "vi_VN", "zh_CN"
]
class MBartTokenizer(PreTrainedTokenizer):
"""
Construct an MBART tokenizer.
Adapted from [`RobertaTokenizer`] and [`XLNetTokenizer`]. Based on
[SentencePiece](https://github.com/google/sentencepiece).
The tokenization method is `<tokens> <eos> <language code>` for source language documents, and `<language code>
<tokens> <eos>` for target language documents.
Examples:
```
>>> from transformers import MBartTokenizer
>>> tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-en-ro", src_lang="en_XX", tgt_lang="ro_RO")
>>> example_english_phrase = " UN Chief Says There Is No Military Solution in Syria"
>>> expected_translation_romanian = "Şeful ONU declară că nu există o soluţie militară în Siria"
>>> inputs = tokenizer(example_english_phrase, text_target=expected_translation_romanian, return_tensors="pt")
```
"""
vocab_files_names = VOCAB_FILES_NAMES
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
model_input_names = ["input_ids", "attention_mask"]
prefix_tokens: List[int] = []
suffix_tokens: List[int] = []
def __init__(
self,
vocab_file,
bos_token="<s>",
eos_token="</s>",
sep_token="</s>",
cls_token="<s>",
unk_token="<unk>",
pad_token="<pad>",
mask_token="<mask>",
tokenizer_file=None,
src_lang=None,
tgt_lang=None,
sp_model_kwargs: Optional[Dict[str, Any]] = None,
additional_special_tokens=None,
**kwargs,
):
mask_token = (
AddedToken(mask_token, lstrip=True, normalized=False) if isinstance(mask_token, str) else mask_token
)
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(str(vocab_file))
self.vocab_file = vocab_file
self.fairseq_tokens_to_ids = {"<s>": 0, "<pad>": 1, "</s>": 2, "<unk>": 3}
self.fairseq_offset = 1
self.sp_model_size = len(self.sp_model)
self.lang_code_to_id = {
code: self.sp_model_size + i + self.fairseq_offset for i, code in enumerate(FAIRSEQ_LANGUAGE_CODES)
}
self.id_to_lang_code = {v: k for k, v in self.lang_code_to_id.items()}
self.fairseq_tokens_to_ids["<mask>"] = len(self.sp_model) + len(self.lang_code_to_id) + self.fairseq_offset
self.fairseq_tokens_to_ids.update(self.lang_code_to_id)
self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}
_additional_special_tokens = list(self.lang_code_to_id.keys())
if additional_special_tokens is not None:
_additional_special_tokens.extend(
[t for t in additional_special_tokens if t not in _additional_special_tokens]
)
super().__init__(
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
sep_token=sep_token,
cls_token=cls_token,
pad_token=pad_token,
mask_token=mask_token,
tokenizer_file=None,
src_lang=src_lang,
tgt_lang=tgt_lang,
additional_special_tokens=_additional_special_tokens,
sp_model_kwargs=self.sp_model_kwargs,
**kwargs,
)
self._src_lang = src_lang if src_lang is not None else "en_XX"
self.cur_lang_code_id = self.lang_code_to_id[self._src_lang]
self.set_src_lang_special_tokens(self._src_lang)
def __getstate__(self):
state = self.__dict__.copy()
state["sp_model"] = None
state["sp_model_proto"] = self.sp_model.serialized_model_proto()
return state
def __setstate__(self, d):
self.__dict__ = d
if not hasattr(self, "sp_model_kwargs"):
self.sp_model_kwargs = {}
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.LoadFromSerializedProto(self.sp_model_proto)
@property
def vocab_size(self):
return len(self.sp_model) + len(self.lang_code_to_id) + self.fairseq_offset + 1
@property
def src_lang(self) -> str:
return self._src_lang
@src_lang.setter
def src_lang(self, new_src_lang: str) -> None:
self._src_lang = new_src_lang
self.set_src_lang_special_tokens(self._src_lang)
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer's `prepare_for_model` method.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
return super().get_special_tokens_mask(
token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
)
prefix_ones = [1] * len(self.prefix_tokens)
suffix_ones = [1] * len(self.suffix_tokens)
if token_ids_1 is None:
return prefix_ones + ([0] * len(token_ids_0)) + suffix_ones
return prefix_ones + ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
and adding special tokens. An MBART sequence has the following format, where `X` represents the sequence:
- `input_ids` (for the encoder): `X [eos, src_lang_code]`
- `decoder_input_ids` (for the decoder): `X [eos, tgt_lang_code]`
BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a separator.
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of input IDs with the appropriate special tokens.
"""
if token_ids_1 is None:
return self.prefix_tokens + token_ids_0 + self.suffix_tokens
return self.prefix_tokens + token_ids_0 + token_ids_1 + self.suffix_tokens
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed, to be used in a sequence-pair classification task. MBART does
not make use of token type ids, therefore a list of zeros is returned.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of zeros.
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
def _build_translation_inputs(
self, raw_inputs, return_tensors: str, src_lang: Optional[str], tgt_lang: Optional[str], **extra_kwargs
):
"""用于翻译管道,准备用于 generate 函数的输入"""
if src_lang is None or tgt_lang is None:
raise ValueError("Translation requires a `src_lang` and a `tgt_lang` for this model")
self.src_lang = src_lang
inputs = self(raw_inputs, add_special_tokens=True, return_tensors=return_tensors, **extra_kwargs)
tgt_lang_id = self.convert_tokens_to_ids(tgt_lang)
inputs["forced_bos_token_id"] = tgt_lang_id
return inputs
def get_vocab(self):
"""
Return the vocabulary, mapping tokens to their corresponding IDs.
Returns:
dict: A dictionary containing all tokens and their IDs.
"""
vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
vocab.update(self.added_tokens_encoder)
return vocab
def _tokenize(self, text: str) -> List[str]:
"""
Tokenize the text with the SentencePiece subword model.
Args:
text (str): The text to tokenize.
Returns:
List[str]: The list of subword strings.
"""
return self.sp_model.encode(text, out_type=str)
def _convert_token_to_id(self, token):
"""Converts a token (str) into an id using the vocabulary.
Args:
token (str): The token to convert.
Returns:
int: The corresponding id from the fairseq_tokens_to_ids dictionary
if present, otherwise uses the SentencePiece model to fetch
the id. Returns unk_token_id if the SentencePiece model returns 0.
"""
if token in self.fairseq_tokens_to_ids:
return self.fairseq_tokens_to_ids[token]
spm_id = self.sp_model.PieceToId(token)
return spm_id + self.fairseq_offset if spm_id else self.unk_token_id
def _convert_id_to_token(self, index):
"""Converts an index (integer) into a token (str) using the vocabulary.
Args:
index (int): The index to convert into a token.
Returns:
str: The corresponding token from the fairseq_ids_to_tokens dictionary
if present, otherwise uses the SentencePiece model to fetch
the token.
"""
if index in self.fairseq_ids_to_tokens:
return self.fairseq_ids_to_tokens[index]
return self.sp_model.IdToPiece(index - self.fairseq_offset)
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (strings for sub-words) into a single string.
Args:
tokens (List[str]): List of tokens to concatenate.
Returns:
str: The concatenated string formed from tokens, with SPIECE_UNDERLINE
replaced by a space and leading/trailing whitespace removed.
"""
out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
return out_string
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
"""Saves the vocabulary to a directory.
Args:
save_directory (str): The directory path where the vocabulary will be saved.
filename_prefix (Optional[str]): Optional prefix for the vocabulary file name.
Returns:
Tuple[str]: A tuple containing the path of the saved vocabulary file.
"""
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
out_vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
copyfile(self.vocab_file, out_vocab_file)
elif not os.path.isfile(self.vocab_file):
with open(out_vocab_file, "wb") as fi:
content_spiece_model = self.sp_model.serialized_model_proto()
fi.write(content_spiece_model)
return (out_vocab_file,)
def prepare_seq2seq_batch(
self,
src_texts: List[str],
src_lang: str = "en_XX",
tgt_texts: Optional[List[str]] = None,
tgt_lang: str = "ro_RO",
**kwargs,
) -> BatchEncoding:
"""Prepares a batch for sequence-to-sequence model.
Args:
src_texts (List[str]): List of source texts.
src_lang (str, optional): Source language code. Defaults to "en_XX".
tgt_texts (Optional[List[str]], optional): List of target texts. Defaults to None.
tgt_lang (str, optional): Target language code. Defaults to "ro_RO".
**kwargs: Additional keyword arguments passed to the superclass method.
Returns:
BatchEncoding: The prepared batch containing encoded inputs for the model.
"""
self.src_lang = src_lang
self.tgt_lang = tgt_lang
return super().prepare_seq2seq_batch(src_texts, tgt_texts, **kwargs)
def _switch_to_input_mode(self):
"""Switches the model to input mode by setting source language special tokens."""
return self.set_src_lang_special_tokens(self.src_lang)
def _switch_to_target_mode(self):
"""Switches the model to target mode by setting target language special tokens."""
return self.set_tgt_lang_special_tokens(self.tgt_lang)
def set_src_lang_special_tokens(self, src_lang) -> None:
"""Resets special tokens to match the source language settings.
Args:
src_lang (str): Source language code.
Returns:
None
"""
self.cur_lang_code = self.lang_code_to_id[src_lang]
self.prefix_tokens = []
self.suffix_tokens = [self.eos_token_id, self.cur_lang_code]
def set_tgt_lang_special_tokens(self, lang: str) -> None:
"""设置目标语言的特殊标记。无前缀,后缀为[eos, tgt_lang_code]。"""
self.cur_lang_code = self.lang_code_to_id[lang]
self.prefix_tokens = []
self.suffix_tokens = [self.eos_token_id, self.cur_lang_code]
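A short sketch of the fairseq ID bookkeeping defined in `__init__` above (requires `sentencepiece` and downloads the checkpoint): SentencePiece pieces are shifted by `fairseq_offset`, the 25 language codes sit directly after the SentencePiece vocabulary, and the suffix tokens reflect the current source language.
```
from transformers import MBartTokenizer

tok = MBartTokenizer.from_pretrained("facebook/mbart-large-cc25", src_lang="en_XX")
print(tok.fairseq_offset)                                         # 1
print(tok.convert_ids_to_tokens([tok.lang_code_to_id["ro_RO"]]))  # ['ro_RO']
print(tok.suffix_tokens == [tok.eos_token_id, tok.lang_code_to_id["en_XX"]])  # True
```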
.\models\mbart\tokenization_mbart_fast.py
import os
from shutil import copyfile
from typing import List, Optional, Tuple
from tokenizers import processors
from ...tokenization_utils import AddedToken, BatchEncoding
from ...tokenization_utils_fast import PreTrainedTokenizerFast
from ...utils import is_sentencepiece_available, logging
if is_sentencepiece_available():
from .tokenization_mbart import MBartTokenizer
else:
MBartTokenizer = None
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model", "tokenizer_file": "tokenizer.json"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"facebook/mbart-large-en-ro": (
"https://huggingface.co/facebook/mbart-large-en-ro/resolve/main/sentencepiece.bpe.model"
),
"facebook/mbart-large-cc25": (
"https://huggingface.co/facebook/mbart-large-cc25/resolve/main/sentencepiece.bpe.model"
),
},
"tokenizer_file": {
"facebook/mbart-large-en-ro": "https://huggingface.co/facebook/mbart-large-en-ro/resolve/main/tokenizer.json",
"facebook/mbart-large-cc25": "https://huggingface.co/facebook/mbart-large-cc25/resolve/main/tokenizer.json",
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"facebook/mbart-large-en-ro": 1024,
"facebook/mbart-large-cc25": 1024,
}
FAIRSEQ_LANGUAGE_CODES = ["ar_AR", "cs_CZ", "de_DE", "en_XX", "es_XX", "et_EE", "fi_FI", "fr_XX", "gu_IN", "hi_IN", "it_IT", "ja_XX", "kk_KZ", "ko_KR", "lt_LT", "lv_LV", "my_MM", "ne_NP", "nl_XX", "ro_RO", "ru_RU", "si_LK", "tr_TR", "vi_VN", "zh_CN"]
class MBartTokenizerFast(PreTrainedTokenizerFast):
"""
Construct a "fast" MBART tokenizer (backed by HuggingFace's *tokenizers* library). Based on
[BPE](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=BPE#models).
This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
refer to this superclass for more information regarding those methods.
The tokenization method is `<tokens> <eos> <language code>` for source language documents, and `<language code>
<tokens> <eos>` for target language documents.
Examples:
```
>>> from transformers import MBartTokenizerFast
>>> tokenizer = MBartTokenizerFast.from_pretrained("facebook/mbart-large-en-ro", src_lang="en_XX", tgt_lang="ro_RO")
>>> example_english_phrase = " UN Chief Says There Is No Military Solution in Syria"
>>> expected_translation_romanian = "Şeful ONU declară că nu există o soluţie militară în Siria"
>>> inputs = tokenizer(example_english_phrase, text_target=expected_translation_romanian, return_tensors="pt")
```
"""
# Class attributes: constants shared by all MBartTokenizerFast instances
vocab_files_names = VOCAB_FILES_NAMES  # Names of the vocabulary files used by the tokenizer
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES  # Maximum input sizes, from the pretrained positional embedding sizes
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP  # Map from checkpoint name to pretrained vocabulary files
model_input_names = ["input_ids", "attention_mask"]  # Names of the inputs expected by the model
slow_tokenizer_class = MBartTokenizer  # The corresponding slow (SentencePiece-based) tokenizer class
prefix_tokens: List[int] = []  # Special tokens prepended to every sequence
suffix_tokens: List[int] = []  # Special tokens appended to every sequence
def __init__(
self,
vocab_file=None,
tokenizer_file=None,
bos_token="<s>",
eos_token="</s>",
sep_token="</s>",
cls_token="<s>",
unk_token="<unk>",
pad_token="<pad>",
mask_token="<mask>",
src_lang=None,
tgt_lang=None,
additional_special_tokens=None,
**kwargs,
):
# If mask_token is a string, wrap it in an AddedToken that strips the space on its left but not its right
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
# Start from a copy of FAIRSEQ_LANGUAGE_CODES as the additional special tokens
_additional_special_tokens = FAIRSEQ_LANGUAGE_CODES.copy()
# Add any user-supplied additional special tokens that are not already in the list
if additional_special_tokens is not None:
_additional_special_tokens.extend(
[t for t in additional_special_tokens if t not in _additional_special_tokens]
)
# Call the parent initializer to set up the instance attributes
super().__init__(
vocab_file=vocab_file,
tokenizer_file=tokenizer_file,
bos_token=bos_token,
eos_token=eos_token,
sep_token=sep_token,
cls_token=cls_token,
unk_token=unk_token,
pad_token=pad_token,
mask_token=mask_token,
src_lang=src_lang,
tgt_lang=tgt_lang,
additional_special_tokens=_additional_special_tokens,
**kwargs,
)
# Instance attributes: vocab file, language-code-to-ID map, current source language code, target language, and special tokens
self.vocab_file = vocab_file
self.lang_code_to_id = {
lang_code: self.convert_tokens_to_ids(lang_code) for lang_code in FAIRSEQ_LANGUAGE_CODES
}
self._src_lang = src_lang if src_lang is not None else "en_XX"
self.cur_lang_code = self.convert_tokens_to_ids(self._src_lang)
self.tgt_lang = tgt_lang
self.set_src_lang_special_tokens(self._src_lang)
@property
def can_save_slow_tokenizer(self) -> bool:
# The slow tokenizer can only be saved if the original vocab file is available
return os.path.isfile(self.vocab_file) if self.vocab_file else False
@property
def src_lang(self) -> str:
# Return the current source language
return self._src_lang
@src_lang.setter
def src_lang(self, new_src_lang: str) -> None:
# Set the new source language and refresh the special tokens
self._src_lang = new_src_lang
self.set_src_lang_special_tokens(self._src_lang)
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
and adding special tokens. The special tokens depend on calling set_lang.
An MBART sequence has the following format, where `X` represents the sequence:
- `input_ids` (for the encoder): `X [eos, src_lang_code]`
- `decoder_input_ids` (for the decoder): `X [eos, tgt_lang_code]`
BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a separator.
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of input IDs with the appropriate special tokens.
"""
if token_ids_1 is None:
return self.prefix_tokens + token_ids_0 + self.suffix_tokens
# Pairs are not the expected use case, but the logic is kept for API consistency
return self.prefix_tokens + token_ids_0 + token_ids_1 + self.suffix_tokens
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed, to be used in a sequence-pair classification task. mBART does
not make use of token type ids, therefore a list of zeros is returned.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of zeros.
"""
sep = [self.sep_token_id]  # ID of the separator token
cls = [self.cls_token_id]  # ID of the classifier token
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]  # all zeros: special tokens plus the single sequence
return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]  # all zeros: special tokens plus both sequences
def _build_translation_inputs(
self, raw_inputs, return_tensors: str, src_lang: Optional[str], tgt_lang: Optional[str], **extra_kwargs
):
"""由翻译管道使用,准备生成函数的输入"""
if src_lang is None or tgt_lang is None:
raise ValueError("Translation requires a `src_lang` and a `tgt_lang` for this model")
self.src_lang = src_lang # 设置源语言属性
inputs = self(raw_inputs, add_special_tokens=True, return_tensors=return_tensors, **extra_kwargs) # 使用模型处理原始输入,添加特殊标记
tgt_lang_id = self.convert_tokens_to_ids(tgt_lang) # 将目标语言转换为其对应的 ID
inputs["forced_bos_token_id"] = tgt_lang_id # 在输入中添加强制的 BOS 标记 ID
return inputs # 返回处理后的输入
# Prepare a batch for a sequence-to-sequence model; defaults to English (en_XX) as source and Romanian (ro_RO) as target
def prepare_seq2seq_batch(
self,
src_texts: List[str],
src_lang: str = "en_XX",
tgt_texts: Optional[List[str]] = None,
tgt_lang: str = "ro_RO",
**kwargs,
) -> BatchEncoding:
self.src_lang = src_lang  # set the source language
self.tgt_lang = tgt_lang  # set the target language
# Delegate to the parent class to prepare the seq2seq batch
return super().prepare_seq2seq_batch(src_texts, tgt_texts, **kwargs)
# Switch to input mode: apply the source-language special tokens
def _switch_to_input_mode(self):
return self.set_src_lang_special_tokens(self.src_lang)
# Switch to target mode: apply the target-language special tokens
def _switch_to_target_mode(self):
return self.set_tgt_lang_special_tokens(self.tgt_lang)
# Reset the special tokens to the source-language setting: no prefix, suffix = [eos, src_lang_code]
def set_src_lang_special_tokens(self, src_lang) -> None:
"""Reset the special tokens to the source lang setting. No prefix and suffix=[eos, src_lang_code]."""
self.cur_lang_code = self.convert_tokens_to_ids(src_lang)  # convert the source language code to its ID
self.prefix_tokens = []  # no prefix tokens
self.suffix_tokens = [self.eos_token_id, self.cur_lang_code]  # suffix: end-of-sequence token followed by the language code
prefix_tokens_str = self.convert_ids_to_tokens(self.prefix_tokens)  # prefix tokens as strings
suffix_tokens_str = self.convert_ids_to_tokens(self.suffix_tokens)  # suffix tokens as strings
# Rebuild the tokenizer's post-processor with a template that appends the suffix tokens
self._tokenizer.post_processor = processors.TemplateProcessing(
single=prefix_tokens_str + ["$A"] + suffix_tokens_str,
pair=prefix_tokens_str + ["$A", "$B"] + suffix_tokens_str,
special_tokens=list(zip(prefix_tokens_str + suffix_tokens_str, self.prefix_tokens + self.suffix_tokens)),
)
# Reset the special tokens to the target-language setting: no prefix, suffix = [eos, tgt_lang_code]
def set_tgt_lang_special_tokens(self, lang: str) -> None:
"""Reset the special tokens to the target language setting. No prefix and suffix=[eos, tgt_lang_code]."""
self.cur_lang_code = self.convert_tokens_to_ids(lang)  # convert the target language code to its ID
self.prefix_tokens = []  # no prefix tokens
self.suffix_tokens = [self.eos_token_id, self.cur_lang_code]  # suffix: end-of-sequence token followed by the language code
prefix_tokens_str = self.convert_ids_to_tokens(self.prefix_tokens)  # prefix tokens as strings
suffix_tokens_str = self.convert_ids_to_tokens(self.suffix_tokens)  # suffix tokens as strings
# Rebuild the tokenizer's post-processor with a template that appends the suffix tokens
self._tokenizer.post_processor = processors.TemplateProcessing(
single=prefix_tokens_str + ["$A"] + suffix_tokens_str,
pair=prefix_tokens_str + ["$A", "$B"] + suffix_tokens_str,
special_tokens=list(zip(prefix_tokens_str + suffix_tokens_str, self.prefix_tokens + self.suffix_tokens)),
)
# Save the vocabulary to a directory; takes the directory path and an optional filename prefix, and returns a tuple with the path of the written file
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
# The fast tokenizer can only save the slow tokenizer's vocabulary if the original vocab file is available
if not self.can_save_slow_tokenizer:
raise ValueError(
"Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
"tokenizer."
)
# If the save directory does not exist, log an error and return
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory.")
return
# Build the output vocabulary file path from the optional prefix and the standard vocab file name
out_vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
# Copy the current vocab file to the output path if it is not already the same file
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
copyfile(self.vocab_file, out_vocab_file)
# Return a tuple containing the path of the written vocabulary file
return (out_vocab_file,)
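Tying the two language modes together, a hedged end-to-end sketch (downloads the checkpoint; `text_target=` routes the second text through the target-language post-processor set up above):
```
from transformers import MBartTokenizerFast

tok = MBartTokenizerFast.from_pretrained("facebook/mbart-large-en-ro", src_lang="en_XX", tgt_lang="ro_RO")
src = tok("UN Chief Says There Is No Military Solution in Syria")
tgt = tok(text_target="Şeful ONU declară că nu există o soluţie militară în Siria")

print(tok.convert_ids_to_tokens(src["input_ids"])[-2:])  # ['</s>', 'en_XX']
print(tok.convert_ids_to_tokens(tgt["input_ids"])[-2:])  # ['</s>', 'ro_RO']
```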
.\models\mbart\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_flax_available,
is_sentencepiece_available,
is_tf_available,
is_tokenizers_available,
is_torch_available,
)
_import_structure = {"configuration_mbart": ["MBART_PRETRAINED_CONFIG_ARCHIVE_MAP", "MBartConfig", "MBartOnnxConfig"]}
try:
if not is_sentencepiece_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_mbart"] = ["MBartTokenizer"]
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_mbart_fast"] = ["MBartTokenizerFast"]
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_mbart"] = [
"MBART_PRETRAINED_MODEL_ARCHIVE_LIST",
"MBartForCausalLM",
"MBartForConditionalGeneration",
"MBartForQuestionAnswering",
"MBartForSequenceClassification",
"MBartModel",
"MBartPreTrainedModel",
]
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_tf_mbart"] = [
"TFMBartForConditionalGeneration",
"TFMBartModel",
"TFMBartPreTrainedModel",
]
try:
if not is_flax_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_flax_mbart"] = [
"FlaxMBartForConditionalGeneration",
"FlaxMBartForQuestionAnswering",
"FlaxMBartForSequenceClassification",
"FlaxMBartModel",
"FlaxMBartPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_mbart import MBART_PRETRAINED_CONFIG_ARCHIVE_MAP, MBartConfig, MBartOnnxConfig
try:
if not is_sentencepiece_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_mbart import MBartTokenizer
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_mbart_fast import MBartTokenizerFast
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_mbart import (
MBART_PRETRAINED_MODEL_ARCHIVE_LIST,
MBartForCausalLM,
MBartForConditionalGeneration,
MBartForQuestionAnswering,
MBartForSequenceClassification,
MBartModel,
MBartPreTrainedModel,
)
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_tf_mbart import (
TFMBartForConditionalGeneration,
TFMBartModel,
TFMBartPreTrainedModel,
)
try:
if not is_flax_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_flax_mbart import (
FlaxMBartForConditionalGeneration,
FlaxMBartForQuestionAnswering,
FlaxMBartForSequenceClassification,
FlaxMBartModel,
FlaxMBartPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
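Because of the `_LazyModule` registration above, importing the package is cheap and the heavy submodules are only imported on first attribute access. A small sketch (only the configuration is touched, so no optional backend is required):
```
from transformers.models import mbart

# Attribute access triggers the import of configuration_mbart via _LazyModule.
config = mbart.MBartConfig()
print(config.model_type, config.d_model)  # mbart 1024
```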
.\models\mbart50\tokenization_mbart50.py
import os
from shutil import copyfile
from typing import Any, Dict, List, Optional, Tuple
import sentencepiece as spm
from ...tokenization_utils import AddedToken, BatchEncoding, PreTrainedTokenizer
from ...utils import logging
logger = logging.get_logger(__name__)
SPIECE_UNDERLINE = "▁"
VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"facebook/mbart-large-50-one-to-many-mmt": (
"https://huggingface.co/facebook/mbart-large-50-one-to-many-mmt/resolve/main/sentencepiece.bpe.model"
),
}
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"facebook/mbart-large-50-one-to-many-mmt": 1024,
}
FAIRSEQ_LANGUAGE_CODES = ["ar_AR", "cs_CZ", "de_DE", "en_XX", "es_XX", "et_EE", "fi_FI", "fr_XX", "gu_IN", "hi_IN", "it_IT", "ja_XX", "kk_KZ", "ko_KR", "lt_LT", "lv_LV", "my_MM", "ne_NP", "nl_XX", "ro_RO", "ru_RU", "si_LK", "tr_TR", "vi_VN", "zh_CN", "af_ZA", "az_AZ", "bn_IN", "fa_IR", "he_IL", "hr_HR", "id_ID", "ka_GE", "km_KH", "mk_MK", "ml_IN", "mn_MN", "mr_IN", "pl_PL", "ps_AF", "pt_XX", "sv_SE", "sw_KE", "ta_IN", "te_IN", "th_TH", "tl_XX", "uk_UA", "ur_PK", "xh_ZA", "gl_ES", "sl_SI"]
class MBart50Tokenizer(PreTrainedTokenizer):
"""
Construct a MBart50 tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).
This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
this superclass for more information regarding those methods.
"""
Args:
vocab_file (`str`):
Path to the vocabulary file.
src_lang (`str`, *optional*):
A string representing the source language.
tgt_lang (`str`, *optional*):
A string representing the target language.
eos_token (`str`, *optional*, defaults to `"</s>"`):
The end of sequence token.
sep_token (`str`, *optional*, defaults to `"</s>"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens.
cls_token (`str`, *optional*, defaults to `"<s>"`):
The classifier token which is used when doing sequence classification (classification of the whole sequence
instead of per-token classification). It is the first token of the sequence when built with special tokens.
unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
pad_token (`str`, *optional*, defaults to `"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
mask_token (`str`, *optional*, defaults to `"<mask>"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
sp_model_kwargs (`dict`, *optional*):
Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
to set:
- `enable_sampling`: Enable subword regularization.
- `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
- `nbest_size = {0,1}`: No sampling is performed.
- `nbest_size > 1`: samples from the nbest_size results.
- `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
using forward-filtering-and-backward-sampling algorithm.
- `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
BPE-dropout.
Examples:
```
>>> from transformers import MBart50Tokenizer
>>> tokenizer = MBart50Tokenizer.from_pretrained("facebook/mbart-large-50", src_lang="en_XX", tgt_lang="ro_RO")
>>> src_text = " UN Chief Says There Is No Military Solution in Syria"
>>> tgt_text = "Şeful ONU declară că nu există o soluţie militară în Siria"
>>> model_inputs = tokenizer(src_text, text_target=tgt_text, return_tensors="pt")
```"""
vocab_files_names = VOCAB_FILES_NAMES
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
model_input_names = ["input_ids", "attention_mask"]
prefix_tokens: List[int] = []
suffix_tokens: List[int] = []
def __init__(
self,
vocab_file,
src_lang=None,
tgt_lang=None,
eos_token="</s>",
sep_token="</s>",
cls_token="<s>",
unk_token="<unk>",
pad_token="<pad>",
mask_token="<mask>",
sp_model_kwargs: Optional[Dict[str, Any]] = None,
**kwargs,
) -> None:
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
kwargs["additional_special_tokens"] = kwargs.get("additional_special_tokens", []) or []
kwargs["additional_special_tokens"] += [
code for code in FAIRSEQ_LANGUAGE_CODES if code not in kwargs["additional_special_tokens"]
]
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(str(vocab_file))
self.vocab_file = vocab_file
self.fairseq_tokens_to_ids = {"<s>": 0, "<pad>": 1, "</s>": 2, "<unk>": 3}
self.fairseq_offset = 1
self.sp_model_size = len(self.sp_model)
self.lang_code_to_id = {
code: self.sp_model_size + i + self.fairseq_offset for i, code in enumerate(FAIRSEQ_LANGUAGE_CODES)
}
self.id_to_lang_code = {v: k for k, v in self.lang_code_to_id.items()}
self.fairseq_tokens_to_ids["<mask>"] = len(self.sp_model) + len(self.lang_code_to_id) + self.fairseq_offset
self.fairseq_tokens_to_ids.update(self.lang_code_to_id)
self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}
super().__init__(
src_lang=src_lang,
tgt_lang=tgt_lang,
eos_token=eos_token,
unk_token=unk_token,
sep_token=sep_token,
cls_token=cls_token,
pad_token=pad_token,
mask_token=mask_token,
sp_model_kwargs=self.sp_model_kwargs,
**kwargs,
)
self._src_lang = src_lang if src_lang is not None else "en_XX"
self.cur_lang_code_id = self.lang_code_to_id[self._src_lang]
self.set_src_lang_special_tokens(self._src_lang)
@property
def vocab_size(self) -> int:
return len(self.sp_model) + len(self.lang_code_to_id) + self.fairseq_offset + 1
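The resulting ID layout is: the four fairseq specials at 0-3, the SentencePiece pieces shifted by `fairseq_offset`, then the language codes, and finally `<mask>`. A toy sketch with hypothetical sizes (the real SentencePiece model is much larger):
```
# Hypothetical sizes for illustration only.
sp_model_size, fairseq_offset = 10, 1
codes = ["ar_AR", "cs_CZ"]
lang_code_to_id = {code: sp_model_size + i + fairseq_offset for i, code in enumerate(codes)}
mask_id = sp_model_size + len(codes) + fairseq_offset
print(lang_code_to_id, mask_id)  # {'ar_AR': 11, 'cs_CZ': 12} 13
```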
@property
def src_lang(self) -> str:
return self._src_lang
@src_lang.setter
def src_lang(self, new_src_lang: str) -> None:
self._src_lang = new_src_lang
self.set_src_lang_special_tokens(self._src_lang)
def __getstate__(self) -> Dict:
state = self.__dict__.copy()
state["sp_model"] = None
return state
def __setstate__(self, d: Dict) -> None:
self.__dict__ = d
if not hasattr(self, "sp_model_kwargs"):
self.sp_model_kwargs = {}
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(self.vocab_file)
def get_vocab(self) -> Dict:
vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
vocab.update(self.added_tokens_encoder)
return vocab
def _tokenize(self, text: str) -> List[str]:
return self.sp_model.encode(text, out_type=str)
def _convert_token_to_id(self, token: str) -> int:
"""Converts a token (str) in an id using the vocab."""
if token in self.fairseq_tokens_to_ids:
return self.fairseq_tokens_to_ids[token]
spm_id = self.sp_model.PieceToId(token)
return spm_id + self.fairseq_offset if spm_id else self.unk_token_id
def _convert_id_to_token(self, index: int) -> str:
"""Converts an index (integer) in a token (str) using the vocab."""
if index in self.fairseq_ids_to_tokens:
return self.fairseq_ids_to_tokens[index]
return self.sp_model.IdToPiece(index - self.fairseq_offset)
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (string) in a single string."""
current_sub_tokens = []
out_string = ""
prev_is_special = False
for token in tokens:
if token in self.all_special_tokens:
if not prev_is_special:
out_string += " "
out_string += self.sp_model.decode(current_sub_tokens) + token
prev_is_special = True
current_sub_tokens = []
else:
current_sub_tokens.append(token)
prev_is_special = False
out_string += self.sp_model.decode(current_sub_tokens)
return out_string.strip()
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
out_vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
copyfile(self.vocab_file, out_vocab_file)
elif not os.path.isfile(self.vocab_file):
with open(out_vocab_file, "wb") as fi:
content_spiece_model = self.sp_model.serialized_model_proto()
fi.write(content_spiece_model)
return (out_vocab_file,)
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` method.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
return super().get_special_tokens_mask(
token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
)
prefix_ones = [1] * len(self.prefix_tokens)
suffix_ones = [1] * len(self.suffix_tokens)
if token_ids_1 is None:
return prefix_ones + ([0] * len(token_ids_0)) + suffix_ones
return prefix_ones + ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones
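With the MBart-50 convention of `prefix_tokens = [src_lang_code]` and `suffix_tokens = [eos]` (set further below), the mask for a single sequence is simply a 1, then zeros, then a 1. A tiny illustration with hypothetical IDs:
```
prefix_ones, suffix_ones = [1], [1]          # one prefix token, one suffix token
token_ids_0 = [100, 101, 102]                # hypothetical sequence token IDs
print(prefix_ones + [0] * len(token_ids_0) + suffix_ones)  # [1, 0, 0, 0, 1]
```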
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. An MBART-50 sequence has the following format, where `X` represents the sequence:
- `input_ids` (for encoder) `[src_lang_code] X [eos]`
- `labels`: (for decoder) `[tgt_lang_code] X [eos]`
BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a
separator.
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
if token_ids_1 is None:
return self.prefix_tokens + token_ids_0 + self.suffix_tokens
return self.prefix_tokens + token_ids_0 + token_ids_1 + self.suffix_tokens
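A short usage sketch of the `[src_lang_code] X [eos]` format (assuming the `facebook/mbart-large-50` checkpoint and its SentencePiece model can be downloaded):
```
from transformers import MBart50Tokenizer

tok = MBart50Tokenizer.from_pretrained("facebook/mbart-large-50", src_lang="en_XX")
print(tok.convert_ids_to_tokens(tok("Hello").input_ids))
# Roughly ['en_XX', '▁Hello', '</s>']: language code first, </s> last, no BOS.
```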
def _build_translation_inputs(
self, raw_inputs, return_tensors: str, src_lang: Optional[str], tgt_lang: Optional[str], **extra_kwargs
):
"""Used by translation pipeline, to prepare inputs for the generate function"""
if src_lang is None or tgt_lang is None:
raise ValueError("Translation requires a `src_lang` and a `tgt_lang` for this model")
self.src_lang = src_lang
inputs = self(raw_inputs, add_special_tokens=True, return_tensors=return_tensors, **extra_kwargs)
tgt_lang_id = self.convert_tokens_to_ids(tgt_lang)
inputs["forced_bos_token_id"] = tgt_lang_id
return inputs
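This is what lets the translation pipeline pin the first generated token to the target language: the returned encoding carries `forced_bos_token_id`, which is then passed on to `generate()`. A hedged sketch (same checkpoint assumption as above):
```
from transformers import MBart50Tokenizer

tok = MBart50Tokenizer.from_pretrained("facebook/mbart-large-50")
inputs = tok._build_translation_inputs("Hello", return_tensors="pt", src_lang="en_XX", tgt_lang="ro_RO")
print(inputs["forced_bos_token_id"] == tok.convert_tokens_to_ids("ro_RO"))  # True
```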
def prepare_seq2seq_batch(
self,
src_texts: List[str],
src_lang: str = "en_XX",
tgt_texts: Optional[List[str]] = None,
tgt_lang: str = "ro_RO",
**kwargs,
) -> BatchEncoding:
self.src_lang = src_lang
self.tgt_lang = tgt_lang
return super().prepare_seq2seq_batch(src_texts, tgt_texts, **kwargs)
def _switch_to_input_mode(self):
return self.set_src_lang_special_tokens(self.src_lang)
def _switch_to_target_mode(self):
return self.set_tgt_lang_special_tokens(self.tgt_lang)
def set_src_lang_special_tokens(self, src_lang: str) -> None:
"""Reset the special tokens to the source lang setting. prefix=[src_lang_code] and suffix=[eos]."""
self.cur_lang_code_id = self.lang_code_to_id[src_lang]
self.prefix_tokens = [self.cur_lang_code_id]
self.suffix_tokens = [self.eos_token_id]
def set_tgt_lang_special_tokens(self, tgt_lang: str) -> None:
"""重设特殊标记以适应目标语言设置。前缀=[tgt_lang_code] 和 后缀=[eos]。"""
self.cur_lang_code_id = self.lang_code_to_id[tgt_lang]
self.prefix_tokens = [self.cur_lang_code_id]
self.suffix_tokens = [self.eos_token_id]
.\models\mbart50\tokenization_mbart50_fast.py
import os
from shutil import copyfile
from typing import List, Optional, Tuple
from tokenizers import processors
from ...tokenization_utils import AddedToken, BatchEncoding
from ...tokenization_utils_fast import PreTrainedTokenizerFast
from ...utils import is_sentencepiece_available, logging
if is_sentencepiece_available():
from .tokenization_mbart50 import MBart50Tokenizer
else:
MBart50Tokenizer = None
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model", "tokenizer_file": "tokenizer.json"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"facebook/mbart-large-50-one-to-many-mmt": (
"https://huggingface.co/facebook/mbart-large-50-one-to-many-mmt/resolve/main/sentencepiece.bpe.model"
),
},
"tokenizer_file": {
"facebook/mbart-large-50-one-to-many-mmt": (
"https://huggingface.co/facebook/mbart-large-50-one-to-many-mmt/resolve/main/tokenizer.json"
),
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"facebook/mbart-large-50-one-to-many-mmt": 1024,
}
FAIRSEQ_LANGUAGE_CODES = ["ar_AR", "cs_CZ", "de_DE", "en_XX", "es_XX", "et_EE", "fi_FI", "fr_XX", "gu_IN", "hi_IN", "it_IT", "ja_XX", "kk_KZ", "ko_KR", "lt_LT", "lv_LV", "my_MM", "ne_NP", "nl_XX", "ro_RO", "ru_RU", "si_LK", "tr_TR", "vi_VN", "zh_CN", "af_ZA", "az_AZ", "bn_IN", "fa_IR", "he_IL", "hr_HR", "id_ID", "ka_GE", "km_KH", "mk_MK", "ml_IN", "mn_MN", "mr_IN", "pl_PL", "ps_AF", "pt_XX", "sv_SE", "sw_KE", "ta_IN", "te_IN", "th_TH", "tl_XX", "uk_UA", "ur_PK", "xh_ZA", "gl_ES", "sl_SI"]
from transformers import MBart50TokenizerFast
tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50", src_lang="en_XX", tgt_lang="ro_RO")
src_text = " UN Chief Says There Is No Military Solution in Syria"
tgt_text = "Şeful ONU declară că nu există o soluţie militară în Siria"
model_inputs = tokenizer(src_text, text_target=tgt_text, return_tensors="pt")
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
kwargs["additional_special_tokens"] = kwargs.get("additional_special_tokens", []) or []
kwargs["additional_special_tokens"] += [
code for code in FAIRSEQ_LANGUAGE_CODES if code not in kwargs["additional_special_tokens"]
]
super().__init__(
vocab_file,
src_lang=src_lang,
tgt_lang=tgt_lang,
tokenizer_file=tokenizer_file,
eos_token=eos_token,
sep_token=sep_token,
cls_token=cls_token,
unk_token=unk_token,
pad_token=pad_token,
mask_token=mask_token,
**kwargs,
)
self.vocab_file = vocab_file
self.lang_code_to_id = {
lang_code: self.convert_tokens_to_ids(lang_code) for lang_code in FAIRSEQ_LANGUAGE_CODES
}
self._src_lang = src_lang if src_lang is not None else "en_XX"
self.tgt_lang = tgt_lang
self.cur_lang_code_id = self.lang_code_to_id[self._src_lang]
self.set_src_lang_special_tokens(self._src_lang)
@property
def can_save_slow_tokenizer(self) -> bool:
return os.path.isfile(self.vocab_file) if self.vocab_file else False
@property
def src_lang(self) -> str:
return self._src_lang
@src_lang.setter
def src_lang(self, new_src_lang: str) -> None:
self._src_lang = new_src_lang
self.set_src_lang_special_tokens(self._src_lang)
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. The special tokens depend on calling set_lang.
An MBART-50 sequence has the following format, where `X` represents the sequence:
- `input_ids` (for encoder) `[src_lang_code] X [eos]`
- `labels`: (for decoder) `[tgt_lang_code] X [eos]`
BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a
separator.
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
if token_ids_1 is None:
return self.prefix_tokens + token_ids_0 + self.suffix_tokens
return self.prefix_tokens + token_ids_0 + token_ids_1 + self.suffix_tokens
def prepare_seq2seq_batch(
self,
src_texts: List[str],
src_lang: str = "en_XX",
tgt_texts: Optional[List[str]] = None,
tgt_lang: str = "ro_RO",
**kwargs,
) -> BatchEncoding:
self.src_lang = src_lang
self.tgt_lang = tgt_lang
return super().prepare_seq2seq_batch(src_texts, tgt_texts, **kwargs)
def _switch_to_input_mode(self):
return self.set_src_lang_special_tokens(self.src_lang)
def _switch_to_target_mode(self):
return self.set_tgt_lang_special_tokens(self.tgt_lang)
def set_src_lang_special_tokens(self, src_lang: str) -> None:
"""Reset the special tokens to the source lang setting. prefix=[src_lang_code] and suffix=[eos]."""
self.cur_lang_code_id = self.convert_tokens_to_ids(src_lang)
self.prefix_tokens = [self.cur_lang_code_id]
self.suffix_tokens = [self.eos_token_id]
prefix_tokens_str = self.convert_ids_to_tokens(self.prefix_tokens)
suffix_tokens_str = self.convert_ids_to_tokens(self.suffix_tokens)
self._tokenizer.post_processor = processors.TemplateProcessing(
single=prefix_tokens_str + ["$A"] + suffix_tokens_str,
pair=prefix_tokens_str + ["$A", "$B"] + suffix_tokens_str,
special_tokens=list(zip(prefix_tokens_str + suffix_tokens_str, self.prefix_tokens + self.suffix_tokens)),
)
def set_tgt_lang_special_tokens(self, tgt_lang: str) -> None:
"""Reset the special tokens to the target language setting. prefix=[src_lang_code] and suffix=[eos]."""
self.cur_lang_code_id = self.convert_tokens_to_ids(tgt_lang)
self.prefix_tokens = [self.cur_lang_code_id]
self.suffix_tokens = [self.eos_token_id]
prefix_tokens_str = self.convert_ids_to_tokens(self.prefix_tokens)
suffix_tokens_str = self.convert_ids_to_tokens(self.suffix_tokens)
self._tokenizer.post_processor = processors.TemplateProcessing(
single=prefix_tokens_str + ["$A"] + suffix_tokens_str,
pair=prefix_tokens_str + ["$A", "$B"] + suffix_tokens_str,
special_tokens=list(zip(prefix_tokens_str + suffix_tokens_str, self.prefix_tokens + self.suffix_tokens)),
)
def _build_translation_inputs(
self, raw_inputs, return_tensors: str, src_lang: Optional[str], tgt_lang: Optional[str], **extra_kwargs
):
"""Used by translation pipeline, to prepare inputs for the generate function"""
if src_lang is None or tgt_lang is None:
raise ValueError("Translation requires a `src_lang` and a `tgt_lang` for this model")
self.src_lang = src_lang
inputs = self(raw_inputs, add_special_tokens=True, return_tensors=return_tensors, **extra_kwargs)
tgt_lang_id = self.convert_tokens_to_ids(tgt_lang)
inputs["forced_bos_token_id"] = tgt_lang_id
return inputs
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
if not self.can_save_slow_tokenizer:
raise ValueError(
"Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
"tokenizer."
)
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
out_vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
copyfile(self.vocab_file, out_vocab_file)
return (out_vocab_file,)
.\models\mbart50\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_sentencepiece_available, is_tokenizers_available
_import_structure = {}
try:
if not is_sentencepiece_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_mbart50"] = ["MBart50Tokenizer"]
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_mbart50_fast"] = ["MBart50TokenizerFast"]
if TYPE_CHECKING:
try:
if not is_sentencepiece_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_mbart50 import MBart50Tokenizer
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_mbart50_fast import MBart50TokenizerFast
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\mega\configuration_mega.py
""" MEGA configuration"""
from collections import OrderedDict
from typing import Mapping
from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfig
from ...utils import logging
logger = logging.get_logger(__name__)
MEGA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"mnaylor/mega-base-wikitext": "https://huggingface.co/mnaylor/mega-base-wikitext/resolve/main/config.json",
}
class MegaConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`MegaModel`]. It is used to instantiate a Mega
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the Mega
[mnaylor/mega-base-wikitext](https://huggingface.co/mnaylor/mega-base-wikitext) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Examples:
```
>>> from transformers import MegaConfig, MegaModel
>>> # Initializing a Mega configuration
>>> configuration = MegaConfig()
>>> # Initializing a model (with random weights) from the configuration
>>> model = MegaModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
model_type = "mega"
def __init__(
self,
vocab_size=30522,
hidden_size=128,
num_hidden_layers=4,
intermediate_size=256,
ema_projection_size=16,
bidirectional=True,
shared_representation_size=64,
use_chunking=False,
chunk_size=-1,
truncation=None,
normalize_before_mega=True,
normalization_type="scalenorm",
norm_affine=True,
activation="silu",
attention_activation="softmax",
dropout_prob=0.1,
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
use_feature_dropout=False,
use_normalized_ffn=True,
nffn_hidden_size=256,
normalize_before_ffn=True,
nffn_activation_dropout_prob=0.1,
max_positions=2048,
add_token_type_embeddings=False,
type_vocab_size=2,
initializer_range=0.02,
ema_delta_alpha_range=0.2,
ema_beta_range=0.02,
ema_gamma_omega_range=1.0,
pad_token_id=1,
bos_token_id=0,
eos_token_id=2,
relative_positional_bias="rotary",
classifier_dropout=None,
use_cache=True,
add_lm_hidden_dense_layer=True,
**kwargs,
):
super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.activation = activation
self.attention_activation = attention_activation
self.intermediate_size = intermediate_size
self.ema_projection_size = ema_projection_size
self.bidirectional = bidirectional
self.shared_representation_size = shared_representation_size
self.use_chunking = use_chunking
self.chunk_size = chunk_size
self.truncation = truncation
self.normalize_before_mega = normalize_before_mega
self.normalization_type = normalization_type
self.norm_affine = norm_affine
self.dropout_prob = dropout_prob
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.use_feature_dropout = use_feature_dropout
self.use_normalized_ffn = use_normalized_ffn
self.nffn_hidden_size = nffn_hidden_size
self.normalize_before_ffn = normalize_before_ffn
self.nffn_activation_dropout_prob = nffn_activation_dropout_prob
self.max_positions = max_positions
self.add_token_type_embeddings = add_token_type_embeddings
self.type_vocab_size = type_vocab_size
self.initializer_range = initializer_range
self.ema_delta_alpha_range = ema_delta_alpha_range
self.ema_beta_range = ema_beta_range
self.ema_gamma_omega_range = ema_gamma_omega_range
self.relative_positional_bias = relative_positional_bias
self.use_cache = use_cache
self.classifier_dropout = classifier_dropout
self.add_lm_hidden_dense_layer = add_lm_hidden_dense_layer
self.num_attention_heads = 1
class MegaOnnxConfig(OnnxConfig):
@property
def inputs(self) -> Mapping[str, Mapping[int, str]]:
if self.task == "multiple-choice":
dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"}
else:
dynamic_axis = {0: "batch", 1: "sequence"}
return OrderedDict(
[
("input_ids", dynamic_axis),
("attention_mask", dynamic_axis),
]
)
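For the default task, the dynamic axes resolve to batch and sequence only. A quick sketch (assuming a transformers version that still ships the Mega module):
```
from transformers import MegaConfig
from transformers.models.mega.configuration_mega import MegaOnnxConfig

onnx_config = MegaOnnxConfig(MegaConfig())
print(dict(onnx_config.inputs))
# {'input_ids': {0: 'batch', 1: 'sequence'}, 'attention_mask': {0: 'batch', 1: 'sequence'}}
```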
.\models\mega\convert_mega_original_pytorch_checkpoint_to_pytorch.py
"""
Convert Mega pretrained checkpoint. Built to convert the Masked LM checkpoint located at
https://huggingface.co/mnaylor/mega-wikitext-103
Requirements:
- clone the Mega repo and install fairseq from there
1. git clone https://github.com/facebookresearch/mega.git
2. cd mega && pip install -e .
- clone the pretrained weights for the original implementation from the hugging face repo
* use this location as the path for pretrained weights
"""
import argparse
import os
import pickle as pkl
import torch
from torch import nn
from transformers import AutoTokenizer, MegaConfig, MegaForMaskedLM
try:
from fairseq.modules.mega_layer import MegaEncoderLayer
except ImportError:
raise ImportError("You need to install the version of fairseq from the Mega repo!")
class MegaLM(nn.Module):
"The base class for our Mega encoder - given input IDs, embed text and return encoder output"
def __init__(self, mega_args, depth, vocab_size):
super().__init__()
self.mega_args = mega_args
self.embedding_layer = nn.Embedding(vocab_size, self.mega_args.encoder_embed_dim)
self.encoders = nn.ModuleList([MegaEncoderLayer(self.mega_args) for _ in range(depth)])
self.depth = depth
def forward(self, input_ids, attention_mask, batch_first=True, ignore_mask_value=0):
"""
Code for a forward pass - expects input_ids and attention_mask to come from a Hugging Face tokenizer as PyTorch
tensors, and returns a tensor of size (batch, n_classes) containing classification logits
Other options:
- batch_first: boolean indicating whether the batch dimension is first in input_ids (default: True, which
aligns with the HF tokenizer behavior)
- ignore_mask_value: the value in attention_mask that identifies tokens that should be ignored (default: 0,
which aligns with HF tokenizer)
"""
if batch_first:
input_ids = input_ids.T
if ignore_mask_value == 0:
attention_mask = 1 - attention_mask
embeds = self.embedding_layer(input_ids)
for encoder in self.encoders:
embeds = encoder(embeds, attention_mask)
if batch_first:
return torch.transpose(embeds, 0, 1)
else:
return embeds
class OriginalMegaForMaskedLM(nn.Module):
"A wrapper class for doing masked language modeling with Mega"
def __init__(self, mega_args, depth, vocab_size):
super().__init__()
self.mega = MegaLM(mega_args, depth, vocab_size)
self.mlm_head = nn.Linear(mega_args.encoder_embed_dim, vocab_size)
self.dropout = nn.Dropout(p=0.1)
def forward(self, input_ids, attention_mask, batch_first=True, ignore_mask_value=0):
"""
Perform a forward pass through the Mega encoder and the MLM head, and return logits for each vocabulary entry.
If `batch_first` is True (the default, matching the Hugging Face tokenizer behavior), the output has shape
(batch size, sequence length, vocab size); otherwise it is (sequence length, batch size, vocab size).
"""
encoder_output = self.mega(input_ids, attention_mask, batch_first, ignore_mask_value)
return self.mlm_head(self.dropout(encoder_output))
def convert_checkpoint_to_huggingface(pretrained_checkpoint_path, output_path, includes_tokenizer):
with open(os.path.join(pretrained_checkpoint_path, "model_args.pkl"), "rb") as f:
mega_original_args = pkl.load(f)
original_mlm = OriginalMegaForMaskedLM(**mega_original_args).eval()
print(
"Original Mega encoder:",
original_mlm.mega.load_state_dict(
torch.load(os.path.join(pretrained_checkpoint_path, "encoder_weights.pt"), map_location="cpu")
),
)
print(
"Original Mega MLM layer:",
original_mlm.mlm_head.load_state_dict(
torch.load(os.path.join(pretrained_checkpoint_path, "mlm_head_weights.pt"), map_location="cpu")
),
)
hf_config = MegaConfig(
num_hidden_layers=mega_original_args["depth"],
vocab_size=mega_original_args["vocab_size"],
hidden_size=mega_original_args["mega_args"].encoder_embed_dim,
shared_representation_size=mega_original_args["mega_args"].encoder_z_dim,
intermediate_size=mega_original_args["mega_args"].encoder_hidden_dim,
ema_projection_size=mega_original_args["mega_args"].encoder_n_dim,
dropout_prob=mega_original_args["mega_args"].dropout,
attention_probs_dropout_prob=mega_original_args["mega_args"].attention_dropout,
hidden_dropout_prob=mega_original_args["mega_args"].hidden_dropout,
activation=mega_original_args["mega_args"].activation_fn,
attention_activation=mega_original_args["mega_args"].attention_activation_fn,
bidirectional=mega_original_args["mega_args"].bidirectional,
use_chunking=mega_original_args["mega_args"].encoder_chunk_size > 0,
chunk_size=mega_original_args["mega_args"].encoder_chunk_size,
truncation=mega_original_args["mega_args"].truncation_length,
normalization_type=mega_original_args["mega_args"].normalization_type,
normalize_before_mega=True,
norm_affine=True,
use_feature_dropout=mega_original_args["mega_args"].feature_dropout,
relative_positional_bias=mega_original_args["mega_args"].rel_pos_bias,
max_positions=mega_original_args["mega_args"].max_source_positions,
nffn_hidden_size=mega_original_args["mega_args"].encoder_ffn_embed_dim,
normalize_before_ffn=mega_original_args["mega_args"].normalize_before,
nffn_activation_dropout_prob=0.0,
add_token_type_embeddings=False,
add_lm_hidden_dense_layer=False,
)
hf_mlm = MegaForMaskedLM(hf_config).eval()
hf_mlm.mega.embedding_layer.word_embeddings.weight = original_mlm.mega.embedding_layer.weight
original_state_dict = original_mlm.mega.encoders.state_dict()
updated_keys = {}
for module_name in original_state_dict.keys():
new_module_name = None
if "beta" in module_name:
if "move.beta" in module_name:
new_module_name = module_name.replace("move.beta", "ema_gate.ema_expansion_matrix")
elif "mega_layer.beta" in module_name:
new_module_name = module_name.replace("beta", "qk_bias")
else:
new_module_name = module_name.replace("beta", "b_param")
elif "gamma" in module_name:
if "move.gamma" in module_name:
new_module_name = module_name.replace("move.gamma", "ema_gate.kernel_projection_matrix")
elif "mega_layer.gamma" in module_name:
new_module_name = module_name.replace("gamma", "qk_weight")
else:
new_module_name = module_name.replace("gamma", "g_param")
elif "move.alpha" in module_name:
new_module_name = module_name.replace("move.alpha", "ema_gate.decay_factor")
elif "move.delta" in module_name:
new_module_name = module_name.replace("move.delta", "ema_gate.damping_factor")
elif "omega" in module_name:
new_module_name = module_name.replace("move.omega", "ema_gate.residual_weight")
if new_module_name:
updated_keys[module_name] = new_module_name
if len(updated_keys) != 0:
print(f"Renaming these keys: {updated_keys.keys()}")
else:
print("No need to rename state dict entries")
for old, new in updated_keys.items():
original_state_dict[new] = original_state_dict.pop(old)
print("HF Mega encoder:", hf_mlm.mega.layers.load_state_dict(original_state_dict))
print(
"HF Mega MLM layer:",
hf_mlm.mlm_head.load_state_dict(
torch.load(os.path.join(pretrained_checkpoint_path, "mlm_head_weights.pt"), map_location="cpu")
),
)
input_ids = torch.randint(0, hf_config.vocab_size, size=(4, 256))
input_mask = torch.ones_like(input_ids)
input_mask[:, -10:] = 0
original_output = original_mlm(input_ids, input_mask, batch_first=True, ignore_mask_value=0)
hf_output = hf_mlm(input_ids, input_mask)[0]
print(f"original output {original_output.shape}")
print(f"hf output {hf_output.shape}")
print(f"max diff: {(original_output - hf_output).max()}")
success = torch.allclose(original_output, hf_output, atol=1e-3)
if success:
print("Yay!")
hf_mlm.save_pretrained(output_path)
else:
raise RuntimeError(f"Something's broken :(\nOriginal:\n{original_output}\n\nHF\n{hf_output}\n{hf_mlm}")
if includes_tokenizer:
print("Transferring tokenizer")
tokenizer = AutoTokenizer.from_pretrained(pretrained_checkpoint_path)
tokenizer.save_pretrained(output_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--pretrained_checkpoint_path",
default=None,
type=str,
required=True,
help="Point to the directory containing your model weights using the official Mega repo",
)
parser.add_argument(
"--output_path", default=None, type=str, required=True, help="Location to save the Hugging Face version"
)
parser.add_argument(
"--includes_tokenizer",
action="store_true",
help="Use this flag if there is a Hugging Face tokenizer in the original checkpoint repo",
)
args = parser.parse_args()
convert_checkpoint_to_huggingface(args.pretrained_checkpoint_path, args.output_path, args.includes_tokenizer)
This is the entry point of the command-line program: it uses the argparse library to parse the arguments above and calls `convert_checkpoint_to_huggingface` to convert the pretrained checkpoint.
.\models\mega\modeling_mega.py
"""
PyTorch MEGA model.
"""
import math
from typing import List, Optional, Tuple, Union
import torch
import torch.nn.functional as F
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN
from ...modeling_outputs import (
BaseModelOutputWithPoolingAndCrossAttentions,
CausalLMOutputWithCrossAttentions,
MaskedLMOutput,
MultipleChoiceModelOutput,
QuestionAnsweringModelOutput,
SequenceClassifierOutput,
TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import ALL_LAYERNORM_LAYERS
from ...utils import (
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_mega import MegaConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "mnaylor/mega-base-wikitext"
_CONFIG_FOR_DOC = "MegaConfig"
MEGA_PRETRAINED_MODEL_ARCHIVE_LIST = [
"mnaylor/mega-base-wikitext",
]
class MegaEmbeddings(nn.Module):
"""
Mega's basic implementation does not incorporate token type embeddings, so this is a stripped-down version of
RoBERTa's embeddings which optionally includes token types
"""
def __init__(self, config: MegaConfig):
super().__init__()
self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
self.use_token_types = config.add_token_type_embeddings
if self.use_token_types:
self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
self.register_buffer(
"token_type_ids", torch.zeros(config.max_positions, dtype=torch.long).expand((1, -1)), persistent=False
)
self.padding_idx = config.pad_token_id
def forward(self, input_ids=None, token_type_ids=None, inputs_embeds=None):
if (input_ids is None) and (inputs_embeds is None):
raise ValueError("Must provide one of input_ids or inputs_embeds")
elif input_ids is not None:
input_shape = input_ids.size()
device = input_ids.device
inputs_embeds = self.word_embeddings(input_ids)
else:
input_shape = inputs_embeds.size()[:-1]
device = inputs_embeds.device
if self.use_token_types:
if token_type_ids is None:
if hasattr(self, "token_type_ids"):
buffered_token_type_ids = self.token_type_ids[:, : input_shape[1]]
buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], input_shape[1])
token_type_ids = buffered_token_type_ids_expanded
else:
token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
token_type_embeddings = self.token_type_embeddings(token_type_ids)
embeddings = inputs_embeds + token_type_embeddings
else:
embeddings = inputs_embeds
return embeddings
class MegaSimpleRelativePositionalBias(nn.Module):
"""
Simple relative positional embeddings copied from the Mega repo; renamed variables for better readability
"""
def __init__(self, config: MegaConfig):
super().__init__()
self.config = config
self.max_positions = self.config.max_positions if self.config.chunk_size < 0 else self.config.chunk_size
self.rel_pos_bias = nn.Parameter(torch.Tensor(2 * config.max_positions - 1))
def forward(self, seq_len):
if seq_len > self.max_positions:
raise ValueError("Sequence length {} going beyond max length {}".format(seq_len, self.max_positions))
bias = self.rel_pos_bias[(self.max_positions - seq_len) : (self.max_positions + seq_len - 1)]
tile = F.pad(bias, (0, seq_len))
tile = torch.tile(tile, (seq_len,))
tile = tile[:-seq_len]
# reshape to (seq_len, 3*seq_len - 2) and keep the centered seq_len columns, so entry (i, j) is the bias at relative offset j - i
tile = tile.view(seq_len, 3 * seq_len - 2)
start = (2 * seq_len - 1) // 2
end = tile.size(1) - start
return tile[:, start:end]
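A toy illustration of the pad-and-tile trick above, with an `arange` standing in for the learned `rel_pos_bias` parameter; entry (i, j) of the final matrix is the bias at relative offset j - i:
```
import torch
import torch.nn.functional as F

seq_len = 3
bias = torch.arange(2 * seq_len - 1, dtype=torch.float)   # stand-in for the learned vector, length 2*L - 1
tile = F.pad(bias, (0, seq_len))                          # length 3*L - 1
tile = torch.tile(tile, (seq_len,))[:-seq_len]            # length L * (3*L - 2)
tile = tile.view(seq_len, 3 * seq_len - 2)
start = (2 * seq_len - 1) // 2
print(tile[:, start : tile.size(1) - start])
# tensor([[2., 3., 4.],
#         [1., 2., 3.],
#         [0., 1., 2.]])
```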
class MegaRotaryRelativePositionalBias(nn.Module):
"""
Rotary relative bias for positional information; similar in concept to RoPE (i.e. RoFormer) but taken from the Mega
repo due to differences in implementation.
When initialized, produces a positional bias which ranges from position 0 to config.max_positions, but can
extrapolate to longer sequences. Can be indexed according to input position IDs
"""
def __init__(self, config: MegaConfig):
super().__init__()
if config.hidden_size % 2 != 0:
raise RuntimeError("Rotary positional bias requires `hidden_size` to be a multiple of 2")
self.config = config
self.embed_dim = config.shared_representation_size
self.max_positions = self.config.max_positions if self.config.chunk_size < 0 else self.config.chunk_size
self.sine, self.cosine = MegaRotaryRelativePositionalBias.get_sinusoid_embeddings(
config.max_positions, self.embed_dim
)
self.alpha = nn.Parameter(torch.Tensor(1, self.embed_dim))
self.b_param = nn.Parameter(torch.Tensor(1, self.embed_dim))
self.register_buffer("_float_tensor", torch.FloatTensor([0.0]))
@staticmethod
def get_sinusoid_embeddings(max_positions: int, embedding_dim: int):
half_dim = embedding_dim // 2
emb = math.log(10000) / half_dim
emb = torch.exp(torch.arange(half_dim, dtype=torch.int64).float() * -emb)
emb = torch.arange(max_positions, dtype=torch.float).unsqueeze(1) * emb.unsqueeze(0)
return torch.sin(emb), torch.cos(emb)
def rotary(self, input):
seq_len, embed_dim = input.size()
chunk_1, chunk_2 = torch.chunk(input, 2, dim=-1)
if self.sine is None or seq_len > self.sine.size(0):
self.sine, self.cosine = MegaRotaryRelativePositionalBias.get_sinusoid_embeddings(seq_len, embed_dim)
self.max_positions = seq_len
self.sine = self.sine.to(self._float_tensor)
self.cosine = self.cosine.to(self._float_tensor)
sin = self.sine[:seq_len]
cos = self.cosine[:seq_len]
return torch.cat([chunk_1 * cos - chunk_2 * sin, chunk_2 * cos + chunk_1 * sin], dim=1)
def forward(self, seq_len):
rotary_alpha = self.rotary(self.alpha.expand(seq_len, self.embed_dim))
rotary_beta = self.rotary(self.b_param.expand(seq_len, self.embed_dim))
bias = torch.einsum("mk,nk->mn", rotary_alpha, rotary_beta)
return bias
class MegaDropout(nn.Module):
"""
A unified class for standard dropout functionality and featurewise dropout.
The original fairseq Mega repo used 2 classes for these, which included some unnecessary handling of training logic
and an unused `inplace` option. The original implementation used torch.nn.functional instead of submodules, which
is retained here as well.
"""
def __init__(self, dropout_probability, is_featurewise=False):
super().__init__()
self.dropout_probability = dropout_probability
self.is_featurewise = is_featurewise
def forward(self, input, batch_first: bool = False):
if self.is_featurewise:
if batch_first:
return F.dropout2d(
input.transpose(-1, -2), p=self.dropout_probability, training=self.training
).transpose(-1, -2)
else:
if input.dim() != 3:
raise ValueError(
"Feature dropout inputs must be exactly 3-dimensional if inputs are ordered [sequence length, batch size, hidden dimension]"
)
return F.dropout2d(input.permute(1, 2, 0), p=self.dropout_probability, training=self.training).permute(
2, 0, 1
)
else:
return F.dropout(input, p=self.dropout_probability, training=self.training)
class MegaRMSNorm(nn.Module):
"""
RMSNorm used in Mega implementation. Differs from T5's RMSNorm by applying the weight prior to taking the square
root (as opposed to after in T5)
"""
def __init__(self, number_features, eps=1e-6, affine=True):
super().__init__()
self.num_features = number_features
self.eps = eps
self.affine = affine
if affine:
self.weight = nn.Parameter(torch.Tensor(self.num_features))
else:
self.register_parameter("weight", None)
def forward(self, input):
mean_square = torch.mean(torch.square(input), dim=-1, keepdim=True)
if self.weight is not None:
input = input * self.weight
input = input * torch.rsqrt(mean_square + self.eps)
return input
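Written out, the layer computes y = (x * w) / sqrt(mean(x^2) + eps), with the mean square taken on the unscaled input and the weight applied before the inverse square root (unlike T5). A minimal equivalent sketch:
```
import torch

x = torch.randn(2, 4)
weight = torch.ones(4)   # stand-in for the learned weight vector
eps = 1e-6
y = (x * weight) * torch.rsqrt(torch.mean(x * x, dim=-1, keepdim=True) + eps)
print(y.shape)  # torch.Size([2, 4])
```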
class MegaScaleNorm(nn.Module):
"""
Scale normalization introduced in MEGA which is similar to RMSNorm, but uses a single parameter for scalar
multiplication instead of a vector, and applies over a specified dimension
"""
def __init__(self, dim, eps=1e-6, affine=True):
super().__init__()
self.dim = dim
self.eps = eps
self.affine = affine
if affine:
self.scalar = nn.Parameter(torch.Tensor(1))
else:
self.register_parameter("scalar", None)
def forward(self, input):
mean_square = torch.mean(torch.square(input), dim=self.dim, keepdim=True)
if self.scalar is not None:
input = self.scalar * input
output = input * torch.rsqrt(mean_square + self.eps)
return output
"""
A wrapper class for various layer normalization options used in Mega. Used to handle differences in expectations on
input axis locations for different normalization methods.
"""
def __init__(self, norm_type, embedding_dim, eps=1e-5, affine=True, export=False):
super().__init__()
if norm_type == "layernorm":
self.norm = nn.LayerNorm(embedding_dim, eps, elementwise_affine=affine)
elif norm_type == "scalenorm":
self.norm = MegaScaleNorm(dim=-1, eps=eps, affine=affine)
elif norm_type == "rmsnorm":
self.norm = MegaRMSNorm(embedding_dim, eps=eps, affine=affine)
elif norm_type == "batchnorm":
self.norm = nn.BatchNorm1d(embedding_dim, eps=eps, affine=affine)
elif norm_type == "syncbatchnorm":
self.norm = nn.SyncBatchNorm(embedding_dim, eps=eps, affine=affine)
else:
raise ValueError("Unknown norm type: {}".format(norm_type))
def forward(self, input):
if isinstance(self.norm, nn.modules.batchnorm._BatchNorm):
if input.dim() != 3:
raise ValueError("BatchNorm inputs must be exactly 3-dimensional")
input = input.permute(1, 2, 0)
input = self.norm(input)
return input.permute(2, 0, 1)
else:
return self.norm(input)
ALL_LAYERNORM_LAYERS.append(MegaSequenceNorm)
class MegaMultiDimensionDampedEma(nn.Module):
"""
Mega's Exponential Moving Average layer, largely left unmodified from the original repo with the exception of
variable names and moving away from the stateful representation of incremental decoding state. See
"https://arxiv.org/abs/2209.10655" for more details.
"""
def __init__(self, config: MegaConfig):
super().__init__()
self.config = config
self.embed_dim = config.hidden_size
self.ndim = config.ema_projection_size
self.bidirectional = config.bidirectional
self.truncation = config.truncation
self.scale = math.sqrt(1.0 / self.ndim)
kernel_dim = 2 * config.hidden_size if self.bidirectional else config.hidden_size
self.damping_factor = nn.Parameter(torch.Tensor(kernel_dim, self.ndim, 1))
self.decay_factor = nn.Parameter(torch.Tensor(kernel_dim, self.ndim, 1))
self.ema_expansion_matrix = nn.Parameter(torch.Tensor(kernel_dim, self.ndim, 1))
self.kernel_projection_matrix = nn.Parameter(torch.Tensor(kernel_dim, self.ndim))
self.residual_weight = nn.Parameter(torch.Tensor(config.hidden_size))
self._kernel = None
self._coeffs = None
def _compute_ema_coefficients(self):
self._coeffs = None
damping_factor = torch.sigmoid(self.damping_factor)
decay_factor = torch.sigmoid(self.decay_factor)
previous_timestep_weight = 1.0 - damping_factor * decay_factor
return damping_factor, previous_timestep_weight
def _compute_efficient_ema_kernel(self, length: int):
self._kernel = None
damping_factor, previous_timestep_weight = self._compute_ema_coefficients()
vander = torch.arange(length).to(damping_factor).view(1, 1, length) * torch.log(previous_timestep_weight)
kernel = (damping_factor * self.ema_expansion_matrix) * torch.exp(vander)
return torch.einsum("dnl,dn->dl", kernel, self.kernel_projection_matrix * self.scale)
def get_ema_coefficients(self):
if self.training:
return self._compute_ema_coefficients()
else:
if self._coeffs is None:
self._coeffs = self._compute_ema_coefficients()
return self._coeffs
def get_ema_kernel(self, length: int):
kernel_size = length if self.truncation is None else min(self.truncation, length)
if self.training:
return self._compute_efficient_ema_kernel(kernel_size)
else:
if self._kernel is None or self._kernel.size(-1) < kernel_size:
self._kernel = self._compute_efficient_ema_kernel(kernel_size)
return self._kernel[..., :kernel_size]
def fft_convolution(self, inputs, kernel, length):
inputs_fft = torch.fft.rfft(inputs.float(), n=2 * length)
kernel_fft = torch.fft.rfft(kernel.float(), n=2 * length)
convolved_sequence = torch.fft.irfft(inputs_fft * kernel_fft, n=2 * length)
return convolved_sequence
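Multiplying the length-2L real FFTs implements a zero-padded linear convolution, so the first `length` outputs equal a causal convolution of the input with the EMA kernel. A small self-contained check with toy shapes:
```
import torch

length = 5
x = torch.randn(1, 1, length)   # (batch, dim, length)
k = torch.randn(1, length)      # (dim, length) EMA kernel
x_fft = torch.fft.rfft(x.float(), n=2 * length)
k_fft = torch.fft.rfft(k.float(), n=2 * length)
fft_out = torch.fft.irfft(x_fft * k_fft, n=2 * length)[..., :length]
# Explicit causal convolution: y[t] = sum_{s <= t} k[t - s] * x[s]
direct = torch.stack([(k[0, : t + 1].flip(0) * x[0, 0, : t + 1]).sum() for t in range(length)])
print(torch.allclose(fft_out[0, 0], direct, atol=1e-4))  # True
```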
def ema_step(self, inputs, length, past_state=None):
if length == 1:
return self.one_ema_step(inputs, past_state=past_state)
damping_factor, previous_timestep_weight = self.get_ema_coefficients()
vander = torch.arange(length + 1).to(damping_factor).view(1, 1, length + 1) * torch.log(
previous_timestep_weight
)
vander = torch.exp(vander)
if past_state is not None:
past_ema_proj = vander[:, :, 1:] * (self.kernel_projection_matrix * self.scale).unsqueeze(-1)
past_ema_state = torch.einsum("bdn,dnl->bdl", past_state, past_ema_proj)
past_vandermonde = vander[:, :, -1] * past_state
else:
past_ema_state = None
past_vandermonde = None
vander = vander[:, :, :-1]
kernel = (damping_factor * self.ema_expansion_matrix) * vander
kernel_proj = torch.einsum("dnl,dn->dl", kernel, self.kernel_projection_matrix * self.scale)
ema_output = self.fft_convolution(inputs, kernel_proj, length=length)[..., 0:length]
ema_output = ema_output.type_as(inputs)
if past_ema_state is not None:
ema_output = ema_output + past_ema_state
updated_hidden_state = torch.einsum("bdl,dnl->bdn", inputs, torch.flip(kernel, dims=[2]))
if past_vandermonde is not None:
updated_hidden_state = updated_hidden_state + past_vandermonde
return ema_output.permute(2, 0, 1), updated_hidden_state
def one_ema_step(self, inputs, past_state=None):
damping_factor, previous_timestep_weight = self.get_ema_coefficients()
updated_state = (damping_factor * self.ema_expansion_matrix).squeeze(-1) * inputs
if past_state is not None:
updated_state = updated_state + previous_timestep_weight.squeeze(-1) * past_state
out = torch.einsum("bdn,dn->bd", updated_state, self.kernel_projection_matrix * self.scale)
return out.unsqueeze(0), updated_state
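A toy restatement of the single-step recurrence implemented above, h_t = (alpha * beta) x_t + (1 - alpha * delta) h_{t-1}, followed by the projection back to the model dimension (all shapes hypothetical):
```
import torch

batch, dim, ndim = 1, 4, 2
x_t = torch.randn(batch, dim, 1)         # current input step
h_prev = torch.zeros(batch, dim, ndim)   # previous EMA hidden state
alpha = torch.rand(dim, ndim)            # sigmoid(damping_factor)
delta = torch.rand(dim, ndim)            # sigmoid(decay_factor)
beta = torch.randn(dim, ndim)            # ema_expansion_matrix
eta = torch.randn(dim, ndim)             # kernel_projection_matrix * scale
h_t = (alpha * beta) * x_t + (1 - alpha * delta) * h_prev
y_t = torch.einsum("bdn,dn->bd", h_t, eta)
print(y_t.shape)  # torch.Size([1, 4])
```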
class MegaGatedCrossAttention(nn.Module):
"""
Gated Structured State Attention for use in encoder-decoder model. See Mega paper for more details. Only
modifications from original implementation are variable names, removing the unnecessary `before_attn_fn` and
`static_kv` arguments, and the stateful representation of incremental decoder state.
"""
def __init__(self, config: MegaConfig):
super().__init__()
self.config = config
self.activation = ACT2FN[self.config.activation]
self.attention_activation = self.config.attention_activation
self.scaling = self.config.shared_representation_size**-0.5 if self.attention_activation == "softmax" else None
self.dropout = MegaDropout(self.config.dropout_prob, is_featurewise=self.config.use_feature_dropout)
self.hidden_dropout = MegaDropout(
self.config.hidden_dropout_prob, is_featurewise=self.config.use_feature_dropout
)
self.attention_dropout = MegaDropout(self.config.attention_probs_dropout_prob, is_featurewise=False)
self.prenorm = self.config.normalize_before_mega
self.norm = MegaSequenceNorm(
self.config.normalization_type, self.config.hidden_size, affine=self.config.norm_affine
)
self.k_proj = nn.Linear(self.config.hidden_size, self.config.shared_representation_size)
self.v_proj = nn.Linear(self.config.hidden_size, self.config.hidden_size)
self.q_proj = nn.Linear(
self.config.hidden_size, 2 * self.config.hidden_size + self.config.shared_representation_size
)
self.h_proj = nn.Linear(self.config.hidden_size, self.config.hidden_size)
if self.config.relative_positional_bias == "simple":
self.rel_pos_bias = MegaSimpleRelativePositionalBias(config)
elif self.config.relative_positional_bias == "rotary":
self.rel_pos_bias = MegaRotaryRelativePositionalBias(config)
else:
raise ValueError("unknown relative position bias: {}".format(self.config.relative_positional_bias))
self.softmax = nn.Softmax(dim=-1)
def element_attention(self, query, key, key_padding_mask, pidx):
bsz, src_len, _ = key.size()
tgt_len = query.size(1) if pidx is None else pidx + 1
if key_padding_mask is not None:
lengths = key_padding_mask.sum(dim=-1).view(bsz, 1, 1)
else:
lengths = src_len
bias = self.rel_pos_bias(max(tgt_len, src_len))[:, :src_len]
if pidx is not None:
if query.size(1) != 1:
raise ValueError("Position offset provided with queries longer than 1 token")
bias = bias[pidx]
else:
bias = bias[:tgt_len]
qk = torch.bmm(query, key.transpose(1, 2)) / lengths + bias
attn_weights = ACT2FN[self.attention_activation](qk).type_as(qk)
if key_padding_mask is not None:
attn_weights = attn_weights * key_padding_mask.unsqueeze(1)
return attn_weights
def softmax_attention(self, query, key, key_padding_mask, pidx):
bsz, src_len, _ = key.size()
tgt_len = query.size(1) if pidx is None else pidx + 1
bias = self.rel_pos_bias(max(tgt_len, src_len))[:, :src_len]
if pidx is not None:
if query.size(1) != 1:
raise ValueError("Position offset provided with queries longer than 1 token")
bias = bias[pidx]
else:
bias = bias[:tgt_len]
query = query * self.scaling
qk = torch.bmm(query, key.transpose(1, 2)) + bias
if key_padding_mask is not None:
qk = qk.masked_fill((1 - key_padding_mask).unsqueeze(1).to(torch.bool), float("-inf"))
attn_weights = self.softmax(qk).type_as(qk)
return attn_weights
def forward(
self,
query,
key: Optional[torch.Tensor],
value: Optional[torch.Tensor],
key_padding_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[Tuple[torch.Tensor]] = None,
output_attentions: bool = False,
use_cache: bool = False,
"""
Pure PyTorch implementation of Mega block; see https://arxiv.org/abs/2209.10655 and original fairseq implementation
at https://github.com/facebookresearch/mega (copyright Meta Research, licensed under MIT License)
Differences from original implementation include hidden state refactor and fixed inconsistency with additive /
multiplicative attention masks
"""
def __init__(self, config: MegaConfig):
super().__init__()
self.config = config
self.activation = ACT2FN[self.config.activation]
self.scaling = (
self.config.shared_representation_size**-0.5 if self.config.attention_activation == "softmax" else None
)
self.dropout = MegaDropout(self.config.dropout_prob, is_featurewise=self.config.use_feature_dropout)
self.hidden_dropout = MegaDropout(
self.config.hidden_dropout_prob, is_featurewise=self.config.use_feature_dropout
)
self.attention_dropout = MegaDropout(self.config.attention_probs_dropout_prob, is_featurewise=False)
self.norm = MegaSequenceNorm(
self.config.normalization_type, self.config.hidden_size, affine=self.config.norm_affine
)
self.ema_gate = MegaMultiDimensionDampedEma(config)
self.v_proj = nn.Linear(self.config.hidden_size, self.config.intermediate_size)
self.mx_proj = nn.Linear(
self.config.hidden_size,
self.config.shared_representation_size + self.config.intermediate_size + 2 * self.config.hidden_size,
)
self.h_proj = nn.Linear(self.config.intermediate_size, self.config.hidden_size)
self.qk_weight = nn.Parameter(torch.Tensor(2, self.config.shared_representation_size))
self.qk_bias = nn.Parameter(torch.Tensor(2, self.config.shared_representation_size))
if self.config.relative_positional_bias == "simple":
self.rel_pos_bias = MegaSimpleRelativePositionalBias(config)
elif self.config.relative_positional_bias == "rotary":
self.rel_pos_bias = MegaRotaryRelativePositionalBias(config)
else:
raise ValueError(f"Unknown relative positional bias: {self.config.relative_positional_bias}")
self.softmax = nn.Softmax(dim=-1)
self.attention_function = (
self.softmax_attention if self.config.attention_activation == "softmax" else self.element_attention
)
def element_attention(self, query, key, padding_mask, causal_mask):
"""
Apply element-wise attention via relu^2 or laplace. Same as original implementation but with standardized
causal attention mask. Expects the Hugging Face standard attention mask paradigm: 1 for not masked, and 0 for
masked.
"""
seq_len = key.size(2)
if padding_mask is not None:
lengths = padding_mask.sum(-1, keepdim=True)
lengths = lengths.clamp(min=1.0).unsqueeze(-1)
else:
lengths = seq_len
if causal_mask is not None:
lengths = causal_mask.sum(dim=-1, keepdim=True)
bias = self.rel_pos_bias(seq_len)
if seq_len != query.size(2):
if query.size(2) != 1:
raise ValueError("Size mismatch between Q and K in element attention")
bias = bias[-1:]
qk = torch.matmul(query, key.transpose(2, 3)) / lengths + bias
attn_weights = ACT2FN[self.config.attention_activation](qk).type_as(qk)
if padding_mask is not None:
attn_weights = attn_weights * padding_mask.unsqueeze(2)
if causal_mask is not None:
attn_weights = attn_weights * causal_mask
return attn_weights
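A rough sketch of the element-wise attention normalization used above, computing squared-ReLU ("relu2") attention by hand; the shapes and the explicit `torch.relu(...) ** 2` are illustrative assumptions, since the model actually looks the activation up in `ACT2FN`:

import torch

# toy shapes: (batch, chunks, positions, dim) as used inside the Mega attention block
query = torch.randn(1, 1, 4, 8)
key = torch.randn(1, 1, 4, 8)
padding_mask = torch.tensor([[[1.0, 1.0, 1.0, 0.0]]])   # (batch, chunks, src): 1 = keep, 0 = pad

# normalize scores by the number of real tokens instead of sqrt(d), as element_attention does
lengths = padding_mask.sum(-1, keepdim=True).clamp(min=1.0).unsqueeze(-1)   # (1, 1, 1, 1)
qk = torch.matmul(query, key.transpose(2, 3)) / lengths                     # (1, 1, 4, 4)

attn_weights = torch.relu(qk) ** 2                        # squared-ReLU ("relu2") attention
attn_weights = attn_weights * padding_mask.unsqueeze(2)   # multiplicatively zero out padded keys
print(attn_weights[..., -1])                              # the padded key column is all zeros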
def softmax_attention(self, query, key, padding_mask, causal_mask):
"Standard softmax self-attention, as in the original Transformer paper"
seq_len = key.size(2)
bias = self.rel_pos_bias(seq_len)
if seq_len != query.size(2):
if query.size(2) != 1:
raise ValueError("Size mismatch between Q and K in softmax attention")
bias = bias[-1:]
query = query * self.scaling
qk = torch.matmul(query, key.transpose(2, 3)) + bias
if causal_mask is not None:
additive_causal_mask = torch.zeros_like(causal_mask, dtype=qk.dtype)
additive_causal_mask = additive_causal_mask.masked_fill((1 - causal_mask).bool(), float("-inf"))
qk = qk + additive_causal_mask
if padding_mask is not None:
padding_mask = 1 - padding_mask
padding_mask_all = padding_mask.all(dim=-1, keepdim=True)
padding_mask = torch.logical_and(padding_mask, ~padding_mask_all)
qk = qk.masked_fill(padding_mask.unsqueeze(2).to(torch.bool), float("-inf"))
attn_weights = self.softmax(qk).type_as(qk)
return attn_weights
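The causal/padding handling above is exactly the inconsistency the class docstring says was fixed: the `[0, 1]` causal mask becomes an additive `-inf` mask, and sequences whose keys are all padding are skipped so the softmax stays finite. A small standalone sketch with invented sizes:

import torch

# HF convention: 1 = real token / visible, 0 = padding / hidden
causal_mask = torch.tril(torch.ones(3, 3, dtype=torch.long))
padding_mask = torch.tensor([[[1, 1, 0]],
                             [[0, 0, 0]]])               # (batch=2, chunks=1, src=3); 2nd row all padding
qk = torch.zeros(2, 1, 3, 3)                             # stand-in attention scores

# multiplicative 0/1 causal mask -> additive -inf mask
additive_causal_mask = torch.zeros_like(causal_mask, dtype=qk.dtype)
additive_causal_mask = additive_causal_mask.masked_fill((1 - causal_mask).bool(), float("-inf"))
qk = qk + additive_causal_mask

# invert the padding mask to "1 = masked", but skip rows that would become entirely -inf
inverted = 1 - padding_mask
all_masked = inverted.all(dim=-1, keepdim=True)
inverted = torch.logical_and(inverted.bool(), ~all_masked)
qk = qk.masked_fill(inverted.unsqueeze(2), float("-inf"))

attn = torch.softmax(qk, dim=-1)
assert not torch.isnan(attn).any()                       # the all-padding sequence stays finite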
def forward(
self,
input,
padding_mask: Optional[torch.Tensor] = None,
causal_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[Tuple[torch.Tensor]] = None,
output_attentions=False,
use_cache=False,
class MegaNormalizedFeedForwardNetwork(nn.Module):
"""
Normalized feed-forward network used in Mega blocks. Left as-is from original Mega repo aside from retrieving args
from Hugging Face config
"""
def __init__(self, config: MegaConfig):
super().__init__()
self.config = config
self.hidden_dim = config.nffn_hidden_size
self.act_fn = config.activation
self.activation = ACT2FN[config.activation]
self.dropout = MegaDropout(self.config.dropout_prob, is_featurewise=self.config.use_feature_dropout)
self.hidden_dropout = MegaDropout(
self.config.nffn_activation_dropout_prob, is_featurewise=self.config.use_feature_dropout
)
self.prenorm = self.config.normalize_before_ffn
self.norm = MegaSequenceNorm(
self.config.normalization_type, self.config.hidden_size, affine=self.config.norm_affine
)
self.fc1 = nn.Linear(self.config.hidden_size, self.config.nffn_hidden_size)
self.fc2 = nn.Linear(self.config.nffn_hidden_size, self.config.hidden_size)
def forward(self, inputs):
residual = inputs
if self.prenorm:
inputs = self.norm(inputs)
hidden = self.activation(self.fc1(inputs))
hidden = self.hidden_dropout(hidden)
output = self.fc2(hidden)
output = self.dropout(output)
output = output + residual
if not self.prenorm:
output = self.norm(output)
return output
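A reduced sketch of the same residual pattern, showing how `normalize_before_ffn` toggles between pre-norm and post-norm; the sizes and the use of `nn.LayerNorm` as a stand-in for `MegaSequenceNorm` are assumptions made for brevity:

import torch
from torch import nn

class TinyNormalizedFFN(nn.Module):
    def __init__(self, hidden_size=8, ffn_size=16, prenorm=True):
        super().__init__()
        self.prenorm = prenorm
        self.norm = nn.LayerNorm(hidden_size)        # stand-in for MegaSequenceNorm
        self.fc1 = nn.Linear(hidden_size, ffn_size)
        self.fc2 = nn.Linear(ffn_size, hidden_size)

    def forward(self, inputs):
        residual = inputs
        if self.prenorm:                              # pre-norm: normalize before the FFN
            inputs = self.norm(inputs)
        output = self.fc2(torch.relu(self.fc1(inputs)))
        output = output + residual                    # residual connection in both variants
        if not self.prenorm:                          # post-norm: normalize after the residual add
            output = self.norm(output)
        return output

x = torch.randn(2, 5, 8)
print(TinyNormalizedFFN(prenorm=False)(x).shape)      # torch.Size([2, 5, 8])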
class MegaBlock(nn.Module):
def __init__(self, config: MegaConfig):
super().__init__()
self.seq_len_dim = 1
self.mega_layer = MegaMovingAverageGatedAttention(config)
self.nffn = MegaNormalizedFeedForwardNetwork(config) if config.use_normalized_ffn else None
self.is_decoder = config.is_decoder
self.add_cross_attention = config.add_cross_attention
if self.add_cross_attention:
if not self.is_decoder:
raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
self.cross_attn = MegaGatedCrossAttention(config)
else:
self.cross_attn = None
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.LongTensor] = None,
causal_mask: Optional[torch.LongTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_value: Optional[Tuple[torch.FloatTensor]] = None,
output_attentions: Optional[bool] = False,
use_cache: bool = False,
class MegaPooler(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.activation = nn.Tanh()
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
first_token_tensor = hidden_states[:, 0]
pooled_output = self.dense(first_token_tensor)
pooled_output = self.activation(pooled_output)
return pooled_output
class MegaPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = MegaConfig
base_model_prefix = "mega"
supports_gradient_checkpointing = False
_no_split_modules = ["MegaMovingAverageGatedAttention"]
def _init_weights(self, module):
"""Initialize the weights"""
if isinstance(module, MegaMultiDimensionDampedEma):
with torch.no_grad():
nn.init.normal_(module.damping_factor, mean=0.0, std=self.config.ema_delta_alpha_range)
nn.init.normal_(module.decay_factor, mean=0.0, std=self.config.ema_delta_alpha_range)
val = torch.ones(self.config.ema_projection_size, 1)
if self.config.ema_projection_size > 1:
idx = torch.tensor(list(range(1, self.config.ema_projection_size, 2)))
val.index_fill_(0, idx, -1.0)
module.ema_expansion_matrix.normal_(mean=0.0, std=self.config.ema_beta_range).add_(val)
nn.init.normal_(module.kernel_projection_matrix, mean=0.0, std=self.config.ema_gamma_omega_range)
nn.init.normal_(module.residual_weight, mean=0.0, std=self.config.ema_gamma_omega_range)
elif isinstance(module, MegaSimpleRelativePositionalBias):
nn.init.normal_(module.rel_pos_bias, mean=0.0, std=self.config.initializer_range)
elif isinstance(module, MegaRotaryRelativePositionalBias):
nn.init.normal_(module.alpha, mean=0.0, std=self.config.initializer_range)
nn.init.normal_(module.b_param, mean=0.0, std=self.config.initializer_range)
elif isinstance(module, MegaScaleNorm):
if self.config.norm_affine:
nn.init.constant_(module.scalar, 1.0)
elif isinstance(module, MegaRMSNorm):
if self.config.norm_affine:
nn.init.constant_(module.weight, 1.0)
elif isinstance(module, MegaMovingAverageGatedAttention):
nn.init.normal_(module.qk_weight, mean=0.0, std=self.config.initializer_range)
nn.init.constant_(module.qk_bias, 0.0)
elif isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
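The EMA branch of `_init_weights` centers `ema_expansion_matrix` on a vector of alternating `+1`/`-1` values built with `index_fill_`; a standalone sketch of just that pattern, with a made-up size and std:

import torch

ema_projection_size = 6
val = torch.ones(ema_projection_size, 1)
# flip the sign at every odd index, exactly as _init_weights does for the EMA expansion matrix
idx = torch.tensor(list(range(1, ema_projection_size, 2)))
val.index_fill_(0, idx, -1.0)
print(val.squeeze(-1))          # tensor([ 1., -1.,  1., -1.,  1., -1.])

# the parameter itself is then Gaussian noise centered on this alternating +1/-1 pattern
ema_expansion = torch.empty(ema_projection_size, 1).normal_(mean=0.0, std=0.02).add_(val)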
MEGA_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`MegaConfig`]): Model configuration class with all the parameters of the
model. Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
MEGA_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `({0})`):
Indices of input sequence tokens in the vocabulary.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values are selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
Segment token indices to indicate the first and second portions of the inputs. Indices are selected in `[0, 1]`:
- 0 corresponds to a *sentence A* token,
- 1 corresponds to a *sentence B* token.
This parameter can only be used when the model is initialized with `add_token_type_embeddings` set to `True`. All values in this tensor should always be < config.type_vocab_size.
[What are token type IDs?](../glossary#token-type-ids)
inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@add_start_docstrings(
"The bare MEGA Model transformer outputting raw hidden-states without any specific head on top.",
MEGA_START_DOCSTRING,
)
class MegaModel(MegaPreTrainedModel):
"""
The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
cross-attention is added after self-attention, following the architecture described in *Mega: Moving Average
Equipped Gated Attention*_ by Xuezhe Ma, Chunting Zhou, Xiang Kong, Junxian He, Liangke Gui, Graham Neubig,
Jonathan May, and Luke Zettlemoyer
To behave as a decoder the model needs to be initialized with the `is_decoder` argument of the configuration set to
`True` and `bidirectional` set to `False`. To be used in a Seq2Seq model, the model needs to be initialized with both
`is_decoder=True` and `bidirectional=False` argument as well as `add_cross_attention` set to `True`; an
`encoder_hidden_states` is then expected as an input to the forward pass.
.. _*Mega: Moving Average Equipped Gated Attention*: https://arxiv.org/abs/2209.10655
"""
def __init__(self, config: MegaConfig, add_pooling_layer=True):
super().__init__(config)
self.config = config
self.embedding_layer = MegaEmbeddings(config)
self.layers = nn.ModuleList([MegaBlock(config) for _ in range(config.num_hidden_layers)])
self.pooler = MegaPooler(config) if add_pooling_layer else None
self.post_init()
def get_input_embeddings(self):
return self.embedding_layer.word_embeddings
def set_input_embeddings(self, value):
self.embedding_layer.word_embeddings = value
@add_start_docstrings_to_model_forward(MEGA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPoolingAndCrossAttentions,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
@add_start_docstrings(
"""MEGA Model with a `language modeling` head on top for CLM fine-tuning.""", MEGA_START_DOCSTRING
)
class MegaForCausalLM(MegaPreTrainedModel):
_tied_weights_keys = ["lm_head.weight"]
def __init__(self, config: MegaConfig):
super().__init__(config)
if not config.is_decoder:
logger.warning("If you want to use `MegaForCausalLM` as a standalone, add `is_decoder=True.`")
self.mega = MegaModel(config, add_pooling_layer=False)
if config.add_lm_hidden_dense_layer:
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.hidden_activation = nn.Tanh()
else:
self.dense = None
self.hidden_activation = None
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size)
self.post_init()
def get_output_embeddings(self):
return self.lm_head
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
past_key_values: Tuple[Tuple[torch.FloatTensor]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, **model_kwargs):
input_shape = input_ids.shape
if attention_mask is None:
attention_mask = input_ids.new_ones(input_shape)
if past_key_values is not None:
input_ids = input_ids[:, -1:]
return {"input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": past_key_values}
def _reorder_cache(self, past_key_values, beam_idx):
reordered_past = ()
for layer_past in past_key_values:
reordered_past += (
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
)
return reordered_past
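A toy illustration of what `_reorder_cache` does during beam search: each cached tensor is re-indexed along the batch dimension so it follows the surviving beams (the shapes and beam indices below are invented):

import torch

# one layer's cache, e.g. (self-attn key, self-attn value), batch dimension first
layer_past = (torch.arange(6.).reshape(3, 2), torch.arange(6., 12.).reshape(3, 2))
past_key_values = (layer_past,)

beam_idx = torch.tensor([2, 0, 0])     # beams 2 and 0 survived; beam 0 was duplicated

reordered = tuple(
    tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past)
    for layer_past in past_key_values
)
print(reordered[0][0])                 # the key cache rows now follow the new beam order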
@add_start_docstrings("""MEGA Model with a `language modeling` head on top.""", MEGA_START_DOCSTRING)
class MegaForMaskedLM(MegaPreTrainedModel):
_tied_weights_keys = ["mlm_head.weight"]
def __init__(self, config: MegaConfig):
super().__init__(config)
if config.is_decoder:
logger.warning(
"If you want to use `MegaForMaskedLM`, set `config.is_decoder=False` for "
"bi-directional self-attention."
)
self.mega = MegaModel(config, add_pooling_layer=False)
if config.add_lm_hidden_dense_layer:
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.hidden_activation = nn.Tanh()
else:
self.dense = None
self.hidden_activation = None
self.mlm_head = nn.Linear(config.hidden_size, config.vocab_size)
self.dropout = nn.Dropout(config.dropout_prob)
self.post_init()
def get_output_embeddings(self):
return self.mlm_head
def set_output_embeddings(self, new_embeddings):
self.mlm_head = new_embeddings
@add_start_docstrings_to_model_forward(MEGA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=MaskedLMOutput,
config_class=_CONFIG_FOR_DOC,
mask="<mask>",
expected_output="' Paris'",
expected_loss=0.1,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], MaskedLMOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
kwargs (`Dict[str, any]`, optional, defaults to *{}*):
Used to hide legacy arguments that have been deprecated.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.mega(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
inputs_embeds=inputs_embeds,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
if self.dense is not None:
sequence_output = self.dense(sequence_output)
sequence_output = self.hidden_activation(sequence_output)
prediction_scores = self.mlm_head(sequence_output)
masked_lm_loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
if not return_dict:
output = (prediction_scores,) + outputs[2:]
return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
return MaskedLMOutput(
loss=masked_lm_loss,
logits=prediction_scores,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
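A hedged usage sketch for the masked-LM head above; the checkpoint name is an assumption (any MEGA checkpoint with an MLM head and a `<mask>` token would do) and downloading it requires network access:

import torch
from transformers import AutoTokenizer, MegaForMaskedLM

checkpoint = "mnaylor/mega-base-wikitext"   # assumed checkpoint name, used here only for illustration
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = MegaForMaskedLM.from_pretrained(checkpoint)

inputs = tokenizer("The capital of France is <mask>.", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits

mask_index = (inputs.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]
predicted_id = logits[0, mask_index].argmax(-1)
print(tokenizer.decode(predicted_id))       # prediction for the masked position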
@add_start_docstrings(
"""
MEGA Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
output) e.g. for GLUE tasks.
""",
MEGA_START_DOCSTRING,
)
class MegaForSequenceClassification(MegaPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.config = config
self.mega = MegaModel(config, add_pooling_layer=False)
self.classifier = MegaClassificationHead(config)
self.post_init()
@add_start_docstrings_to_model_forward(MEGA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=SequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.mega(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
logits = self.classifier(sequence_output)
loss = None
if labels is not None:
if self.config.problem_type is None:
if self.num_labels == 1:
self.config.problem_type = "regression"
elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
self.config.problem_type = "single_label_classification"
else:
self.config.problem_type = "multi_label_classification"
if self.config.problem_type == "regression":
loss_fct = MSELoss()
if self.num_labels == 1:
loss = loss_fct(logits.squeeze(), labels.squeeze())
else:
loss = loss_fct(logits, labels)
elif self.config.problem_type == "single_label_classification":
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(logits, labels)
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return SequenceClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
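The loss branch above infers `problem_type` from `num_labels` and the label dtype. A compact sketch of the same decision table in isolation; the helper function is illustrative and not part of the model:

import torch
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

def sequence_classification_loss(logits, labels, num_labels):
    # mirrors the problem_type inference used in MegaForSequenceClassification.forward
    if num_labels == 1:                                               # regression
        return MSELoss()(logits.squeeze(), labels.squeeze().float())
    if labels.dtype in (torch.long, torch.int):                       # single-label classification
        return CrossEntropyLoss()(logits.view(-1, num_labels), labels.view(-1))
    return BCEWithLogitsLoss()(logits, labels)                        # multi-label classification

logits = torch.randn(4, 3)
print(sequence_classification_loss(logits, torch.tensor([0, 2, 1, 1]), num_labels=3))
print(sequence_classification_loss(logits, torch.randint(0, 2, (4, 3)).float(), num_labels=3))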
@add_start_docstrings(
"""
MEGA Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
softmax) e.g. for RocStories/SWAG tasks.
""",
MEGA_START_DOCSTRING,
)
class MegaForMultipleChoice(MegaPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.mega = MegaModel(config)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, 1)
self.post_init()
@add_start_docstrings_to_model_forward(MEGA_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=MultipleChoiceModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], MultipleChoiceModelOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
`input_ids` above)
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
flat_inputs_embeds = (
inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
if inputs_embeds is not None
else None
)
outputs = self.mega(
flat_input_ids,
token_type_ids=flat_token_type_ids,
attention_mask=flat_attention_mask,
inputs_embeds=flat_inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
pooled_output = outputs[1]
pooled_output = self.dropout(pooled_output)
logits = self.classifier(pooled_output)
reshaped_logits = logits.view(-1, num_choices)
loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
loss = loss_fct(reshaped_logits, labels)
if not return_dict:
output = (reshaped_logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return MultipleChoiceModelOutput(
loss=loss,
logits=reshaped_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
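The multiple-choice head flattens `(batch, num_choices, seq_len)` inputs into `(batch * num_choices, seq_len)` before the encoder call and folds the per-choice scores back afterwards; a shape-only sketch with invented sizes:

import torch

batch_size, num_choices, seq_len, hidden = 2, 4, 7, 16
input_ids = torch.randint(0, 100, (batch_size, num_choices, seq_len))

flat_input_ids = input_ids.view(-1, input_ids.size(-1))          # (8, 7): one row per choice
pooled_output = torch.randn(flat_input_ids.size(0), hidden)      # stand-in for the pooler output
logits = torch.nn.Linear(hidden, 1)(pooled_output)               # one score per (example, choice)
reshaped_logits = logits.view(-1, num_choices)                   # back to (2, 4) for CrossEntropyLoss
print(reshaped_logits.shape)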
"""
MEGA Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
Named-Entity-Recognition (NER) tasks.
"""
@add_start_docstrings(
"""
MEGA Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
Named-Entity-Recognition (NER) tasks.
""",
MEGA_START_DOCSTRING,
)
class MegaForTokenClassification(MegaPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.mega = MegaModel(config, add_pooling_layer=False)
classifier_dropout = (
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
self.dropout = nn.Dropout(classifier_dropout)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.post_init()
@add_start_docstrings_to_model_forward(MEGA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TokenClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.mega(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
sequence_output = self.dropout(sequence_output)
logits = self.classifier(sequence_output)
loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return TokenClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
class MegaClassificationHead(nn.Module):
"""用于句子级分类任务的头部模块。"""
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
classifier_dropout = (
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
self.dropout = nn.Dropout(classifier_dropout)
self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
def forward(self, features, **kwargs):
x = features[:, 0, :]
x = self.dropout(x)
x = self.dense(x)
x = torch.tanh(x)
x = self.dropout(x)
x = self.out_proj(x)
return x
@add_start_docstrings(
"""
MEGA Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
MEGA_START_DOCSTRING,
)
class MegaForQuestionAnswering(MegaPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.mega = MegaModel(config, add_pooling_layer=False)
self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
self.post_init()
@add_start_docstrings_to_model_forward(MEGA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=QuestionAnsweringModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
start_positions: Optional[torch.LongTensor] = None,
end_positions: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], QuestionAnsweringModelOutput]:
r"""
start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.mega(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
logits = self.qa_outputs(sequence_output)
start_logits, end_logits = logits.split(1, dim=-1)
start_logits = start_logits.squeeze(-1).contiguous()
end_logits = end_logits.squeeze(-1).contiguous()
total_loss = None
if start_positions is not None and end_positions is not None:
if len(start_positions.size()) > 1:
start_positions = start_positions.squeeze(-1)
if len(end_positions.size()) > 1:
end_positions = end_positions.squeeze(-1)
ignored_index = start_logits.size(1)
start_positions = start_positions.clamp(0, ignored_index)
end_positions = end_positions.clamp(0, ignored_index)
loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
start_loss = loss_fct(start_logits, start_positions)
end_loss = loss_fct(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2
if not return_dict:
output = (start_logits, end_logits) + outputs[2:]
return ((total_loss,) + output) if total_loss is not None else output
return QuestionAnsweringModelOutput(
loss=total_loss,
start_logits=start_logits,
end_logits=end_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
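Finally, a sketch of the span-extraction post-processing above: the QA head emits two logits per token, which are split into start/end scores, and gold positions outside the sequence are clamped to an ignored index (toy sizes are assumptions):

import torch
from torch.nn import CrossEntropyLoss

batch, seq_len = 2, 10
logits = torch.randn(batch, seq_len, 2)                    # stand-in for the qa_outputs projection

start_logits, end_logits = logits.split(1, dim=-1)
start_logits = start_logits.squeeze(-1).contiguous()       # (2, 10)
end_logits = end_logits.squeeze(-1).contiguous()

start_positions = torch.tensor([3, 50])                    # 50 is deliberately out of range
end_positions = torch.tensor([5, 60])
ignored_index = start_logits.size(1)
start_positions = start_positions.clamp(0, ignored_index)  # out-of-range labels become `ignored_index`
end_positions = end_positions.clamp(0, ignored_index)

loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
total_loss = (loss_fct(start_logits, start_positions) + loss_fct(end_logits, end_positions)) / 2
print(total_loss)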