Transformers Source Code Analysis (Part 79)
.\models\mpnet\tokenization_mpnet_fast.py
"""Fast Tokenization classes for MPNet."""
import json
from typing import List, Optional, Tuple
from tokenizers import normalizers
from ...tokenization_utils import AddedToken
from ...tokenization_utils_fast import PreTrainedTokenizerFast
from ...utils import logging
from .tokenization_mpnet import MPNetTokenizer
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"microsoft/mpnet-base": "https://huggingface.co/microsoft/mpnet-base/resolve/main/vocab.txt",
},
"tokenizer_file": {
"microsoft/mpnet-base": "https://huggingface.co/microsoft/mpnet-base/resolve/main/tokenizer.json",
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"microsoft/mpnet-base": 512,
}
PRETRAINED_INIT_CONFIGURATION = {
"microsoft/mpnet-base": {"do_lower_case": True},
}
class MPNetTokenizerFast(PreTrainedTokenizerFast):
r"""
Construct a "fast" MPNet tokenizer (backed by HuggingFace's *tokenizers* library). Based on WordPiece.
This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
refer to this superclass for more information regarding those methods.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
slow_tokenizer_class = MPNetTokenizer
model_input_names = ["input_ids", "attention_mask"]
def __init__(
self,
vocab_file=None,
tokenizer_file=None,
do_lower_case=True,
bos_token="<s>",
eos_token="</s>",
sep_token="</s>",
cls_token="<s>",
unk_token="[UNK]",
pad_token="<pad>",
mask_token="<mask>",
tokenize_chinese_chars=True,
strip_accents=None,
**kwargs,
):
bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token
cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token
unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
super().__init__(
vocab_file,
tokenizer_file=tokenizer_file,
do_lower_case=do_lower_case,
bos_token=bos_token,
eos_token=eos_token,
sep_token=sep_token,
cls_token=cls_token,
unk_token=unk_token,
pad_token=pad_token,
mask_token=mask_token,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
**kwargs,
)
pre_tok_state = json.loads(self.backend_tokenizer.normalizer.__getstate__())
if (
pre_tok_state.get("lowercase", do_lower_case) != do_lower_case
or pre_tok_state.get("strip_accents", strip_accents) != strip_accents
):
pre_tok_class = getattr(normalizers, pre_tok_state.pop("type"))
pre_tok_state["lowercase"] = do_lower_case
pre_tok_state["strip_accents"] = strip_accents
self.backend_tokenizer.normalizer = pre_tok_class(**pre_tok_state)
self.do_lower_case = do_lower_case
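As an aside, the normalizer re-sync performed just above can be reproduced in isolation. The sketch below uses a throwaway WordPiece tokenizer instead of a real MPNet checkpoint (the one-entry vocabulary is made up purely for illustration):

```python
import json
from tokenizers import Tokenizer, models, normalizers

# Build a dummy backend tokenizer with a BertNormalizer, mimicking what a saved tokenizer.json holds.
backend = Tokenizer(models.WordPiece({"[UNK]": 0}, unk_token="[UNK]"))
backend.normalizer = normalizers.BertNormalizer(lowercase=True)

# The serialized state is a JSON document with a "type" key plus the normalizer's arguments.
state = json.loads(backend.normalizer.__getstate__())
print(state["type"], state["lowercase"])  # BertNormalizer True

# Rebuild the normalizer with a different casing behaviour, exactly like the constructor above
# does when the saved state disagrees with the do_lower_case argument.
state["lowercase"] = False
normalizer_class = getattr(normalizers, state.pop("type"))
backend.normalizer = normalizer_class(**state)
```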
@property
def mask_token(self) -> str:
"""
`str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while not
having been set.
MPNet tokenizer has a special mask token to be usable in the fill-mask pipeline. The mask token will greedily
comprise the space before the *<mask>*.
"""
if self._mask_token is None:
if self.verbose:
logger.error("Using mask_token, but it is not set yet.")
return None
return str(self._mask_token)
@mask_token.setter
def mask_token(self, value):
"""
Overriding the default behavior of the mask token to have it eat the space before it.
This is needed to preserve backward compatibility with all the previously used models based on MPNet.
"""
value = AddedToken(value, lstrip=True, rstrip=False) if isinstance(value, str) else value
self._mask_token = value
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
"""
Builds model inputs by adding special tokens: a single sequence becomes `<s> token_ids_0 </s>`, and a
sequence pair becomes `<s> token_ids_0 </s></s> token_ids_1 </s>`.
Args:
token_ids_0 (list of int):
List of input token IDs.
token_ids_1 (list of int, optional):
Optional second list of token IDs for sequence pairs.
Returns:
list of int: Combined list of token IDs with special tokens.
"""
output = [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
if token_ids_1 is None:
return output
return output + [self.eos_token_id] + token_ids_1 + [self.eos_token_id]
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Creates a mask from the two sequences passed to be used in a sequence-pair classification task. MPNet does not
make use of token type IDs, therefore a list of zeros is returned.
Args:
token_ids_0 (List[int]):
List of token IDs.
token_ids_1 (List[int], optional):
Optional second list of token IDs for sequence pairs.
Returns:
List[int]: List of zeros indicating token type IDs.
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
"""
Saves the vocabulary and related model files to the specified directory.
Args:
save_directory (str):
Directory where the vocabulary will be saved.
filename_prefix (str, optional):
Optional prefix for the saved vocabulary files.
Returns:
Tuple[str]: Tuple containing the paths of the saved files.
"""
files = self._tokenizer.model.save(save_directory, name=filename_prefix)
return tuple(files)
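A short usage sketch of the class above. It assumes network access to download the `microsoft/mpnet-base` files referenced earlier; the token IDs 10, 11, 12 are placeholders:

```python
from transformers import MPNetTokenizerFast

tokenizer = MPNetTokenizerFast.from_pretrained("microsoft/mpnet-base")

# Single sequences are wrapped as <s> ... </s>; pairs as <s> A </s></s> B </s>.
ids = tokenizer.build_inputs_with_special_tokens([10, 11], [12])
print(ids[0] == tokenizer.bos_token_id, ids.count(tokenizer.eos_token_id))  # True 3

# token_type_ids are all zeros because MPNet does not use them.
print(tokenizer.create_token_type_ids_from_sequences([10, 11], [12]))  # [0, 0, 0, 0, 0, 0, 0]
```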
.\models\mpnet\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_flax_available,
is_tf_available,
is_tokenizers_available,
is_torch_available,
)
_import_structure = {
"configuration_mpnet": ["MPNET_PRETRAINED_CONFIG_ARCHIVE_MAP", "MPNetConfig"],
"tokenization_mpnet": ["MPNetTokenizer"],
}
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_mpnet_fast"] = ["MPNetTokenizerFast"]
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_mpnet"] = [
"MPNET_PRETRAINED_MODEL_ARCHIVE_LIST",
"MPNetForMaskedLM",
"MPNetForMultipleChoice",
"MPNetForQuestionAnswering",
"MPNetForSequenceClassification",
"MPNetForTokenClassification",
"MPNetLayer",
"MPNetModel",
"MPNetPreTrainedModel",
]
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_tf_mpnet"] = [
"TF_MPNET_PRETRAINED_MODEL_ARCHIVE_LIST",
"TFMPNetEmbeddings",
"TFMPNetForMaskedLM",
"TFMPNetForMultipleChoice",
"TFMPNetForQuestionAnswering",
"TFMPNetForSequenceClassification",
"TFMPNetForTokenClassification",
"TFMPNetMainLayer",
"TFMPNetModel",
"TFMPNetPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_mpnet import MPNET_PRETRAINED_CONFIG_ARCHIVE_MAP, MPNetConfig
from .tokenization_mpnet import MPNetTokenizer
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_mpnet_fast import MPNetTokenizerFast
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_mpnet import (
MPNET_PRETRAINED_MODEL_ARCHIVE_LIST,
MPNetForMaskedLM,
MPNetForMultipleChoice,
MPNetForQuestionAnswering,
MPNetForSequenceClassification,
MPNetForTokenClassification,
MPNetLayer,
MPNetModel,
MPNetPreTrainedModel,
)
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_tf_mpnet import (
TF_MPNET_PRETRAINED_MODEL_ARCHIVE_LIST,
TFMPNetEmbeddings,
TFMPNetForMaskedLM,
TFMPNetForMultipleChoice,
TFMPNetForQuestionAnswering,
TFMPNetForSequenceClassification,
TFMPNetForTokenClassification,
TFMPNetMainLayer,
TFMPNetModel,
TFMPNetPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\mpt\configuration_mpt.py
"""
Mpt configuration
"""
from typing import TYPE_CHECKING, Optional, Union
if TYPE_CHECKING:
pass
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
MPT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"mosaicml/mpt-7b": "https://huggingface.co/mosaicml/mpt-7b/resolve/main/config.json",
}
class MptAttentionConfig(PretrainedConfig):
"""
This is the configuration class to store the configuration of a [`MptAttention`] class. It is used to instantiate
attention layers according to the specified arguments, defining the layers architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the MPT
[mosaicml/mpt-7b](https://huggingface.co/mosaicml/mpt-7b) architecture. Most of the arguments are kept for backward
compatibility with previous MPT models that are hosted on the Hub (previously with `trust_remote_code=True`).
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
"""
Args:
attn_type (`str`, *optional*, defaults to `"multihead_attention"`):
type of attention to use. Options: `"multihead_attention"`, `"multiquery_attention"`.
attn_pdrop (`float`, *optional*, defaults to 0.0):
The dropout probability for the attention layers.
attn_impl (`str`, *optional*, defaults to `"torch"`):
The attention implementation to use. One of `"torch"`, `"flash"`, or `"triton"`.
clip_qkv (`float`, *optional*):
If not `None`, clip the queries, keys, and values in the attention layer to this value.
softmax_scale (`float`, *optional*, defaults to `None`):
If not `None`, scale the softmax in the attention layer by this value. If `None`, will default to
`1/sqrt(hidden_size)`.
prefix_lm (`bool`, *optional*, defaults to `False`):
Whether the model should operate as a Prefix LM. This requires passing an extra `prefix_mask` argument
which indicates which tokens belong to the prefix. Tokens in the prefix can attend to one another
bi-directionally. Tokens outside the prefix use causal attention.
qk_ln (`bool`, *optional*, defaults to `False`):
Whether to apply layer normalization to the queries and keys in the attention layer.
attn_uses_sequence_id (`bool`, *optional*, defaults to `False`):
Whether to restrict attention to tokens that have the same token_type_ids. When the model is in `train`
mode, this requires passing an extra *token_type_ids* argument which indicates which sub-sequence each
token belongs to. Defaults to `False` meaning any provided *token_type_ids* will be ignored.
alibi (`bool`, *optional*, defaults to `True`):
Whether or not to use the alibi bias instead of positional embedding.
alibi_bias_max (`int`, *optional*, defaults to 8):
The maximum value of the alibi bias.
"""
def __init__(
self,
attn_type="multihead_attention",
attn_pdrop=0,
attn_impl="torch",
clip_qkv=None,
softmax_scale=None,
prefix_lm=False,
qk_ln=False,
attn_uses_sequence_id=False,
alibi=True,
alibi_bias_max=8,
**kwargs,
):
# Call the parent class's initializer
super().__init__()
# Type of attention mechanism to use
self.attn_type = attn_type
# Dropout probability inside the attention layers
self.attn_pdrop = attn_pdrop
# Attention implementation to use
self.attn_impl = attn_impl
# If set, clip the queries, keys and values in the attention layer to this value
self.clip_qkv = clip_qkv
# If set, scale the softmax inside the attention layer by this value
self.softmax_scale = softmax_scale
# Whether to run the model as a Prefix LM, which requires an extra prefix_mask argument
self.prefix_lm = prefix_lm
# Whether to apply layer normalization to the queries and keys in the attention layer
self.qk_ln = qk_ln
# Whether to restrict attention to tokens sharing the same token_type_ids
self.attn_uses_sequence_id = attn_uses_sequence_id
# Whether to use the alibi bias instead of positional embeddings
self.alibi = alibi
# Maximum value of the alibi bias
self.alibi_bias_max = alibi_bias_max
# Reject unsupported attention types with a ValueError
if attn_type not in ["multihead_attention", "multiquery_attention"]:
raise ValueError(
f"`attn_type` has to be either `multihead_attention` or `multiquery_attention`. Received: {attn_type}"
)
@classmethod
# Load the attention configuration from a pretrained model name or path
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs) -> "PretrainedConfig":
# Propagate the auth token through kwargs
cls._set_token_in_kwargs(kwargs)
# Fetch the configuration dict and the remaining kwargs
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
# When loading from a full MPT config, pick out the nested attention config
if config_dict.get("model_type") == "mpt":
config_dict = config_dict["attn_config"]
# Warn if the config's model_type does not match this class's model_type
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
logger.warning(
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
)
# Build the configuration object from the dict
return cls.from_dict(config_dict, **kwargs)
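A small sketch of how the attention sub-configuration behaves, constructed directly here rather than loaded from the Hub (the rejected `attn_type` string is made up to trigger the validation above):

```python
from transformers.models.mpt.configuration_mpt import MptAttentionConfig

# Defaults follow the arguments documented above.
attn_config = MptAttentionConfig(attn_pdrop=0.1)
print(attn_config.attn_type, attn_config.alibi)  # multihead_attention True

# Unsupported attention types are rejected by the check in __init__.
try:
    MptAttentionConfig(attn_type="sliding_window_attention")
except ValueError as err:
    print(err)
```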
class MptConfig(PretrainedConfig):
"""
This is the configuration class to store the configuration of a [`MptModel`]. It is used to instantiate a Mpt model
according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to the Mpt-7b architecture
[mosaicml/mpt-7b](https://huggingface.co/mosaicml/mpt-7b).
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
"""
# 定义 Transformer 编码器的配置参数
Args:
d_model (`int`, *optional*, defaults to 2048):
# 嵌入和隐藏状态的维度。
Dimensionality of the embeddings and hidden states.
n_heads (`int`, *optional*, defaults to 16):
# 每个注意力层中的注意力头数量。
Number of attention heads for each attention layer in the Transformer encoder.
n_layers (`int`, *optional*, defaults to 24):
# Transformer 编码器中隐藏层的数量。
Number of hidden layers in the Transformer encoder.
expansion_ratio (`int`, *optional*, defaults to 4):
# MLP 中上/下扩展比率。
The ratio of the up/down scale in the MLP.
max_seq_len (`int`, *optional*, defaults to 2048):
# 模型的最大序列长度。
The maximum sequence length of the model.
vocab_size (`int`, *optional*, defaults to 50368):
# Mpt 模型的词汇量大小。定义了在调用 `MptModel` 时可以表示的不同标记的最大数量。
Vocabulary size of the Mpt model. Defines the maximum number of different tokens that can be represented by
the `inputs_ids` passed when calling [`MptModel`]. Check [this
discussion](https://huggingface.co/bigscience/mpt/discussions/120#633d28389addb8530b406c2a) on how the
`vocab_size` has been defined.
resid_pdrop (`float`, *optional*, defaults to 0.0):
# 在与残差结合之前应用于注意力输出的 dropout 概率。
The dropout probability applied to the attention output before combining with residual.
layer_norm_epsilon (`float`, *optional*, defaults to 1e-05):
# 层归一化层中使用的 epsilon。
The epsilon to use in the layer normalization layers.
emb_pdrop (`float`, *optional*, defaults to 0.0):
# 嵌入层的 dropout 概率。
The dropout probability for the embedding layer.
learned_pos_emb (`bool`, *optional*, defaults to `True`):
# 是否使用学习的位置编码。
Whether to use learned positional embeddings.
attn_config (`dict`, *optional*):
# 用于配置模型注意力模块的字典。
A dictionary used to configure the model's attention module.
init_device (`str`, *optional*, defaults to `"cpu"`):
# 用于参数初始化的设备。为了向后兼容而定义。
The device to use for parameter initialization. Defined for backward compatibility
logit_scale (`float`, *optional*):
# 如果不为 None,则缩放 logits 的值。
If not None, scale the logits by this value.
no_bias (`bool`, *optional*, defaults to `True`):
# 是否在所有线性层中使用偏置。
Whether to use bias in all linear layers.
verbose (`int`, *optional*, defaults to 0):
# 用于日志记录的详细级别。在先前版本的 MPT 模型中用于日志记录。此参数已弃用。
The verbosity level to use for logging. Used in the previous versions of MPT models for logging. This
argument is deprecated.
embedding_fraction (`float`, *optional*, defaults to 1.0):
# 缩放嵌入层梯度的比例。
The fraction to scale the gradients of the embedding layer by.
norm_type (`str`, *optional*, defaults to `"low_precision_layernorm"`):
# 要使用的层归一化类型。所有 MPT 模型使用相同的层归一化实现。为了向后兼容而定义。
Type of layer norm to use. All MPT models uses the same layer norm implementation. Defined for backward
compatibility.
use_cache (`bool`, *optional*, defaults to `False`):
# 模型是否应返回最后的 key/values 注意力(并非所有模型都使用)。
Whether or not the model should return the last key/values attentions (not used by all models).
initializer_range (`float`, *optional*, defaults to 0.02):
# 用于初始化所有权重矩阵的截断正态初始化器的标准差。
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
Example:
```
# 导入 transformers 库中的 MptConfig 和 MptModel 类
>>> from transformers import MptConfig, MptModel
# 初始化一个 MptConfig 实例
>>> # Initializing a Mpt configuration
>>> configuration = MptConfig()
# 使用配置初始化一个模型实例(权重随机生成)
>>> # Initializing a model (with random weights) from the configuration
>>> model = MptModel(configuration)
# 获取模型的配置信息
>>> # Accessing the model configuration
>>> configuration = model.config
# 设定模型类型为 "mpt"
model_type = "mpt"
# Map generic configuration attribute names onto the MPT-specific ones
attribute_map = {
"num_attention_heads": "n_heads",  # number of attention heads maps to n_heads
"hidden_size": "d_model",  # hidden size maps to d_model
"num_hidden_layers": "n_layers",  # number of hidden layers maps to n_layers
}
# MptConfig constructor
def __init__(
self,
d_model: int = 2048,
n_heads: int = 16,
n_layers: int = 24,
expansion_ratio: int = 4,
max_seq_len: int = 2048,
vocab_size: int = 50368,
resid_pdrop: float = 0.0,
layer_norm_epsilon: float = 1e-5,
emb_pdrop: float = 0.0,
learned_pos_emb: bool = True,
attn_config: MptAttentionConfig = None,
init_device: str = "cpu",
logit_scale: Optional[Union[float, str]] = None,
no_bias: bool = True,
verbose: int = 0,
embedding_fraction: float = 1.0,
norm_type: str = "low_precision_layernorm",
use_cache: bool = False,
initializer_range=0.02,
**kwargs,
):
# If no attn_config is given, fall back to a default MptAttentionConfig
if attn_config is None:
self.attn_config = MptAttentionConfig()
# If attn_config is a plain dict, build an MptAttentionConfig from its entries
elif isinstance(attn_config, dict):
self.attn_config = MptAttentionConfig(**attn_config)
else:
self.attn_config = attn_config
# Store the remaining hyper-parameters
self.d_model = d_model
self.n_heads = n_heads
self.n_layers = n_layers
self.expansion_ratio = expansion_ratio
self.max_seq_len = max_seq_len
self.vocab_size = vocab_size
self.resid_pdrop = resid_pdrop
self.emb_pdrop = emb_pdrop
self.learned_pos_emb = learned_pos_emb
self.init_device = init_device
self.logit_scale = logit_scale
self.no_bias = no_bias
self.verbose = verbose
self.embedding_fraction = embedding_fraction
self.norm_type = norm_type
self.layer_norm_epsilon = layer_norm_epsilon
self.use_cache = use_cache
self.initializer_range = initializer_range
# Forward any remaining keyword arguments to the parent constructor
super().__init__(**kwargs)
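Two details of this constructor are worth checking by hand: a dict `attn_config` is converted to an `MptAttentionConfig`, and `attribute_map` makes the generic attribute names resolve to the MPT-specific ones. A minimal sketch (sizes chosen arbitrarily):

```python
from transformers.models.mpt.configuration_mpt import MptConfig

config = MptConfig(d_model=1024, n_heads=8, attn_config={"attn_pdrop": 0.1})
print(type(config.attn_config).__name__)               # MptAttentionConfig
print(config.hidden_size, config.num_attention_heads)  # 1024 8  (aliases for d_model / n_heads)
```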
.\models\mpt\modeling_mpt.py
"""PyTorch MPT model."""
import math
from typing import Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, LayerNorm, MSELoss
from torch.nn import functional as F
from ...file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward
from ...modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
from ...modeling_outputs import (
BaseModelOutputWithPastAndCrossAttentions,
CausalLMOutputWithCrossAttentions,
QuestionAnsweringModelOutput,
SequenceClassifierOutputWithPast,
TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...utils import logging
from .configuration_mpt import MptConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "mosaicml/mpt-7b"
_CONFIG_FOR_DOC = "MptConfig"
MPT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"mosaicml/mpt-7b",
"mosaicml/mpt-7b-storywriter",
"mosaicml/mpt-7b-instruct",
"mosaicml/mpt-7b-8k",
"mosaicml/mpt-7b-8k-instruct",
"mosaicml/mpt-7b-8k-chat",
"mosaicml/mpt-30b",
"mosaicml/mpt-30b-instruct",
"mosaicml/mpt-30b-chat",
]
def build_mpt_alibi_tensor(num_heads, sequence_length, alibi_bias_max=8, device=None):
r"""
Link to paper: https://arxiv.org/abs/2108.12409 - Alibi tensor is not causal as the original paper mentions, it
relies on a translation invariance of softmax for quick implementation. This implementation has been copied from
the alibi implementation of MPT source code that led to slightly different results than the Bloom alibi:
https://huggingface.co/mosaicml/mpt-7b/blob/main/attention.py#L292
"""
alibi = torch.arange(1 - sequence_length, 1, dtype=torch.int32, device=device).view(1, 1, 1, sequence_length)
num_heads_power_of_2 = 2 ** math.ceil(math.log2(num_heads))
base = torch.arange(1, num_heads_power_of_2 + 1, dtype=torch.int64, device=device).float()
base = base * (alibi_bias_max / num_heads_power_of_2)
slopes = 1.0 / torch.pow(2, base)
slopes = slopes.view(1, num_heads_power_of_2, 1, 1)
if num_heads_power_of_2 != num_heads:
slopes = torch.cat([slopes[:, 1::2, ...], slopes[:, ::2, ...]], dim=1)[:, :num_heads, ...]
alibi = alibi * slopes
return alibi.squeeze(0)
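A quick shape and value check of the function above, assuming the default `alibi_bias_max=8`. With `num_heads=6` (not a power of two) the slopes are first computed for 8 heads and then re-interleaved, and the result has shape `(num_heads, 1, sequence_length)`:

```python
alibi = build_mpt_alibi_tensor(num_heads=6, sequence_length=5)
print(alibi.shape)  # torch.Size([6, 1, 5])
print(alibi[0, 0])  # tensor([-1.0000, -0.7500, -0.5000, -0.2500, 0.0000]) -> slope 0.25 for head 0
```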
class MptAttention(nn.Module):
"""Multi-head self attention.
Using torch or triton attention implementation enables user to also use additive bias.
多头自注意力模块,使用 torch 或 triton 实现的注意力机制,允许用户使用附加偏置。
"""
def __init__(self, config: MptConfig):
super().__init__()
self.hidden_size = config.hidden_size
self.n_heads = config.n_heads
self.max_seq_length = config.max_seq_len
self.head_dim = self.hidden_size // self.n_heads
self.softmax_scale = config.attn_config.softmax_scale
if self.softmax_scale is None:
self.softmax_scale = 1 / math.sqrt(self.hidden_size / self.n_heads)
self.attn_dropout_p = config.attn_config.attn_pdrop
self.Wqkv = nn.Linear(self.hidden_size, 3 * self.hidden_size, bias=False)
self.out_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
def forward(
self,
hidden_states: torch.Tensor,
position_bias: torch.Tensor,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
attention_mask: Optional[torch.Tensor] = None,
):
batch_size, seq_length = hidden_states.shape[:2]
mixed_qkv = self.Wqkv(hidden_states)
query_states, key_states, value_states = mixed_qkv.chunk(3, dim=2)
query_states = query_states.reshape(batch_size, seq_length, self.n_heads, self.head_dim).transpose(1, 2)
key_states = key_states.reshape(batch_size, seq_length, self.n_heads, self.head_dim).transpose(1, 2)
value_states = value_states.reshape(batch_size, seq_length, self.n_heads, self.head_dim).transpose(1, 2)
if past_key_value is not None:
if len(past_key_value) != 0:
key_states = torch.cat([past_key_value[0], key_states], dim=2)
value_states = torch.cat([past_key_value[1], value_states], dim=2)
past_key_value = (key_states, value_states)
else:
past_key_value = (key_states, value_states)
attention_scores = torch.matmul(query_states, key_states.transpose(-1, -2)) * self.softmax_scale
query_length = seq_length if past_key_value is None else seq_length + past_key_value[0].shape[2]
if position_bias is not None:
if len(position_bias.shape) != 3:
raise ValueError(f"Expecting position_bias shape to be 3 dimensions, got {len(position_bias.shape)}")
key_length = key_states.shape[-2]
position_bias_query_index = max(0, position_bias.size(1) - query_length)
position_bias_key_index = max(0, position_bias.size(2) - key_length)
position_bias = position_bias[:, position_bias_query_index:, position_bias_key_index:]
attention_scores = attention_scores + position_bias
if attention_mask is not None:
attention_scores = attention_scores.masked_fill(attention_mask, torch.finfo(query_states.dtype).min)
attn_weights = nn.functional.softmax(attention_scores.float(), dim=-1).to(value_states.dtype)
attn_weights = nn.functional.dropout(attn_weights, p=self.attn_dropout_p, training=self.training)
context_states = torch.matmul(attn_weights, value_states)
context_states = context_states.permute(0, 2, 1, 3).contiguous().view(batch_size, seq_length, -1)
attn_output = self.out_proj(context_states)
return attn_output, attn_weights, past_key_value
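The tensor reshaping in `forward` can be followed with plain tensors. This is only a shape walk-through with made-up sizes (hidden_size=8, n_heads=2, batch_size=1, seq_length=3); it mirrors the chunk/reshape/softmax pipeline but not the `Wqkv`/`out_proj` projections or the alibi bias:

```python
import torch

batch_size, seq_length, hidden_size, n_heads = 1, 3, 8, 2
head_dim = hidden_size // n_heads

mixed_qkv = torch.randn(batch_size, seq_length, 3 * hidden_size)   # stand-in for Wqkv output
query, key, value = mixed_qkv.chunk(3, dim=2)                       # each (1, 3, 8)
query = query.reshape(batch_size, seq_length, n_heads, head_dim).transpose(1, 2)  # (1, 2, 3, 4)
key = key.reshape(batch_size, seq_length, n_heads, head_dim).transpose(1, 2)
value = value.reshape(batch_size, seq_length, n_heads, head_dim).transpose(1, 2)

softmax_scale = 1 / (hidden_size / n_heads) ** 0.5                   # default 1/sqrt(head_dim)
scores = torch.matmul(query, key.transpose(-1, -2)) * softmax_scale  # (1, 2, 3, 3)
weights = torch.softmax(scores.float(), dim=-1).to(value.dtype)
context = torch.matmul(weights, value)                                # (1, 2, 3, 4)
context = context.permute(0, 2, 1, 3).contiguous().view(batch_size, seq_length, -1)
print(context.shape)  # torch.Size([1, 3, 8])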
class MptMLP(nn.Module):
def __init__(self, config: MptConfig):
super().__init__()
hidden_size = config.hidden_size
self.up_proj = nn.Linear(hidden_size, 4 * hidden_size, bias=False)
self.act = nn.GELU(approximate="none")
self.down_proj = nn.Linear(4 * hidden_size, hidden_size, bias=False)
self.hidden_dropout = config.attn_config.attn_pdrop
def forward(self, hidden_states: torch.Tensor, residual: torch.Tensor) -> torch.Tensor:
hidden_states = self.act(self.up_proj(hidden_states))
intermediate_output = self.down_proj(hidden_states)
output = F.dropout(intermediate_output, p=self.hidden_dropout, training=self.training)
output = output + residual
return output
class MptBlock(nn.Module):
def __init__(self, config: MptConfig):
super().__init__()
hidden_size = config.hidden_size
self.norm_1 = LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
self.num_heads = config.n_heads
self.attn = MptAttention(config)
self.norm_2 = LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
self.ffn = MptMLP(config)
self.dropout_rate = config.attn_config.attn_pdrop
self.resid_attn_dropout = nn.Dropout(self.dropout_rate)
def forward(
self,
hidden_states: torch.Tensor,
position_bias: torch.Tensor,
attention_mask: torch.Tensor,
layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
use_cache: bool = False,
output_attentions: bool = False,
):
layernorm_output = self.norm_1(hidden_states)
residual = hidden_states
attn_outputs, attn_weights, past_key_value = self.attn(
layernorm_output,
position_bias=position_bias,
attention_mask=attention_mask,
past_key_value=layer_past,
)
hidden_states = self.resid_attn_dropout(attn_outputs) + residual
layernorm_output = self.norm_2(hidden_states)
residual = hidden_states
output = self.ffn(layernorm_output, residual)
outputs = (output,)
if use_cache:
outputs += (past_key_value,)
if output_attentions:
outputs += (attn_weights,)
return outputs
class MptPreTrainedModel(PreTrainedModel):
config_class = MptConfig
base_model_prefix = "transformer"
supports_gradient_checkpointing = True
_no_split_modules = ["MptBlock"]
_keys_to_ignore_on_load_missing = [r"lm_head.*."]
def __init__(self, *inputs, **kwargs):
super().__init__(*inputs, **kwargs)
def _init_weights(self, module: nn.Module):
"""Initialize the weights."""
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, LayerNorm):
if module.bias is not None:
module.bias.data.zero_()
module.weight.data.fill_(1.0)
@staticmethod
def _convert_to_mpt_cache(
past_key_value: Tuple[Tuple[torch.Tensor, torch.Tensor]],
) -> Tuple[Tuple[torch.Tensor, torch.Tensor]]:
"""
Converts the cache to the format expected by Mpt, i.e. to tuple(tuple([batch_size * num_heads, ...]))
"""
batch_size, num_heads, head_dim, seq_length = past_key_value[0][0].shape
batch_size_times_num_heads = batch_size * num_heads
return tuple(
(
layer_past[0].reshape(batch_size_times_num_heads, head_dim, seq_length),
layer_past[1].reshape(batch_size_times_num_heads, seq_length, head_dim),
)
for layer_past in past_key_value
)
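The cache conversion only flattens the batch and head dimensions; a toy example with dummy shapes (batch_size=2, num_heads=4, head_dim=8, seq_length=5) makes the layout explicit:

```python
import torch

past = (
    (torch.randn(2, 4, 8, 5),   # key:   [batch, heads, head_dim, seq]
     torch.randn(2, 4, 5, 8)),  # value: [batch, heads, seq, head_dim]
)
converted = MptPreTrainedModel._convert_to_mpt_cache(past)
print(converted[0][0].shape)  # torch.Size([8, 8, 5])  -> [batch * heads, head_dim, seq]
print(converted[0][1].shape)  # torch.Size([8, 5, 8])  -> [batch * heads, seq, head_dim]
```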
MPT_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its models (such as downloading or saving, resizing the input embeddings etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`MptConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
MPT_INPUTS_DOCSTRING = r"""
# Input arguments accepted by the MPT model's forward pass
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
`input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values[0][0].shape[2]`
(`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.
If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
`input_ids`.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
past_key_values (`Tuple[Tuple[torch.Tensor]]` of length `config.n_layers`):
Contains precomputed hidden-states (key and values in the attention blocks) as computed by the model (see
`past_key_values` output below). Can be used to speed up sequential decoding. The `input_ids` which have
their past given to this model should not be passed as `input_ids` as they have already been computed.
Each element of `past_key_values` is a tuple (past_key, past_value):
- past_key: [batch_size * num_heads, head_dim, kv_length]
- past_value: [batch_size * num_heads, kv_length, head_dim]
attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
model's internal embedding lookup matrix.
If `past_key_values` is used, optionally only the last `inputs_embeds` have to be input (see
`past_key_values`).
use_cache (`bool`, *optional*):
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
`past_key_values`).
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
@add_start_docstrings(
"The bare Mpt Model transformer outputting raw hidden-states without any specific head on top.",
MPT_START_DOCSTRING,
)
"""
class MptModel(MptPreTrainedModel):
"""
MPT Model class inheriting from MptPreTrainedModel, initializing the model with given configuration.
Args:
config (MptConfig): The configuration class defining model parameters.
Attributes:
hidden_size (int): Size of the hidden layers.
num_heads (int): Number of attention heads.
wte (nn.Embedding): Word token embeddings.
blocks (nn.ModuleList): List of transformer blocks.
norm_f (LayerNorm): Final layer normalization.
gradient_checkpointing (bool): Flag for gradient checkpointing.
Methods:
get_input_embeddings(): Returns the input embeddings.
build_mpt_alibi_tensor(): Builds alibi tensor for MPT.
set_input_embeddings(new_embeddings): Sets new input embeddings.
forward(): Performs forward pass through the model.
"""
def __init__(self, config: MptConfig):
super().__init__(config)
self.hidden_size = config.hidden_size
self.num_heads = config.n_heads
# Embedding + LN Embedding
self.wte = nn.Embedding(config.vocab_size, self.hidden_size)
# Transformer blocks
self.blocks = nn.ModuleList([MptBlock(config) for _ in range(config.n_layers)])
# Final Layer Norm
self.norm_f = LayerNorm(self.hidden_size, eps=config.layer_norm_epsilon)
# backward compatibility with weights on the Hub
self.norm_f.bias = None
self.gradient_checkpointing = False
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
"""
Returns:
nn.Embedding: The input word token embeddings.
"""
return self.wte
def build_mpt_alibi_tensor(self, num_heads, sequence_length, alibi_bias_max=8, device=None):
"""
Builds an alibi tensor for MPT.
Args:
num_heads (int): Number of attention heads.
sequence_length (int): Length of the input sequence.
alibi_bias_max (int, optional): Maximum bias value for alibi tensor. Defaults to 8.
device (torch.device, optional): Device to place alibi tensor on. Defaults to None.
Returns:
torch.Tensor: Alibi tensor for MPT.
"""
return build_mpt_alibi_tensor(num_heads, sequence_length, alibi_bias_max, device)
def set_input_embeddings(self, new_embeddings: torch.Tensor):
"""
Sets new input embeddings.
Args:
new_embeddings (torch.Tensor): New input embeddings to be set.
"""
self.wte = new_embeddings
@add_start_docstrings_to_model_forward(MPT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPastAndCrossAttentions,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
attention_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
"""
Performs forward pass through the MPT model.
Args:
input_ids (torch.LongTensor, optional): Input token IDs.
past_key_values (Tuple[Tuple[torch.Tensor, torch.Tensor], ...], optional): Past key-value states for fast decoding.
attention_mask (torch.Tensor, optional): Mask to avoid attention on padding tokens.
inputs_embeds (torch.LongTensor, optional): Optional input embeddings.
use_cache (bool, optional): Whether to use cached key-value states.
output_attentions (bool, optional): Whether to output attention weights.
output_hidden_states (bool, optional): Whether to output hidden states.
return_dict (bool, optional): Whether to return a dictionary as output.
Returns:
BaseModelOutputWithPastAndCrossAttentions: Model output including past and cross attentions.
"""
# Implementation of forward pass is omitted here for brevity
pass
"""
@add_start_docstrings(
"""
The MPT Model transformer with a language modeling head on top (linear layer with weights tied to the input
embeddings).
""",
MPT_START_DOCSTRING,
)
"""
class MptForCausalLM(MptPreTrainedModel):
"""
MPT Model for Causal Language Modeling, inheriting from MptPreTrainedModel.
Args:
config (MptConfig): The configuration class defining model parameters.
Attributes:
transformer (MptModel): The MPT base model transformer.
lm_head (nn.Linear): Language modeling head.
Methods:
get_output_embeddings(): Returns the output embeddings.
set_output_embeddings(new_embeddings): Sets new output embeddings.
"""
_tied_weights_keys = ["lm_head.weight"]
def __init__(self, config: MptConfig):
super().__init__(config)
self.transformer = MptModel(config)
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
# Initialize weights and apply final processing
self.post_init()
def get_output_embeddings(self):
"""
Returns:
nn.Linear: The language modeling head.
"""
return self.lm_head
def set_output_embeddings(self, new_embeddings: torch.Tensor):
"""
Sets new output embeddings.
Args:
new_embeddings (torch.Tensor): New output embeddings to be set.
"""
self.lm_head = new_embeddings
def prepare_inputs_for_generation(
self,
input_ids: torch.LongTensor,
past_key_values: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
use_cache: Optional[bool] = None,
**kwargs,
) -> dict:
# If past_key_values are provided, keep only the new part of input_ids
if past_key_values is not None:
past_length = past_key_values[0][0].shape[2]
# Some generation methods already pass only the last input ID
if input_ids.shape[1] > past_length:
remove_prefix_length = past_length
else:
# Default to the old behavior: keep only the final ID
remove_prefix_length = input_ids.shape[1] - 1
input_ids = input_ids[:, remove_prefix_length:]
# If `inputs_embeds` are passed and there is no cache yet, only use them in the first generation step
if inputs_embeds is not None and past_key_values is None:
model_inputs = {"inputs_embeds": inputs_embeds}
else:
model_inputs = {"input_ids": input_ids}
# Add past_key_values, use_cache and attention_mask to the model inputs
model_inputs.update(
{
"past_key_values": past_key_values,  # NITS should it be layer_past?
"use_cache": use_cache,
"attention_mask": attention_mask,
}
)
return model_inputs
@add_start_docstrings_to_model_forward(MPT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=CausalLMOutputWithCrossAttentions,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
attention_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]:
# Forward pass: run the base transformer, project the hidden states to vocabulary logits with
# the LM head, and (if labels are provided) compute the causal language-modeling loss.
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
`labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
"""
# Determine whether to return a dictionary of outputs
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Pass input through the transformer model
transformer_outputs = self.transformer(
input_ids,
past_key_values=past_key_values,
attention_mask=attention_mask,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# Extract hidden states from transformer outputs
hidden_states = transformer_outputs[0]
# Generate logits from the language model head
lm_logits = self.lm_head(hidden_states)
# Initialize loss as None
loss = None
# Calculate loss if labels are provided
if labels is not None:
# Move labels to the same device as logits for model parallelism
labels = labels.to(lm_logits.device)
# Shift logits and labels to align predictions and targets
shift_logits = lm_logits[..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous()
batch_size, seq_length, vocab_size = shift_logits.shape
# Flatten the logits and labels to compute loss
loss_fct = CrossEntropyLoss()
loss = loss_fct(
shift_logits.view(batch_size * seq_length, vocab_size), shift_labels.view(batch_size * seq_length)
)
# Prepare the output depending on return_dict flag
if not return_dict:
output = (lm_logits,) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output
# Return structured output using CausalLMOutputWithCrossAttentions class
return CausalLMOutputWithCrossAttentions(
loss=loss,
logits=lm_logits,
past_key_values=transformer_outputs.past_key_values,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
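The label shift performed above is the standard causal-LM trick: the logits at position t are scored against the label at position t+1, so the last logit and the first label are dropped. A toy version with random tensors (made-up sizes):

```python
import torch
from torch.nn import CrossEntropyLoss

batch_size, seq_length, vocab_size = 1, 5, 11
lm_logits = torch.randn(batch_size, seq_length, vocab_size)
labels = torch.randint(0, vocab_size, (batch_size, seq_length))

shift_logits = lm_logits[..., :-1, :].contiguous()  # (1, 4, 11)
shift_labels = labels[..., 1:].contiguous()          # (1, 4)
loss = CrossEntropyLoss()(shift_logits.view(-1, vocab_size), shift_labels.view(-1))
print(loss.item())
```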
def _reorder_cache(
self, past: Tuple[Tuple[torch.Tensor, torch.Tensor], ...], beam_idx: torch.LongTensor
) -> Tuple[Tuple[torch.Tensor, torch.Tensor], ...]:
"""
This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or
[`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct
beam_idx at every generation step.
Output shares the same memory storage as `past`.
"""
# Map each cached tensor's device to beam_idx on that device, so every past state can be gathered locally
device_to_beam_idx = {
past_state.device: beam_idx.to(past_state.device) for layer_past in past for past_state in layer_past
}
# Re-order every layer's cached keys and values along the batch dimension according to beam_idx
reordered_past = tuple(
(
layer_past[0].index_select(0, device_to_beam_idx[layer_past[0].device]),
layer_past[1].index_select(0, device_to_beam_idx[layer_past[0].device]),
)
for layer_past in past
)
# Return the re-ordered cache (it shares storage with the input `past`)
return reordered_past
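What the re-ordering does can be seen on a single dummy layer; this mimics the `index_select` above on made-up tensors rather than calling the method itself:

```python
import torch

# Three "beams" of cached states; after a beam-search step that keeps beams 2, 2 and 0,
# every cached tensor is re-indexed along its first dimension with beam_idx.
layer_past = (torch.arange(6.0).view(3, 1, 2), torch.arange(6.0).view(3, 2, 1))
beam_idx = torch.tensor([2, 2, 0])
reordered = tuple(t.index_select(0, beam_idx) for t in layer_past)
print(reordered[0][:, 0, 0])  # tensor([4., 4., 0.])
```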
"""
The MPT Model transformer with a sequence classification head on top (linear layer).
[`MptForSequenceClassification`] uses the last token in order to do the classification, as other causal models
(e.g. GPT-1) do.
Since it does classification on the last token, it requires to know the position of the last token. If a
`pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
each row of the batch).
"""
@add_start_docstrings(
"""
MPT Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
Named-Entity-Recognition (NER) tasks.
""",
MPT_START_DOCSTRING,
)
class MptForTokenClassification(MptPreTrainedModel):
def __init__(self, config: MptConfig):
super().__init__(config)
self.num_labels = config.num_labels
# Initialize the MPT transformer model with the provided configuration
self.transformer = MptModel(config)
# Determine the dropout rate for the classifier layer based on the provided configuration
if hasattr(config, "classifier_dropout") and config.classifier_dropout is not None:
classifier_dropout = config.classifier_dropout
elif hasattr(config, "hidden_dropout") and config.hidden_dropout is not None:
classifier_dropout = config.hidden_dropout
else:
classifier_dropout = 0.1
# Apply dropout regularization to the classifier layer
self.dropout = nn.Dropout(classifier_dropout)
# Create a linear layer for the classification task with output size as specified in the configuration
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(MPT_INPUTS_DOCSTRING)
"""
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TokenClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
attention_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
**deprecated_arguments,
) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
transformer_outputs = self.transformer(
input_ids,
past_key_values=past_key_values,
attention_mask=attention_mask,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
hidden_states = transformer_outputs[0]
hidden_states = self.dropout(hidden_states)
logits = self.classifier(hidden_states)
loss = None
if labels is not None:
labels = labels.to(logits.device)
batch_size, seq_length = labels.shape
loss_fct = CrossEntropyLoss()
loss = loss_fct(
logits.view(batch_size * seq_length, self.num_labels), labels.view(batch_size * seq_length)
)
if not return_dict:
output = (logits,) + transformer_outputs[2:]
return ((loss,) + output) if loss is not None else output
return TokenClassifierOutput(
loss=loss,
logits=logits,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
@add_start_docstrings(
"""
The MPT Model transformer with a span classification head on top for extractive question-answering tasks like SQuAD
(a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
MPT_START_DOCSTRING,
)
class MptForQuestionAnswering(MptPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.transformer = MptModel(config)
self.qa_outputs = nn.Linear(config.hidden_size, 2)
self.post_init()
@add_start_docstrings_to_model_forward(MPT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
start_positions: Optional[torch.LongTensor] = None,
end_positions: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, QuestionAnsweringModelOutput]:
r"""
start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.transformer(
input_ids,
attention_mask=attention_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
logits = self.qa_outputs(sequence_output)
start_logits, end_logits = logits.split(1, dim=-1)
start_logits = start_logits.squeeze(-1).contiguous()
end_logits = end_logits.squeeze(-1).contiguous()
total_loss = None
if start_positions is not None and end_positions is not None:
if len(start_positions.size()) > 1:
start_positions = start_positions.squeeze(-1)
if len(end_positions.size()) > 1:
end_positions = end_positions.squeeze(-1)
ignored_index = start_logits.size(1)
start_positions = start_positions.clamp(0, ignored_index)
end_positions = end_positions.clamp(0, ignored_index)
loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
start_loss = loss_fct(start_logits, start_positions)
end_loss = loss_fct(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2
if not return_dict:
output = (start_logits, end_logits) + outputs[2:]
return ((total_loss,) + output) if total_loss is not None else output
return QuestionAnsweringModelOutput(
loss=total_loss,
start_logits=start_logits,
end_logits=end_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
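A toy version of the start-position loss above, showing how the span head's output is split and how out-of-range gold positions are neutralized by clamping them to `ignored_index` (shapes are made up):

```python
import torch

logits = torch.randn(2, 7, 2)                           # (batch, seq, 2) from qa_outputs
start_logits = logits.split(1, dim=-1)[0].squeeze(-1)   # (2, 7)

ignored_index = start_logits.size(1)                     # 7
start_positions = torch.tensor([3, 99]).clamp(0, ignored_index)  # tensor([3, 7])
loss = torch.nn.CrossEntropyLoss(ignore_index=ignored_index)(start_logits, start_positions)
print(loss.item())  # only the first example contributes; the clamped position 7 is ignored
```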
.\models\mpt\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
_import_structure = {
"configuration_mpt": ["MPT_PRETRAINED_CONFIG_ARCHIVE_MAP", "MptConfig", "MptOnnxConfig"],
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_mpt"] = [
"MPT_PRETRAINED_MODEL_ARCHIVE_LIST",
"MptForCausalLM",
"MptModel",
"MptPreTrainedModel",
"MptForSequenceClassification",
"MptForTokenClassification",
"MptForQuestionAnswering",
]
if TYPE_CHECKING:
from .configuration_mpt import MPT_PRETRAINED_CONFIG_ARCHIVE_MAP, MptConfig, MptOnnxConfig
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_mpt import (
MPT_PRETRAINED_MODEL_ARCHIVE_LIST,
MptForCausalLM,
MptForQuestionAnswering,
MptForSequenceClassification,
MptForTokenClassification,
MptModel,
MptPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\mra\configuration_mra.py
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
MRA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"uw-madison/mra-base-512-4": "https://huggingface.co/uw-madison/mra-base-512-4/resolve/main/config.json",
}
class MraConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`MraModel`]. It is used to instantiate an MRA
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the Mra
[uw-madison/mra-base-512-4](https://huggingface.co/uw-madison/mra-base-512-4) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
"""
Args:
vocab_size (`int`, *optional*, defaults to 50265):
Mra 模型的词汇表大小,定义了在调用 [`MraModel`] 时输入 `inputs_ids` 可以表示的不同标记数量。
hidden_size (`int`, *optional*, defaults to 768):
编码器层和池化层的维度大小。
num_hidden_layers (`int`, *optional*, defaults to 12):
Transformer 编码器中隐藏层的数量。
num_attention_heads (`int`, *optional*, defaults to 12):
Transformer 编码器中每个注意力层的注意头数量。
intermediate_size (`int`, *optional*, defaults to 3072):
Transformer 编码器中“中间”(即前馈)层的维度大小。
hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
编码器和池化器中的非线性激活函数(函数或字符串)。如果是字符串,支持 `"gelu"`, `"relu"`, `"selu"` 和 `"gelu_new"`。
hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
嵌入层、编码器和池化器中所有全连接层的 dropout 概率。
attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
注意力概率的 dropout 比率。
max_position_embeddings (`int`, *optional*, defaults to 512):
该模型可能使用的最大序列长度。通常设置一个大值(例如 512、1024 或 2048)以防万一。
type_vocab_size (`int`, *optional*, defaults to 1):
在调用 [`MraModel`] 时传递的 `token_type_ids` 的词汇表大小。
initializer_range (`float`, *optional*, defaults to 0.02):
用于初始化所有权重矩阵的截断正态初始化器的标准差。
layer_norm_eps (`float`, *optional*, defaults to 1e-5):
层归一化层使用的 epsilon。
position_embedding_type (`str`, *optional*, defaults to `"absolute"`):
位置嵌入的类型。选择 `"absolute"`, `"relative_key"`, `"relative_key_query"` 之一。
block_per_row (`int`, *optional*, defaults to 4):
用于设置高分辨率比例的预算。
approx_mode (`str`, *optional*, defaults to `"full"`):
控制是否使用低分辨率和高分辨率的逼近。设置为 `"full"` 表示同时使用低分辨率和高分辨率,设置为 `"sparse"` 表示仅使用低分辨率。
initial_prior_first_n_blocks (`int`, *optional*, defaults to 0):
最初使用高分辨率的块数。
initial_prior_diagonal_n_blocks (`int`, *optional*, defaults to 0):
使用高分辨率的对角块数。
Example:
>>> from transformers import MraConfig, MraModel
>>> configuration = MraConfig()
>>> model = MraModel(configuration)
>>> configuration = model.config
.\models\mra\convert_mra_pytorch_to_pytorch.py
import argparse
import torch
from transformers import MraConfig, MraForMaskedLM
def rename_key(orig_key):
if "model" in orig_key:
orig_key = orig_key.replace("model.", "")
if "norm1" in orig_key:
orig_key = orig_key.replace("norm1", "attention.output.LayerNorm")
if "norm2" in orig_key:
orig_key = orig_key.replace("norm2", "output.LayerNorm")
if "norm" in orig_key:
orig_key = orig_key.replace("norm", "LayerNorm")
if "transformer" in orig_key:
layer_num = orig_key.split(".")[0].split("_")[-1]
orig_key = orig_key.replace(f"transformer_{layer_num}", f"encoder.layer.{layer_num}")
if "mha.attn" in orig_key:
orig_key = orig_key.replace("mha.attn", "attention.self")
if "mha" in orig_key:
orig_key = orig_key.replace("mha", "attention")
if "W_q" in orig_key:
orig_key = orig_key.replace("W_q", "self.query")
if "W_k" in orig_key:
orig_key = orig_key.replace("W_k", "self.key")
if "W_v" in orig_key:
orig_key = orig_key.replace("W_v", "self.value")
if "ff.0" in orig_key:
orig_key = orig_key.replace("ff.0", "intermediate.dense")
if "ff.2" in orig_key:
orig_key = orig_key.replace("ff.2", "output.dense")
if "ff" in orig_key:
orig_key = orig_key.replace("ff", "output.dense")
if "mlm_class" in orig_key:
orig_key = orig_key.replace("mlm.mlm_class", "cls.predictions.decoder")
if "mlm" in orig_key:
orig_key = orig_key.replace("mlm", "cls.predictions.transform")
if "backbone.backbone.encoders" in orig_key:
orig_key = orig_key.replace("backbone.backbone.encoders", "encoder.layer")
if "cls" not in orig_key:
orig_key = "mra." + orig_key
return orig_key
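Tracing one hypothetical checkpoint key through `rename_key` shows how the replacements compose:

```python
# "model.transformer_0.mha.W_q.weight"
#   strip "model."                     -> "transformer_0.mha.W_q.weight"
#   transformer_0 -> encoder.layer.0   -> "encoder.layer.0.mha.W_q.weight"
#   mha -> attention                   -> "encoder.layer.0.attention.W_q.weight"
#   W_q -> self.query                  -> "encoder.layer.0.attention.self.query.weight"
#   no "cls" in the key, so add "mra." -> "mra.encoder.layer.0.attention.self.query.weight"
print(rename_key("model.transformer_0.mha.W_q.weight"))
```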
def convert_checkpoint_helper(max_position_embeddings, orig_state_dict):
for key in orig_state_dict.copy().keys():
val = orig_state_dict.pop(key)
if ("pooler" in key) or ("sen_class" in key):
continue
else:
orig_state_dict[rename_key(key)] = val
orig_state_dict["cls.predictions.bias"] = orig_state_dict["cls.predictions.decoder.bias"]
orig_state_dict["mra.embeddings.position_ids"] = torch.arange(max_position_embeddings).expand((1, -1)) + 2
return orig_state_dict
def convert_mra_checkpoint(checkpoint_path, mra_config_file, pytorch_dump_path):
orig_state_dict = torch.load(checkpoint_path, map_location="cpu")["model_state_dict"]
config = MraConfig.from_json_file(mra_config_file)
model = MraForMaskedLM(config)
new_state_dict = convert_checkpoint_helper(config.max_position_embeddings, orig_state_dict)
print(model.load_state_dict(new_state_dict))
model.eval()
model.save_pretrained(pytorch_dump_path)
print(f"Checkpoint successfuly converted. Model saved at {pytorch_dump_path}")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--pytorch_model_path", default=None, type=str, required=True, help="Path to Mra pytorch checkpoint."
)
parser.add_argument(
"--config_file",
default=None,
type=str,
required=True,
help="The json file for Mra model config.",
)
parser.add_argument(
"--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
)
args = parser.parse_args()
convert_mra_checkpoint(args.pytorch_model_path, args.config_file, args.pytorch_dump_path)
.\models\mra\modeling_mra.py
""" PyTorch MRA model."""
import math
from pathlib import Path
from typing import Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from torch.utils.cpp_extension import load
from ...activations import ACT2FN
from ...modeling_outputs import (
BaseModelOutputWithCrossAttentions,
MaskedLMOutput,
MultipleChoiceModelOutput,
QuestionAnsweringModelOutput,
SequenceClassifierOutput,
TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import (
apply_chunking_to_forward,
find_pruneable_heads_and_indices,
prune_linear_layer,
)
from ...utils import (
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
is_ninja_available,
is_torch_cuda_available,
logging,
)
from .configuration_mra import MraConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "uw-madison/mra-base-512-4"
_CONFIG_FOR_DOC = "MraConfig"
_TOKENIZER_FOR_DOC = "AutoTokenizer"
MRA_PRETRAINED_MODEL_ARCHIVE_LIST = [
"uw-madison/mra-base-512-4",
]
mra_cuda_kernel = None
def load_cuda_kernels():
global mra_cuda_kernel
src_folder = Path(__file__).resolve().parent.parent.parent / "kernels" / "mra"
def append_root(files):
return [src_folder / file for file in files]
src_files = append_root(["cuda_kernel.cu", "cuda_launch.cu", "torch_extension.cpp"])
mra_cuda_kernel = load("cuda_kernel", src_files, verbose=True)
def sparse_max(sparse_qk_prod, indices, query_num_block, key_num_block):
"""
Computes maximum values for softmax stability.
"""
if len(sparse_qk_prod.size()) != 4:
raise ValueError("sparse_qk_prod must be a 4-dimensional tensor.")
if len(indices.size()) != 2:
raise ValueError("indices must be a 2-dimensional tensor.")
if sparse_qk_prod.size(2) != 32:
raise ValueError("The size of the second dimension of sparse_qk_prod must be 32.")
if sparse_qk_prod.size(3) != 32:
raise ValueError("The size of the third dimension of sparse_qk_prod must be 32.")
index_vals = sparse_qk_prod.max(dim=-2).values.transpose(-1, -2)
index_vals = index_vals.contiguous()
indices = indices.int()
indices = indices.contiguous()
max_vals, max_vals_scatter = mra_cuda_kernel.index_max(index_vals, indices, query_num_block, key_num_block)
max_vals_scatter = max_vals_scatter.transpose(-1, -2)[:, :, None, :]
return max_vals, max_vals_scatter
def sparse_mask(mask, indices, block_size=32):
if len(mask.size()) != 2:
raise ValueError("mask must be a 2-dimensional tensor.")
if len(indices.size()) != 2:
raise ValueError("indices must be a 2-dimensional tensor.")
if mask.shape[0] != indices.shape[0]:
raise ValueError("mask and indices must have the same size in the zero-th dimension.")
batch_size, seq_len = mask.shape
num_block = seq_len // block_size
batch_idx = torch.arange(indices.size(0), dtype=torch.long, device=indices.device)
mask = mask.reshape(batch_size, num_block, block_size)
mask = mask[batch_idx[:, None], (indices % num_block).long(), :]
return mask
def mm_to_sparse(dense_query, dense_key, indices, block_size=32):
batch_size, query_size, dim = dense_query.size()
_, key_size, dim = dense_key.size()
if query_size % block_size != 0:
raise ValueError("query_size (size of first dimension of dense_query) must be divisible by block_size.")
if key_size % block_size != 0:
raise ValueError("key_size (size of first dimension of dense_key) must be divisible by block_size.")
dense_query = dense_query.reshape(batch_size, query_size // block_size, block_size, dim).transpose(-1, -2)
dense_key = dense_key.reshape(batch_size, key_size // block_size, block_size, dim).transpose(-1, -2)
if len(dense_query.size()) != 4:
raise ValueError("dense_query must be a 4-dimensional tensor.")
if len(dense_key.size()) != 4:
raise ValueError("dense_key must be a 4-dimensional tensor.")
if len(indices.size()) != 2:
raise ValueError("indices must be a 2-dimensional tensor.")
if dense_query.size(3) != 32:
raise ValueError("The third dimension of dense_query must be 32.")
if dense_key.size(3) != 32:
raise ValueError("The third dimension of dense_key must be 32.")
dense_query = dense_query.contiguous()
dense_key = dense_key.contiguous()
indices = indices.int().contiguous()
return mra_cuda_kernel.mm_to_sparse(dense_query, dense_key, indices.int())
def sparse_dense_mm(sparse_query, indices, dense_key, query_num_block, block_size=32):
batch_size, key_size, dim = dense_key.size()
if key_size % block_size != 0:
raise ValueError("key_size (size of first dimension of dense_key) must be divisible by block_size.")
if sparse_query.size(2) != block_size:
raise ValueError("The size of the second dimension of sparse_query must be equal to the block_size.")
if sparse_query.size(3) != block_size:
raise ValueError("The size of the third dimension of sparse_query must be equal to the block_size.")
dense_key = dense_key.reshape(batch_size, key_size // block_size, block_size, dim).transpose(-1, -2)
if len(sparse_query.size()) != 4:
raise ValueError("sparse_query must be a 4-dimensional tensor.")
if len(dense_key.size()) != 4:
raise ValueError("dense_key must be a 4-dimensional tensor.")
if len(indices.size()) != 2:
raise ValueError("indices must be a 2-dimensional tensor.")
if dense_key.size(3) != 32:
raise ValueError("The size of the third dimension of dense_key must be 32.")
sparse_query = sparse_query.contiguous()
indices = indices.int()
indices = indices.contiguous()
dense_key = dense_key.contiguous()
dense_qk_prod = mra_cuda_kernel.sparse_dense_mm(sparse_query, indices, dense_key, query_num_block)
dense_qk_prod = dense_qk_prod.transpose(-1, -2).reshape(batch_size, query_num_block * block_size, dim)
return dense_qk_prod
def transpose_indices(indices, dim_1_block, dim_2_block):
return ((indices % dim_2_block) * dim_1_block + torch.div(indices, dim_2_block, rounding_mode="floor")).long()
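A quick sanity check of `transpose_indices`: for a 2 x 3 grid of blocks stored row-major, the helper returns the flat index of the same block in the transposed 3 x 2 grid. The values below are worked out by hand.

```
import torch

# Flat indices of a 2 x 3 block grid, row-major: index = row * 3 + col
indices = torch.tensor([[0, 1, 2, 3, 4, 5]])
# In the transposed 3 x 2 grid the same blocks sit at index = col * 2 + row
print(transpose_indices(indices, dim_1_block=2, dim_2_block=3))
# tensor([[0, 2, 4, 1, 3, 5]])
```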
class MraSampledDenseMatMul(torch.autograd.Function):
@staticmethod
def forward(ctx, dense_query, dense_key, indices, block_size):
sparse_qk_prod = mm_to_sparse(dense_query, dense_key, indices, block_size)
ctx.save_for_backward(dense_query, dense_key, indices)
ctx.block_size = block_size
return sparse_qk_prod
@staticmethod
def backward(ctx, grad):
dense_query, dense_key, indices = ctx.saved_tensors
block_size = ctx.block_size
query_num_block = dense_query.size(1) // block_size
key_num_block = dense_key.size(1) // block_size
indices_T = transpose_indices(indices, query_num_block, key_num_block)
grad_key = sparse_dense_mm(grad.transpose(-1, -2), indices_T, dense_query, key_num_block)
grad_query = sparse_dense_mm(grad, indices, dense_key, query_num_block)
return grad_query, grad_key, None, None
@staticmethod
def operator_call(dense_query, dense_key, indices, block_size=32):
return MraSampledDenseMatMul.apply(dense_query, dense_key, indices, block_size)
class MraSparseDenseMatMul(torch.autograd.Function):
@staticmethod
def forward(ctx, sparse_query, indices, dense_key, query_num_block):
sparse_qk_prod = sparse_dense_mm(sparse_query, indices, dense_key, query_num_block)
ctx.save_for_backward(sparse_query, indices, dense_key)
ctx.query_num_block = query_num_block
return sparse_qk_prod
@staticmethod
def backward(ctx, grad):
sparse_query, indices, dense_key = ctx.saved_tensors
query_num_block = ctx.query_num_block
key_num_block = dense_key.size(1) // sparse_query.size(-1)
indices_T = transpose_indices(indices, query_num_block, key_num_block)
grad_key = sparse_dense_mm(sparse_query.transpose(-1, -2), indices_T, grad, key_num_block)
grad_query = mm_to_sparse(grad, dense_key, indices)
return grad_query, None, grad_key, None
@staticmethod
def operator_call(sparse_query, indices, dense_key, query_num_block):
return MraSparseDenseMatMul.apply(sparse_query, indices, dense_key, query_num_block)
class MraReduceSum:
@staticmethod
def operator_call(sparse_query, indices, query_num_block, key_num_block):
batch_size, num_block, block_size, _ = sparse_query.size()
if len(sparse_query.size()) != 4:
raise ValueError("sparse_query must be a 4-dimensional tensor.")
if len(indices.size()) != 2:
raise ValueError("indices must be a 2-dimensional tensor.")
_, _, block_size, _ = sparse_query.size()
batch_size, num_block = indices.size()
sparse_query = sparse_query.sum(dim=2).reshape(batch_size * num_block, block_size)
batch_idx = torch.arange(indices.size(0), dtype=torch.long, device=indices.device)
global_idxes = (
torch.div(indices, key_num_block, rounding_mode="floor").long() + batch_idx[:, None] * query_num_block
).reshape(batch_size * num_block)
temp = torch.zeros(
(batch_size * query_num_block, block_size), dtype=sparse_query.dtype, device=sparse_query.device
)
output = temp.index_add(0, global_idxes, sparse_query).reshape(batch_size, query_num_block, block_size)
output = output.reshape(batch_size, query_num_block * block_size)
return output
def get_low_resolution_logit(query, key, block_size, mask=None, value=None):
    """
    Compute low resolution approximation.
    """
    batch_size, seq_len, head_dim = query.size()
num_block_per_row = seq_len // block_size
value_hat = None
if mask is not None:
token_count = mask.reshape(batch_size, num_block_per_row, block_size).sum(dim=-1)
query_hat = query.reshape(batch_size, num_block_per_row, block_size, head_dim).sum(dim=-2) / (
token_count[:, :, None] + 1e-6
)
key_hat = key.reshape(batch_size, num_block_per_row, block_size, head_dim).sum(dim=-2) / (
token_count[:, :, None] + 1e-6
)
if value is not None:
value_hat = value.reshape(batch_size, num_block_per_row, block_size, head_dim).sum(dim=-2) / (
token_count[:, :, None] + 1e-6
)
else:
token_count = block_size * torch.ones(batch_size, num_block_per_row, dtype=torch.float, device=query.device)
query_hat = query.reshape(batch_size, num_block_per_row, block_size, head_dim).mean(dim=-2)
key_hat = key.reshape(batch_size, num_block_per_row, block_size, head_dim).mean(dim=-2)
if value is not None:
value_hat = value.reshape(batch_size, num_block_per_row, block_size, head_dim).mean(dim=-2)
low_resolution_logit = torch.matmul(query_hat, key_hat.transpose(-1, -2)) / math.sqrt(head_dim)
low_resolution_logit_row_max = low_resolution_logit.max(dim=-1, keepdims=True).values
if mask is not None:
low_resolution_logit = (
low_resolution_logit - 1e4 * ((token_count[:, None, :] * token_count[:, :, None]) < 0.5).float()
)
return low_resolution_logit, token_count, low_resolution_logit_row_max, value_hat
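The function above is pure PyTorch, so its output shapes can be checked without the CUDA kernel. A minimal sketch with illustrative sizes, calling it without an attention mask or value tensor (they default to `None`, as at the sparse-mode call site below):

```
import torch

block_size = 32
query = torch.randn(2, 4 * block_size, 32)  # (meta_batch, seq_len, head_dim)
key = torch.randn(2, 4 * block_size, 32)
logit, token_count, row_max, value_hat = get_low_resolution_logit(query, key, block_size)
print(logit.shape)        # torch.Size([2, 4, 4]): one logit per (query block, key block) pair
print(token_count.shape)  # torch.Size([2, 4]): tokens per block (block_size when no mask is given)
print(value_hat)          # None, since no value tensor was passed
```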
indices = top_k_vals.indices
if approx_mode == "full":
threshold = top_k_vals.values.min(dim=-1).values
high_resolution_mask = (low_resolution_logit >= threshold[:, None, None]).float()
elif approx_mode == "sparse":
high_resolution_mask = None
else:
raise ValueError(f"{approx_mode} is not a valid approx_model value.")
return indices, high_resolution_mask
"""
使用 Mra 来近似自注意力机制。
"""
if mra_cuda_kernel is None:
return torch.zeros_like(query).requires_grad_()
batch_size, num_head, seq_len, head_dim = query.size()
meta_batch = batch_size * num_head
if seq_len % block_size != 0:
raise ValueError("sequence length must be divisible by the block_size.")
num_block_per_row = seq_len // block_size
query = query.reshape(meta_batch, seq_len, head_dim)
key = key.reshape(meta_batch, seq_len, head_dim)
value = value.reshape(meta_batch, seq_len, head_dim)
if mask is not None:
query = query * mask[:, :, None]
key = key * mask[:, :, None]
value = value * mask[:, :, None]
if approx_mode == "full":
low_resolution_logit, token_count, low_resolution_logit_row_max, value_hat = get_low_resolution_logit(
query, key, block_size, mask, value
)
elif approx_mode == "sparse":
with torch.no_grad():
low_resolution_logit, token_count, low_resolution_logit_row_max, _ = get_low_resolution_logit(
query, key, block_size, mask
)
else:
raise Exception('approx_mode must be "full" or "sparse"')
with torch.no_grad():
low_resolution_logit_normalized = low_resolution_logit - low_resolution_logit_row_max
indices, high_resolution_mask = get_block_idxes(
low_resolution_logit_normalized,
num_blocks,
approx_mode,
initial_prior_first_n_blocks,
initial_prior_diagonal_n_blocks,
)
high_resolution_logit = MraSampledDenseMatMul.operator_call(
query, key, indices, block_size=block_size
) / math.sqrt(head_dim)
max_vals, max_vals_scatter = sparse_max(high_resolution_logit, indices, num_block_per_row, num_block_per_row)
high_resolution_logit = high_resolution_logit - max_vals_scatter
if mask is not None:
high_resolution_logit = high_resolution_logit - 1e4 * (1 - sparse_mask(mask, indices)[:, :, :, None])
high_resolution_attn = torch.exp(high_resolution_logit)
high_resolution_attn_out = MraSparseDenseMatMul.operator_call(
high_resolution_attn, indices, value, num_block_per_row
)
high_resolution_normalizer = MraReduceSum.operator_call(
high_resolution_attn, indices, num_block_per_row, num_block_per_row
)
if approx_mode == "full":
low_resolution_attn = (
torch.exp(low_resolution_logit - low_resolution_logit_row_max - 1e4 * high_resolution_mask)
* token_count[:, None, :]
)
low_resolution_attn_out = (
torch.matmul(low_resolution_attn, value_hat)[:, :, None, :]
.repeat(1, 1, block_size, 1)
.reshape(meta_batch, seq_len, head_dim)
)
low_resolution_normalizer = (
low_resolution_attn.sum(dim=-1)[:, :, None].repeat(1, 1, block_size).reshape(meta_batch, seq_len)
)
log_correction = low_resolution_logit_row_max.repeat(1, 1, block_size).reshape(meta_batch, seq_len) - max_vals
if mask is not None:
log_correction = log_correction * mask
low_resolution_corr = torch.exp(log_correction * (log_correction <= 0).float())
low_resolution_attn_out = low_resolution_attn_out * low_resolution_corr[:, :, None]
low_resolution_normalizer = low_resolution_normalizer * low_resolution_corr
high_resolution_corr = torch.exp(-log_correction * (log_correction > 0).float())
high_resolution_attn_out = high_resolution_attn_out * high_resolution_corr[:, :, None]
high_resolution_normalizer = high_resolution_normalizer * high_resolution_corr
context_layer = (high_resolution_attn_out + low_resolution_attn_out) / (
high_resolution_normalizer[:, :, None] + low_resolution_normalizer[:, :, None] + 1e-6
)
elif approx_mode == "sparse":
context_layer = high_resolution_attn_out / (high_resolution_normalizer[:, :, None] + 1e-6)
else:
raise Exception('config.approx_mode must be "full" or "sparse"')
if mask is not None:
context_layer = context_layer * mask[:, :, None]
context_layer = context_layer.reshape(batch_size, num_head, seq_len, head_dim)
return context_layer
class MraEmbeddings(nn.Module):
"""Construct the embeddings from word, position and token_type embeddings."""
def __init__(self, config):
super().__init__()
self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
self.position_embeddings = nn.Embedding(config.max_position_embeddings + 2, config.hidden_size)
self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)) + 2)
self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
self.register_buffer(
"token_type_ids",
torch.zeros(self.position_ids.size(), dtype=torch.long, device=self.position_ids.device),
persistent=False,
)
def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None):
if input_ids is not None:
input_shape = input_ids.size()
else:
input_shape = inputs_embeds.size()[:-1]
seq_length = input_shape[1]
if position_ids is None:
position_ids = self.position_ids[:, :seq_length]
if token_type_ids is None:
if hasattr(self, "token_type_ids"):
buffered_token_type_ids = self.token_type_ids[:, :seq_length]
buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
token_type_ids = buffered_token_type_ids_expanded
else:
token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
if inputs_embeds is None:
inputs_embeds = self.word_embeddings(input_ids)
token_type_embeddings = self.token_type_embeddings(token_type_ids)
embeddings = inputs_embeds + token_type_embeddings
if self.position_embedding_type == "absolute":
position_embeddings = self.position_embeddings(position_ids)
embeddings += position_embeddings
embeddings = self.LayerNorm(embeddings)
embeddings = self.dropout(embeddings)
return embeddings
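A minimal sketch of the embedding module in isolation, using a small hypothetical `MraConfig` (sizes are illustrative). Note that the position ids are offset by 2, which is why the position embedding table is created with `max_position_embeddings + 2` rows.

```
import torch
from transformers import MraConfig

# Small illustrative configuration
config = MraConfig(vocab_size=100, hidden_size=32, num_attention_heads=2, num_hidden_layers=1,
                   intermediate_size=64, max_position_embeddings=64)
embeddings = MraEmbeddings(config)
input_ids = torch.randint(0, 100, (2, 16))
print(embeddings(input_ids=input_ids).shape)  # torch.Size([2, 16, 32])
```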
class MraSelfAttention(nn.Module):
    def __init__(self, config, position_embedding_type=None):
super().__init__()
if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
raise ValueError(
f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
f"heads ({config.num_attention_heads})"
)
kernel_loaded = mra_cuda_kernel is not None
if is_torch_cuda_available() and is_ninja_available() and not kernel_loaded:
try:
load_cuda_kernels()
except Exception as e:
logger.warning(f"Could not load the custom kernel for multi-scale deformable attention: {e}")
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.query = nn.Linear(config.hidden_size, self.all_head_size)
self.key = nn.Linear(config.hidden_size, self.all_head_size)
self.value = nn.Linear(config.hidden_size, self.all_head_size)
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
self.position_embedding_type = (
position_embedding_type if position_embedding_type is not None else config.position_embedding_type
)
self.num_block = (config.max_position_embeddings // 32) * config.block_per_row
self.num_block = min(self.num_block, int((config.max_position_embeddings // 32) ** 2))
self.approx_mode = config.approx_mode
self.initial_prior_first_n_blocks = config.initial_prior_first_n_blocks
self.initial_prior_diagonal_n_blocks = config.initial_prior_diagonal_n_blocks
def transpose_for_scores(self, layer):
new_layer_shape = layer.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
layer = layer.view(*new_layer_shape)
return layer.permute(0, 2, 1, 3)
def forward(self, hidden_states, attention_mask=None):
mixed_query_layer = self.query(hidden_states)
key_layer = self.transpose_for_scores(self.key(hidden_states))
value_layer = self.transpose_for_scores(self.value(hidden_states))
query_layer = self.transpose_for_scores(mixed_query_layer)
batch_size, num_heads, seq_len, head_dim = query_layer.size()
attention_mask = 1.0 + attention_mask / 10000.0
attention_mask = (
attention_mask.squeeze().repeat(1, num_heads, 1).reshape(batch_size * num_heads, seq_len).int()
)
gpu_warp_size = 32
if head_dim < gpu_warp_size:
pad_size = batch_size, num_heads, seq_len, gpu_warp_size - head_dim
query_layer = torch.cat([query_layer, torch.zeros(pad_size, device=query_layer.device)], dim=-1)
key_layer = torch.cat([key_layer, torch.zeros(pad_size, device=key_layer.device)], dim=-1)
value_layer = torch.cat([value_layer, torch.zeros(pad_size, device=value_layer.device)], dim=-1)
context_layer = mra2_attention(
query_layer.float(),
key_layer.float(),
value_layer.float(),
attention_mask.float(),
self.num_block,
approx_mode=self.approx_mode,
initial_prior_first_n_blocks=self.initial_prior_first_n_blocks,
initial_prior_diagonal_n_blocks=self.initial_prior_diagonal_n_blocks,
)
if head_dim < gpu_warp_size:
context_layer = context_layer[:, :, :, :head_dim]
context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
context_layer = context_layer.view(*new_context_layer_shape)
outputs = (context_layer,)
return outputs
class MraSelfOutput(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
class MraAttention(nn.Module):
def __init__(self, config, position_embedding_type=None):
super().__init__()
self.self = MraSelfAttention(config, position_embedding_type=position_embedding_type)
self.output = MraSelfOutput(config)
self.pruned_heads = set()
def prune_heads(self, heads):
if len(heads) == 0:
return
heads, index = find_pruneable_heads_and_indices(
heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
)
self.self.query = prune_linear_layer(self.self.query, index)
self.self.key = prune_linear_layer(self.self.key, index)
self.self.value = prune_linear_layer(self.self.value, index)
self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
self.pruned_heads = self.pruned_heads.union(heads)
def forward(self, hidden_states, attention_mask=None):
self_outputs = self.self(hidden_states, attention_mask)
attention_output = self.output(self_outputs[0], hidden_states)
outputs = (attention_output,) + self_outputs[1:]
return outputs
class MraIntermediate(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states
class MraOutput(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
class MraLayer(nn.Module):
def __init__(self, config):
super().__init__()
self.chunk_size_feed_forward = config.chunk_size_feed_forward
self.seq_len_dim = 1
self.attention = MraAttention(config)
self.add_cross_attention = config.add_cross_attention
self.intermediate = MraIntermediate(config)
self.output = MraOutput(config)
def forward(self, hidden_states, attention_mask=None):
self_attention_outputs = self.attention(hidden_states, attention_mask)
attention_output = self_attention_outputs[0]
outputs = self_attention_outputs[1:]
layer_output = apply_chunking_to_forward(
self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
)
outputs = (layer_output,) + outputs
return outputs
def feed_forward_chunk(self, attention_output):
intermediate_output = self.intermediate(attention_output)
layer_output = self.output(intermediate_output, attention_output)
return layer_output
class MraEncoder(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
self.layer = nn.ModuleList([MraLayer(config) for _ in range(config.num_hidden_layers)])
self.gradient_checkpointing = False
def forward(
self,
hidden_states,
attention_mask=None,
head_mask=None,
output_hidden_states=False,
return_dict=True,
):
all_hidden_states = () if output_hidden_states else None
for i, layer_module in enumerate(self.layer):
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if self.gradient_checkpointing and self.training:
layer_outputs = self._gradient_checkpointing_func(
layer_module.__call__,
hidden_states,
attention_mask,
)
else:
layer_outputs = layer_module(hidden_states, attention_mask)
hidden_states = layer_outputs[0]
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
return tuple(v for v in [hidden_states, all_hidden_states] if v is not None)
return BaseModelOutputWithCrossAttentions(
last_hidden_state=hidden_states,
hidden_states=all_hidden_states,
)
class MraPredictionHeadTransform(nn.Module):
    def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
if isinstance(config.hidden_act, str):
self.transform_act_fn = ACT2FN[config.hidden_act]
else:
self.transform_act_fn = config.hidden_act
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.transform_act_fn(hidden_states)
hidden_states = self.LayerNorm(hidden_states)
return hidden_states
class MraLMPredictionHead(nn.Module):
def __init__(self, config):
super().__init__()
self.transform = MraPredictionHeadTransform(config)
self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
self.bias = nn.Parameter(torch.zeros(config.vocab_size))
self.decoder.bias = self.bias
def forward(self, hidden_states):
hidden_states = self.transform(hidden_states)
hidden_states = self.decoder(hidden_states)
return hidden_states
class MraOnlyMLMHead(nn.Module):
def __init__(self, config):
super().__init__()
self.predictions = MraLMPredictionHead(config)
def forward(self, sequence_output: torch.Tensor) -> torch.Tensor:
prediction_scores = self.predictions(sequence_output)
return prediction_scores
class MraPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = MraConfig
base_model_prefix = "mra"
supports_gradient_checkpointing = True
def _init_weights(self, module):
"""初始化权重"""
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
"""
This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
"""
Parameters:
config ([`MraConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
定义一个多层次的文档字符串,描述了模型输入的各种参数和返回的内容。
"""
@add_start_docstrings(
"The bare MRA Model transformer outputting raw hidden-states without any specific head on top.",
MRA_START_DOCSTRING,
)
class MraModel(MraPreTrainedModel):
"""
MRA模型类,继承自MraPreTrainedModel,用于输出未经任何特定头部处理的原始隐藏状态。
Args:
config (MraConfig): 包含模型配置信息的配置对象。
Attributes:
config (MraConfig): 模型的配置信息对象。
embeddings (MraEmbeddings): MRA模型的嵌入层。
encoder (MraEncoder): MRA模型的编码器层。
"""
def __init__(self, config):
"""
初始化方法,设置模型的各个组件。
Args:
config (MraConfig): 包含模型配置信息的配置对象。
"""
super().__init__(config)
self.config = config
self.embeddings = MraEmbeddings(config)
self.encoder = MraEncoder(config)
self.post_init()
def get_input_embeddings(self):
"""
返回模型的嵌入层的词嵌入。
Returns:
torch.nn.Embedding: 返回模型的嵌入层的词嵌入。
"""
return self.embeddings.word_embeddings
def set_input_embeddings(self, value):
self.embeddings.word_embeddings = value
def _prune_heads(self, heads_to_prune):
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
@add_start_docstrings_to_model_forward(MRA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithCrossAttentions,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
@add_start_docstrings("""MRA Model with a `language modeling` head on top.""", MRA_START_DOCSTRING)
class MraForMaskedLM(MraPreTrainedModel):
_tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"]
def __init__(self, config):
super().__init__(config)
self.mra = MraModel(config)
self.cls = MraOnlyMLMHead(config)
self.post_init()
def get_output_embeddings(self):
return self.cls.predictions.decoder
def set_output_embeddings(self, new_embeddings):
self.cls.predictions.decoder = new_embeddings
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, MaskedLMOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.mra(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
prediction_scores = self.cls(sequence_output)
masked_lm_loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
if not return_dict:
output = (prediction_scores,) + outputs[1:]
return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
return MaskedLMOutput(
loss=masked_lm_loss,
logits=prediction_scores,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
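A minimal sketch of a masked-LM forward pass with a small, randomly initialized configuration (sizes are illustrative). On a machine without the custom CUDA kernel the attention falls back to the zero tensor returned by `mra2_attention`, so the numbers are only useful for checking shapes.

```
import torch
from transformers import MraConfig

config = MraConfig(vocab_size=1000, hidden_size=64, num_hidden_layers=2, num_attention_heads=2,
                   intermediate_size=128, max_position_embeddings=256)
model = MraForMaskedLM(config)
input_ids = torch.randint(0, 1000, (2, 64))
outputs = model(input_ids=input_ids, labels=input_ids.clone())
print(outputs.loss, outputs.logits.shape)  # scalar loss, torch.Size([2, 64, 1000])
```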
class MraClassificationHead(nn.Module):
"""Head for sentence-level classification tasks."""
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
self.config = config
def forward(self, features, **kwargs):
x = features[:, 0, :]
x = self.dropout(x)
x = self.dense(x)
x = ACT2FN[self.config.hidden_act](x)
x = self.dropout(x)
x = self.out_proj(x)
return x
@add_start_docstrings(
"""MRA Model transformer with a sequence classification/regression head on top (a linear layer on top of
the pooled output) e.g. for GLUE tasks.""",
MRA_START_DOCSTRING,
)
class MraForSequenceClassification(MraPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.mra = MraModel(config)
self.classifier = MraClassificationHead(config)
self.post_init()
@add_start_docstrings_to_model_forward(MRA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=SequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, SequenceClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.mra(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
logits = self.classifier(sequence_output)
loss = None
if labels is not None:
if self.config.problem_type is None:
if self.num_labels == 1:
self.config.problem_type = "regression"
elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
self.config.problem_type = "single_label_classification"
else:
self.config.problem_type = "multi_label_classification"
if self.config.problem_type == "regression":
loss_fct = MSELoss()
if self.num_labels == 1:
loss = loss_fct(logits.squeeze(), labels.squeeze())
else:
loss = loss_fct(logits, labels)
elif self.config.problem_type == "single_label_classification":
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(logits, labels)
if not return_dict:
output = (logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
return SequenceClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
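The `problem_type` dispatch above can be summarized in isolation: the loss is chosen from the number of labels and the label dtype. A standalone sketch with made-up labels:

```
import torch

def pick_problem_type(num_labels, labels):
    if num_labels == 1:
        return "regression"                      # MSELoss
    if num_labels > 1 and labels.dtype in (torch.long, torch.int):
        return "single_label_classification"     # CrossEntropyLoss
    return "multi_label_classification"          # BCEWithLogitsLoss

print(pick_problem_type(1, torch.tensor([0.7, -1.2])))        # regression
print(pick_problem_type(3, torch.tensor([0, 2])))             # single_label_classification
print(pick_problem_type(3, torch.tensor([[1.0, 0.0, 1.0]])))  # multi_label_classification
```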
@add_start_docstrings(
"""MRA Model with a multiple choice classification head on top (a linear layer on top of
the pooled output and a softmax) e.g. for RocStories/SWAG tasks.""",
MRA_START_DOCSTRING,
)
class MraForMultipleChoice(MraPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.mra = MraModel(config)
self.pre_classifier = nn.Linear(config.hidden_size, config.hidden_size)
self.classifier = nn.Linear(config.hidden_size, 1)
self.post_init()
@add_start_docstrings_to_model_forward(MRA_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=MultipleChoiceModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, MultipleChoiceModelOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
`input_ids` above)
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
inputs_embeds = (
inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
if inputs_embeds is not None
else None
)
outputs = self.mra(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
hidden_state = outputs[0]
pooled_output = hidden_state[:, 0]
pooled_output = self.pre_classifier(pooled_output)
pooled_output = nn.ReLU()(pooled_output)
logits = self.classifier(pooled_output)
reshaped_logits = logits.view(-1, num_choices)
loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
loss = loss_fct(reshaped_logits, labels)
if not return_dict:
output = (reshaped_logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
return MultipleChoiceModelOutput(
loss=loss,
logits=reshaped_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
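The reshaping in the multiple-choice head is easiest to see with concrete, illustrative sizes: every choice is flattened into its own row before the encoder, and the per-choice scores are folded back into `(batch, num_choices)` at the end.

```
import torch

batch, num_choices, seq_len = 2, 4, 16
input_ids = torch.randint(0, 100, (batch, num_choices, seq_len))
flat_input_ids = input_ids.view(-1, input_ids.size(-1))  # (8, 16): one row per (example, choice)
logits = torch.randn(batch * num_choices, 1)             # one score per flattened row
reshaped_logits = logits.view(-1, num_choices)           # (2, 4): scores grouped back per example
print(flat_input_ids.shape, reshaped_logits.shape)
```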
@add_start_docstrings(
"""MRA Model with a token classification head on top (a linear layer on top of
the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks.""",
MRA_START_DOCSTRING,
)
class MraForTokenClassification(MraPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.mra = MraModel(config)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.post_init()
@add_start_docstrings_to_model_forward(MRA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TokenClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.mra(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
sequence_output = self.dropout(sequence_output)
logits = self.classifier(sequence_output)
loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
if attention_mask is not None:
active_loss = attention_mask.view(-1) == 1
active_logits = logits.view(-1, self.num_labels)
active_labels = torch.where(
active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels)
)
loss = loss_fct(active_logits, active_labels)
else:
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
if not return_dict:
output = (logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
return TokenClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
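The active-loss masking above simply rewrites the labels of padded positions to `ignore_index` so they do not contribute to the token-classification loss. A standalone sketch with made-up values:

```
import torch
from torch.nn import CrossEntropyLoss

loss_fct = CrossEntropyLoss()
logits = torch.randn(1, 4, 3)                  # (batch, seq_len, num_labels)
labels = torch.tensor([[2, 0, 1, 1]])
attention_mask = torch.tensor([[1, 1, 0, 0]])  # last two tokens are padding
active_labels = torch.where(
    attention_mask.view(-1) == 1, labels.view(-1), torch.tensor(loss_fct.ignore_index)
)
print(active_labels)                           # tensor([   2,    0, -100, -100])
print(loss_fct(logits.view(-1, 3), active_labels))
```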
@add_start_docstrings(
"""MRA Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
layers on top of the hidden-states output to compute `span start logits` and `span end logits`).""",
MRA_START_DOCSTRING,
)
class MraForQuestionAnswering(MraPreTrainedModel):
def __init__(self, config):
super().__init__(config)
config.num_labels = 2
self.num_labels = config.num_labels
self.mra = MraModel(config)
self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
self.post_init()
@add_start_docstrings_to_model_forward(MRA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=QuestionAnsweringModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
start_positions: Optional[torch.Tensor] = None,
end_positions: Optional[torch.Tensor] = None,
output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, QuestionAnsweringModelOutput]:
r"""
start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.mra(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
logits = self.qa_outputs(sequence_output)
start_logits, end_logits = logits.split(1, dim=-1)
start_logits = start_logits.squeeze(-1)
end_logits = end_logits.squeeze(-1)
total_loss = None
if start_positions is not None and end_positions is not None:
if len(start_positions.size()) > 1:
start_positions = start_positions.squeeze(-1)
if len(end_positions.size()) > 1:
end_positions = end_positions.squeeze(-1)
ignored_index = start_logits.size(1)
start_positions = start_positions.clamp(0, ignored_index)
end_positions = end_positions.clamp(0, ignored_index)
loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
start_loss = loss_fct(start_logits, start_positions)
end_loss = loss_fct(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2
if not return_dict:
output = (start_logits, end_logits) + outputs[1:]
return ((total_loss,) + output) if total_loss is not None else output
return QuestionAnsweringModelOutput(
loss=total_loss,
start_logits=start_logits,
end_logits=end_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
.\models\mra\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_tokenizers_available, is_torch_available
_import_structure = {"configuration_mra": ["MRA_PRETRAINED_CONFIG_ARCHIVE_MAP", "MraConfig"]}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_mra"] = [
"MRA_PRETRAINED_MODEL_ARCHIVE_LIST",
"MraForMaskedLM",
"MraForMultipleChoice",
"MraForQuestionAnswering",
"MraForSequenceClassification",
"MraForTokenClassification",
"MraLayer",
"MraModel",
"MraPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_mra import MRA_PRETRAINED_CONFIG_ARCHIVE_MAP, MraConfig
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_mra import (
MRA_PRETRAINED_MODEL_ARCHIVE_LIST,
MraForMaskedLM,
MraForMultipleChoice,
MraForQuestionAnswering,
MraForSequenceClassification,
MraForTokenClassification,
MraLayer,
MraModel,
MraPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure)
.\models\mt5\configuration_mt5.py
from typing import Mapping
from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxSeq2SeqConfigWithPast
from ...utils import logging
logger = logging.get_logger(__name__)
class MT5Config(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`MT5Model`] or a [`TFMT5Model`]. It is used to
instantiate a mT5 model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the mT5
[google/mt5-small](https://huggingface.co/google/mt5-small) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
"""
model_type = "mt5"
keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {"hidden_size": "d_model", "num_attention_heads": "num_heads", "num_hidden_layers": "num_layers"}
def __init__(
self,
vocab_size=250112,
d_model=512,
d_kv=64,
d_ff=1024,
num_layers=8,
num_decoder_layers=None,
num_heads=6,
relative_attention_num_buckets=32,
relative_attention_max_distance=128,
dropout_rate=0.1,
layer_norm_epsilon=1e-6,
initializer_factor=1.0,
feed_forward_proj="gated-gelu",
is_encoder_decoder=True,
use_cache=True,
tokenizer_class="T5Tokenizer",
tie_word_embeddings=False,
pad_token_id=0,
eos_token_id=1,
decoder_start_token_id=0,
classifier_dropout=0.0,
**kwargs,
):
self.vocab_size = vocab_size
self.d_model = d_model
self.d_kv = d_kv
self.d_ff = d_ff
self.num_layers = num_layers
self.num_decoder_layers = (
num_decoder_layers if num_decoder_layers is not None else self.num_layers
)
self.num_heads = num_heads
self.relative_attention_num_buckets = relative_attention_num_buckets
self.relative_attention_max_distance = relative_attention_max_distance
self.dropout_rate = dropout_rate
self.classifier_dropout = classifier_dropout
self.layer_norm_epsilon = layer_norm_epsilon
self.initializer_factor = initializer_factor
self.feed_forward_proj = feed_forward_proj
self.use_cache = use_cache
act_info = self.feed_forward_proj.split("-")
self.dense_act_fn = act_info[-1]
self.is_gated_act = act_info[0] == "gated"
if len(act_info) > 1 and act_info[0] != "gated" or len(act_info) > 2:
raise ValueError(
f"`feed_forward_proj`: {feed_forward_proj} is not a valid activation function of the dense layer. "
"Please make sure `feed_forward_proj` is of the format `gated-{ACT_FN}` or `{ACT_FN}`, e.g. "
"'gated-gelu' or 'relu'"
)
if feed_forward_proj == "gated-gelu":
self.dense_act_fn = "gelu_new"
super().__init__(
is_encoder_decoder=is_encoder_decoder,
tokenizer_class=tokenizer_class,
tie_word_embeddings=tie_word_embeddings,
pad_token_id=pad_token_id,
eos_token_id=eos_token_id,
decoder_start_token_id=decoder_start_token_id,
**kwargs,
)
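The `feed_forward_proj` parsing above derives two fields from a single string. A small sketch, checked against the logic shown in `__init__` (including the `"gated-gelu"` special case):

```
from transformers import MT5Config

config = MT5Config(feed_forward_proj="gated-gelu")
print(config.is_gated_act, config.dense_act_fn)  # True gelu_new  ("gated-gelu" is special-cased)
config = MT5Config(feed_forward_proj="relu")
print(config.is_gated_act, config.dense_act_fn)  # False relu
```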
class MT5OnnxConfig(OnnxSeq2SeqConfigWithPast):
@property
def inputs(self) -> Mapping[str, Mapping[int, str]]:
common_inputs = {
"input_ids": {0: "batch", 1: "encoder_sequence"},
"attention_mask": {0: "batch", 1: "encoder_sequence"},
}
if self.use_past:
common_inputs["attention_mask"][1] = "past_encoder_sequence + sequence"
common_inputs["decoder_input_ids"] = {0: "batch"}
common_inputs["decoder_attention_mask"] = {0: "batch", 1: "past_decoder_sequence + sequence"}
else:
common_inputs["decoder_input_ids"] = {0: "batch", 1: "decoder_sequence"}
common_inputs["decoder_attention_mask"] = {0: "batch", 1: "decoder_sequence"}
if self.use_past:
self.fill_with_past_key_values_(common_inputs, direction="inputs")
return common_inputs
@property
def default_onnx_opset(self) -> int:
return 13
@property
def atol_for_validation(self) -> float:
return 5e-4
.\models\mt5\modeling_flax_mt5.py
""" Flax mT5 model."""
import jax.numpy as jnp
from ...utils import logging
from ..t5.modeling_flax_t5 import FlaxT5EncoderModel, FlaxT5ForConditionalGeneration, FlaxT5Model
from .configuration_mt5 import MT5Config
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "T5Config"
def shift_tokens_right(input_ids: jnp.ndarray, pad_token_id: int, decoder_start_token_id: int) -> jnp.ndarray:
"""
Shift input ids one token to the right.
"""
shifted_input_ids = jnp.zeros_like(input_ids)
shifted_input_ids = shifted_input_ids.at[:, 1:].set(input_ids[:, :-1])
shifted_input_ids = shifted_input_ids.at[:, 0].set(decoder_start_token_id)
shifted_input_ids = jnp.where(shifted_input_ids == -100, pad_token_id, shifted_input_ids)
return shifted_input_ids
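A small worked example of `shift_tokens_right`: the decoder input starts with `decoder_start_token_id`, drops the last token, and any `-100` label-padding value would be replaced by `pad_token_id` (the values below are illustrative).

```
import jax.numpy as jnp

input_ids = jnp.array([[5, 6, 7, 8]])
print(shift_tokens_right(input_ids, pad_token_id=0, decoder_start_token_id=0))
# [[0 5 6 7]]
```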
class FlaxMT5Model(FlaxT5Model):
r"""
This class overrides [`FlaxT5Model`]. Please check the superclass for the appropriate documentation alongside usage
examples.
Examples:
```
>>> from transformers import FlaxMT5Model, AutoTokenizer
>>> model = FlaxMT5Model.from_pretrained("google/mt5-small")
>>> tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")
>>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
>>> summary = "Weiter Verhandlung in Syrien."
>>> inputs = tokenizer(article, return_tensors="np")
>>> decoder_input_ids = tokenizer(text_target=summary, return_tensors="np").input_ids
>>> outputs = model(input_ids=inputs["input_ids"], decoder_input_ids=decoder_input_ids)
>>> hidden_states = outputs.last_hidden_state
```"""
model_type = "mt5"
config_class = MT5Config
class FlaxMT5EncoderModel(FlaxT5EncoderModel):
r"""
This class overrides [`FlaxT5EncoderModel`]. Please check the superclass for the appropriate documentation
alongside usage examples.
Examples:
```
>>> from transformers import FlaxT5EncoderModel, AutoTokenizer
>>> model = FlaxT5EncoderModel.from_pretrained("google/mt5-small")
>>> tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")
>>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
>>> summary = "Weiter Verhandlung in Syrien."
    ```"""

    # Model type identifier: "mt5"
    model_type = "mt5"
    # Configuration class associated with this model
    config_class = MT5Config
# FlaxMT5ForConditionalGeneration for conditional generation, inheriting from FlaxT5ForConditionalGeneration.
# Please check the superclass for the appropriate documentation and usage examples.
class FlaxMT5ForConditionalGeneration(FlaxT5ForConditionalGeneration):
    # Model type identifier: "mt5"
    model_type = "mt5"
    # Configuration class: MT5Config
    config_class = MT5Config
.\models\mt5\modeling_mt5.py
""" PyTorch mT5 model."""
import copy
import math
import os
import warnings
from typing import List, Optional, Tuple, Union
import torch
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN
from ...modeling_outputs import (
BaseModelOutput,
BaseModelOutputWithPastAndCrossAttentions,
Seq2SeqLMOutput,
Seq2SeqModelOutput,
Seq2SeqQuestionAnsweringModelOutput,
Seq2SeqSequenceClassifierOutput,
TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import (
DUMMY_INPUTS,
DUMMY_MASK,
add_start_docstrings,
add_start_docstrings_to_model_forward,
is_torch_fx_proxy,
logging,
replace_return_docstrings,
)
from ...utils.model_parallel_utils import assert_device_map, get_device_map
from .configuration_mt5 import MT5Config
logger = logging.get_logger(__name__)
MT5_PRETRAINED_MODEL_ARCHIVE_LIST = [
"google/mt5-small",
"google/mt5-base",
"google/mt5-large",
"google/mt5-xl",
"google/mt5-xxl",
]
PARALLELIZE_DOCSTRING = r"""
This is an experimental feature and is a subject to change at a moment's notice.
Uses a device map to distribute attention modules of the model across several devices. If no device map is given,
it will evenly distribute blocks across all devices.
"""
Args:
device_map (`Dict[int, list]`, optional, defaults to None):
A dictionary that maps attention modules to devices. Note that the embedding module and LMHead are always
automatically mapped to the first device (for esoteric reasons). That means that the first device should
have fewer attention modules mapped to it than other devices. For reference, the mt5 models have the
following number of attention modules:
- mt5-small: 6
- mt5-base: 12
- mt5-large: 24
- mt5-xl: 24
- mt5-xxl: 24
Example:
```
model = MT5ForConditionalGeneration.from_pretrained("mt5-xl")
创建一个 MT5 模型实例,使用预训练的 "mt5-xl" 模型
device_map = {
0: [0, 1, 2],
将 attention 模块映射到四个 GPU 设备上的示例映射表
1: [3, 4, 5, 6, 7, 8, 9],
2: [10, 11, 12, 13, 14, 15, 16],
3: [17, 18, 19, 20, 21, 22, 23],
}
使用给定的设备映射表将模型并行化处理
model.parallelize(device_map)
```
"""
DEPARALLELIZE_DOCSTRING = r"""
Moves the model to cpu from a model parallel state.
Example:
```
model = MT5ForConditionalGeneration.from_pretrained("Mt5-xl")
device_map = {
0: [0, 1, 2],
1: [3, 4, 5, 6, 7, 8, 9],
2: [10, 11, 12, 13, 14, 15, 16],
3: [17, 18, 19, 20, 21, 22, 23],
}
model.parallelize(device_map)
model.deparallelize()
```
"""
# Copied from transformers.models.t5.modeling_t5.T5LayerNorm with T5->MT5
class MT5LayerNorm(nn.Module):
def __init__(self, hidden_size, eps=1e-6):
"""
Construct a layernorm module in the MT5 style. No bias and no subtraction of mean.
"""
super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))  # initialize the weight parameter to all ones
        self.variance_epsilon = eps  # epsilon added to the variance for numerical stability
def forward(self, hidden_states):
# MT5 uses a layer_norm which only scales and doesn't shift, which is also known as Root Mean
        # Square Layer Normalization https://arxiv.org/abs/1910.07467 thus variance is calculated
# w/o mean and there is no bias. Additionally we want to make sure that the accumulation for
# half-precision inputs is done in fp32
        variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)  # compute the variance of the input tensor
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)  # normalize using the variance (RMS-style layer norm)
# convert into half-precision if necessary
if self.weight.dtype in [torch.float16, torch.bfloat16]:
            hidden_states = hidden_states.to(self.weight.dtype)  # if the weight is in half precision, cast the hidden states to match
        return self.weight * hidden_states  # return the hidden states scaled by the learned weight
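Because the weight is initialized to ones, a freshly constructed `MT5LayerNorm` is exactly the RMS normalization described in the comments above. A quick check with illustrative sizes:

```
import torch

layer_norm = MT5LayerNorm(hidden_size=8)     # eps defaults to 1e-6
x = torch.randn(2, 3, 8)
manual = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + 1e-6)
print(torch.allclose(layer_norm(x), manual))  # True
```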
# Copied from transformers.models.t5.modeling_t5.T5DenseActDense with T5->MT5
class MT5DenseActDense(nn.Module):
def __init__(self, config: MT5Config):
super().__init__()
        self.wi = nn.Linear(config.d_model, config.d_ff, bias=False)  # first linear projection, without bias
        self.wo = nn.Linear(config.d_ff, config.d_model, bias=False)  # second linear projection, without bias
        self.dropout = nn.Dropout(config.dropout_rate)  # dropout layer with the configured rate
        self.act = ACT2FN[config.dense_act_fn]  # activation function taken from the configuration
def forward(self, hidden_states):
        hidden_states = self.wi(hidden_states)  # first linear projection
        hidden_states = self.act(hidden_states)  # apply the activation function
        hidden_states = self.dropout(hidden_states)  # apply dropout
if (
isinstance(self.wo.weight, torch.Tensor)
and hidden_states.dtype != self.wo.weight.dtype
and self.wo.weight.dtype != torch.int8
):
            hidden_states = hidden_states.to(self.wo.weight.dtype)  # match the dtype of the output projection weight
        hidden_states = self.wo(hidden_states)  # second linear projection
return hidden_states
# Copied from transformers.models.t5.modeling_t5.T5DenseGatedActDense with T5->MT5
class MT5DenseGatedActDense(nn.Module):
    # Initializer, taking an MT5Config object as argument
    def __init__(self, config: MT5Config):
        # Call the parent class initializer
        super().__init__()
        # Linear layer from config.d_model to config.d_ff, without bias
        self.wi_0 = nn.Linear(config.d_model, config.d_ff, bias=False)
        # Linear layer from config.d_model to config.d_ff, without bias
        self.wi_1 = nn.Linear(config.d_model, config.d_ff, bias=False)
        # Linear layer from config.d_ff back to config.d_model, without bias
        self.wo = nn.Linear(config.d_ff, config.d_model, bias=False)
        # Dropout layer with the configured dropout rate
        self.dropout = nn.Dropout(config.dropout_rate)
        # Pick the activation function named in the configuration
        self.act = ACT2FN[config.dense_act_fn]
    # Forward pass, taking hidden_states as input
    def forward(self, hidden_states):
        # Pass hidden_states through wi_0 and the activation to get the gating branch
        hidden_gelu = self.act(self.wi_0(hidden_states))
        # Pass hidden_states through wi_1 to get the linear branch
        hidden_linear = self.wi_1(hidden_states)
        # Multiply the two branches element-wise
        hidden_states = hidden_gelu * hidden_linear
        # Apply dropout
        hidden_states = self.dropout(hidden_states)
        # To make 8-bit quantization work for google/flan-t5-xxl, self.wo is kept in float32.
        # See https://github.com/huggingface/transformers/issues/20287
        # Also make sure the weight is not `int8`, in case users force `_keep_in_fp32_modules` to `None`
        if (
            isinstance(self.wo.weight, torch.Tensor)
            and hidden_states.dtype != self.wo.weight.dtype
            and self.wo.weight.dtype != torch.int8
        ):
            # Cast hidden_states to the dtype of self.wo.weight
            hidden_states = hidden_states.to(self.wo.weight.dtype)
        # Project back to d_model with self.wo
        hidden_states = self.wo(hidden_states)
        # Return the final hidden_states
        return hidden_states
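A minimal shape check of the gated feed-forward block with a tiny, hypothetical configuration (dropout disabled so the call is deterministic):

```
import torch
from transformers import MT5Config

config = MT5Config(d_model=8, d_ff=16, dropout_rate=0.0)  # feed_forward_proj defaults to "gated-gelu"
ff = MT5DenseGatedActDense(config)
x = torch.randn(2, 4, 8)
print(ff(x).shape)  # torch.Size([2, 4, 8])
```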
# 从 transformers.models.t5.modeling_t5.T5LayerFF 复制并改为 T5->MT5
class MT5LayerFF(nn.Module):
# 初始化函数,接受一个 MT5Config 对象作为参数
def __init__(self, config: MT5Config):
super().__init__()
# 根据配置选择不同的 DenseReluDense 模块
if config.is_gated_act:
self.DenseReluDense = MT5DenseGatedActDense(config)
else:
self.DenseReluDense = MT5DenseActDense(config)
# 初始化 LayerNorm 模块,设定 epsilon 值
self.layer_norm = MT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
# 初始化 Dropout 模块,设定 dropout 率
self.dropout = nn.Dropout(config.dropout_rate)
# 前向传播函数,接受隐藏状态作为输入,返回更新后的隐藏状态
def forward(self, hidden_states):
# 对隐藏状态进行 LayerNorm 处理
forwarded_states = self.layer_norm(hidden_states)
# 通过 DenseReluDense 模块处理规范化后的隐藏状态
forwarded_states = self.DenseReluDense(forwarded_states)
# 使用 Dropout 处理得到的前向传播状态,并与原始隐藏状态相加
hidden_states = hidden_states + self.dropout(forwarded_states)
# 返回更新后的隐藏状态
return hidden_states
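# 补充示意(非 modeling_mt5.py 源码):MT5LayerFF 采用 Pre-LayerNorm 残差结构 ——
# 先归一化、再做前馈变换和 dropout,最后与未归一化的输入相加。这里用标准 LayerNorm
# 和一个简单的两层前馈代替 MT5 的实现,仅说明残差的组织方式,维度均为假设值。
import torch
import torch.nn as nn

norm = nn.LayerNorm(8)
ffn = nn.Sequential(nn.Linear(8, 16), nn.ReLU(), nn.Linear(16, 8))
dropout = nn.Dropout(0.1)

x = torch.randn(2, 3, 8)
out = x + dropout(ffn(norm(x)))   # 对应 MT5LayerFF.forward 中 hidden_states + dropout(forwarded_states)
print(out.shape)                  # torch.Size([2, 3, 8])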
# 从 transformers.models.t5.modeling_t5.T5Attention 复制并改为 T5->MT5
class MT5Attention(nn.Module):
# 初始化函数,接受一个 MT5Config 对象和是否包含相对注意力偏置的标志作为参数
def __init__(self, config: MT5Config, has_relative_attention_bias=False):
super().__init__()
# 是否为解码器
self.is_decoder = config.is_decoder
# 是否包含相对注意力偏置
self.has_relative_attention_bias = has_relative_attention_bias
# 相对注意力偏置的桶数
self.relative_attention_num_buckets = config.relative_attention_num_buckets
# 相对注意力的最大距离
self.relative_attention_max_distance = config.relative_attention_max_distance
# 模型的隐藏状态维度
self.d_model = config.d_model
# 键值投影维度
self.key_value_proj_dim = config.d_kv
# 注意力头的数量
self.n_heads = config.num_heads
# Dropout 率
self.dropout = config.dropout_rate
# 内部维度,即头数乘以键值投影维度
self.inner_dim = self.n_heads * self.key_value_proj_dim
# 初始化查询、键、值和输出的线性变换层,无偏置
self.q = nn.Linear(self.d_model, self.inner_dim, bias=False)
self.k = nn.Linear(self.d_model, self.inner_dim, bias=False)
self.v = nn.Linear(self.d_model, self.inner_dim, bias=False)
self.o = nn.Linear(self.inner_dim, self.d_model, bias=False)
# 如果有相对注意力偏置,初始化相对注意力偏置的嵌入层
if self.has_relative_attention_bias:
self.relative_attention_bias = nn.Embedding(self.relative_attention_num_buckets, self.n_heads)
# 初始化被剪枝的注意力头集合和梯度检查点标志
self.pruned_heads = set()
self.gradient_checkpointing = False
# 剪枝指定的注意力头
def prune_heads(self, heads):
if len(heads) == 0:
return
# 找到可剪枝的注意力头和对应索引
heads, index = find_pruneable_heads_and_indices(
heads, self.n_heads, self.key_value_proj_dim, self.pruned_heads
)
# 剪枝线性层
self.q = prune_linear_layer(self.q, index)
self.k = prune_linear_layer(self.k, index)
self.v = prune_linear_layer(self.v, index)
self.o = prune_linear_layer(self.o, index, dim=1)
# 更新超参数
self.n_heads = self.n_heads - len(heads)
self.inner_dim = self.key_value_proj_dim * self.n_heads
self.pruned_heads = self.pruned_heads.union(heads)
@staticmethod
def _relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128):
"""
Adapted from Mesh Tensorflow:
https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py
Translate relative position to a bucket number for relative attention. The relative position is defined as
memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for
small absolute relative_position and larger buckets for larger absolute relative_positions. All relative
positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket.
This should allow for more graceful generalization to longer sequences than the model has been trained on
Args:
relative_position: an int32 Tensor - 相对位置,表示从当前位置到关注位置的距离
bidirectional: a boolean - 是否为双向注意力
num_buckets: an integer - 桶的数量,用于将相对位置映射到桶编号
max_distance: an integer - 最大距离,超过此距离的相对位置映射到同一个桶
Returns:
a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
返回一个与 relative_position 形状相同的张量,包含范围在 [0, num_buckets) 内的整数值
"""
relative_buckets = 0 # 初始化相对位置桶号为0
# 如果是双向注意力,则将桶数减半,并根据 relative_position 的正负分别计算桶号偏移
if bidirectional:
num_buckets //= 2
relative_buckets += (relative_position > 0).to(torch.long) * num_buckets
relative_position = torch.abs(relative_position)
else:
# 如果是单向注意力,将 relative_position 转换为非负的数值
relative_position = -torch.min(relative_position, torch.zeros_like(relative_position))
# 现在 relative_position 范围在 [0, inf)
# 小于 max_exact 的相对位置使用线性增量的桶
max_exact = num_buckets // 2
is_small = relative_position < max_exact
# 大于 max_exact 的相对位置使用对数增量的桶,映射到 [max_exact, num_buckets-1] 范围内
relative_position_if_large = max_exact + (
torch.log(relative_position.float() / max_exact)
/ math.log(max_distance / max_exact)
* (num_buckets - max_exact)
).to(torch.long)
relative_position_if_large = torch.min(
relative_position_if_large, torch.full_like(relative_position_if_large, num_buckets - 1)
)
# 根据相对位置大小选择合适的桶号
relative_buckets += torch.where(is_small, relative_position, relative_position_if_large)
return relative_buckets # 返回计算得到的相对位置桶号张量
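# 补充示意(非 modeling_mt5.py 源码):直接对一小段相对位置调用该静态方法,
# 观察"小距离线性分桶、大距离对数分桶、正负方向各占一半桶"的效果;假设 MT5Attention 已在当前作用域定义。
import torch

relative_position = torch.arange(-10, 11).unsqueeze(0)   # 形状 (1, 21) 的相对位置样例
buckets = MT5Attention._relative_position_bucket(
    relative_position, bidirectional=True, num_buckets=32, max_distance=128
)
print(buckets)   # 正负方向映射到不同的桶区间,绝对距离越大桶编号增长越慢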
def compute_bias(self, query_length, key_length, device=None):
"""Compute binned relative position bias"""
# 如果未指定设备,则使用相对注意力偏置权重张量的设备
if device is None:
device = self.relative_attention_bias.weight.device
# 创建表示上下文位置的张量,范围为[0, query_length-1]
context_position = torch.arange(query_length, dtype=torch.long, device=device)[:, None]
# 创建表示记忆位置的张量,范围为[0, key_length-1]
memory_position = torch.arange(key_length, dtype=torch.long, device=device)[None, :]
# 计算相对位置偏差,形状为(query_length, key_length)
relative_position = memory_position - context_position
# 将相对位置映射到桶中,返回形状为(query_length, key_length)的桶索引张量
relative_position_bucket = self._relative_position_bucket(
relative_position,
bidirectional=(not self.is_decoder),
num_buckets=self.relative_attention_num_buckets,
max_distance=self.relative_attention_max_distance,
)
# 使用相对位置桶索引获取相对注意力偏置值,形状为(query_length, key_length, num_heads)
values = self.relative_attention_bias(relative_position_bucket)
# 调整张量维度顺序以匹配Transformer的注意力头结构,形状为(1, num_heads, query_length, key_length)
values = values.permute([2, 0, 1]).unsqueeze(0)
# 返回相对位置注意力偏置张量
return values
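# 补充示意(非 modeling_mt5.py 源码):compute_bias 的输出形状为 (1, num_heads, query_length, key_length),
# 在注意力打分后与 (batch, num_heads, query_length, key_length) 的得分广播相加,再做 softmax;形状均为假设值。
import torch

batch, n_heads, q_len, k_len = 2, 4, 5, 5
scores = torch.randn(batch, n_heads, q_len, k_len)      # 示意:Q @ K^T 得到的注意力得分
position_bias = torch.randn(1, n_heads, q_len, k_len)   # 示意:compute_bias 输出的形状
attn_weights = torch.softmax(scores + position_bias, dim=-1)   # 偏置沿 batch 维广播后相加
print(attn_weights.shape)   # torch.Size([2, 4, 5, 5])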
def forward(
self,
hidden_states,
mask=None,
key_value_states=None,
position_bias=None,
past_key_value=None,
layer_head_mask=None,
query_length=None,
use_cache=False,
output_attentions=False,
# Copied from transformers.models.t5.modeling_t5.T5LayerSelfAttention with T5->MT5
class MT5LayerSelfAttention(nn.Module):
def __init__(self, config, has_relative_attention_bias=False):
super().__init__()
# 初始化自注意力层对象,使用MT5Attention进行自注意力计算
self.SelfAttention = MT5Attention(config, has_relative_attention_bias=has_relative_attention_bias)
# 初始化层归一化对象,用于规范化隐藏状态
self.layer_norm = MT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
# 初始化Dropout层,用于随机失活以防止过拟合
self.dropout = nn.Dropout(config.dropout_rate)
def forward(
self,
hidden_states,
attention_mask=None,
position_bias=None,
layer_head_mask=None,
past_key_value=None,
use_cache=False,
output_attentions=False,
):
# 对输入的隐藏状态进行层归一化处理
normed_hidden_states = self.layer_norm(hidden_states)
# 使用SelfAttention对象计算自注意力,得到注意力输出
attention_output = self.SelfAttention(
normed_hidden_states,
mask=attention_mask,
position_bias=position_bias,
layer_head_mask=layer_head_mask,
past_key_value=past_key_value,
use_cache=use_cache,
output_attentions=output_attentions,
)
# 将原始隐藏状态与注意力输出相加,并且应用Dropout
hidden_states = hidden_states + self.dropout(attention_output[0])
# 准备输出,如果需要返回注意力权重,则包含在输出中
outputs = (hidden_states,) + attention_output[1:] # add attentions if we output them
return outputs
# Copied from transformers.models.t5.modeling_t5.T5LayerCrossAttention with T5->MT5
class MT5LayerCrossAttention(nn.Module):
def __init__(self, config):
super().__init__()
# 初始化跨注意力层对象,使用MT5Attention进行编码-解码注意力计算
self.EncDecAttention = MT5Attention(config, has_relative_attention_bias=False)
# 初始化层归一化对象,用于规范化隐藏状态
self.layer_norm = MT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
# 初始化Dropout层,用于随机失活以防止过拟合
self.dropout = nn.Dropout(config.dropout_rate)
def forward(
self,
hidden_states,
key_value_states,
attention_mask=None,
position_bias=None,
layer_head_mask=None,
past_key_value=None,
use_cache=False,
query_length=None,
output_attentions=False,
):
# 对输入的隐藏状态进行层归一化处理
normed_hidden_states = self.layer_norm(hidden_states)
# 使用EncDecAttention对象计算编码-解码注意力,得到注意力输出
attention_output = self.EncDecAttention(
normed_hidden_states,
mask=attention_mask,
key_value_states=key_value_states,
position_bias=position_bias,
layer_head_mask=layer_head_mask,
past_key_value=past_key_value,
use_cache=use_cache,
query_length=query_length,
output_attentions=output_attentions,
)
# 将原始隐藏状态与注意力输出相加,并且应用Dropout
layer_output = hidden_states + self.dropout(attention_output[0])
# 准备输出,如果需要返回注意力权重,则包含在输出中
outputs = (layer_output,) + attention_output[1:] # add attentions if we output them
return outputs
# Copied from transformers.models.t5.modeling_t5.T5Block with T5->MT5
class MT5Block(nn.Module):
# 初始化方法,用于创建一个 MT5Block 的实例
def __init__(self, config, has_relative_attention_bias=False):
# 调用父类的初始化方法
super().__init__()
# 根据配置设置是否为解码器
self.is_decoder = config.is_decoder
# 创建一个空的模块列表用于存储层的组件
self.layer = nn.ModuleList()
# 向模块列表中添加自注意力层,并传入配置和是否有相对注意力偏置的参数
self.layer.append(MT5LayerSelfAttention(config, has_relative_attention_bias=has_relative_attention_bias))
# 如果是解码器,再向模块列表中添加跨注意力层
if self.is_decoder:
self.layer.append(MT5LayerCrossAttention(config))
# 向模块列表中添加前馈神经网络层
self.layer.append(MT5LayerFF(config))
# 前向传播方法,用于计算模型的输出
def forward(
self,
hidden_states,
attention_mask=None,
position_bias=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
encoder_decoder_position_bias=None,
layer_head_mask=None,
cross_attn_layer_head_mask=None,
past_key_value=None,
use_cache=False,
output_attentions=False,
return_dict=True,
# 以下片段属于将 TensorFlow 检查点权重加载进 PyTorch 模型的辅助逻辑:首先导入必要的模块和库
try:
import re
import numpy as np
import tensorflow as tf
except ImportError:
# 如果导入失败,记录错误信息并抛出异常
logger.error(
"Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
"https://www.tensorflow.org/install/ for installation instructions."
)
raise
# 获取 TensorFlow checkpoint 文件的绝对路径
tf_path = os.path.abspath(tf_checkpoint_path)
# 打印日志,显示正在转换的 TensorFlow checkpoint 的路径
logger.info(f"Converting TensorFlow checkpoint from {tf_path}")
# 从 TensorFlow 模型中加载权重
init_vars = tf.train.list_variables(tf_path)
names = []
tf_weights = {}
# 遍历初始化变量列表,加载每个权重并存储到字典中
for name, shape in init_vars:
logger.info(f"Loading TF weight {name} with shape {shape}")
array = tf.train.load_variable(tf_path, name)
names.append(name)
tf_weights[name] = array
# 打印日志,显示未复制到 PyTorch 模型的权重名称
logger.info(f"Weights not copied to PyTorch model: {', '.join(tf_weights.keys())}.")
# 返回加载权重后的 PyTorch 模型
return model
# 定义一个方法 `_shift_right`,接受一个输入的张量 `input_ids`
def _shift_right(self, input_ids):
# 从配置中获取解码器起始标记的 ID
decoder_start_token_id = self.config.decoder_start_token_id
# 从配置中获取填充标记的 ID
pad_token_id = self.config.pad_token_id
# 如果解码器起始标记的 ID 未定义,则抛出数值错误
if decoder_start_token_id is None:
raise ValueError(
"self.model.config.decoder_start_token_id has to be defined. In MT5 it is usually set to the pad_token_id. "
"See MT5 docs for more information."
)
# 将输入向右移动一位
if is_torch_fx_proxy(input_ids):
# 对于 Torch FX 代理,不支持原生的项目赋值
# 创建一个填充解码器起始标记 ID 的张量,并拼接在去掉最后一个 token 的输入张量之前
shifted_input_ids = torch.full(input_ids.shape[:-1] + (1,), decoder_start_token_id)
shifted_input_ids = torch.cat([shifted_input_ids, input_ids[..., :-1]], dim=-1)
else:
# 使用 `new_zeros` 创建与输入张量相同形状的零张量
shifted_input_ids = input_ids.new_zeros(input_ids.shape)
# 将输入张量向右移动一位,并将解码器起始标记 ID 放在开头
shifted_input_ids[..., 1:] = input_ids[..., :-1].clone()
shifted_input_ids[..., 0] = decoder_start_token_id
# 如果填充标记 ID 未定义,则抛出数值错误
if pad_token_id is None:
raise ValueError("self.model.config.pad_token_id has to be defined.")
# 将标签中可能存在的 -100 值替换为 `pad_token_id`
shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)
# 返回向右移动后的输入张量
return shifted_input_ids
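# 补充示意(非 modeling_mt5.py 源码):用一个很小的标签张量演示 _shift_right 的效果 ——
# 在开头放入 decoder_start_token_id,丢弃最后一个 token,并把 -100 替换为 pad_token_id(MT5 中二者通常都是 0)。
import torch

labels = torch.tensor([[13, -100, 42, 6]])
decoder_start_token_id, pad_token_id = 0, 0

shifted = labels.new_zeros(labels.shape)
shifted[..., 1:] = labels[..., :-1].clone()
shifted[..., 0] = decoder_start_token_id
shifted.masked_fill_(shifted == -100, pad_token_id)
print(shifted)   # tensor([[ 0, 13,  0, 42]])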
# Copied from transformers.models.t5.modeling_t5.T5Stack with T5->MT5
class MT5Stack(MT5PreTrainedModel):
def __init__(self, config, embed_tokens=None):
super().__init__(config)
# 初始化 MT5Stack 类的实例
self.embed_tokens = embed_tokens # 嵌入令牌,用于输入的嵌入表示
self.is_decoder = config.is_decoder # 是否为解码器模式
# 创建由多个 MT5Block 组成的模块列表,仅第一个块带有相对注意力偏置
self.block = nn.ModuleList(
[MT5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)]
)
self.final_layer_norm = MT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) # 最终的层归一化
self.dropout = nn.Dropout(config.dropout_rate) # 随机失活率
# 初始化权重并应用最终处理
self.post_init()
# 模型并行化相关设置
self.model_parallel = False # 模型是否并行化
self.device_map = None # 设备映射表
self.gradient_checkpointing = False # 梯度检查点
@add_start_docstrings(PARALLELIZE_DOCSTRING)
def parallelize(self, device_map=None):
warnings.warn(
"`MT5Stack.parallelize` is deprecated and will be removed in v5 of Transformers, you should load your model"
" with `device_map='balanced'` in the call to `from_pretrained`. You can also provide your own"
" `device_map` but it needs to be a dictionary module_name to device, so for instance {'block.0': 0,"
" 'block.1': 1, ...}",
FutureWarning,
)
# 检查设备映射的有效性
self.device_map = (
get_device_map(len(self.block), range(torch.cuda.device_count())) if device_map is None else device_map
)
assert_device_map(self.device_map, len(self.block)) # 断言设备映射合法性
self.model_parallel = True # 开启模型并行化
self.first_device = "cpu" if "cpu" in self.device_map.keys() else "cuda:" + str(min(self.device_map.keys()))
self.last_device = "cuda:" + str(max(self.device_map.keys()))
# 将每个块加载到指定设备
for k, v in self.device_map.items():
for layer in v:
cuda_device = "cuda:" + str(k)
self.block[layer] = self.block[layer].to(cuda_device)
# 将嵌入令牌加载到第一个设备
self.embed_tokens = self.embed_tokens.to(self.first_device)
# 将最终层归一化加载到最后一个设备
self.final_layer_norm = self.final_layer_norm.to(self.last_device)
@add_start_docstrings(DEPARALLELIZE_DOCSTRING)
def deparallelize(self):
warnings.warn(
"Like `parallelize`, `deparallelize` is deprecated and will be removed in v5 of Transformers.",
FutureWarning,
)
self.model_parallel = False # 关闭模型并行化
self.device_map = None # 清空设备映射表
self.first_device = "cpu" # 第一个设备设置为 CPU
self.last_device = "cpu" # 最后一个设备设置为 CPU
# 将每个块加载到 CPU
for i in range(len(self.block)):
self.block[i] = self.block[i].to("cpu")
self.embed_tokens = self.embed_tokens.to("cpu") # 将嵌入令牌加载到 CPU
self.final_layer_norm = self.final_layer_norm.to("cpu") # 将最终层归一化加载到 CPU
torch.cuda.empty_cache() # 清空 CUDA 缓存
def get_input_embeddings(self):
return self.embed_tokens # 返回嵌入令牌
# 设置模型输入的嵌入向量
def set_input_embeddings(self, new_embeddings):
self.embed_tokens = new_embeddings
# 定义模型的前向传播函数,接收多个参数用于推理或训练
def forward(
self,
input_ids=None, # 输入的token IDs
attention_mask=None, # 注意力掩码,指示模型在计算注意力时忽略某些token
encoder_hidden_states=None, # 编码器的隐藏状态,用于注意力机制
encoder_attention_mask=None, # 编码器的注意力掩码,指示编码器在计算注意力时忽略某些token
inputs_embeds=None, # 替代input_ids的嵌入向量输入
head_mask=None, # 头部掩码,用于遮蔽某些注意力头部的输出
cross_attn_head_mask=None, # 用于跨注意力的头部掩码
past_key_values=None, # 用于存储过去的键值对,以便支持自回归生成
use_cache=None, # 控制是否使用缓存
output_attentions=None, # 是否输出注意力权重
output_hidden_states=None, # 是否输出所有隐藏状态
return_dict=None, # 是否以字典形式返回输出
# MT5_START_DOCSTRING 是一个长字符串,用来描述 MT5 模型的相关信息和特性,包括其论文引用、模型结构等详细信息。
MT5_START_DOCSTRING = r"""
The MT5 model was proposed in [Exploring the Limits of Transfer Learning with a Unified Text-to-Text
Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan
Narang, Michael Matena, Yanqi Zhou, Wei Li, Peter J. Liu. It's an encoder decoder transformer pre-trained in a
text-to-text denoising generative setting.
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`MT5Config`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
# MT5_INPUTS_DOCSTRING 用于描述 MT5 模型前向传播的输入参数,此处为空字符串。
MT5_INPUTS_DOCSTRING = r"""
"""
# MT5_ENCODER_INPUTS_DOCSTRING 用于描述 MT5 编码器模型前向传播的输入参数。
MT5_ENCODER_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
# 输入序列标记在词汇表中的索引。MT5 模型具有相对位置嵌入,因此可以在右侧和左侧都进行填充。
# 可以使用 [`AutoTokenizer`] 获取索引。详见 [`PreTrainedTokenizer.encode`] 和 [`PreTrainedTokenizer.__call__`]。
# 想要了解如何为预训练准备 `input_ids`,请参考 [MT5 Training](./mt5#training)。
attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
# 遮盖掩码,避免在填充标记索引上执行注意力操作。遮盖值在 `[0, 1]` 中选择:
# - 1 表示**未遮盖**的标记,
# - 0 表示**遮盖**的标记。
# [什么是注意力遮盖?](../glossary#attention-mask)
head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
# 遮头掩码,用于使自注意力模块的特定头部失效。遮盖值在 `[0, 1]` 中选择:
# - 1 表示头部**未遮盖**,
# - 0 表示头部**遮盖**。
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
# 可选地,您可以直接传递嵌入表示,而不是传递 `input_ids`。如果您希望更多控制如何将 `input_ids` 索引转换为关联向量,
# 则这很有用,而不是使用模型的内部嵌入查找矩阵。
output_attentions (`bool`, *optional*):
# 是否返回所有注意力层的注意力张量。有关详细信息,请参见返回张量下的 `attentions`。
output_hidden_states (`bool`, *optional*):
# 是否返回所有层的隐藏状态。有关详细信息,请参见返回张量下的 `hidden_states`。
return_dict (`bool`, *optional*):
# 是否返回 [`~utils.ModelOutput`] 而不是普通元组。
# FutureWarning 使用的警告消息:head_mask 参数已拆分为 head_mask 和 decoder_head_mask 两个参数
__HEAD_MASK_WARNING_MSG = """
The input argument `head_mask` was split into two arguments `head_mask` and `decoder_head_mask`. Currently,
`decoder_head_mask` is set to copy `head_mask`, but this feature is deprecated and will be removed in future versions.
If you do not want to use any `decoder_head_mask` now, please set `decoder_head_mask = torch.ones(num_layers,
num_heads)`.
"""
# 定义 MT5Model 类,用于输出没有特定输出头的原始隐藏状态
@add_start_docstrings(
"The bare MT5 Model transformer outputting raw hidden-states without any specific head on top.",
MT5_START_DOCSTRING,
)
class MT5Model(MT5PreTrainedModel):
r"""
Examples:
```
>>> from transformers import MT5Model, AutoTokenizer
>>> model = MT5Model.from_pretrained("google/mt5-small")
>>> tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")
>>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
>>> summary = "Weiter Verhandlung in Syrien."
>>> inputs = tokenizer(article, return_tensors="pt")
>>> labels = tokenizer(text_target=summary, return_tensors="pt")
>>> outputs = model(input_ids=inputs["input_ids"], decoder_input_ids=labels["input_ids"])
>>> hidden_states = outputs.last_hidden_state
```
"""
# 模型类型为 "mt5"
model_type = "mt5"
# 配置类为 MT5Config
config_class = MT5Config
# 在加载时忽略的意外键列表
_keys_to_ignore_on_load_unexpected = ["decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight"]
# 共享权重键的列表
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
# 从 transformers.models.t5.modeling_t5.T5Model.__init__ 复制并修改为 MT5Model
def __init__(self, config: MT5Config):
super().__init__(config)
# 创建一个共享的嵌入层
self.shared = nn.Embedding(config.vocab_size, config.d_model)
# 复制并修改编码器配置
encoder_config = copy.deepcopy(config)
encoder_config.is_decoder = False
encoder_config.use_cache = False
encoder_config.is_encoder_decoder = False
# 创建编码器实例
self.encoder = MT5Stack(encoder_config, self.shared)
# 复制并修改解码器配置
decoder_config = copy.deepcopy(config)
decoder_config.is_decoder = True
decoder_config.is_encoder_decoder = False
decoder_config.num_layers = config.num_decoder_layers
# 创建解码器实例
self.decoder = MT5Stack(decoder_config, self.shared)
# 初始化权重并应用最终处理
self.post_init()
# 模型并行设置
self.model_parallel = False
self.device_map = None
@add_start_docstrings(PARALLELIZE_DOCSTRING)
# 从 transformers.models.t5.modeling_t5.T5Model.parallelize 复制
@add_start_docstrings(DEPARALLELIZE_DOCSTRING)
# 使用给定的 DEPARALLELIZE_DOCSTRING 添加文档字符串,这是从 transformers.models.t5.modeling_t5.T5Model.deparallelize 复制过来的
def deparallelize(self):
# 发出警告,说明此方法即将在 Transformers 的 v5 版本中删除
warnings.warn(
"Like `parallelize`, `deparallelize` is deprecated and will be removed in v5 of Transformers.",
FutureWarning,
)
# 调用编码器的 deparallelize 方法
self.encoder.deparallelize()
# 调用解码器的 deparallelize 方法
self.decoder.deparallelize()
# 将编码器移动到 CPU
self.encoder = self.encoder.to("cpu")
# 将解码器移动到 CPU
self.decoder = self.decoder.to("cpu")
# 将 model_parallel 标志设置为 False
self.model_parallel = False
# 将 device_map 设置为 None
self.device_map = None
# 清空 CUDA 缓存
torch.cuda.empty_cache()
# Copied from transformers.models.t5.modeling_t5.T5Model.get_input_embeddings
# 从 transformers.models.t5.modeling_t5.T5Model.get_input_embeddings 复制而来
def get_input_embeddings(self):
# 返回共享的输入嵌入层
return self.shared
# Copied from transformers.models.t5.modeling_t5.T5Model.set_input_embeddings
# 从 transformers.models.t5.modeling_t5.T5Model.set_input_embeddings 复制而来
def set_input_embeddings(self, new_embeddings):
# 设置共享的输入嵌入层为新的嵌入
self.shared = new_embeddings
# 调用编码器的 set_input_embeddings 方法设置新的嵌入
self.encoder.set_input_embeddings(new_embeddings)
# 调用解码器的 set_input_embeddings 方法设置新的嵌入
self.decoder.set_input_embeddings(new_embeddings)
# Copied from transformers.models.t5.modeling_t5.T5Model.get_encoder
# 从 transformers.models.t5.modeling_t5.T5Model.get_encoder 复制而来
def get_encoder(self):
# 返回编码器
return self.encoder
# Copied from transformers.models.t5.modeling_t5.T5Model.get_decoder
# 从 transformers.models.t5.modeling_t5.T5Model.get_decoder 复制而来
def get_decoder(self):
# 返回解码器
return self.decoder
# Copied from transformers.models.t5.modeling_t5.T5Model._prune_heads
# 从 transformers.models.t5.modeling_t5.T5Model._prune_heads 复制而来
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
# 遍历需要修剪的层和头部的字典
for layer, heads in heads_to_prune.items():
# 在编码器的特定层的注意力头部上执行修剪操作
self.encoder.layer[layer].attention.prune_heads(heads)
@add_start_docstrings_to_model_forward(MT5_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=Seq2SeqModelOutput, config_class=_CONFIG_FOR_DOC)
# 从 transformers.models.t5.modeling_t5.T5Model.forward 复制过来,但将 T5->MT5, t5->mt5
# 添加开始的文档字符串和替换返回文档字符串的注解
# 定义一个方法 `forward`,用于模型的前向传播
def forward(
self,
# 输入序列的标识符,可以是一个长整型张量,可选参数
input_ids: Optional[torch.LongTensor] = None,
# 注意力掩码,可以是一个浮点数张量,可选参数
attention_mask: Optional[torch.FloatTensor] = None,
# 解码器的输入序列的标识符,可以是一个长整型张量,可选参数
decoder_input_ids: Optional[torch.LongTensor] = None,
# 解码器的注意力掩码,可以是一个布尔张量,可选参数
decoder_attention_mask: Optional[torch.BoolTensor] = None,
# 头部掩码,可以是一个浮点数张量,可选参数
head_mask: Optional[torch.FloatTensor] = None,
# 解码器的头部掩码,可以是一个浮点数张量,可选参数
decoder_head_mask: Optional[torch.FloatTensor] = None,
# 跨注意力头部掩码,可以是一个张量,可选参数
cross_attn_head_mask: Optional[torch.Tensor] = None,
# 编码器的输出,可以是一系列浮点数张量的元组,可选参数
encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
# 过去键值对,可以是一系列浮点数张量的元组,可选参数
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
# 输入嵌入,可以是一个张量,可选参数
inputs_embeds: Optional[torch.Tensor] = None,
# 解码器的输入嵌入,可以是一个张量,可选参数
decoder_inputs_embeds: Optional[torch.Tensor] = None,
# 是否使用缓存,布尔值,可选参数
use_cache: Optional[bool] = None,
# 是否输出注意力,布尔值,可选参数
output_attentions: Optional[bool] = None,
# 是否输出隐藏状态,布尔值,可选参数
output_hidden_states: Optional[bool] = None,
# 是否返回字典,布尔值,可选参数
return_dict: Optional[bool] = None,
# 使用装饰器为类添加文档字符串,描述其作为基于 MT5 模型的带有语言建模头部的条件生成模型的特性
@add_start_docstrings("""MT5 Model with a `language modeling` head on top.""", MT5_START_DOCSTRING)
class MT5ForConditionalGeneration(MT5PreTrainedModel):
r"""
Examples:
```
>>> from transformers import MT5ForConditionalGeneration, AutoTokenizer
>>> model = MT5ForConditionalGeneration.from_pretrained("google/mt5-small")
>>> tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")
>>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
>>> summary = "Weiter Verhandlung in Syrien."
>>> inputs = tokenizer(article, text_target=summary, return_tensors="pt")
>>> outputs = model(**inputs)
>>> loss = outputs.loss
```"""
# 模型类型设定为 "mt5"
model_type = "mt5"
# 配置类设定为 MT5Config
config_class = MT5Config
# 加载时忽略的键列表,用于处理未预期的键
_keys_to_ignore_on_load_unexpected = ["decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight"]
# 共享权重的键列表
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"]
# 从 transformers.models.t5.modeling_t5.T5ForConditionalGeneration.__init__ 复制并替换 T5 为 MT5
def __init__(self, config: MT5Config):
super().__init__(config)
# 设置模型维度为 config.d_model
self.model_dim = config.d_model
# 创建共享的嵌入层,用于词汇表大小和模型维度
self.shared = nn.Embedding(config.vocab_size, config.d_model)
# 复制编码器配置,将其设定为非解码器
encoder_config = copy.deepcopy(config)
encoder_config.is_decoder = False
encoder_config.use_cache = False
encoder_config.is_encoder_decoder = False
# 创建 MT5 编码器堆栈
self.encoder = MT5Stack(encoder_config, self.shared)
# 复制解码器配置,将其设定为解码器
decoder_config = copy.deepcopy(config)
decoder_config.is_decoder = True
decoder_config.is_encoder_decoder = False
decoder_config.num_layers = config.num_decoder_layers
# 创建 MT5 解码器堆栈
self.decoder = MT5Stack(decoder_config, self.shared)
# 创建线性层用于语言建模头部,输入维度为 config.d_model,输出维度为 config.vocab_size,无偏置
self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)
# 初始化权重并应用最终处理
self.post_init()
# 模型并行设定为 False
self.model_parallel = False
# 设备映射设定为 None
self.device_map = None
# 使用装饰器添加并行化文档字符串
@add_start_docstrings(PARALLELIZE_DOCSTRING)
# 从 transformers.models.t5.modeling_t5.T5ForConditionalGeneration.parallelize 复制
def parallelize(self, device_map=None):
# 发出警告,提醒 `T5ForConditionalGeneration.parallelize` 方法将在 Transformers v5 中移除
warnings.warn(
"`T5ForConditionalGeneration.parallelize` is deprecated and will be removed in v5 of Transformers, you"
" should load your model with `device_map='balanced'` in the call to `from_pretrained`. You can also"
" provide your own `device_map` but it needs to be a dictionary module_name to device, so for instance"
" {'encoder.block.0': 0, 'encoder.block.1': 1, ...}",
FutureWarning,
)
# 根据 encoder.block 的数量和当前 CUDA 设备数量生成设备映射,如果未提供 device_map 则使用生成的映射
self.device_map = (
get_device_map(len(self.encoder.block), range(torch.cuda.device_count()))
if device_map is None
else device_map
)
# 检查设备映射的有效性
assert_device_map(self.device_map, len(self.encoder.block))
# 并行化编码器
self.encoder.parallelize(self.device_map)
# 并行化解码器
self.decoder.parallelize(self.device_map)
# 将语言模型头部移动到解码器的第一个设备上
self.lm_head = self.lm_head.to(self.decoder.first_device)
# 设置模型并行化标志为 True
self.model_parallel = True
@add_start_docstrings(DEPARALLELIZE_DOCSTRING)
# 从 transformers.models.t5.modeling_t5.T5ForConditionalGeneration.deparallelize 复制而来
def deparallelize(self):
# 发出警告,提醒 `deparallelize` 方法将在 Transformers v5 中移除
warnings.warn(
"Like `parallelize`, `deparallelize` is deprecated and will be removed in v5 of Transformers.",
FutureWarning,
)
# 反并行化编码器
self.encoder.deparallelize()
# 反并行化解码器
self.decoder.deparallelize()
# 将编码器移动到 CPU
self.encoder = self.encoder.to("cpu")
# 将解码器移动到 CPU
self.decoder = self.decoder.to("cpu")
# 将语言模型头部移动到 CPU
self.lm_head = self.lm_head.to("cpu")
# 设置模型并行化标志为 False
self.model_parallel = False
# 清空 CUDA 缓存
torch.cuda.empty_cache()
# 从 transformers.models.t5.modeling_t5.T5ForConditionalGeneration.get_input_embeddings 复制而来
def get_input_embeddings(self):
# 返回共享的输入嵌入
return self.shared
# 从 transformers.models.t5.modeling_t5.T5ForConditionalGeneration.set_input_embeddings 复制而来
def set_input_embeddings(self, new_embeddings):
# 设置共享的输入嵌入
self.shared = new_embeddings
# 设置编码器的输入嵌入
self.encoder.set_input_embeddings(new_embeddings)
# 设置解码器的输入嵌入
self.decoder.set_input_embeddings(new_embeddings)
# 从 transformers.models.t5.modeling_t5.T5ForConditionalGeneration.set_output_embeddings 复制而来
def set_output_embeddings(self, new_embeddings):
# 设置语言模型头部的输出嵌入
self.lm_head = new_embeddings
# 从 transformers.models.t5.modeling_t5.T5ForConditionalGeneration.get_output_embeddings 复制而来
def get_output_embeddings(self):
# 返回语言模型头部的输出嵌入
return self.lm_head
# 从 transformers.models.t5.modeling_t5.T5ForConditionalGeneration.get_encoder 复制而来
def get_encoder(self):
# 返回编码器
return self.encoder
# 从 transformers.models.t5.modeling_t5.T5ForConditionalGeneration.get_decoder 复制而来
def get_decoder(self):
# 返回解码器
return self.decoder
@add_start_docstrings_to_model_forward(MT5_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
# 从 transformers.models.t5.modeling_t5.T5ForConditionalGeneration.forward 复制而来,定义了 MT5 模型的前向传播方法
def forward(
self,
input_ids: Optional[torch.LongTensor] = None, # 输入的 token IDs,类型为可选的长整型张量
attention_mask: Optional[torch.FloatTensor] = None, # 注意力掩码,类型为可选的浮点数张量
decoder_input_ids: Optional[torch.LongTensor] = None, # 解码器的输入 token IDs,类型为可选的长整型张量
decoder_attention_mask: Optional[torch.BoolTensor] = None, # 解码器的注意力掩码,类型为可选的布尔张量
head_mask: Optional[torch.FloatTensor] = None, # 头部掩码,类型为可选的浮点数张量
decoder_head_mask: Optional[torch.FloatTensor] = None, # 解码器头部掩码,类型为可选的浮点数张量
cross_attn_head_mask: Optional[torch.Tensor] = None, # 跨注意力头部掩码,类型为可选的张量
encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None, # 编码器的输出,类型为可选的张量元组
past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, # 过去的键值对,类型为可选的张量元组
inputs_embeds: Optional[torch.FloatTensor] = None, # 输入的嵌入,类型为可选的浮点数张量
decoder_inputs_embeds: Optional[torch.FloatTensor] = None, # 解码器输入的嵌入,类型为可选的浮点数张量
labels: Optional[torch.LongTensor] = None, # 标签,类型为可选的长整型张量
use_cache: Optional[bool] = None, # 是否使用缓存,类型为可选的布尔值
output_attentions: Optional[bool] = None, # 是否输出注意力,类型为可选的布尔值
output_hidden_states: Optional[bool] = None, # 是否输出隐藏状态,类型为可选的布尔值
return_dict: Optional[bool] = None, # 是否返回字典形式的输出,类型为可选的布尔值
):
# 从 transformers.models.t5.modeling_t5.T5ForConditionalGeneration.prepare_inputs_for_generation 复制而来,准备生成过程中的输入
def prepare_inputs_for_generation(
self,
input_ids, # 输入的 token IDs
past_key_values=None, # 过去的键值对,默认为 None
attention_mask=None, # 注意力掩码,默认为 None
head_mask=None, # 头部掩码,默认为 None
decoder_head_mask=None, # 解码器头部掩码,默认为 None
decoder_attention_mask=None, # 解码器的注意力掩码,默认为 None
cross_attn_head_mask=None, # 跨注意力头部掩码,默认为 None
use_cache=None, # 是否使用缓存,默认为 None
encoder_outputs=None, # 编码器的输出,默认为 None
**kwargs, # 其他关键字参数
):
# 如果使用了过去的键值对
if past_key_values is not None:
# 获取过去键值对的长度
past_length = past_key_values[0][0].shape[2]
# 如果输入的 token IDs 的长度大于过去键值对的长度
if input_ids.shape[1] > past_length:
remove_prefix_length = past_length # 移除前缀的长度设为过去键值对的长度
else:
# 否则,默认采用旧的行为:只保留最后一个输入 ID
remove_prefix_length = input_ids.shape[1] - 1
# 将输入的 token IDs 裁剪为移除前缀长度后的部分
input_ids = input_ids[:, remove_prefix_length:]
# 返回准备好的输入字典
return {
"decoder_input_ids": input_ids,
"past_key_values": past_key_values,
"encoder_outputs": encoder_outputs,
"attention_mask": attention_mask,
"head_mask": head_mask,
"decoder_head_mask": decoder_head_mask,
"decoder_attention_mask": decoder_attention_mask,
"cross_attn_head_mask": cross_attn_head_mask,
"use_cache": use_cache,
}
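# 补充示意(非 modeling_mt5.py 源码):增量解码的关键裁剪逻辑 —— 当缓存 past_key_values 存在时,
# 只把尚未被缓存覆盖的(通常是最新一个)token 作为 decoder_input_ids 传入;张量与长度均为假设值。
import torch

input_ids = torch.tensor([[0, 13, 7, 42]])   # 当前已生成的完整 decoder 序列
past_length = 3                              # 假设缓存中已有前 3 个位置的 key/value

if input_ids.shape[1] > past_length:
    remove_prefix_length = past_length
else:
    remove_prefix_length = input_ids.shape[1] - 1   # 旧行为:只保留最后一个 token
print(input_ids[:, remove_prefix_length:])   # tensor([[42]]),只把新 token 交给解码器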
# 从 transformers.models.t5.modeling_t5.T5ForConditionalGeneration.prepare_decoder_input_ids_from_labels 复制而来,准备从标签生成解码器输入 token IDs
def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
return self._shift_right(labels)
# 从 transformers.models.t5.modeling_t5.T5ForConditionalGeneration._reorder_cache 复制而来,重新排序缓存
# 重新排列缓存中的过去键值,以便与beam索引对应
def _reorder_cache(self, past_key_values, beam_idx):
# 如果输出中没有包含过去的键值(未启用缓存)
# 则无法进行快速解码,也无需重新排序
if past_key_values is None:
# 提示用户可能需要设置`use_cache=True`来加快解码速度
logger.warning("You might want to consider setting `use_cache=True` to speed up decoding")
return past_key_values
# 重新排序后的解码器过去状态
reordered_decoder_past = ()
for layer_past_states in past_key_values:
# 从层过去状态中获取正确的批次索引,批次维度在第二个位置
reordered_layer_past_states = ()
for layer_past_state in layer_past_states:
# 需要为四个键/值状态中的每一个设置正确的 `past`
reordered_layer_past_states = reordered_layer_past_states + (
layer_past_state.index_select(0, beam_idx.to(layer_past_state.device)),
)
# 检查重新排序后的第一个层过去状态的形状与原始的是否匹配
if reordered_layer_past_states[0].shape != layer_past_states[0].shape:
raise ValueError(
f"reordered_layer_past_states[0] shape {reordered_layer_past_states[0].shape} and layer_past_states[0] shape {layer_past_states[0].shape} mismatched"
)
# 检查重新排序后的过去状态列表长度与原始列表是否匹配
if len(reordered_layer_past_states) != len(layer_past_states):
raise ValueError(
f"length of reordered_layer_past_states {len(reordered_layer_past_states)} and length of layer_past_states {len(layer_past_states)} mismatched"
)
# 将重新排序后的层过去状态添加到重新排序后的解码器过去状态中
reordered_decoder_past = reordered_decoder_past + (reordered_layer_past_states,)
# 返回重新排序后的解码器过去状态
return reordered_decoder_past
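# 补充示意(非 modeling_mt5.py 源码):beam search 回溯路径时,按 beam_idx 在第 0 维上重排每层缓存的 key/value。
# 下面用随机张量演示 index_select 的作用,形状与索引均为假设值。
import torch

num_beams, n_heads, seq_len, head_dim = 3, 2, 4, 8
layer_past_state = torch.randn(num_beams, n_heads, seq_len, head_dim)   # 某层缓存中的一个张量
beam_idx = torch.tensor([2, 0, 0])   # 新的每个 beam 分别继承自旧的哪个 beam

reordered = layer_past_state.index_select(0, beam_idx)
print(torch.equal(reordered[1], layer_past_state[0]))   # True:新 beam 1 复制了旧 beam 0 的缓存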
# 使用装饰器为类添加文档字符串,描述了该类的基本信息和使用示例
@add_start_docstrings(
"The bare MT5 Model transformer outputting encoder's raw hidden-states without any specific head on top.",
MT5_START_DOCSTRING,
)
class MT5EncoderModel(MT5PreTrainedModel):
r"""
Examples:
```
>>> from transformers import MT5EncoderModel, AutoTokenizer
>>> model = MT5EncoderModel.from_pretrained("google/mt5-small")
>>> tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")
>>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
>>> input_ids = tokenizer(article, return_tensors="pt").input_ids
>>> outputs = model(input_ids)
>>> hidden_state = outputs.last_hidden_state
```"""
# 设置模型类型为 "mt5"
model_type = "mt5"
# 指定配置类为 MT5Config
config_class = MT5Config
# 定义了需要绑定权重的键列表
_tied_weights_keys = ["encoder.embed_tokens.weight"]
# 从 transformers.models.t5.modeling_t5.T5EncoderModel.__init__ 复制并修改为 MT5EncoderModel
def __init__(self, config: MT5Config):
super().__init__(config)
# 创建共享的嵌入层,使用配置中的词汇表大小和模型维度
self.shared = nn.Embedding(config.vocab_size, config.d_model)
# 复制配置以便修改而不影响原始配置,设置不使用缓存和不是编码器-解码器模型
encoder_config = copy.deepcopy(config)
encoder_config.use_cache = False
encoder_config.is_encoder_decoder = False
# 创建 MT5 堆栈编码器
self.encoder = MT5Stack(encoder_config, self.shared)
# 初始化权重并应用最终处理
self.post_init()
# 模型并行设置
self.model_parallel = False
self.device_map = None
# 从 transformers.models.t5.modeling_t5.T5EncoderModel.parallelize 复制而来
@add_start_docstrings(PARALLELIZE_DOCSTRING)
def parallelize(self, device_map=None):
# 发出警告,说明方法已弃用,将在 Transformers v5 版本中删除
warnings.warn(
"`T5EncoderModel.parallelize` is deprecated and will be removed in v5 of Transformers, you should load"
" your model with `device_map='balanced'` in the call to `from_pretrained`. You can also provide your own"
" `device_map` but it needs to be a dictionary module_name to device, so for instance {'block.0': 0,"
" 'block.1': 1, ...}",
FutureWarning,
)
# 根据传入的 device_map 参数设置设备映射
self.device_map = (
get_device_map(len(self.encoder.block), range(torch.cuda.device_count()))
if device_map is None
else device_map
)
# 断言设备映射的有效性
assert_device_map(self.device_map, len(self.encoder.block))
# 将编码器对象分布到多个设备上
self.encoder.parallelize(self.device_map)
self.model_parallel = True
# 从 transformers.models.t5.modeling_t5.T5EncoderModel.deparallelize 复制而来
@add_start_docstrings(DEPARALLELIZE_DOCSTRING)
def deparallelize(self):
# 发出警告,说明方法已弃用,将在 Transformers v5 版本中删除
warnings.warn(
"Like `parallelize`, `deparallelize` is deprecated and will be removed in v5 of Transformers.",
FutureWarning,
)
# 取消编码器对象的并行化
self.encoder.deparallelize()
# 将编码器对象移回 CPU
self.encoder = self.encoder.to("cpu")
self.model_parallel = False
self.device_map = None
# 清空 CUDA 缓存
torch.cuda.empty_cache()
# 从 transformers.models.t5.modeling_t5.T5EncoderModel.get_input_embeddings 复制而来
# 返回当前模型共享的输入嵌入向量
def get_input_embeddings(self):
return self.shared
# 从给定的新嵌入向量设置模型共享的输入嵌入向量,并更新编码器的输入嵌入
def set_input_embeddings(self, new_embeddings):
self.shared = new_embeddings
self.encoder.set_input_embeddings(new_embeddings)
# 返回当前模型的编码器
def get_encoder(self):
return self.encoder
# 剪枝模型中编码器的注意力头
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
for layer, heads in heads_to_prune.items():
self.encoder.block[layer].layer[0].SelfAttention.prune_heads(heads)
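# 补充示意(非 modeling_mt5.py 源码):剪枝接口通常经由基类的 prune_heads({层号: [头编号]}) 调用到这里。
# 下面用一个缩小的假设配置直接调用 _prune_heads,观察第 0 层注意力头数的变化;假设本文件中的类已可用。
from transformers import MT5Config

config = MT5Config(d_model=16, d_ff=32, d_kv=4, num_heads=4, num_layers=2, vocab_size=100)
model = MT5EncoderModel(config)
model._prune_heads({0: [0, 1]})   # 剪掉编码器第 0 层的头 0 和头 1
print(model.encoder.block[0].layer[0].SelfAttention.n_heads)   # 2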
@add_start_docstrings_to_model_forward(MT5_ENCODER_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=BaseModelOutput, config_class=_CONFIG_FOR_DOC)
# 重写的前向传播函数,用于MT5模型,接受多种输入并返回编码器的输出
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.FloatTensor], BaseModelOutput]:
r"""
Returns:
Example:
```
>>> from transformers import AutoTokenizer, MT5EncoderModel
>>> tokenizer = AutoTokenizer.from_pretrained("google-mt5/mt5-small")
>>> model = MT5EncoderModel.from_pretrained("google-mt5/mt5-small")
>>> input_ids = tokenizer(
... "Studies have been shown that owning a dog is good for you", return_tensors="pt"
... ).input_ids
>>> outputs = model(input_ids=input_ids)
>>> last_hidden_states = outputs.last_hidden_state
```"""
# 如果return_dict未指定,则根据配置确定是否使用返回字典
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# 调用编码器的前向传播,传递输入参数并返回编码器的输出
encoder_outputs = self.encoder(
input_ids=input_ids,
attention_mask=attention_mask,
inputs_embeds=inputs_embeds,
head_mask=head_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
return encoder_outputs
"""
MT5 model with a sequence classification/head on top (a linear layer on top of the pooled output) e.g. for GLUE
tasks.
"""
@add_start_docstrings(
"""
MT5 model with a sequence classification/head on top (a linear layer on top of the pooled output) e.g. for GLUE
tasks.
""",
MT5_START_DOCSTRING,
)
class MT5ForSequenceClassification(MT5PreTrainedModel):
_keys_to_ignore_on_load_unexpected = ["decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight"]
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
# Copied from transformers.models.t5.modeling_t5.T5ForSequenceClassification.__init__ with T5->MT5
def __init__(self, config: MT5Config):
super().__init__(config)
self.transformer = MT5Model(config) # 初始化MT5模型
self.classification_head = MT5ClassificationHead(config) # 初始化分类头部
# Initialize weights and apply final processing
self.post_init() # 初始化后处理步骤
self.model_parallel = False # 设置模型并行为False
@add_start_docstrings_to_model_forward(MT5_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=Seq2SeqSequenceClassifierOutput, config_class=_CONFIG_FOR_DOC)
# Copied from transformers.models.t5.modeling_t5.T5ForSequenceClassification.forward
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
decoder_input_ids: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.Tensor] = None,
decoder_head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
encoder_outputs: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
"""
Perform forward pass of the MT5 model for sequence classification.
"""
# Forward pass through MT5 model and classification head
# 正向传播通过MT5模型和分类头部
# 详细参数说明参见MT5_INPUTS_DOCSTRING
pass
"""
MT5 Encoder Model with a token classification head on top (a linear layer on top of the hidden-states output)
e.g. for Named-Entity-Recognition (NER) tasks.
"""
@add_start_docstrings(
"""
MT5 Encoder Model with a token classification head on top (a linear layer on top of the hidden-states output)
e.g. for Named-Entity-Recognition (NER) tasks.
""",
MT5_START_DOCSTRING,
)
class MT5ForTokenClassification(MT5PreTrainedModel):
_tied_weights_keys = ["transformer.encoder.embed_tokens.weight"]
# Copied from transformers.models.t5.modeling_t5.T5ForTokenClassification.__init__ with T5->MT5
def __init__(self, config: MT5Config):
super().__init__(config)
self.num_labels = config.num_labels # 设置标签数量
self.transformer = MT5EncoderModel(config) # 初始化MT5编码器模型
self.dropout = nn.Dropout(config.classifier_dropout) # 初始化Dropout层
self.classifier = nn.Linear(config.hidden_size, config.num_labels) # 初始化线性分类器
# Initialize weights and apply final processing
self.post_init() # 初始化后处理步骤
@add_start_docstrings_to_model_forward(MT5_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=TokenClassifierOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
"""
Perform forward pass of the MT5 model for token classification.
"""
# Forward pass through MT5 model and token classification head
# 正向传播通过MT5模型和标记分类头部
# 详细参数说明参见MT5_INPUTS_DOCSTRING
pass
# 从transformers.models.mt5.modeling_mt5.MT5ForTokenClassification.forward中复制而来
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
计算标记分类损失的标签。索引应在 `[0, ..., config.num_labels - 1]` 范围内。
Returns:
返回一个元组或者TokenClassifierOutput对象。
"""
# 确定是否返回字典格式的输出
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# 使用transformer模型处理输入
outputs = self.transformer(
input_ids,
attention_mask=attention_mask,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# 获取模型输出中的隐藏状态
hidden_states = outputs[0]
# 对隐藏状态应用dropout层
hidden_states = self.dropout(hidden_states)
# 将处理后的隐藏状态传入分类器得到logits
logits = self.classifier(hidden_states)
# 初始化损失值为None
loss = None
# 如果有标签,则计算损失值
if labels is not None:
loss_fct = CrossEntropyLoss()
# 计算交叉熵损失
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
# 如果不要求返回字典格式的输出
if not return_dict:
# 构建输出元组
output = (logits, outputs[2:-1])
# 如果损失不为None,则将损失值加入输出元组中
return ((loss,) + output) if loss is not None else output
# 返回TokenClassifierOutput对象,包含损失、logits、隐藏状态和注意力值
return TokenClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
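# 补充示意(非 modeling_mt5.py 源码):用缩小的假设配置跑一次 token 分类前向,确认 loss 与 logits 的形状;
# 假设本文件中的 MT5ForTokenClassification 已可用。
import torch
from transformers import MT5Config

config = MT5Config(
    d_model=16, d_ff=32, d_kv=4, num_heads=4, num_layers=2, vocab_size=100, num_labels=5
)
model = MT5ForTokenClassification(config)
input_ids = torch.randint(0, 100, (2, 7))
labels = torch.randint(0, 5, (2, 7))
out = model(input_ids=input_ids, labels=labels)
print(out.loss is not None, out.logits.shape)   # True torch.Size([2, 7, 5])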
@add_start_docstrings(
"""
MT5 Model with a span classification head on top for extractive question-answering tasks like SQuAD (linear layers
on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
MT5_START_DOCSTRING,
)
class MT5ForQuestionAnswering(MT5PreTrainedModel):
_keys_to_ignore_on_load_unexpected = ["decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight"]
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
# Copied from transformers.models.t5.modeling_t5.T5ForQuestionAnswering.__init__ with T5->MT5
def __init__(self, config: MT5Config):
super().__init__(config)
self.model_dim = config.d_model
# Embedding layer shared between encoder and decoder
self.shared = nn.Embedding(config.vocab_size, config.d_model)
# Initialize encoder with MT5Stack
encoder_config = copy.deepcopy(config)
encoder_config.is_decoder = False
encoder_config.use_cache = False
encoder_config.is_encoder_decoder = False
self.encoder = MT5Stack(encoder_config, self.shared)
# Initialize decoder with MT5Stack
decoder_config = copy.deepcopy(config)
decoder_config.is_decoder = True
decoder_config.is_encoder_decoder = False
decoder_config.num_layers = config.num_decoder_layers
self.decoder = MT5Stack(decoder_config, self.shared)
# Output layer for question answering logits
self.num_labels = config.num_labels
self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
# Initialize weights and apply final processing
self.post_init()
self.model_parallel = False
# Copied from transformers.models.t5.modeling_t5.T5ForQuestionAnswering.get_input_embeddings
def get_input_embeddings(self):
return self.shared
# Copied from transformers.models.t5.modeling_t5.T5ForQuestionAnswering.set_input_embeddings
def set_input_embeddings(self, new_embeddings):
# Set new embeddings for shared layer and update encoder and decoder embeddings
self.shared = new_embeddings
self.encoder.set_input_embeddings(new_embeddings)
self.decoder.set_input_embeddings(new_embeddings)
# Copied from transformers.models.t5.modeling_t5.T5ForQuestionAnswering.get_encoder
def get_encoder(self):
return self.encoder
# Copied from transformers.models.t5.modeling_t5.T5ForQuestionAnswering.get_decoder
def get_decoder(self):
return self.decoder
@add_start_docstrings_to_model_forward(MT5_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=Seq2SeqQuestionAnsweringModelOutput, config_class=_CONFIG_FOR_DOC)
# Copied from transformers.models.t5.modeling_t5.T5ForQuestionAnswering.forward
# 定义模型的前向传播方法,接受多个可选的输入参数
def forward(
self,
input_ids: Optional[torch.LongTensor] = None, # 输入序列的token IDs,可选的长整型张量
attention_mask: Optional[torch.FloatTensor] = None, # 输入序列的注意力掩码,可选的浮点数张量
decoder_input_ids: Optional[torch.LongTensor] = None, # 解码器输入序列的token IDs,可选的长整型张量
decoder_attention_mask: Optional[torch.BoolTensor] = None, # 解码器输入序列的注意力掩码,可选的布尔张量
head_mask: Optional[torch.FloatTensor] = None, # 多头注意力机制的头掩码,可选的浮点数张量
decoder_head_mask: Optional[torch.FloatTensor] = None, # 解码器的多头注意力机制的头掩码,可选的浮点数张量
cross_attn_head_mask: Optional[torch.Tensor] = None, # 交叉注意力机制的头掩码,可选的张量
encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None, # 编码器输出的元组,可选的张量元组
start_positions: Optional[torch.LongTensor] = None, # 开始位置的token IDs,可选的长整型张量
end_positions: Optional[torch.LongTensor] = None, # 结束位置的token IDs,可选的长整型张量
inputs_embeds: Optional[torch.FloatTensor] = None, # 输入嵌入的张量,可选的浮点数张量
decoder_inputs_embeds: Optional[torch.FloatTensor] = None, # 解码器输入嵌入的张量,可选的浮点数张量
use_cache: Optional[bool] = None, # 是否使用缓存,可选的布尔值
output_attentions: Optional[bool] = None, # 是否输出注意力权重,可选的布尔值
output_hidden_states: Optional[bool] = None, # 是否输出隐藏状态,可选的布尔值
return_dict: Optional[bool] = None, # 是否返回字典格式的结果,可选的布尔值