Transformers Source Code Analysis (128)
.\models\xlm_roberta_xl\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
_import_structure = {
"configuration_xlm_roberta_xl": [
"XLM_ROBERTA_XL_PRETRAINED_CONFIG_ARCHIVE_MAP",
"XLMRobertaXLConfig",
"XLMRobertaXLOnnxConfig",
],
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_xlm_roberta_xl"] = [
"XLM_ROBERTA_XL_PRETRAINED_MODEL_ARCHIVE_LIST",
"XLMRobertaXLForCausalLM",
"XLMRobertaXLForMaskedLM",
"XLMRobertaXLForMultipleChoice",
"XLMRobertaXLForQuestionAnswering",
"XLMRobertaXLForSequenceClassification",
"XLMRobertaXLForTokenClassification",
"XLMRobertaXLModel",
"XLMRobertaXLPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_xlm_roberta_xl import (
XLM_ROBERTA_XL_PRETRAINED_CONFIG_ARCHIVE_MAP,
XLMRobertaXLConfig,
XLMRobertaXLOnnxConfig,
)
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_xlm_roberta_xl import (
XLM_ROBERTA_XL_PRETRAINED_MODEL_ARCHIVE_LIST,
XLMRobertaXLForCausalLM,
XLMRobertaXLForMaskedLM,
XLMRobertaXLForMultipleChoice,
XLMRobertaXLForQuestionAnswering,
XLMRobertaXLForSequenceClassification,
XLMRobertaXLForTokenClassification,
XLMRobertaXLModel,
XLMRobertaXLPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure)
.\models\xlnet\configuration_xlnet.py
import warnings
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"xlnet/xlnet-base-cased": "https://huggingface.co/xlnet/xlnet-base-cased/resolve/main/config.json",
"xlnet/xlnet-large-cased": "https://huggingface.co/xlnet/xlnet-large-cased/resolve/main/config.json",
}
class XLNetConfig(PretrainedConfig):
"""
    This is the configuration class to store the configuration of an [`XLNetModel`] or a [`TFXLNetModel`]. It is used
    to instantiate an XLNet model according to the specified arguments, defining the model architecture. Instantiating
    a configuration with the defaults will yield a configuration similar to that of the
    [xlnet/xlnet-large-cased](https://huggingface.co/xlnet/xlnet-large-cased) architecture.
    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.
    Examples:
    ```
    >>> from transformers import XLNetConfig, XLNetModel
    >>> # Initializing an XLNet configuration
    >>> configuration = XLNetConfig()
    >>> # Initializing a model (with random weights) from the configuration
    >>> model = XLNetModel(configuration)
    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
"""
model_type = "xlnet"
keys_to_ignore_at_inference = ["mems"]
attribute_map = {
"n_token": "vocab_size",
"hidden_size": "d_model",
"num_attention_heads": "n_head",
"num_hidden_layers": "n_layer",
}
def __init__(
self,
vocab_size=32000,
d_model=1024,
n_layer=24,
n_head=16,
d_inner=4096,
ff_activation="gelu",
untie_r=True,
attn_type="bi",
initializer_range=0.02,
layer_norm_eps=1e-12,
dropout=0.1,
mem_len=512,
reuse_len=None,
use_mems_eval=True,
use_mems_train=False,
bi_data=False,
clamp_len=-1,
same_length=False,
summary_type="last",
summary_use_proj=True,
summary_activation="tanh",
summary_last_dropout=0.1,
start_n_top=5,
end_n_top=5,
pad_token_id=5,
bos_token_id=1,
eos_token_id=2,
**kwargs,
):
"""Constructs XLNetConfig."""
self.vocab_size = vocab_size
self.d_model = d_model
self.n_layer = n_layer
self.n_head = n_head
if d_model % n_head != 0:
raise ValueError(f"'d_model % n_head' ({d_model % n_head}) should be equal to 0")
if "d_head" in kwargs:
if kwargs["d_head"] != d_model // n_head:
raise ValueError(
f"`d_head` ({kwargs['d_head']}) should be equal to `d_model // n_head` ({d_model // n_head})"
)
self.d_head = d_model // n_head
self.ff_activation = ff_activation
self.d_inner = d_inner
self.untie_r = untie_r
self.attn_type = attn_type
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.dropout = dropout
self.mem_len = mem_len
self.reuse_len = reuse_len
self.bi_data = bi_data
self.clamp_len = clamp_len
self.same_length = same_length
self.summary_type = summary_type
self.summary_use_proj = summary_use_proj
self.summary_activation = summary_activation
self.summary_last_dropout = summary_last_dropout
self.start_n_top = start_n_top
self.end_n_top = end_n_top
self.bos_token_id = bos_token_id
self.pad_token_id = pad_token_id
self.eos_token_id = eos_token_id
if "use_cache" in kwargs:
warnings.warn(
"The `use_cache` argument is deprecated and will be removed in a future version, use `use_mems_eval`"
" instead.",
FutureWarning,
)
use_mems_eval = kwargs["use_cache"]
self.use_mems_eval = use_mems_eval
self.use_mems_train = use_mems_train
super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
@property
def max_position_embeddings(self):
logger.info(f"The model {self.model_type} is one of the few models that has no sequence length limit.")
return -1
@max_position_embeddings.setter
def max_position_embeddings(self, value):
raise NotImplementedError(
f"The model {self.model_type} is one of the few models that has no sequence length limit."
)
.\models\xlnet\convert_xlnet_original_tf_checkpoint_to_pytorch.py
"""Convert BERT checkpoint."""
import argparse
import os
import torch
from transformers import (
XLNetConfig,
XLNetForQuestionAnswering,
XLNetForSequenceClassification,
XLNetLMHeadModel,
load_tf_weights_in_xlnet,
)
from transformers.utils import CONFIG_NAME, WEIGHTS_NAME, logging
GLUE_TASKS_NUM_LABELS = {
"cola": 2,
"mnli": 3,
"mrpc": 2,
"sst-2": 2,
"sts-b": 1,
"qqp": 2,
"qnli": 2,
"rte": 2,
"wnli": 2,
}
logging.set_verbosity_info()
def convert_xlnet_checkpoint_to_pytorch(
tf_checkpoint_path, bert_config_file, pytorch_dump_folder_path, finetuning_task=None
):
config = XLNetConfig.from_json_file(bert_config_file)
finetuning_task = finetuning_task.lower() if finetuning_task is not None else ""
if finetuning_task in GLUE_TASKS_NUM_LABELS:
print(f"Building PyTorch XLNetForSequenceClassification model from configuration: {config}")
config.finetuning_task = finetuning_task
config.num_labels = GLUE_TASKS_NUM_LABELS[finetuning_task]
model = XLNetForSequenceClassification(config)
elif "squad" in finetuning_task:
config.finetuning_task = finetuning_task
model = XLNetForQuestionAnswering(config)
else:
model = XLNetLMHeadModel(config)
load_tf_weights_in_xlnet(model, config, tf_checkpoint_path)
pytorch_weights_dump_path = os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME)
pytorch_config_dump_path = os.path.join(pytorch_dump_folder_path, CONFIG_NAME)
print(f"Save PyTorch model to {os.path.abspath(pytorch_weights_dump_path)}")
torch.save(model.state_dict(), pytorch_weights_dump_path)
print(f"Save configuration file to {os.path.abspath(pytorch_config_dump_path)}")
with open(pytorch_config_dump_path, "w", encoding="utf-8") as f:
f.write(config.to_json_string())
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path."
)
parser.add_argument(
"--xlnet_config_file",
default=None,
type=str,
required=True,
help=(
"The config json file corresponding to the pre-trained XLNet model. \n"
"This specifies the model architecture."
),
)
parser.add_argument(
"--pytorch_dump_folder_path",
default=None,
type=str,
required=True,
help="Path to the folder to store the PyTorch model or dataset/vocab.",
)
parser.add_argument(
"--finetuning_task",
default=None,
type=str,
help="Name of a task on which the XLNet TensorFlow model was fine-tuned",
)
args = parser.parse_args()
print(args)
convert_xlnet_checkpoint_to_pytorch(
args.tf_checkpoint_path, args.xlnet_config_file, args.pytorch_dump_folder_path, args.finetuning_task
)
.\models\xlnet\modeling_tf_xlnet.py
"""
TF 2.0 XLNet model.
"""
from __future__ import annotations
import warnings
from dataclasses import dataclass
from typing import List, Optional, Tuple, Union
import numpy as np
import tensorflow as tf
from ...activations_tf import get_tf_activation
from ...modeling_tf_utils import (
TFCausalLanguageModelingLoss,
TFModelInputType,
TFMultipleChoiceLoss,
TFPreTrainedModel,
TFQuestionAnsweringLoss,
TFSequenceClassificationLoss,
TFSequenceSummary,
TFSharedEmbeddings,
TFTokenClassificationLoss,
get_initializer,
keras,
keras_serializable,
unpack_inputs,
)
from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax
from ...utils import (
ModelOutput,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_xlnet import XLNetConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "xlnet/xlnet-base-cased"
_CONFIG_FOR_DOC = "XLNetConfig"
TF_XLNET_PRETRAINED_MODEL_ARCHIVE_LIST = [
"xlnet/xlnet-base-cased",
"xlnet/xlnet-large-cased",
]
class TFXLNetRelativeAttention(keras.layers.Layer):
"""
    TensorFlow 2.0 implementation of the relative attention layer.
    Args:
        config (XLNetConfig): Configuration object for the XLNet model.
    Raises:
        ValueError: If the hidden size in the config is not a multiple of the number of attention heads.
    Attributes:
        n_head (int): Number of attention heads.
        d_head (int): Hidden size of each attention head.
        d_model (int): Hidden size of the model.
        scale (float): Scaling factor used in the attention computation.
        initializer_range (float): Initializer range.
        output_attentions (bool): Whether to output the attention weights.
        layer_norm (keras.layers.LayerNormalization): Layer normalization applied to each sub-layer output.
        dropout (keras.layers.Dropout): Dropout layer.
        config (XLNetConfig): Configuration object for the XLNet model.
"""
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
if config.d_model % config.n_head != 0:
raise ValueError(
f"The hidden size ({config.d_model}) is not a multiple of the number of attention "
f"heads ({config.n_head}"
)
self.n_head = config.n_head
self.d_head = config.d_head
self.d_model = config.d_model
self.scale = 1 / (config.d_head**0.5)
self.initializer_range = config.initializer_range
self.output_attentions = config.output_attentions
self.layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm")
self.dropout = keras.layers.Dropout(config.dropout)
self.config = config
def build(self, input_shape=None):
initializer = get_initializer(self.initializer_range)
self.q = self.add_weight(
shape=(self.d_model, self.n_head, self.d_head), initializer=initializer, trainable=True, name="q"
)
self.k = self.add_weight(
shape=(self.d_model, self.n_head, self.d_head), initializer=initializer, trainable=True, name="k"
)
self.v = self.add_weight(
shape=(self.d_model, self.n_head, self.d_head), initializer=initializer, trainable=True, name="v"
)
self.o = self.add_weight(
shape=(self.d_model, self.n_head, self.d_head), initializer=initializer, trainable=True, name="o"
)
self.r = self.add_weight(
shape=(self.d_model, self.n_head, self.d_head), initializer=initializer, trainable=True, name="r"
)
self.r_r_bias = self.add_weight(
shape=(self.n_head, self.d_head), initializer="zeros", trainable=True, name="r_r_bias"
)
self.r_s_bias = self.add_weight(
shape=(self.n_head, self.d_head), initializer="zeros", trainable=True, name="r_s_bias"
)
self.r_w_bias = self.add_weight(
shape=(self.n_head, self.d_head), initializer="zeros", trainable=True, name="r_w_bias"
)
self.seg_embed = self.add_weight(
shape=(2, self.n_head, self.d_head), initializer=initializer, trainable=True, name="seg_embed"
)
if self.built:
return
self.built = True
if getattr(self, "layer_norm", None) is not None:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build([None, None, self.config.d_model])
def prune_heads(self, heads):
raise NotImplementedError
def rel_shift(self, x, klen=-1):
"""perform relative shift to form the relative attention score."""
x_size = shape_list(x)
x = tf.reshape(x, (x_size[1], x_size[0], x_size[2], x_size[3]))
x = x[1:, ...]
x = tf.reshape(x, (x_size[0], x_size[1] - 1, x_size[2], x_size[3]))
x = x[:, 0:klen, :, :]
return x
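    # Illustrative walk-through of rel_shift: for x of shape (qlen, klen, bsz, n_head), where
    # x[i, j] scores query i against relative offset j, swapping the first two axes, dropping
    # the first row, and reshaping back effectively left-shifts row i by i positions. After
    # slicing to klen columns, x[i, j] is aligned with absolute key position j. This is the
    # standard Transformer-XL trick that avoids an explicit gather over relative positions.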
def rel_attn_core(
self, q_head, k_head_h, v_head_h, k_head_r, seg_mat, attn_mask, head_mask, output_attentions, training=False
):
"""Core relative positional attention operations."""
ac = tf.einsum("ibnd,jbnd->ijbn", q_head + self.r_w_bias, k_head_h)
bd = tf.einsum("ibnd,jbnd->ijbn", q_head + self.r_r_bias, k_head_r)
bd = self.rel_shift(bd, klen=shape_list(ac)[1])
if seg_mat is None:
ef = 0
else:
ef = tf.einsum("ibnd,snd->ibns", q_head + self.r_s_bias, self.seg_embed)
ef = tf.einsum("ijbs,ibns->ijbn", seg_mat, ef)
attn_score = (ac + bd + ef) * self.scale
if attn_mask is not None:
if attn_mask.dtype == tf.float16 or attn_mask.dtype == tf.bfloat16:
attn_score = attn_score - 65500 * attn_mask
else:
attn_score = attn_score - 1e30 * attn_mask
attn_prob = stable_softmax(attn_score, axis=1)
attn_prob = self.dropout(attn_prob, training=training)
if head_mask is not None:
attn_prob = attn_prob * head_mask
attn_vec = tf.einsum("ijbn,jbnd->ibnd", attn_prob, v_head_h)
if output_attentions:
return attn_vec, attn_prob
return attn_vec
def post_attention(self, h, attn_vec, residual=True, training=False):
"""Post-attention processing."""
attn_out = tf.einsum("ibnd,hnd->ibh", attn_vec, self.o)
attn_out = self.dropout(attn_out, training=training)
if residual:
attn_out = attn_out + h
output = self.layer_norm(attn_out)
return output
def call(
self,
h,
g,
attn_mask_h,
attn_mask_g,
r,
seg_mat,
mems: np.ndarray | tf.Tensor | None = None,
target_mapping: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = False,
        training: bool = False,
    ):
        ...  # full attention computation elided in this excerpt
class TFXLNetFeedForward(keras.layers.Layer):
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm")
self.layer_1 = keras.layers.Dense(
config.d_inner, kernel_initializer=get_initializer(config.initializer_range), name="layer_1"
)
self.layer_2 = keras.layers.Dense(
config.d_model, kernel_initializer=get_initializer(config.initializer_range), name="layer_2"
)
self.dropout = keras.layers.Dropout(config.dropout)
if isinstance(config.ff_activation, str):
self.activation_function = get_tf_activation(config.ff_activation)
else:
self.activation_function = config.ff_activation
self.config = config
def call(self, inp, training=False):
output = inp
output = self.layer_1(output)
output = self.activation_function(output)
output = self.dropout(output, training=training)
output = self.layer_2(output)
output = self.dropout(output, training=training)
output = self.layer_norm(output + inp)
return output
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "layer_norm", None) is not None:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build([None, None, self.config.d_model])
if getattr(self, "layer_1", None) is not None:
with tf.name_scope(self.layer_1.name):
self.layer_1.build([None, None, self.config.d_model])
if getattr(self, "layer_2", None) is not None:
with tf.name_scope(self.layer_2.name):
self.layer_2.build([None, None, self.config.d_inner])
class TFXLNetLayer(keras.layers.Layer):
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.rel_attn = TFXLNetRelativeAttention(config, name="rel_attn")
self.ff = TFXLNetFeedForward(config, name="ff")
self.dropout = keras.layers.Dropout(config.dropout)
def call(
self,
output_h,
output_g,
non_tgt_mask,
attn_mask,
pos_emb,
seg_mat,
mems: np.ndarray | tf.Tensor | None = None,
target_mapping: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = False,
training: bool = False,
):
outputs = self.rel_attn(
output_h,
output_g,
non_tgt_mask,
attn_mask,
pos_emb,
seg_mat,
mems,
target_mapping,
head_mask,
output_attentions,
training=training,
)
output_h, output_g = outputs[:2]
if output_g is not None:
output_g = self.ff(output_g, training=training)
output_h = self.ff(output_h, training=training)
outputs = (output_h, output_g) + outputs[2:]
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "rel_attn", None) is not None:
with tf.name_scope(self.rel_attn.name):
self.rel_attn.build(None)
if getattr(self, "ff", None) is not None:
with tf.name_scope(self.ff.name):
self.ff.build(None)
class TFXLNetLMHead(keras.layers.Layer):
def __init__(self, config, input_embeddings, **kwargs):
super().__init__(**kwargs)
self.config = config
self.input_embeddings = input_embeddings
def build(self, input_shape):
self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias")
super().build(input_shape)
def get_output_embeddings(self):
return self.input_embeddings
def set_output_embeddings(self, value):
self.input_embeddings.weight = value
self.input_embeddings.vocab_size = shape_list(value)[0]
def get_bias(self):
return {"bias": self.bias}
def set_bias(self, value):
self.bias = value["bias"]
self.config.vocab_size = shape_list(value["bias"])[0]
def call(self, hidden_states):
hidden_states = self.input_embeddings(hidden_states, mode="linear")
hidden_states = hidden_states + self.bias
return hidden_states
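The head above implements weight tying: in `"linear"` mode, the shared embedding layer multiplies hidden states by the transpose of the embedding matrix. A minimal NumPy sketch of the same computation (illustrative shapes, not the library call):

```python
import numpy as np

vocab_size, d_model = 32000, 1024
embedding = np.random.randn(vocab_size, d_model).astype(np.float32)  # shared input embedding
bias = np.zeros(vocab_size, dtype=np.float32)                        # the head's own bias term

hidden = np.random.randn(2, 7, d_model).astype(np.float32)  # (batch, seq, d_model)
logits = hidden @ embedding.T + bias                         # (batch, seq, vocab_size)
assert logits.shape == (2, 7, vocab_size)
```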
@keras_serializable
class TFXLNetMainLayer(keras.layers.Layer):
config_class = XLNetConfig
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.config = config
self.output_hidden_states = config.output_hidden_states
self.output_attentions = config.output_attentions
self.return_dict = config.return_dict
self.mem_len = config.mem_len
self.reuse_len = config.reuse_len
self.d_model = config.d_model
self.same_length = config.same_length
self.attn_type = config.attn_type
self.bi_data = config.bi_data
self.clamp_len = config.clamp_len
self.n_layer = config.n_layer
self.use_bfloat16 = config.use_bfloat16
self.initializer_range = config.initializer_range
self.word_embedding = TFSharedEmbeddings(
config.vocab_size, config.d_model, initializer_range=config.initializer_range, name="word_embedding"
)
self.layer = [TFXLNetLayer(config, name=f"layer_._{i}") for i in range(config.n_layer)]
self.dropout = keras.layers.Dropout(config.dropout)
self.use_mems_eval = config.use_mems_eval
self.use_mems_train = config.use_mems_train
def get_input_embeddings(self):
return self.word_embedding
def set_input_embeddings(self, value):
self.word_embedding.weight = value
self.word_embedding.vocab_size = shape_list(value)[0]
def build(self, input_shape=None):
initializer = get_initializer(self.initializer_range)
self.mask_emb = self.add_weight(
shape=(1, 1, self.d_model), initializer=initializer, trainable=True, name="mask_emb"
)
if self.built:
return
self.built = True
if getattr(self, "word_embedding", None) is not None:
with tf.name_scope(self.word_embedding.name):
self.word_embedding.build(None)
if getattr(self, "layer", None) is not None:
for layer in self.layer:
with tf.name_scope(layer.name):
layer.build(None)
def _prune_heads(self, heads_to_prune):
raise NotImplementedError
def create_mask(self, qlen, mlen):
"""
Creates causal attention mask. Float mask where 1.0 indicates masked, 0.0 indicates not-masked.
Args:
            qlen: Length of the query.
            mlen: Length of the memory.
"""
attn_mask = tf.ones([qlen, qlen])
mask_u = tf.linalg.band_part(attn_mask, 0, -1)
mask_dia = tf.linalg.band_part(attn_mask, 0, 0)
attn_mask_pad = tf.zeros([qlen, mlen])
ret = tf.concat([attn_mask_pad, mask_u - mask_dia], 1)
if self.same_length:
mask_l = tf.linalg.band_part(attn_mask, -1, 0)
ret = tf.concat([ret[:, :qlen] + mask_l - mask_dia, ret[:, qlen:]], 1)
return ret
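    # Illustrative example: for qlen=3 and mlen=2 with same_length=False, create_mask returns
    #   [[0, 0, 0, 1, 1],
    #    [0, 0, 0, 0, 1],
    #    [0, 0, 0, 0, 0]]
    # Every query can attend to all memory positions and to itself; 1.0 marks the masked
    # (future) positions.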
def cache_mem(self, curr_out, prev_mem):
if self.reuse_len is not None and self.reuse_len > 0:
curr_out = curr_out[: self.reuse_len]
if self.mem_len is None or self.mem_len == 0:
cutoff = 0
else:
cutoff = -self.mem_len
if prev_mem is None:
new_mem = curr_out[cutoff:]
else:
new_mem = tf.concat([prev_mem, curr_out], 0)[cutoff:]
return tf.stop_gradient(new_mem)
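    # Illustrative example: with reuse_len=None and mem_len=3, if prev_mem holds steps
    # [m1, m2, m3] and curr_out holds [h1, h2], the concatenation is [m1, m2, m3, h1, h2]
    # and the -mem_len cutoff keeps [m3, h1, h2]. tf.stop_gradient detaches the result, so
    # gradients never flow back through the cached memories.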
    def positional_embedding(self, pos_seq, inv_freq, bsz=None):
sinusoid_inp = tf.einsum("i,d->id", pos_seq, inv_freq)
pos_emb = tf.concat([tf.sin(sinusoid_inp), tf.cos(sinusoid_inp)], axis=-1)
pos_emb = pos_emb[:, None, :]
if bsz is not None:
pos_emb = tf.tile(pos_emb, [1, bsz, 1])
return pos_emb
def relative_positional_encoding(self, qlen, klen, bsz=None):
"""create relative positional encoding."""
freq_seq = tf.range(0, self.d_model, 2.0)
inv_freq = 1 / (10000 ** (freq_seq / self.d_model))
if self.attn_type == "bi":
beg, end = klen, -qlen
elif self.attn_type == "uni":
beg, end = klen, -1
else:
raise ValueError(f"Unknown `attn_type` {self.attn_type}.")
if self.bi_data:
fwd_pos_seq = tf.range(beg, end, -1.0)
bwd_pos_seq = tf.range(-beg, -end, 1.0)
if self.clamp_len > 0:
fwd_pos_seq = tf.clip_by_value(fwd_pos_seq, -self.clamp_len, self.clamp_len)
bwd_pos_seq = tf.clip_by_value(bwd_pos_seq, -self.clamp_len, self.clamp_len)
if bsz is not None:
if bsz % 2 != 0:
raise ValueError(f"With bi_data, the batch size {bsz} should be divisible by 2")
fwd_pos_emb = self.positional_embedding(fwd_pos_seq, inv_freq, bsz // 2)
bwd_pos_emb = self.positional_embedding(bwd_pos_seq, inv_freq, bsz // 2)
else:
fwd_pos_emb = self.positional_embedding(fwd_pos_seq, inv_freq)
bwd_pos_emb = self.positional_embedding(bwd_pos_seq, inv_freq)
pos_emb = tf.concat([fwd_pos_emb, bwd_pos_emb], axis=1)
else:
fwd_pos_seq = tf.range(beg, end, -1.0)
if self.clamp_len > 0:
fwd_pos_seq = tf.clip_by_value(fwd_pos_seq, -self.clamp_len, self.clamp_len)
pos_emb = self.positional_embedding(fwd_pos_seq, inv_freq, bsz)
return pos_emb
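    # Illustrative example: with attn_type="bi", qlen=3 and klen=5 (two cached memory steps),
    # fwd_pos_seq = tf.range(5, -3, -1) = [5, 4, 3, 2, 1, 0, -1, -2]: one sinusoid per possible
    # relative distance between a query and a key, so pos_emb has shape (klen + qlen, 1, d_model)
    # (or (klen + qlen, bsz, d_model) when bsz is given) before rel_shift aligns it.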
@unpack_inputs
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
mems: np.ndarray | tf.Tensor | None = None,
perm_mask: np.ndarray | tf.Tensor | None = None,
target_mapping: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
input_mask: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
use_mems: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
        training: bool = False,
    ):
        ...  # full forward computation elided in this excerpt
class TFXLNetPreTrainedModel(TFPreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = XLNetConfig
base_model_prefix = "transformer"
@dataclass
class TFXLNetModelOutput(ModelOutput):
"""
Output type of [`TFXLNetModel`].
Args:
last_hidden_state (`tf.Tensor` of shape `(batch_size, num_predict, hidden_size)`):
Sequence of hidden-states at the last layer of the model.
`num_predict` corresponds to `target_mapping.shape[1]`. If `target_mapping` is `None`, then `num_predict`
corresponds to `sequence_length`.
mems (`List[tf.Tensor]` of length `config.n_layers`):
Contains pre-computed hidden-states. Can be used (see `mems` input) to speed up sequential decoding. The
token ids which have their past given to this model should not be passed as `input_ids` as they have
already been computed.
hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
last_hidden_state: tf.Tensor = None
mems: List[tf.Tensor] | None = None
hidden_states: Tuple[tf.Tensor, ...] | None = None
attentions: Tuple[tf.Tensor, ...] | None = None
@dataclass
class TFXLNetLMHeadModelOutput(ModelOutput):
"""
Output type of [`TFXLNetLMHeadModel`].
"""
pass
Args:
loss (`tf.Tensor` of shape *(1,)*, *optional*, returned when `labels` is provided):
如果提供了 `labels`,则返回的语言建模损失(用于下一个标记预测)。
logits (`tf.Tensor` of shape `(batch_size, num_predict, config.vocab_size)`):
语言建模头部的预测分数(在应用 SoftMax 之前的每个词汇标记的分数)。
`num_predict` 对应于 `target_mapping.shape[1]`。如果 `target_mapping` 是 `None`,则 `num_predict`
对应于 `sequence_length`。
mems (`List[tf.Tensor]` of length `config.n_layers`):
包含预计算的隐藏状态。可以用于加速顺序解码。已经计算过其过去的令牌 id 不应该作为 `input_ids` 传递,
因为它们已经被计算过。
hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
一个元组,包含 `tf.Tensor`(一个用于嵌入输出 + 每个层的输出)的形状为 `(batch_size, sequence_length, hidden_size)`。
模型每个层的输出以及初始嵌入输出的隐藏状态。
attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
一个元组,包含 `tf.Tensor`(每个层的一个)的形状为 `(batch_size, num_heads, sequence_length, sequence_length)`。
注意力 softmax 后的注意力权重,用于计算自注意力头中的加权平均值。
@dataclass
class TFXLNetForSequenceClassificationOutput(ModelOutput):
"""
    Output type of [`TFXLNetForSequenceClassification`].
    Args:
        loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `label` is provided):
            Classification (or regression if `config.num_labels==1`) loss.
        logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`):
            Classification (or regression if `config.num_labels==1`) scores (before SoftMax).
        mems (`List[tf.Tensor]` of length `config.n_layers`):
            Contains pre-computed hidden-states. Can be used to speed up sequential decoding. The token ids which
            have their past given to this model should not be passed as `input_ids` as they have already been
            computed.
        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size, sequence_length, hidden_size)`.
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
"""
loss: tf.Tensor | None = None
logits: tf.Tensor = None
mems: List[tf.Tensor] | None = None
hidden_states: Tuple[tf.Tensor, ...] | None = None
attentions: Tuple[tf.Tensor, ...] | None = None
@dataclass
class TFXLNetForTokenClassificationOutput(ModelOutput):
"""
    Output type of [`TFXLNetForTokenClassification`].
"""
loss: tf.Tensor | None = None
logits: tf.Tensor = None
mems: List[tf.Tensor] | None = None
hidden_states: Tuple[tf.Tensor, ...] | None = None
attentions: Tuple[tf.Tensor, ...] | None = None
@dataclass
class TFXLNetForMultipleChoiceOutput(ModelOutput):
"""
Output type of [`TFXLNetForMultipleChoice`].
Args:
        loss (`tf.Tensor` of shape *(1,)*, *optional*, returned when `labels` is provided):
            Classification loss.
        logits (`tf.Tensor` of shape `(batch_size, num_choices)`):
            `num_choices` is the second dimension of the input tensors. (See `input_ids` above.)
            Classification scores (before SoftMax).
        mems (`List[tf.Tensor]` of length `config.n_layers`):
            Contains pre-computed hidden-states. Can be used to speed up sequential decoding. The token ids which
            have their past given to this model should not be passed as `input_ids` as they have already been
            computed.
        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size, sequence_length, hidden_size)`.
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
"""
loss: tf.Tensor | None = None
logits: tf.Tensor = None
mems: List[tf.Tensor] | None = None
hidden_states: Tuple[tf.Tensor, ...] | None = None
attentions: Tuple[tf.Tensor, ...] | None = None
@dataclass
class TFXLNetForQuestionAnsweringSimpleOutput(ModelOutput):
"""
Output type of [`TFXLNetForQuestionAnsweringSimple`].
"""
Args:
loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
总的跨度抽取损失,由开始和结束位置的交叉熵之和组成。
start_logits (`tf.Tensor` of shape `(batch_size, sequence_length,)`):
跨度开始位置的分数(SoftMax 之前)。
end_logits (`tf.Tensor` of shape `(batch_size, sequence_length,)`):
跨度结束位置的分数(SoftMax 之前)。
mems (`List[tf.Tensor]` of length `config.n_layers`):
包含预先计算的隐藏状态的列表。可以用于加速序列解码。
给定到该模型的过去标记 id 不应作为 `input_ids` 传递,因为它们已经计算过。
hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
包含模型每一层输出的隐藏状态的元组。
第一个张量是嵌入层的输出,后续的张量是每一层输出的隐藏状态。
形状为 `(batch_size, sequence_length, hidden_size)`。
attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
包含自注意力机制 softmax 后的注意力权重的元组。
用于计算自注意力头部的加权平均值。
形状为 `(batch_size, num_heads, sequence_length, sequence_length)`。
"""
# 初始化变量,表示损失、开始位置分数、结束位置分数、预先计算的隐藏状态、每层的隐藏状态、每层的注意力权重
loss: tf.Tensor | None = None
start_logits: tf.Tensor = None
end_logits: tf.Tensor = None
mems: List[tf.Tensor] | None = None
hidden_states: Tuple[tf.Tensor, ...] | None = None
attentions: Tuple[tf.Tensor, ...] | None = None
"""
    This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)
This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
behavior.
<Tip>
TensorFlow models and layers in `transformers` accept two formats as input:
- having all inputs as keyword arguments (like PyTorch models), or
- having all inputs as a list, tuple or dict in the first positional argument.
The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
positional argument:
- a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
- a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
`model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
- a dictionary with one or several input Tensors associated to the input names given in the docstring:
`model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
Note that when creating models and layers with
[subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
about any of this, as you can just pass inputs like you would to any other Python function!
</Tip>
Parameters:
config ([`XLNetConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
# Constant string holding the input documentation for the XLNet model
XLNET_INPUTS_DOCSTRING = r"""
"""
# The @add_start_docstrings decorator attaches the model description to TFXLNetModel
@add_start_docstrings(
    "The bare XLNet Model transformer outputting raw hidden-states without any specific head on top.",
    XLNET_START_DOCSTRING,  # reuses the shared model docstring defined above
)
# TFXLNetModel inherits from TFXLNetPreTrainedModel
class TFXLNetModel(TFXLNetPreTrainedModel):
    # The constructor accepts a config object plus arbitrary positional and keyword arguments
    def __init__(self, config, *inputs, **kwargs):
        # Initialize the parent class
        super().__init__(config, *inputs, **kwargs)
        # Create the main XLNet layer, named "transformer"
        self.transformer = TFXLNetMainLayer(config, name="transformer")
    # unpack_inputs normalizes the supported input formats; the docstring decorators attach the forward
    # documentation and a code sample (checkpoint, output type, and config class) to the method
    @unpack_inputs
    @add_start_docstrings_to_model_forward(XLNET_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TFXLNetModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def call(
        self,
        input_ids: TFModelInputType | None = None,  # input token IDs
        attention_mask: np.ndarray | tf.Tensor | None = None,  # attention mask
        mems: np.ndarray | tf.Tensor | None = None,  # cached recurrent memories
        perm_mask: np.ndarray | tf.Tensor | None = None,  # permutation mask
        target_mapping: np.ndarray | tf.Tensor | None = None,  # target mapping for partial prediction
        token_type_ids: np.ndarray | tf.Tensor | None = None,  # token type IDs
        input_mask: np.ndarray | tf.Tensor | None = None,  # input mask (inverse of attention_mask)
        head_mask: np.ndarray | tf.Tensor | None = None,  # mask for attention heads
        inputs_embeds: np.ndarray | tf.Tensor | None = None,  # pre-computed input embeddings
        use_mems: Optional[bool] = None,  # whether to use recurrent memories
        output_attentions: Optional[bool] = None,  # whether to return attention weights
        output_hidden_states: Optional[bool] = None,  # whether to return all hidden states
        return_dict: Optional[bool] = None,  # whether to return a dict-style output
        training: bool = False,  # whether the model runs in training mode
    ) -> Union[TFXLNetModelOutput, Tuple[tf.Tensor]]:
        # Delegate the forward pass to the underlying Transformer
outputs = self.transformer(
input_ids=input_ids,
attention_mask=attention_mask,
mems=mems,
perm_mask=perm_mask,
target_mapping=target_mapping,
token_type_ids=token_type_ids,
input_mask=input_mask,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
use_mems=use_mems,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
return outputs
    def build(self, input_shape=None):
        # If the model has already been built, return immediately
        if self.built:
            return
        # Mark the model as built
        self.built = True
        # If the transformer layer exists, build it inside its name scope
        if getattr(self, "transformer", None) is not None:
            with tf.name_scope(self.transformer.name):
                self.transformer.build(None)
@add_start_docstrings(
"""
XLNet Model with a language modeling head on top (linear layer with weights tied to the input embeddings).
""",
XLNET_START_DOCSTRING,
)
class TFXLNetLMHeadModel(TFXLNetPreTrainedModel, TFCausalLanguageModelingLoss):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        # Main XLNet layer, built from the given config
        self.transformer = TFXLNetMainLayer(config, name="transformer")
        # Language modeling head, tied to the word embedding weights
        self.lm_loss = TFXLNetLMHead(config, self.transformer.word_embedding, name="lm_loss")
        # XLA generation is not supported
        self.supports_xla_generation = False
    def get_lm_head(self):
        # Return the language modeling head
        return self.lm_loss
    def get_prefix_bias_name(self):
        # Deprecated; use `get_bias` instead
        warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning)
        # Return the model name joined with the LM head name
        return self.name + "/" + self.lm_loss.name
    def prepare_inputs_for_generation(self, inputs, past_key_values=None, use_mems=None, **kwargs):
        # Add a dummy token at the end of the input sequence (it is not attended to)
        effective_batch_size = inputs.shape[0]
        dummy_token = tf.zeros((effective_batch_size, 1), dtype=inputs.dtype)
        # At every pass, the attention values for the new token and the last two generated tokens are computed;
        # the rest is reloaded from the `past` cache. A purely auto-regressive model would use offset = 1;
        # offset = 2 seems to compute slightly better.
        offset = 2
        if past_key_values:
            # With a cache, keep only the last `offset` tokens and append the dummy token
            input_ids = tf.concat([inputs[:, -offset:], dummy_token], axis=1)
        else:
            # Otherwise append the dummy token to the full input
            input_ids = tf.concat([inputs, dummy_token], axis=1)
        # Build the permutation mask so that previous tokens do not see the last token
        sequence_length = input_ids.shape[1]
        perm_mask = tf.zeros((effective_batch_size, sequence_length, sequence_length - 1))
        perm_mask_seq_end = tf.ones((effective_batch_size, sequence_length, 1))
        perm_mask = tf.concat([perm_mask, perm_mask_seq_end], axis=-1)
        # Only predict the last token
        target_mapping = tf.zeros((effective_batch_size, 1, sequence_length - 1))
        target_mapping_seq_end = tf.ones((effective_batch_size, 1, 1))
        target_mapping = tf.concat([target_mapping, target_mapping_seq_end], axis=-1)
        inputs = {
            "input_ids": input_ids,
            "perm_mask": perm_mask,
            "target_mapping": target_mapping,
            "use_mems": use_mems,
        }
        # If past key values are provided, reuse them as mems for faster decoding
        if past_key_values:
            inputs["mems"] = tuple(layer_past[:-offset, :, :] for layer_past in past_key_values)
        return inputs
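    # Illustrative shape walk-through: for a batch of 4 prompts with a past cache, input_ids
    # becomes the last offset=2 tokens plus one dummy token -> (4, 3); perm_mask is (4, 3, 3)
    # with ones only in its final column, hiding the dummy token from every query; and
    # target_mapping is (4, 1, 3), selecting just that final position, so exactly one new
    # token is predicted per sequence.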
@unpack_inputs
@add_start_docstrings_to_model_forward(XLNET_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=TFXLNetLMHeadModelOutput, config_class=_CONFIG_FOR_DOC)
    # Forward method; every argument is optional and may be a NumPy array or a TensorFlow tensor
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        mems: np.ndarray | tf.Tensor | None = None,
        perm_mask: np.ndarray | tf.Tensor | None = None,
        target_mapping: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        input_mask: np.ndarray | tf.Tensor | None = None,
        head_mask: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        use_mems: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: np.ndarray | tf.Tensor | None = None,
        training: bool = False,
    ) -> Union[TFXLNetLMHeadModelOutput, Tuple[tf.Tensor]]:
        # Forward pass: run the transformer, apply the LM head, and compute the loss when labels are given
        ...  # full body elided in this excerpt
    # Build the model's layers
    def build(self, input_shape=None):
        # If already built, return immediately
        if self.built:
            return
        # Mark as built
        self.built = True
        # If the transformer exists, build it inside its name scope
        if getattr(self, "transformer", None) is not None:
            with tf.name_scope(self.transformer.name):
                self.transformer.build(None)
        # If the LM head exists, build it inside its name scope
        if getattr(self, "lm_loss", None) is not None:
            with tf.name_scope(self.lm_loss.name):
                self.lm_loss.build(None)
# Docstring: XLNet with a sequence classification/regression head on top
@add_start_docstrings(
    """
    XLNet Model with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g.
    for GLUE tasks.
    """,
    XLNET_START_DOCSTRING,
)
class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel, TFSequenceClassificationLoss):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        # Number of labels
        self.num_labels = config.num_labels
        # Main XLNet layer, named "transformer"
        self.transformer = TFXLNetMainLayer(config, name="transformer")
        # Sequence summary layer, initialized with config.initializer_range, named "sequence_summary"
        self.sequence_summary = TFSequenceSummary(
            config, initializer_range=config.initializer_range, name="sequence_summary"
        )
        # Output projection producing the logits, with config.num_labels outputs, named "logits_proj"
        self.logits_proj = keras.layers.Dense(
            config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="logits_proj"
        )
        # Keep the model configuration
        self.config = config
    # Forward pass; the decorators below attach the input docstring and a code sample
    @unpack_inputs
    @add_start_docstrings_to_model_forward(XLNET_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TFXLNetForSequenceClassificationOutput,
        config_class=_CONFIG_FOR_DOC,
    )
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
mems: np.ndarray | tf.Tensor | None = None,
perm_mask: np.ndarray | tf.Tensor | None = None,
target_mapping: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
input_mask: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
use_mems: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: np.ndarray | tf.Tensor | None = None,
training: bool = False,
) -> Union[TFXLNetForSequenceClassificationOutput, Tuple[tf.Tensor]]:
r"""
labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
        # Run the transformer forward pass and collect its outputs
transformer_outputs = self.transformer(
input_ids=input_ids,
attention_mask=attention_mask,
mems=mems,
perm_mask=perm_mask,
target_mapping=target_mapping,
token_type_ids=token_type_ids,
input_mask=input_mask,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
use_mems=use_mems,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
        # The first element of the transformer output is the sequence of hidden states
        output = transformer_outputs[0]
        # Summarize the sequence into a single vector
        output = self.sequence_summary(output)
        # Project the summary into the logits space
        logits = self.logits_proj(output)
        # Compute the loss when labels are provided; otherwise leave it as None
        loss = None if labels is None else self.hf_compute_loss(labels, logits)
        # When return_dict is False, return a plain tuple
        if not return_dict:
            output = (logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output
        # When return_dict is True, return a TFXLNetForSequenceClassificationOutput object
return TFXLNetForSequenceClassificationOutput(
loss=loss,
logits=logits,
mems=transformer_outputs.mems,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
    def build(self, input_shape=None):
        # If the model is already built, return immediately
        if self.built:
            return
        # Mark the model as built
        self.built = True
        # Build the transformer inside its name scope if it exists
        if getattr(self, "transformer", None) is not None:
            with tf.name_scope(self.transformer.name):
                self.transformer.build(None)
        # Build the sequence summary layer inside its name scope if it exists
        if getattr(self, "sequence_summary", None) is not None:
            with tf.name_scope(self.sequence_summary.name):
                self.sequence_summary.build(None)
        # Build the logits projection inside its name scope if it exists
        if getattr(self, "logits_proj", None) is not None:
            with tf.name_scope(self.logits_proj.name):
                self.logits_proj.build([None, None, self.config.d_model])
# Docstring: XLNet with a multiple-choice classification head on top, e.g. for RocStories/SWAG tasks
@add_start_docstrings(
    """
    XLNET Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
    softmax) e.g. for RocStories/SWAG tasks.
    """,
    XLNET_START_DOCSTRING,  # reuses the shared XLNet model docstring
)
class TFXLNetForMultipleChoice(TFXLNetPreTrainedModel, TFMultipleChoiceLoss):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        # Main XLNet layer, named "transformer"
        self.transformer = TFXLNetMainLayer(config, name="transformer")
        # Sequence summary layer used to pool each choice
        self.sequence_summary = TFSequenceSummary(
            config, initializer_range=config.initializer_range, name="sequence_summary"
        )
        # Projection producing one score per choice
        self.logits_proj = keras.layers.Dense(
            1, kernel_initializer=get_initializer(config.initializer_range), name="logits_proj"
        )
        # Keep the model configuration
        self.config = config
    # Forward pass; the decorators attach the input docstring and a code sample
    @unpack_inputs
    @add_start_docstrings_to_model_forward(XLNET_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,  # example checkpoint
        output_type=TFXLNetForMultipleChoiceOutput,  # output type for the example
        config_class=_CONFIG_FOR_DOC,  # config class for the example
    )
def call(
self,
input_ids: TFModelInputType | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
input_mask: np.ndarray | tf.Tensor | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
mems: np.ndarray | tf.Tensor | None = None,
perm_mask: np.ndarray | tf.Tensor | None = None,
target_mapping: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
use_mems: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: np.ndarray | tf.Tensor | None = None,
        training: bool = False,
    ) -> Union[TFXLNetForMultipleChoiceOutput, Tuple[tf.Tensor]]:
        r"""
labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices]`
where `num_choices` is the size of the second dimension of the input tensors. (See `input_ids` above)
"""
        # If input_ids is provided, take its second dimension as num_choices
        if input_ids is not None:
            num_choices = shape_list(input_ids)[1]
            # ... and its third dimension as seq_length
            seq_length = shape_list(input_ids)[2]
        else:
            # Otherwise take num_choices from the second dimension of inputs_embeds
            num_choices = shape_list(inputs_embeds)[1]
            # ... and seq_length from its third dimension
            seq_length = shape_list(inputs_embeds)[2]
        # Flatten each input tensor to two dimensions, when it is not None
        flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) if input_ids is not None else None
        flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None
        flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None
        flat_input_mask = tf.reshape(input_mask, (-1, seq_length)) if input_mask is not None else None
        flat_inputs_embeds = (
            tf.reshape(inputs_embeds, (-1, seq_length, shape_list(inputs_embeds)[3]))
            if inputs_embeds is not None
            else None
        )
        # Call the transformer with the flattened inputs and the remaining arguments
transformer_outputs = self.transformer(
flat_input_ids,
flat_attention_mask,
mems,
perm_mask,
target_mapping,
flat_token_type_ids,
flat_input_mask,
head_mask,
flat_inputs_embeds,
use_mems,
output_attentions,
output_hidden_states,
return_dict=return_dict,
training=training,
)
        # The first element of the transformer output is the sequence of hidden states
        output = transformer_outputs[0]
        # Summarize each flattened sequence
        logits = self.sequence_summary(output)
        # Project the summaries to one score per choice
        logits = self.logits_proj(logits)
        # Reshape the scores back to (batch_size, num_choices)
        reshaped_logits = tf.reshape(logits, (-1, num_choices))
        # Compute the loss when labels are provided; otherwise leave it as None
        loss = None if labels is None else self.hf_compute_loss(labels, reshaped_logits)
        # When return_dict is False, return the reshaped logits plus any extra outputs as a tuple
        if not return_dict:
            output = (reshaped_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output
        # When return_dict is True, return a TFXLNetForMultipleChoiceOutput object
return TFXLNetForMultipleChoiceOutput(
loss=loss,
logits=reshaped_logits,
mems=transformer_outputs.mems,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
    # Build the model; return immediately if it was already built
    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        # Build the transformer inside its name scope if it exists
        if getattr(self, "transformer", None) is not None:
            with tf.name_scope(self.transformer.name):
                self.transformer.build(None)
        # Build the sequence summary inside its name scope if it exists
        if getattr(self, "sequence_summary", None) is not None:
            with tf.name_scope(self.sequence_summary.name):
                self.sequence_summary.build(None)
        # Build the logits projection inside its name scope if it exists
        if getattr(self, "logits_proj", None) is not None:
            with tf.name_scope(self.logits_proj.name):
                # The projection takes inputs of shape [None, None, self.config.d_model]
                self.logits_proj.build([None, None, self.config.d_model])
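The multiple-choice head works by folding the choices into the batch axis before the transformer and unfolding the per-choice scores afterwards. A standalone sketch of that reshape round-trip (illustrative shapes only):

```python
import tensorflow as tf

batch_size, num_choices, seq_length = 2, 4, 16
input_ids = tf.zeros((batch_size, num_choices, seq_length), dtype=tf.int32)

# Fold the choices into the batch axis before running the transformer ...
flat_input_ids = tf.reshape(input_ids, (-1, seq_length))  # (8, 16)

# ... then unfold the single score produced for each flattened row
logits = tf.zeros((batch_size * num_choices, 1))           # output of logits_proj
reshaped_logits = tf.reshape(logits, (-1, num_choices))    # (2, 4)
```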
"""
XLNet Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
Named-Entity-Recognition (NER) tasks.
"""
# 继承自TFXLNetPreTrainedModel和TFTokenClassificationLoss的XLNet模型,用于标记分类任务(如命名实体识别NER)。
class TFXLNetForTokenClassification(TFXLNetPreTrainedModel, TFTokenClassificationLoss):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        # Number of classification labels
        self.num_labels = config.num_labels
        # Main XLNet layer, named "transformer"
        self.transformer = TFXLNetMainLayer(config, name="transformer")
        # Classifier: a dense layer with config.num_labels outputs, initialized from config.initializer_range
        self.classifier = keras.layers.Dense(
            config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
        )
        # Keep the configuration
        self.config = config
    # Forward pass; the decorators unpack the inputs and attach the docstrings
@unpack_inputs
@add_start_docstrings_to_model_forward(XLNET_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFXLNetForTokenClassificationOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
mems: np.ndarray | tf.Tensor | None = None,
perm_mask: np.ndarray | tf.Tensor | None = None,
target_mapping: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
input_mask: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
use_mems: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: np.ndarray | tf.Tensor | None = None,
training: bool = False,
"""
进行模型的前向传播,接受多种输入参数,包括input_ids、attention_mask等,以及用于训练的labels和是否处于训练模式的training标志位。
"""
) -> Union`
) -> Union[TFXLNetForTokenClassificationOutput, Tuple[tf.Tensor]]:
r"""
labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
        # Run the transformer on the inputs and collect its outputs
transformer_outputs = self.transformer(
input_ids=input_ids,
attention_mask=attention_mask,
mems=mems,
perm_mask=perm_mask,
target_mapping=target_mapping,
token_type_ids=token_type_ids,
input_mask=input_mask,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
use_mems=use_mems,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
        # The first element of the transformer output is the sequence of hidden states
        output = transformer_outputs[0]
        # Run the classifier on the hidden states to obtain per-token logits
        logits = self.classifier(output)
        # Compute the loss when labels are provided; otherwise leave it as None
        loss = None if labels is None else self.hf_compute_loss(labels, logits)
        # When return_dict is False, return a tuple of logits plus the extra outputs
        if not return_dict:
            output = (logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output
        # When return_dict is True, build and return a TFXLNetForTokenClassificationOutput object
return TFXLNetForTokenClassificationOutput(
loss=loss,
logits=logits,
mems=transformer_outputs.mems,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
    def build(self, input_shape=None):
        # If already built, return immediately
        if self.built:
            return
        # Mark as built
        self.built = True
        # Build the transformer if it exists
        if getattr(self, "transformer", None) is not None:
            with tf.name_scope(self.transformer.name):
                self.transformer.build(None)
        # Build the classifier if it exists
        if getattr(self, "classifier", None) is not None:
            with tf.name_scope(self.classifier.name):
                self.classifier.build([None, None, self.config.hidden_size])
# Docstring: XLNet with a span-classification head for extractive question answering (e.g. SQuAD),
# computing `span start logits` and `span end logits` with a linear layer over the hidden states
@add_start_docstrings(
"""
XLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
XLNET_START_DOCSTRING,
)
class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel, TFQuestionAnsweringLoss):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        # Main XLNet layer, named "transformer"
        self.transformer = TFXLNetMainLayer(config, name="transformer")
        # Dense layer for the QA outputs, with config.num_labels outputs
        self.qa_outputs = keras.layers.Dense(
            config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
        )
        # Keep the configuration
        self.config = config
    # Forward pass; the decorators attach the input docstring and a code sample
    @unpack_inputs
    @add_start_docstrings_to_model_forward(XLNET_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TFXLNetForQuestionAnsweringSimpleOutput,
        config_class=_CONFIG_FOR_DOC,
    )
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
mems: np.ndarray | tf.Tensor | None = None,
perm_mask: np.ndarray | tf.Tensor | None = None,
target_mapping: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
input_mask: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
use_mems: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
start_positions: np.ndarray | tf.Tensor | None = None,
end_positions: np.ndarray | tf.Tensor | None = None,
training: bool = False,
) -> Union[TFXLNetForQuestionAnsweringSimpleOutput, Tuple[tf.Tensor]]:
r"""
start_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
end_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
"""
        # Run the transformer forward pass and collect its outputs
transformer_outputs = self.transformer(
input_ids=input_ids,
attention_mask=attention_mask,
mems=mems,
perm_mask=perm_mask,
target_mapping=target_mapping,
token_type_ids=token_type_ids,
input_mask=input_mask,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
use_mems=use_mems,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
        # Extract the sequence of hidden states from the transformer output
        sequence_output = transformer_outputs[0]
        # Feed the sequence output to the QA head to obtain start- and end-position logits
        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = tf.split(logits, 2, axis=-1)
        start_logits = tf.squeeze(start_logits, axis=-1)
        end_logits = tf.squeeze(end_logits, axis=-1)
        # Compute the loss when start and end position labels are provided
        loss = None
        if start_positions is not None and end_positions is not None:
            labels = {"start_position": start_positions}
            labels["end_position"] = end_positions
            loss = self.hf_compute_loss(labels, (start_logits, end_logits))
        # When return_dict is False, assemble the output tuple
        if not return_dict:
            output = (start_logits, end_logits) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output
        # When return_dict is True, build a TFXLNetForQuestionAnsweringSimpleOutput object
return TFXLNetForQuestionAnsweringSimpleOutput(
loss=loss,
start_logits=start_logits,
end_logits=end_logits,
mems=transformer_outputs.mems,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        # Build the transformer if it has been defined
        if getattr(self, "transformer", None) is not None:
            with tf.name_scope(self.transformer.name):
                self.transformer.build(None)
        # Build the QA output layer if it has been defined
        if getattr(self, "qa_outputs", None) is not None:
            with tf.name_scope(self.qa_outputs.name):
                self.qa_outputs.build([None, None, self.config.hidden_size])
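`qa_outputs` emits two numbers per token (`config.num_labels == 2` for SQuAD-style heads); the split-and-squeeze step pulls them apart into per-token start and end scores. A standalone sketch with illustrative tensors:

```python
import tensorflow as tf

batch_size, seq_length = 2, 8
logits = tf.random.normal((batch_size, seq_length, 2))  # shape produced by qa_outputs

start_logits, end_logits = tf.split(logits, 2, axis=-1)  # two (2, 8, 1) tensors
start_logits = tf.squeeze(start_logits, axis=-1)         # (2, 8): span-start score per token
end_logits = tf.squeeze(end_logits, axis=-1)             # (2, 8): span-end score per token
```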
.\models\xlnet\modeling_xlnet.py
import torch
from torch import nn
from ...utils import logging
logger = logging.get_logger(__name__)
def build_tf_xlnet_to_pytorch_map(model, config, tf_weights=None):
    """Build a map from TF variable names to the corresponding PyTorch parameters."""
tf_to_pt_map = {}
if hasattr(model, "transformer"):
if hasattr(model, "lm_loss"):
tf_to_pt_map["model/lm_loss/bias"] = model.lm_loss.bias
if hasattr(model, "sequence_summary") and "model/sequnece_summary/summary/kernel" in tf_weights:
tf_to_pt_map["model/sequnece_summary/summary/kernel"] = model.sequence_summary.summary.weight
tf_to_pt_map["model/sequnece_summary/summary/bias"] = model.sequence_summary.summary.bias
if (
hasattr(model, "logits_proj")
and config.finetuning_task is not None
and f"model/regression_{config.finetuning_task}/logit/kernel" in tf_weights
):
tf_to_pt_map[f"model/regression_{config.finetuning_task}/logit/kernel"] = model.logits_proj.weight
tf_to_pt_map[f"model/regression_{config.finetuning_task}/logit/bias"] = model.logits_proj.bias
model = model.transformer
tf_to_pt_map.update(
{
"model/transformer/word_embedding/lookup_table": model.word_embedding.weight,
"model/transformer/mask_emb/mask_emb": model.mask_emb,
}
)
for i, b in enumerate(model.layer):
layer_str = f"model/transformer/layer_{i}/"
tf_to_pt_map.update(
{
layer_str + "rel_attn/LayerNorm/gamma": b.rel_attn.layer_norm.weight,
layer_str + "rel_attn/LayerNorm/beta": b.rel_attn.layer_norm.bias,
layer_str + "rel_attn/o/kernel": b.rel_attn.o,
layer_str + "rel_attn/q/kernel": b.rel_attn.q,
layer_str + "rel_attn/k/kernel": b.rel_attn.k,
layer_str + "rel_attn/r/kernel": b.rel_attn.r,
layer_str + "rel_attn/v/kernel": b.rel_attn.v,
layer_str + "ff/LayerNorm/gamma": b.ff.layer_norm.weight,
layer_str + "ff/LayerNorm/beta": b.ff.layer_norm.bias,
layer_str + "ff/layer_1/kernel": b.ff.layer_1.weight,
layer_str + "ff/layer_1/bias": b.ff.layer_1.bias,
layer_str + "ff/layer_2/kernel": b.ff.layer_2.weight,
layer_str + "ff/layer_2/bias": b.ff.layer_2.bias,
}
)
if config.untie_r:
r_r_list = []
r_w_list = []
r_s_list = []
seg_embed_list = []
for b in model.layer:
r_r_list.append(b.rel_attn.r_r_bias)
r_w_list.append(b.rel_attn.r_w_bias)
r_s_list.append(b.rel_attn.r_s_bias)
seg_embed_list.append(b.rel_attn.seg_embed)
else:
r_r_list = [model.r_r_bias]
r_w_list = [model.r_w_bias]
r_s_list = [model.r_s_bias]
seg_embed_list = [model.seg_embed]
tf_to_pt_map.update(
{
"model/transformer/r_r_bias": r_r_list,
"model/transformer/r_w_bias": r_w_list,
"model/transformer/r_s_bias": r_s_list,
"model/transformer/seg_embed": seg_embed_list,
}
)
return tf_to_pt_map
def load_tf_weights_in_xlnet(model, config, tf_path):
"""Load tf checkpoints in a pytorch model"""
try:
import numpy as np
import tensorflow as tf
except ImportError:
logger.error(
"Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see "
"https://www.tensorflow.org/install/ for installation instructions."
)
raise
init_vars = tf.train.list_variables(tf_path)
tf_weights = {}
for name, shape in init_vars:
logger.info(f"Loading TF weight {name} with shape {shape}")
array = tf.train.load_variable(tf_path, name)
tf_weights[name] = array
tf_to_pt_map = build_tf_xlnet_to_pytorch_map(model, config, tf_weights)
for name, pointer in tf_to_pt_map.items():
logger.info(f"Importing {name}")
if name not in tf_weights:
logger.info(f"{name} not in tf pre-trained weights, skipping")
continue
array = tf_weights[name]
if "kernel" in name and ("ff" in name or "summary" in name or "logit" in name):
logger.info("Transposing")
array = np.transpose(array)
if isinstance(pointer, list):
assert (
len(pointer) == array.shape[0]
), f"Pointer length {len(pointer)} and array length {array.shape[0]} mismatched"
for i, p_i in enumerate(pointer):
arr_i = array[i, ...]
try:
assert (
p_i.shape == arr_i.shape
), f"Pointer shape {p_i.shape} and array shape {arr_i.shape} mismatched"
except AssertionError as e:
e.args += (p_i.shape, arr_i.shape)
raise
logger.info(f"Initialize PyTorch weight {name} for layer {i}")
p_i.data = torch.from_numpy(arr_i)
else:
try:
assert (
pointer.shape == array.shape
), f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched"
except AssertionError as e:
e.args += (pointer.shape, array.shape)
raise
logger.info(f"Initialize PyTorch weight {name}")
pointer.data = torch.from_numpy(array)
tf_weights.pop(name, None)
tf_weights.pop(name + "/Adam", None)
tf_weights.pop(name + "/Adam_1", None)
logger.info(f"Weights not copied to PyTorch model: {', '.join(tf_weights.keys())}")
return model
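A minimal sketch of how this conversion helper might be invoked (the checkpoint path is a placeholder, and the config must mirror the checkpoint's hyperparameters):

```
from transformers import XLNetConfig, XLNetLMHeadModel

config = XLNetConfig()  # must match the TF checkpoint's architecture
model = XLNetLMHeadModel(config)
# Populates the PyTorch parameters from the TF variables listed in the checkpoint.
model = load_tf_weights_in_xlnet(model, config, "/path/to/xlnet_model.ckpt")
```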
class XLNetRelativeAttention(nn.Module):
def __init__(self, config):
super().__init__()
if config.d_model % config.n_head != 0:
raise ValueError(
f"The hidden size ({config.d_model}) is not a multiple of the number of attention "
f"heads ({config.n_head})"
)
self.n_head = config.n_head
self.d_head = config.d_head
self.d_model = config.d_model
self.scale = 1 / (config.d_head**0.5)
self.q = nn.Parameter(torch.FloatTensor(config.d_model, self.n_head, self.d_head))
self.k = nn.Parameter(torch.FloatTensor(config.d_model, self.n_head, self.d_head))
self.v = nn.Parameter(torch.FloatTensor(config.d_model, self.n_head, self.d_head))
self.o = nn.Parameter(torch.FloatTensor(config.d_model, self.n_head, self.d_head))
self.r = nn.Parameter(torch.FloatTensor(config.d_model, self.n_head, self.d_head))
self.r_r_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head))
self.r_s_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head))
self.r_w_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head))
self.seg_embed = nn.Parameter(torch.FloatTensor(2, self.n_head, self.d_head))
self.layer_norm = nn.LayerNorm(config.d_model, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.dropout)
def prune_heads(self, heads):
raise NotImplementedError
@staticmethod
def rel_shift(x, klen=-1):
"""perform relative shift to form the relative attention score."""
x_size = x.shape
x = x.reshape(x_size[1], x_size[0], x_size[2], x_size[3])
x = x[1:, ...]
x = x.reshape(x_size[0], x_size[1] - 1, x_size[2], x_size[3])
x = torch.index_select(x, 1, torch.arange(klen, device=x.device, dtype=torch.long))
return x
@staticmethod
def rel_shift_bnij(x, klen=-1):
x_size = x.shape
x = x.reshape(x_size[0], x_size[1], x_size[3], x_size[2])
x = x[:, :, 1:, :]
x = x.reshape(x_size[0], x_size[1], x_size[2], x_size[3] - 1)
x = torch.index_select(x, 3, torch.arange(klen, device=x.device, dtype=torch.long))
return x
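The reshape/slice sequence in `rel_shift_bnij` realigns the last axis from relative offsets to absolute key positions. A small shape-level sketch (illustrative sizes only):

```
import torch

x = torch.arange(12.0).reshape(1, 1, 3, 4)  # (batch, heads, qlen, relative positions)
shifted = XLNetRelativeAttention.rel_shift_bnij(x, klen=3)
print(shifted.shape)  # torch.Size([1, 1, 3, 3]) -- one score per (query, key) pair
```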
def rel_attn_core(
self,
q_head,
k_head_h,
v_head_h,
k_head_r,
seg_mat=None,
attn_mask=None,
head_mask=None,
output_attentions=False,
):
"""Core relative positional attention operations."""
ac = torch.einsum("ibnd,jbnd->bnij", q_head + self.r_w_bias, k_head_h)
bd = torch.einsum("ibnd,jbnd->bnij", q_head + self.r_r_bias, k_head_r)
bd = self.rel_shift_bnij(bd, klen=ac.shape[3])
if seg_mat is None:
ef = 0
else:
ef = torch.einsum("ibnd,snd->ibns", q_head + self.r_s_bias, self.seg_embed)
ef = torch.einsum("ijbs,ibns->bnij", seg_mat, ef)
attn_score = (ac + bd + ef) * self.scale
if attn_mask is not None:
if attn_mask.dtype == torch.float16:
attn_score = attn_score - 65500 * torch.einsum("ijbn->bnij", attn_mask)
else:
attn_score = attn_score - 1e30 * torch.einsum("ijbn->bnij", attn_mask)
attn_prob = nn.functional.softmax(attn_score, dim=3)
attn_prob = self.dropout(attn_prob)
if head_mask is not None:
attn_prob = attn_prob * torch.einsum("ijbn->bnij", head_mask)
attn_vec = torch.einsum("bnij,jbnd->ibnd", attn_prob, v_head_h)
if output_attentions:
return attn_vec, torch.einsum("bnij->ijbn", attn_prob)
return attn_vec
def post_attention(self, h, attn_vec, residual=True):
"""Post-attention processing."""
attn_out = torch.einsum("ibnd,hnd->ibh", attn_vec, self.o)
attn_out = self.dropout(attn_out)
if residual:
attn_out = attn_out + h
output = self.layer_norm(attn_out)
return output
def forward(
self,
h,
g,
attn_mask_h,
attn_mask_g,
r,
seg_mat,
mems=None,
target_mapping=None,
head_mask=None,
output_attentions=False,
):
# (attention body omitted in this walkthrough)
class XLNetFeedForward(nn.Module):
def __init__(self, config):
super().__init__()
self.layer_norm = nn.LayerNorm(config.d_model, eps=config.layer_norm_eps)
self.layer_1 = nn.Linear(config.d_model, config.d_inner)
self.layer_2 = nn.Linear(config.d_inner, config.d_model)
self.dropout = nn.Dropout(config.dropout)
if isinstance(config.ff_activation, str):
self.activation_function = ACT2FN[config.ff_activation]
else:
self.activation_function = config.ff_activation
def forward(self, inp):
output = inp
output = self.layer_1(output)
output = self.activation_function(output)
output = self.dropout(output)
output = self.layer_2(output)
output = self.dropout(output)
output = self.layer_norm(output + inp)
return output
class XLNetLayer(nn.Module):
def __init__(self, config):
super().__init__()
self.rel_attn = XLNetRelativeAttention(config)
self.ff = XLNetFeedForward(config)
self.dropout = nn.Dropout(config.dropout)
self.chunk_size_feed_forward = config.chunk_size_feed_forward
self.seq_len_dim = 1
def forward(
self,
output_h,
output_g,
attn_mask_h,
attn_mask_g,
r,
seg_mat,
mems=None,
target_mapping=None,
head_mask=None,
output_attentions=False,
):
outputs = self.rel_attn(
output_h,
output_g,
attn_mask_h,
attn_mask_g,
r,
seg_mat,
mems=mems,
target_mapping=target_mapping,
head_mask=head_mask,
output_attentions=output_attentions,
)
output_h, output_g = outputs[:2]
if output_g is not None:
output_g = apply_chunking_to_forward(
self.ff_chunk, self.chunk_size_feed_forward, self.seq_len_dim, output_g
)
output_h = apply_chunking_to_forward(self.ff_chunk, self.chunk_size_feed_forward, self.seq_len_dim, output_h)
outputs = (output_h, output_g) + outputs[2:]
return outputs
def ff_chunk(self, output_x):
output_x = self.ff(output_x)
return output_x
class XLNetPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weight initialization and a simple interface for downloading and loading pretrained models.
"""
config_class = XLNetConfig
load_tf_weights = load_tf_weights_in_xlnet
base_model_prefix = "transformer"
def _init_weights(self, module):
"""Initialize the weights."""
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
elif isinstance(module, XLNetRelativeAttention):
for param in [
module.q,
module.k,
module.v,
module.o,
module.r,
module.r_r_bias,
module.r_s_bias,
module.r_w_bias,
module.seg_embed,
]:
param.data.normal_(mean=0.0, std=self.config.initializer_range)
elif isinstance(module, XLNetModel):
module.mask_emb.data.normal_(mean=0.0, std=self.config.initializer_range)
@dataclass
class XLNetModelOutput(ModelOutput):
"""
Output type of [`XLNetModel`].
Args:
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_predict, hidden_size)`):
Sequence of hidden-states at the last layer of the model.
`num_predict` corresponds to `target_mapping.shape[1]`. If `target_mapping` is `None`, then `num_predict`
corresponds to `sequence_length`.
mems (`List[torch.FloatTensor]` of length `config.n_layers`):
Contains pre-computed hidden-states. Can be used (see `mems` input) to speed up sequential decoding. The
token ids which have their past given to this model should not be passed as `input_ids` as they have
already been computed.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
last_hidden_state: torch.FloatTensor
mems: Optional[List[torch.FloatTensor]] = None
hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
@dataclass
class XLNetLMHeadModelOutput(ModelOutput):
"""
Output type of [`XLNetLMHeadModel`].
This class serves as the output specification for the XLNet language-modeling head.
Args:
loss (`torch.FloatTensor` of shape *(1,)*, *optional*, returned when `labels` is provided):
Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, num_predict, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
`num_predict` corresponds to `target_mapping.shape[1]`. If `target_mapping` is `None`, then `num_predict`
corresponds to `sequence_length`.
mems (`List[torch.FloatTensor]` of length `config.n_layers`):
Contains pre-computed hidden-states. Can be used (see `mems` input) to speed up sequential decoding. The
token ids which have their past given to this model should not be passed as `input_ids` as they have
already been computed.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
# Optional attributes representing outputs from the language model
loss: Optional[torch.FloatTensor] = None
logits: torch.FloatTensor = None
mems: Optional[List[torch.FloatTensor]] = None
hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
# Dataclass holding the outputs of XLNet for sequence classification tasks
@dataclass
class XLNetForSequenceClassificationOutput(ModelOutput):
"""
Output type of [`XLNetForSequenceClassification`].
Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Classification (or regression if `config.num_labels==1`) loss.
logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
Classification (or regression if `config.num_labels==1`) scores (before SoftMax).
mems (`List[torch.FloatTensor]` of length `config.n_layers`):
Contains pre-computed hidden-states that can be used to speed up sequential decoding. The token ids which
have their past given to this model should not be passed as `input_ids` again, as they have already been computed.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
"""
# Loss, returned when `labels` is provided
loss: Optional[torch.FloatTensor] = None
# Classification (or regression) scores of shape `(batch_size, config.num_labels)`
logits: torch.FloatTensor = None
# List of pre-computed hidden states, of length `config.n_layers`
mems: Optional[List[torch.FloatTensor]] = None
# Tuple of per-layer hidden states, including the initial embedding output
hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
# Tuple of per-layer attention weights
attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
# Dataclass holding the outputs of XLNet for token classification tasks
@dataclass
class XLNetForTokenClassificationOutput(ModelOutput):
"""
Output type of [`XLNetForTokenClassification`].
Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Classification loss.
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`):
Classification scores (before SoftMax).
mems (`List[torch.FloatTensor]` of length `config.n_layers`):
Contains pre-computed hidden-states that can be used to speed up sequential decoding. The token ids which
have their past given to this model should not be passed as `input_ids` again, as they have already been computed.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of the model's hidden states at the output of each layer plus the initial embedding output, each of
shape `(batch_size, sequence_length, hidden_size)`.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Attention weights after the softmax, of shape `(batch_size, num_heads, sequence_length, sequence_length)`,
used to compute the weighted average in the self-attention heads.
"""
loss: Optional[torch.FloatTensor] = None
logits: torch.FloatTensor = None
mems: Optional[List[torch.FloatTensor]] = None
hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
@dataclass
class XLNetForMultipleChoiceOutput(ModelOutput):
"""
Output type of [`XLNetForMultipleChoice`].
Args:
loss (`torch.FloatTensor` of shape *(1,)*, *optional*, returned when `labels` is provided):
Classification loss. Represents the loss value associated with the classification task.
logits (`torch.FloatTensor` of shape `(batch_size, num_choices)`):
Classification scores (before SoftMax). These scores represent the model's raw output for each choice in a multiple-choice scenario.
mems (`List[torch.FloatTensor]` of length `config.n_layers`):
Contains pre-computed hidden-states. Can be used to speed up sequential decoding by providing already computed hidden states from previous decoding steps.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple containing hidden-states of the model at the output of each layer plus the initial embedding outputs. This helps in accessing intermediate hidden states if needed.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple containing attention weights after the attention softmax. These weights are used to compute the weighted average in the self-attention heads.
Each element in the tuple corresponds to attention weights from different layers of the model.
"""
loss: Optional[torch.FloatTensor] = None
logits: torch.FloatTensor = None
mems: Optional[List[torch.FloatTensor]] = None
hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
@dataclass
class XLNetForQuestionAnsweringSimpleOutput(ModelOutput):
"""
Output type of [`XLNetForQuestionAnsweringSimple`].
This class represents the output structure for extractive question answering with XLNet, inheriting from
`ModelOutput` to provide a standardized way to represent model outputs in this context.
Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Total span-extraction loss: the sum of the cross-entropy losses for the start and end positions.
start_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length,)`):
Span-start scores (before SoftMax).
end_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length,)`):
Span-end scores (before SoftMax).
mems (`List[torch.FloatTensor]` of length `config.n_layers`):
Contains pre-computed hidden-states that can be used to speed up sequential decoding. The token ids which
have their past given to this model should not be passed as `input_ids` again, as they have already been computed.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of the model's hidden states at the output of each layer plus the initial embedding output, each of
shape `(batch_size, sequence_length, hidden_size)`.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of per-layer attention weights of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
"""
# Field types and default values
loss: Optional[torch.FloatTensor] = None
start_logits: torch.FloatTensor = None
end_logits: torch.FloatTensor = None
mems: Optional[List[torch.FloatTensor]] = None
hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
# Dataclass storing the outputs of the XLNet question-answering model, inheriting from ModelOutput
@dataclass
class XLNetForQuestionAnsweringOutput(ModelOutput):
"""
Output type of [`XLNetForQuestionAnswering`].
Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned if both `start_positions` and `end_positions` are provided):
Classification loss: the sum of the start-token and end-token classification losses (and the
`is_impossible` loss if `is_impossible` is provided).
start_top_log_probs (`torch.FloatTensor` of shape `(batch_size, config.start_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
Log probabilities for the top `config.start_n_top` start-token possibilities (beam search).
start_top_index (`torch.LongTensor` of shape `(batch_size, config.start_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
Indices for the top `config.start_n_top` start-token possibilities (beam search).
end_top_log_probs (`torch.FloatTensor` of shape `(batch_size, config.start_n_top * config.end_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
Log probabilities for the top `config.start_n_top * config.end_n_top` end-token possibilities (beam search).
end_top_index (`torch.LongTensor` of shape `(batch_size, config.start_n_top * config.end_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
Indices for the top `config.start_n_top * config.end_n_top` end-token possibilities (beam search).
cls_logits (`torch.FloatTensor` of shape `(batch_size,)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
Log probabilities for the `is_impossible` label of the answers.
mems (`List[torch.FloatTensor]` of length `config.n_layers`):
Contains pre-computed hidden-states that can be used to speed up sequential decoding. The token ids which
have their past given to this model should not be passed as `input_ids` again, as they have already been computed.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of per-layer attention weights (after the softmax) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`, used to compute the weighted average in the self-attention heads.
"""
# Optional loss tensor
loss: Optional[torch.FloatTensor] = None
# Optional log probabilities of the top start positions
start_top_log_probs: Optional[torch.FloatTensor] = None
# Optional indices of the top start positions
start_top_index: Optional[torch.LongTensor] = None
# Optional log probabilities of the top end positions
end_top_log_probs: Optional[torch.FloatTensor] = None
# Optional indices of the top end positions
end_top_index: Optional[torch.LongTensor] = None
# Optional classifier logits
cls_logits: Optional[torch.FloatTensor] = None
# Optional list of memory tensors
mems: Optional[List[torch.FloatTensor]] = None
# Optional tuple of hidden-state tensors
hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
# Optional tuple of attention tensors
attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
# Docstring for the XLNet model, describing its inheritance and common usage
XLNET_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`XLNetConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
# Docstring for the XLNet model inputs (currently empty)
XLNET_INPUTS_DOCSTRING = r"""
"""
# Decorator adding the docstring that introduces the XLNetModel class
@add_start_docstrings(
"The bare XLNet Model transformer outputting raw hidden-states without any specific head on top.",
XLNET_START_DOCSTRING,
)
# XLNetModel class, inheriting from XLNetPreTrainedModel
class XLNetModel(XLNetPreTrainedModel):
def __init__(self, config):
super().__init__(config)
# Model hyperparameters
self.mem_len = config.mem_len  # memory length
self.reuse_len = config.reuse_len  # reuse length
self.d_model = config.d_model  # model dimension
self.same_length = config.same_length  # whether to use the same attention length
self.attn_type = config.attn_type  # attention type
self.bi_data = config.bi_data  # whether to use bidirectional data
self.clamp_len = config.clamp_len  # clamping length
self.n_layer = config.n_layer  # number of layers
# Word embedding, mask embedding, layer stack, and dropout
self.word_embedding = nn.Embedding(config.vocab_size, config.d_model)  # word embedding layer
self.mask_emb = nn.Parameter(torch.FloatTensor(1, 1, config.d_model))  # mask embedding
self.layer = nn.ModuleList([XLNetLayer(config) for _ in range(config.n_layer)])  # stack of XLNetLayer modules
self.dropout = nn.Dropout(config.dropout)  # dropout layer
# Initialize weights and apply final processing
self.post_init()
# Get the input word embedding layer
def get_input_embeddings(self):
return self.word_embedding
# Set the input word embedding layer
def set_input_embeddings(self, new_embeddings):
self.word_embedding = new_embeddings
# Head pruning; not implemented yet
def _prune_heads(self, heads_to_prune):
raise NotImplementedError
def create_mask(self, qlen, mlen):
"""
Creates causal attention mask. Float mask where 1.0 indicates masked, 0.0 indicates not-masked.
Args:
qlen: Sequence length
mlen: Mask length
::

          same_length=False:          same_length=True:
          <mlen > <  qlen >           <mlen > <  qlen >
       ^ [0 0 0 0 0 1 1 1 1]         [0 0 0 0 0 1 1 1 1]
         [0 0 0 0 0 0 1 1 1]         [1 0 0 0 0 0 1 1 1]
    qlen [0 0 0 0 0 0 0 1 1]         [1 1 0 0 0 0 0 1 1]
         [0 0 0 0 0 0 0 0 1]         [1 1 1 0 0 0 0 0 1]
       v [0 0 0 0 0 0 0 0 0]         [1 1 1 1 0 0 0 0 0]
"""
# Start from an all-ones tensor of shape (qlen, qlen + mlen)
mask = torch.ones((qlen, qlen + mlen), device=self.device)
# When same_length is True the mask needs special handling
if self.same_length:
# Keep the strictly lower triangle of the qlen block and add it back after
# zeroing everything at or below diagonal offset mlen
mask_lo = mask[:, :qlen].tril(-1)
mask.triu_(mlen + 1)
mask[:, :qlen] += mask_lo
else:
# Otherwise keep only the entries strictly above diagonal offset mlen (future positions stay masked)
mask.triu_(mlen + 1)
return mask
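A quick sanity check of the mask layout (a sketch that assumes `model` is an initialized `XLNetModel` with `same_length=False`):

```
mask = model.create_mask(qlen=4, mlen=2)
# tensor([[0., 0., 0., 1., 1., 1.],
#         [0., 0., 0., 0., 1., 1.],
#         [0., 0., 0., 0., 0., 1.],
#         [0., 0., 0., 0., 0., 0.]])
# Row i (query) may attend to the 2 memory slots and to positions <= i; 1.0 marks masked entries.
```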
def cache_mem(self, curr_out, prev_mem):
# Cache the current output as the new memory state.
# If reuse_len is defined and positive, keep only the first reuse_len steps of the current output
if self.reuse_len is not None and self.reuse_len > 0:
curr_out = curr_out[: self.reuse_len]
# Choose the cutoff depending on whether mem_len is defined
if self.mem_len is None or self.mem_len == 0:
# If use_mems is active but mem_len is not defined, the model behaves like GPT-2 at inference
# and returns all of the past and current hidden states.
cutoff = 0
else:
# If use_mems is active and mem_len is defined, the model returns the last mem_len hidden states;
# this is the preferred setting for training and for generating long texts.
cutoff = -self.mem_len
# Without a previous memory, the truncated current output becomes the new memory
if prev_mem is None:
new_mem = curr_out[cutoff:]
else:
# Otherwise concatenate the previous memory with the current output and truncate at the cutoff
new_mem = torch.cat([prev_mem, curr_out], dim=0)[cutoff:]
return new_mem.detach()
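A shape-level sketch of the memory update (assumes `model` is an `XLNetModel` configured with `mem_len=3` and `reuse_len=None`; the sizes are illustrative):

```
import torch

prev_mem = torch.zeros(3, 1, 8)  # (mem_len, batch, d_model)
curr_out = torch.ones(2, 1, 8)   # (seq_len, batch, d_model)
new_mem = model.cache_mem(curr_out, prev_mem)
print(new_mem.shape)  # torch.Size([3, 1, 8]) -- the last mem_len steps of the concatenation
```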
@staticmethod
def positional_embedding(pos_seq, inv_freq, bsz=None):
# Build the positional encoding matrix from the position sequence and the inverse frequencies
# Compute the inputs to the sine and cosine functions
sinusoid_inp = torch.einsum("i,d->id", pos_seq, inv_freq)
# Concatenate the sine and cosine results to form the positional encoding
pos_emb = torch.cat([torch.sin(sinusoid_inp), torch.cos(sinusoid_inp)], dim=-1)
pos_emb = pos_emb[:, None, :]
# If a batch size is given, expand the positional encoding along the batch axis
if bsz is not None:
pos_emb = pos_emb.expand(-1, bsz, -1)
return pos_emb
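Since `positional_embedding` is a static method, its shape behavior can be checked in isolation (illustrative sizes only):

```
import torch

pos_seq = torch.arange(5, -1, -1.0)                             # 6 relative positions
inv_freq = 1 / torch.pow(10000, torch.arange(0.0, 8, 2.0) / 8)  # d_model = 8
pos_emb = XLNetModel.positional_embedding(pos_seq, inv_freq, bsz=2)
print(pos_emb.shape)  # torch.Size([6, 2, 8]) -- (positions, batch, d_model)
```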
def relative_positional_encoding(self, qlen, klen, bsz=None):
# Create the relative positional encoding.
freq_seq = torch.arange(0, self.d_model, 2.0, dtype=torch.int64).float()
# Inverse frequencies for the sinusoidal encoding
inv_freq = 1 / torch.pow(10000, (freq_seq / self.d_model))
if self.attn_type == "bi":
# Bidirectional attention: relative positions span from klen down to -qlen
beg, end = klen, -qlen
elif self.attn_type == "uni":
# Unidirectional attention: relative positions span from klen down to -1
beg, end = klen, -1
else:
# Unknown attention type
raise ValueError(f"Unknown `attn_type` {self.attn_type}.")
if self.bi_data:
# Build forward and backward position sequences
fwd_pos_seq = torch.arange(beg, end, -1.0, dtype=torch.int64).float()
bwd_pos_seq = torch.arange(-beg, -end, 1.0, dtype=torch.int64).float()
if self.clamp_len > 0:
# Clamp the position sequences to [-clamp_len, clamp_len]
fwd_pos_seq = fwd_pos_seq.clamp(-self.clamp_len, self.clamp_len)
bwd_pos_seq = bwd_pos_seq.clamp(-self.clamp_len, self.clamp_len)
if bsz is not None:
# Positional embeddings for the forward and backward streams (half the batch each)
fwd_pos_emb = self.positional_embedding(fwd_pos_seq, inv_freq, bsz // 2)
bwd_pos_emb = self.positional_embedding(bwd_pos_seq, inv_freq, bsz // 2)
else:
# Positional embeddings without batch expansion
fwd_pos_emb = self.positional_embedding(fwd_pos_seq, inv_freq)
bwd_pos_emb = self.positional_embedding(bwd_pos_seq, inv_freq)
# Concatenate the forward and backward positional embeddings
pos_emb = torch.cat([fwd_pos_emb, bwd_pos_emb], dim=1)
else:
# Forward position sequence only
fwd_pos_seq = torch.arange(beg, end, -1.0, dtype=torch.int64).float()
if self.clamp_len > 0:
fwd_pos_seq = fwd_pos_seq.clamp(-self.clamp_len, self.clamp_len)
pos_emb = self.positional_embedding(fwd_pos_seq, inv_freq, bsz)
# Return the positional embedding tensor
return pos_emb
@add_start_docstrings(
"""
XLNet Model with a language modeling head on top (linear layer with weights tied to the input embeddings).
""",
XLNET_START_DOCSTRING,
)
class XLNetLMHeadModel(XLNetPreTrainedModel):
_tied_weights_keys = ["lm_loss.weight"]
def __init__(self, config):
super().__init__(config)
self.attn_type = config.attn_type  # attention type from the config
self.same_length = config.same_length  # whether to use the same attention length
self.transformer = XLNetModel(config)  # the underlying XLNet model
self.lm_loss = nn.Linear(config.d_model, config.vocab_size, bias=True)  # language-modeling head
# Initialize weights and apply final processing
self.post_init()
def get_output_embeddings(self):
return self.lm_loss  # the language-modeling head's linear layer
def set_output_embeddings(self, new_embeddings):
self.lm_loss = new_embeddings  # set the new output embeddings
def prepare_inputs_for_generation(self, input_ids, past_key_values=None, use_mems=None, **kwargs):
# Add a dummy token at the end of the input (it is not attended to)
effective_batch_size = input_ids.shape[0]  # effective batch size
dummy_token = torch.zeros((effective_batch_size, 1), dtype=torch.long, device=input_ids.device)  # all-zero dummy token
# At each pass, attention values are computed for the new token and the two last generated tokens;
# the rest is reloaded from the cache. A purely auto-regressive model would use offset = 1,
# but using offset = 2 seems to compute slightly better.
offset = 2
if past_key_values:
input_ids = torch.cat([input_ids[:, -offset:], dummy_token], dim=1)  # keep the last tokens plus the dummy
else:
input_ids = torch.cat([input_ids, dummy_token], dim=1)  # append the dummy token
# Build the permutation mask so that previous tokens do not see the last token
sequence_length = input_ids.shape[1]
perm_mask = torch.zeros(
(effective_batch_size, sequence_length, sequence_length), dtype=torch.float, device=input_ids.device
)  # all-zero permutation mask
perm_mask[:, :, -1] = 1.0  # previous tokens are not allowed to see the last token
# We only predict the last token
target_mapping = torch.zeros(
(effective_batch_size, 1, sequence_length), dtype=torch.float, device=input_ids.device
)  # all-zero target mapping
target_mapping[:, 0, -1] = 1.0  # predict only the last token
inputs = {
"input_ids": input_ids,
"perm_mask": perm_mask,
"target_mapping": target_mapping,
"use_mems": use_mems,
}  # assemble the input dict
# If past key values are given in the model kwargs, use them for faster decoding
if past_key_values:
inputs["mems"] = tuple(layer_past[:-offset, :, :] for layer_past in past_key_values)
return inputs
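The same `perm_mask`/`target_mapping` recipe can be applied by hand; this mirrors the standard usage example for `XLNetLMHeadModel` (the checkpoint name and text are placeholders):

```
import torch
from transformers import AutoTokenizer, XLNetLMHeadModel

tokenizer = AutoTokenizer.from_pretrained("xlnet/xlnet-base-cased")
model = XLNetLMHeadModel.from_pretrained("xlnet/xlnet-base-cased")

input_ids = torch.tensor(tokenizer.encode("Hello, my dog is very <mask>", add_special_tokens=False)).unsqueeze(0)
perm_mask = torch.zeros((1, input_ids.shape[1], input_ids.shape[1]), dtype=torch.float)
perm_mask[:, :, -1] = 1.0  # previous tokens do not see the last token
target_mapping = torch.zeros((1, 1, input_ids.shape[1]), dtype=torch.float)
target_mapping[0, 0, -1] = 1.0  # predict only the last token

outputs = model(input_ids, perm_mask=perm_mask, target_mapping=target_mapping)
next_token_logits = outputs.logits  # (batch_size, 1, config.vocab_size)
```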
@add_start_docstrings_to_model_forward(XLNET_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=XLNetLMHeadModelOutput, config_class=_CONFIG_FOR_DOC)
# The model's forward pass
def forward(
self,
input_ids: Optional[torch.Tensor] = None,  # input token IDs
attention_mask: Optional[torch.Tensor] = None,  # attention mask
mems: Optional[torch.Tensor] = None,  # memory cache (past hidden states)
perm_mask: Optional[torch.Tensor] = None,  # permutation mask
target_mapping: Optional[torch.Tensor] = None,  # target mapping
token_type_ids: Optional[torch.Tensor] = None,  # token type IDs
input_mask: Optional[torch.Tensor] = None,  # input mask
head_mask: Optional[torch.Tensor] = None,  # head mask
inputs_embeds: Optional[torch.Tensor] = None,  # input embeddings
labels: Optional[torch.Tensor] = None,  # labels
use_mems: Optional[bool] = None,  # whether to use the memory cache
output_attentions: Optional[bool] = None,  # whether to return attention weights
output_hidden_states: Optional[bool] = None,  # whether to return hidden states
return_dict: Optional[bool] = None,  # whether to return a dict-style output
**kwargs,  # delete when `use_cache` is removed in XLNetModel
):
pass  # placeholder; the actual forward body is omitted in this walkthrough
@staticmethod
def _reorder_cache(mems: List[torch.Tensor], beam_idx: torch.Tensor) -> List[torch.Tensor]:
"""
重新排列 `mems` 缓存,如果调用了 `~PreTrainedModel.beam_search` 或 `~PreTrainedModel.beam_sample`,
这是为了确保在每个生成步骤中,`mems` 与正确的 `beam_idx` 匹配。
"""
# 使用 `beam_idx` 将每个层级的过去状态重新排序到对应的设备上
return [layer_past.index_select(1, beam_idx.to(layer_past.device)) for layer_past in mems]
# XLNet sequence classification/regression model with a linear layer on top of the pooled output, e.g. for GLUE tasks
@add_start_docstrings(
"""
XLNet Model with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g.
for GLUE tasks.
""",
XLNET_START_DOCSTRING,
)
class XLNetForSequenceClassification(XLNetPreTrainedModel):
def __init__(self, config):
# Call the parent class initializer
super().__init__(config)
# Number of labels
self.num_labels = config.num_labels
# Keep a reference to the config
self.config = config
# The underlying XLNet model
self.transformer = XLNetModel(config)
# Sequence summary layer
self.sequence_summary = SequenceSummary(config)
# Output projection mapping the summary to the number of labels
self.logits_proj = nn.Linear(config.d_model, config.num_labels)
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(XLNET_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=XLNetForSequenceClassificationOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
mems: Optional[torch.Tensor] = None,
perm_mask: Optional[torch.Tensor] = None,
target_mapping: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
input_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
use_mems: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
**kwargs,  # delete when `use_cache` is removed in XLNetModel
) -> Union[Tuple, XLNetForSequenceClassificationOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
# Decide whether to return a dict-style output
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Run the transformer on the inputs and collect its outputs
transformer_outputs = self.transformer(
input_ids,
attention_mask=attention_mask,
mems=mems,
perm_mask=perm_mask,
target_mapping=target_mapping,
token_type_ids=token_type_ids,
input_mask=input_mask,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
use_mems=use_mems,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
**kwargs,
)
# The first tensor of the transformer outputs is the model output
output = transformer_outputs[0]
# Summarize the sequence output
output = self.sequence_summary(output)
# Project the summary into logits space
logits = self.logits_proj(output)
loss = None
# Compute a loss when labels are provided
if labels is not None:
# If the problem type is not set in the config, infer it from the number of labels and the label dtype
if self.config.problem_type is None:
if self.num_labels == 1:
self.config.problem_type = "regression"
elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
self.config.problem_type = "single_label_classification"
else:
self.config.problem_type = "multi_label_classification"
# Pick the loss function matching the problem type
if self.config.problem_type == "regression":
loss_fct = MSELoss()
if self.num_labels == 1:
# Single-output regression: mean-squared-error loss on squeezed tensors
loss = loss_fct(logits.squeeze(), labels.squeeze())
else:
# Multi-output regression: mean-squared-error loss
loss = loss_fct(logits, labels)
elif self.config.problem_type == "single_label_classification":
# Single-label classification: cross-entropy loss
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
# Multi-label classification: binary cross-entropy with logits
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(logits, labels)
# Without return_dict, merge the loss and the other outputs into a tuple
if not return_dict:
output = (logits,) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output
# Return the XLNet outputs: loss, logits, mems, hidden states, and attentions
return XLNetForSequenceClassificationOutput(
loss=loss,
logits=logits,
mems=transformer_outputs.mems,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
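A minimal end-to-end sketch of the classification head (checkpoint name, text, and label are placeholders; a real run would use a fine-tuned checkpoint):

```
import torch
from transformers import AutoTokenizer, XLNetForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("xlnet/xlnet-base-cased")
model = XLNetForSequenceClassification.from_pretrained("xlnet/xlnet-base-cased", num_labels=2)

inputs = tokenizer("This movie was great!", return_tensors="pt")
outputs = model(**inputs, labels=torch.tensor([1]))
print(outputs.loss, outputs.logits.shape)  # scalar loss, logits of shape (1, 2)
```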
# XLNet model with a token classification head (a linear layer on top of the hidden-states output), e.g. for named-entity recognition
@add_start_docstrings(
"""
XLNet Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
Named-Entity-Recognition (NER) tasks.
""",
XLNET_START_DOCSTRING,
)
# XLNetForTokenClassification, inheriting from XLNetPreTrainedModel
class XLNetForTokenClassification(XLNetPreTrainedModel):
def __init__(self, config):
super().__init__(config)
# Number of labels
self.num_labels = config.num_labels
# The underlying XLNet model
self.transformer = XLNetModel(config)
# Classifier mapping hidden states to the label space
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
# Initialize weights and apply final processing
self.post_init()
# Forward pass: process the inputs and return the results
@add_start_docstrings_to_model_forward(XLNET_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=XLNetForTokenClassificationOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
mems: Optional[torch.Tensor] = None,
perm_mask: Optional[torch.Tensor] = None,
target_mapping: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
input_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
use_mems: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
**kwargs,  # delete when `use_cache` is removed in XLNetModel
) -> Union[Tuple, XLNetForTokenClassificationOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
# Decide whether to return a dict-style output; default comes from the config
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Pass the inputs through the transformer model
outputs = self.transformer(
input_ids,
attention_mask=attention_mask,
mems=mems,
perm_mask=perm_mask,
target_mapping=target_mapping,
token_type_ids=token_type_ids,
input_mask=input_mask,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
use_mems=use_mems,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# Sequence output of the transformer
sequence_output = outputs[0]
# Pass the sequence output through the classifier to produce logits
logits = self.classifier(sequence_output)
loss = None
# Compute the cross-entropy loss when labels are provided
if labels is not None:
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
# Without return_dict, return a tuple of logits and the other outputs
if not return_dict:
output = (logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
# With return_dict, return an XLNetForTokenClassificationOutput object
return XLNetForTokenClassificationOutput(
loss=loss,
logits=logits,
mems=outputs.mems,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
# XLNet model with a multiple-choice classification head, e.g. for RACE/SWAG tasks
@add_start_docstrings(
"""
XLNet Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
softmax) e.g. for RACE/SWAG tasks.
""",
XLNET_START_DOCSTRING,
)
class XLNetForMultipleChoice(XLNetPreTrainedModel):
def __init__(self, config):
super().__init__(config)
# The underlying XLNet model
self.transformer = XLNetModel(config)
# Sequence summary used to pool the sequence output
self.sequence_summary = SequenceSummary(config)
# Linear layer mapping the pooled output to a single score per choice
self.logits_proj = nn.Linear(config.d_model, 1)
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(XLNET_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=XLNetForMultipleChoiceOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
input_mask: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
mems: Optional[torch.Tensor] = None,
perm_mask: Optional[torch.Tensor] = None,
target_mapping: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
use_mems: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
**kwargs,  # delete when `use_cache` is removed in XLNetModel
) -> Union[Tuple, XLNetForMultipleChoiceOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
`input_ids` above)
"""
# Decide whether to return a dict-style output; default comes from the config
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Number of choices is the size of the second dimension of input_ids (or inputs_embeds)
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
# Flatten the choice dimension into the batch dimension
flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
flat_input_mask = input_mask.view(-1, input_mask.size(-1)) if input_mask is not None else None
flat_inputs_embeds = (
inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
if inputs_embeds is not None
else None
)
# Run the transformer on the flattened inputs
transformer_outputs = self.transformer(
flat_input_ids,
token_type_ids=flat_token_type_ids,
input_mask=flat_input_mask,
attention_mask=flat_attention_mask,
mems=mems,
perm_mask=perm_mask,
target_mapping=target_mapping,
head_mask=head_mask,
inputs_embeds=flat_inputs_embeds,
use_mems=use_mems,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
**kwargs,
)
# First tensor of the transformer outputs: the sequence output
output = transformer_outputs[0]
# Pool the sequence output with the sequence summary
output = self.sequence_summary(output)
# Project the pooled output to one score per choice
logits = self.logits_proj(output)
# Reshape the logits to (batch_size, num_choices) for the multiple-choice loss
reshaped_logits = logits.view(-1, num_choices)
loss = None
# Compute the cross-entropy loss when labels are provided
if labels is not None:
loss_fct = CrossEntropyLoss()
loss = loss_fct(reshaped_logits, labels.view(-1))
# Without return_dict, reassemble the outputs as a tuple
if not return_dict:
output = (reshaped_logits,) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output
# Build and return an XLNetForMultipleChoiceOutput with the loss, logits, and the other transformer outputs
return XLNetForMultipleChoiceOutput(
loss=loss,
logits=reshaped_logits,
mems=transformer_outputs.mems,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
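The flattening above expects inputs of shape `(batch_size, num_choices, seq_len)`; here is a minimal sketch of preparing a 2-choice example (the checkpoint name and texts are placeholders):

```
import torch
from transformers import AutoTokenizer, XLNetForMultipleChoice

tokenizer = AutoTokenizer.from_pretrained("xlnet/xlnet-base-cased")
model = XLNetForMultipleChoice.from_pretrained("xlnet/xlnet-base-cased")

prompt = "The cat sat on"
choices = ["the mat.", "the moon."]
enc = tokenizer([prompt, prompt], choices, return_tensors="pt", padding=True)
inputs = {k: v.unsqueeze(0) for k, v in enc.items()}  # add the batch dimension
outputs = model(**inputs, labels=torch.tensor([0]))
print(outputs.logits.shape)  # torch.Size([1, 2])
```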
@add_start_docstrings(
"""
XLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (linear
layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
XLNET_START_DOCSTRING,
)
class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
# The underlying XLNet model
self.transformer = XLNetModel(config)
# Linear layer producing span-start and span-end logits
self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(XLNET_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=XLNetForQuestionAnsweringSimpleOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
mems: Optional[torch.Tensor] = None,
perm_mask: Optional[torch.Tensor] = None,
target_mapping: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
input_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
start_positions: Optional[torch.Tensor] = None,
end_positions: Optional[torch.Tensor] = None,
use_mems: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
**kwargs, # delete when `use_cache` is removed in XLNetModel
) -> Union[Tuple, XLNetForQuestionAnsweringSimpleOutput]:
"""
Forward pass for XLNetForQuestionAnsweringSimple.
"""
# (body omitted in this walkthrough; it computes start/end logits from the transformer's sequence output)
# The `forward` below belongs to `XLNetForQuestionAnswering`, the beam-search QA head whose class
# definition and `__init__` are omitted in this walkthrough
def forward(
self,
input_ids: Optional[torch.Tensor] = None,  # input token IDs
attention_mask: Optional[torch.Tensor] = None,  # attention mask
mems: Optional[torch.Tensor] = None,  # memories (past hidden states)
perm_mask: Optional[torch.Tensor] = None,  # permutation mask
target_mapping: Optional[torch.Tensor] = None,  # target mapping
token_type_ids: Optional[torch.Tensor] = None,  # token type IDs
input_mask: Optional[torch.Tensor] = None,  # input mask
head_mask: Optional[torch.Tensor] = None,  # head mask
inputs_embeds: Optional[torch.Tensor] = None,  # input embeddings
start_positions: Optional[torch.Tensor] = None,  # start positions
end_positions: Optional[torch.Tensor] = None,  # end positions
is_impossible: Optional[torch.Tensor] = None,  # whether the question is unanswerable
cls_index: Optional[torch.Tensor] = None,  # classification token index
p_mask: Optional[torch.Tensor] = None,  # mask of tokens that cannot be part of the answer
use_mems: Optional[bool] = None,  # whether to use memories
output_attentions: Optional[bool] = None,  # whether to return attention weights
output_hidden_states: Optional[bool] = None,  # whether to return hidden states
return_dict: Optional[bool] = None,  # whether to return a dict-style output
**kwargs,  # delete when `use_cache` is removed in XLNetModel
):
# (body omitted in this walkthrough)
.\models\xlnet\tokenization_xlnet.py
import os
import unicodedata
from shutil import copyfile
from typing import Any, Dict, List, Optional, Tuple
import sentencepiece as spm
from ...tokenization_utils import AddedToken, PreTrainedTokenizer
from ...utils import SPIECE_UNDERLINE, logging
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"xlnet/xlnet-base-cased": "https://huggingface.co/xlnet/xlnet-base-cased/resolve/main/spiece.model",
"xlnet/xlnet-large-cased": "https://huggingface.co/xlnet/xlnet-large-cased/resolve/main/spiece.model",
}
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"xlnet/xlnet-base-cased": None,
"xlnet/xlnet-large-cased": None,
}
SEG_ID_A = 0
SEG_ID_B = 1
SEG_ID_CLS = 2
SEG_ID_SEP = 3
SEG_ID_PAD = 4
class XLNetTokenizer(PreTrainedTokenizer):
"""
Construct an XLNet tokenizer, based on [SentencePiece](https://github.com/google/sentencepiece).
This tokenizer inherits from `PreTrainedTokenizer`, which contains most of the main methods. Users should
refer to this superclass for more information regarding those methods.
Attributes:
sp_model (`SentencePieceProcessor`):
The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
padding_side = "left"
def __init__(
self,
vocab_file,
do_lower_case=False,
remove_space=True,
keep_accents=False,
bos_token="<s>",
eos_token="</s>",
unk_token="<unk>",
sep_token="<sep>",
pad_token="<pad>",
cls_token="<cls>",
mask_token="<mask>",
additional_special_tokens=["<eop>", "<eod>"],
sp_model_kwargs: Optional[Dict[str, Any]] = None,
**kwargs,
) -> None:
mask_token = AddedToken(mask_token, lstrip=True, special=True) if isinstance(mask_token, str) else mask_token
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
self.do_lower_case = do_lower_case
self.remove_space = remove_space
self.keep_accents = keep_accents
self.vocab_file = vocab_file
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(vocab_file)
super().__init__(
do_lower_case=do_lower_case,
remove_space=remove_space,
keep_accents=keep_accents,
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
sep_token=sep_token,
pad_token=pad_token,
cls_token=cls_token,
mask_token=mask_token,
additional_special_tokens=additional_special_tokens,
sp_model_kwargs=self.sp_model_kwargs,
**kwargs,
)
self._pad_token_type_id = 3
@property
def vocab_size(self):
return len(self.sp_model)
def get_vocab(self):
vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
vocab.update(self.added_tokens_encoder)
return vocab
def __getstate__(self):
state = self.__dict__.copy()
state["sp_model"] = None
return state
def __setstate__(self, d):
self.__dict__ = d
if not hasattr(self, "sp_model_kwargs"):
self.sp_model_kwargs = {}
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(self.vocab_file)
def preprocess_text(self, inputs):
if self.remove_space:
outputs = " ".join(inputs.strip().split())
else:
outputs = inputs
outputs = outputs.replace("``", '"').replace("''", '"')
if not self.keep_accents:
outputs = unicodedata.normalize("NFKD", outputs)
outputs = "".join([c for c in outputs if not unicodedata.combining(c)])
if self.do_lower_case:
outputs = outputs.lower()
return outputs
def _tokenize(self, text: str) -> List[str]:
"""Tokenize a string."""
text = self.preprocess_text(text)
pieces = self.sp_model.encode(text, out_type=str)
new_pieces = []
for piece in pieces:
if len(piece) > 1 and piece[-1] == str(",") and piece[-2].isdigit():
cur_pieces = self.sp_model.EncodeAsPieces(piece[:-1].replace(SPIECE_UNDERLINE, ""))
if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE:
if len(cur_pieces[0]) == 1:
cur_pieces = cur_pieces[1:]
else:
cur_pieces[0] = cur_pieces[0][1:]
cur_pieces.append(piece[-1])
new_pieces.extend(cur_pieces)
else:
new_pieces.append(piece)
return new_pieces
def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
return self.sp_model.PieceToId(token)
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
return self.sp_model.IdToPiece(index)
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (strings for sub-words) in a single string."""
out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
return out_string
def _decode(
self,
token_ids: List[int],
skip_special_tokens: bool = False,
clean_up_tokenization_spaces: bool = None,
spaces_between_special_tokens: bool = True,
**kwargs,
) -> str:
"""Decode a list of token IDs back into a string."""
self._decode_use_source_tokenizer = kwargs.pop("use_source_tokenizer", False)
filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)
sub_texts = []
current_sub_text = []
for token in filtered_tokens:
if skip_special_tokens and token in self.all_special_ids:
continue
if token in self.added_tokens_encoder:
if current_sub_text:
sub_texts.append(self.convert_tokens_to_string(current_sub_text))
current_sub_text = []
sub_texts.append(token)
else:
current_sub_text.append(token)
if current_sub_text:
sub_texts.append(self.convert_tokens_to_string(current_sub_text))
text = "".join(sub_texts)
clean_up_tokenization_spaces = (
clean_up_tokenization_spaces
if clean_up_tokenization_spaces is not None
else self.clean_up_tokenization_spaces
)
if clean_up_tokenization_spaces:
clean_text = self.clean_up_tokenization(text)
return clean_text
else:
return text
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
and adding special tokens. For XLNet, the sequences have the following format:
- single sequence: `X <sep> <cls>`
- pair of sequences: `A <sep> B <sep> <cls>`
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of input IDs with the appropriate special tokens.
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
if token_ids_1 is None:
return token_ids_0 + sep + cls
return token_ids_0 + sep + token_ids_1 + sep + cls
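A quick illustration of the layout (assumes `tok` is a loaded `XLNetTokenizer`; the IDs are placeholders):

```
ids = tok.build_inputs_with_special_tokens([10, 11], [20, 21])
# -> [10, 11, sep_id, 20, 21, sep_id, cls_id]
```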
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` method.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
return super().get_special_tokens_mask(
token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
)
if token_ids_1 is not None:
return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1, 1]
return ([0] * len(token_ids_0)) + [1, 1]
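For the same pair of sequences, the special-tokens mask marks each `<sep>` and the final `<cls>` (assumes `tok` is a loaded `XLNetTokenizer`):

```
mask = tok.get_special_tokens_mask([10, 11], [20, 21])
# -> [0, 0, 1, 0, 0, 1, 1]
```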
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. An XLNet
sequence pair mask has the following format:
```
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | second sequence |
```
If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
"""
sep = [self.sep_token_id]
cls_segment_id = [2]
if token_ids_1 is None:
return len(token_ids_0 + sep) * [0] + cls_segment_id
return len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + cls_segment_id
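And the matching token type IDs, with segment 2 reserved for `<cls>` (same assumptions as above):

```
type_ids = tok.create_token_type_ids_from_sequences([10, 11], [20, 21])
# -> [0, 0, 0, 1, 1, 1, 2]
```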
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
out_vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
copyfile(self.vocab_file, out_vocab_file)
elif not os.path.isfile(self.vocab_file):
with open(out_vocab_file, "wb") as fi:
content_spiece_model = self.sp_model.serialized_model_proto()
fi.write(content_spiece_model)
return (out_vocab_file,)
.\models\xlnet\tokenization_xlnet_fast.py
import os
from shutil import copyfile
from typing import List, Optional, Tuple
from ...tokenization_utils import AddedToken
from ...tokenization_utils_fast import PreTrainedTokenizerFast
from ...utils import is_sentencepiece_available, logging
if is_sentencepiece_available():
from .tokenization_xlnet import XLNetTokenizer
else:
XLNetTokenizer = None
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "spiece.model", "tokenizer_file": "tokenizer.json"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"xlnet/xlnet-base-cased": "https://huggingface.co/xlnet/xlnet-base-cased/resolve/main/spiece.model",
"xlnet/xlnet-large-cased": "https://huggingface.co/xlnet/xlnet-large-cased/resolve/main/spiece.model",
},
"tokenizer_file": {
"xlnet/xlnet-base-cased": "https://huggingface.co/xlnet/xlnet-base-cased/resolve/main/tokenizer.json",
"xlnet/xlnet-large-cased": "https://huggingface.co/xlnet/xlnet-large-cased/resolve/main/tokenizer.json",
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"xlnet/xlnet-base-cased": None,
"xlnet/xlnet-large-cased": None,
}
SPIECE_UNDERLINE = "▁"
SEG_ID_A = 0
SEG_ID_B = 1
SEG_ID_CLS = 2
SEG_ID_SEP = 3
SEG_ID_PAD = 4
class XLNetTokenizerFast(PreTrainedTokenizerFast):
"""
Construct a "fast" XLNet tokenizer (backed by HuggingFace's *tokenizers* library), based on the Unigram model.
This tokenizer inherits from `PreTrainedTokenizerFast`, which contains most of the main methods. Users should
refer to this superclass for more information regarding those methods.
Args:
vocab_file (`str`):
[SentencePiece](https://github.com/google/sentencepiece) file (generally has a .spm extension) that
contains the vocabulary necessary to instantiate a tokenizer.
do_lower_case (`bool`, *optional*, defaults to `False`):
Whether to lowercase the input when tokenizing.
remove_space (`bool`, *optional*, defaults to `True`):
Whether to strip the text when tokenizing (removing excess spaces before and after the string).
keep_accents (`bool`, *optional*, defaults to `False`):
Whether to keep accents when tokenizing.
bos_token (`str`, *optional*, defaults to `"<s>"`):
The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.
<Tip>
When building a sequence using special tokens, this is not the token that is used for the beginning of
sequence. The token used is the `cls_token`.
</Tip>
eos_token (`str`, *optional*, defaults to `"</s>"`):
The end of sequence token.
<Tip>
When building a sequence using special tokens, this is not the token that is used for the end of sequence.
The token used is the `sep_token`.
</Tip>
unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
sep_token (`str`, *optional*, defaults to `"<sep>"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens.
pad_token (`str`, *optional*, defaults to `"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
cls_token (`str`, *optional*, defaults to `"<cls>"`):
The classifier token which is used when doing sequence classification (classification of the whole sequence
instead of per-token classification). It is the first token of the sequence when built with special tokens.
mask_token (`str`, *optional*, defaults to `"<mask>"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
additional_special_tokens (`List[str]`, *optional*, defaults to `["<eop>", "<eod>"]`):
Additional special tokens used by the tokenizer.
Attributes:
sp_model (`SentencePieceProcessor`):
The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
"""
# Names of the vocabulary files used by this tokenizer (SentencePiece model and tokenizer.json)
vocab_files_names = VOCAB_FILES_NAMES
# Map from pretrained checkpoint names to their hosted vocabulary files
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
# Maximum model input sizes (None: XLNet has no hard positional limit)
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
# Pad sequences on the left
padding_side = "left"
# The corresponding slow tokenizer class
slow_tokenizer_class = XLNetTokenizer
def __init__(
self,
vocab_file=None,
tokenizer_file=None,
do_lower_case=False,
remove_space=True,
keep_accents=False,
bos_token="<s>",
eos_token="</s>",
unk_token="<unk>",
sep_token="<sep>",
pad_token="<pad>",
cls_token="<cls>",
mask_token="<mask>",
additional_special_tokens=["<eop>", "<eod>"],
**kwargs,
):
# If mask_token is a string, wrap it in an AddedToken so the space to its left is stripped
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
# Initialize the tokenizer object through the parent class constructor
super().__init__(
vocab_file=vocab_file,
tokenizer_file=tokenizer_file,
do_lower_case=do_lower_case,
remove_space=remove_space,
keep_accents=keep_accents,
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
sep_token=sep_token,
pad_token=pad_token,
cls_token=cls_token,
mask_token=mask_token,
additional_special_tokens=additional_special_tokens,
**kwargs,
)
# XLNet uses the special pad token type id 3
self._pad_token_type_id = 3
# Store the remaining attributes
self.do_lower_case = do_lower_case
self.remove_space = remove_space
self.keep_accents = keep_accents
self.vocab_file = vocab_file
@property
def can_save_slow_tokenizer(self) -> bool:
# The slow tokenizer can only be saved if the SentencePiece vocab_file is available
return os.path.isfile(self.vocab_file) if self.vocab_file else False
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
从一个序列或一对序列构建模型输入,用于序列分类任务,通过连接和添加特殊标记。对于 XLNet 模型,输入的格式如下:
- 单个序列: `X <sep> <cls>`
- 一对序列: `A <sep> B <sep> <cls>`
Args:
token_ids_0 (`List[int]`):
要添加特殊标记的 ID 列表。
token_ids_1 (`List[int]`, *optional*):
可选的第二个序列 ID 列表,用于序列对。
Returns:
`List[int]`: 包含适当特殊标记的输入 ID 列表。
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
if token_ids_1 is None:
return token_ids_0 + sep + cls
return token_ids_0 + sep + token_ids_1 + sep + cls
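For concreteness, here is a minimal sketch of the layout this method produces, using XLNet's conventional special-token ids (`<sep>` = 4, `<cls>` = 3) as illustrative values:
```
sep, cls = [4], [3]
token_ids_0, token_ids_1 = [10, 11], [20, 21]
single = token_ids_0 + sep + cls                    # [10, 11, 4, 3]
pair = token_ids_0 + sep + token_ids_1 + sep + cls  # [10, 11, 4, 20, 21, 4, 3]
```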
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. An XLNet
sequence pair mask has the following format:
```
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | second sequence |
```
If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
"""
# Separator token [SEP] used to separate sequences
sep = [self.sep_token_id]
# Classification segment ID indicating the segment for classification
cls_segment_id = [2]
# If only one sequence (`token_ids_1` is None), return a mask for the first sequence only
if token_ids_1 is None:
return len(token_ids_0 + sep) * [0] + cls_segment_id
# Otherwise, return a mask for both sequences
return len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + cls_segment_id
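A small sketch of the resulting mask, continuing the placeholder ids from the example above:
```
sep, cls_segment_id = [4], [2]
token_ids_0, token_ids_1 = [10, 11], [20, 21]
mask = len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + cls_segment_id
# -> [0, 0, 0, 1, 1, 1, 2]; the trailing 2 marks the <cls> segment
```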
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
# Check if the fast tokenizer can save vocabulary; raise an error if not
if not self.can_save_slow_tokenizer:
raise ValueError(
"Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
"tokenizer."
)
# Check if save_directory is a valid directory; log an error and return if not
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
# Define the output vocabulary file path
out_vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
# If the current vocabulary file path is different from the output path, copy the vocabulary file
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
copyfile(self.vocab_file, out_vocab_file)
# Return the path to the saved vocabulary file
return (out_vocab_file,)
.\models\xlnet\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_sentencepiece_available,
is_tf_available,
is_tokenizers_available,
is_torch_available,
)
_import_structure = {"configuration_xlnet": ["XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP", "XLNetConfig"]}
try:
if not is_sentencepiece_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_xlnet"] = ["XLNetTokenizer"]
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_xlnet_fast"] = ["XLNetTokenizerFast"]
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_xlnet"] = [
"XLNET_PRETRAINED_MODEL_ARCHIVE_LIST",
"XLNetForMultipleChoice",
"XLNetForQuestionAnswering",
"XLNetForQuestionAnsweringSimple",
"XLNetForSequenceClassification",
"XLNetForTokenClassification",
"XLNetLMHeadModel",
"XLNetModel",
"XLNetPreTrainedModel",
"load_tf_weights_in_xlnet",
]
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_tf_xlnet"] = [
"TF_XLNET_PRETRAINED_MODEL_ARCHIVE_LIST",
"TFXLNetForMultipleChoice",
"TFXLNetForQuestionAnsweringSimple",
"TFXLNetForSequenceClassification",
"TFXLNetForTokenClassification",
"TFXLNetLMHeadModel",
"TFXLNetMainLayer",
"TFXLNetModel",
"TFXLNetPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_xlnet import XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP, XLNetConfig
try:
if not is_sentencepiece_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_xlnet import XLNetTokenizer
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_xlnet_fast import XLNetTokenizerFast
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_xlnet import (
XLNET_PRETRAINED_MODEL_ARCHIVE_LIST,
XLNetForMultipleChoice,
XLNetForQuestionAnswering,
XLNetForQuestionAnsweringSimple,
XLNetForSequenceClassification,
XLNetForTokenClassification,
XLNetLMHeadModel,
XLNetModel,
XLNetPreTrainedModel,
load_tf_weights_in_xlnet,
)
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_tf_xlnet import (
TF_XLNET_PRETRAINED_MODEL_ARCHIVE_LIST,
TFXLNetForMultipleChoice,
TFXLNetForQuestionAnsweringSimple,
TFXLNetForSequenceClassification,
TFXLNetForTokenClassification,
TFXLNetLMHeadModel,
TFXLNetMainLayer,
TFXLNetModel,
TFXLNetPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
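As a rough sketch of what this lazy-module pattern buys: importing the package itself is cheap, and attribute access triggers the real submodule import on first use (usage below is illustrative):
```
import transformers.models.xlnet as xlnet_pkg  # cheap: heavy backends are not imported yet
config_cls = xlnet_pkg.XLNetConfig  # configuration_xlnet is imported here, on first attribute access
```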
.\models\xmod\configuration_xmod.py
from collections import OrderedDict
from typing import Mapping
from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfig
from ...utils import logging
logger = logging.get_logger(__name__)
XMOD_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"facebook/xmod-base": "https://huggingface.co/facebook/xmod-base/resolve/main/config.json",
"facebook/xmod-large-prenorm": "https://huggingface.co/facebook/xmod-large-prenorm/resolve/main/config.json",
"facebook/xmod-base-13-125k": "https://huggingface.co/facebook/xmod-base-13-125k/resolve/main/config.json",
"facebook/xmod-base-30-125k": "https://huggingface.co/facebook/xmod-base-30-125k/resolve/main/config.json",
"facebook/xmod-base-30-195k": "https://huggingface.co/facebook/xmod-base-30-195k/resolve/main/config.json",
"facebook/xmod-base-60-125k": "https://huggingface.co/facebook/xmod-base-60-125k/resolve/main/config.json",
"facebook/xmod-base-60-265k": "https://huggingface.co/facebook/xmod-base-60-265k/resolve/main/config.json",
"facebook/xmod-base-75-125k": "https://huggingface.co/facebook/xmod-base-75-125k/resolve/main/config.json",
"facebook/xmod-base-75-269k": "https://huggingface.co/facebook/xmod-base-75-269k/resolve/main/config.json",
}
class XmodConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`XmodModel`]. It is used to instantiate an X-MOD
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the
[facebook/xmod-base](https://huggingface.co/facebook/xmod-base) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Examples:
```
>>> from transformers import XmodConfig, XmodModel
>>> # Initializing an X-MOD facebook/xmod-base style configuration
>>> configuration = XmodConfig()
>>> # Initializing a model (with random weights) from the facebook/xmod-base style configuration
>>> model = XmodModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
class XmodOnnxConfig(OnnxConfig):
@property
def inputs(self) -> Mapping[str, Mapping[int, str]]:
if self.task == "multiple-choice":
dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"}
else:
dynamic_axis = {0: "batch", 1: "sequence"}
return OrderedDict(
[
("input_ids", dynamic_axis),
("attention_mask", dynamic_axis),
]
)
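For reference, a sketch of what the `inputs` property evaluates to for a default (non multiple-choice) task:
```
from collections import OrderedDict
expected = OrderedDict(
    [
        ("input_ids", {0: "batch", 1: "sequence"}),
        ("attention_mask", {0: "batch", 1: "sequence"}),
    ]
)
```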
.\models\xmod\convert_xmod_original_pytorch_checkpoint_to_pytorch.py
import argparse
from pathlib import Path
import fairseq
import torch
from fairseq.models.xmod import XMODModel as FairseqXmodModel
from packaging import version
from transformers import XmodConfig, XmodForMaskedLM, XmodForSequenceClassification
from transformers.utils import logging
if version.parse(fairseq.__version__) < version.parse("0.12.2"):
raise Exception("requires fairseq >= 0.12.2")
if version.parse(fairseq.__version__) > version.parse("2"):
raise Exception("requires fairseq < v2")
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
SAMPLE_TEXT = "Hello, World!"
SAMPLE_LANGUAGE = "en_XX"
def convert_xmod_checkpoint_to_pytorch(
xmod_checkpoint_path: str, pytorch_dump_folder_path: str, classification_head: bool
):
data_dir = Path("data_bin")
xmod = FairseqXmodModel.from_pretrained(
model_name_or_path=str(Path(xmod_checkpoint_path).parent),
checkpoint_file=Path(xmod_checkpoint_path).name,
_name="xmod_base",
arch="xmod_base",
task="multilingual_masked_lm",
data_name_or_path=str(data_dir),
bpe="sentencepiece",
sentencepiece_model=str(Path(xmod_checkpoint_path).parent / "sentencepiece.bpe.model"),
src_dict=str(data_dir / "dict.txt"),
)
xmod.eval()
print(xmod)
xmod_sent_encoder = xmod.model.encoder.sentence_encoder
config = XmodConfig(
vocab_size=xmod_sent_encoder.embed_tokens.num_embeddings,
hidden_size=xmod.cfg.model.encoder_embed_dim,
num_hidden_layers=xmod.cfg.model.encoder_layers,
num_attention_heads=xmod.cfg.model.encoder_attention_heads,
intermediate_size=xmod.cfg.model.encoder_ffn_embed_dim,
max_position_embeddings=514,
type_vocab_size=1,
layer_norm_eps=1e-5,
pre_norm=xmod.cfg.model.encoder_normalize_before,
adapter_reduction_factor=getattr(xmod.cfg.model, "bottleneck", 2),
adapter_layer_norm=xmod.cfg.model.adapter_layer_norm,
adapter_reuse_layer_norm=xmod.cfg.model.adapter_reuse_layer_norm,
ln_before_adapter=xmod.cfg.model.ln_before_adapter,
languages=xmod.cfg.model.languages,
)
if classification_head:
config.num_labels = xmod.model.classification_heads["mnli"].out_proj.weight.shape[0]
print("Our X-MOD config:", config)
model = XmodForSequenceClassification(config) if classification_head else XmodForMaskedLM(config)
model.eval()
model.roberta.embeddings.word_embeddings.weight = xmod_sent_encoder.embed_tokens.weight
model.roberta.embeddings.position_embeddings.weight = xmod_sent_encoder.embed_positions.weight
model.roberta.embeddings.token_type_embeddings.weight.data = torch.zeros_like(
model.roberta.embeddings.token_type_embeddings.weight
)
model.roberta.embeddings.LayerNorm.weight = xmod_sent_encoder.layernorm_embedding.weight
model.roberta.embeddings.LayerNorm.bias = xmod_sent_encoder.layernorm_embedding.bias
if xmod_sent_encoder.layer_norm is not None:
model.roberta.encoder.LayerNorm.weight = xmod_sent_encoder.layer_norm.weight
model.roberta.encoder.LayerNorm.bias = xmod_sent_encoder.layer_norm.bias
if classification_head:
model.classifier.dense.weight = xmod.model.classification_heads["mnli"].dense.weight
model.classifier.dense.bias = xmod.model.classification_heads["mnli"].dense.bias
model.classifier.out_proj.weight = xmod.model.classification_heads["mnli"].out_proj.weight
model.classifier.out_proj.bias = xmod.model.classification_heads["mnli"].out_proj.bias
else:
model.lm_head.dense.weight = xmod.model.encoder.lm_head.dense.weight
model.lm_head.dense.bias = xmod.model.encoder.lm_head.dense.bias
model.lm_head.layer_norm.weight = xmod.model.encoder.lm_head.layer_norm.weight
model.lm_head.layer_norm.bias = xmod.model.encoder.lm_head.layer_norm.bias
model.lm_head.decoder.weight = xmod.model.encoder.lm_head.weight
model.lm_head.decoder.bias = xmod.model.encoder.lm_head.bias
input_ids = xmod.encode(SAMPLE_TEXT).unsqueeze(0)
model.roberta.set_default_language(SAMPLE_LANGUAGE)
our_output = model(input_ids)[0]
if classification_head:
their_output = xmod.model.classification_heads["mnli"](xmod.extract_features(input_ids))
else:
their_output = xmod.model(input_ids, lang_id=[SAMPLE_LANGUAGE])[0]
print(our_output.shape, their_output.shape)
max_absolute_diff = torch.max(torch.abs(our_output - their_output)).item()
print(f"max_absolute_diff = {max_absolute_diff}")
success = torch.allclose(our_output, their_output, atol=1e-3)
print("Do both models output the same tensors?", "🔥" if success else "💩")
if not success:
raise Exception("Something went wRoNg")
Path(pytorch_dump_folder_path).mkdir(parents=True, exist_ok=True)
print(f"Saving model to {pytorch_dump_folder_path}")
model.save_pretrained(pytorch_dump_folder_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--xmod_checkpoint_path", default=None, type=str, required=True, help="Path the official PyTorch dump."
)
parser.add_argument(
"--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
)
parser.add_argument(
"--classification_head", action="store_true", help="Whether to convert a final classification head."
)
args = parser.parse_args()
convert_xmod_checkpoint_to_pytorch(
args.xmod_checkpoint_path, args.pytorch_dump_folder_path, args.classification_head
)
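A hedged invocation sketch of the converter (the paths below are placeholders, not real checkpoints):
```
convert_xmod_checkpoint_to_pytorch(
    xmod_checkpoint_path="/path/to/xmod/model.pt",  # placeholder checkpoint path
    pytorch_dump_folder_path="./xmod-base-converted",  # placeholder output directory
    classification_head=False,
)
```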
.\models\xmod\modeling_xmod.py
"""PyTorch X-MOD 模型。"""
import math
from typing import List, Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN, gelu
from ...modeling_outputs import (
BaseModelOutputWithPastAndCrossAttentions,
BaseModelOutputWithPoolingAndCrossAttentions,
CausalLMOutputWithCrossAttentions,
MaskedLMOutput,
MultipleChoiceModelOutput,
QuestionAnsweringModelOutput,
SequenceClassifierOutput,
TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging
from .configuration_xmod import XmodConfig
logger = logging.get_logger(__name__)
XMOD_PRETRAINED_MODEL_ARCHIVE_LIST = [
"facebook/xmod-base",
"facebook/xmod-large-prenorm",
"facebook/xmod-base-13-125k",
"facebook/xmod-base-30-125k",
"facebook/xmod-base-30-195k",
"facebook/xmod-base-60-125k",
"facebook/xmod-base-60-265k",
"facebook/xmod-base-75-125k",
"facebook/xmod-base-75-269k",
]
class XmodEmbeddings(nn.Module):
"""
与 BertEmbeddings 相同,但对于位置嵌入索引进行了微小调整。
"""
def __init__(self, config):
super().__init__()
self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
self.register_buffer(
"token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
)
self.padding_idx = config.pad_token_id
self.position_embeddings = nn.Embedding(
config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
)
def forward(
self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0
):
if position_ids is None:
if input_ids is not None:
position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length)
else:
position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)
if input_ids is not None:
input_shape = input_ids.size()
else:
input_shape = inputs_embeds.size()[:-1]
seq_length = input_shape[1]
if token_type_ids is None:
if hasattr(self, "token_type_ids"):
buffered_token_type_ids = self.token_type_ids[:, :seq_length]
buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
token_type_ids = buffered_token_type_ids_expanded
else:
token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
if inputs_embeds is None:
inputs_embeds = self.word_embeddings(input_ids)
token_type_embeddings = self.token_type_embeddings(token_type_ids)
embeddings = inputs_embeds + token_type_embeddings
if self.position_embedding_type == "absolute":
position_embeddings = self.position_embeddings(position_ids)
embeddings += position_embeddings
embeddings = self.LayerNorm(embeddings)
embeddings = self.dropout(embeddings)
return embeddings
def create_position_ids_from_inputs_embeds(self, inputs_embeds):
"""
Args:
inputs_embeds: torch.Tensor
Returns: torch.Tensor
"""
input_shape = inputs_embeds.size()[:-1]
sequence_length = input_shape[1]
position_ids = torch.arange(
self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
)
return position_ids.unsqueeze(0).expand(input_shape)
class XmodSelfAttention(nn.Module):
def __init__(self, config, position_embedding_type=None):
super().__init__()
if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
raise ValueError(
f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
f"heads ({config.num_attention_heads})"
)
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.query = nn.Linear(config.hidden_size, self.all_head_size)
self.key = nn.Linear(config.hidden_size, self.all_head_size)
self.value = nn.Linear(config.hidden_size, self.all_head_size)
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
self.position_embedding_type = position_embedding_type or getattr(
config, "position_embedding_type", "absolute"
)
if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
self.max_position_embeddings = config.max_position_embeddings
self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)
self.is_decoder = config.is_decoder
def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
x = x.view(new_x_shape)
return x.permute(0, 2, 1, 3)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor]:
pass
class XmodSelfOutput(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = hidden_states + input_tensor
return hidden_states
class XmodAttention(nn.Module):
def __init__(self, config, position_embedding_type=None):
super().__init__()
self.self = XmodSelfAttention(config, position_embedding_type=position_embedding_type)
self.output = XmodSelfOutput(config)
self.pruned_heads = set()
self.pre_norm = config.pre_norm
def prune_heads(self, heads):
if len(heads) == 0:
return
heads, index = find_pruneable_heads_and_indices(
heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
)
self.self.query = prune_linear_layer(self.self.query, index)
self.self.key = prune_linear_layer(self.self.key, index)
self.self.value = prune_linear_layer(self.self.value, index)
self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
self.pruned_heads = self.pruned_heads.union(heads)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor]:
residual = hidden_states
if self.pre_norm:
hidden_states = self.output.LayerNorm(hidden_states)
self_outputs = self.self(
hidden_states,
attention_mask,
head_mask,
encoder_hidden_states,
encoder_attention_mask,
past_key_value,
output_attentions,
)
attention_output = self.output(self_outputs[0], residual)
if not self.pre_norm:
attention_output = self.output.LayerNorm(attention_output)
outputs = (attention_output,) + self_outputs[1:]
return outputs
class XmodIntermediate(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states
class XmodAdapter(nn.Module):
def __init__(self, config):
super().__init__()
self.bottleneck_size = config.hidden_size // config.adapter_reduction_factor
self.dense1 = nn.Linear(config.hidden_size, self.bottleneck_size)
self.dense2 = nn.Linear(self.bottleneck_size, config.hidden_size)
if isinstance(config.hidden_act, str):
self.adapter_act_fn = ACT2FN[config.hidden_act]
else:
self.adapter_act_fn = config.hidden_act
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense1(hidden_states)
hidden_states = self.adapter_act_fn(hidden_states)
hidden_states = self.dense2(hidden_states)
return hidden_states
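To make the bottleneck concrete, a minimal shape check, assuming hidden_size=768 and adapter_reduction_factor=2 (illustrative values, not checkpoint defaults), with `XmodAdapter` as defined above in scope:
```
import torch
from types import SimpleNamespace

cfg = SimpleNamespace(hidden_size=768, adapter_reduction_factor=2, hidden_act="gelu")
adapter = XmodAdapter(cfg)
x = torch.randn(2, 5, 768)
assert adapter(x).shape == x.shape  # 768 -> 384 -> 768, shape-preserving
```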
class XmodOutput(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.ln_before_adapter = config.ln_before_adapter
self.dropout = nn.Dropout(config.hidden_dropout_prob)
if config.adapter_layer_norm:
self.adapter_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
else:
self.adapter_layer_norm = None
self.adapter_reuse_layer_norm = config.adapter_reuse_layer_norm
self.adapter_modules = nn.ModuleDict({})
for language in config.languages:
self.adapter_modules[str(language)] = XmodAdapter(config)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor, lang_ids: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = hidden_states + input_tensor
hidden_states = self.lang_adapter(lang_ids, hidden_states)
return hidden_states
def lang_adapter(self, lang_ids: torch.Tensor, hidden_states: torch.Tensor) -> torch.Tensor:
# Process consecutive samples that share the same lang_id in a single batched adapter call
lang_ids, lang_lengths = torch.unique_consecutive(lang_ids, return_counts=True)
if not self.ln_before_adapter:
residual = hidden_states
if self.adapter_layer_norm is not None:
hidden_states = self.adapter_layer_norm(hidden_states)
elif self.adapter_reuse_layer_norm:
hidden_states = self.LayerNorm(hidden_states)
if self.ln_before_adapter:
residual = hidden_states
split_hidden_states = torch.split(hidden_states, lang_lengths.tolist(), 0)
lang_wise_outputs = []
for i, (lang_id, split_hidden_state) in enumerate(zip(lang_ids, split_hidden_states)):
lang = list(self.adapter_modules.keys())[int(lang_id.item())]
lang_wise_outputs.append(self.adapter_modules[lang](split_hidden_state))
hidden_states = torch.cat(lang_wise_outputs, 0)
hidden_states = self.dropout(hidden_states)
hidden_states += residual
return hidden_states
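A small sketch of how the routing above groups samples (language ids are illustrative):
```
import torch

lang_ids = torch.tensor([0, 0, 1, 1, 1])
ids, lengths = torch.unique_consecutive(lang_ids, return_counts=True)
# ids -> tensor([0, 1]); lengths -> tensor([2, 3])
# hidden_states is then torch.split into chunks of sizes [2, 3]; each chunk
# passes through its language's adapter before being concatenated back in order.
```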
class XmodLayer(nn.Module):
def __init__(self, config):
super().__init__()
self.chunk_size_feed_forward = config.chunk_size_feed_forward
self.seq_len_dim = 1
self.attention = XmodAttention(config)
self.is_decoder = config.is_decoder
self.add_cross_attention = config.add_cross_attention
if self.add_cross_attention:
if not self.is_decoder:
raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
self.crossattention = XmodAttention(config, position_embedding_type="absolute")
self.intermediate = XmodIntermediate(config)
self.output = XmodOutput(config)
self.pre_norm = config.pre_norm
def forward(
self,
hidden_states: torch.Tensor,
lang_ids: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor]:
self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
self_attention_outputs = self.attention(
hidden_states,
attention_mask,
head_mask,
output_attentions=output_attentions,
past_key_value=self_attn_past_key_value,
)
attention_output = self_attention_outputs[0]
if self.is_decoder:
outputs = self_attention_outputs[1:-1]
present_key_value = self_attention_outputs[-1]
else:
outputs = self_attention_outputs[1:]
cross_attn_present_key_value = None
if self.is_decoder and encoder_hidden_states is not None:
if not hasattr(self, "crossattention"):
raise ValueError(
f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers"
" by setting `config.add_cross_attention=True`"
)
cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
cross_attention_outputs = self.crossattention(
attention_output,
attention_mask,
head_mask,
encoder_hidden_states,
encoder_attention_mask,
cross_attn_past_key_value,
output_attentions,
)
attention_output = cross_attention_outputs[0]
outputs = outputs + cross_attention_outputs[1:-1]
cross_attn_present_key_value = cross_attention_outputs[-1]
present_key_value = present_key_value + cross_attn_present_key_value
residual = attention_output
if self.pre_norm:
attention_output = self.output.LayerNorm(attention_output)
intermediate_output = apply_chunking_to_forward(
self.feed_forward_chunk,
self.chunk_size_feed_forward,
self.seq_len_dim,
attention_output,
)
layer_output = self.output(intermediate_output, residual, lang_ids)
if not self.pre_norm:
layer_output = self.output.LayerNorm(layer_output)
outputs = (layer_output,) + outputs
if self.is_decoder:
outputs = outputs + (present_key_value,)
return outputs
def feed_forward_chunk(self, attention_output):
return self.intermediate(attention_output)
class XmodEncoder(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
self.layer = nn.ModuleList([XmodLayer(config) for _ in range(config.num_hidden_layers)])
self.is_pre_norm = config.pre_norm
if self.is_pre_norm:
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.gradient_checkpointing = False
def forward(
self,
hidden_states: torch.Tensor,
lang_ids: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = False,
output_hidden_states: Optional[bool] = False,
return_dict: Optional[bool] = True,
):
pass
class XmodPooler(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.activation = nn.Tanh()
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
first_token_tensor = hidden_states[:, 0]
pooled_output = self.dense(first_token_tensor)
pooled_output = self.activation(pooled_output)
return pooled_output
class XmodPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = XmodConfig
base_model_prefix = "roberta"
supports_gradient_checkpointing = True
def _init_weights(self, module):
"""Initialize the weights"""
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
def set_default_language(self, language: str):
if language not in self.config.languages:
raise ValueError(
f"{self} does not have an adapter for {language}. Supported languages: {list(self.config.languages)}"
)
self.config.default_language = language
def freeze_embeddings_and_language_adapters(self):
logger.info("Freezing embeddings")
for parameter in self.roberta.embeddings.parameters():
parameter.requires_grad = False
logger.info("Freezing adapters")
for layer in self.roberta.encoder.layer:
if layer.output.adapter_layer_norm is not None:
for parameter in layer.output.adapter_layer_norm.parameters():
parameter.requires_grad = False
for parameter in layer.output.adapter_modules.parameters():
parameter.requires_grad = False
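A hedged usage sketch of the two helpers above (the checkpoint name and language code come from the archive list and are used here illustratively):
```
from transformers import XmodForSequenceClassification

model = XmodForSequenceClassification.from_pretrained("facebook/xmod-base", num_labels=2)
model.set_default_language("en_XX")
model.freeze_embeddings_and_language_adapters()  # fine-tune only the shared transformer weights
```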
XMOD_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `({0})`):
Indices of input sequence tokens in the vocabulary.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
lang_ids (`torch.LongTensor` of shape `({0})`, *optional*):
Indices of the language adapters that should be activated for each sample, respectively. Defaults to the
index that corresponds to `self.config.default_language`.
attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`:
- 0 corresponds to a *sentence A* token,
- 1 corresponds to a *sentence B* token.
[What are token type IDs?](../glossary#token-type-ids)
position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
Indices of positions of each input sequence token in the position embeddings. Selected in the range
`[0, config.max_position_embeddings - 1]`.
[What are position IDs?](../glossary#position-ids)
head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
model's internal embedding lookup matrix.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@add_start_docstrings(
"The bare X-MOD Model transformer outputting raw hidden-states without any specific head on top.",
XMOD_START_DOCSTRING,
)
class XmodModel(XmodPreTrainedModel):
"""
The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
cross-attention is added between the self-attention layers, following the architecture described in *Attention is
all you need*_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz
Kaiser and Illia Polosukhin.
To behave as a decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
to `True`. To be used in a Seq2Seq model, the model needs to be initialized with both the `is_decoder` argument and
`add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.
.. _*Attention is all you need*: https://arxiv.org/abs/1706.03762
"""
def __init__(self, config, add_pooling_layer=True):
super().__init__(config)
self.config = config
self.embeddings = XmodEmbeddings(config)
self.encoder = XmodEncoder(config)
self.pooler = XmodPooler(config) if add_pooling_layer else None
self.post_init()
def get_input_embeddings(self):
return self.embeddings.word_embeddings
def set_input_embeddings(self, value):
self.embeddings.word_embeddings = value
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
@add_start_docstrings_to_model_forward(XMOD_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
lang_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
pass
@add_start_docstrings(
"X-MOD Model with a `language modeling` head on top for CLM fine-tuning.",
XMOD_START_DOCSTRING,
)
class XmodForCausalLM(XmodPreTrainedModel):
_tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]
def __init__(self, config):
super().__init__(config)
if not config.is_decoder:
logger.warning("If you want to use `XmodLMHeadModel` as a standalone, add `is_decoder=True.`")
self.roberta = XmodModel(config, add_pooling_layer=False)
self.lm_head = XmodLMHead(config)
self.post_init()
def get_output_embeddings(self):
return self.lm_head.decoder
def set_output_embeddings(self, new_embeddings):
self.lm_head.decoder = new_embeddings
@add_start_docstrings_to_model_forward(XMOD_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
lang_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
past_key_values: Tuple[Tuple[torch.FloatTensor]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
pass
def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, **model_kwargs):
input_shape = input_ids.shape
if attention_mask is None:
attention_mask = input_ids.new_ones(input_shape)
if past_key_values is not None:
past_length = past_key_values[0][0].shape[2]
if input_ids.shape[1] > past_length:
remove_prefix_length = past_length
else:
remove_prefix_length = input_ids.shape[1] - 1
input_ids = input_ids[:, remove_prefix_length:]
return {"input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": past_key_values}
def _reorder_cache(self, past_key_values, beam_idx):
reordered_past = ()
for layer_past in past_key_values:
reordered_past += (
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
)
return reordered_past
@add_start_docstrings(
"""X-MOD Model with a `language modeling` head on top.""",
XMOD_START_DOCSTRING,
)
class XmodForMaskedLM(XmodPreTrainedModel):
_tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]
def __init__(self, config):
super().__init__(config)
if config.is_decoder:
logger.warning(
"If you want to use `XmodForMaskedLM` make sure `config.is_decoder=False` for "
"bi-directional self-attention."
)
self.roberta = XmodModel(config, add_pooling_layer=False)
self.lm_head = XmodLMHead(config)
self.post_init()
def get_output_embeddings(self):
return self.lm_head.decoder
def set_output_embeddings(self, new_embeddings):
self.lm_head.decoder = new_embeddings
@add_start_docstrings_to_model_forward(XMOD_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
lang_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], MaskedLMOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
kwargs (`Dict[str, any]`, optional, defaults to *{}*):
Used to hide legacy arguments that have been deprecated.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.roberta(
input_ids,
lang_ids=lang_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
prediction_scores = self.lm_head(sequence_output)
masked_lm_loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
if not return_dict:
output = (prediction_scores,) + outputs[2:]
return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
return MaskedLMOutput(
loss=masked_lm_loss,
logits=prediction_scores,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
class XmodLMHead(nn.Module):
"""Roberta Head for masked language modeling."""
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.decoder = nn.Linear(config.hidden_size, config.vocab_size)
self.bias = nn.Parameter(torch.zeros(config.vocab_size))
self.decoder.bias = self.bias
def forward(self, features, **kwargs):
x = self.dense(features)
x = gelu(x)
x = self.layer_norm(x)
x = self.decoder(x)
return x
def _tie_weights(self):
if self.decoder.bias.device.type == "meta":
self.decoder.bias = self.bias
else:
self.bias = self.decoder.bias
@add_start_docstrings(
"""
X-MOD Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
output) e.g. for GLUE tasks.
""",
XMOD_START_DOCSTRING,
)
class XmodForSequenceClassification(XmodPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.config = config
self.roberta = XmodModel(config, add_pooling_layer=False)
self.classifier = XmodClassificationHead(config)
self.post_init()
@add_start_docstrings_to_model_forward(XMOD_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
lang_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.roberta(
input_ids,
lang_ids=lang_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
logits = self.classifier(sequence_output)
loss = None
if labels is not None:
if self.config.problem_type is None:
if self.num_labels == 1:
self.config.problem_type = "regression"
elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
self.config.problem_type = "single_label_classification"
else:
self.config.problem_type = "multi_label_classification"
if self.config.problem_type == "regression":
loss_fct = MSELoss()
if self.num_labels == 1:
loss = loss_fct(logits.squeeze(), labels.squeeze())
else:
loss = loss_fct(logits, labels)
elif self.config.problem_type == "single_label_classification":
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(logits, labels)
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return SequenceClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
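A sketch of how `problem_type` is inferred from the labels above (tensor values are illustrative):
```
import torch

labels = torch.tensor([0, 2, 1])  # dtype torch.long with num_labels > 1 -> "single_label_classification"
ml_labels = torch.tensor([[1.0, 0.0, 1.0]])  # float labels -> "multi_label_classification"
# num_labels == 1 selects "regression" (MSELoss) regardless of label dtype
```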
"""
X-MOD Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
softmax) e.g. for RocStories/SWAG tasks.
"""
@add_start_docstrings(
"""
X-MOD Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
softmax) e.g. for RocStories/SWAG tasks.
""",
XMOD_START_DOCSTRING,
)
class XmodForMultipleChoice(XmodPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.roberta = XmodModel(config)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, 1)
self.post_init()
@add_start_docstrings_to_model_forward(XMOD_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
lang_ids: Optional[torch.LongTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], MultipleChoiceModelOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
`input_ids` above)
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
flat_lang_ids = lang_ids.repeat(input_ids.size(0) * input_ids.size(1)) if lang_ids is not None else None
flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
flat_inputs_embeds = (
inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
if inputs_embeds is not None
else None
)
outputs = self.roberta(
flat_input_ids,
lang_ids=flat_lang_ids,
position_ids=flat_position_ids,
token_type_ids=flat_token_type_ids,
attention_mask=flat_attention_mask,
head_mask=head_mask,
inputs_embeds=flat_inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
pooled_output = outputs[1]
pooled_output = self.dropout(pooled_output)
logits = self.classifier(pooled_output)
reshaped_logits = logits.view(-1, num_choices)
loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
loss = loss_fct(reshaped_logits, labels)
if not return_dict:
output = (reshaped_logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return MultipleChoiceModelOutput(
loss=loss,
logits=reshaped_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
"""
X-MOD Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
Named-Entity-Recognition (NER) tasks.
"""
@add_start_docstrings(
"""
XMOD_START_DOCSTRING,
"""
)
class XmodForTokenClassification(XmodPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.roberta = XmodModel(config, add_pooling_layer=False)
classifier_dropout = (
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
self.dropout = nn.Dropout(classifier_dropout)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.post_init()
@add_start_docstrings_to_model_forward(XMOD_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
lang_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.roberta(
input_ids,
lang_ids=lang_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
sequence_output = self.dropout(sequence_output)
logits = self.classifier(sequence_output)
loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return TokenClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
class XmodClassificationHead(nn.Module):
"""Head for sentence-level classification tasks."""
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
classifier_dropout = (
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
self.dropout = nn.Dropout(classifier_dropout)
self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
def forward(self, features, **kwargs):
x = features[:, 0, :]
x = self.dropout(x)
x = self.dense(x)
x = torch.tanh(x)
x = self.dropout(x)
x = self.out_proj(x)
return x
@add_start_docstrings(
"""
X-MOD Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
XMOD_START_DOCSTRING,
)
class XmodForQuestionAnswering(XmodPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.roberta = XmodModel(config, add_pooling_layer=False)
self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
self.post_init()
@add_start_docstrings_to_model_forward(XMOD_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
lang_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
start_positions: Optional[torch.LongTensor] = None,
end_positions: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], QuestionAnsweringModelOutput]:
r"""
start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.roberta(
input_ids,
lang_ids=lang_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
logits = self.qa_outputs(sequence_output)
start_logits, end_logits = logits.split(1, dim=-1)
start_logits = start_logits.squeeze(-1).contiguous()
end_logits = end_logits.squeeze(-1).contiguous()
total_loss = None
if start_positions is not None and end_positions is not None:
if len(start_positions.size()) > 1:
start_positions = start_positions.squeeze(-1)
if len(end_positions.size()) > 1:
end_positions = end_positions.squeeze(-1)
ignored_index = start_logits.size(1)
start_positions = start_positions.clamp(0, ignored_index)
end_positions = end_positions.clamp(0, ignored_index)
loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
start_loss = loss_fct(start_logits, start_positions)
end_loss = loss_fct(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2
if not return_dict:
output = (start_logits, end_logits) + outputs[2:]
return ((total_loss,) + output) if total_loss is not None else output
return QuestionAnsweringModelOutput(
loss=total_loss,
start_logits=start_logits,
end_logits=end_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
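For orientation, a minimal sketch of turning the two logit vectors into a greedy answer span (the tensors are illustrative):
```
import torch

start_logits = torch.tensor([[0.1, 2.0, 0.3, 0.1]])
end_logits = torch.tensor([[0.1, 0.2, 3.0, 0.1]])
start = start_logits.argmax(-1).item()  # 1
end = end_logits.argmax(-1).item()      # 2
# The predicted answer covers input tokens start..end inclusive.
```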
def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
"""
Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
are ignored. This is modified from fairseq's `utils.make_positions`.
Args:
input_ids: torch.Tensor Input tensor whose non-padding elements are assigned position numbers.
padding_idx: int Index of the padding symbol; non-padding symbols are replaced by their position numbers.
past_key_values_length: int Length of the cached past key values, used to offset the incremental indices.
Returns:
torch.Tensor A tensor of the same shape as the input, with non-padding symbols replaced by their position numbers.
"""
mask = input_ids.ne(padding_idx).int()
incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
return incremental_indices.long() + padding_idx
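A worked example of the arithmetic above, assuming padding_idx=1 (RoBERTa's convention, which X-MOD inherits):
```
import torch

input_ids = torch.tensor([[5, 17, 23, 1, 1]])  # 1 is the pad id here
mask = input_ids.ne(1).int()  # [[1, 1, 1, 0, 0]]
incremental = torch.cumsum(mask, dim=1).type_as(mask) * mask  # [[1, 2, 3, 0, 0]]
position_ids = incremental.long() + 1  # [[2, 3, 4, 1, 1]]
# Non-padding positions count up from padding_idx + 1; pads keep padding_idx.
```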