Transformers Source Code Analysis (Part 37)
.\models\deprecated\transfo_xl\convert_transfo_xl_original_tf_checkpoint_to_pytorch.py
"""转换 Transformer XL 检查点和数据集。"""
import argparse
import os
import pickle
import sys
import torch
from transformers import TransfoXLConfig, TransfoXLLMHeadModel, load_tf_weights_in_transfo_xl
from transformers.models.deprecated.transfo_xl import tokenization_transfo_xl as data_utils
from transformers.models.deprecated.transfo_xl.tokenization_transfo_xl import CORPUS_NAME, VOCAB_FILES_NAMES
from transformers.utils import CONFIG_NAME, WEIGHTS_NAME, logging
logging.set_verbosity_info()
data_utils.Vocab = data_utils.TransfoXLTokenizer
data_utils.Corpus = data_utils.TransfoXLCorpus
sys.modules["data_utils"] = data_utils
sys.modules["vocabulary"] = data_utils
def convert_transfo_xl_checkpoint_to_pytorch(
tf_checkpoint_path, transfo_xl_config_file, pytorch_dump_folder_path, transfo_xl_dataset_file
):
if transfo_xl_dataset_file:
with open(transfo_xl_dataset_file, "rb") as fp:
corpus = pickle.load(fp, encoding="latin1")
pytorch_vocab_dump_path = pytorch_dump_folder_path + "/" + VOCAB_FILES_NAMES["pretrained_vocab_file"]
print(f"Save vocabulary to {pytorch_vocab_dump_path}")
corpus_vocab_dict = corpus.vocab.__dict__
torch.save(corpus_vocab_dict, pytorch_vocab_dump_path)
corpus_dict_no_vocab = corpus.__dict__
corpus_dict_no_vocab.pop("vocab", None)
pytorch_dataset_dump_path = pytorch_dump_folder_path + "/" + CORPUS_NAME
print(f"Save dataset to {pytorch_dataset_dump_path}")
torch.save(corpus_dict_no_vocab, pytorch_dataset_dump_path)
if tf_checkpoint_path:
config_path = os.path.abspath(transfo_xl_config_file)
tf_path = os.path.abspath(tf_checkpoint_path)
print(f"Converting Transformer XL checkpoint from {tf_path} with config at {config_path}.")
if transfo_xl_config_file == "":
config = TransfoXLConfig()
else:
config = TransfoXLConfig.from_json_file(transfo_xl_config_file)
print(f"Building PyTorch model from configuration: {config}")
model = TransfoXLLMHeadModel(config)
model = load_tf_weights_in_transfo_xl(model, config, tf_path)
pytorch_weights_dump_path = os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME)
pytorch_config_dump_path = os.path.join(pytorch_dump_folder_path, CONFIG_NAME)
print(f"Save PyTorch model to {os.path.abspath(pytorch_weights_dump_path)}")
torch.save(model.state_dict(), pytorch_weights_dump_path)
print(f"Save configuration file to {os.path.abspath(pytorch_config_dump_path)}")
with open(pytorch_config_dump_path, "w", encoding="utf-8") as f:
f.write(config.to_json_string())
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--pytorch_dump_folder_path",
default=None,
type=str,
required=True,
help="Path to the folder to store the PyTorch model or dataset/vocab.",
)
parser.add_argument(
"--tf_checkpoint_path",
default="",
type=str,
help="An optional path to a TensorFlow checkpoint path to be converted.",
)
parser.add_argument(
"--transfo_xl_config_file",
default="",
type=str,
help=(
"An optional config json file corresponding to the pre-trained BERT model. \n"
"This specifies the model architecture."
),
)
parser.add_argument(
"--transfo_xl_dataset_file",
default="",
type=str,
help="An optional dataset file to be converted in a vocabulary.",
)
args = parser.parse_args()
convert_transfo_xl_checkpoint_to_pytorch(
args.tf_checkpoint_path,
args.transfo_xl_config_file,
args.pytorch_dump_folder_path,
args.transfo_xl_dataset_file,
)
.\models\deprecated\transfo_xl\modeling_tf_transfo_xl.py
"""
TF 2.0 Transformer XL model.
"""
from __future__ import annotations
from dataclasses import dataclass
from typing import List, Optional, Tuple, Union
import numpy as np
import tensorflow as tf
from ....modeling_tf_utils import (
TFModelInputType,
TFPreTrainedModel,
TFSequenceClassificationLoss,
get_initializer,
keras,
keras_serializable,
unpack_inputs,
)
from ....tf_utils import shape_list, stable_softmax
from ....utils import (
ModelOutput,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
)
from .configuration_transfo_xl import TransfoXLConfig
from .modeling_tf_transfo_xl_utilities import TFAdaptiveSoftmaxMask
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "transfo-xl/transfo-xl-wt103"
_CONFIG_FOR_DOC = "TransfoXLConfig"
TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST = [
"transfo-xl/transfo-xl-wt103",
]
class TFPositionalEmbedding(keras.layers.Layer):
def __init__(self, demb, **kwargs):
super().__init__(**kwargs)
self.inv_freq = 1 / (10000 ** (tf.range(0, demb, 2.0) / demb))
def call(self, pos_seq, bsz=None):
self.inv_freq = tf.cast(self.inv_freq, dtype=pos_seq.dtype)
sinusoid_inp = tf.einsum("i,j->ij", pos_seq, self.inv_freq)
pos_emb = tf.concat([tf.sin(sinusoid_inp), tf.cos(sinusoid_inp)], -1)
if bsz is not None:
return tf.tile(pos_emb[:, None, :], [1, bsz, 1])
else:
return pos_emb[:, None, :]
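# A quick standalone check (not part of the library source) of the sinusoidal table built by
# TFPositionalEmbedding above, using toy sizes demb=8, klen=5, bsz=2:
import tensorflow as tf

demb, klen, bsz = 8, 5, 2
inv_freq = 1 / (10000 ** (tf.range(0, demb, 2.0) / demb))               # [demb // 2]
pos_seq = tf.range(float(klen - 1), -1.0, -1.0)                         # descending positions, as the main layer passes them
sinusoid_inp = tf.einsum("i,j->ij", pos_seq, inv_freq)                  # [klen, demb // 2]
pos_emb = tf.concat([tf.sin(sinusoid_inp), tf.cos(sinusoid_inp)], -1)   # [klen, demb]
print(tf.tile(pos_emb[:, None, :], [1, bsz, 1]).shape)                  # (5, 2, 8) -> [klen, bsz, demb]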
class TFPositionwiseFF(keras.layers.Layer):
def __init__(self, d_model, d_inner, dropout, pre_lnorm=False, layer_norm_epsilon=1e-5, init_std=0.02, **kwargs):
super().__init__(**kwargs)
self.d_model = d_model
self.d_inner = d_inner
self.dropout = dropout
self.layer_1 = keras.layers.Dense(
d_inner, kernel_initializer=get_initializer(init_std), activation=tf.nn.relu, name="CoreNet_._0"
)
self.drop_1 = keras.layers.Dropout(dropout)
self.layer_2 = keras.layers.Dense(d_model, kernel_initializer=get_initializer(init_std), name="CoreNet_._3")
self.drop_2 = keras.layers.Dropout(dropout)
self.layer_norm = keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name="layer_norm")
self.pre_lnorm = pre_lnorm
def call(self, inp, training=False):
if self.pre_lnorm:
core_out = self.layer_norm(inp)
core_out = self.layer_1(core_out)
core_out = self.drop_1(core_out, training=training)
core_out = self.layer_2(core_out)
core_out = self.drop_2(core_out, training=training)
output = core_out + inp
else:
core_out = self.layer_1(inp)
core_out = self.drop_1(core_out, training=training)
core_out = self.layer_2(core_out)
core_out = self.drop_2(core_out, training=training)
output = self.layer_norm(inp + core_out)
return output
class TFRelPartialLearnableMultiHeadAttn(keras.layers.Layer):
def __init__(
self,
n_head,
d_model,
d_head,
d_inner,
dropout,
dropatt=0.0,
pre_lnorm=False,
r_w_bias=None,
r_r_bias=None,
layer_norm_epsilon=1e-5,
init_std=0.02,
output_attentions=False,
**kwargs,
):
super().__init__(**kwargs)
self.n_head = n_head
self.d_model = d_model
self.d_head = d_head
self.dropout = dropout
self.output_attentions = output_attentions
self.qkv_net = keras.layers.Dense(
3 * n_head * d_head,
kernel_initializer=get_initializer(init_std),
use_bias=False,
name="qkv_net"
)
self.drop = keras.layers.Dropout(dropout)
self.dropatt = keras.layers.Dropout(dropatt)
self.o_net = keras.layers.Dense(
d_model,
kernel_initializer=get_initializer(init_std),
use_bias=False,
name="o_net"
)
self.layer_norm = keras.layers.LayerNormalization(
epsilon=layer_norm_epsilon,
name="layer_norm"
)
self.scale = 1 / (d_head**0.5)
self.pre_lnorm = pre_lnorm
if r_r_bias is not None and r_w_bias is not None:
self.r_r_bias = r_r_bias
self.r_w_bias = r_w_bias
else:
self.r_r_bias = None
self.r_w_bias = None
self.r_net = keras.layers.Dense(
self.n_head * self.d_head,
kernel_initializer=get_initializer(init_std),
use_bias=False,
name="r_net"
)
def build(self, input_shape):
if self.r_r_bias is None or self.r_w_bias is None:
self.r_r_bias = self.add_weight(
shape=(self.n_head, self.d_head),
initializer="zeros",
trainable=True,
name="r_r_bias"
)
self.r_w_bias = self.add_weight(
shape=(self.n_head, self.d_head),
initializer="zeros",
trainable=True,
name="r_w_bias"
)
super().build(input_shape)
def _rel_shift(self, x):
x_size = shape_list(x)
x = tf.pad(x, [[0, 0], [1, 0], [0, 0], [0, 0]])
x = tf.reshape(x, [x_size[1] + 1, x_size[0], x_size[2], x_size[3]])
x = tf.slice(x, [1, 0, 0, 0], [-1, -1, -1, -1])
x = tf.reshape(x, x_size)
return x
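# A toy numeric check (not library code) of the pad/reshape/slice trick implemented by `_rel_shift` above,
# using the [qlen, klen, bsz, n_head] layout of the TF attention-score tensor:
import tensorflow as tf

qlen, klen, bsz, n_head = 2, 3, 1, 1
x = tf.reshape(tf.range(qlen * klen, dtype=tf.float32), [qlen, klen, bsz, n_head])

x_size = [qlen, klen, bsz, n_head]
shifted = tf.pad(x, [[0, 0], [1, 0], [0, 0], [0, 0]])          # prepend a zero column on the key axis
shifted = tf.reshape(shifted, [x_size[1] + 1, x_size[0], x_size[2], x_size[3]])
shifted = tf.slice(shifted, [1, 0, 0, 0], [-1, -1, -1, -1])    # drop the first row
shifted = tf.reshape(shifted, x_size)

print(tf.squeeze(x).numpy())        # [[0. 1. 2.] [3. 4. 5.]]
print(tf.squeeze(shifted).numpy())  # [[1. 2. 0.] [3. 4. 5.]] -> row i ends up shifted left by (qlen - 1 - i)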
class TFRelPartialLearnableDecoderLayer(keras.layers.Layer):
    def __init__(
        self,
        n_head,
        d_model,
        d_head,
        d_inner,
        dropout,
        dropatt=0.0,
        pre_lnorm=False,
        r_w_bias=None,
        r_r_bias=None,
        layer_norm_epsilon=1e-5,
        init_std=0.02,
        output_attentions=False,
        **kwargs,
    ):
super().__init__(**kwargs)
self.dec_attn = TFRelPartialLearnableMultiHeadAttn(
n_head,
d_model,
d_head,
dropout,
dropatt=dropatt,
pre_lnorm=pre_lnorm,
r_w_bias=r_w_bias,
r_r_bias=r_r_bias,
init_std=init_std,
layer_norm_epsilon=layer_norm_epsilon,
output_attentions=output_attentions,
name="dec_attn",
)
self.pos_ff = TFPositionwiseFF(
d_model,
d_inner,
dropout,
pre_lnorm=pre_lnorm,
init_std=init_std,
layer_norm_epsilon=layer_norm_epsilon,
name="pos_ff",
)
def call(self, dec_inp, r, dec_attn_mask, mems, head_mask, output_attentions, training=False):
attn_outputs = self.dec_attn(dec_inp, r, dec_attn_mask, mems, head_mask, output_attentions, training=training)
ff_output = self.pos_ff(attn_outputs[0], training=training)
outputs = [ff_output] + attn_outputs[1:]
return outputs
class TFTransfoEmbeddings(keras.layers.Layer):
def __init__(self, vocab_size, emb_size, init_std, **kwargs):
super().__init__(**kwargs)
self.vocab_size = vocab_size
self.emb_size = emb_size
self.init_std = init_std
def build(self, input_shape):
self.weight = self.add_weight(
shape=(self.vocab_size, self.emb_size),
initializer=get_initializer(self.init_std),
name="embeddings",
)
super().build(input_shape)
def call(self, inputs):
return tf.gather(self.weight, inputs)
class TFAdaptiveEmbedding(keras.layers.Layer):
def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, init_std=0.02, sample_softmax=False, **kwargs):
super().__init__(**kwargs)
self.n_token = n_token
self.d_embed = d_embed
self.init_std = init_std
self.cutoffs = cutoffs + [n_token]
self.div_val = div_val
self.d_proj = d_proj
self.emb_scale = d_proj**0.5
self.cutoff_ends = [0] + self.cutoffs
self.emb_layers = []
self.emb_projs = []
if div_val == 1:
raise NotImplementedError
else:
for i in range(len(self.cutoffs)):
l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1]
d_emb_i = d_embed // (div_val**i)
self.emb_layers.append(
TFTransfoEmbeddings(
r_idx - l_idx,
d_emb_i,
init_std,
name=f"emb_layers_._{i}",
)
)
def build(self, input_shape):
for i in range(len(self.cutoffs)):
d_emb_i = self.d_embed // (self.div_val**i)
self.emb_projs.append(
self.add_weight(
shape=(d_emb_i, self.d_proj),
initializer=get_initializer(self.init_std),
trainable=True,
name=f"emb_projs_._{i}",
)
)
super().build(input_shape)
def call(self, inp):
if self.div_val == 1:
raise NotImplementedError
else:
inp_flat = tf.reshape(inp, (-1,))
emb_flat = tf.zeros([shape_list(inp_flat)[0], self.d_proj])
for i in range(len(self.cutoffs)):
l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1]
mask_i = (inp_flat >= l_idx) & (inp_flat < r_idx)
inp_i = tf.boolean_mask(inp_flat, mask_i) - l_idx
emb_i = self.emb_layers[i](inp_i)
emb_i = tf.einsum("id,de->ie", emb_i, self.emb_projs[i])
mask_idx = tf.where(mask_i)
scatter = tf.scatter_nd(mask_idx, emb_i, shape_list(emb_flat))
emb_flat = tf.cast(emb_flat, dtype=scatter.dtype)
emb_flat += scatter
embed_shape = shape_list(inp) + [self.d_proj]
embed = tf.reshape(emb_flat, embed_shape)
embed *= self.emb_scale
return embed
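# A small illustration (toy numbers) of how `cutoffs` and `div_val` above split the vocabulary into
# per-bucket embedding tables of shrinking width; `TransfoXLConfig` defaults to
# cutoffs=[20000, 40000, 200000] with div_val=4:
n_token, d_embed, div_val = 12, 16, 2
cutoffs = [4, 8]                                   # toy cutoffs
cutoff_ends = [0] + cutoffs + [n_token]            # [0, 4, 8, 12]
for i in range(len(cutoff_ends) - 1):
    l_idx, r_idx = cutoff_ends[i], cutoff_ends[i + 1]
    d_emb_i = d_embed // (div_val**i)
    print(f"bucket {i}: token ids [{l_idx}, {r_idx}) -> embedding table of shape ({r_idx - l_idx}, {d_emb_i})")
# bucket 0: token ids [0, 4)  -> embedding table of shape (4, 16)
# bucket 1: token ids [4, 8)  -> embedding table of shape (4, 8)
# bucket 2: token ids [8, 12) -> embedding table of shape (4, 4)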
@keras_serializable
class TFTransfoXLMainLayer(keras.layers.Layer):
config_class = TransfoXLConfig
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.config = config
self.output_hidden_states = config.output_hidden_states
self.output_attentions = config.output_attentions
self.return_dict = config.use_return_dict
self.n_token = config.vocab_size
self.d_embed = config.d_embed
self.d_model = config.d_model
self.n_head = config.n_head
self.d_head = config.d_head
self.untie_r = config.untie_r
self.word_emb = TFAdaptiveEmbedding(
config.vocab_size,
config.d_embed,
config.d_model,
config.cutoffs,
div_val=config.div_val,
init_std=config.init_std,
name="word_emb",
)
self.drop = keras.layers.Dropout(config.dropout)
self.n_layer = config.n_layer
self.mem_len = config.mem_len
self.attn_type = config.attn_type
self.layers = []
if config.attn_type == 0:
for i in range(config.n_layer):
self.layers.append(
TFRelPartialLearnableDecoderLayer(
config.n_head,
config.d_model,
config.d_head,
config.d_inner,
config.dropout,
dropatt=config.dropatt,
pre_lnorm=config.pre_lnorm,
r_w_bias=None if self.untie_r else self.r_w_bias,
r_r_bias=None if self.untie_r else self.r_r_bias,
layer_norm_epsilon=config.layer_norm_epsilon,
init_std=config.init_std,
output_attentions=self.output_attentions,
name=f"layers_._{i}",
)
)
else:
raise NotImplementedError
self.same_length = config.same_length
self.clamp_len = config.clamp_len
if self.attn_type == 0:
self.pos_emb = TFPositionalEmbedding(self.d_model, name="pos_emb")
else:
raise NotImplementedError
def build(self, input_shape):
if not self.untie_r:
self.r_w_bias = self.add_weight(
shape=(self.n_head, self.d_head), initializer="zeros", trainable=True, name="r_w_bias"
)
self.r_r_bias = self.add_weight(
shape=(self.n_head, self.d_head), initializer="zeros", trainable=True, name="r_r_bias"
)
super().build(input_shape)
def get_input_embeddings(self):
return self.word_emb
def set_input_embeddings(self, value):
raise NotImplementedError
def backward_compatible(self):
self.sample_softmax = -1
def reset_memory_length(self, mem_len):
self.mem_len = mem_len
def _prune_heads(self, heads):
raise NotImplementedError
def init_mems(self, bsz):
if self.mem_len > 0:
mems = []
for i in range(self.n_layer):
empty = tf.zeros([self.mem_len, bsz, self.d_model])
mems.append(empty)
return mems
else:
return None
def _update_mems(self, hids, mems, mlen, qlen):
if mems is None:
return None
assert len(hids) == len(mems), "len(hids) != len(mems)"
new_mems = []
end_idx = mlen + tf.math.maximum(0, qlen)
beg_idx = tf.math.maximum(0, end_idx - tf.convert_to_tensor(self.mem_len))
for i in range(len(hids)):
mems[i] = tf.cast(mems[i], dtype=hids[i].dtype)
cat = tf.concat([mems[i], hids[i]], axis=0)
tf.stop_gradient(cat)
new_mems.append(cat[beg_idx:end_idx])
return new_mems
def call(
self,
input_ids: TFModelInputType | None = None,
mems: List[tf.Tensor] | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: np.ndarray | tf.Tensor | None = None,
        training: bool = False,
    ):
        ...
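# A standalone sketch (toy sizes, not library code) of the sliding-window update performed by `init_mems`
# and `_update_mems` above: the current segment's hidden states are appended to the cached states and only
# the most recent `mem_len` positions are kept:
import tensorflow as tf

mem_len, qlen, bsz, d_model = 3, 2, 1, 4
mems = tf.zeros([mem_len, bsz, d_model])             # what init_mems returns for one layer
hids = tf.ones([qlen, bsz, d_model])                 # hidden states of the current segment

cat = tf.concat([mems, hids], axis=0)                # [mem_len + qlen, bsz, d_model]
end_idx = mem_len + tf.math.maximum(0, qlen)
beg_idx = tf.math.maximum(0, end_idx - tf.convert_to_tensor(mem_len))
new_mems = cat[beg_idx:end_idx]                      # keep only the most recent mem_len positions
print(new_mems.shape)                                # (3, 1, 4)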
class TFTransfoXLPreTrainedModel(TFPreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = TransfoXLConfig
base_model_prefix = "transformer"
@dataclass
class TFTransfoXLModelOutput(ModelOutput):
"""
Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).
Args:
last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
mems (`List[tf.Tensor]` of length `config.n_layers`):
Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `mems`
input) to speed up sequential decoding. The token ids which have their past given to this model should not
be passed as input ids as they have already been computed.
hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
last_hidden_state: tf.Tensor = None
mems: List[tf.Tensor] = None
hidden_states: Tuple[tf.Tensor] | None = None
attentions: Tuple[tf.Tensor] | None = None
@dataclass
class TFTransfoXLLMHeadModelOutput(ModelOutput):
"""
Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).
"""
"""
Args:
losses (`tf.Tensor` of shape *(batch_size, sequence_length-1)*, *optional*, returned when `labels` is provided):
Language modeling losses (not reduced).
语言建模的损失(未归并),形状为 *(batch_size, sequence_length-1)*,在提供 `labels` 时返回。
prediction_scores (`tf.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token after SoftMax).
语言建模头部的预测分数,形状为 `(batch_size, sequence_length, config.vocab_size)`,
表示每个词汇标记的预测分数(经过 SoftMax 后的分数)。
mems (`List[tf.Tensor]` of length `config.n_layers`):
Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `mems`
input) to speed up sequential decoding. The token ids which have their past given to this model should not
be passed as input ids as they have already been computed.
包含预先计算的隐藏状态(注意力块中的键和值)。长度为 `config.n_layers` 的列表,
可用于加速序列解码。已经计算过的 token id 不应该作为输入传递给模型。
hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
模型在每一层输出的隐藏状态,以及初始嵌入输出的元组。当设置 `output_hidden_states=True` 或 `config.output_hidden_states=True` 时返回。
attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
注意力 softmax 后的注意力权重,用于计算自注意力头中的加权平均。当设置 `output_attentions=True` 或 `config.output_attentions=True` 时返回。
"""
prediction_scores: tf.Tensor = None
mems: List[tf.Tensor] = None
hidden_states: Tuple[tf.Tensor] | None = None
attentions: Tuple[tf.Tensor] | None = None
@dataclass
class TFTransfoXLSequenceClassifierOutputWithPast(ModelOutput):
"""
Base class for outputs of sentence classification models.
Args:
loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Classification (or regression if config.num_labels==1) loss.
logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`):
Classification (or regression if config.num_labels==1) scores (before SoftMax).
mems (`List[tf.Tensor]` of length `config.n_layers`):
Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `mems`
input) to speed up sequential decoding. The token ids which have their past given to this model should not
be passed as input ids as they have already been computed.
hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
loss: tf.Tensor | None = None
logits: tf.Tensor = None
mems: List[tf.Tensor] = None
hidden_states: Tuple[tf.Tensor] | None = None
attentions: Tuple[tf.Tensor] | None = None
TRANSFO_XL_START_DOCSTRING = r"""
This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
behavior.
<Tip>
TensorFlow models and layers in `transformers` accept two formats as input:
- having all inputs as keyword arguments (like PyTorch models), or
- having all inputs as a list, tuple or dict in the first positional argument.
The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
    pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
    format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
    the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
    positional argument:
    - a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
    - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
      `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
    - a dictionary with one or several input Tensors associated to the input names given in the docstring:
      `model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
    Note that when creating models and layers with
    [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
    about any of this, as you can just pass inputs like you would to any other Python function!
    </Tip>
    Parameters:
        config ([`TransfoXLConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
TRANSFO_XL_INPUTS_DOCSTRING = r"""
Args:
input_ids (`tf.Tensor` or `Numpy array` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and
[`PreTrainedTokenizer.encode`] for details.
[What are input IDs?](../glossary#input-ids)
mems (`List[tf.Tensor]` of length `config.n_layers`):
Contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model (see
`mems` output below). Can be used to speed up sequential decoding. The token ids which have their mems
given to this model should not be passed as `input_ids` as they have already been computed.
head_mask (`tf.Tensor` or `Numpy array` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
inputs_embeds (`tf.Tensor` or `Numpy array` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
model's internal embedding lookup matrix.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
config will be used instead.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
used instead.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in
eager mode, in graph mode the value will always be set to True.
training (`bool`, *optional*, defaults to `False`):
Whether or not to use the model in training mode (some modules like dropout modules have different
behaviors between training and evaluation).
"""
@add_start_docstrings(
"The bare Bert Model transformer outputting raw hidden-states without any specific head on top.",
TRANSFO_XL_START_DOCSTRING,
)
class TFTransfoXLModel(TFTransfoXLPreTrainedModel):
"""
TFTransfoXLModel 类的定义,继承自 TFTransfoXLPreTrainedModel 类。
使用 @add_start_docstrings 装饰器添加了类的文档字符串,说明此类是一个不带顶层头的原始 TransfoXL 模型。
"""
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.transformer = TFTransfoXLMainLayer(config, name="transformer")
@unpack_inputs
@add_start_docstrings_to_model_forward(TRANSFO_XL_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFTransfoXLModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: TFModelInputType | None = None,
mems: List[tf.Tensor] | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: bool | None = None,
output_hidden_states: bool | None = None,
return_dict: bool | None = None,
training: bool = False,
) -> TFTransfoXLModelOutput | Tuple[tf.Tensor]:
outputs = self.transformer(
input_ids=input_ids,
mems=mems,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
return outputs
"""
Transformer-XL 模型,在顶部有一个语言建模头部(自适应 softmax,其权重与自适应输入嵌入层相结合)。
"""
@add_start_docstrings(
"""
Transformer-XL Model with a language modeling head on top (adaptive softmax with weights tied to the adaptive
input embeddings)
""",
TRANSFO_XL_START_DOCSTRING,
)
class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.transformer = TFTransfoXLMainLayer(config, name="transformer")
self.sample_softmax = config.sample_softmax
assert self.sample_softmax <= 0, (
"Sampling from the softmax is not implemented yet. Please look at issue: #3310:"
" https://github.com/huggingface/transformers/issues/3310"
)
self.crit = TFAdaptiveSoftmaxMask(
config.vocab_size, config.d_embed, config.d_model, config.cutoffs, div_val=config.div_val, name="crit"
)
def _resize_token_embeddings(self, new_num_tokens):
raise NotImplementedError()
def get_output_embeddings(self):
"""Double-check if you are using adaptive softmax."""
if len(self.crit.out_layers) > 0:
return self.crit.out_layers[-1]
return None
def reset_memory_length(self, mem_len):
self.transformer.reset_memory_length(mem_len)
def init_mems(self, bsz):
return self.transformer.init_mems(bsz)
@unpack_inputs
@add_start_docstrings_to_model_forward(TRANSFO_XL_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFTransfoXLLMHeadModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: TFModelInputType | None = None,
mems: List[tf.Tensor] | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: bool | None = None,
output_hidden_states: bool | None = None,
return_dict: bool | None = None,
labels: np.ndarray | tf.Tensor | None = None,
training: bool = False,
    ) -> TFTransfoXLLMHeadModelOutput | Tuple[tf.Tensor]:
if input_ids is not None:
bsz, tgt_len = shape_list(input_ids)[:2]
else:
bsz, tgt_len = shape_list(inputs_embeds)[:2]
transformer_outputs = self.transformer(
input_ids,
mems,
head_mask,
inputs_embeds,
output_attentions,
output_hidden_states,
return_dict,
training=training,
)
last_hidden = transformer_outputs[0]
pred_hid = last_hidden[:, -tgt_len:]
softmax_output = self.crit(pred_hid, labels, training=training)
prediction_scores = softmax_output if labels is None else ()
if not return_dict:
return (prediction_scores,) + transformer_outputs[1:]
return TFTransfoXLLMHeadModelOutput(
prediction_scores=prediction_scores,
mems=transformer_outputs.mems,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **model_kwargs):
inputs = {}
if past_key_values:
input_ids = tf.expand_dims(input_ids[:, -1], axis=-1)
else:
input_ids = input_ids
return inputs
def tf_to_pt_weight_rename(self, tf_weight):
if self.config.tie_word_embeddings and "crit.out_layers" in tf_weight:
return tf_weight, tf_weight.replace("crit.out_layers", "transformer.word_emb.emb_layers")
elif self.config.tie_projs and "crit.out_projs" in tf_weight:
for i, tie_proj in enumerate(self.config.tie_projs):
if tie_proj and self.config.div_val == 1 and self.config.d_model != self.config.d_embed:
return tf_weight, tf_weight.replace(f"crit.out_projs.{i}", "transformer.word_emb.emb_projs.0")
elif tie_proj and self.config.div_val != 1:
return tf_weight, tf_weight.replace("crit.out_projs", "transformer.word_emb.emb_projs")
else:
return (tf_weight,)
"""
The Transfo XL Model transformer with a sequence classification head on top (linear layer).
[`TFTransfoXLForSequenceClassification`] uses the last token in order to do the classification, as other causal
models (e.g. GPT-1,GPT-2) do.
Since it does classification on the last token, it requires to know the position of the last token. If a
`pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
each row of the batch).
"""
@add_start_docstrings(
"""
Decorator to add docstrings to the model's constructor (__init__ method) for `TFTransfoXLForSequenceClassification`.
Args:
config (:class:`~transformers.TransfoXLConfig`):
The configuration class to instantiate the model with.
This initializes the sequence classifier by setting the number of labels and creating a Dense layer for scoring,
and instantiates the main Transformer layer (`TFTransfoXLMainLayer`).
This model supports sequence classification tasks based on the transformer's last token.
""",
TRANSFO_XL_START_DOCSTRING,
)
class TFTransfoXLForSequenceClassification(TFTransfoXLPreTrainedModel, TFSequenceClassificationLoss):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
self.score = keras.layers.Dense(
config.num_labels,
kernel_initializer=get_initializer(config.init_range),
name="score",
use_bias=False,
)
self.transformer = TFTransfoXLMainLayer(config, name="transformer")
def get_output_embeddings(self):
"""
Retrieve the output embeddings from the transformer's word embeddings.
This method warns that sequence classification models do not have output embeddings and that
`.get_output_embeddings` will be removed in future versions of transformers.
"""
logger.warning(
"Sequence classification models do not have output embeddings. `.get_output_embeddings` will be removed "
"in transformers v4.32."
)
return self.transformer.word_emb
@unpack_inputs
@add_start_docstrings_to_model_forward(TRANSFO_XL_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFTransfoXLSequenceClassifierOutputWithPast,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: TFModelInputType | None = None,
mems: List[tf.Tensor] | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: np.ndarray | tf.Tensor | None = None,
training: Optional[bool] = False,
    ) -> Union[Tuple, TFTransfoXLSequenceClassifierOutputWithPast]:
        r"""
        Perform the forward pass of the TFTransfoXLForSequenceClassification model: inputs are processed by the
        Transformer-XL backbone and the hidden state of the last (non-padding) token is scored for classification.

        labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the cross entropy classification loss. Indices should be in `[0, ...,
            config.vocab_size - 1]`.
        """
transformer_outputs = self.transformer(
input_ids=input_ids,
mems=mems,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
hidden_states = transformer_outputs[0]
logits = self.score(hidden_states)
in_logits = None
if self.config.pad_token_id is None:
sequence_lengths = -1
else:
if input_ids is not None:
sequence_lengths = (
tf.argmax(tf.cast(tf.math.equal(input_ids, self.config.pad_token_id), input_ids.dtype), axis=-1)
- 1
)
sequence_lengths = tf.where(sequence_lengths >= 0, sequence_lengths, input_ids.shape[-1] - 1)
in_logits = tf.gather(logits, sequence_lengths, batch_dims=1, axis=1)
else:
sequence_lengths = -1
logger.warning(
f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
"unexpected if using padding tokens in conjunction with `inputs_embeds.`"
)
loss = None
if labels is not None:
if input_ids is not None:
batch_size, sequence_length = shape_list(input_ids)[:2]
else:
batch_size, sequence_length = shape_list(inputs_embeds)[:2]
assert (
self.config.pad_token_id is not None or batch_size == 1
), "Cannot handle batch sizes > 1 if no padding token is defined."
if not tf.is_tensor(sequence_lengths):
in_logits = logits[0:batch_size, sequence_lengths]
loss = self.hf_compute_loss(tf.reshape(labels, [-1, 1]), tf.reshape(in_logits, [-1, self.num_labels]))
pooled_logits = in_logits if in_logits is not None else logits
if not return_dict:
output = (pooled_logits,) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output
return TFTransfoXLSequenceClassifierOutputWithPast(
loss=loss,
logits=pooled_logits,
mems=transformer_outputs.mems,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
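The `sequence_lengths` trick in the classification head above deserves a quick standalone check (toy tensors, assuming `pad_token_id=0`): `tf.argmax` over the "is this a padding token?" mask returns the first pad position in each row, and stepping back one lands on the last real token, which `tf.gather(..., batch_dims=1, axis=1)` then picks out of `logits`.

import tensorflow as tf

pad_token_id = 0
input_ids = tf.constant([[5, 6, 7, 0, 0],
                         [8, 9, 0, 0, 0]])
# first padding position per row, then step back one to land on the last real token
first_pad = tf.argmax(tf.cast(tf.math.equal(input_ids, pad_token_id), tf.int32), axis=-1)
last_token = first_pad - 1
print(last_token.numpy())   # [2 1]

A row with no padding at all makes the `argmax` return 0 and the index -1, which is why the model falls back to the last position via `tf.where` in that case.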
.\models\deprecated\transfo_xl\modeling_tf_transfo_xl_utilities.py
"""
A TF 2.0 Adaptive Softmax for Transformer XL model.
"""
import tensorflow as tf
from ....modeling_tf_utils import keras
from ....tf_utils import shape_list
class TFAdaptiveSoftmaxMask(keras.layers.Layer):
def __init__(self, vocab_size, d_embed, d_proj, cutoffs, div_val=1, keep_order=False, **kwargs):
super().__init__(**kwargs)
self.vocab_size = vocab_size
self.d_embed = d_embed
self.d_proj = d_proj
self.cutoffs = cutoffs + [vocab_size]
self.cutoff_ends = [0] + self.cutoffs
self.div_val = div_val
self.shortlist_size = self.cutoffs[0]
self.n_clusters = len(self.cutoffs) - 1
self.head_size = self.shortlist_size + self.n_clusters
self.keep_order = keep_order
self.out_layers = []
self.out_projs = []
def build(self, input_shape):
if self.n_clusters > 0:
self.cluster_weight = self.add_weight(
shape=(self.n_clusters, self.d_embed), initializer="zeros", trainable=True, name="cluster_weight"
)
self.cluster_bias = self.add_weight(
shape=(self.n_clusters,), initializer="zeros", trainable=True, name="cluster_bias"
)
if self.div_val == 1:
for i in range(len(self.cutoffs)):
if self.d_proj != self.d_embed:
weight = self.add_weight(
shape=(self.d_embed, self.d_proj),
initializer="zeros",
trainable=True,
name=f"out_projs_._{i}",
)
self.out_projs.append(weight)
else:
self.out_projs.append(None)
weight = self.add_weight(
shape=(self.vocab_size, self.d_embed),
initializer="zeros",
trainable=True,
name=f"out_layers_._{i}_._weight",
)
bias = self.add_weight(
shape=(self.vocab_size,),
initializer="zeros",
trainable=True,
name=f"out_layers_._{i}_._bias",
)
self.out_layers.append((weight, bias))
else:
for i in range(len(self.cutoffs)):
l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1]
d_emb_i = self.d_embed // (self.div_val**i)
weight = self.add_weight(
shape=(d_emb_i, self.d_proj), initializer="zeros", trainable=True, name=f"out_projs_._{i}"
)
self.out_projs.append(weight)
weight = self.add_weight(
shape=(r_idx - l_idx, d_emb_i),
initializer="zeros",
trainable=True,
name=f"out_layers_._{i}_._weight",
)
bias = self.add_weight(
shape=(r_idx - l_idx,),
initializer="zeros",
trainable=True,
name=f"out_layers_._{i}_._bias",
)
self.out_layers.append((weight, bias))
super().build(input_shape)
@staticmethod
def _logit(x, W, b, proj=None):
y = x
if proj is not None:
y = tf.einsum("ibd,ed->ibe", y, proj)
return tf.einsum("ibd,nd->ibn", y, W) + b
@staticmethod
def _gather_logprob(logprob, target):
lp_size = shape_list(logprob)
r = tf.range(lp_size[0], dtype=target.dtype)
idx = tf.stack([r, target], 1)
return tf.gather_nd(logprob, idx)
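As a quick illustration of `_gather_logprob` (toy values, not library code): it builds `[row, target]` index pairs and uses `tf.gather_nd` to pick exactly one log-probability per row.

import tensorflow as tf

logprob = tf.math.log(tf.constant([[0.7, 0.2, 0.1],
                                   [0.1, 0.6, 0.3]]))
target = tf.constant([0, 2])
r = tf.range(tf.shape(logprob)[0], dtype=target.dtype)
idx = tf.stack([r, target], 1)               # [[0, 0], [1, 2]]
print(tf.gather_nd(logprob, idx).numpy())    # [log(0.7), log(0.3)], one entry per row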
.\models\deprecated\transfo_xl\modeling_transfo_xl.py
"""
PyTorch Transformer XL model. Adapted from https://github.com/kimiyoung/transformer-xl. In particular
https://github.com/kimiyoung/transformer-xl/blob/master/pytorch/mem_transformer.py
"""
import warnings
from dataclasses import dataclass
from typing import List, Optional, Tuple, Union
import torch
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ....modeling_utils import PreTrainedModel
from ....utils import (
ModelOutput,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
)
from .configuration_transfo_xl import TransfoXLConfig
from .modeling_transfo_xl_utilities import ProjectedAdaptiveLogSoftmax
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "transfo-xl/transfo-xl-wt103"
_CONFIG_FOR_DOC = "TransfoXLConfig"
TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST = [
"transfo-xl/transfo-xl-wt103",
]
def build_tf_to_pytorch_map(model, config):
"""
A map of modules from TF to PyTorch. This time I use a map to keep the PyTorch model as identical to the original
PyTorch model as possible.
"""
tf_to_pt_map = {}
if hasattr(model, "transformer"):
tf_to_pt_map.update(
{
"transformer/adaptive_softmax/cutoff_0/cluster_W": model.crit.cluster_weight,
"transformer/adaptive_softmax/cutoff_0/cluster_b": model.crit.cluster_bias,
}
)
for i, (out_l, proj_l, tie_proj) in enumerate(
zip(model.crit.out_layers, model.crit.out_projs, config.tie_projs)
):
layer_str = f"transformer/adaptive_softmax/cutoff_{i}/"
if config.tie_word_embeddings:
tf_to_pt_map.update({layer_str + "b": out_l.bias})
else:
raise NotImplementedError
tf_to_pt_map.update({layer_str + "lookup_table": out_l.weight, layer_str + "b": out_l.bias})
if not tie_proj:
tf_to_pt_map.update({layer_str + "proj": proj_l})
model = model.transformer
for i, (embed_l, proj_l) in enumerate(zip(model.word_emb.emb_layers, model.word_emb.emb_projs)):
layer_str = f"transformer/adaptive_embed/cutoff_{i}/"
tf_to_pt_map.update({layer_str + "lookup_table": embed_l.weight, layer_str + "proj_W": proj_l})
for i, b in enumerate(model.layers):
layer_str = f"transformer/layer_{i}/"
tf_to_pt_map.update(
{
layer_str + "rel_attn/LayerNorm/gamma": b.dec_attn.layer_norm.weight,
layer_str + "rel_attn/LayerNorm/beta": b.dec_attn.layer_norm.bias,
layer_str + "rel_attn/o/kernel": b.dec_attn.o_net.weight,
layer_str + "rel_attn/qkv/kernel": b.dec_attn.qkv_net.weight,
layer_str + "rel_attn/r/kernel": b.dec_attn.r_net.weight,
layer_str + "ff/LayerNorm/gamma": b.pos_ff.layer_norm.weight,
layer_str + "ff/LayerNorm/beta": b.pos_ff.layer_norm.bias,
layer_str + "ff/layer_1/kernel": b.pos_ff.CoreNet[0].weight,
layer_str + "ff/layer_1/bias": b.pos_ff.CoreNet[0].bias,
layer_str + "ff/layer_2/kernel": b.pos_ff.CoreNet[3].weight,
layer_str + "ff/layer_2/bias": b.pos_ff.CoreNet[3].bias,
}
)
if config.untie_r:
r_r_list = []
r_w_list = []
for b in model.layers:
r_r_list.append(b.dec_attn.r_r_bias)
r_w_list.append(b.dec_attn.r_w_bias)
else:
r_r_list = [model.r_r_bias]
r_w_list = [model.r_w_bias]
tf_to_pt_map.update({"transformer/r_r_bias": r_r_list, "transformer/r_w_bias": r_w_list})
return tf_to_pt_map
def load_tf_weights_in_transfo_xl(model, config, tf_path):
"""Load tf checkpoints in a pytorch model"""
try:
import numpy as np
import tensorflow as tf
except ImportError:
logger.error(
"Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see "
"https://www.tensorflow.org/install/ for installation instructions."
)
raise
tf_to_pt_map = build_tf_to_pytorch_map(model, config)
init_vars = tf.train.list_variables(tf_path)
tf_weights = {}
for name, shape in init_vars:
logger.info(f"Loading TF weight {name} with shape {shape}")
array = tf.train.load_variable(tf_path, name)
tf_weights[name] = array
for name, pointer in tf_to_pt_map.items():
assert name in tf_weights
array = tf_weights[name]
if "kernel" in name or "proj" in name:
array = np.transpose(array)
if ("r_r_bias" in name or "r_w_bias" in name) and len(pointer) > 1:
assert len(pointer) == array.shape[0]
for i, p_i in enumerate(pointer):
arr_i = array[i, ...]
try:
assert p_i.shape == arr_i.shape
except AssertionError as e:
e.args += (p_i.shape, arr_i.shape)
raise
logger.info(f"Initialize PyTorch weight {name} for layer {i}")
p_i.data = torch.from_numpy(arr_i)
else:
try:
assert pointer.shape == array.shape
except AssertionError as e:
e.args += (pointer.shape, array.shape)
raise
logger.info(f"Initialize PyTorch weight {name}")
pointer.data = torch.from_numpy(array)
tf_weights.pop(name, None)
tf_weights.pop(name + "/Adam", None)
tf_weights.pop(name + "/Adam_1", None)
logger.info(f"Weights not copied to PyTorch model: {', '.join(tf_weights.keys())}")
return model
class PositionalEmbedding(nn.Module):
def __init__(self, demb):
super().__init__()
self.demb = demb
inv_freq = 1 / (10000 ** (torch.arange(0.0, demb, 2.0) / demb))
self.register_buffer("inv_freq", inv_freq)
def forward(self, pos_seq, bsz=None):
sinusoid_inp = torch.outer(pos_seq, self.inv_freq)
pos_emb = torch.cat([sinusoid_inp.sin(), sinusoid_inp.cos()], dim=-1)
if bsz is not None:
return pos_emb[:, None, :].expand(-1, bsz, -1)
else:
return pos_emb[:, None, :]
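# A quick standalone check (toy sizes, not library code) of the output shape produced above:
import torch

demb, klen, bsz = 8, 5, 2
inv_freq = 1 / (10000 ** (torch.arange(0.0, demb, 2.0) / demb))       # [demb // 2]
pos_seq = torch.arange(klen - 1, -1.0, -1.0)                          # descending positions
sinusoid_inp = torch.outer(pos_seq, inv_freq)                         # [klen, demb // 2]
pos_emb = torch.cat([sinusoid_inp.sin(), sinusoid_inp.cos()], dim=-1)
print(pos_emb[:, None, :].expand(-1, bsz, -1).shape)                  # torch.Size([5, 2, 8])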
class RelPartialLearnableMultiHeadAttn(nn.Module):
def __init__(
self,
n_head,
d_model,
d_head,
dropout,
dropatt=0,
pre_lnorm=False,
r_r_bias=None,
r_w_bias=None,
layer_norm_epsilon=1e-5,
):
super().__init__()
self.n_head = n_head
self.d_model = d_model
self.d_head = d_head
self.dropout = dropout
self.qkv_net = nn.Linear(d_model, 3 * n_head * d_head, bias=False)
self.drop = nn.Dropout(dropout)
self.dropatt = nn.Dropout(dropatt)
self.o_net = nn.Linear(n_head * d_head, d_model, bias=False)
self.layer_norm = nn.LayerNorm(d_model, eps=layer_norm_epsilon)
self.scale = 1 / (d_head**0.5)
self.pre_lnorm = pre_lnorm
if r_r_bias is None or r_w_bias is None:
self.r_r_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head))
self.r_w_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head))
else:
self.r_r_bias = r_r_bias
self.r_w_bias = r_w_bias
self.r_net = nn.Linear(self.d_model, self.n_head * self.d_head, bias=False)
def _rel_shift(self, x):
zero_pad_shape = (x.size(0), 1) + x.size()[2:]
zero_pad = torch.zeros(zero_pad_shape, device=x.device, dtype=x.dtype)
x_padded = torch.cat([zero_pad, x], dim=1)
x_padded_shape = (x.size(1) + 1, x.size(0)) + x.size()[2:]
x_padded = x_padded.view(*x_padded_shape)
x = x_padded[1:].view_as(x)
return x
class RelPartialLearnableDecoderLayer(nn.Module):
    def __init__(self, n_head, d_model, d_head, d_inner, dropout, layer_norm_epsilon=1e-5, **kwargs):
super().__init__()
self.dec_attn = RelPartialLearnableMultiHeadAttn(
n_head, d_model, d_head, dropout, layer_norm_epsilon=layer_norm_epsilon, **kwargs
)
self.pos_ff = PositionwiseFF(
d_model, d_inner, dropout, pre_lnorm=kwargs.get("pre_lnorm"), layer_norm_epsilon=layer_norm_epsilon
)
def forward(self, dec_inp, r, dec_attn_mask=None, mems=None, head_mask=None, output_attentions=False):
attn_outputs = self.dec_attn(
dec_inp,
r,
attn_mask=dec_attn_mask,
mems=mems,
head_mask=head_mask,
output_attentions=output_attentions,
)
ff_output = self.pos_ff(attn_outputs[0])
outputs = [ff_output] + attn_outputs[1:]
return outputs
class AdaptiveEmbedding(nn.Module):
def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, sample_softmax=False):
super().__init__()
self.n_token = n_token
self.d_embed = d_embed
self.cutoffs = cutoffs + [n_token]
self.div_val = div_val
self.d_proj = d_proj
self.emb_scale = d_proj**0.5
self.cutoff_ends = [0] + self.cutoffs
self.emb_layers = nn.ModuleList()
self.emb_projs = nn.ParameterList()
if div_val == 1:
self.emb_layers.append(nn.Embedding(n_token, d_embed, sparse=sample_softmax > 0))
if d_proj != d_embed:
self.emb_projs.append(nn.Parameter(torch.FloatTensor(d_proj, d_embed)))
else:
for i in range(len(self.cutoffs)):
l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1]
d_emb_i = d_embed // (div_val**i)
self.emb_layers.append(nn.Embedding(r_idx - l_idx, d_emb_i))
self.emb_projs.append(nn.Parameter(torch.FloatTensor(d_proj, d_emb_i)))
def forward(self, inp):
if self.div_val == 1:
embed = self.emb_layers[0](inp)
if self.d_proj != self.d_embed:
embed = nn.functional.linear(embed, self.emb_projs[0])
else:
param = next(self.parameters())
inp_flat = inp.view(-1)
emb_flat = torch.zeros([inp_flat.size(0), self.d_proj], dtype=param.dtype, device=param.device)
for i in range(len(self.cutoffs)):
l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1]
mask_i = (inp_flat >= l_idx) & (inp_flat < r_idx)
indices_i = mask_i.nonzero().squeeze()
if indices_i.numel() == 0:
continue
inp_i = inp_flat.index_select(0, indices_i) - l_idx
emb_i = self.emb_layers[i](inp_i)
emb_i = nn.functional.linear(emb_i, self.emb_projs[i])
emb_flat.index_copy_(0, indices_i, emb_i)
embed_shape = inp.size() + (self.d_proj,)
embed = emb_flat.view(embed_shape)
embed.mul_(self.emb_scale)
return embed
class TransfoXLPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = TransfoXLConfig
load_tf_weights = load_tf_weights_in_transfo_xl
base_model_prefix = "transformer"
def _init_weight(self, weight):
if self.config.init == "uniform":
nn.init.uniform_(weight, -self.config.init_range, self.config.init_range)
elif self.config.init == "normal":
nn.init.normal_(weight, 0.0, self.config.init_std)
def _init_bias(self, bias):
nn.init.constant_(bias, 0.0)
def _init_weights(self, m):
classname = m.__class__.__name__
if classname.find("Linear") != -1:
if hasattr(m, "weight") and m.weight is not None:
self._init_weight(m.weight)
if hasattr(m, "bias") and m.bias is not None:
self._init_bias(m.bias)
elif classname.find("AdaptiveEmbedding") != -1:
if hasattr(m, "emb_projs"):
for i in range(len(m.emb_projs)):
if m.emb_projs[i] is not None:
nn.init.normal_(m.emb_projs[i], 0.0, self.config.proj_init_std)
elif classname.find("Embedding") != -1:
if hasattr(m, "weight"):
self._init_weight(m.weight)
elif classname.find("ProjectedAdaptiveLogSoftmax") != -1:
if hasattr(m, "cluster_weight") and m.cluster_weight is not None:
self._init_weight(m.cluster_weight)
if hasattr(m, "cluster_bias") and m.cluster_bias is not None:
self._init_bias(m.cluster_bias)
if hasattr(m, "out_projs"):
for i in range(len(m.out_projs)):
if m.out_projs[i] is not None:
nn.init.normal_(m.out_projs[i], 0.0, self.config.proj_init_std)
elif classname.find("LayerNorm") != -1:
if hasattr(m, "weight"):
nn.init.normal_(m.weight, 1.0, self.config.init_std)
if hasattr(m, "bias") and m.bias is not None:
self._init_bias(m.bias)
else:
if hasattr(m, "r_emb"):
self._init_weight(m.r_emb)
if hasattr(m, "r_w_bias"):
self._init_weight(m.r_w_bias)
if hasattr(m, "r_r_bias"):
self._init_weight(m.r_r_bias)
if hasattr(m, "r_bias"):
self._init_bias(m.r_bias)
def resize_token_embeddings(self, new_num_tokens: Optional[int] = None, layer: Optional[int] = -1):
"""
Resize input token embeddings matrix of the model if new_num_tokens != config.vocab_size. Take care of tying
weights embeddings afterwards if the model class has a *tie_weights()* method.
Arguments:
new_num_tokens: (*optional*) int:
New number of tokens in the embedding matrix. Increasing the size will add newly initialized vectors at
the end. Reducing the size will remove vectors from the end. If not provided or None: does nothing and
just returns a pointer to the input tokens `torch.nn.Embeddings` Module of the model.
layer: (*optional*) int:
Layer of the *AdaptiveEmbedding* where the resizing should be done. Per default the last layer will be
resized. Be aware that when resizing other than the last layer, you have to ensure that the new
token(s) in the tokenizer are at the corresponding position.
Return: `torch.nn.Embeddings` Pointer to the input tokens Embeddings Module of the model
"""
base_model = getattr(self, self.base_model_prefix, self)
if new_num_tokens is None:
return self.get_input_embeddings()
new_num_tokens_layer, layer = self._get_new_num_tokens_layer(new_num_tokens, layer)
assert new_num_tokens_layer > 0, "The size of the new embedding layer cannot be 0 or less"
model_embeds = base_model._resize_token_embeddings(new_num_tokens_layer, layer)
self.config.vocab_size = new_num_tokens
base_model.vocab_size = new_num_tokens
base_model.n_token = new_num_tokens
new_embedding_shapes = self._get_embedding_shapes()
self._resize_cutoffs(new_num_tokens, new_num_tokens_layer, new_embedding_shapes, layer)
self.tie_weights()
return model_embeds
def _get_new_num_tokens_layer(self, new_num_tokens, layer):
embeddings = self.get_input_embeddings()
if layer == -1:
layer = len(embeddings.emb_layers) - 1
assert 0 <= layer <= len(embeddings.emb_layers) - 1
new_num_tokens_layer = (
new_num_tokens
- sum([emb.weight.shape[0] for emb in embeddings.emb_layers[:layer]])
- sum([emb.weight.shape[0] for emb in embeddings.emb_layers[layer + 1:]])
)
return new_num_tokens_layer, layer
def _get_embedding_shapes(self):
embeddings = self.get_input_embeddings()
return [emb.weight.shape[0] for emb in embeddings.emb_layers]
def _resize_token_embeddings(self, new_num_tokens, layer=-1):
embeddings = self.get_input_embeddings()
if new_num_tokens is None:
return embeddings
new_embeddings_layer = self._get_resized_embeddings(embeddings.emb_layers[layer], new_num_tokens)
embeddings.emb_layers[layer] = new_embeddings_layer
self.set_input_embeddings(embeddings)
return self.get_input_embeddings()
def _resize_cutoffs(self, new_num_tokens, new_emb_size, new_embedding_shapes, layer):
embeddings = self.get_input_embeddings()
for i in range(layer, len(embeddings.cutoffs)):
embeddings.cutoffs[i] = sum(new_embedding_shapes[: i + 1])
embeddings.cutoff_ends = [0] + embeddings.cutoffs
embeddings.n_token = new_num_tokens
self.config.cutoffs = embeddings.cutoffs[:-1]
return embeddings.cutoffs
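# A worked example (toy numbers, not library code) of the arithmetic in `_get_new_num_tokens_layer` and
# `_resize_cutoffs` above: growing the vocabulary by resizing the last adaptive-embedding layer only changes
# that layer's row count, and the cutoffs are recomputed from the per-layer shapes:
emb_layer_sizes = [4, 4, 4]          # rows per adaptive embedding layer (cutoffs [4, 8], n_token 12)
new_num_tokens, layer = 14, -1       # grow the total vocabulary to 14 by resizing the last layer

layer = len(emb_layer_sizes) - 1 if layer == -1 else layer
new_num_tokens_layer = new_num_tokens - sum(emb_layer_sizes[:layer]) - sum(emb_layer_sizes[layer + 1:])
emb_layer_sizes[layer] = new_num_tokens_layer
cutoffs = [sum(emb_layer_sizes[: i + 1]) for i in range(len(emb_layer_sizes))]
print(new_num_tokens_layer, cutoffs)   # 6 [4, 8, 14] -> config.cutoffs becomes [4, 8]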
@dataclass
class TransfoXLModelOutput(ModelOutput):
"""
Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).
Args:
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
mems (`List[torch.FloatTensor]` of length `config.n_layers`):
Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `mems`
input) to speed up sequential decoding. The token ids which have their past given to this model should not
be passed as input ids as they have already been computed.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
last_hidden_state: torch.FloatTensor
mems: List[torch.FloatTensor] = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
@dataclass
class TransfoXLSequenceClassifierOutputWithPast(ModelOutput):
"""
Base class for outputs of sentence classification models.
"""
"""
Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
分类(或回归,如果config.num_labels==1)的损失值。
logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
分类(或回归,如果config.num_labels==1)的分数(SoftMax之前)。
mems (`List[torch.FloatTensor]` of length `config.n_layers`):
包含预先计算的隐藏状态(注意力模块中的键和值)。可以用来加速顺序解码。
给定给模型的过去记忆的令牌ID不应作为输入ID传递,因为它们已经被计算过。
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
`torch.FloatTensor`的元组(一个用于嵌入层的输出 + 每层的输出),形状为`(batch_size, sequence_length, hidden_size)`。
模型每层的隐藏状态加上初始嵌入输出。
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
`torch.FloatTensor`的元组(每层一个)形状为`(batch_size, num_heads, sequence_length, sequence_length)`。
注意力softmax后的注意力权重,用于计算自注意力头中的加权平均值。
"""
loss: Optional[torch.FloatTensor] = None
logits: torch.FloatTensor = None
mems: List[torch.FloatTensor] = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
@dataclass
class TransfoXLLMHeadModelOutput(ModelOutput):
"""
Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).
Args:
losses (`torch.FloatTensor` of shape *(batch_size, sequence_length-1)*, *optional*, returned when `labels` is provided):
Language modeling losses (not reduced).
prediction_scores (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token after SoftMax).
mems (`List[torch.FloatTensor]` of length `config.n_layers`):
Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `mems`
input) to speed up sequential decoding. The token ids which have their past given to this model should not
be passed as input ids as they have already been computed.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
loss (`torch.FloatTensor` of shape `()`, *optional*, returned when `labels` is provided):
Reduced language modeling loss.
"""
losses: Optional[torch.FloatTensor] = None
prediction_scores: torch.FloatTensor = None
mems: List[torch.FloatTensor] = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
loss: Optional[torch.FloatTensor] = None
@property
def logits(self):
return self.prediction_scores
TRANSFO_XL_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
"""
"""
TRANSFO_XL_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
mems (`List[torch.FloatTensor]` of length `config.n_layers`):
Contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model (see
`mems` output below). Can be used to speed up sequential decoding. The token ids which have their mems
given to this model should not be passed as `input_ids` as they have already been computed.
head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
model's internal embedding lookup matrix.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@add_start_docstrings(
"The bare Bert Model transformer outputting raw hidden-states without any specific head on top.",
TRANSFO_XL_START_DOCSTRING,
)
class TransfoXLModel(TransfoXLPreTrainedModel):
"""
TransfoXLModel class inherits from TransfoXLPreTrainedModel and represents the main model for TransfoXL.
This class provides the core Transformer-XL model for processing sequences, without any task-specific head.
Args:
config (:class:`~transformers.TransfoXLConfig`):
The model configuration class that defines the model architecture and its parameters.
Inherits from:
:class:`~transformers.TransfoXLPreTrainedModel`
"""
# Initialization: set up all parameters and layers of the Transformer-XL model
def __init__(self, config):
# Call the parent class initializer
super().__init__(config)
# Vocabulary size
self.n_token = config.vocab_size
# Embedding dimension and model (hidden) dimension
self.d_embed = config.d_embed
self.d_model = config.d_model
# Number of attention heads and dimension of each head
self.n_head = config.n_head
self.d_head = config.d_head
# Adaptive word embedding built from the vocab size, embedding dim, model dim, cutoffs and div_val
self.word_emb = AdaptiveEmbedding(
config.vocab_size, config.d_embed, config.d_model, config.cutoffs, div_val=config.div_val
)
# Dropout layer with the configured dropout rate
self.drop = nn.Dropout(config.dropout)
# Number of layers and memory length
self.n_layer = config.n_layer
self.mem_len = config.mem_len
# Attention type
self.attn_type = config.attn_type
# If the relative-attention biases are shared across layers (untie_r is False),
# create r_w_bias and r_r_bias as learnable parameters here
if not config.untie_r:
self.r_w_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head))
self.r_r_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head))
# Build the stack of Transformer-XL decoder layers
self.layers = nn.ModuleList()
if config.attn_type == 0:  # the default attention type
# Create one RelPartialLearnableDecoderLayer per layer and append it to self.layers
for i in range(config.n_layer):
self.layers.append(
RelPartialLearnableDecoderLayer(
config.n_head,
config.d_model,
config.d_head,
config.d_inner,
config.dropout,
dropatt=config.dropatt,
pre_lnorm=config.pre_lnorm,
r_w_bias=None if config.untie_r else self.r_w_bias,
r_r_bias=None if config.untie_r else self.r_r_bias,
layer_norm_epsilon=config.layer_norm_epsilon,
)
)
else:
# Any other attention type is not implemented
raise NotImplementedError  # removed to avoid maintaining dead code
# same_length and clamp_len settings
self.same_length = config.same_length
self.clamp_len = config.clamp_len
# Positional embeddings depend on the attention type
if self.attn_type == 0:  # default attention type
self.pos_emb = PositionalEmbedding(self.d_model)
else:
# Any other attention type is not implemented
raise NotImplementedError  # removed to avoid maintaining dead code
# Initialize weights and apply final processing
self.post_init()
# Return the word embedding layer
def get_input_embeddings(self):
return self.word_emb
# Set a new input word embedding layer
def set_input_embeddings(self, new_embeddings):
self.word_emb = new_embeddings
# Backward-compatibility helper
def backward_compatible(self):
self.sample_softmax = -1
# Reset the memory length
def reset_memory_length(self, mem_len):
self.mem_len = mem_len
# Prune attention heads (not supported for this model)
def _prune_heads(self, heads):
logger.info("Head pruning is not implemented for Transformer-XL model")
pass  # intentionally left empty
def init_mems(self, bsz):
# Only allocate memories if a positive memory length is configured
if self.mem_len > 0:
# List that will hold one memory tensor per layer
mems = []
# Grab any model parameter to infer the dtype and device
param = next(self.parameters())
# For each layer, create an all-zero memory tensor and append it to mems
for i in range(self.n_layer):
empty = torch.zeros(self.mem_len, bsz, self.config.d_model, dtype=param.dtype, device=param.device)
mems.append(empty)
# Return the list of memory tensors
return mems
else:
# No memory is kept when mem_len is not positive
return None
def _update_mems(self, hids, mems, mlen, qlen):
# Nothing to update when no memories are kept
if mems is None:
return None
# hids and mems must have one entry per layer
assert len(hids) == len(mems), "len(hids) != len(mems)"
# There are mlen + max(0, qlen) steps that can be cached into mems
with torch.no_grad():
# Build the new list of memories
new_mems = []
# Compute the start and end indices of the window to keep
end_idx = mlen + max(0, qlen)
beg_idx = max(0, end_idx - self.mem_len)
# Process the hidden states of each layer
for i in range(len(hids)):
# Concatenate the old memory with the current hidden states, keep only the last
# mem_len positions and detach them from the computation graph
cat = torch.cat([mems[i], hids[i]], dim=0)
new_mems.append(cat[beg_idx:end_idx].detach())
# Return the updated list of memories
return new_mems
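To make the indexing in `_update_mems` concrete, here is a small standalone illustration of the sliding window it keeps; plain tensors only, no model involved, and all sizes are made up.
```
import torch

# With mem_len = 4, a cached memory of length mlen = 4 and a new segment of length qlen = 3,
# the concatenated sequence has length 7 and the kept window is [end_idx - mem_len, end_idx).
mem_len, mlen, qlen, d_model = 4, 4, 3, 2
mems = torch.zeros(mlen, 1, d_model)  # old cached hidden states
hids = torch.ones(qlen, 1, d_model)   # hidden states of the current segment

end_idx = mlen + max(0, qlen)         # 7
beg_idx = max(0, end_idx - mem_len)   # 3
new_mem = torch.cat([mems, hids], dim=0)[beg_idx:end_idx].detach()

print(new_mem.shape)       # torch.Size([4, 1, 2]) -> always the last mem_len positions
print(new_mem[:, 0, 0])    # tensor([0., 1., 1., 1.]) -> 1 old step followed by the 3 new steps
```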
@add_start_docstrings_to_model_forward(TRANSFO_XL_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TransfoXLModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
mems: Optional[List[torch.FloatTensor]] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
"""
The Transformer-XL Model with a language modeling head on top (adaptive softmax with weights tied to the adaptive
input embeddings)
"""
@add_start_docstrings(
"""
The Transformer-XL Model with a language modeling head on top (adaptive softmax with weights tied to the adaptive
input embeddings)
""",
TRANSFO_XL_START_DOCSTRING,
)
class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
_tied_weights_keys = [r"crit\.out_projs\.\d+", r"crit\.out_layers\.\d+\.weight"]
def __init__(self, config):
super().__init__(config)
self.transformer = TransfoXLModel(config)
self.sample_softmax = config.sample_softmax
self.trainer_compatible = getattr(config, "trainer_compatible", False)
if not self.trainer_compatible:
warnings.warn(
"The output of TransfoXL will be updated in v5 to support a single loss as first argument. In order "
"to use that updated output, please specify `trainer_compatible=True` as your configuration"
" attribute.",
DeprecationWarning,
)
assert self.sample_softmax <= 0, (
"Sampling from the softmax is not implemented yet. Please look at issue: #3310:"
" https://github.com/huggingface/transformers/issues/3310"
)
self.crit = ProjectedAdaptiveLogSoftmax(
config.vocab_size, config.d_embed, config.d_model, config.cutoffs, div_val=config.div_val
)
# Initialize weights and apply final processing
self.post_init()
def tie_weights(self):
"""
Run this to be sure output and input (adaptive) softmax weights are tied
"""
if self.config.tie_word_embeddings:
for i in range(len(self.crit.out_layers)):
# Tie or clone weights between output and input (adaptive) softmax layers
self._tie_or_clone_weights(self.crit.out_layers[i], self.transformer.word_emb.emb_layers[i])
if self.config.tie_projs:
for i, tie_proj in enumerate(self.config.tie_projs):
if tie_proj and self.config.div_val == 1 and self.config.d_model != self.config.d_embed:
if self.config.torchscript:
# Clone weights if using torchscript
self.crit.out_projs[i] = nn.Parameter(self.transformer.word_emb.emb_projs[0].clone())
else:
# Assign weights directly
self.crit.out_projs[i] = self.transformer.word_emb.emb_projs[0]
elif tie_proj and self.config.div_val != 1:
if self.config.torchscript:
# Clone weights if using torchscript
self.crit.out_projs[i] = nn.Parameter(self.transformer.word_emb.emb_projs[i].clone())
else:
# Assign weights directly
self.crit.out_projs[i] = self.transformer.word_emb.emb_projs[i]
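A quick way to see the effect of `tie_weights` is to check that the adaptive softmax output layers share their parameters with the adaptive input embeddings. A sketch with a tiny random configuration (all sizes here are arbitrary and chosen only for speed):
```
from transformers import TransfoXLConfig, TransfoXLLMHeadModel

config = TransfoXLConfig(
    vocab_size=100, cutoffs=[20, 50], d_model=32, d_embed=32,
    n_head=2, d_head=16, d_inner=64, n_layer=2, mem_len=8,
)
model = TransfoXLLMHeadModel(config)  # post_init() calls tie_weights()

# The output layer of the first cluster points to the very same weight tensor as the embedding layer
print(model.crit.out_layers[0].weight is model.transformer.word_emb.emb_layers[0].weight)  # True
```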
def reset_memory_length(self, mem_len):
# Reset the memory length for the transformer model
self.transformer.reset_memory_length(mem_len)
def init_mems(self, bsz):
# Initialize memories for the transformer model with batch size bsz
return self.transformer.init_mems(bsz)
@add_start_docstrings_to_model_forward(TRANSFO_XL_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TransfoXLLMHeadModelOutput,
config_class=_CONFIG_FOR_DOC,
)
# Forward pass of the model, accepting the following arguments:
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,  # input token id sequence
mems: Optional[List[torch.FloatTensor]] = None,  # optional list of cached memories, one float tensor per layer
head_mask: Optional[torch.FloatTensor] = None,  # optional attention head mask
inputs_embeds: Optional[torch.FloatTensor] = None,  # optional pre-computed input embeddings
labels: Optional[torch.LongTensor] = None,  # optional language-modeling labels
output_attentions: Optional[bool] = None,  # whether to return attention weights
output_hidden_states: Optional[bool] = None,  # whether to return hidden states
return_dict: Optional[bool] = None,  # whether to return a ModelOutput instead of a plain tuple
) -> Union[Tuple, TransfoXLLMHeadModelOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
`labels = input_ids`. Indices are selected in `[-100, 0, ..., config.vocab_size]`. All labels set to `-100`
are ignored (masked); the loss is only computed for labels in `[0, ..., config.vocab_size]`.
"""
# Determine whether to use the return dictionary from the function argument or the model's configuration
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if input_ids is not None:
bsz, tgt_len = input_ids.size(0), input_ids.size(1)
elif inputs_embeds is not None:
bsz, tgt_len = inputs_embeds.size(0), inputs_embeds.size(1)
else:
raise ValueError("You have to specify either input_ids or inputs_embeds")
transformer_outputs = self.transformer(
input_ids,
mems=mems,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
last_hidden = transformer_outputs[0]
pred_hid = last_hidden[:, -tgt_len:]
if labels is not None:
miss_valid_label = labels[0, 1:].sum() == (labels.size(1) - 1) * -100
if miss_valid_label:
labels[0, 1] = self.config.eos_token_id
softmax_output = self.crit(pred_hid, labels)
prediction_scores = softmax_output.view(bsz, tgt_len, -1) if labels is None else ()
if labels is not None:
losses = softmax_output.view(bsz, tgt_len - 1)
loss = losses[losses != 0].mean()
else:
losses, loss = None, None
if not return_dict:
if self.trainer_compatible:
output = (prediction_scores, losses) if losses is not None else (prediction_scores,)
output += transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output
else:
output = (prediction_scores, *transformer_outputs[1:])
output = ((losses,) + output) if losses is not None else output
return (output + (loss,)) if loss is not None else output
return TransfoXLLMHeadModelOutput(
loss=loss,
prediction_scores=prediction_scores,
losses=losses,
mems=transformer_outputs.mems,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
def get_output_embeddings(self):
"""Double-check if you are using adaptive softmax."""
if self.sample_softmax > 0:
return self.out_layer
else:
return self.crit.out_layers[-1]
def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **model_kwargs):
inputs = {}
if past_key_values:
inputs["mems"] = past_key_values
inputs["input_ids"] = input_ids[:, -1].unsqueeze(-1)
else:
inputs["input_ids"] = input_ids
return inputs
def _resize_cutoffs(self, new_num_tokens, new_emb_size, new_embedding_shapes, layer):
new_cutoffs = super()._resize_cutoffs(new_num_tokens, new_emb_size, new_embedding_shapes, layer)
self.crit.cutoffs = new_cutoffs
self.crit.cutoff_ends = [0] + new_cutoffs
self.crit.n_token = new_num_tokens
@staticmethod
def _reorder_cache(mems: List[torch.Tensor], beam_idx: torch.Tensor) -> List[torch.Tensor]:
"""
This function is used to re-order the `mems` cache if [`~PreTrainedModel.beam_search`] or
[`~PreTrainedModel.beam_sample`] is called. This is required to match `mems` with the correct beam_idx at every
generation step.
"""
return [layer_past.index_select(1, beam_idx.to(layer_past.device)) for layer_past in mems]
@add_start_docstrings(
"""
The Transformer-XL Model transformer with a sequence classification head on top (linear layer).
[`TransfoXLForSequenceClassification`] uses the last token in order to do the classification, as other causal
models (e.g. GPT-1) do.
Since it does classification on the last token, it requires to know the position of the last token. If a
`pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
each row of the batch).
""",
TRANSFO_XL_START_DOCSTRING,
)
class TransfoXLForSequenceClassification(TransfoXLPreTrainedModel):
"""
Transformer-XL模型的序列分类器,顶部带有线性层。
[`TransfoXLForSequenceClassification`] 使用最后一个token进行分类,类似于其他因果模型(例如GPT-1)。
由于它在最后一个token上进行分类,因此需要知道最后一个token的位置。如果配置中定义了`pad_token_id`,则在每一行中找到不是填充token的最后一个token。如果未定义`pad_token_id`,则简单地取批次中每一行的最后一个值。当传递`inputs_embeds`而不是`input_ids`时,无法猜测填充token,因此执行相同操作(取批次中每一行的最后一个值)。
"""
def __init__(self, config):
"""
初始化方法,设置模型配置。
Args:
config (:class:`~transformers.TransfoXLConfig`):
模型的配置对象,包含了模型的各种参数和超参数。
"""
super().__init__(config)
self.num_labels = config.num_labels
self.transformer = TransfoXLModel(config)
self.score = nn.Linear(config.d_embed, self.num_labels, bias=False)
self.post_init()
@add_start_docstrings_to_model_forward(TRANSFO_XL_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TransfoXLSequenceClassifierOutputWithPast,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
mems: Optional[List[torch.FloatTensor]] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
"""
前向传播方法。
Args:
input_ids (:obj:`torch.LongTensor`, 可选):
输入token的ID张量。
mems (:obj:`List[torch.FloatTensor]`, 可选):
Transformer-XL的记忆(memory)部分,用于长序列训练。
head_mask (:obj:`torch.FloatTensor`, 可选):
头部的掩码张量,用于控制层的注意力权重。
inputs_embeds (:obj:`torch.FloatTensor`, 可选):
输入的嵌入张量,替代input_ids。
labels (:obj:`torch.LongTensor`, 可选):
分类任务的标签张量。
output_attentions (:obj:`bool`, 可选):
是否输出注意力权重。
output_hidden_states (:obj:`bool`, 可选):
是否输出隐藏状态。
return_dict (:obj:`bool`, 可选):
是否返回字典格式的输出。
Returns:
:class:`~transformers.modeling_transfo_xl.TransfoXLSequenceClassifierOutputWithPast`:
输出对象,包含分类器的结果和可能的附加信息。
"""
pass
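For completeness, a small usage sketch of the sequence classifier with a tiny random configuration. All sizes are hypothetical, and `pad_token_id` is set explicitly so batches larger than one can be handled, as described in the class docstring above.
```
import torch
from transformers import TransfoXLConfig, TransfoXLForSequenceClassification

config = TransfoXLConfig(
    vocab_size=100, cutoffs=[20, 50], d_model=32, d_embed=32,
    n_head=2, d_head=16, d_inner=64, n_layer=2, mem_len=8,
    num_labels=3, pad_token_id=0,
)
model = TransfoXLForSequenceClassification(config).eval()

input_ids = torch.randint(1, 100, (2, 10))  # avoid the padding id inside the sequences
with torch.no_grad():
    outputs = model(input_ids)

print(outputs.logits.shape)  # torch.Size([2, 3]) -> scores taken at the last token of each sequence
```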
.\models\deprecated\transfo_xl\modeling_transfo_xl_utilities.py
import torch
from torch import nn
class ProjectedAdaptiveLogSoftmax(nn.Module):
def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, keep_order=False):
super().__init__()
self.n_token = n_token
self.d_embed = d_embed
self.d_proj = d_proj
self.cutoffs = cutoffs + [n_token]
self.cutoff_ends = [0] + self.cutoffs
self.div_val = div_val
self.shortlist_size = self.cutoffs[0]
self.n_clusters = len(self.cutoffs) - 1
self.head_size = self.shortlist_size + self.n_clusters
if self.n_clusters > 0:
self.cluster_weight = nn.Parameter(torch.zeros(self.n_clusters, self.d_embed))
self.cluster_bias = nn.Parameter(torch.zeros(self.n_clusters))
self.out_layers = nn.ModuleList()
self.out_projs = nn.ParameterList()
if div_val == 1:
for i in range(len(self.cutoffs)):
if d_proj != d_embed:
self.out_projs.append(nn.Parameter(torch.FloatTensor(d_proj, d_embed)))
else:
self.out_projs.append(None)
self.out_layers.append(nn.Linear(d_embed, n_token))
else:
for i in range(len(self.cutoffs)):
l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1]
d_emb_i = d_embed // (div_val**i)
self.out_projs.append(nn.Parameter(torch.FloatTensor(d_proj, d_emb_i)))
self.out_layers.append(nn.Linear(d_emb_i, r_idx - l_idx))
self.keep_order = keep_order
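The constructor above splits the vocabulary into a head ("shortlist") plus tail clusters, with progressively narrower embeddings for the rarer clusters when `div_val > 1`. A small sketch with made-up sizes, just to inspect the resulting layer shapes:
```
from transformers.models.deprecated.transfo_xl.modeling_transfo_xl_utilities import ProjectedAdaptiveLogSoftmax

# Hypothetical sizes: a 100-token vocabulary split into a 20-token head and two tail
# clusters of 30 and 50 tokens, with the embedding width halved for each tail cluster.
crit = ProjectedAdaptiveLogSoftmax(n_token=100, d_embed=32, d_proj=32, cutoffs=[20, 50], div_val=2)

print(crit.shortlist_size, crit.n_clusters, crit.head_size)  # 20 2 22
for layer, proj in zip(crit.out_layers, crit.out_projs):
    print(tuple(layer.weight.shape), tuple(proj.shape))
# (20, 32) (32, 32)  <- head tokens, full width
# (30, 16) (32, 16)  <- first tail cluster, d_embed // 2
# (50, 8) (32, 8)    <- second tail cluster, d_embed // 4
```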
def _compute_logit(self, hidden, weight, bias, proj):
if proj is None:
logit = nn.functional.linear(hidden, weight, bias=bias)
else:
proj_hid = nn.functional.linear(hidden, proj.t().contiguous())
logit = nn.functional.linear(proj_hid, weight, bias=bias)
return logit
def log_prob(self, hidden):
r"""
Computes log probabilities for all \\(n\_classes\\). From:
https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/adaptive.py
Args:
hidden (Tensor): a minibatch of examples
Returns:
log-probabilities for each class \\(c\\) in range \\(0 <= c <= n\_classes\\), where \\(n\_classes\\) is
a parameter passed to the `AdaptiveLogSoftmaxWithLoss` constructor. Shape:
- Input: \\((N, in\_features)\\)
- Output: \\((N, n\_classes)\\)
"""
if self.n_clusters == 0:
logit = self._compute_logit(hidden, self.out_layers[0].weight, self.out_layers[0].bias, self.out_projs[0])
return nn.functional.log_softmax(logit, dim=-1)
else:
weights, biases = [], []
for i in range(len(self.cutoffs)):
if self.div_val == 1:
l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1]
weight_i = self.out_layers[0].weight[l_idx:r_idx]
bias_i = self.out_layers[0].bias[l_idx:r_idx]
else:
weight_i = self.out_layers[i].weight
bias_i = self.out_layers[i].bias
if i == 0:
weight_i = torch.cat([weight_i, self.cluster_weight], dim=0)
bias_i = torch.cat([bias_i, self.cluster_bias], dim=0)
weights.append(weight_i)
biases.append(bias_i)
head_weight, head_bias, head_proj = weights[0], biases[0], self.out_projs[0]
head_logit = self._compute_logit(hidden, head_weight, head_bias, head_proj)
out = hidden.new_empty((head_logit.size(0), self.n_token))
head_logprob = nn.functional.log_softmax(head_logit, dim=1)
cutoff_values = [0] + self.cutoffs
for i in range(len(cutoff_values) - 1):
start_idx, stop_idx = cutoff_values[i], cutoff_values[i + 1]
if i == 0:
out[:, : self.cutoffs[0]] = head_logprob[:, : self.cutoffs[0]]
else:
weight_i, bias_i, proj_i = weights[i], biases[i], self.out_projs[i]
tail_logit_i = self._compute_logit(hidden, weight_i, bias_i, proj_i)
tail_logprob_i = nn.functional.log_softmax(tail_logit_i, dim=1)
logprob_i = head_logprob[:, -i] + tail_logprob_i
out[:, start_idx: stop_idx] = logprob_i
return out
.\models\deprecated\transfo_xl\tokenization_transfo_xl.py
def detokenize_numbers(text):
# Inverse of `tokenize_numbers`: merge numbers that were split around commas and dots back together
for reg, sub in DETOKENIZE_NUMBERS:
text = re.sub(reg, sub, text)
return text
class TransfoXLTokenizer(PreTrainedTokenizer):
"""
Construct a Transformer-XL tokenizer adapted from Vocab class in [the original
code](https://github.com/kimiyoung/transformer-xl). The Transformer-XL tokenizer is a word-level tokenizer (no
sub-word tokenization).
This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
this superclass for more information regarding those methods.
Args:
special (`List[str]`, *optional*):
A list of special tokens (to be treated by the original implementation of this tokenizer).
min_freq (`int`, *optional*, defaults to 0):
The minimum number of times a token has to be present in order to be kept in the vocabulary (otherwise it
will be mapped to `unk_token`).
max_size (`int`, *optional*):
The maximum size of the vocabulary. If left unset, it will default to the size of the vocabulary found
after excluding the tokens according to the `min_freq` rule.
lower_case (`bool`, *optional*, defaults to `False`):
Whether or not to lowercase the input when tokenizing.
delimiter (`str`, *optional*):
The delimiter used between tokens.
vocab_file (`str`, *optional*):
File containing the vocabulary (from the original implementation).
pretrained_vocab_file (`str`, *optional*):
File containing the vocabulary as saved with the `save_pretrained()` method.
never_split (`List[str]`, *optional*):
List of tokens that should never be split. If no list is specified, will simply use the existing special
tokens.
unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
eos_token (`str`, *optional*, defaults to `"<eos>"`):
The end of sequence token.
additional_special_tokens (`List[str]`, *optional*, defaults to `['<formula>']`):
A list of additional special tokens (for the HuggingFace functionality).
language (`str`, *optional*, defaults to `"en"`):
The language of this tokenizer (used for Moses preprocessing).
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["input_ids"]
def __init__(
self,
special=None,
min_freq=0,
max_size=None,
lower_case=False,
delimiter=None,
vocab_file=None,
pretrained_vocab_file: str = None,
never_split=None,
unk_token="<unk>",
eos_token="<eos>",
additional_special_tokens=["<formula>"],
language="en",
**kwargs,
):
super().__init__(
unk_token=unk_token,
eos_token=eos_token,
additional_special_tokens=additional_special_tokens,
**kwargs,
)
@property
def do_lower_case(self):
return self.lower_case
def _compile_space_around_punctuation_pattern(self):
look_ahead_for_special_token = f"(?=[{self.punctuation_symbols}])"
look_ahead_to_match_all_except_space = r"(?=[^\s])"
return re.compile(r"" + look_ahead_for_special_token + look_ahead_to_match_all_except_space)
def count_file(self, path, verbose=False, add_eos=False):
if verbose:
logger.info(f"counting file {path} ...")
assert os.path.exists(path), f"Input file {path} not found"
sents = []
with open(path, "r", encoding="utf-8") as f:
for idx, line in enumerate(f):
if verbose and idx > 0 and idx % 500000 == 0:
logger.info(f" line {idx}")
symbols = self.tokenize(line, add_eos=add_eos)
self.counter.update(symbols)
sents.append(symbols)
return sents
def count_sents(self, sents, verbose=False):
"""
sents : a list of sentences, each a list of tokenized symbols
"""
if verbose:
logger.info(f"counting {len(sents)} sents ...")
for idx, symbols in enumerate(sents):
if verbose and idx > 0 and idx % 500000 == 0:
logger.info(f" line {idx}")
self.counter.update(symbols)
def _build_from_file(self, vocab_file):
self.idx2sym = []
self.sym2idx = OrderedDict()
with open(vocab_file, "r", encoding="utf-8") as f:
for line in f:
symb = line.strip().split()[0]
self.add_symbol(symb)
if "<UNK>" in self.sym2idx:
self.unk_idx = self.sym2idx["<UNK>"]
elif "<unk>" in self.sym2idx:
self.unk_idx = self.sym2idx["<unk>"]
else:
raise ValueError("Token not in vocabulary and no <unk> token in vocabulary for replacement.")
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
if os.path.isdir(save_directory):
vocab_file = os.path.join(
save_directory,
(filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["pretrained_vocab_file"],
)
else:
vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory
with open(vocab_file, "wb") as f:
pickle.dump(self.__dict__, f)
return (vocab_file,)
def build_vocab(self):
if self.vocab_file:
logger.info(f"building vocab from {self.vocab_file}")
self._build_from_file(self.vocab_file)
logger.info(f"Final vocab size {len(self.sym2idx)}")
else:
logger.info(f"building vocab with min_freq={self.min_freq}, max_size={self.max_size}")
self.idx2sym = []
self.sym2idx = OrderedDict()
for sym in self.special:
self.add_special(sym)
for sym, cnt in self.counter.most_common(self.max_size):
if cnt < self.min_freq:
break
self.add_symbol(sym)
logger.info(f"Final vocab size {len(self.sym2idx)} from {len(self.counter)} unique tokens")
@torch_only_method
def encode_file(self, path, ordered=False, verbose=False, add_eos=True, add_double_eos=False):
if verbose:
logger.info(f"encoding file {path} ...")
assert os.path.exists(path), f"Output file {path} not found"
encoded = []
with open(path, "r", encoding="utf-8") as f:
for idx, line in enumerate(f):
if verbose and idx > 0 and idx % 500000 == 0:
logger.info(f" line {idx}")
symbols = self.tokenize(line, add_eos=add_eos, add_double_eos=add_double_eos)
encoded.append(self.convert_to_tensor(symbols))
if ordered:
encoded = torch.cat(encoded)
return encoded
@torch_only_method
def encode_sents(self, sents, ordered=False, verbose=False):
if verbose:
logger.info(f"encoding {len(sents)} sents ...")
encoded = []
for idx, symbols in enumerate(sents):
if verbose and idx > 0 and idx % 500000 == 0:
logger.info(f" line {idx}")
encoded.append(self.convert_to_tensor(symbols))
if ordered:
encoded = torch.cat(encoded)
return encoded
def add_special(self, sym):
if sym not in self.sym2idx:
self.idx2sym.append(sym)
self.sym2idx[sym] = len(self.idx2sym) - 1
setattr(self, f"{sym.strip('<>')}_idx", self.sym2idx[sym])
def add_symbol(self, sym):
if sym not in self.sym2idx:
self.idx2sym.append(sym)
self.sym2idx[sym] = len(self.idx2sym) - 1
def move_added_token(self, token: str, target_idx: int):
"""
Moves an added token to a specific position in the vocab. This method should be used when resizing an embedding
layer other than the last one in the `AdaptiveEmbedding` in order to move the token in the tokenizer from the
default position (at the very end) to the desired one.
Args:
token: The token to move to a specific position in the vocab.
target_idx: The position where the token should be moved to.
"""
assert token in self.added_tokens_encoder, "Token which should be moved has to be an added token"
assert token not in self.idx2sym, "Token which should be moved is already in vocab"
self.idx2sym.insert(target_idx, token)
self.sym2idx[token] = target_idx
for idx in range(target_idx + 1, len(self.idx2sym)):
current_sym = self.idx2sym[idx]
self.sym2idx[current_sym] = idx
old_index = self._added_tokens_encoder.pop(token)
self._added_tokens_decoder.pop(old_index)
def moses_punct_norm(self, text):
"""
Normalize punctuation in the text using MosesPunctNormalizer.
Args:
text: Input text to be normalized.
Returns:
Normalized text with standardized punctuation.
"""
return self.moses_punct_normalizer.normalize(text)
def moses_tokenize(self, text):
"""
Tokenizes text using MosesTokenizer.
Args:
text: Input text to be tokenized.
Returns:
List of tokens extracted from the input text.
"""
return self.moses_tokenizer.tokenize(
text, aggressive_dash_splits=True, return_str=False, escape=False, protected_patterns=self.never_split
)
def moses_pipeline(self, text: str) -> List[str]:
"""
Performs a pipeline of basic text preprocessing tasks using MosesPunctNormalizer and MosesTokenizer. Also handles
splitting of large comma-separated numbers and floating point values.
Args:
text: Text to be tokenized and preprocessed.
Returns:
A list of tokenized strings.
"""
text = self.moses_punct_norm(text)
text = self.moses_tokenize(text)
text = tokenize_numbers(text)
return text
def _convert_id_to_token(self, idx):
"""
Converts an index to a token using the vocabulary.
Args:
idx: Index to be converted into a token.
Returns:
Corresponding token based on the index.
"""
assert 0 <= idx < len(self), f"Index {idx} out of vocabulary range"
return self.idx2sym[idx]
def _convert_token_to_id(self, sym):
"""Converts a token (str) into an id using the vocabulary."""
if sym in self.sym2idx:
return self.sym2idx[sym]
else:
if hasattr(self, "unk_idx"):
return self.sym2idx.get(sym, self.unk_idx)
elif "<unk>" in self.sym2idx:
return self.sym2idx["<unk>"]
elif "<UNK>" in self.sym2idx:
return self.sym2idx["<UNK>"]
else:
raise ValueError("Token not in vocabulary and no <unk> token in vocabulary for replacement.")
def convert_tokens_to_string(self, tokens):
"""
Converts a sequence of tokens (string) into a single string.
Additionally, converts split numbers back to their original form.
"""
out_string = self.moses_detokenizer.detokenize(tokens)
return detokenize_numbers(out_string).strip()
@torch_only_method
def convert_to_tensor(self, symbols):
"""Converts a list of symbols into a PyTorch tensor of Long type."""
return torch.LongTensor(self.convert_tokens_to_ids(symbols))
@property
def vocab_size(self):
"""Returns the size of the vocabulary."""
return len(self.idx2sym)
def get_vocab(self):
"""Returns a dictionary containing the entire vocabulary."""
vocab = self.sym2idx.copy()
vocab.update(self.added_tokens_encoder)
return vocab
def _tokenize(self, line, add_eos=False, add_double_eos=False):
"""
Tokenizes a line of text with optional end-of-sequence tokens.
"""
line = line.strip()
if self.lower_case:
line = line.lower()
if self.delimiter == "":
symbols = line
else:
symbols = self.moses_pipeline(line)
if add_double_eos:
return ["<S>"] + symbols + ["<S>"]
elif add_eos:
return symbols + ["<eos>"]
else:
return symbols
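Putting the pieces above together, a hedged usage sketch. The hub id below is the commonly used WikiText-103 checkpoint for this tokenizer and is an assumption here, and because the saved vocabulary is loaded with `pickle`, recent library versions require the `TRUST_REMOTE_CODE` environment variable (the same guard shown in `get_lm_corpus` further down).
```
import os

os.environ["TRUST_REMOTE_CODE"] = "True"  # the pretrained vocabulary is unpickled, so the guard must be lifted

from transformers import TransfoXLTokenizer

# Hub id assumed: the WikiText-103 checkpoint usually paired with this tokenizer
tokenizer = TransfoXLTokenizer.from_pretrained("transfo-xl/transfo-xl-wt103")

tokens = tokenizer.tokenize("Hello , this costs $ 1,500 .")
print(tokens)                                      # word-level tokens; "1,500" is split into "1 @,@ 500"
ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids[:5])                                     # integer ids from sym2idx (unknown words map to <unk>)
print(tokenizer.convert_tokens_to_string(tokens))  # detokenize_numbers merges the split number back together
```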
class LMOrderedIterator(object):
def __init__(self, data, bsz, bptt, device="cpu", ext_len=None):
"""
data -- LongTensor -- the LongTensor is strictly ordered
"""
self.bsz = bsz
self.bptt = bptt
self.ext_len = ext_len if ext_len is not None else 0
self.device = device
self.n_step = data.size(0) // bsz
data = data.narrow(0, 0, self.n_step * bsz)
self.data = data.view(bsz, -1).t().contiguous().to(device)
self.n_batch = (self.n_step + self.bptt - 1) // self.bptt
def get_batch(self, i, bptt=None):
if bptt is None:
bptt = self.bptt
seq_len = min(bptt, self.data.size(0) - 1 - i)
end_idx = i + seq_len
beg_idx = max(0, i - self.ext_len)
data = self.data[beg_idx:end_idx]
target = self.data[i + 1 : i + 1 + seq_len]
data_out = data.transpose(0, 1).contiguous().to(self.device)
target_out = target.transpose(0, 1).contiguous().to(self.device)
return data_out, target_out, seq_len
def get_fixlen_iter(self, start=0):
for i in range(start, self.data.size(0) - 1, self.bptt):
yield self.get_batch(i)
def get_varlen_iter(self, start=0, std=5, min_len=5, max_deviation=3):
max_len = self.bptt + max_deviation * std
i = start
while True:
bptt = self.bptt if np.random.random() < 0.95 else self.bptt / 2.0
bptt = min(max_len, max(min_len, int(np.random.normal(bptt, std))))
data, target, seq_len = self.get_batch(i, bptt)
i += seq_len
yield data, target, seq_len
if i >= self.data.size(0) - 2:
break
def __iter__(self):
return self.get_fixlen_iter()
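A toy run of the `LMOrderedIterator` defined above: 50 consecutive token ids are folded into 4 parallel streams and served in `bptt`-sized chunks, with targets shifted by one position.
```
import torch

data = torch.arange(50)                          # a toy "corpus" of 50 token ids
it = LMOrderedIterator(data, bsz=4, bptt=6)

for inp, target, seq_len in it:
    print(inp.shape, target.shape, seq_len)      # (4, 6) batches; the last chunk may be shorter
    print((inp[:, 1:] == target[:, :-1]).all())  # targets are the inputs shifted left by one
```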
class LMShuffledIterator(object):
def __init__(self, data, bsz, bptt, device="cpu", ext_len=None, shuffle=False):
"""
data -- list[LongTensor] -- there is no order among the LongTensors
"""
self.data = data
self.bsz = bsz
self.bptt = bptt
self.ext_len = ext_len if ext_len is not None else 0
self.device = device
self.shuffle = shuffle
def get_sent_stream(self):
epoch_indices = np.random.permutation(len(self.data)) if self.shuffle else np.array(range(len(self.data)))
for idx in epoch_indices:
yield self.data[idx]
@torch_only_method
def stream_iterator(self, sent_stream):
streams = [None] * self.bsz
data = torch.LongTensor(self.bptt, self.bsz)
target = torch.LongTensor(self.bptt, self.bsz)
n_retain = 0
while True:
data[n_retain:].fill_(-1)
target.fill_(-1)
valid_batch = True
for i in range(self.bsz):
n_filled = 0
try:
while n_filled < self.bptt:
if streams[i] is None or len(streams[i]) <= 1:
streams[i] = next(sent_stream)
n_new = min(len(streams[i]) - 1, self.bptt - n_filled)
data[n_retain + n_filled : n_retain + n_filled + n_new, i] = streams[i][:n_new]
target[n_filled : n_filled + n_new, i] = streams[i][1 : n_new + 1]
streams[i] = streams[i][n_new:]
n_filled += n_new
except StopIteration:
valid_batch = False
break
if not valid_batch:
return
data_out = data.transpose(0, 1).contiguous().to(self.device)
target_out = target.transpose(0, 1).contiguous().to(self.device)
yield data_out, target_out, self.bptt
n_retain = min(data.size(0), self.ext_len)
if n_retain > 0:
data[:n_retain] = data[-n_retain:]
data.resize_(n_retain + self.bptt, data.size(1))
def __iter__(self):
sent_stream = self.get_sent_stream()
for batch in self.stream_iterator(sent_stream):
yield batch
class LMMultiFileIterator(LMShuffledIterator):
def __init__(self, paths, vocab, bsz, bptt, device="cpu", ext_len=None, shuffle=False):
self.paths = paths
self.vocab = vocab
self.bsz = bsz
self.bptt = bptt
self.ext_len = ext_len if ext_len is not None else 0
self.device = device
self.shuffle = shuffle
def get_sent_stream(self, path):
sents = self.vocab.encode_file(path, add_double_eos=True)
if self.shuffle:
np.random.shuffle(sents)
sent_stream = iter(sents)
return sent_stream
def __iter__(self):
if self.shuffle:
np.random.shuffle(self.paths)
for path in self.paths:
sent_stream = self.get_sent_stream(path)
for batch in self.stream_iterator(sent_stream):
yield batch
class TransfoXLCorpus(object):
@classmethod
@torch_only_method
def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs):
"""
Instantiate a pre-processed corpus.
"""
vocab = TransfoXLTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
is_local = os.path.isdir(pretrained_model_name_or_path)
try:
resolved_corpus_file = cached_file(pretrained_model_name_or_path, CORPUS_NAME, cache_dir=cache_dir)
except EnvironmentError:
logger.error(
f"Corpus '{pretrained_model_name_or_path}' was not found in corpus list"
f" ({', '.join(PRETRAINED_CORPUS_ARCHIVE_MAP.keys())}. We assumed '{pretrained_model_name_or_path}'"
f" was a path or url but couldn't find files {CORPUS_NAME} at this path or url."
)
return None
if is_local:
logger.info(f"loading corpus file {resolved_corpus_file}")
else:
logger.info(f"loading corpus file {CORPUS_NAME} from cache at {resolved_corpus_file}")
corpus = cls(*inputs, **kwargs)
corpus_dict = torch.load(resolved_corpus_file)
for key, value in corpus_dict.items():
corpus.__dict__[key] = value
corpus.vocab = vocab
if corpus.train is not None:
corpus.train = torch.tensor(corpus.train, dtype=torch.long)
if corpus.valid is not None:
corpus.valid = torch.tensor(corpus.valid, dtype=torch.long)
if corpus.test is not None:
corpus.test = torch.tensor(corpus.test, dtype=torch.long)
return corpus
def __init__(self, *args, **kwargs):
self.vocab = TransfoXLTokenizer(*args, **kwargs)
self.dataset = None
self.train = None
self.valid = None
self.test = None
def build_corpus(self, path, dataset):
self.dataset = dataset
if self.dataset in ["ptb", "wt2", "enwik8", "text8"]:
self.vocab.count_file(os.path.join(path, "train.txt"))
self.vocab.count_file(os.path.join(path, "valid.txt"))
self.vocab.count_file(os.path.join(path, "test.txt"))
elif self.dataset == "wt103":
self.vocab.count_file(os.path.join(path, "train.txt"))
elif self.dataset == "lm1b":
train_path_pattern = os.path.join(
path,
"1-billion-word-language-modeling-benchmark-r13output",
"training-monolingual.tokenized.shuffled",
"news.en-*",
)
train_paths = glob.glob(train_path_pattern)
self.vocab.build_vocab()
if self.dataset in ["ptb", "wt2", "wt103"]:
self.train = self.vocab.encode_file(os.path.join(path, "train.txt"), ordered=True)
self.valid = self.vocab.encode_file(os.path.join(path, "valid.txt"), ordered=True)
self.test = self.vocab.encode_file(os.path.join(path, "test.txt"), ordered=True)
elif self.dataset in ["enwik8", "text8"]:
self.train = self.vocab.encode_file(os.path.join(path, "train.txt"), ordered=True, add_eos=False)
self.valid = self.vocab.encode_file(os.path.join(path, "valid.txt"), ordered=True, add_eos=False)
self.test = self.vocab.encode_file(os.path.join(path, "test.txt"), ordered=True, add_eos=False)
elif self.dataset == "lm1b":
self.train = train_paths
self.valid = self.vocab.encode_file(os.path.join(path, "valid.txt"), ordered=False, add_double_eos=True)
self.test = self.vocab.encode_file(os.path.join(path, "test.txt"), ordered=False, add_double_eos=True)
def get_iterator(self, split, *args, **kwargs):
if split == "train":
if self.dataset in ["ptb", "wt2", "wt103", "enwik8", "text8"]:
data_iter = LMOrderedIterator(self.train, *args, **kwargs)
elif self.dataset == "lm1b":
kwargs["shuffle"] = True
data_iter = LMMultiFileIterator(self.train, self.vocab, *args, **kwargs)
elif split in ["valid", "test"]:
data = self.valid if split == "valid" else self.test
if self.dataset in ["ptb", "wt2", "wt103", "enwik8", "text8"]:
data_iter = LMOrderedIterator(data, *args, **kwargs)
elif self.dataset == "lm1b":
data_iter = LMShuffledIterator(data, *args, **kwargs)
else:
data_iter = None
raise ValueError(f"Split not recognized: {split}")
return data_iter
@torch_only_method
def get_lm_corpus(datadir, dataset):
fn = os.path.join(datadir, "cache.pt")
fn_pickle = os.path.join(datadir, "cache.pkl")
if os.path.exists(fn):
logger.info("Loading cached dataset...")
corpus = torch.load(fn_pickle)
elif os.path.exists(fn):
logger.info("Loading cached dataset from pickle...")
if not strtobool(os.environ.get("TRUST_REMOTE_CODE", "False")):
raise ValueError(
"This part uses `pickle.load` which is insecure and will execute arbitrary code that is potentially "
"malicious. It's recommended to never unpickle data that could have come from an untrusted source, or "
"that could have been tampered with. If you already verified the pickle data and decided to use it, "
"you can set the environment variable `TRUST_REMOTE_CODE` to `True` to allow it."
)
with open(fn, "rb") as fp:
corpus = pickle.load(fp)
else:
logger.info(f"Producing dataset {dataset}...")
kwargs = {}
if dataset in ["wt103", "wt2"]:
kwargs["special"] = ["<eos>"]
kwargs["lower_case"] = False
elif dataset == "ptb":
kwargs["special"] = ["<eos>"]
kwargs["lower_case"] = True
elif dataset == "lm1b":
kwargs["special"] = []
kwargs["lower_case"] = False
kwargs["vocab_file"] = os.path.join(datadir, "1b_word_vocab.txt")
elif dataset in ["enwik8", "text8"]:
pass
corpus = TransfoXLCorpus(datadir, dataset, **kwargs)
torch.save(corpus, fn)
return corpus
.\models\deprecated\transfo_xl\__init__.py
from typing import TYPE_CHECKING
from ....utils import OptionalDependencyNotAvailable, _LazyModule
from ....utils import is_tf_available, is_torch_available
_import_structure = {
"configuration_transfo_xl": ["TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP", "TransfoXLConfig"],
"tokenization_transfo_xl": ["TransfoXLCorpus", "TransfoXLTokenizer"],
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_transfo_xl"] = [
"TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST",
"AdaptiveEmbedding",
"TransfoXLForSequenceClassification",
"TransfoXLLMHeadModel",
"TransfoXLModel",
"TransfoXLPreTrainedModel",
"load_tf_weights_in_transfo_xl",
]
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_tf_transfo_xl"] = [
"TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST",
"TFAdaptiveEmbedding",
"TFTransfoXLForSequenceClassification",
"TFTransfoXLLMHeadModel",
"TFTransfoXLMainLayer",
"TFTransfoXLModel",
"TFTransfoXLPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_transfo_xl import TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP, TransfoXLConfig
from .tokenization_transfo_xl import TransfoXLCorpus, TransfoXLTokenizer
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_transfo_xl import (
TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST,
AdaptiveEmbedding,
TransfoXLForSequenceClassification,
TransfoXLLMHeadModel,
TransfoXLModel,
TransfoXLPreTrainedModel,
load_tf_weights_in_transfo_xl,
)
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_tf_transfo_xl import (
TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST,
TFAdaptiveEmbedding,
TFTransfoXLForSequenceClassification,
TFTransfoXLLMHeadModel,
TFTransfoXLMainLayer,
TFTransfoXLModel,
TFTransfoXLPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\deprecated\van\configuration_van.py
""" VAN 模型配置"""
from ....configuration_utils import PretrainedConfig
from ....utils import logging
logger = logging.get_logger(__name__)
VAN_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"Visual-Attention-Network/van-base": (
"https://huggingface.co/Visual-Attention-Network/van-base/blob/main/config.json"
),
}
class VanConfig(PretrainedConfig):
r"""
这是配置类,用于存储 [`VanModel`] 的配置。根据指定的参数实例化 VAN 模型,
定义模型架构。使用默认值实例化配置将生成与 VAN
[Visual-Attention-Network/van-base](https://huggingface.co/Visual-Attention-Network/van-base)
架构类似的配置。
配置对象继承自 [`PretrainedConfig`],可用于控制模型输出。阅读来自 [`PretrainedConfig`] 的文档获取更多信息。
"""
Args:
image_size (`int`, *optional*, defaults to 224):
The size (resolution) of each image.
num_channels (`int`, *optional*, defaults to 3):
The number of input channels.
patch_sizes (`List[int]`, *optional*, defaults to `[7, 3, 3, 3]`):
Patch size used by each stage's embedding layer.
strides (`List[int]`, *optional*, defaults to `[4, 2, 2, 2]`):
Stride used by each stage's embedding layer to downsample the input.
hidden_sizes (`List[int]`, *optional*, defaults to `[64, 128, 320, 512]`):
Dimensionality (hidden size) of each stage.
depths (`List[int]`, *optional*, defaults to `[3, 3, 12, 3]`):
Number of layers in each stage.
mlp_ratios (`List[int]`, *optional*, defaults to `[8, 8, 4, 4]`):
Expansion ratio of the MLP layer in each stage.
hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in each layer. Supported strings are "gelu", "relu",
"selu" and "gelu_new".
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated normal initializer for all weight matrices.
layer_norm_eps (`float`, *optional*, defaults to 1e-06):
The epsilon used by the layer normalization layers.
layer_scale_init_value (`float`, *optional*, defaults to 0.01):
The initial value for layer scaling.
drop_path_rate (`float`, *optional*, defaults to 0.0):
The drop probability for stochastic depth.
dropout_rate (`float`, *optional*, defaults to 0.0):
The dropout probability.
Example:
```
>>> from transformers import VanModel, VanConfig
>>>
>>> configuration = VanConfig()
>>>
>>> model = VanModel(configuration)
>>>
>>> configuration = model.config
```
"""
def __init__(
self,
image_size=224,
num_channels=3,
patch_sizes=[7, 3, 3, 3],
strides=[4, 2, 2, 2],
hidden_sizes=[64, 128, 320, 512],
depths=[3, 3, 12, 3],
mlp_ratios=[8, 8, 4, 4],
hidden_act="gelu",
initializer_range=0.02,
layer_norm_eps=1e-6,
layer_scale_init_value=1e-2,
drop_path_rate=0.0,
dropout_rate=0.0,
**kwargs,
):
super().__init__(**kwargs)
self.image_size = image_size
self.num_channels = num_channels
self.patch_sizes = patch_sizes
self.strides = strides
self.hidden_sizes = hidden_sizes
self.depths = depths
self.mlp_ratios = mlp_ratios
self.hidden_act = hidden_act
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.layer_scale_init_value = layer_scale_init_value
self.drop_path_rate = drop_path_rate
self.dropout_rate = dropout_rate
.\models\deprecated\van\convert_van_to_pytorch.py
"""Convert VAN checkpoints from the original repository.
将原始仓库的 VAN 检查点转换为特定格式。
URL: https://github.com/Visual-Attention-Network/VAN-Classification"""
import argparse
import json
import sys
from dataclasses import dataclass, field
from functools import partial
from pathlib import Path
from typing import List
import torch
import torch.nn as nn
from huggingface_hub import cached_download, hf_hub_download
from torch import Tensor
from transformers import AutoImageProcessor, VanConfig, VanForImageClassification
from transformers.models.deprecated.van.modeling_van import VanLayerScaling
from transformers.utils import logging
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
@dataclass
class Tracker:
module: nn.Module
traced: List[nn.Module] = field(default_factory=list)
handles: list = field(default_factory=list)
def _forward_hook(self, m, inputs: Tensor, outputs: Tensor):
has_not_submodules = len(list(m.modules())) == 1 or isinstance(m, nn.Conv2d) or isinstance(m, nn.BatchNorm2d)
if has_not_submodules:
if not isinstance(m, VanLayerScaling):
self.traced.append(m)
def __call__(self, x: Tensor):
for m in self.module.modules():
self.handles.append(m.register_forward_hook(self._forward_hook))
self.module(x)
[x.remove() for x in self.handles]
return self
@property
def parametrized(self):
return list(filter(lambda x: len(list(x.state_dict().keys())) > 0, self.traced))
@dataclass
class ModuleTransfer:
src: nn.Module
dest: nn.Module
verbose: int = 0
src_skip: List = field(default_factory=list)
dest_skip: List = field(default_factory=list)
def __call__(self, x: Tensor):
"""
Transfer the weights of `self.src` to `self.dest` by performing a forward pass using `x` as input. Under the
hood we tracked all the operations in both modules.
"""
dest_traced = Tracker(self.dest)(x).parametrized
src_traced = Tracker(self.src)(x).parametrized
src_traced = list(filter(lambda x: type(x) not in self.src_skip, src_traced))
dest_traced = list(filter(lambda x: type(x) not in self.dest_skip, dest_traced))
if len(dest_traced) != len(src_traced):
raise Exception(
f"Numbers of operations are different. Source module has {len(src_traced)} operations while"
f" destination module has {len(dest_traced)}."
)
for dest_m, src_m in zip(dest_traced, src_traced):
dest_m.load_state_dict(src_m.state_dict())
if self.verbose == 1:
print(f"Transfered from={src_m} to={dest_m}")
def copy_parameters(from_model: nn.Module, our_model: nn.Module) -> nn.Module:
from_state_dict = from_model.state_dict()
our_state_dict = our_model.state_dict()
config = our_model.config
all_keys = []
for stage_idx in range(len(config.hidden_sizes)):
for block_id in range(config.depths[stage_idx]):
from_key = f"block{stage_idx + 1}.{block_id}.layer_scale_1"
to_key = f"van.encoder.stages.{stage_idx}.layers.{block_id}.attention_scaling.weight"
all_keys.append((from_key, to_key))
from_key = f"block{stage_idx + 1}.{block_id}.layer_scale_2"
to_key = f"van.encoder.stages.{stage_idx}.layers.{block_id}.mlp_scaling.weight"
all_keys.append((from_key, to_key))
for from_key, to_key in all_keys:
our_state_dict[to_key] = from_state_dict.pop(from_key)
our_model.load_state_dict(our_state_dict)
return our_model
def convert_weight_and_push(
name: str,
config: VanConfig,
checkpoint: str,
from_model: nn.Module,
save_directory: Path,
push_to_hub: bool = True,
):
print(f"Downloading weights for {name}...")
checkpoint_path = cached_download(checkpoint)
print(f"Converting {name}...")
from_state_dict = torch.load(checkpoint_path)["state_dict"]
from_model.load_state_dict(from_state_dict)
from_model.eval()
with torch.no_grad():
our_model = VanForImageClassification(config).eval()
module_transfer = ModuleTransfer(src=from_model, dest=our_model)
x = torch.randn((1, 3, 224, 224))
module_transfer(x)
our_model = copy_parameters(from_model, our_model)
if not torch.allclose(from_model(x), our_model(x).logits):
raise ValueError("The model logits don't match the original one.")
checkpoint_name = name
print(checkpoint_name)
if push_to_hub:
our_model.push_to_hub(
repo_path_or_name=save_directory / checkpoint_name,
commit_message="Add model",
use_temp_dir=True,
)
image_processor = AutoImageProcessor.from_pretrained("facebook/convnext-base-224-22k-1k")
image_processor.push_to_hub(
repo_path_or_name=save_directory / checkpoint_name,
commit_message="Add image processor",
use_temp_dir=True,
)
print(f"Pushed {checkpoint_name}")
def convert_weights_and_push(save_directory: Path, model_name: str = None, push_to_hub: bool = True):
filename = "imagenet-1k-id2label.json"
num_labels = 1000
repo_id = "huggingface/label-files"
id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r"))
id2label = {int(k): v for k, v in id2label.items()}
id2label = id2label
label2id = {v: k for k, v in id2label.items()}
ImageNetPreTrainedConfig = partial(VanConfig, num_labels=num_labels, id2label=id2label, label2id=label2id)
names_to_config = {
"van-tiny": ImageNetPreTrainedConfig(
hidden_sizes=[32, 64, 160, 256],
depths=[3, 3, 5, 2],
mlp_ratios=[8, 8, 4, 4],
),
"van-small": ImageNetPreTrainedConfig(
hidden_sizes=[64, 128, 320, 512],
depths=[2, 2, 4, 2],
mlp_ratios=[8, 8, 4, 4],
),
"van-base": ImageNetPreTrainedConfig(
hidden_sizes=[64, 128, 320, 512],
depths=[3, 3, 12, 3],
mlp_ratios=[8, 8, 4, 4],
),
"van-large": ImageNetPreTrainedConfig(
hidden_sizes=[64, 128, 320, 512],
depths=[3, 5, 27, 3],
mlp_ratios=[8, 8, 4, 4],
),
}
names_to_original_models = {
"van-tiny": van_tiny,
"van-small": van_small,
"van-base": van_base,
"van-large": van_large,
}
names_to_original_checkpoints = {
"van-tiny": (
"https://huggingface.co/Visual-Attention-Network/VAN-Tiny-original/resolve/main/van_tiny_754.pth.tar"
),
"van-small": (
"https://huggingface.co/Visual-Attention-Network/VAN-Small-original/resolve/main/van_small_811.pth.tar"
),
"van-base": (
"https://huggingface.co/Visual-Attention-Network/VAN-Base-original/resolve/main/van_base_828.pth.tar"
),
"van-large": (
"https://huggingface.co/Visual-Attention-Network/VAN-Large-original/resolve/main/van_large_839.pth.tar"
),
}
if model_name:
convert_weight_and_push(
model_name,
names_to_config[model_name],
checkpoint=names_to_original_checkpoints[model_name],
from_model=names_to_original_models[model_name](),
save_directory=save_directory,
push_to_hub=push_to_hub,
)
else:
for model_name, config in names_to_config.items():
convert_weight_and_push(
model_name,
config,
checkpoint=names_to_original_checkpoints[model_name],
from_model=names_to_original_models[model_name](),
save_directory=save_directory,
push_to_hub=push_to_hub,
)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--model-name",
default=None,
type=str,
help=(
"The name of the model you wish to convert, it must be one of the supported resnet* architecture,"
" currently: van-tiny/small/base/large. If `None`, all of them will the converted."
),
)
parser.add_argument(
"--pytorch_dump_folder_path",
default=None,
type=Path,
required=True,
help="Path to the output PyTorch model directory.",
)
parser.add_argument(
"--van_dir",
required=True,
type=Path,
help=(
"A path to VAN's original implementation directory. You can download from here:"
" https://github.com/Visual-Attention-Network/VAN-Classification"
),
)
parser.add_argument(
"--push_to_hub",
default=True,
type=bool,
required=False,
help="If True, push model and image processor to the hub.",
)
args = parser.parse_args()
pytorch_dump_folder_path: Path = args.pytorch_dump_folder_path
pytorch_dump_folder_path.mkdir(exist_ok=True, parents=True)
van_dir = args.van_dir
sys.path.append(str(van_dir.parent))
from van.models.van import van_base, van_large, van_small, van_tiny
convert_weights_and_push(pytorch_dump_folder_path, args.model_name, args.push_to_hub)
.\models\deprecated\van\modeling_van.py
"""
PyTorch Visual Attention Network (VAN) model.
"""
import math
from collections import OrderedDict
from typing import Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ....activations import ACT2FN
from ....modeling_outputs import (
BaseModelOutputWithNoAttention,
BaseModelOutputWithPoolingAndNoAttention,
ImageClassifierOutputWithNoAttention,
)
from ....modeling_utils import PreTrainedModel
from ....utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
from .configuration_van import VanConfig
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "VanConfig"
_CHECKPOINT_FOR_DOC = "Visual-Attention-Network/van-base"
_EXPECTED_OUTPUT_SHAPE = [1, 512, 7, 7]
_IMAGE_CLASS_CHECKPOINT = "Visual-Attention-Network/van-base"
_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat"
VAN_PRETRAINED_MODEL_ARCHIVE_LIST = [
"Visual-Attention-Network/van-base",
]
def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
"""
按样本(在残差块的主路径中应用)丢弃路径(随机深度)。
Comment by Ross Wightman: 这与我为 EfficientNet 等网络创建的 DropConnect 实现相同,
然而,原始名称具有误导性,因为“Drop Connect”是另一篇论文中的不同形式的 dropout…
参见讨论: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 …
我选择更改层和参数名称为 'drop path',而不是将 DropConnect 作为层名称,并使用 'survival rate' 作为参数。
"""
if drop_prob == 0.0 or not training:
return input
keep_prob = 1 - drop_prob
shape = (input.shape[0],) + (1,) * (input.ndim - 1)
random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
random_tensor.floor_()
output = input.div(keep_prob) * random_tensor
return output
class VanDropPath(nn.Module):
"""每个样本使用丢弃路径(Stochastic Depth)(当应用于残差块的主路径时)。"""
def __init__(self, drop_prob: Optional[float] = None) -> None:
super().__init__()
self.drop_prob = drop_prob
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
return drop_path(hidden_states, self.drop_prob, self.training)
def extra_repr(self) -> str:
return "p={}".format(self.drop_prob)
class VanOverlappingPatchEmbedder(nn.Module):
"""
使用 patchify 操作对输入进行下采样,默认使用步幅为 4 的窗口使相邻窗口重叠一半区域。
来自 [PVTv2: Improved Baselines with Pyramid Vision Transformer](https://arxiv.org/abs/2106.13797)。
"""
def __init__(self, in_channels: int, hidden_size: int, patch_size: int = 7, stride: int = 4):
super().__init__()
self.convolution = nn.Conv2d(
in_channels, hidden_size, kernel_size=patch_size, stride=stride, padding=patch_size // 2
)
self.normalization = nn.BatchNorm2d(hidden_size)
def forward(self, input: torch.Tensor) -> torch.Tensor:
hidden_state = self.convolution(input)
hidden_state = self.normalization(hidden_state)
return hidden_state
class VanMlpLayer(nn.Module):
"""
带有深度卷积的 MLP,来自 [PVTv2: Improved Baselines with Pyramid Vision Transformer](https://arxiv.org/abs/2106.13797)。
"""
def __init__(
self,
in_channels: int,
hidden_size: int,
out_channels: int,
hidden_act: str = "gelu",
dropout_rate: float = 0.5,
):
super().__init__()
self.in_dense = nn.Conv2d(in_channels, hidden_size, kernel_size=1)
self.depth_wise = nn.Conv2d(hidden_size, hidden_size, kernel_size=3, padding=1, groups=hidden_size)
self.activation = ACT2FN[hidden_act]
self.dropout1 = nn.Dropout(dropout_rate)
self.out_dense = nn.Conv2d(hidden_size, out_channels, kernel_size=1)
self.dropout2 = nn.Dropout(dropout_rate)
def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
hidden_state = self.in_dense(hidden_state)
hidden_state = self.depth_wise(hidden_state)
hidden_state = self.activation(hidden_state)
hidden_state = self.dropout1(hidden_state)
hidden_state = self.out_dense(hidden_state)
hidden_state = self.dropout2(hidden_state)
return hidden_state
class VanLargeKernelAttention(nn.Module):
"""
基础的大核注意力(LKA)。
"""
def __init__(self, hidden_size: int):
super().__init__()
self.depth_wise = nn.Conv2d(hidden_size, hidden_size, kernel_size=5, padding=2, groups=hidden_size)
self.depth_wise_dilated = nn.Conv2d(
hidden_size, hidden_size, kernel_size=7, dilation=3, padding=9, groups=hidden_size
)
self.point_wise = nn.Conv2d(hidden_size, hidden_size, kernel_size=1)
def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
hidden_state = self.depth_wise(hidden_state)
hidden_state = self.depth_wise_dilated(hidden_state)
hidden_state = self.point_wise(hidden_state)
return hidden_state
class VanLargeKernelAttentionLayer(nn.Module):
"""
Computes attention using Large Kernel Attention (LKA) and attends the input.
"""
def __init__(self, hidden_size: int):
super().__init__()
self.attention = VanLargeKernelAttention(hidden_size)
def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
attention = self.attention(hidden_state)
attended = hidden_state * attention
return attended
class VanSpatialAttentionLayer(nn.Module):
"""
Van spatial attention layer composed by projection (via conv) -> act -> Large Kernel Attention (LKA) attention ->
projection (via conv) + residual connection.
"""
def __init__(self, hidden_size: int, hidden_act: str = "gelu"):
super().__init__()
self.pre_projection = nn.Sequential(
OrderedDict(
[
("conv", nn.Conv2d(hidden_size, hidden_size, kernel_size=1)),
("act", ACT2FN[hidden_act]),
]
)
)
self.attention_layer = VanLargeKernelAttentionLayer(hidden_size)
self.post_projection = nn.Conv2d(hidden_size, hidden_size, kernel_size=1)
def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
residual = hidden_state
hidden_state = self.pre_projection(hidden_state)
hidden_state = self.attention_layer(hidden_state)
hidden_state = self.post_projection(hidden_state)
hidden_state = hidden_state + residual
return hidden_state
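The spatial attention layer is a gated residual block: project, activate, gate with LKA, project again, then add the input back. Because of the residual connection, zeroing the final projection turns the whole block into an identity map, which makes a handy sanity check (same assumed import path):

```python
import torch
from torch import nn

from transformers.models.deprecated.van.modeling_van import VanSpatialAttentionLayer  # assumed path, as above

layer = VanSpatialAttentionLayer(hidden_size=32, hidden_act="gelu").eval()
hidden_state = torch.randn(1, 32, 28, 28)

# Zero the last projection: the gated branch contributes nothing, so only the residual survives.
nn.init.zeros_(layer.post_projection.weight)
nn.init.zeros_(layer.post_projection.bias)
with torch.no_grad():
    output = layer(hidden_state)
print(torch.allclose(output, hidden_state))  # True
```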
class VanLayerScaling(nn.Module):
"""
Scales the inputs by a learnable parameter initialized by `initial_value`.
"""
def __init__(self, hidden_size: int, initial_value: float = 1e-2):
super().__init__()
self.weight = nn.Parameter(initial_value * torch.ones((hidden_size)), requires_grad=True)
def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
hidden_state = self.weight.unsqueeze(-1).unsqueeze(-1) * hidden_state
return hidden_state
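The two `unsqueeze(-1)` calls turn the learnable per-channel weight of shape `(hidden_size,)` into `(hidden_size, 1, 1)` so that it broadcasts over height and width. A tiny sketch (same assumed import path):

```python
import torch

from transformers.models.deprecated.van.modeling_van import VanLayerScaling  # assumed path, as above

scaling = VanLayerScaling(hidden_size=4, initial_value=1e-2)
hidden_state = torch.ones(1, 4, 2, 2)

# Every spatial position of channel c is multiplied by weight[c]; right after init that is 1e-2.
output = scaling(hidden_state)
print(output[0, :, 0, 0])  # tensor([0.0100, 0.0100, 0.0100, 0.0100], grad_fn=...)
```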
class VanLayer(nn.Module):
"""
Van layer composed by normalization layers, large kernel attention (LKA) and a multi layer perceptron (MLP).
"""
def __init__(
self,
config: VanConfig,
hidden_size: int,
mlp_ratio: int = 4,
drop_path_rate: float = 0.5,
):
super().__init__()
self.drop_path = VanDropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
self.pre_normomalization = nn.BatchNorm2d(hidden_size)
self.attention = VanSpatialAttentionLayer(hidden_size, config.hidden_act)
self.attention_scaling = VanLayerScaling(hidden_size, config.layer_scale_init_value)
self.post_normalization = nn.BatchNorm2d(hidden_size)
self.mlp = VanMlpLayer(
hidden_size, hidden_size * mlp_ratio, hidden_size, config.hidden_act, config.dropout_rate
)
self.mlp_scaling = VanLayerScaling(hidden_size, config.layer_scale_init_value)
def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
residual = hidden_state
hidden_state = self.pre_normomalization(hidden_state)
hidden_state = self.attention(hidden_state)
hidden_state = self.attention_scaling(hidden_state)
hidden_state = self.drop_path(hidden_state)
hidden_state = residual + hidden_state
residual = hidden_state
hidden_state = self.post_normalization(hidden_state)
hidden_state = self.mlp(hidden_state)
hidden_state = self.mlp_scaling(hidden_state)
hidden_state = self.drop_path(hidden_state)
hidden_state = residual + hidden_state
return hidden_state
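`VanLayer` stacks two pre-normalized residual sub-blocks, spatial attention then the MLP, each followed by layer scaling and stochastic depth. A sketch of a single layer built from a default `VanConfig`; the config class is assumed to still be exported from `transformers`, and `eval()` turns drop path and dropout into no-ops:

```python
import torch

# Assumed imports: VanConfig from the top-level package, VanLayer from the modeling file above.
from transformers import VanConfig
from transformers.models.deprecated.van.modeling_van import VanLayer

config = VanConfig()
layer = VanLayer(config, hidden_size=64, mlp_ratio=4, drop_path_rate=0.1)
layer.eval()  # drop_path and dropout are inactive outside training

hidden_state = torch.randn(1, 64, 56, 56)
with torch.no_grad():
    output = layer(hidden_state)
print(output.shape)  # torch.Size([1, 64, 56, 56]); both residual branches preserve the shape
```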
class VanEncoder(nn.Module):
    """
    VanEncoder, consisting of multiple stages.
    """
    def forward(
        self,
        hidden_state: torch.Tensor,
        output_hidden_states: Optional[bool] = False,
        return_dict: Optional[bool] = True,
    ) -> Union[Tuple, BaseModelOutputWithNoAttention]:
all_hidden_states = () if output_hidden_states else None
for _, stage_module in enumerate(self.stages):
hidden_state = stage_module(hidden_state)
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_state,)
if not return_dict:
return tuple(v for v in [hidden_state, all_hidden_states] if v is not None)
return BaseModelOutputWithNoAttention(last_hidden_state=hidden_state, hidden_states=all_hidden_states)
class VanPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = VanConfig
base_model_prefix = "van"
main_input_name = "pixel_values"
supports_gradient_checkpointing = True
def _init_weights(self, module):
"""Initialize the weights"""
if isinstance(module, nn.Linear):
nn.init.trunc_normal_(module.weight, std=self.config.initializer_range)
if isinstance(module, nn.Linear) and module.bias is not None:
nn.init.constant_(module.bias, 0)
elif isinstance(module, nn.LayerNorm):
nn.init.constant_(module.bias, 0)
nn.init.constant_(module.weight, 1.0)
elif isinstance(module, nn.Conv2d):
fan_out = module.kernel_size[0] * module.kernel_size[1] * module.out_channels
fan_out //= module.groups
module.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
if module.bias is not None:
module.bias.data.zero_()
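For `nn.Conv2d` modules this is the classic He (fan-out) initialization: `fan_out = kernel_h * kernel_w * out_channels / groups` and the weights are drawn from a zero-mean normal with std `sqrt(2 / fan_out)`. A small numeric check of the formula, independent of any checkpoint:

```python
import math
from torch import nn

conv = nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, padding=1, groups=1)

fan_out = conv.kernel_size[0] * conv.kernel_size[1] * conv.out_channels // conv.groups
std = math.sqrt(2.0 / fan_out)
print(fan_out, round(std, 4))  # 576 0.0589

# Re-initialize the way _init_weights does and look at the empirical std of the weights.
conv.weight.data.normal_(0, std)
print(round(conv.weight.std().item(), 3))  # ~0.059
```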
VAN_START_DOCSTRING = r"""
This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
Parameters:
config ([`VanConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
VAN_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
[`ConvNextImageProcessor.__call__`] for details.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all stages. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@add_start_docstrings(
"The bare VAN model outputting raw features without any specific head on top. Note, VAN does not have an embedding"
" layer.",
VAN_START_DOCSTRING,
)
class VanModel(VanPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.config = config
self.encoder = VanEncoder(config)
self.layernorm = nn.LayerNorm(config.hidden_sizes[-1], eps=config.layer_norm_eps)
self.post_init()
@add_start_docstrings_to_model_forward(VAN_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPoolingAndNoAttention,
config_class=_CONFIG_FOR_DOC,
modality="vision",
expected_output=_EXPECTED_OUTPUT_SHAPE,
)
def forward(
self,
pixel_values: Optional[torch.FloatTensor],
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPoolingAndNoAttention]:
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
encoder_outputs = self.encoder(
pixel_values,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
last_hidden_state = encoder_outputs[0]
pooled_output = last_hidden_state.mean(dim=[-2, -1])
if not return_dict:
return (last_hidden_state, pooled_output) + encoder_outputs[1:]
return BaseModelOutputWithPoolingAndNoAttention(
last_hidden_state=last_hidden_state,
pooler_output=pooled_output,
hidden_states=encoder_outputs.hidden_states,
)
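End to end, `VanModel` returns the last feature map together with a pooled vector obtained by averaging over the two spatial dimensions. A usage sketch with a randomly initialized model (no pretrained weights, so only the shapes are meaningful); `VanConfig`/`VanModel` are assumed to be importable from `transformers`:

```python
import torch

from transformers import VanConfig, VanModel  # assumed to remain exported for the deprecated model

config = VanConfig()
model = VanModel(config).eval()

pixel_values = torch.randn(1, config.num_channels, 224, 224)
with torch.no_grad():
    outputs = model(pixel_values, output_hidden_states=True)

print(outputs.last_hidden_state.shape)  # (1, config.hidden_sizes[-1], 7, 7) for 224x224 inputs
print(outputs.pooler_output.shape)      # (1, config.hidden_sizes[-1]): mean over height and width
print(len(outputs.hidden_states))       # one feature map per stage
```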
@add_start_docstrings(
"""
    VAN Model with an image classification head on top (a linear layer on top of the pooled features), e.g. for ImageNet.
""",
VAN_START_DOCSTRING,
)
class VanForImageClassification(VanPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.van = VanModel(config)
self.classifier = (
nn.Linear(config.hidden_sizes[-1], config.num_labels) if config.num_labels > 0 else nn.Identity()
)
self.post_init()
@add_start_docstrings_to_model_forward(VAN_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=ImageClassifierOutputWithNoAttention,
config_class=_CONFIG_FOR_DOC,
expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
)
def forward(
self,
pixel_values: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, ImageClassifierOutputWithNoAttention]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.van(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict)
pooled_output = outputs.pooler_output if return_dict else outputs[1]
logits = self.classifier(pooled_output)
loss = None
if labels is not None:
if self.config.problem_type is None:
if self.config.num_labels == 1:
self.config.problem_type = "regression"
elif self.config.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
self.config.problem_type = "single_label_classification"
else:
self.config.problem_type = "multi_label_classification"
if self.config.problem_type == "regression":
loss_fct = MSELoss()
if self.config.num_labels == 1:
loss = loss_fct(logits.squeeze(), labels.squeeze())
else:
loss = loss_fct(logits, labels)
elif self.config.problem_type == "single_label_classification":
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(logits, labels)
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return ImageClassifierOutputWithNoAttention(loss=loss, logits=logits, hidden_states=outputs.hidden_states)
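The classification head is a single linear layer on the pooled features, and the loss is picked from `problem_type`: MSE for regression, cross-entropy for integer class labels, BCE-with-logits for multi-label targets. A sketch of the single-label path with an untrained model (assumed top-level imports, so the loss value itself is meaningless):

```python
import torch

from transformers import VanConfig, VanForImageClassification  # assumed to remain exported

config = VanConfig(num_labels=10)
model = VanForImageClassification(config).eval()

pixel_values = torch.randn(2, config.num_channels, 224, 224)
labels = torch.tensor([1, 7])  # integer labels -> "single_label_classification" -> CrossEntropyLoss

outputs = model(pixel_values, labels=labels)
print(outputs.logits.shape)  # torch.Size([2, 10])
print(outputs.loss)          # scalar cross-entropy loss
```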
.\models\deprecated\van\__init__.py
from typing import TYPE_CHECKING
from ....utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available
_import_structure = {"configuration_van": ["VAN_PRETRAINED_CONFIG_ARCHIVE_MAP", "VanConfig"]}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_van"] = [
"VAN_PRETRAINED_MODEL_ARCHIVE_LIST",
"VanForImageClassification",
"VanModel",
"VanPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_van import VAN_PRETRAINED_CONFIG_ARCHIVE_MAP, VanConfig
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_van import (
VAN_PRETRAINED_MODEL_ARCHIVE_LIST,
VanForImageClassification,
VanModel,
VanPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure)
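With `_LazyModule`, importing the package only registers the import structure; submodules are loaded on first attribute access, while the `TYPE_CHECKING` branch gives static type checkers the real imports. A small sketch of the observable behaviour (module path assumed to match the file above; exact timing can vary if something else in the process already imported the submodule):

```python
import sys

# Importing the package installs the lazy module but does not load its submodules yet.
import transformers.models.deprecated.van as van_package

print("transformers.models.deprecated.van.configuration_van" in sys.modules)  # typically False

# The first attribute access makes _LazyModule import the submodule that defines VanConfig.
_ = van_package.VanConfig
print("transformers.models.deprecated.van.configuration_van" in sys.modules)  # True
```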
.\models\deprecated\__init__.py
import os
import sys
def list_files(directory):
files = []
for dirpath, _, filenames in os.walk(directory):
for f in filenames:
files.append(os.path.abspath(os.path.join(dirpath, f)))
return files
if __name__ == "__main__":
if len(sys.argv) != 2:
print("Usage: python list_files.py directory")
sys.exit(1)
directory = sys.argv[1]
files = list_files(directory)
for f in files:
print(f)
.\models\depth_anything\configuration_depth_anything.py
""" DepthAnything model configuration"""
import copy
from ...configuration_utils import PretrainedConfig
from ...utils import logging
from ..auto.configuration_auto import CONFIG_MAPPING
logger = logging.get_logger(__name__)
DEPTH_ANYTHING_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"LiheYoung/depth-anything-small-hf": "https://huggingface.co/LiheYoung/depth-anything-small-hf/resolve/main/config.json",
}
class DepthAnythingConfig(PretrainedConfig):
r"""
    This is the configuration class to store the configuration of a [`DepthAnythingModel`]. It is used to instantiate a DepthAnything
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the DepthAnything
[LiheYoung/depth-anything-small-hf](https://huggingface.co/LiheYoung/depth-anything-small-hf) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
backbone_config (`Union[Dict[str, Any], PretrainedConfig]`, *optional*):
The configuration of the backbone model. Only used in case `is_hybrid` is `True` or in case you want to
leverage the [`AutoBackbone`] API.
backbone (`str`, *optional*):
Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this
will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone`
is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights.
use_pretrained_backbone (`bool`, *optional*, defaults to `False`):
Whether to use pretrained weights for the backbone.
patch_size (`int`, *optional*, defaults to 14):
The size of the patches to extract from the backbone features.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
reassemble_hidden_size (`int`, *optional*, defaults to 384):
The number of input channels of the reassemble layers.
reassemble_factors (`List[int]`, *optional*, defaults to `[4, 2, 1, 0.5]`):
The up/downsampling factors of the reassemble layers.
        neck_hidden_sizes (`List[int]`, *optional*, defaults to `[48, 96, 192, 384]`):
The hidden sizes to project to for the feature maps of the backbone.
fusion_hidden_size (`int`, *optional*, defaults to 64):
The number of channels before fusion.
head_in_index (`int`, *optional*, defaults to -1):
The index of the features to use in the depth estimation head.
head_hidden_size (`int`, *optional*, defaults to 32):
The number of output channels in the second convolution of the depth estimation head.
Example:
```
>>> from transformers import DepthAnythingConfig, DepthAnythingForDepthEstimation
>>> # Initializing a DepthAnything small style configuration
>>> configuration = DepthAnythingConfig()
>>> # Initializing a model from the DepthAnything small style configuration
>>> model = DepthAnythingForDepthEstimation(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
    ```"""
# Define `model_type` as "depth_anything" for identifying the model type.
model_type = "depth_anything"
# Constructor for initializing the DepthAnythingForDepthEstimation class.
def __init__(
self,
backbone_config=None,
backbone=None,
use_pretrained_backbone=False,
patch_size=14,
initializer_range=0.02,
reassemble_hidden_size=384,
reassemble_factors=[4, 2, 1, 0.5],
neck_hidden_sizes=[48, 96, 192, 384],
fusion_hidden_size=64,
head_in_index=-1,
head_hidden_size=32,
**kwargs,
):
        # Call the parent class initializer, forwarding all remaining keyword arguments
        super().__init__(**kwargs)
if use_pretrained_backbone:
            # Pretrained backbones are not supported yet, so raise a ValueError
raise ValueError("Pretrained backbones are not supported yet.")
if backbone_config is not None and backbone is not None:
            # `backbone` and `backbone_config` cannot both be specified, so raise a ValueError
raise ValueError("You can't specify both `backbone` and `backbone_config`.")
if backbone_config is None and backbone is None:
            # Neither `backbone_config` nor `backbone` was given: log it and fall back to the default `Dinov2` backbone configuration
logger.info("`backbone_config` is `None`. Initializing the config with the default `Dinov2` backbone.")
backbone_config = CONFIG_MAPPING["dinov2"](
image_size=518,
hidden_size=384,
num_attention_heads=6,
out_indices=[9, 10, 11, 12],
apply_layernorm=True,
reshape_hidden_states=False,
)
elif isinstance(backbone_config, dict):
            # `backbone_config` is a dict: look up the config class from its `model_type` and build `backbone_config` from the dict
backbone_model_type = backbone_config.get("model_type")
config_class = CONFIG_MAPPING[backbone_model_type]
backbone_config = config_class.from_dict(backbone_config)
self.backbone_config = backbone_config
self.backbone = backbone
self.use_pretrained_backbone = use_pretrained_backbone
self.reassemble_hidden_size = reassemble_hidden_size
self.patch_size = patch_size
self.initializer_range = initializer_range
self.reassemble_factors = reassemble_factors
self.neck_hidden_sizes = neck_hidden_sizes
self.fusion_hidden_size = fusion_hidden_size
self.head_in_index = head_in_index
self.head_hidden_size = head_hidden_size
def to_dict(self):
"""
        Serializes this instance to a Python dictionary. Overrides the default `PretrainedConfig.to_dict`. Returns:
            `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance.
"""
output = copy.deepcopy(self.__dict__)
if output["backbone_config"] is not None:
            # If `backbone_config` is set, serialize it to a dict as well
output["backbone_config"] = self.backbone_config.to_dict()
output["model_type"] = self.__class__.model_type
return output
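Because `backbone_config` is itself a `PretrainedConfig`, the overridden `to_dict` serializes it recursively so the whole configuration can round-trip through JSON, and it pins `model_type` to the class attribute. A short sketch (exact keys depend on the installed version):

```python
from transformers import DepthAnythingConfig

config = DepthAnythingConfig()  # no backbone given, so the default Dinov2 backbone config is created
serialized = config.to_dict()

print(type(serialized["backbone_config"]))          # <class 'dict'>: the nested config became a plain dict
print(serialized["backbone_config"]["model_type"])  # dinov2
print(serialized["model_type"])                     # depth_anything
```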