Transformers Source Code Walkthrough (Part 35)
.\models\deformable_detr\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available
_import_structure = {
"configuration_deformable_detr": ["DEFORMABLE_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP", "DeformableDetrConfig"],
}
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["feature_extraction_deformable_detr"] = ["DeformableDetrFeatureExtractor"]
_import_structure["image_processing_deformable_detr"] = ["DeformableDetrImageProcessor"]
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_deformable_detr"] = [
"DEFORMABLE_DETR_PRETRAINED_MODEL_ARCHIVE_LIST",
"DeformableDetrForObjectDetection",
"DeformableDetrModel",
"DeformableDetrPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_deformable_detr import DEFORMABLE_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP, DeformableDetrConfig
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .feature_extraction_deformable_detr import DeformableDetrFeatureExtractor
from .image_processing_deformable_detr import DeformableDetrImageProcessor
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_deformable_detr import (
DEFORMABLE_DETR_PRETRAINED_MODEL_ARCHIVE_LIST,
DeformableDetrForObjectDetection,
DeformableDetrModel,
DeformableDetrPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
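The `_LazyModule` swap above keeps `import transformers` cheap: the vision and torch submodules listed in `_import_structure` are only loaded when one of their exported names is first accessed. A minimal sketch of the effect (assuming `transformers` and torch are installed):

```python
# The configuration import is lightweight; modeling_deformable_detr is not loaded yet.
from transformers import DeformableDetrConfig

config = DeformableDetrConfig()
print(config.model_type)  # "deformable_detr"

# Accessing the model class is what triggers the lazy import of the torch modeling code.
from transformers import DeformableDetrModel

model = DeformableDetrModel(config)  # randomly initialized weights
```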
.\models\deit\configuration_deit.py
""" DeiT model configuration"""
from collections import OrderedDict
from typing import Mapping
from packaging import version
from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfig
from ...utils import logging
logger = logging.get_logger(__name__)
DEIT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"facebook/deit-base-distilled-patch16-224": (
"https://huggingface.co/facebook/deit-base-patch16-224/resolve/main/config.json"
),
}
class DeiTConfig(PretrainedConfig):
r"""
    This is the configuration class to store the configuration of a [`DeiTModel`]. It is used to instantiate a DeiT
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the DeiT
[facebook/deit-base-distilled-patch16-224](https://huggingface.co/facebook/deit-base-distilled-patch16-224)
architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
"""
model_type = "deit"
def __init__(
self,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act="gelu",
hidden_dropout_prob=0.0,
attention_probs_dropout_prob=0.0,
initializer_range=0.02,
layer_norm_eps=1e-12,
image_size=224,
patch_size=16,
num_channels=3,
qkv_bias=True,
encoder_stride=16,
**kwargs,
):
super().__init__(**kwargs)
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.hidden_act = hidden_act
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.image_size = image_size
self.patch_size = patch_size
self.num_channels = num_channels
self.qkv_bias = qkv_bias
self.encoder_stride = encoder_stride
class DeiTOnnxConfig(OnnxConfig):
torch_onnx_minimum_version = version.parse("1.11")
@property
def inputs(self) -> Mapping[str, Mapping[int, str]]:
return OrderedDict(
[
("pixel_values", {0: "batch", 1: "num_channels", 2: "height", 3: "width"}),
]
)
@property
def atol_for_validation(self) -> float:
return 1e-4
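A minimal usage sketch for the configuration above (assuming torch is installed): the default arguments reproduce a deit-base-distilled-patch16-224 style architecture, and a randomly initialized model can be built straight from it.

```python
from transformers import DeiTConfig, DeiTModel

configuration = DeiTConfig()      # deit-base style defaults: 12 layers, hidden_size 768, 224x224 images
model = DeiTModel(configuration)  # randomly initialized model
print(model.config.hidden_size)   # 768
```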
.\models\deit\convert_deit_timm_to_pytorch.py
"""Convert DeiT distilled checkpoints from the timm library."""
import argparse
import json
from pathlib import Path
import requests
import timm
import torch
from huggingface_hub import hf_hub_download
from PIL import Image
from transformers import DeiTConfig, DeiTForImageClassificationWithTeacher, DeiTImageProcessor
from transformers.utils import logging
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
def create_rename_keys(config, base_model=False):
rename_keys = []
for i in range(config.num_hidden_layers):
rename_keys.append((f"blocks.{i}.norm1.weight", f"deit.encoder.layer.{i}.layernorm_before.weight"))
rename_keys.append((f"blocks.{i}.norm1.bias", f"deit.encoder.layer.{i}.layernorm_before.bias"))
rename_keys.append((f"blocks.{i}.attn.proj.weight", f"deit.encoder.layer.{i}.attention.output.dense.weight"))
rename_keys.append((f"blocks.{i}.attn.proj.bias", f"deit.encoder.layer.{i}.attention.output.dense.bias"))
rename_keys.append((f"blocks.{i}.norm2.weight", f"deit.encoder.layer.{i}.layernorm_after.weight"))
rename_keys.append((f"blocks.{i}.norm2.bias", f"deit.encoder.layer.{i}.layernorm_after.bias"))
rename_keys.append((f"blocks.{i}.mlp.fc1.weight", f"deit.encoder.layer.{i}.intermediate.dense.weight"))
rename_keys.append((f"blocks.{i}.mlp.fc1.bias", f"deit.encoder.layer.{i}.intermediate.dense.bias"))
rename_keys.append((f"blocks.{i}.mlp.fc2.weight", f"deit.encoder.layer.{i}.output.dense.weight"))
rename_keys.append((f"blocks.{i}.mlp.fc2.bias", f"deit.encoder.layer.{i}.output.dense.bias"))
rename_keys.extend(
[
("cls_token", "deit.embeddings.cls_token"),
("dist_token", "deit.embeddings.distillation_token"),
("patch_embed.proj.weight", "deit.embeddings.patch_embeddings.projection.weight"),
("patch_embed.proj.bias", "deit.embeddings.patch_embeddings.projection.bias"),
("pos_embed", "deit.embeddings.position_embeddings"),
]
)
if base_model:
rename_keys.extend(
[
("norm.weight", "layernorm.weight"),
("norm.bias", "layernorm.bias"),
("pre_logits.fc.weight", "pooler.dense.weight"),
("pre_logits.fc.bias", "pooler.dense.bias"),
]
)
rename_keys = [(pair[0], pair[1][4:]) if pair[1].startswith("deit") else pair for pair in rename_keys]
else:
rename_keys.extend(
[
("norm.weight", "deit.layernorm.weight"),
("norm.bias", "deit.layernorm.bias"),
("head.weight", "cls_classifier.weight"),
("head.bias", "cls_classifier.bias"),
("head_dist.weight", "distillation_classifier.weight"),
("head_dist.bias", "distillation_classifier.bias"),
]
)
return rename_keys
def read_in_q_k_v(state_dict, config, base_model=False):
for i in range(config.num_hidden_layers):
if base_model:
prefix = ""
else:
prefix = "deit."
in_proj_weight = state_dict.pop(f"blocks.{i}.attn.qkv.weight")
in_proj_bias = state_dict.pop(f"blocks.{i}.attn.qkv.bias")
state_dict[f"{prefix}encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[
: config.hidden_size, :
]
state_dict[f"{prefix}encoder.layer.{i}.attention.attention.query.bias"] = in_proj_bias[: config.hidden_size]
state_dict[f"{prefix}encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[
config.hidden_size : config.hidden_size * 2, :
]
state_dict[f"{prefix}encoder.layer.{i}.attention.attention.key.bias"] = in_proj_bias[
config.hidden_size : config.hidden_size * 2
]
state_dict[f"{prefix}encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[
-config.hidden_size :, :
]
state_dict[f"{prefix}encoder.layer.{i}.attention.attention.value.bias"] = in_proj_bias[-config.hidden_size :]
def rename_key(dct, old, new):
val = dct.pop(old)
dct[new] = val
def prepare_img():
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
im = Image.open(requests.get(url, stream=True).raw)
return im
@torch.no_grad()
def convert_deit_checkpoint(deit_name, pytorch_dump_folder_path):
"""
将模型权重从其它结构复制、粘贴并调整到我们的 DeiT 结构中。
"""
config = DeiTConfig()
base_model = False
config.num_labels = 1000
repo_id = "huggingface/label-files"
filename = "imagenet-1k-id2label.json"
id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r"))
id2label = {int(k): v for k, v in id2label.items()}
config.id2label = id2label
config.label2id = {v: k for k, v in id2label.items()}
config.patch_size = int(deit_name[-6:-4])
config.image_size = int(deit_name[-3:])
if deit_name[9:].startswith("tiny"):
config.hidden_size = 192
config.intermediate_size = 768
config.num_hidden_layers = 12
config.num_attention_heads = 3
elif deit_name[9:].startswith("small"):
config.hidden_size = 384
config.intermediate_size = 1536
config.num_hidden_layers = 12
config.num_attention_heads = 6
if deit_name[9:].startswith("base"):
pass
elif deit_name[4:].startswith("large"):
config.hidden_size = 1024
config.intermediate_size = 4096
config.num_hidden_layers = 24
config.num_attention_heads = 16
timm_model = timm.create_model(deit_name, pretrained=True)
timm_model.eval()
state_dict = timm_model.state_dict()
rename_keys = create_rename_keys(config, base_model)
for src, dest in rename_keys:
rename_key(state_dict, src, dest)
read_in_q_k_v(state_dict, config, base_model)
model = DeiTForImageClassificationWithTeacher(config).eval()
model.load_state_dict(state_dict)
size = int(
(256 / 224) * config.image_size
)
image_processor = DeiTImageProcessor(size=size, crop_size=config.image_size)
encoding = image_processor(images=prepare_img(), return_tensors="pt")
pixel_values = encoding["pixel_values"]
outputs = model(pixel_values)
timm_logits = timm_model(pixel_values)
assert timm_logits.shape == outputs.logits.shape
assert torch.allclose(timm_logits, outputs.logits, atol=1e-3)
Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
print(f"Saving model {deit_name} to {pytorch_dump_folder_path}")
model.save_pretrained(pytorch_dump_folder_path)
print(f"Saving image processor to {pytorch_dump_folder_path}")
image_processor.save_pretrained(pytorch_dump_folder_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--deit_name",
default="vit_deit_base_distilled_patch16_224",
type=str,
help="Name of the DeiT timm model you'd like to convert.",
)
parser.add_argument(
"--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory."
)
args = parser.parse_args()
convert_deit_checkpoint(args.deit_name, args.pytorch_dump_folder_path)
.\models\deit\feature_extraction_deit.py
"""DeiT 的特征提取器类。"""
import warnings
from ...utils import logging
from .image_processing_deit import DeiTImageProcessor
logger = logging.get_logger(__name__)
class DeiTFeatureExtractor(DeiTImageProcessor):
def __init__(self, *args, **kwargs) -> None:
warnings.warn(
"The class DeiTFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please"
" use DeiTImageProcessor instead.",
FutureWarning,
)
super().__init__(*args, **kwargs)
.\models\deit\image_processing_deit.py
from typing import Dict, List, Optional, Union
import numpy as np
from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
from ...image_transforms import resize, to_channel_dimension_format
from ...image_utils import (
IMAGENET_STANDARD_MEAN,
IMAGENET_STANDARD_STD,
ChannelDimension,
ImageInput,
PILImageResampling,
infer_channel_dimension_format,
is_scaled_image,
make_list_of_images,
to_numpy_array,
valid_images,
validate_kwargs,
validate_preprocess_arguments,
)
from ...utils import TensorType, is_vision_available, logging
if is_vision_available():
import PIL
logger = logging.get_logger(__name__)
class DeiTImageProcessor(BaseImageProcessor):
r"""
Constructs a DeiT image processor.
"""
Args:
do_resize (`bool`, *optional*, defaults to `True`):
Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by
`do_resize` in `preprocess`.
size (`Dict[str, int]` *optional*, defaults to `{"height": 256, "width": 256}`):
Size of the image after `resize`. Can be overridden by `size` in `preprocess`.
resample (`PILImageResampling` filter, *optional*, defaults to `Resampling.BICUBIC`):
Resampling filter to use if resizing the image. Can be overridden by `resample` in `preprocess`.
do_center_crop (`bool`, *optional*, defaults to `True`):
Whether to center crop the image. If the input size is smaller than `crop_size` along any edge, the image
is padded with 0's and then center cropped. Can be overridden by `do_center_crop` in `preprocess`.
crop_size (`Dict[str, int]`, *optional*, defaults to `{"height": 224, "width": 224}`):
Desired output size when applying center-cropping. Can be overridden by `crop_size` in `preprocess`.
rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the
`preprocess` method.
do_rescale (`bool`, *optional*, defaults to `True`):
Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale`
parameter in the `preprocess` method.
do_normalize (`bool`, *optional*, defaults to `True`):
Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
method.
image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`):
Mean to use if normalizing the image. This is a float or list of floats the length of the number of
channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
"""
    # The model expects a single input named "pixel_values"
    model_input_names = ["pixel_values"]
    # Constructor: sets the default value of every preprocessing step
    def __init__(
        self,
        do_resize: bool = True,
        size: Dict[str, int] = None,
        resample: PILImageResampling = PIL.Image.BICUBIC,
        do_center_crop: bool = True,
        crop_size: Dict[str, int] = None,
        rescale_factor: Union[int, float] = 1 / 255,
        do_rescale: bool = True,
        do_normalize: bool = True,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        **kwargs,
    ):
        # Forward any extra keyword arguments to the parent constructor
        super().__init__(**kwargs)
        # Default to {"height": 256, "width": 256} when no size is given
        size = size if size is not None else {"height": 256, "width": 256}
        # Normalize the size dictionary into the canonical format
        size = get_size_dict(size)
        # Default to {"height": 224, "width": 224} when no crop_size is given
        crop_size = crop_size if crop_size is not None else {"height": 224, "width": 224}
        # Normalize the crop_size dictionary, reporting errors under the name "crop_size"
        crop_size = get_size_dict(crop_size, param_name="crop_size")
        # Whether to resize the input images
        self.do_resize = do_resize
        # Target size used when resizing
        self.size = size
        # Resampling filter used when resizing
        self.resample = resample
        # Whether to center crop the images
        self.do_center_crop = do_center_crop
        # Target size used when center cropping
        self.crop_size = crop_size
        # Whether to rescale pixel values
        self.do_rescale = do_rescale
        # Factor used when rescaling pixel values
        self.rescale_factor = rescale_factor
        # Whether to normalize the images
        self.do_normalize = do_normalize
        # Fall back to the ImageNet mean when no mean is given
        self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN
        # Fall back to the ImageNet standard deviation when no std is given
        self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD
        # Keyword arguments accepted by `preprocess`, used for validation
        self._valid_processor_keys = [
            "images",
            "do_resize",
            "size",
            "resample",
            "do_center_crop",
            "crop_size",
            "do_rescale",
            "rescale_factor",
            "do_normalize",
            "image_mean",
            "image_std",
            "return_tensors",
            "data_format",
            "input_data_format",
        ]
    # Copied from transformers.models.vit.image_processing_vit.ViTImageProcessor.resize, with the resampling filter set to PILImageResampling.BICUBIC
def resize(
self,
image: np.ndarray,
size: Dict[str, int],
resample: PILImageResampling = PILImageResampling.BICUBIC,
data_format: Optional[Union[str, ChannelDimension]] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
) -> np.ndarray:
"""
Resize an image to `(size["height"], size["width"])`.
Args:
image (`np.ndarray`):
Image to resize.
size (`Dict[str, int]`):
Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
`PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BICUBIC`.
data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format for the output image. If unset, the channel dimension format of the input
image is used. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format for the input image. If unset, the channel dimension format is inferred
from the input image. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
Returns:
`np.ndarray`: The resized image.
"""
        size = get_size_dict(size)  # normalize the size dictionary into the canonical format
if "height" not in size or "width" not in size:
raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. Got {size.keys()}")
        output_size = (size["height"], size["width"])  # target (height, width) of the output image
        return resize(
            image,
            size=output_size,  # resize to the requested size
            resample=resample,  # resampling filter to use
            data_format=data_format,  # channel dimension format of the output image
            input_data_format=input_data_format,  # channel dimension format of the input image
            **kwargs,
        )
def preprocess(
self,
images: ImageInput,
do_resize: bool = None,
size: Dict[str, int] = None,
resample=None,
do_center_crop: bool = None,
crop_size: Dict[str, int] = None,
do_rescale: bool = None,
rescale_factor: float = None,
do_normalize: bool = None,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
data_format: ChannelDimension = ChannelDimension.FIRST,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
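The `preprocess` body is cut off above, but end to end the processor resizes to 256, center crops to 224, rescales by 1/255 and applies ImageNet normalization. A minimal sketch of calling it (assuming Pillow and torch are installed; the image here is a dummy array):

```python
import numpy as np
from PIL import Image
from transformers import DeiTImageProcessor

processor = DeiTImageProcessor()  # defaults: resize to 256, center crop to 224
image = Image.fromarray(np.zeros((480, 640, 3), dtype=np.uint8))  # dummy RGB image
inputs = processor(images=image, return_tensors="pt")
print(inputs["pixel_values"].shape)  # torch.Size([1, 3, 224, 224])
```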
.\models\deit\modeling_deit.py
""" PyTorch DeiT模型。"""
import collections.abc
import math
from dataclasses import dataclass
from typing import Optional, Set, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN
from ...modeling_outputs import (
BaseModelOutput,
BaseModelOutputWithPooling,
ImageClassifierOutput,
MaskedImageModelingOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import (
ModelOutput,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_deit import DeiTConfig
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "DeiTConfig"
_CHECKPOINT_FOR_DOC = "facebook/deit-base-distilled-patch16-224"
_EXPECTED_OUTPUT_SHAPE = [1, 198, 768]
_IMAGE_CLASS_CHECKPOINT = "facebook/deit-base-distilled-patch16-224"
_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat"
DEIT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"facebook/deit-base-distilled-patch16-224",
]
class DeiTEmbeddings(nn.Module):
"""
构建CLS令牌、蒸馏令牌、位置和补丁嵌入。可选地,还包括掩码令牌。
"""
def __init__(self, config: DeiTConfig, use_mask_token: bool = False) -> None:
super().__init__()
self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
self.distillation_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
self.mask_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) if use_mask_token else None
self.patch_embeddings = DeiTPatchEmbeddings(config)
num_patches = self.patch_embeddings.num_patches
self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 2, config.hidden_size))
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, pixel_values: torch.Tensor, bool_masked_pos: Optional[torch.BoolTensor] = None) -> torch.Tensor:
embeddings = self.patch_embeddings(pixel_values)
batch_size, seq_length, _ = embeddings.size()
if bool_masked_pos is not None:
mask_tokens = self.mask_token.expand(batch_size, seq_length, -1)
mask = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens)
embeddings = embeddings * (1.0 - mask) + mask_tokens * mask
cls_tokens = self.cls_token.expand(batch_size, -1, -1)
distillation_tokens = self.distillation_token.expand(batch_size, -1, -1)
embeddings = torch.cat((cls_tokens, distillation_tokens, embeddings), dim=1)
embeddings = embeddings + self.position_embeddings
embeddings = self.dropout(embeddings)
return embeddings
class DeiTPatchEmbeddings(nn.Module):
"""
This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
`hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
Transformer.
"""
def __init__(self, config):
super().__init__()
image_size, patch_size = config.image_size, config.patch_size
num_channels, hidden_size = config.num_channels, config.hidden_size
image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
self.image_size = image_size
self.patch_size = patch_size
self.num_channels = num_channels
self.num_patches = num_patches
self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)
def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
batch_size, num_channels, height, width = pixel_values.shape
if num_channels != self.num_channels:
raise ValueError(
"Make sure that the channel dimension of the pixel values match with the one set in the configuration."
)
if height != self.image_size[0] or width != self.image_size[1]:
raise ValueError(
f"Input image size ({height}*{width}) doesn't match model ({self.image_size[0]}*{self.image_size[1]})."
)
x = self.projection(pixel_values).flatten(2).transpose(1, 2)
return x
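The sequence length follows directly from the patch grid: with the default 224x224 input and 16x16 patches there are 14 * 14 = 196 patches, and the CLS and distillation tokens bring the length to 198, matching the `_EXPECTED_OUTPUT_SHAPE` of `[1, 198, 768]` quoted earlier. A quick sanity check:

```python
image_size, patch_size = 224, 16
num_patches = (image_size // patch_size) ** 2  # 196 patches
seq_length = num_patches + 2                   # + CLS token + distillation token = 198
print(num_patches, seq_length)                 # 196 198
```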
class DeiTSelfAttention(nn.Module):
def __init__(self, config: DeiTConfig) -> None:
super().__init__()
if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
raise ValueError(
f"The hidden size {config.hidden_size,} is not a multiple of the number of attention "
f"heads {config.num_attention_heads}."
)
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
    def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(new_x_shape)
        return x.permute(0, 2, 1, 3)
def forward(
self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False
) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
mixed_query_layer = self.query(hidden_states)
key_layer = self.transpose_for_scores(self.key(hidden_states))
value_layer = self.transpose_for_scores(self.value(hidden_states))
query_layer = self.transpose_for_scores(mixed_query_layer)
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
attention_scores = attention_scores / math.sqrt(self.attention_head_size)
attention_probs = nn.functional.softmax(attention_scores, dim=-1)
attention_probs = self.dropout(attention_probs)
if head_mask is not None:
attention_probs = attention_probs * head_mask
context_layer = torch.matmul(attention_probs, value_layer)
context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
context_layer = context_layer.view(new_context_layer_shape)
outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
return outputs
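For reference, a standalone shape walk-through of the attention math above with DeiT-base sizes (batch 1, 198 tokens, 12 heads of size 64); this sketch is not part of the file itself:

```python
import torch

batch, seq_len, heads, head_size = 1, 198, 12, 64
q = k = v = torch.randn(batch, heads, seq_len, head_size)  # shape after transpose_for_scores

scores = torch.matmul(q, k.transpose(-1, -2)) / head_size**0.5  # (1, 12, 198, 198)
probs = scores.softmax(dim=-1)
context = torch.matmul(probs, v)                                # (1, 12, 198, 64)
context = context.permute(0, 2, 1, 3).reshape(batch, seq_len, heads * head_size)
print(context.shape)                                            # torch.Size([1, 198, 768])
```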
class DeiTSelfOutput(nn.Module):
"""
The residual connection is defined in DeiTLayer instead of here (as is the case with other models), due to the
layernorm applied before each block.
"""
def __init__(self, config: DeiTConfig) -> None:
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
return hidden_states
class DeiTAttention(nn.Module):
def __init__(self, config: DeiTConfig) -> None:
super().__init__()
self.attention = DeiTSelfAttention(config)
self.output = DeiTSelfOutput(config)
self.pruned_heads = set()
def prune_heads(self, heads: Set[int]) -> None:
if len(heads) == 0:
return
heads, index = find_pruneable_heads_and_indices(
heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads
)
self.attention.query = prune_linear_layer(self.attention.query, index)
self.attention.key = prune_linear_layer(self.attention.key, index)
self.attention.value = prune_linear_layer(self.attention.value, index)
self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads)
self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads
self.pruned_heads = self.pruned_heads.union(heads)
def forward(
self,
hidden_states: torch.Tensor,
head_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
self_outputs = self.attention(hidden_states, head_mask, output_attentions)
attention_output = self.output(self_outputs[0], hidden_states)
outputs = (attention_output,) + self_outputs[1:]
return outputs
class DeiTIntermediate(nn.Module):
def __init__(self, config: DeiTConfig) -> None:
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states
class DeiTOutput(nn.Module):
def __init__(self, config: DeiTConfig) -> None:
super().__init__()
self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = hidden_states + input_tensor
return hidden_states
class DeiTLayer(nn.Module):
"""This corresponds to the Block class in the timm implementation."""
def __init__(self, config: DeiTConfig) -> None:
super().__init__()
self.chunk_size_feed_forward = config.chunk_size_feed_forward
self.seq_len_dim = 1
self.attention = DeiTAttention(config)
self.intermediate = DeiTIntermediate(config)
self.output = DeiTOutput(config)
self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
def forward(
self,
hidden_states: torch.Tensor,
head_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
self_attention_outputs = self.attention(
self.layernorm_before(hidden_states),
head_mask,
output_attentions=output_attentions,
)
attention_output = self_attention_outputs[0]
outputs = self_attention_outputs[1:]
hidden_states = attention_output + hidden_states
layer_output = self.layernorm_after(hidden_states)
layer_output = self.intermediate(layer_output)
layer_output = self.output(layer_output, hidden_states)
outputs = (layer_output,) + outputs
return outputs
class DeiTEncoder(nn.Module):
def __init__(self, config: DeiTConfig) -> None:
super().__init__()
self.config = config
self.layer = nn.ModuleList([DeiTLayer(config) for _ in range(config.num_hidden_layers)])
self.gradient_checkpointing = False
def forward(
self,
hidden_states: torch.Tensor,
head_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
) -> Union[tuple, BaseModelOutput]:
all_hidden_states = () if output_hidden_states else None
all_self_attentions = () if output_attentions else None
for i, layer_module in enumerate(self.layer):
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
layer_head_mask = head_mask[i] if head_mask is not None else None
if self.gradient_checkpointing and self.training:
layer_outputs = self._gradient_checkpointing_func(
layer_module.__call__,
hidden_states,
layer_head_mask,
output_attentions,
)
else:
layer_outputs = layer_module(hidden_states, layer_head_mask, output_attentions)
hidden_states = layer_outputs[0]
if output_attentions:
all_self_attentions = all_self_attentions + (layer_outputs[1],)
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
return BaseModelOutput(
last_hidden_state=hidden_states,
hidden_states=all_hidden_states,
attentions=all_self_attentions,
)
class DeiTPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = DeiTConfig
base_model_prefix = "deit"
main_input_name = "pixel_values"
supports_gradient_checkpointing = True
_no_split_modules = ["DeiTLayer"]
def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None:
"""Initialize the weights"""
if isinstance(module, (nn.Linear, nn.Conv2d)):
module.weight.data = nn.init.trunc_normal_(
module.weight.data.to(torch.float32), mean=0.0, std=self.config.initializer_range
).to(module.weight.dtype)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
DEIT_START_DOCSTRING = r"""
This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
Parameters:
config ([`DeiTConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
DEIT_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
[`DeiTImageProcessor.__call__`] for details.
head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@add_start_docstrings(
"The bare DeiT Model transformer outputting raw hidden-states without any specific head on top.",
DEIT_START_DOCSTRING,
)
class DeiTModel(DeiTPreTrainedModel):
def __init__(self, config: DeiTConfig, add_pooling_layer: bool = True, use_mask_token: bool = False) -> None:
super().__init__(config)
self.config = config
self.embeddings = DeiTEmbeddings(config, use_mask_token=use_mask_token)
self.encoder = DeiTEncoder(config)
self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.pooler = DeiTPooler(config) if add_pooling_layer else None
self.post_init()
def get_input_embeddings(self) -> DeiTPatchEmbeddings:
return self.embeddings.patch_embeddings
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
@add_start_docstrings_to_model_forward(DEIT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPooling,
config_class=_CONFIG_FOR_DOC,
modality="vision",
expected_output=_EXPECTED_OUTPUT_SHAPE,
)
def forward(
self,
pixel_values: Optional[torch.Tensor] = None,
bool_masked_pos: Optional[torch.BoolTensor] = None,
head_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPooling]:
r"""
bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if pixel_values is None:
raise ValueError("You have to specify pixel_values")
head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
expected_dtype = self.embeddings.patch_embeddings.projection.weight.dtype
if pixel_values.dtype != expected_dtype:
pixel_values = pixel_values.to(expected_dtype)
embedding_output = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos)
encoder_outputs = self.encoder(
embedding_output,
head_mask=head_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = encoder_outputs[0]
sequence_output = self.layernorm(sequence_output)
pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
if not return_dict:
head_outputs = (sequence_output, pooled_output) if pooled_output is not None else (sequence_output,)
return head_outputs + encoder_outputs[1:]
return BaseModelOutputWithPooling(
last_hidden_state=sequence_output,
pooler_output=pooled_output,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
class DeiTPooler(nn.Module):
def __init__(self, config: DeiTConfig):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.activation = nn.Tanh()
def forward(self, hidden_states):
first_token_tensor = hidden_states[:, 0]
pooled_output = self.dense(first_token_tensor)
pooled_output = self.activation(pooled_output)
return pooled_output
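Putting `DeiTModel` together, a minimal forward pass with random weights and a random image (a sketch only; real usage would load a checkpoint with `from_pretrained`):

```python
import torch
from transformers import DeiTConfig, DeiTModel

model = DeiTModel(DeiTConfig(), add_pooling_layer=True).eval()
pixel_values = torch.randn(1, 3, 224, 224)
with torch.no_grad():
    outputs = model(pixel_values)
print(outputs.last_hidden_state.shape)  # torch.Size([1, 198, 768])
print(outputs.pooler_output.shape)      # torch.Size([1, 768])
```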
@add_start_docstrings(
"""DeiT Model with a decoder on top for masked image modeling, as proposed in [SimMIM](https://arxiv.org/abs/2111.09886).
<Tip>
Note that we provide a script to pre-train this model on custom data in our [examples
directory](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-pretraining).
</Tip>
""",
DEIT_START_DOCSTRING,
)
class DeiTForMaskedImageModeling(DeiTPreTrainedModel):
def __init__(self, config: DeiTConfig) -> None:
super().__init__(config)
self.deit = DeiTModel(config, add_pooling_layer=False, use_mask_token=True)
self.decoder = nn.Sequential(
nn.Conv2d(
in_channels=config.hidden_size,
out_channels=config.encoder_stride**2 * config.num_channels,
kernel_size=1,
),
nn.PixelShuffle(config.encoder_stride),
)
self.post_init()
@add_start_docstrings_to_model_forward(DEIT_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=MaskedImageModelingOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
pixel_values: Optional[torch.Tensor] = None,
bool_masked_pos: Optional[torch.BoolTensor] = None,
head_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
@add_start_docstrings(
"""
DeiT Model transformer with an image classification head on top (a linear layer on top of the final hidden state of
the [CLS] token) e.g. for ImageNet.
""",
DEIT_START_DOCSTRING,
)
class DeiTForImageClassification(DeiTPreTrainedModel):
def __init__(self, config: DeiTConfig) -> None:
super().__init__(config)
self.num_labels = config.num_labels
self.deit = DeiTModel(config, add_pooling_layer=False)
self.classifier = nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity()
self.post_init()
@add_start_docstrings_to_model_forward(DEIT_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=ImageClassifierOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
pixel_values: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
@dataclass
class DeiTForImageClassificationWithTeacherOutput(ModelOutput):
"""
`DeiTForImageClassificationWithTeacher`的输出类型。
Args:
logits (`torch.FloatTensor`,形状为 `(batch_size, config.num_labels)`):
预测分数,是`cls_logits`和`distillation_logits`的平均值。
cls_logits (`torch.FloatTensor`,形状为 `(batch_size, config.num_labels)`):
分类头部的预测分数(即最终隐藏状态的类令牌上的线性层)。
distillation_logits (`torch.FloatTensor`,形状为 `(batch_size, config.num_labels)`):
蒸馏头部的预测分数(即最终隐藏状态的蒸馏令牌上的线性层)。
hidden_states (`tuple(torch.FloatTensor)`,*可选*,当传递 `output_hidden_states=True` 或 `config.output_hidden_states=True` 时返回):
一个元组,包含 `torch.FloatTensor`(一个用于嵌入的输出 + 每个层的输出),形状为 `(batch_size, sequence_length, hidden_size)`。
模型在每一层输出的隐藏状态,以及初始嵌入的输出。
attentions (`tuple(torch.FloatTensor)`,*可选*,当传递 `output_attentions=True` 或 `config.output_attentions=True` 时返回):
一个元组,包含 `torch.FloatTensor`(每个层一个),形状为 `(batch_size, num_heads, sequence_length, sequence_length)`。
经过注意力 softmax 后的注意力权重,在自注意力头中用于计算加权平均值。
"""
@add_start_docstrings(
"""
    DeiT Model transformer with image classification heads on top (a linear layer on top of the final hidden state of
    the [CLS] token and a linear layer on top of the final hidden state of the distillation token), e.g. for ImageNet.
    .. warning::
            This model supports inference-only. Fine-tuning with distillation (i.e. with a teacher) is not yet
            supported.
""",
DEIT_START_DOCSTRING,
)
class DeiTForImageClassificationWithTeacher(DeiTPreTrainedModel):
"""
    DeiT model with a teacher (distillation) head, used for image classification.
"""
def __init__(self, config: DeiTConfig) -> None:
super().__init__(config)
self.num_labels = config.num_labels
self.deit = DeiTModel(config, add_pooling_layer=False)
self.cls_classifier = (
nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity()
)
self.distillation_classifier = (
nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity()
)
self.post_init()
@add_start_docstrings_to_model_forward(DEIT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=DeiTForImageClassificationWithTeacherOutput,
config_class=_CONFIG_FOR_DOC,
expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
)
def forward(
self,
pixel_values: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[tuple, DeiTForImageClassificationWithTeacherOutput]:
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.deit(
pixel_values,
head_mask=head_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
cls_logits = self.cls_classifier(sequence_output[:, 0, :])
distillation_logits = self.distillation_classifier(sequence_output[:, 1, :])
logits = (cls_logits + distillation_logits) / 2
if not return_dict:
output = (logits, cls_logits, distillation_logits) + outputs[1:]
return output
return DeiTForImageClassificationWithTeacherOutput(
logits=logits,
cls_logits=cls_logits,
distillation_logits=distillation_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
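The head averaging in the forward pass above is easy to verify with a randomly initialized model (a sketch; the documented example uses the facebook/deit-base-distilled-patch16-224 checkpoint instead, and `num_labels=10` here is arbitrary):

```python
import torch
from transformers import DeiTConfig, DeiTForImageClassificationWithTeacher

model = DeiTForImageClassificationWithTeacher(DeiTConfig(num_labels=10)).eval()
pixel_values = torch.randn(1, 3, 224, 224)
with torch.no_grad():
    out = model(pixel_values)
# logits are exactly the mean of the classification head and the distillation head
assert torch.allclose(out.logits, (out.cls_logits + out.distillation_logits) / 2)
```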
.\models\deit\modeling_tf_deit.py
""" TensorFlow DeiT model. """
from __future__ import annotations
import collections.abc
import math
from dataclasses import dataclass
from typing import Optional, Tuple, Union
import tensorflow as tf
from ...activations_tf import get_tf_activation
from ...modeling_tf_outputs import (
TFBaseModelOutput,
TFBaseModelOutputWithPooling,
TFImageClassifierOutput,
TFMaskedImageModelingOutput,
)
from ...modeling_tf_utils import (
TFPreTrainedModel,
TFSequenceClassificationLoss,
get_initializer,
keras,
keras_serializable,
unpack_inputs,
)
from ...tf_utils import shape_list, stable_softmax
from ...utils import (
ModelOutput,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_deit import DeiTConfig
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "DeiTConfig"
_CHECKPOINT_FOR_DOC = "facebook/deit-base-distilled-patch16-224"
_EXPECTED_OUTPUT_SHAPE = [1, 198, 768]
_IMAGE_CLASS_CHECKPOINT = "facebook/deit-base-distilled-patch16-224"
_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat"
TF_DEIT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"facebook/deit-base-distilled-patch16-224",
]
@dataclass
class TFDeiTForImageClassificationWithTeacherOutput(ModelOutput):
"""
[`DeiTForImageClassificationWithTeacher`] 的输出类型。
"""
Args:
logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`):
Prediction scores as the average of the cls_logits and distillation logits.
预测分数,作为 cls_logits 和 distillation_logits 的平均值
cls_logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`):
Prediction scores of the classification head (i.e. the linear layer on top of the final hidden state of the
class token).
分类头部的预测分数,即在类令牌的最终隐藏状态之上的线性层的输出
distillation_logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`):
Prediction scores of the distillation head (i.e. the linear layer on top of the final hidden state of the
distillation token).
蒸馏头部的预测分数,即在蒸馏令牌的最终隐藏状态之上的线性层的输出
hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus
the initial embedding outputs.
隐藏状态的元组,包含每个层的输出(嵌入层输出和每层输出),仅在设置 `output_hidden_states=True` 或 `config.output_hidden_states=True` 时返回
attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
the self-attention heads.
注意力权重的元组,包含每个层的注意力权重,形状为 `(batch_size, num_heads, sequence_length, sequence_length)`,
仅在设置 `output_attentions=True` 或 `config.output_attentions=True` 时返回
"""
    # Output fields, all defaulting to None
logits: tf.Tensor = None
cls_logits: tf.Tensor = None
distillation_logits: tf.Tensor = None
hidden_states: Tuple[tf.Tensor] | None = None
attentions: Tuple[tf.Tensor] | None = None
"""
构建 CLS token、蒸馏 token、位置和补丁嵌入。可选择是否包含 mask token。
"""
def __init__(self, config: DeiTConfig, use_mask_token: bool = False, **kwargs) -> None:
super().__init__(**kwargs)
self.config = config
self.use_mask_token = use_mask_token
self.patch_embeddings = TFDeiTPatchEmbeddings(config=config, name="patch_embeddings")
self.dropout = keras.layers.Dropout(config.hidden_dropout_prob, name="dropout")
def build(self, input_shape=None):
        # CLS token weight of shape (1, 1, hidden_size), zero-initialized
        self.cls_token = self.add_weight(
            shape=(1, 1, self.config.hidden_size),
            initializer=keras.initializers.zeros(),
            trainable=True,
            name="cls_token",
        )
        # Distillation token weight of shape (1, 1, hidden_size), zero-initialized
        self.distillation_token = self.add_weight(
            shape=(1, 1, self.config.hidden_size),
            initializer=keras.initializers.zeros(),
            trainable=True,
            name="distillation_token",
        )
        # Optional mask token weight of shape (1, 1, hidden_size), zero-initialized
        self.mask_token = None
        if self.use_mask_token:
            self.mask_token = self.add_weight(
                shape=(1, 1, self.config.hidden_size),
                initializer=keras.initializers.zeros(),
                trainable=True,
                name="mask_token",
            )
        # Number of patches, needed to size the position embeddings
        num_patches = self.patch_embeddings.num_patches
        # Position embedding weight of shape (1, num_patches + 2, hidden_size), zero-initialized
        self.position_embeddings = self.add_weight(
            shape=(1, num_patches + 2, self.config.hidden_size),
            initializer=keras.initializers.zeros(),
            trainable=True,
            name="position_embeddings",
        )
        # Nothing else to do if the layer has already been built
        if self.built:
            return
        self.built = True
        # Build the patch embedding sub-layer if it exists
        if getattr(self, "patch_embeddings", None) is not None:
            with tf.name_scope(self.patch_embeddings.name):
                self.patch_embeddings.build(None)
        # Build the dropout sub-layer if it exists
        if getattr(self, "dropout", None) is not None:
            with tf.name_scope(self.dropout.name):
                self.dropout.build(None)
def call(
self, pixel_values: tf.Tensor, bool_masked_pos: tf.Tensor | None = None, training: bool = False
    ) -> tf.Tensor:
        # Compute the patch embeddings from the pixel values
        embeddings = self.patch_embeddings(pixel_values)
        # Batch size, sequence length and feature dimension of the embeddings
        batch_size, seq_length, _ = shape_list(embeddings)
        if bool_masked_pos is not None:
            # Replace the masked visual tokens with the mask token
            mask_tokens = tf.tile(self.mask_token, [batch_size, seq_length, 1])
            # Build a boolean mask and broadcast it over the feature dimension
            mask = tf.expand_dims(bool_masked_pos, axis=-1)
            mask = tf.cast(mask, dtype=mask_tokens.dtype)
            # Keep the unmasked embeddings, substitute mask_tokens at the masked positions
            embeddings = embeddings * (1.0 - mask) + mask_tokens * mask
        # Repeat the CLS token for every example in the batch
        cls_tokens = tf.repeat(self.cls_token, repeats=batch_size, axis=0)
        # Repeat the distillation token for every example in the batch
        distillation_tokens = tf.repeat(self.distillation_token, repeats=batch_size, axis=0)
        # Prepend the CLS and distillation tokens to the patch embeddings
        embeddings = tf.concat((cls_tokens, distillation_tokens, embeddings), axis=1)
        # Add the position embeddings
        embeddings = embeddings + self.position_embeddings
        # Apply dropout (only active during training)
        embeddings = self.dropout(embeddings, training=training)
        # Return the final embeddings
        return embeddings
# Custom layer TFDeiTPatchEmbeddings: turns `pixel_values` into the initial hidden states (patch embeddings) consumed by the Transformer
class TFDeiTPatchEmbeddings(keras.layers.Layer):
"""
This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
`hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
Transformer.
"""
def __init__(self, config: DeiTConfig, **kwargs) -> None:
super().__init__(**kwargs)
        # Read the image size and patch size from the configuration
        image_size, patch_size = config.image_size, config.patch_size
        num_channels, hidden_size = config.num_channels, config.hidden_size
        # Turn scalar image/patch sizes into (height, width) tuples
        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
        # Number of patches in the grid
        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
        # Store the attributes on the layer
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.num_patches = num_patches
        # Convolution that projects the pixel values into the hidden size expected by the Transformer
self.projection = keras.layers.Conv2D(
hidden_size, kernel_size=patch_size, strides=patch_size, name="projection"
)
    # Forward pass: turn pixel values into patch embeddings
    def call(self, pixel_values: tf.Tensor) -> tf.Tensor:
        # Shape information of the input tensor
        batch_size, height, width, num_channels = shape_list(pixel_values)
        # In eager mode, check that the channel dimension matches the configuration
        if tf.executing_eagerly() and num_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
            )
        # In eager mode, check that the spatial size matches the configured image size
        if tf.executing_eagerly() and (height != self.image_size[0] or width != self.image_size[1]):
            raise ValueError(
                f"Input image size ({height}*{width}) doesn't match model ({self.image_size[0]}*{self.image_size[1]})."
            )
        # Project the pixel values with the convolution
        x = self.projection(pixel_values)
        # Shape of the projected tensor
        batch_size, height, width, num_channels = shape_list(x)
        # Flatten the spatial grid to (batch_size, seq_length, hidden_size), with seq_length = height * width
        x = tf.reshape(x, (batch_size, height * width, num_channels))
        # Return the patch embeddings
        return x
    # Build the projection layer when the model is first built
    def build(self, input_shape=None):
        # Nothing to do if the layer has already been built
        if self.built:
            return
        # Mark the layer as built
        self.built = True
        # Build the projection layer under its own name scope
        if getattr(self, "projection", None) is not None:
            with tf.name_scope(self.projection.name):
                self.projection.build([None, None, None, self.num_channels])
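Note that this TF layer works on channels_last tensors of shape (batch, height, width, channels), so the Conv2D projection yields a (batch, 14, 14, hidden_size) grid that is then flattened to (batch, 196, hidden_size). A standalone sketch of that reshape (assuming TensorFlow is installed):

```python
import tensorflow as tf

pixel_values = tf.zeros((1, 224, 224, 3))  # channels_last dummy image
projection = tf.keras.layers.Conv2D(768, kernel_size=16, strides=16)
patches = projection(pixel_values)                # (1, 14, 14, 768)
patches = tf.reshape(patches, (1, 14 * 14, 768))  # (1, 196, 768)
print(patches.shape)
```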
# Self-attention layer copied from transformers.models.vit.modeling_tf_vit.TFViTSelfAttention and adapted for DeiT
class TFDeiTSelfAttention(keras.layers.Layer):
def __init__(self, config: DeiTConfig, **kwargs):
super().__init__(**kwargs)
        # The hidden size must be divisible by the number of attention heads
        if config.hidden_size % config.num_attention_heads != 0:
            raise ValueError(
                f"The hidden size ({config.hidden_size}) is not a multiple of the number "
                f"of attention heads ({config.num_attention_heads})"
            )
        # Number of attention heads and size of each head
        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size
        self.sqrt_att_head_size = math.sqrt(self.attention_head_size)
        # Dense layers producing the query, key and value projections
        self.query = keras.layers.Dense(
            units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query"
        )
        self.key = keras.layers.Dense(
            units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key"
        )
        self.value = keras.layers.Dense(
            units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value"
        )
        # Dropout applied to the attention probabilities
        self.dropout = keras.layers.Dropout(rate=config.attention_probs_dropout_prob)
self.config = config
def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor:
        # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size]
        tensor = tf.reshape(tensor=tensor, shape=(batch_size, -1, self.num_attention_heads, self.attention_head_size))
        # Transpose to [batch_size, num_attention_heads, seq_length, attention_head_size]
        return tf.transpose(tensor, perm=[0, 2, 1, 3])
def call(
self,
hidden_states: tf.Tensor,
head_mask: tf.Tensor,
output_attentions: bool,
training: bool = False,
) -> Tuple[tf.Tensor]:
        # Batch size of the hidden states
        batch_size = shape_list(hidden_states)[0]
        # Project the hidden states into queries
        mixed_query_layer = self.query(inputs=hidden_states)
        # Project the hidden states into keys
        mixed_key_layer = self.key(inputs=hidden_states)
        # Project the hidden states into values
        mixed_value_layer = self.value(inputs=hidden_states)
        # Reshape and transpose the projections into per-head tensors
        query_layer = self.transpose_for_scores(mixed_query_layer, batch_size)
        key_layer = self.transpose_for_scores(mixed_key_layer, batch_size)
        value_layer = self.transpose_for_scores(mixed_value_layer, batch_size)
        # Raw attention scores as the dot product of queries and keys
        # Result shape: (batch_size, num_heads, seq_len_q, seq_len_k)
        attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
        # Scale the attention scores by sqrt(head size)
        dk = tf.cast(self.sqrt_att_head_size, dtype=attention_scores.dtype)
        attention_scores = tf.divide(attention_scores, dk)
        # Normalize the scores into probabilities
        attention_probs = stable_softmax(logits=attention_scores, axis=-1)
        # Apply dropout to the attention probabilities
        attention_probs = self.dropout(inputs=attention_probs, training=training)
        # Apply the head mask if one was given
        if head_mask is not None:
            attention_probs = tf.multiply(attention_probs, head_mask)
        # Weighted sum of the value vectors
        attention_output = tf.matmul(attention_probs, value_layer)
        # Reorder the dimensions back to (batch_size, seq_len_q, num_heads, head_size)
        attention_output = tf.transpose(attention_output, perm=[0, 2, 1, 3])
        # Merge the heads into (batch_size, seq_len_q, all_head_size)
        attention_output = tf.reshape(tensor=attention_output, shape=(batch_size, -1, self.all_head_size))
        # Optionally return the attention probabilities alongside the output
        outputs = (attention_output, attention_probs) if output_attentions else (attention_output,)
return outputs
def build(self, input_shape=None):
        # Return immediately if the layer has already been built
        if self.built:
            return
        # Mark the layer as built
        self.built = True
        # Build the query projection if it exists
        if getattr(self, "query", None) is not None:
            with tf.name_scope(self.query.name):
                self.query.build([None, None, self.config.hidden_size])
        # Build the key projection if it exists
        if getattr(self, "key", None) is not None:
            with tf.name_scope(self.key.name):
                self.key.build([None, None, self.config.hidden_size])
        # Build the value projection if it exists
        if getattr(self, "value", None) is not None:
            with tf.name_scope(self.value.name):
self.value.build([None, None, self.config.hidden_size])
# Copied from transformers.models.vit.modeling_tf_vit.TFViTSelfOutput with ViT->DeiT
class TFDeiTSelfOutput(keras.layers.Layer):
"""
The residual connection is defined in TFDeiTLayer instead of here (as is the case with other models), due to the
layernorm applied before each block.
"""
def __init__(self, config: DeiTConfig, **kwargs):
super().__init__(**kwargs)
        # Dense layer that transforms the hidden states
        self.dense = keras.layers.Dense(
            units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
        )
        # Dropout layer to regularize the transformed hidden states
        self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
        self.config = config
    def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
        # Transform the hidden states with the dense layer
        hidden_states = self.dense(inputs=hidden_states)
        # Apply dropout (only active during training)
        hidden_states = self.dropout(inputs=hidden_states, training=training)
return hidden_states
def build(self, input_shape=None):
        # Return immediately if the layer has already been built
        if self.built:
            return
        self.built = True
        if getattr(self, "dense", None) is not None:
            with tf.name_scope(self.dense.name):
                # Build the dense layer with the hidden size as the last input dimension
                self.dense.build([None, None, self.config.hidden_size])
# Copied from transformers.models.vit.modeling_tf_vit.TFViTAttention with ViT->DeiT
class TFDeiTAttention(keras.layers.Layer):
def __init__(self, config: DeiTConfig, **kwargs):
super().__init__(**kwargs)
        # Self-attention sub-layer
        self.self_attention = TFDeiTSelfAttention(config, name="attention")
        # Output sub-layer that processes the self-attention output
        self.dense_output = TFDeiTSelfOutput(config, name="output")
def prune_heads(self, heads):
        # Head pruning is not implemented for the TF model
raise NotImplementedError
def call(
self,
input_tensor: tf.Tensor,
head_mask: tf.Tensor,
output_attentions: bool,
training: bool = False,
) -> Tuple[tf.Tensor]:
        # Run self-attention over the input tensor
        self_outputs = self.self_attention(
            hidden_states=input_tensor, head_mask=head_mask, output_attentions=output_attentions, training=training
        )
        # Feed the self-attention output through the output sub-layer
        attention_output = self.dense_output(
            hidden_states=self_outputs[0], input_tensor=input_tensor, training=training
        )
        # Append the attention probabilities when they were requested
        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
return outputs
def build(self, input_shape=None):
        # Return immediately if the layer has already been built
        if self.built:
            return
        self.built = True
        if getattr(self, "self_attention", None) is not None:
            with tf.name_scope(self.self_attention.name):
                # Build the self-attention sub-layer; no concrete input shape is needed
                self.self_attention.build(None)
        if getattr(self, "dense_output", None) is not None:
            with tf.name_scope(self.dense_output.name):
                # Build the output sub-layer; no concrete input shape is needed
                self.dense_output.build(None)
# Copied from transformers.models.vit.modeling_tf_vit.TFViTIntermediate with ViT->DeiT
class TFDeiTIntermediate(keras.layers.Layer):
# 初始化方法,用于初始化一个新的对象实例
def __init__(self, config: DeiTConfig, **kwargs):
# 调用父类的初始化方法
super().__init__(**kwargs)
# 创建一个全连接层,设置单元数为config中指定的中间大小,
# 内核初始化方式使用config中的初始化范围,命名为"dense"
self.dense = keras.layers.Dense(
units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
# 根据config中的隐藏激活函数,获取对应的激活函数对象或者名称
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = get_tf_activation(config.hidden_act)
else:
self.intermediate_act_fn = config.hidden_act
# 将config保存在对象中
self.config = config
# 调用方法,用于执行前向传播
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
# 使用全连接层处理输入的隐藏状态数据
hidden_states = self.dense(inputs=hidden_states)
# 使用中间激活函数处理全连接层的输出隐藏状态数据
hidden_states = self.intermediate_act_fn(hidden_states)
# 返回处理后的隐藏状态数据
return hidden_states
# 构建方法,用于构建模型层次结构
def build(self, input_shape=None):
# 如果已经构建过,直接返回
if self.built:
return
# 标记为已经构建
self.built = True
# 如果存在全连接层dense,则使用其名字作为作用域,构建该层
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
# Copied from transformers.models.vit.modeling_tf_vit.TFViTOutput with ViT->DeiT: the Transformer output block, using the DeiT configuration.
class TFDeiTOutput(keras.layers.Layer):
def __init__(self, config: DeiTConfig, **kwargs):
super().__init__(**kwargs)
# 创建一个全连接层,输出维度为config.hidden_size,权重初始化使用config中定义的初始化范围。
self.dense = keras.layers.Dense(
units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
# 创建一个Dropout层,使用config中定义的dropout概率。
self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
# 通过全连接层处理输入hidden_states,输出经过线性变换后的结果。
hidden_states = self.dense(inputs=hidden_states)
# 对处理后的结果进行Dropout操作,以防止过拟合。
hidden_states = self.dropout(inputs=hidden_states, training=training)
# 将Dropout后的结果与输入tensor相加,实现残差连接。
hidden_states = hidden_states + input_tensor
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
# Build the dense layer under the name scope of self.dense; its input feature dimension is self.config.intermediate_size
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.intermediate_size])
# TFDeiTLayer is a single Transformer block; it corresponds to the Block class in the timm implementation.
class TFDeiTLayer(keras.layers.Layer):
"""This corresponds to the Block class in the timm implementation."""
def __init__(self, config: DeiTConfig, **kwargs):
super().__init__(**kwargs)
# 创建DeiTAttention层,使用给定的DeiT配置,并命名为"attention"。
self.attention = TFDeiTAttention(config, name="attention")
# 创建DeiTIntermediate层,使用给定的DeiT配置,并命名为"intermediate"。
self.intermediate = TFDeiTIntermediate(config, name="intermediate")
# 创建TFDeiTOutput层,使用给定的DeiT配置,并命名为"output"。
self.deit_output = TFDeiTOutput(config, name="output")
# 创建LayerNormalization层,在训练过程中使用给定的epsilon参数进行归一化,命名为"layernorm_before"和"layernorm_after"。
self.layernorm_before = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm_before")
self.layernorm_after = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm_after")
self.config = config
def call(
self,
hidden_states: tf.Tensor,
head_mask: tf.Tensor,
output_attentions: bool,
training: bool = False,
) -> Tuple[tf.Tensor]:
# 对输入hidden_states应用LayerNormalization,然后传递给self.attention,获取attention_outputs。
attention_outputs = self.attention(
input_tensor=self.layernorm_before(inputs=hidden_states, training=training),
head_mask=head_mask,
output_attentions=output_attentions,
training=training,
)
attention_output = attention_outputs[0]
# 第一个残差连接,将attention_output加到hidden_states上。
hidden_states = attention_output + hidden_states
# 再次应用LayerNormalization,得到layer_output。
layer_output = self.layernorm_after(inputs=hidden_states, training=training)
# 将layer_output传递给self.intermediate层进行处理,得到intermediate_output。
intermediate_output = self.intermediate(hidden_states=layer_output, training=training)
# 第二个残差连接,将intermediate_output和hidden_states传递给self.deit_output处理,得到layer_output。
layer_output = self.deit_output(
hidden_states=intermediate_output, input_tensor=hidden_states, training=training
)
# 将输出打包成元组outputs,如果需要输出attention信息,则将其添加到outputs中。
outputs = (layer_output,) + attention_outputs[1:] # add attentions if we output them
return outputs
# 定义一个方法用于建立模型,可以接受输入形状作为参数
def build(self, input_shape=None):
# 如果模型已经建立过,则直接返回,不进行重复建立
if self.built:
return
# 设置标记,表明模型已经建立
self.built = True
# 如果存在 self.attention 属性,则构建 attention 层
if getattr(self, "attention", None) is not None:
# 使用 attention 层的名称作为命名空间
with tf.name_scope(self.attention.name):
# 调用 attention 层的 build 方法,传入 None 作为输入形状
self.attention.build(None)
# 如果存在 self.intermediate 属性,则构建 intermediate 层
if getattr(self, "intermediate", None) is not None:
# 使用 intermediate 层的名称作为命名空间
with tf.name_scope(self.intermediate.name):
# 调用 intermediate 层的 build 方法,传入 None 作为输入形状
self.intermediate.build(None)
# 如果存在 self.deit_output 属性,则构建 deit_output 层
if getattr(self, "deit_output", None) is not None:
# 使用 deit_output 层的名称作为命名空间
with tf.name_scope(self.deit_output.name):
# 调用 deit_output 层的 build 方法,传入 None 作为输入形状
self.deit_output.build(None)
# 如果存在 self.layernorm_before 属性,则构建 layernorm_before 层
if getattr(self, "layernorm_before", None) is not None:
# 使用 layernorm_before 层的名称作为命名空间
with tf.name_scope(self.layernorm_before.name):
# 调用 layernorm_before 层的 build 方法,传入一个形状为 [None, None, self.config.hidden_size] 的列表作为输入形状
self.layernorm_before.build([None, None, self.config.hidden_size])
# 如果存在 self.layernorm_after 属性,则构建 layernorm_after 层
if getattr(self, "layernorm_after", None) is not None:
# 使用 layernorm_after 层的名称作为命名空间
with tf.name_scope(self.layernorm_after.name):
# 调用 layernorm_after 层的 build 方法,传入一个形状为 [None, None, self.config.hidden_size] 的列表作为输入形状
self.layernorm_after.build([None, None, self.config.hidden_size])
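# A minimal sketch (not the implementation above) of the pre-norm ordering used by TFDeiTLayer:
# each sub-block reads a LayerNorm'ed copy of the stream, while the residual additions use the
# un-normalized stream.
def _pre_norm_block_sketch(x, attention, mlp, norm_before, norm_after):
    x = x + attention(norm_before(x))  # first residual: attention over the normalized input
    x = x + mlp(norm_after(x))         # second residual: MLP over the normalized input
    return x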
# Copied from transformers.models.vit.modeling_tf_vit.TFViTEncoder with ViT->DeiT
class TFDeiTEncoder(keras.layers.Layer):
def __init__(self, config: DeiTConfig, **kwargs):
super().__init__(**kwargs)
# 初始化层列表,每层命名为"layer_._{i}"
self.layer = [TFDeiTLayer(config, name=f"layer_._{i}") for i in range(config.num_hidden_layers)]
def call(
self,
hidden_states: tf.Tensor,
head_mask: tf.Tensor,
output_attentions: bool,
output_hidden_states: bool,
return_dict: bool,
training: bool = False,
) -> Union[TFBaseModelOutput, Tuple[tf.Tensor]]:
# 如果输出隐藏状态,初始化空元组以存储所有隐藏状态
all_hidden_states = () if output_hidden_states else None
# 如果输出注意力权重,初始化空元组以存储所有注意力权重
all_attentions = () if output_attentions else None
# 遍历每个编码层
for i, layer_module in enumerate(self.layer):
# 如果输出隐藏状态,将当前隐藏状态加入到列表中
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
# 调用编码层的计算
layer_outputs = layer_module(
hidden_states=hidden_states,
head_mask=head_mask[i],
output_attentions=output_attentions,
training=training,
)
# 更新隐藏状态为当前层的输出
hidden_states = layer_outputs[0]
# 如果输出注意力权重,将当前层的注意力权重加入到列表中
if output_attentions:
all_attentions = all_attentions + (layer_outputs[1],)
# 添加最后一层的隐藏状态到所有隐藏状态列表中
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
# 如果不返回字典形式的结果,根据是否为空过滤掉None值并返回结果元组
if not return_dict:
return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None)
# 返回 TFBaseModelOutput 对象,包含最后的隐藏状态、所有隐藏状态和所有注意力权重
return TFBaseModelOutput(
last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions
)
def build(self, input_shape=None):
# 如果已经构建,直接返回
if self.built:
return
# 标记为已构建
self.built = True
# 如果存在层列表,为每一层设置命名作用域并构建层
if getattr(self, "layer", None) is not None:
for layer in self.layer:
with tf.name_scope(layer.name):
layer.build(None)
@keras_serializable
class TFDeiTMainLayer(keras.layers.Layer):
config_class = DeiTConfig
def __init__(
self, config: DeiTConfig, add_pooling_layer: bool = True, use_mask_token: bool = False, **kwargs
) -> None:
super().__init__(**kwargs)
# Store the configuration
self.config = config
# Embedding layer and Transformer encoder
self.embeddings = TFDeiTEmbeddings(config, use_mask_token=use_mask_token, name="embeddings")
self.encoder = TFDeiTEncoder(config, name="encoder")
# Final layer normalization
self.layernorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm")
# Optional pooler on top of the encoder output
self.pooler = TFDeiTPooler(config, name="pooler") if add_pooling_layer else None
def get_input_embeddings(self) -> TFDeiTPatchEmbeddings:
# 返回嵌入层的补丁嵌入
return self.embeddings.patch_embeddings
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
# Head pruning is not implemented for this model
raise NotImplementedError
def get_head_mask(self, head_mask):
# Raise if an explicit head mask is passed, since head masking is not implemented
if head_mask is not None:
raise NotImplementedError
else:
# Otherwise build a list with one None entry per hidden layer
head_mask = [None] * self.config.num_hidden_layers
# Return the (possibly newly created) head mask
return head_mask
@unpack_inputs
def call(
self,
pixel_values: tf.Tensor | None = None,
bool_masked_pos: tf.Tensor | None = None,
head_mask: tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: bool = False,
) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor, ...]]:
# Use the config default when output_attentions is not given
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
# Use the config default when output_hidden_states is not given
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
# Use the config default when return_dict is not given
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# pixel_values must be provided
if pixel_values is None:
raise ValueError("You have to specify pixel_values")
# Transpose pixel_values from NCHW to NHWC, the format expected by Keras layers
pixel_values = tf.transpose(pixel_values, (0, 2, 3, 1))
# Prepare the head mask (only the all-None case is supported)
head_mask = self.get_head_mask(head_mask)
# Compute the patch (and optional mask token) embeddings
embedding_output = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos, training=training)
# Run the Transformer encoder
encoder_outputs = self.encoder(
embedding_output,
head_mask=head_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
# Sequence output of the encoder
sequence_output = encoder_outputs[0]
# Apply the final layer normalization
sequence_output = self.layernorm(sequence_output, training=training)
# Pool the sequence output if a pooler is present
pooled_output = self.pooler(sequence_output, training=training) if self.pooler is not None else None
# Without return_dict, return a tuple of the head outputs plus any extra encoder outputs
if not return_dict:
head_outputs = (sequence_output, pooled_output) if pooled_output is not None else (sequence_output,)
return head_outputs + encoder_outputs[1:]
# Otherwise return a TFBaseModelOutputWithPooling
return TFBaseModelOutputWithPooling(
last_hidden_state=sequence_output,
pooler_output=pooled_output,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
def build(self, input_shape=None):
# Return immediately if the model has already been built
if self.built:
return
# Mark the model as built
self.built = True
# Build the embeddings if present
if getattr(self, "embeddings", None) is not None:
with tf.name_scope(self.embeddings.name):
self.embeddings.build(None)
# Build the encoder if present
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
# Build the layer normalization if present, with input shape [None, None, self.config.hidden_size]
if getattr(self, "layernorm", None) is not None:
with tf.name_scope(self.layernorm.name):
self.layernorm.build([None, None, self.config.hidden_size])
# Build the pooler if present
if getattr(self, "pooler", None) is not None:
with tf.name_scope(self.pooler.name):
self.pooler.build(None)
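# Small self-contained check of the NCHW -> NHWC transpose performed in TFDeiTMainLayer.call above
# (the dummy tensor is purely illustrative):
import tensorflow as tf

dummy_pixel_values = tf.zeros((2, 3, 224, 224))              # (batch, channels, height, width)
dummy_nhwc = tf.transpose(dummy_pixel_values, (0, 2, 3, 1))  # (batch, height, width, channels)
print(dummy_nhwc.shape)                                      # (2, 224, 224, 3)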
# Copied from transformers.models.vit.modeling_tf_vit.TFViTPreTrainedModel with ViT->DeiT all-casing
class TFDeiTPreTrainedModel(TFPreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
# Configuration class for this model
config_class = DeiTConfig
# Prefix of the base model's weights
base_model_prefix = "deit"
# Name of the main model input
main_input_name = "pixel_values"
# Docstring describing the bare model (DEIT_START_DOCSTRING)
DEIT_START_DOCSTRING = r"""
This model is a TensorFlow
[keras.layers.Layer](https://www.tensorflow.org/api_docs/python/tf/keras/layers/Layer). Use it as a regular
TensorFlow Module and refer to the TensorFlow documentation for all matter related to general usage and behavior.
Parameters:
config ([`DeiTConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
# Docstring describing the model inputs (DEIT_INPUTS_DOCSTRING)
DEIT_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`tf.Tensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
[`DeiTImageProcessor.__call__`] for details.
head_mask (`tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@add_start_docstrings(
"The bare DeiT Model transformer outputting raw hidden-states without any specific head on top.",
DEIT_START_DOCSTRING,
)
# The bare DeiT model, wrapping TFDeiTMainLayer
class TFDeiTModel(TFDeiTPreTrainedModel):
def __init__(
self, config: DeiTConfig, add_pooling_layer: bool = True, use_mask_token: bool = False, **kwargs
) -> None:
# Initialize the parent class
super().__init__(config, **kwargs)
# The TFDeiTMainLayer instance holds the actual model
self.deit = TFDeiTMainLayer(
config, add_pooling_layer=add_pooling_layer, use_mask_token=use_mask_token, name="deit"
)
# Decorators: unpack inputs and attach the forward and code-sample docstrings
@unpack_inputs
@add_start_docstrings_to_model_forward(DEIT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFBaseModelOutputWithPooling,
config_class=_CONFIG_FOR_DOC,
modality="vision",
expected_output=_EXPECTED_OUTPUT_SHAPE,
)
# Forward method of the model
def call(
self,
pixel_values: tf.Tensor | None = None,  # input pixel values, may be None
bool_masked_pos: tf.Tensor | None = None,  # boolean mask positions for masked image modeling, may be None
head_mask: tf.Tensor | None = None,  # head mask, may be None
output_attentions: Optional[bool] = None,  # whether to return attention weights
output_hidden_states: Optional[bool] = None,  # whether to return hidden states
return_dict: Optional[bool] = None,  # whether to return a ModelOutput instead of a plain tuple
training: bool = False,  # whether the model is in training mode
) -> Union[Tuple, TFBaseModelOutputWithPooling]:
# Delegate everything to the main layer and return its output
outputs = self.deit(
pixel_values=pixel_values,
bool_masked_pos=bool_masked_pos,
head_mask=head_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
return outputs
# Build the model's sub-layers
def build(self, input_shape=None):
# Return immediately if the model has already been built
if self.built:
return
# Mark the model as built
self.built = True
# Build the main layer under its own name scope
if getattr(self, "deit", None) is not None:
with tf.name_scope(self.deit.name):
self.deit.build(None)
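# Minimal usage sketch for TFDeiTModel, mirroring the code-sample docstring attached above
# (the checkpoint name and the image URL are the usual documentation examples):
from transformers import AutoImageProcessor, TFDeiTModel
from PIL import Image
import requests

image = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
processor = AutoImageProcessor.from_pretrained("facebook/deit-base-distilled-patch16-224")
model = TFDeiTModel.from_pretrained("facebook/deit-base-distilled-patch16-224")
outputs = model(**processor(images=image, return_tensors="tf"))
print(outputs.last_hidden_state.shape)  # (1, 198, 768): 196 patches + [CLS] + distillation token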
# Copied from transformers.models.vit.modeling_tf_vit.TFViTPooler with ViT->DeiT
class TFDeiTPooler(keras.layers.Layer):
def __init__(self, config: DeiTConfig, **kwargs):
super().__init__(**kwargs)
# 创建一个全连接层,用于池化模型,输出维度为 config.hidden_size
self.dense = keras.layers.Dense(
units=config.hidden_size,
kernel_initializer=get_initializer(config.initializer_range),
activation="tanh",
name="dense",
)
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
# 通过简单地选择第一个 token 的隐藏状态来“池化”模型
first_token_tensor = hidden_states[:, 0]
pooled_output = self.dense(inputs=first_token_tensor)
return pooled_output
def build(self, input_shape=None):
if self.built:
return
self.built = True
# 如果已经构建,直接返回;否则,按照指定的输入形状构建 dense 层
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
class TFDeitPixelShuffle(keras.layers.Layer):
"""TF 实现的 torch.nn.PixelShuffle 的层"""
def __init__(self, upscale_factor: int, **kwargs) -> None:
super().__init__(**kwargs)
if not isinstance(upscale_factor, int) or upscale_factor < 2:
raise ValueError(f"upscale_factor 必须是大于等于 2 的整数,当前值为 {upscale_factor}")
self.upscale_factor = upscale_factor
def call(self, x: tf.Tensor) -> tf.Tensor:
hidden_states = x
batch_size, _, _, num_input_channels = shape_list(hidden_states)
block_size_squared = self.upscale_factor**2
output_depth = int(num_input_channels / block_size_squared)
# 计算输出通道数时,PyTorch 的 PixelShuffle 和 TF 的 depth_to_space 在输出上存在差异,
# 因为通道的选择顺序会导致组合顺序不同,详情参考:
# https://stackoverflow.com/questions/68272502/tf-depth-to-space-not-same-as-torchs-pixelshuffle-when-output-channels-1
permutation = tf.constant(
[[i + j * block_size_squared for i in range(block_size_squared) for j in range(output_depth)]]
)
# 使用 permutation 重新组合隐藏状态张量的通道
hidden_states = tf.gather(params=hidden_states, indices=tf.tile(permutation, [batch_size, 1]), batch_dims=-1)
# 使用 TF 的 depth_to_space 函数进行像素洗牌操作
hidden_states = tf.nn.depth_to_space(hidden_states, block_size=self.upscale_factor, data_format="NHWC")
return hidden_states
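# Shape sanity check for the depth_to_space call above: an NHWC tensor of shape
# (N, H, W, C * r**2) becomes (N, H * r, W * r, C); the gather/permutation beforehand only
# reorders channels so the result matches torch.nn.PixelShuffle's channel ordering.
# Illustrative numbers, assuming upscale_factor r = 4:
x_example = tf.random.normal((1, 14, 14, 768))                                 # 768 = 48 * 4**2
y_example = tf.nn.depth_to_space(x_example, block_size=4, data_format="NHWC")
print(y_example.shape)                                                         # (1, 56, 56, 48)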
class TFDeitDecoder(keras.layers.Layer):
def __init__(self, config: DeiTConfig, **kwargs) -> None:
super().__init__(**kwargs)
# 1x1 convolution producing encoder_stride**2 * num_channels output channels
self.conv2d = keras.layers.Conv2D(
filters=config.encoder_stride**2 * config.num_channels, kernel_size=1, name="0"
)
# Pixel shuffle layer used by the decoder to upsample back to the image resolution
self.pixel_shuffle = TFDeitPixelShuffle(config.encoder_stride, name="1")
self.config = config
# 定义一个方法用于调用模型,接受一个张量作为输入,并可选择是否进行训练
def call(self, inputs: tf.Tensor, training: bool = False) -> tf.Tensor:
# 将输入张量赋给隐藏状态变量
hidden_states = inputs
# 对隐藏状态进行二维卷积操作
hidden_states = self.conv2d(hidden_states)
# 对卷积后的结果进行像素重排操作
hidden_states = self.pixel_shuffle(hidden_states)
# 返回处理后的隐藏状态
return hidden_states
# 构建模型的方法
def build(self, input_shape=None):
# 如果模型已经构建过,则直接返回
if self.built:
return
# 设置模型已构建标志为True
self.built = True
# 如果模型具有conv2d属性,则构建conv2d层
if getattr(self, "conv2d", None) is not None:
# 在TensorFlow中使用name_scope管理命名空间
with tf.name_scope(self.conv2d.name):
# 构建conv2d层,指定输入的形状为[None, None, None, self.config.hidden_size]
self.conv2d.build([None, None, None, self.config.hidden_size])
# 如果模型具有pixel_shuffle属性,则构建pixel_shuffle层
if getattr(self, "pixel_shuffle", None) is not None:
# 在TensorFlow中使用name_scope管理命名空间
with tf.name_scope(self.pixel_shuffle.name):
# 构建pixel_shuffle层,不指定具体的输入形状
self.pixel_shuffle.build(None)
@add_start_docstrings(
"""
DeiT Model with a decoder on top for masked image modeling, as proposed in [SimMIM](https://arxiv.org/abs/2111.09886).
""",
DEIT_START_DOCSTRING,
)
class TFDeiTForMaskedImageModeling(TFDeiTPreTrainedModel):
def __init__(self, config: DeiTConfig) -> None:
super().__init__(config)
# DeiT main layer without pooling and with the mask token enabled, named "deit"
self.deit = TFDeiTMainLayer(config, add_pooling_layer=False, use_mask_token=True, name="deit")
# Decoder that reconstructs pixels from the encoder output, named "decoder"
self.decoder = TFDeitDecoder(config, name="decoder")
@unpack_inputs
@add_start_docstrings_to_model_forward(DEIT_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=TFMaskedImageModelingOutput, config_class=_CONFIG_FOR_DOC)
def call(
self,
pixel_values: tf.Tensor | None = None,
bool_masked_pos: tf.Tensor | None = None,
head_mask: tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: bool = False,
):
"""
DeiT模型的前向传播方法,接受像素值、掩码位置、头部掩码等参数。
"""
# 省略了具体的前向传播逻辑,由于不在要求内,不能提供更多细节。
def build(self, input_shape=None):
"""
Build the model, making sure both the DeiT main layer and the decoder are built.
"""
if self.built:
return
self.built = True
if getattr(self, "deit", None) is not None:
with tf.name_scope(self.deit.name):
self.deit.build(None)
if getattr(self, "decoder", None) is not None:
with tf.name_scope(self.decoder.name):
self.decoder.build(None)
@add_start_docstrings(
"""
DeiT Model transformer with an image classification head on top (a linear layer on top of the final hidden state of the [CLS] token), e.g. for ImageNet.
""",
DEIT_START_DOCSTRING,
)
class TFDeiTForImageClassification(TFDeiTPreTrainedModel, TFSequenceClassificationLoss):
def __init__(self, config: DeiTConfig):
super().__init__(config)
# Number of classification labels
self.num_labels = config.num_labels
# DeiT main layer without a pooling layer, named "deit"
self.deit = TFDeiTMainLayer(config, add_pooling_layer=False, name="deit")
# Classifier head
self.classifier = (
# a dense layer named "classifier" when there are labels to predict
keras.layers.Dense(config.num_labels, name="classifier")
if config.num_labels > 0
# otherwise a no-op linear activation, also named "classifier"
else keras.layers.Activation("linear", name="classifier")
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(DEIT_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=TFImageClassifierOutput, config_class=_CONFIG_FOR_DOC)
def call(
self,
pixel_values: tf.Tensor | None = None,
head_mask: tf.Tensor | None = None,
labels: tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: bool = False,
):
"""
DeiT模型的前向传播方法,接受像素值、头部掩码、标签等参数。
"""
# 省略了具体的前向传播逻辑,由于不在要求内,不能提供更多细节。
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# 设置返回字典的标志,如果未提供则使用模型配置中的默认设置
outputs = self.deit(
pixel_values,
head_mask=head_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
# 使用图像编码器(self.deit)处理像素值,可选地应用头部遮罩、输出注意力和隐藏状态,根据return_dict参数返回结果
sequence_output = outputs[0]
# 获取模型输出中的序列输出(通常是最后一层的输出)
logits = self.classifier(sequence_output[:, 0, :])
# 使用分类器(self.classifier)计算逻辑回归,通常使用序列输出的第一个位置的信息
# 不使用蒸馏令牌
loss = None if labels is None else self.hf_compute_loss(labels, logits)
# 如果提供了标签,则使用标签和逻辑回归计算损失,否则损失为None
if not return_dict:
output = (logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
# 如果不要求返回字典形式的结果,则返回一个元组,包含逻辑回归和其它可能的输出
return TFImageClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
# 返回一个TFImageClassifierOutput对象,包含损失、逻辑回归、隐藏状态和注意力信息
def build(self, input_shape=None):
# Return immediately if the model has already been built
if self.built:
return
# Mark the model as built
self.built = True
# Build the "deit" sub-layer if present
if getattr(self, "deit", None) is not None:
with tf.name_scope(self.deit.name):
self.deit.build(None)
# Build the "classifier" head if present, with input shape [None, None, self.config.hidden_size]
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_size])
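# Minimal usage sketch for TFDeiTForImageClassification (checkpoint and image are the usual
# documentation examples; note that the head only uses the [CLS] token, as the call above shows):
import tensorflow as tf
from transformers import AutoImageProcessor, TFDeiTForImageClassification
from PIL import Image
import requests

image = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
processor = AutoImageProcessor.from_pretrained("facebook/deit-base-distilled-patch16-224")
model = TFDeiTForImageClassification.from_pretrained("facebook/deit-base-distilled-patch16-224")
logits = model(**processor(images=image, return_tensors="tf")).logits
predicted_class_idx = int(tf.math.argmax(logits, axis=-1)[0])
print(model.config.id2label[predicted_class_idx])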
# Decorator adding the class docstring, which describes the model and its inference-only warning
@add_start_docstrings(
"""
DeiT Model transformer with image classification heads on top (a linear layer on top of the final hidden state of
the [CLS] token and a linear layer on top of the final hidden state of the distillation token) e.g. for ImageNet.
.. warning::
This model supports inference-only. Fine-tuning with distillation (i.e. with a teacher) is not yet
supported.
""",
DEIT_START_DOCSTRING,
)
class TFDeiTForImageClassificationWithTeacher(TFDeiTPreTrainedModel):
def __init__(self, config: DeiTConfig) -> None:
# Initialize the parent class with the configuration
super().__init__(config)
# Number of classification labels
self.num_labels = config.num_labels
# DeiT main layer without a pooling layer
self.deit = TFDeiTMainLayer(config, add_pooling_layer=False, name="deit")
# Classifier heads:
# a dense layer when there are labels to predict, otherwise a no-op linear activation
self.cls_classifier = (
keras.layers.Dense(config.num_labels, name="cls_classifier")
if config.num_labels > 0
else keras.layers.Activation("linear", name="cls_classifier")
)
# Same pattern for the distillation classifier head
self.distillation_classifier = (
keras.layers.Dense(config.num_labels, name="distillation_classifier")
if config.num_labels > 0
else keras.layers.Activation("linear", name="distillation_classifier")
)
# Keep a reference to the configuration
self.config = config
# Decorators document the inputs and attach a code sample for the forward method
@unpack_inputs
@add_start_docstrings_to_model_forward(DEIT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=TFDeiTForImageClassificationWithTeacherOutput,
config_class=_CONFIG_FOR_DOC,
expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
)
# Forward pass: pixel values, optional head mask, flags for attentions/hidden states,
# return_dict and the training mode
def call(
self,
pixel_values: tf.Tensor | None = None,
head_mask: tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: bool = False,
) -> Union[tuple, TFDeiTForImageClassificationWithTeacherOutput]:
# Fall back to the config default when return_dict is not given
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Run the DeiT encoder
outputs = self.deit(
pixel_values,
head_mask=head_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
# Sequence output of the encoder
sequence_output = outputs[0]
# Classification head on the [CLS] token (position 0)
cls_logits = self.cls_classifier(sequence_output[:, 0, :])
# Distillation head on the distillation token (position 1)
distillation_logits = self.distillation_classifier(sequence_output[:, 1, :])
# At inference time the final prediction is the average of both heads
logits = (cls_logits + distillation_logits) / 2
# Without return_dict, return a plain tuple with all logits and extra encoder outputs
if not return_dict:
output = (logits, cls_logits, distillation_logits) + outputs[1:]
return output
# Otherwise return a TFDeiTForImageClassificationWithTeacherOutput with the averaged logits,
# both heads' logits, hidden states and attentions
return TFDeiTForImageClassificationWithTeacherOutput(
logits=logits,
cls_logits=cls_logits,
distillation_logits=distillation_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
# Build the model's sub-components
def build(self, input_shape=None):
# Return immediately if the model has already been built
if self.built:
return
# Mark the model as built
self.built = True
# Build the DeiT encoder if present
if getattr(self, "deit", None) is not None:
with tf.name_scope(self.deit.name):
self.deit.build(None)
# Build the classification head if present
if getattr(self, "cls_classifier", None) is not None:
with tf.name_scope(self.cls_classifier.name):
self.cls_classifier.build([None, None, self.config.hidden_size])
# Build the distillation head if present
if getattr(self, "distillation_classifier", None) is not None:
with tf.name_scope(self.distillation_classifier.name):
self.distillation_classifier.build([None, None, self.config.hidden_size])
.\models\deit\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_tf_available,
is_torch_available,
is_vision_available,
)
_import_structure = {"configuration_deit": ["DEIT_PRETRAINED_CONFIG_ARCHIVE_MAP", "DeiTConfig", "DeiTOnnxConfig"]}
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["feature_extraction_deit"] = ["DeiTFeatureExtractor"]
_import_structure["image_processing_deit"] = ["DeiTImageProcessor"]
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_deit"] = [
"DEIT_PRETRAINED_MODEL_ARCHIVE_LIST",
"DeiTForImageClassification",
"DeiTForImageClassificationWithTeacher",
"DeiTForMaskedImageModeling",
"DeiTModel",
"DeiTPreTrainedModel",
]
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_tf_deit"] = [
"TF_DEIT_PRETRAINED_MODEL_ARCHIVE_LIST",
"TFDeiTForImageClassification",
"TFDeiTForImageClassificationWithTeacher",
"TFDeiTForMaskedImageModeling",
"TFDeiTModel",
"TFDeiTPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_deit import DEIT_PRETRAINED_CONFIG_ARCHIVE_MAP, DeiTConfig, DeiTOnnxConfig
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .feature_extraction_deit import DeiTFeatureExtractor
from .image_processing_deit import DeiTImageProcessor
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_deit import (
DEIT_PRETRAINED_MODEL_ARCHIVE_LIST,
DeiTForImageClassification,
DeiTForImageClassificationWithTeacher,
DeiTForMaskedImageModeling,
DeiTModel,
DeiTPreTrainedModel,
)
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_tf_deit import (
TF_DEIT_PRETRAINED_MODEL_ARCHIVE_LIST,
TFDeiTForImageClassification,
TFDeiTForImageClassificationWithTeacher,
TFDeiTForMaskedImageModeling,
TFDeiTModel,
TFDeiTPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\deprecated\bort\convert_bort_original_gluonnlp_checkpoint_to_pytorch.py
import argparse
import os
import gluonnlp as nlp
import mxnet as mx
import numpy as np
import torch
from gluonnlp.base import get_home_dir
from gluonnlp.model.bert import BERTEncoder
from gluonnlp.model.utils import _load_vocab
from gluonnlp.vocab import Vocab
from packaging import version
from torch import nn
from transformers import BertConfig, BertForMaskedLM, BertModel, RobertaTokenizer
from transformers.models.bert.modeling_bert import (
BertIntermediate,
BertLayer,
BertOutput,
BertSelfAttention,
BertSelfOutput,
)
from transformers.utils import logging
if version.parse(nlp.__version__) != version.parse("0.8.3"):
raise Exception("requires gluonnlp == 0.8.3")
if version.parse(mx.__version__) != version.parse("1.5.0"):
raise Exception("requires mxnet == 1.5.0")
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
SAMPLE_TEXT = "The Nymphenburg Palace is a beautiful palace in Munich!"
def convert_bort_checkpoint_to_pytorch(bort_checkpoint_path: str, pytorch_dump_folder_path: str):
"""
Convert the original Bort checkpoint (based on MXNET and Gluonnlp) to our BERT structure.
"""
bort_4_8_768_1024_hparams = {
"attention_cell": "multi_head",
"num_layers": 4,
"units": 1024,
"hidden_size": 768,
"max_length": 512,
"num_heads": 8,
"scaled": True,
"dropout": 0.1,
"use_residual": True,
"embed_size": 1024,
"embed_dropout": 0.1,
"word_embed": None,
"layer_norm_eps": 1e-5,
"token_type_vocab_size": 2,
}
predefined_args = bort_4_8_768_1024_hparams
encoder = BERTEncoder(
attention_cell=predefined_args["attention_cell"],
num_layers=predefined_args["num_layers"],
units=predefined_args["units"],
hidden_size=predefined_args["hidden_size"],
max_length=predefined_args["max_length"],
num_heads=predefined_args["num_heads"],
scaled=predefined_args["scaled"],
dropout=predefined_args["dropout"],
output_attention=False,
output_all_encodings=False,
use_residual=predefined_args["use_residual"],
activation=predefined_args.get("activation", "gelu"),
layer_norm_eps=predefined_args.get("layer_norm_eps", None),
)
vocab_name = "openwebtext_ccnews_stories_books_cased"
gluon_cache_dir = os.path.join(get_home_dir(), "models")
bort_vocab = _load_vocab(vocab_name, None, gluon_cache_dir, cls=Vocab)
original_bort = nlp.model.BERTModel(
encoder,
len(bort_vocab),
units=predefined_args["units"],
embed_size=predefined_args["embed_size"],
embed_dropout=predefined_args["embed_dropout"],
word_embed=predefined_args["word_embed"],
use_pooler=False,
use_token_type_embed=False,
token_type_vocab_size=predefined_args["token_type_vocab_size"],
use_classifier=False,
use_decoder=False,
)
original_bort.load_parameters(bort_checkpoint_path, cast_dtype=True, ignore_extra=True)
params = original_bort._collect_params_with_prefix()
hf_bort_config_json = {
"architectures": ["BertForMaskedLM"],
"attention_probs_dropout_prob": predefined_args["dropout"],
"hidden_act": "gelu",
"hidden_dropout_prob": predefined_args["dropout"],
"hidden_size": predefined_args["embed_size"],
"initializer_range": 0.02,
"intermediate_size": predefined_args["hidden_size"],
"layer_norm_eps": predefined_args["layer_norm_eps"],
"max_position_embeddings": predefined_args["max_length"],
"model_type": "bort",
"num_attention_heads": predefined_args["num_heads"],
"num_hidden_layers": predefined_args["num_layers"],
"pad_token_id": 1,
"type_vocab_size": 1,
"vocab_size": len(bort_vocab),
}
hf_bort_config = BertConfig.from_dict(hf_bort_config_json)
hf_bort_model = BertForMaskedLM(hf_bort_config)
hf_bort_model.eval()
def to_torch(mx_array) -> nn.Parameter:
return nn.Parameter(torch.FloatTensor(mx_array.data().asnumpy()))
def check_and_map_params(hf_param, gluon_param):
shape_hf = hf_param.shape
gluon_param = to_torch(params[gluon_param])
shape_gluon = gluon_param.shape
assert (
shape_hf == shape_gluon
), f"The gluon parameter {gluon_param} has shape {shape_gluon}, but expects shape {shape_hf} for Transformers"
return gluon_param
hf_bort_model.bert.embeddings.word_embeddings.weight = check_and_map_params(
hf_bort_model.bert.embeddings.word_embeddings.weight, "word_embed.0.weight"
)
hf_bort_model.bert.embeddings.position_embeddings.weight = check_and_map_params(
hf_bort_model.bert.embeddings.position_embeddings.weight, "encoder.position_weight"
)
hf_bort_model.bert.embeddings.LayerNorm.bias = check_and_map_params(
hf_bort_model.bert.embeddings.LayerNorm.bias, "encoder.layer_norm.beta"
)
hf_bort_model.bert.embeddings.LayerNorm.weight = check_and_map_params(
hf_bort_model.bert.embeddings.LayerNorm.weight, "encoder.layer_norm.gamma"
)
hf_bort_model.bert.embeddings.token_type_embeddings.weight.data = torch.zeros_like(
hf_bort_model.bert.embeddings.token_type_embeddings.weight.data
)
hf_bort_model.half()
tokenizer = RobertaTokenizer.from_pretrained("FacebookAI/roberta-base")
input_ids = tokenizer.encode_plus(SAMPLE_TEXT)["input_ids"]
gluon_input_ids = mx.nd.array([input_ids])
output_gluon = original_bort(inputs=gluon_input_ids, token_types=[])
hf_bort_model.save_pretrained(pytorch_dump_folder_path)
hf_bort_model = BertModel.from_pretrained(pytorch_dump_folder_path)
hf_bort_model.eval()
input_ids = tokenizer.encode_plus(SAMPLE_TEXT, return_tensors="pt")
output_hf = hf_bort_model(**input_ids)[0]
gluon_layer = output_gluon[0].asnumpy()
hf_layer = output_hf[0].detach().numpy()
max_absolute_diff = np.max(np.abs(hf_layer - gluon_layer)).item()
success = np.allclose(gluon_layer, hf_layer, atol=1e-3)
if success:
print("✔️ Both model do output the same tensors")
else:
print("❌ Both model do **NOT** output the same tensors")
print("Absolute difference is:", max_absolute_diff)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--bort_checkpoint_path", default=None, type=str, required=True, help="Path the official Bort params file."
)
parser.add_argument(
"--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
)
args = parser.parse_args()
convert_bort_checkpoint_to_pytorch(args.bort_checkpoint_path, args.pytorch_dump_folder_path)
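# Hypothetical invocation of the conversion above (both paths are placeholders):
# convert_bort_checkpoint_to_pytorch("bort_4_8_768_1024.params", "./bort-pytorch-dump")
# Equivalently, the script is run from the command line with the two required arguments declared
# by the argparse block above, --bort_checkpoint_path and --pytorch_dump_folder_path.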
.\models\deprecated\bort\__init__.py
import os
import shutil
def copy_all_files(src_dir, dst_dir):
if not os.path.exists(dst_dir):
os.makedirs(dst_dir)
for item in os.listdir(src_dir):
src = os.path.join(src_dir, item)
dst = os.path.join(dst_dir, item)
if os.path.isfile(src):
shutil.copy(src, dst)
elif os.path.isdir(src):
copy_all_files(src, dst)
.\models\deprecated\mctct\configuration_mctct.py
from ....configuration_utils import PretrainedConfig
from ....utils import logging
logger = logging.get_logger(__name__)
MCTCT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"speechbrain/m-ctc-t-large": "https://huggingface.co/speechbrain/m-ctc-t-large/resolve/main/config.json",
}
class MCTCTConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`MCTCTModel`]. It is used to instantiate an
M-CTC-T model according to the specified arguments, defining the model architecture. Instantiating a configuration
with the defaults will yield a similar configuration to that of the M-CTC-T
[speechbrain/m-ctc-t-large](https://huggingface.co/speechbrain/m-ctc-t-large) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Example:
```
>>> from transformers import MCTCTConfig, MCTCTModel
>>> # Initializing a M-CTC-T mctct-large style configuration
>>> configuration = MCTCTConfig()
>>> # Initializing a model (with random weights) from the mctct-large style configuration
>>> model = MCTCTModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
model_type = "mctct"
def __init__(
self,
vocab_size=8065,
hidden_size=1536,
num_hidden_layers=36,
intermediate_size=6144,
num_attention_heads=4,
attention_head_dim=384,
max_position_embeddings=920,
layer_norm_eps=1e-5,
layerdrop=0.3,
hidden_act="relu",
initializer_range=0.02,
hidden_dropout_prob=0.3,
attention_probs_dropout_prob=0.3,
pad_token_id=1,
bos_token_id=0,
eos_token_id=2,
conv_glu_dim=1,
conv_dropout=0.3,
num_conv_layers=1,
conv_kernel=(7,),
conv_stride=(3,),
input_feat_per_channel=80,
input_channels=1,
conv_channels=None,
ctc_loss_reduction="sum",
ctc_zero_infinity=False,
**kwargs,
):
super().__init__(**kwargs, pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id)
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.intermediate_size = intermediate_size
self.num_attention_heads = num_attention_heads
self.attention_head_dim = attention_head_dim
self.max_position_embeddings = max_position_embeddings
self.layer_norm_eps = layer_norm_eps
self.layerdrop = layerdrop
self.hidden_act = hidden_act
self.initializer_range = initializer_range
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.pad_token_id = pad_token_id
self.bos_token_id = bos_token_id
self.eos_token_id = eos_token_id
self.conv_glu_dim = conv_glu_dim
self.conv_dropout = conv_dropout
self.num_conv_layers = num_conv_layers
self.input_feat_per_channel = input_feat_per_channel
self.input_channels = input_channels
self.conv_channels = conv_channels
self.ctc_loss_reduction = ctc_loss_reduction
self.ctc_zero_infinity = ctc_zero_infinity
self.conv_kernel = list(conv_kernel)
self.conv_stride = list(conv_stride)
if len(self.conv_kernel) != self.num_conv_layers:
raise ValueError(
"Configuration for convolutional module is incorrect. "
"It is required that `len(config.conv_kernel)` == `config.num_conv_layers` "
f"but is `len(config.conv_kernel) = {len(self.conv_kernel)}`, "
f"`config.num_conv_layers = {self.num_conv_layers}`."
)
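# Quick illustration of the consistency check above (assuming a transformers version that still
# ships the deprecated MCTCT module):
from transformers import MCTCTConfig

MCTCTConfig(num_conv_layers=1, conv_kernel=(7,), conv_stride=(3,))      # passes the check
try:
    MCTCTConfig(num_conv_layers=2, conv_kernel=(7,), conv_stride=(3,))  # len(conv_kernel) != num_conv_layers
except ValueError as err:
    print(err)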
.\models\deprecated\mctct\feature_extraction_mctct.py
"""
Feature extractor class for M-CTC-T
"""
from typing import List, Optional, Union
import numpy as np
from ....audio_utils import mel_filter_bank, optimal_fft_length, spectrogram, window_function
from ....feature_extraction_sequence_utils import SequenceFeatureExtractor
from ....feature_extraction_utils import BatchFeature
from ....file_utils import PaddingStrategy, TensorType
from ....utils import logging
logger = logging.get_logger(__name__)
class MCTCTFeatureExtractor(SequenceFeatureExtractor):
r"""
Constructs a M-CTC-T feature extractor.
This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
most of the main methods. Users should refer to this superclass for more information regarding those methods. This
code has been adapted from Flashlight's C++ code. For more information about the implementation, one can refer to
this [notebook](https://colab.research.google.com/drive/1GLtINkkhzms-IsdcGy_-tVCkv0qNF-Gt#scrollTo=pMCRGMmUC_an)
that takes the user step-by-step in the implementation.
"""
"""
Args:
feature_size (`int`, defaults to 80):
The feature dimension of the extracted features. This is the number of mel_frequency
coefficients to compute per frame.
sampling_rate (`int`, defaults to 16000):
The sampling rate at which the audio files should be digitized, expressed in hertz (Hz).
padding_value (`float`, defaults to 0.0):
The value used to fill the padding frames.
hop_length (`int`, defaults to 10):
Number of audio samples between consecutive frames.
win_length (`int`, defaults to 25):
Length of the window function applied to each frame, in milliseconds.
win_function (`str`, defaults to `"hamming_window"`):
Name for the window function used for windowing, must be accessible via `torch.{win_function}`.
frame_signal_scale (`float`, defaults to 32768.0):
Scaling factor applied to the audio frames before applying Discrete Fourier Transform (DFT).
preemphasis_coeff (`float`, defaults to 0.97):
Coefficient applied in pre-emphasis filtering of audio signals before DFT.
mel_floor (`float`, defaults to 1.0):
Minimum value enforced for mel frequency bank values.
normalize_means (`bool`, *optional*, defaults to `True`):
Whether to zero-mean normalize the extracted features.
normalize_vars (`bool`, *optional*, defaults to `True`):
Whether to unit-variance normalize the extracted features.
"""
model_input_names = ["input_features", "attention_mask"]
def __init__(
self,
feature_size=80,
sampling_rate=16000,
padding_value=0.0,
hop_length=10,
win_length=25,
win_function="hamming_window",
frame_signal_scale=32768.0,
preemphasis_coeff=0.97,
mel_floor=1.0,
normalize_means=True,
normalize_vars=True,
return_attention_mask=False,
**kwargs,
):
super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs)
self.feature_size = feature_size
self.sampling_rate = sampling_rate
self.padding_value = padding_value
self.hop_length = hop_length
self.win_length = win_length
self.frame_signal_scale = frame_signal_scale
self.preemphasis_coeff = preemphasis_coeff
self.mel_floor = mel_floor
self.normalize_means = normalize_means
self.normalize_vars = normalize_vars
self.win_function = win_function
self.return_attention_mask = return_attention_mask
self.sample_size = win_length * sampling_rate // 1000
self.sample_stride = hop_length * sampling_rate // 1000
self.n_fft = optimal_fft_length(self.sample_size)
self.n_freqs = (self.n_fft // 2) + 1
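# With the defaults above (sampling_rate=16000, win_length=25 ms, hop_length=10 ms):
#   sample_size   = 25 * 16000 // 1000 = 400 samples per analysis window
#   sample_stride = 10 * 16000 // 1000 = 160 samples between consecutive frames
#   n_fft         = optimal_fft_length(400) = 512 (next power of two >= 400)
#   n_freqs       = 512 // 2 + 1 = 257 frequency bins per frame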
def _extract_mfsc_features(self, one_waveform: np.array) -> np.ndarray:
if self.win_function == "hamming_window":
window = window_function(window_length=self.sample_size, name=self.win_function, periodic=False)
else:
window = window_function(window_length=self.sample_size, name=self.win_function)
fbanks = mel_filter_bank(
num_frequency_bins=self.n_freqs,
num_mel_filters=self.feature_size,
min_frequency=0.0,
max_frequency=self.sampling_rate / 2.0,
sampling_rate=self.sampling_rate,
)
msfc_features = spectrogram(
one_waveform * self.frame_signal_scale,
window=window,
frame_length=self.sample_size,
hop_length=self.sample_stride,
fft_length=self.n_fft,
center=False,
preemphasis=self.preemphasis_coeff,
mel_filters=fbanks,
mel_floor=self.mel_floor,
log_mel="log",
)
return msfc_features.T
def _normalize_one(self, x, input_length, padding_value):
if self.normalize_means:
mean = x[:input_length].mean(axis=0)
x = np.subtract(x, mean)
if self.normalize_vars:
std = x[:input_length].std(axis=0)
x = np.divide(x, std)
if input_length < x.shape[0]:
x[input_length:] = padding_value
x = x.astype(np.float32)
return x
def normalize(
self, input_features: List[np.ndarray], attention_mask: Optional[np.ndarray] = None
) -> List[np.ndarray]:
lengths = attention_mask.sum(-1) if attention_mask is not None else [x.shape[0] for x in input_features]
return [self._normalize_one(x, n, self.padding_value) for x, n in zip(input_features, lengths)]
def __call__(
self,
raw_speech: Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]],
padding: Union[bool, str, PaddingStrategy] = False,
max_length: Optional[int] = None,
truncation: bool = False,
pad_to_multiple_of: Optional[int] = None,
return_attention_mask: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
sampling_rate: Optional[int] = None,
**kwargs,
):
.\models\deprecated\mctct\modeling_mctct.py
""" PyTorch M-CTC-T 模型。"""
import math
from typing import Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from ....activations import ACT2FN
from ....file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward
from ....integrations.deepspeed import is_deepspeed_zero3_enabled
from ....modeling_attn_mask_utils import _prepare_4d_attention_mask
from ....modeling_outputs import BaseModelOutput, CausalLMOutput
from ....modeling_utils import (
PreTrainedModel,
apply_chunking_to_forward,
find_pruneable_heads_and_indices,
prune_linear_layer,
)
from ....utils import logging
from .configuration_mctct import MCTCTConfig
logger = logging.get_logger(__name__)
_HIDDEN_STATES_START_POSITION = 1
_CONFIG_FOR_DOC = "MCTCTConfig"
_CHECKPOINT_FOR_DOC = "speechbrain/m-ctc-t-large"
_EXPECTED_OUTPUT_SHAPE = [1, 195, 1536]
_CTC_EXPECTED_OUTPUT = '"Mr. Quilter is the apostle of the middle classes, and we\'re glad to welcome his gospel."'
_CTC_EXPECTED_LOSS = 1885.65
MCTCT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"speechbrain/m-ctc-t-large",
]
class MCTCTConv1dSubsampler(nn.Module):
"""
Convolutional subsampler: a stack of 1D convolution (along temporal dimension) followed by non-linear activation
via gated linear units (https://arxiv.org/abs/1911.08460)
"""
def __init__(self, config):
super().__init__()
self.config = config
self.glu_dim = config.conv_glu_dim
self.dropout = nn.Dropout(config.conv_dropout)
self.num_layers = config.num_conv_layers
self.in_channels = config.input_feat_per_channel * config.input_channels
if self.num_layers > 1:
if config.conv_channels is None:
raise ValueError(
"Need to specify `conv_channels` configuration in `MCTCTConfig` to use multiple convolution"
" layers."
)
self.mid_channels = config.conv_channels
else:
self.mid_channels = None
self.out_channels = config.hidden_size * 2
self.kernel_size = config.conv_kernel
self.stride = config.conv_stride
self.conv_layers = nn.ModuleList(
nn.Conv1d(
self.in_channels if i == 0 else self.mid_channels[i-1],
self.mid_channels[i] if i < self.num_layers - 1 else self.out_channels,
kernel_size=k,
stride=self.stride[i],
padding="valid",
)
for i, k in enumerate(self.kernel_size)
)
def forward(self, input_features):
padding = sum([size // 2 for size in self.kernel_size])
input_features = torch.nn.functional.pad(input_features, (0, 0, padding, padding), "constant", 0)
hidden_states = input_features.transpose(1, 2).contiguous()
for conv in self.conv_layers:
hidden_states = conv(hidden_states)
hidden_states = nn.functional.glu(hidden_states, dim=self.glu_dim)
hidden_states = self.dropout(hidden_states)
hidden_states = hidden_states.transpose(1, 2).contiguous()
return hidden_states
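# The GLU halves the channel dimension: the last Conv1d produces hidden_size * 2 channels and
# nn.functional.glu gates one half with the sigmoid of the other. Tiny illustration with the
# default hidden_size of 1536 (values are arbitrary):
_glu_example = torch.randn(2, 3072, 50)                # (batch, channels = hidden_size * 2, frames)
print(nn.functional.glu(_glu_example, dim=1).shape)    # torch.Size([2, 1536, 50])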
class MCTCTEmbeddings(nn.Module):
"""Construct the embeddings from word, position and token_type embeddings."""
def __init__(self, config):
super().__init__()
self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
self.LayerNorm = MCTCTLayerNorm()
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
self.register_buffer(
"token_type_ids",
torch.zeros(self.position_ids.size(), dtype=torch.long, device=self.position_ids.device),
persistent=False,
)
def forward(
self, input_features=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0
):
input_shape = input_features.size() if input_features is not None else inputs_embeds.size()[:-1]
seq_length = input_shape[1]
if position_ids is None:
position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]
if token_type_ids is None:
if hasattr(self, "token_type_ids"):
buffered_token_type_ids = self.token_type_ids[:, :seq_length]
buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
token_type_ids = buffered_token_type_ids_expanded
else:
token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
if inputs_embeds is None:
inputs_embeds = self.word_embeddings(input_features)
token_type_embeddings = self.token_type_embeddings(token_type_ids)
embeddings = inputs_embeds + token_type_embeddings
embeddings = self.LayerNorm(embeddings)
embeddings = self.dropout(embeddings)
return embeddings
class MCTCTSelfAttention(nn.Module):
def __init__(self, config):
super().__init__()
if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
raise ValueError(
f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
f"heads ({config.num_attention_heads})"
)
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = config.attention_head_dim
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=False)
self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=False)
self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=False)
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
self.max_position_embeddings = config.max_position_embeddings
self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)
self.is_decoder = config.is_decoder
def transpose_for_scores(self, x):
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
x = x.view(*new_x_shape)
return x.permute(0, 2, 1, 3)
def reshape_fortran(self, x, shape):
if len(x.shape) > 0:
x = x.permute(*reversed(range(len(x.shape))))
return x.reshape(*reversed(shape)).permute(*reversed(range(len(shape))))
def relative_position_embedding_rotate(self, scores):
scores = scores.permute(0, 2, 3, 1)
batch, hidden_state, seq_len, heads = scores.shape
scores = torch.cat((scores, torch.zeros((batch, seq_len, seq_len, heads), device=scores.device)), dim=1)
scores = self.reshape_fortran(scores, [batch, (hidden_state + seq_len) * seq_len, 1, heads])
scores = scores[:, : (seq_len + hidden_state - 1) * seq_len]
scores = self.reshape_fortran(scores, [batch, hidden_state + seq_len - 1, seq_len, heads])
halfpoint = hidden_state // 2
scores = scores[:, halfpoint : halfpoint + seq_len].transpose(1, 2)
return scores.permute(0, 3, 1, 2)
def forward(
self,
hidden_states,
attention_mask=None,
head_mask=None,
output_attentions=False,
):
mixed_query_layer = self.query(hidden_states)
mixed_query_layer = mixed_query_layer / math.sqrt(self.attention_head_size)
key_layer = self.transpose_for_scores(self.key(hidden_states))
value_layer = self.transpose_for_scores(self.value(hidden_states))
query_layer = self.transpose_for_scores(mixed_query_layer)
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
positional_embedding = self.distance_embedding.weight
relative_position_scores = torch.einsum("lh, bche -> bcle", positional_embedding, query_layer.transpose(2, 3))
relative_position_scores = self.relative_position_embedding_rotate(relative_position_scores)
attention_scores = attention_scores + relative_position_scores
if attention_mask is not None:
attention_scores = attention_scores + attention_mask
attention_probs = nn.functional.softmax(attention_scores, dim=-1)
attention_probs = self.dropout(attention_probs)
if head_mask is not None:
attention_probs = attention_probs * head_mask
context_layer = torch.matmul(attention_probs, value_layer)
context_layer = context_layer.permute(0, 2, 1, 3).flatten(start_dim=-2)
outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
return outputs
class MCTCTLayerNorm(nn.Module):
def __init__(self):
super().__init__()
self.singleton_weight = nn.Parameter(torch.ones(1))
self.singleton_bias = nn.Parameter(torch.zeros(1))
def forward(self, hidden_states):
return (hidden_states * self.singleton_weight) + self.singleton_bias
class MCTCTSelfOutput(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
self.dense = nn.Linear(config.hidden_size, config.hidden_size, bias=False)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states, input_tensor):
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
class MCTCTAttention(nn.Module):
def __init__(self, config):
super().__init__()
self.self = MCTCTSelfAttention(config)
self.output = MCTCTSelfOutput(config)
self.pruned_heads = set()
def prune_heads(self, heads):
if len(heads) == 0:
return
heads, index = find_pruneable_heads_and_indices(
heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
)
self.self.query = prune_linear_layer(self.self.query, index)
self.self.key = prune_linear_layer(self.self.key, index)
self.self.value = prune_linear_layer(self.self.value, index)
self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
self.pruned_heads = self.pruned_heads.union(heads)
def forward(
self,
hidden_states,
attention_mask=None,
head_mask=None,
output_attentions=False,
):
self_outputs = self.self(
hidden_states,
attention_mask,
head_mask,
output_attentions,
)
attention_output = self.output(self_outputs[0], hidden_states)
outputs = (attention_output,) + self_outputs[1:]
return outputs
class MCTCTIntermediate(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.intermediate_size, bias=False)
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
def forward(self, hidden_states):
hidden_states = self.dense(hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states
class MCTCTOutput(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.intermediate_size, config.hidden_size, bias=False)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states, input_tensor):
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
class MCTCTLayer(nn.Module):
def __init__(self, config: MCTCTConfig):
super().__init__()
self.seq_len_dim = 1
self.chunk_size_feed_forward = config.chunk_size_feed_forward
self.intermediate = MCTCTIntermediate(config)
self.attention = MCTCTAttention(config)
self.is_decoder = config.is_decoder
self.output = MCTCTOutput(config)
def forward(
self,
hidden_states,
attention_mask=None,
head_mask=None,
output_attentions=False,
):
self_attention_outputs = self.attention(
hidden_states, attention_mask, head_mask, output_attentions=output_attentions
)
attention_output = self_attention_outputs[0]
outputs = self_attention_outputs[1:]
layer_output = apply_chunking_to_forward(
self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
)
outputs = (layer_output,) + outputs
return outputs
def feed_forward_chunk(self, attention_output):
intermediate_output = self.intermediate(attention_output)
layer_output = self.output(intermediate_output, attention_output)
return layer_output
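`MCTCTLayer` routes the feed-forward part through `apply_chunking_to_forward`: when `config.chunk_size_feed_forward` is non-zero, the sequence dimension (`seq_len_dim = 1`) is processed in slices and the results are concatenated, which lowers peak memory and is numerically equivalent to one full-width call. A simplified standalone sketch of the idea (not the library helper itself):

import torch

def chunked_forward(forward_fn, chunk_size, seq_dim, tensor):
    # Run forward_fn on slices along seq_dim and stitch the outputs back together.
    if chunk_size == 0:
        return forward_fn(tensor)
    chunks = tensor.split(chunk_size, dim=seq_dim)
    return torch.cat([forward_fn(chunk) for chunk in chunks], dim=seq_dim)

ff = torch.nn.Linear(8, 8)   # stand-in for the intermediate + output blocks
x = torch.randn(2, 12, 8)
print(torch.allclose(ff(x), chunked_forward(ff, chunk_size=4, seq_dim=1, tensor=x), atol=1e-6))  # True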
class MCTCTPreTrainedModel(PreTrainedModel):
"""
An abstract class that handles weights initialization and provides a simple interface for downloading and loading pretrained models.
"""
config_class = MCTCTConfig
base_model_prefix = "mctct"
main_input_name = "input_features"
supports_gradient_checkpointing = True
def _init_weights(self, module):
"""Initialize the weights"""
std = self.config.initializer_range
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=std)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=std)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
elif isinstance(module, MCTCTLayerNorm):
module.singleton_weight.data.fill_(1.0)
module.singleton_bias.data.zero_()
if isinstance(module, (nn.Linear, nn.Conv1d)):
module.weight.data.normal_(mean=0.0, std=std)
if module.bias is not None:
module.bias.data.zero_()
def _get_feat_extract_output_lengths(self, input_lengths: torch.LongTensor):
"""
Computes the output length of the convolutional layers
"""
dilation = 1
for _, kernel_sz, stride in zip(
range(self.config.num_conv_layers), self.config.conv_kernel, self.config.conv_stride
):
padding = kernel_sz // 2
input_lengths = input_lengths + 2 * padding - dilation * (kernel_sz - 1) - 1
input_lengths = torch.div(input_lengths, stride, rounding_mode="trunc") + 1
return input_lengths
def _get_feature_vector_attention_mask(self, feature_vector_length, attention_mask):
if len(attention_mask.shape) > 2:
attention_mask = attention_mask[:, :, -1]
subsampled_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1))
bsz = attention_mask.size()[0]
attention_mask = torch.zeros(
(bsz, feature_vector_length), dtype=attention_mask.dtype, device=attention_mask.device
)
attention_mask[(torch.arange(bsz, device=attention_mask.device), subsampled_lengths - 1)] = 1
attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).long()
return attention_mask
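The two helpers above translate a frame-level attention mask into the subsampled feature space. `_get_feat_extract_output_lengths` applies the usual convolution output-length formula, floor((L + 2*padding - dilation*(kernel - 1) - 1) / stride) + 1, once per conv layer, and `_get_feature_vector_attention_mask` rebuilds a binary mask from those lengths with a one-hot / flip / cumsum / flip trick. A numeric sketch with made-up kernel and stride values:

import torch

def conv_output_length(length, kernel_sz=7, stride=3, dilation=1):
    # Same formula as _get_feat_extract_output_lengths, for a single layer.
    padding = kernel_sz // 2
    length = length + 2 * padding - dilation * (kernel_sz - 1) - 1
    return torch.div(length, stride, rounding_mode="trunc") + 1

lengths = torch.tensor([30, 18])           # valid input frames per example
sub_lengths = conv_output_length(lengths)  # tensor([10, 6])

feature_vector_length = 10
mask = torch.zeros((2, feature_vector_length), dtype=torch.long)
mask[(torch.arange(2), sub_lengths - 1)] = 1   # mark the last valid feature position
mask = mask.flip([-1]).cumsum(-1).flip([-1])   # spread the 1 backwards to position 0
print(mask)
# tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
#         [1, 1, 1, 1, 1, 1, 0, 0, 0, 0]])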
MCTCT_START_DOCSTRING = r"""
This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
Parameters:
config ([`MCTCTConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
MCTCT_INPUTS_DOCSTRING = r"""
Args:
input_features (`torch.FloatTensor` of shape `({0}, feature_size)`):
Float values of the audio features extracted by [`MCTCTFeatureExtractor`]. See
[`MCTCTFeatureExtractor.__call__`] for details.
attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
class MCTCTEncoder(MCTCTPreTrainedModel):
def __init__(self, config: MCTCTConfig):
super().__init__(config)
self.hidden_dropout_prob = config.hidden_dropout_prob
self.layer_norm = MCTCTLayerNorm()
self.conv = MCTCTConv1dSubsampler(config)
self.layers = nn.ModuleList([MCTCTLayer(config) for _ in range(config.num_hidden_layers)])
self.gradient_checkpointing = False
def forward(
self,
input_features: torch.Tensor,
attention_mask: torch.Tensor,
head_mask: torch.Tensor,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
"The bare M-CTC-T Model transformer outputting raw hidden-states without any specific head on top.",
MCTCT_START_DOCSTRING,
class MCTCTModel(MCTCTPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.config = config
self.encoder = MCTCTEncoder(config)
self.post_init()
@add_start_docstrings_to_model_forward(MCTCT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutput,
config_class=_CONFIG_FOR_DOC,
modality="audio",
expected_output=_EXPECTED_OUTPUT_SHAPE,
)
def forward(
self,
input_features: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutput]:
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if input_features is None:
raise ValueError("You have to specify input_features.")
encoder_outputs = self.encoder(
input_features,
attention_mask=attention_mask,
head_mask=head_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = encoder_outputs[0]
if not return_dict:
return (sequence_output,) + encoder_outputs[1:]
return BaseModelOutput(
last_hidden_state=sequence_output,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
@add_start_docstrings(
"""MCTCT Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).""",
MCTCT_START_DOCSTRING,
)
class MCTCTForCTC(MCTCTPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.mctct = MCTCTModel(config)
if config.vocab_size is None:
raise ValueError(
f"You are trying to instantiate {self.__class__} with a configuration that "
"does not define the vocabulary size of the language model head. Please "
"instantiate the model as follows: `MCTCTForCTC.from_pretrained(..., vocab_size=vocab_size)`. "
"or define `vocab_size` of your model's configuration."
)
output_hidden_size = config.hidden_size
self.ctc_head = nn.Linear(output_hidden_size, config.vocab_size)
self.post_init()
@add_start_docstrings_to_model_forward(MCTCT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=CausalLMOutput,
config_class=_CONFIG_FOR_DOC,
expected_output=_CTC_EXPECTED_OUTPUT,
expected_loss=_CTC_EXPECTED_LOSS,
)
def forward(
self,
input_features: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: Optional[torch.LongTensor] = None,
) -> Union[Tuple, CausalLMOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to
the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`.
All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
config.vocab_size - 1]`.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.mctct(
input_features,
attention_mask=attention_mask,
head_mask=head_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
hidden_states = outputs[0]
logits = self.ctc_head(hidden_states)
loss = None
if labels is not None:
if labels.max() >= self.config.vocab_size:
raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}")
attention_mask = (
attention_mask
if attention_mask is not None
else torch.ones(input_features.shape[:-1], dtype=torch.long)
)
input_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1)).to(torch.long)
labels_mask = labels >= 0
target_lengths = labels_mask.sum(-1)
flattened_targets = labels.masked_select(labels_mask)
log_probs = nn.functional.log_softmax(logits, dim=-1, dtype=torch.float32).transpose(0, 1)
with torch.backends.cudnn.flags(enabled=False):
loss = nn.functional.ctc_loss(
log_probs,
flattened_targets,
input_lengths,
target_lengths,
blank=self.config.pad_token_id,
reduction=self.config.ctc_loss_reduction,
zero_infinity=self.config.ctc_zero_infinity,
)
if not return_dict:
output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
return ((loss,) + output) if loss is not None else output
return CausalLMOutput(
loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions
)
.\models\deprecated\mctct\processing_mctct.py
"""
Speech processor class for M-CTC-T
"""
import warnings
from contextlib import contextmanager
from ....processing_utils import ProcessorMixin
class MCTCTProcessor(ProcessorMixin):
r"""
Constructs a MCTCT processor which wraps a MCTCT feature extractor and a MCTCT tokenizer into a single processor.
[`MCTCTProcessor`] offers all the functionalities of [`MCTCTFeatureExtractor`] and [`AutoTokenizer`]. See the
[`~MCTCTProcessor.__call__`] and [`~MCTCTProcessor.decode`] for more information.
Args:
feature_extractor (`MCTCTFeatureExtractor`):
An instance of [`MCTCTFeatureExtractor`]. The feature extractor is a required input.
tokenizer (`AutoTokenizer`):
An instance of [`AutoTokenizer`]. The tokenizer is a required input.
"""
feature_extractor_class = "MCTCTFeatureExtractor"
tokenizer_class = "AutoTokenizer"
def __init__(self, feature_extractor, tokenizer):
super().__init__(feature_extractor, tokenizer)
self.current_processor = self.feature_extractor
self._in_target_context_manager = False
"""
当对象在正常模式下使用时,此方法将所有参数转发给 MCTCTFeatureExtractor 的 [`~MCTCTFeatureExtractor.__call__`] 并返回其输出。
如果在 [`~MCTCTProcessor.as_target_processor`] 上下文中使用,则将所有参数转发给 AutoTokenizer 的 [`~AutoTokenizer.__call__`]。
更多信息请参考上述两个方法的文档字符串。
"""
if self._in_target_context_manager:
return self.current_processor(*args, **kwargs)
if "raw_speech" in kwargs:
warnings.warn("Using `raw_speech` as a keyword argument is deprecated. Use `audio` instead.")
audio = kwargs.pop("raw_speech")
else:
audio = kwargs.pop("audio", None)
sampling_rate = kwargs.pop("sampling_rate", None)
text = kwargs.pop("text", None)
if len(args) > 0:
audio = args[0]
args = args[1:]
if audio is None and text is None:
raise ValueError("You need to specify either an `audio` or `text` input to process.")
if audio is not None:
inputs = self.feature_extractor(audio, *args, sampling_rate=sampling_rate, **kwargs)
if text is not None:
encodings = self.tokenizer(text, **kwargs)
if text is None:
return inputs
elif audio is None:
return encodings
else:
inputs["labels"] = encodings["input_ids"]
return inputs
def batch_decode(self, *args, **kwargs):
"""
This method forwards all its arguments to AutoTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please refer to the docstring of this method for more information.
"""
return self.tokenizer.batch_decode(*args, **kwargs)
def pad(self, *args, **kwargs):
"""
When used in normal mode, this method forwards all its arguments to MCTCTFeatureExtractor's
[`~MCTCTFeatureExtractor.pad`] and returns its output. If used in the context
[`~MCTCTProcessor.as_target_processor`] this method forwards all its arguments to PreTrainedTokenizer's
[`~PreTrainedTokenizer.pad`]. Please refer to the docstring of the above two methods for more information.
"""
if self._in_target_context_manager:
return self.current_processor.pad(*args, **kwargs)
input_features = kwargs.pop("input_features", None)
labels = kwargs.pop("labels", None)
if len(args) > 0:
input_features = args[0]
args = args[1:]
if input_features is not None:
input_features = self.feature_extractor.pad(input_features, *args, **kwargs)
if labels is not None:
labels = self.tokenizer.pad(labels, **kwargs)
if labels is None:
return input_features
elif input_features is None:
return labels
else:
input_features["labels"] = labels["input_ids"]
return input_features
def decode(self, *args, **kwargs):
"""
This method forwards all its arguments to AutoTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to the
docstring of this method for more information.
"""
return self.tokenizer.decode(*args, **kwargs)
@contextmanager
def as_target_processor(self):
"""
Temporarily sets the tokenizer for processing the input. Useful for encoding the labels when fine-tuning MCTCT.
"""
warnings.warn(
"`as_target_processor` is deprecated and will be removed in v5 of Transformers. You can process your "
"labels by using the argument `text` of the regular `__call__` method (either in the same call as "
"your audio inputs, or in a separate call."
)
self._in_target_context_manager = True
self.current_processor = self.tokenizer
yield
self.current_processor = self.feature_extractor
self._in_target_context_manager = False
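For preparing training data, the current recommendation (see the deprecation warning above) is a single processor call that passes `audio` and `text` together rather than the `as_target_processor` context. A small sketch with an illustrative checkpoint name:

from transformers import MCTCTProcessor

processor = MCTCTProcessor.from_pretrained("speechbrain/m-ctc-t-large")  # example checkpoint

waveform = [0.0] * 16000  # placeholder audio, one second at 16 kHz
batch = processor(audio=waveform, text="hello world", sampling_rate=16000, return_tensors="pt")

# The feature extractor contributes `input_features` (and possibly `attention_mask`),
# while the tokenizer's input_ids are attached as `labels` for the CTC loss.
print(sorted(batch.keys()))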
.\models\deprecated\mctct\__init__.py
from typing import TYPE_CHECKING
from ....utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
_import_structure = {
"configuration_mctct": ["MCTCT_PRETRAINED_CONFIG_ARCHIVE_MAP", "MCTCTConfig"],
"feature_extraction_mctct": ["MCTCTFeatureExtractor"],
"processing_mctct": ["MCTCTProcessor"],
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_mctct"] = [
"MCTCT_PRETRAINED_MODEL_ARCHIVE_LIST",
"MCTCTForCTC",
"MCTCTModel",
"MCTCTPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_mctct import MCTCT_PRETRAINED_CONFIG_ARCHIVE_MAP, MCTCTConfig
from .feature_extraction_mctct import MCTCTFeatureExtractor
from .processing_mctct import MCTCTProcessor
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_mctct import MCTCT_PRETRAINED_MODEL_ARCHIVE_LIST, MCTCTForCTC, MCTCTModel, MCTCTPreTrainedModel
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\deprecated\mmbt\configuration_mmbt.py
""" MMBT configuration"""
from ....utils import logging
logger = logging.get_logger(__name__)
class MMBTConfig(object):
"""
This is the configuration class to store the configuration of a [`MMBTModel`]. It is used to instantiate a MMBT
model according to the specified arguments, defining the model architecture.
Args:
config ([`PretrainedConfig`]):
Config of the underlying Transformer models. Its values are copied over to use a single config.
num_labels (`int`, *optional*):
Size of final Linear layer for classification.
modal_hidden_size (`int`, *optional*, defaults to 2048):
Embedding dimension of the non-text modality encoder.
"""
def __init__(self, config, num_labels=None, modal_hidden_size=2048):
self.__dict__ = config.__dict__
self.modal_hidden_size = modal_hidden_size
if num_labels:
self.num_labels = num_labels
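Because `MMBTConfig` simply aliases the wrapped config's `__dict__`, every attribute of the underlying text config remains visible on it, with the multimodal fields layered on top. A minimal sketch using the `MMBTConfig` defined above together with a `BertConfig` (values are illustrative):

from transformers import BertConfig

text_config = BertConfig(hidden_size=768)
mmbt_config = MMBTConfig(text_config, num_labels=2, modal_hidden_size=2048)

print(mmbt_config.hidden_size)        # 768, inherited from the wrapped BertConfig
print(mmbt_config.modal_hidden_size)  # 2048
print(mmbt_config.num_labels)         # 2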
.\models\deprecated\mmbt\modeling_mmbt.py
"""PyTorch MMBT model."""
import torch
from torch import nn
from torch.nn import CrossEntropyLoss, MSELoss
from ....modeling_outputs import BaseModelOutputWithPooling, SequenceClassifierOutput
from ....modeling_utils import ModuleUtilsMixin
from ....utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "MMBTConfig"
class ModalEmbeddings(nn.Module):
"""Generic Modal Embeddings which takes in an encoder, and a transformer embedding."""
def __init__(self, config, encoder, embeddings):
super().__init__()
self.config = config
self.encoder = encoder
self.proj_embeddings = nn.Linear(config.modal_hidden_size, config.hidden_size)
self.position_embeddings = embeddings.position_embeddings
self.token_type_embeddings = embeddings.token_type_embeddings
self.word_embeddings = embeddings.word_embeddings
self.LayerNorm = embeddings.LayerNorm
self.dropout = nn.Dropout(p=config.hidden_dropout_prob)
def forward(self, input_modal, start_token=None, end_token=None, position_ids=None, token_type_ids=None):
token_embeddings = self.proj_embeddings(self.encoder(input_modal))
seq_length = token_embeddings.size(1)
if start_token is not None:
start_token_embeds = self.word_embeddings(start_token)
seq_length += 1
token_embeddings = torch.cat([start_token_embeds.unsqueeze(1), token_embeddings], dim=1)
if end_token is not None:
end_token_embeds = self.word_embeddings(end_token)
seq_length += 1
token_embeddings = torch.cat([token_embeddings, end_token_embeds.unsqueeze(1)], dim=1)
if position_ids is None:
position_ids = torch.arange(seq_length, dtype=torch.long, device=input_modal.device)
position_ids = position_ids.unsqueeze(0).expand(input_modal.size(0), seq_length)
if token_type_ids is None:
token_type_ids = torch.zeros(
(input_modal.size(0), seq_length), dtype=torch.long, device=input_modal.device
)
position_embeddings = self.position_embeddings(position_ids)
token_type_embeddings = self.token_type_embeddings(token_type_ids)
embeddings = token_embeddings + position_embeddings + token_type_embeddings
embeddings = self.LayerNorm(embeddings)
embeddings = self.dropout(embeddings)
return embeddings
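`ModalEmbeddings.forward` turns the second modality into a short token sequence: the encoder's region features are projected to `hidden_size`, optional start/end word embeddings are attached on either side, and position plus token-type embeddings are added before LayerNorm and dropout. A standalone shape walkthrough with a dummy encoder output and made-up token ids:

import torch
from torch import nn

batch, num_regions, modal_hidden_size, hidden_size = 2, 3, 2048, 768

image_features = torch.randn(batch, num_regions, modal_hidden_size)  # pretend encoder output
proj = nn.Linear(modal_hidden_size, hidden_size)
word_embeddings = nn.Embedding(30522, hidden_size)

token_embeddings = proj(image_features)                                        # (2, 3, 768)
start = word_embeddings(torch.full((batch,), 101, dtype=torch.long)).unsqueeze(1)  # hypothetical start id
end = word_embeddings(torch.full((batch,), 102, dtype=torch.long)).unsqueeze(1)    # hypothetical end id
token_embeddings = torch.cat([start, token_embeddings, end], dim=1)

print(token_embeddings.shape)  # torch.Size([2, 5, 768]); position/type embeddings are added next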
"""
MMBT model was proposed in [Supervised Multimodal Bitransformers for Classifying Images and
Text](https://github.com/facebookresearch/mmbt) by Douwe Kiela, Suvrat Bhooshan, Hamed Firooz, Davide Testuggine.
It's a supervised multimodal bitransformer model that fuses information from text and image encoders, and
obtains state-of-the-art performance on various multimodal classification benchmark tasks.
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`MMBTConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration.
transformer (`nn.Module`): A text transformer that is used by MMBT.
It should have embeddings, encoder, and pooler attributes.
encoder (`nn.Module`): Encoder for the second modality.
It should take in a batch of modal inputs and return embeddings of shape `(batch_size, num_modal_tokens, modal_hidden_size)`.
"""
"""
The bare MMBT Model outputting raw hidden-states without any specific head on top.
Inherits documentation from MMBT_START_DOCSTRING.
This class represents the core MMBT model without any additional task-specific head, providing raw hidden states.
Attributes:
config: Model configuration instance containing all model parameters.
transformer: Text transformer module used by MMBT for processing textual inputs.
modal_encoder: Module for handling the encoding of the second modality (e.g., images).
Methods:
forward: Defines the forward pass of the model, detailing how inputs propagate through the network.
get_input_embeddings: Retrieves the word embedding layer of the model.
set_input_embeddings: Sets the word embedding layer of the model.
"""
class MMBTForClassification(nn.Module):
r"""
**labels**: (*optional*) `torch.LongTensor` of shape `(batch_size,)`:
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
Returns: *Tuple* comprising various elements depending on the configuration (config) and inputs: **loss**:
(*optional*, returned when `labels` is provided) `torch.FloatTensor` of shape `(1,)`: Classification (or
regression if config.num_labels==1) loss. **logits**:
`torch.FloatTensor` of shape `(batch_size, config.num_labels)` Classification (or regression if
config.num_labels==1) scores (before SoftMax).
**hidden_states**: (*optional*, returned when `output_hidden_states=True`) list of `torch.FloatTensor` (one for
the output of each layer + the output of the embeddings) of shape `(batch_size, sequence_length, hidden_size)`:
Hidden-states of the model at the output of each layer plus the initial embedding outputs. **attentions**:
(*optional*, returned when `output_attentions=True`) list of `torch.FloatTensor` (one for each layer) of shape
`(batch_size, num_heads, sequence_length, sequence_length)`: Attentions weights after the attention softmax, used
to compute the weighted average in the self-attention heads.
Examples:
```
# For example purposes. Not runnable.
transformer = BertModel.from_pretrained("google-bert/bert-base-uncased")
encoder = ImageEncoder(args)
model = MMBTForClassification(config, transformer, encoder)
outputs = model(input_modal, input_ids, labels=labels)
loss, logits = outputs[:2]
```"""
def __init__(self, config, transformer, encoder):
super().__init__()
self.num_labels = config.num_labels
self.mmbt = MMBTModel(config, transformer, encoder)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
def forward(
self,
input_modal,
input_ids=None,
modal_start_tokens=None,
modal_end_tokens=None,
attention_mask=None,
token_type_ids=None,
modal_token_type_ids=None,
position_ids=None,
modal_position_ids=None,
head_mask=None,
inputs_embeds=None,
labels=None,
return_dict=None,
):
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.mmbt(
input_modal=input_modal,
input_ids=input_ids,
modal_start_tokens=modal_start_tokens,
modal_end_tokens=modal_end_tokens,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
modal_token_type_ids=modal_token_type_ids,
position_ids=position_ids,
modal_position_ids=modal_position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
return_dict=return_dict,
)
pooled_output = outputs[1]
pooled_output = self.dropout(pooled_output)
logits = self.classifier(pooled_output)
loss = None
if labels is not None:
if self.num_labels == 1:
loss_fct = MSELoss()
loss = loss_fct(logits.view(-1), labels.view(-1))
else:
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return SequenceClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)