Transformers Source Code Walkthrough (Part 40)
.\models\dinat\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
_import_structure = {"configuration_dinat": ["DINAT_PRETRAINED_CONFIG_ARCHIVE_MAP", "DinatConfig"]}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_dinat"] = [
"DINAT_PRETRAINED_MODEL_ARCHIVE_LIST",
"DinatForImageClassification",
"DinatModel",
"DinatPreTrainedModel",
"DinatBackbone",
]
if TYPE_CHECKING:
from .configuration_dinat import DINAT_PRETRAINED_CONFIG_ARCHIVE_MAP, DinatConfig
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_dinat import (
DINAT_PRETRAINED_MODEL_ARCHIVE_LIST,
DinatBackbone,
DinatForImageClassification,
DinatModel,
DinatPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\dinov2\configuration_dinov2.py
""" DINOv2 模型配置 """
from collections import OrderedDict
from typing import Mapping
from packaging import version
from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfig
from ...utils import logging
from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices
logger = logging.get_logger(__name__)
DINOV2_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"facebook/dinov2-base": "https://huggingface.co/facebook/dinov2-base/resolve/main/config.json",
}
class Dinov2Config(BackboneConfigMixin, PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`Dinov2Model`]. It is used to instantiate a Dinov2 model according to the specified arguments, defining the model architecture.
Instantiating a configuration with the defaults will yield a similar configuration to that of the Dinov2 [google/dinov2-base-patch16-224] architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information.
Example:
```
>>> from transformers import Dinov2Config, Dinov2Model
>>> # Initializing a Dinov2 dinov2-base-patch16-224 style configuration
>>> configuration = Dinov2Config()
>>> # Initializing a model (with random weights) from the dinov2-base-patch16-224 style configuration
>>> model = Dinov2Model(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
model_type = "dinov2"
def __init__(
self,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
mlp_ratio=4,
hidden_act="gelu",
hidden_dropout_prob=0.0,
attention_probs_dropout_prob=0.0,
initializer_range=0.02,
layer_norm_eps=1e-6,
image_size=224,
patch_size=16,
num_channels=3,
qkv_bias=True,
layerscale_value=1.0,
drop_path_rate=0.0,
use_swiglu_ffn=False,
out_features=None,
out_indices=None,
apply_layernorm=True,
reshape_hidden_states=True,
**kwargs,
):
super().__init__(**kwargs)
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.mlp_ratio = mlp_ratio
self.hidden_act = hidden_act
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.image_size = image_size
self.patch_size = patch_size
self.num_channels = num_channels
self.qkv_bias = qkv_bias
self.layerscale_value = layerscale_value
self.drop_path_rate = drop_path_rate
self.use_swiglu_ffn = use_swiglu_ffn
self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, num_hidden_layers + 1)]
self._out_features, self._out_indices = get_aligned_output_features_output_indices(
out_features=out_features, out_indices=out_indices, stage_names=self.stage_names
)
self.apply_layernorm = apply_layernorm
self.reshape_hidden_states = reshape_hidden_states
class Dinov2OnnxConfig(OnnxConfig):
torch_onnx_minimum_version = version.parse("1.11")
@property
def inputs(self) -> Mapping[str, Mapping[int, str]]:
return OrderedDict(
[
("pixel_values", {0: "batch", 1: "num_channels", 2: "height", 3: "width"}),
]
)
@property
def atol_for_validation(self) -> float:
return 1e-4
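The `stage_names` list built in `Dinov2Config.__init__` ("stem" plus one entry per hidden layer) is what `get_aligned_output_features_output_indices` aligns `out_features` / `out_indices` against. A minimal sketch of that alignment (the chosen stage names are only an illustration, not a recommended setting):

```
from transformers import Dinov2Config

# Default config: 12 hidden layers -> stage_names == ["stem", "stage1", ..., "stage12"]
config = Dinov2Config(out_features=["stage4", "stage8", "stage12"])

print(config.stage_names[:4])  # ['stem', 'stage1', 'stage2', 'stage3']
print(config.out_features)     # ['stage4', 'stage8', 'stage12']
print(config.out_indices)      # indices of those names in stage_names, e.g. [4, 8, 12]
```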
.\models\dinov2\convert_dinov2_to_hf.py
import argparse
import json
from pathlib import Path
import requests
import torch
import torch.nn as nn
from huggingface_hub import hf_hub_download
from PIL import Image
from torchvision import transforms
from transformers import BitImageProcessor, Dinov2Config, Dinov2ForImageClassification, Dinov2Model
from transformers.image_utils import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, PILImageResampling
from transformers.utils import logging
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
def get_dinov2_config(model_name, image_classifier=False):
config = Dinov2Config(image_size=518, patch_size=14)
if "vits" in model_name:
config.hidden_size = 384
config.num_attention_heads = 6
elif "vitb" in model_name:
pass
elif "vitl" in model_name:
config.hidden_size = 1024
config.num_hidden_layers = 24
config.num_attention_heads = 16
elif "vitg" in model_name:
config.use_swiglu_ffn = True
config.hidden_size = 1536
config.num_hidden_layers = 40
config.num_attention_heads = 24
else:
raise ValueError("Model not supported")
if image_classifier:
repo_id = "huggingface/label-files"
filename = "imagenet-1k-id2label.json"
config.num_labels = 1000
config.id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r"))
config.id2label = {int(k): v for k, v in config.id2label.items()}
return config
def create_rename_keys(config):
rename_keys = []
rename_keys.append(("cls_token", "embeddings.cls_token"))
rename_keys.append(("mask_token", "embeddings.mask_token"))
rename_keys.append(("pos_embed", "embeddings.position_embeddings"))
rename_keys.append(("patch_embed.proj.weight", "embeddings.patch_embeddings.projection.weight"))
rename_keys.append(("patch_embed.proj.bias", "embeddings.patch_embeddings.projection.bias"))
for i in range(config.num_hidden_layers):
rename_keys.append((f"blocks.{i}.norm1.weight", f"encoder.layer.{i}.norm1.weight"))
rename_keys.append((f"blocks.{i}.norm1.bias", f"encoder.layer.{i}.norm1.bias"))
rename_keys.append((f"blocks.{i}.norm2.weight", f"encoder.layer.{i}.norm2.weight"))
rename_keys.append((f"blocks.{i}.norm2.bias", f"encoder.layer.{i}.norm2.bias"))
if config.use_swiglu_ffn:
rename_keys.append((f"blocks.{i}.mlp.w12.weight", f"encoder.layer.{i}.mlp.w12.weight"))
rename_keys.append((f"blocks.{i}.mlp.w12.bias", f"encoder.layer.{i}.mlp.w12.bias"))
rename_keys.append((f"blocks.{i}.mlp.w3.weight", f"encoder.layer.{i}.mlp.w3.weight"))
rename_keys.append((f"blocks.{i}.mlp.w3.bias", f"encoder.layer.{i}.mlp.w3.bias"))
else:
rename_keys.append((f"blocks.{i}.mlp.fc1.weight", f"encoder.layer.{i}.mlp.fc1.weight"))
rename_keys.append((f"blocks.{i}.mlp.fc1.bias", f"encoder.layer.{i}.mlp.fc1.bias"))
rename_keys.append((f"blocks.{i}.mlp.fc2.weight", f"encoder.layer.{i}.mlp.fc2.weight"))
rename_keys.append((f"blocks.{i}.mlp.fc2.bias", f"encoder.layer.{i}.mlp.fc2.bias"))
rename_keys.append((f"blocks.{i}.ls1.gamma", f"encoder.layer.{i}.layer_scale1.lambda1"))
rename_keys.append((f"blocks.{i}.ls2.gamma", f"encoder.layer.{i}.layer_scale2.lambda1"))
rename_keys.append((f"blocks.{i}.attn.proj.weight", f"encoder.layer.{i}.attention.output.dense.weight"))
rename_keys.append((f"blocks.{i}.attn.proj.bias", f"encoder.layer.{i}.attention.output.dense.bias"))
rename_keys.append(("norm.weight", "layernorm.weight"))
rename_keys.append(("norm.bias", "layernorm.bias"))
return rename_keys
def rename_key(dct, old, new):
val = dct.pop(old)
dct[new] = val
def read_in_q_k_v(state_dict, config):
for i in range(config.num_hidden_layers):
in_proj_weight = state_dict.pop(f"blocks.{i}.attn.qkv.weight")
in_proj_bias = state_dict.pop(f"blocks.{i}.attn.qkv.bias")
state_dict[f"encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[: config.hidden_size, :]
state_dict[f"encoder.layer.{i}.attention.attention.query.bias"] = in_proj_bias[: config.hidden_size]
state_dict[f"encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[
config.hidden_size : config.hidden_size * 2, :
]
state_dict[f"encoder.layer.{i}.attention.attention.key.bias"] = in_proj_bias[
config.hidden_size : config.hidden_size * 2
]
state_dict[f"encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[-config.hidden_size :, :]
state_dict[f"encoder.layer.{i}.attention.attention.value.bias"] = in_proj_bias[-config.hidden_size :]
def prepare_img():
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
return image
@torch.no_grad()
def convert_dinov2_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub=False):
"""
Copy/paste/tweak model's weights to our DINOv2 structure.
"""
image_classifier = "1layer" in model_name
config = get_dinov2_config(model_name, image_classifier=image_classifier)
original_model = torch.hub.load("facebookresearch/dinov2", model_name.replace("_1layer", ""))
original_model.eval()
state_dict = original_model.state_dict()
rename_keys = create_rename_keys(config)
for src, dest in rename_keys:
rename_key(state_dict, src, dest)
read_in_q_k_v(state_dict, config)
for key, val in state_dict.copy().items():
val = state_dict.pop(key)
if "w12" in key:
key = key.replace("w12", "weights_in")
if "w3" in key:
key = key.replace("w3", "weights_out")
state_dict[key] = val
if image_classifier:
model = Dinov2ForImageClassification(config).eval()
model.dinov2.load_state_dict(state_dict)
model_name_to_classifier_dict_url = {
"dinov2_vits14_1layer": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vits14/dinov2_vits14_linear_head.pth",
"dinov2_vitb14_1layer": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitb14/dinov2_vitb14_linear_head.pth",
"dinov2_vitl14_1layer": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitl14/dinov2_vitl14_linear_head.pth",
"dinov2_vitg14_1layer": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitg14/dinov2_vitg14_linear_head.pth",
}
url = model_name_to_classifier_dict_url[model_name]
classifier_state_dict = torch.hub.load_state_dict_from_url(url, map_location="cpu")
model.classifier.weight = nn.Parameter(classifier_state_dict["weight"])
model.classifier.bias = nn.Parameter(classifier_state_dict["bias"])
else:
model = Dinov2Model(config).eval()
model.load_state_dict(state_dict)
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
transformations = transforms.Compose(
[
transforms.Resize(256, interpolation=transforms.InterpolationMode.BICUBIC),
transforms.CenterCrop(224),
transforms.ToTensor(),
transforms.Normalize(
mean=IMAGENET_DEFAULT_MEAN,
std=IMAGENET_DEFAULT_STD,
),
]
)
original_pixel_values = transformations(image).unsqueeze(0)
processor = BitImageProcessor(
size={"shortest_edge": 256},
resample=PILImageResampling.BICUBIC,
image_mean=IMAGENET_DEFAULT_MEAN,
image_std=IMAGENET_DEFAULT_STD,
)
pixel_values = processor(image, return_tensors="pt").pixel_values
assert torch.allclose(original_pixel_values, pixel_values)
with torch.no_grad():
outputs = model(pixel_values, output_hidden_states=True)
original_outputs = original_model(pixel_values)
if image_classifier:
print("Predicted class:")
class_idx = outputs.logits.argmax(-1).item()
print(model.config.id2label[class_idx])
else:
assert outputs.last_hidden_state[:, 0].shape == original_outputs.shape
assert torch.allclose(outputs.last_hidden_state[:, 0], original_outputs, atol=1e-3)
print("Looks ok!")
if pytorch_dump_folder_path is not None:
Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
print(f"Saving model {model_name} to {pytorch_dump_folder_path}")
model.save_pretrained(pytorch_dump_folder_path)
print(f"Saving image processor to {pytorch_dump_folder_path}")
processor.save_pretrained(pytorch_dump_folder_path)
if push_to_hub:
model_name_to_hf_name = {
"dinov2_vits14": "dinov2-small",
"dinov2_vitb14": "dinov2-base",
"dinov2_vitl14": "dinov2-large",
"dinov2_vitg14": "dinov2-giant",
"dinov2_vits14_1layer": "dinov2-small-imagenet1k-1-layer",
"dinov2_vitb14_1layer": "dinov2-base-imagenet1k-1-layer",
"dinov2_vitl14_1layer": "dinov2-large-imagenet1k-1-layer",
"dinov2_vitg14_1layer": "dinov2-giant-imagenet1k-1-layer",
}
name = model_name_to_hf_name[model_name]
model.push_to_hub(f"facebook/{name}")
processor.push_to_hub(f"facebook/{name}")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--model_name",
default="dinov2_vitb14",
type=str,
choices=[
"dinov2_vits14",
"dinov2_vitb14",
"dinov2_vitl14",
"dinov2_vitg14",
"dinov2_vits14_1layer",
"dinov2_vitb14_1layer",
"dinov2_vitl14_1layer",
"dinov2_vitg14_1layer",
],
help="Name of the model you'd like to convert.",
)
parser.add_argument(
"--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory."
)
parser.add_argument(
"--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
)
args = parser.parse_args()
convert_dinov2_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub)
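With the argument parser above, a typical conversion run looks like `python convert_dinov2_to_hf.py --model_name dinov2_vitb14 --pytorch_dump_folder_path ./dinov2-base-converted`; adding `--push_to_hub` additionally uploads the converted model and image processor. The output directory name here is just an example.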
.\models\dinov2\modeling_dinov2.py
""" PyTorch DINOv2 模型."""
import collections.abc
import math
from typing import Dict, List, Optional, Set, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN
from ...modeling_outputs import (
BackboneOutput,
BaseModelOutput,
BaseModelOutputWithPooling,
ImageClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import (
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from ...utils.backbone_utils import BackboneMixin
from .configuration_dinov2 import Dinov2Config
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "Dinov2Config"
_CHECKPOINT_FOR_DOC = "facebook/dinov2-base"
_EXPECTED_OUTPUT_SHAPE = [1, 257, 768]
_IMAGE_CLASS_CHECKPOINT = "facebook/dinov2-small-imagenet1k-1-layer"
_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat"
DINOV2_PRETRAINED_MODEL_ARCHIVE_LIST = [
"facebook/dinov2-base",
]
class Dinov2Embeddings(nn.Module):
"""
Constructs the CLS token, mask token, position and patch embeddings.
"""
def __init__(self, config: Dinov2Config) -> None:
super().__init__()
self.cls_token = nn.Parameter(torch.randn(1, 1, config.hidden_size))
self.mask_token = nn.Parameter(torch.zeros(1, config.hidden_size))
self.patch_embeddings = Dinov2PatchEmbeddings(config)
num_patches = self.patch_embeddings.num_patches
self.position_embeddings = nn.Parameter(torch.randn(1, num_patches + 1, config.hidden_size))
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.config = config
def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
"""
This method interpolates the pre-trained position encodings for higher resolution images.
Source:
https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174
"""
num_patches = embeddings.shape[1] - 1
num_positions = self.position_embeddings.shape[1] - 1
if num_patches == num_positions and height == width:
return self.position_embeddings
class_pos_embed = self.position_embeddings[:, 0]
patch_pos_embed = self.position_embeddings[:, 1:]
dim = embeddings.shape[-1]
height = height // self.config.patch_size
width = width // self.config.patch_size
height, width = height + 0.1, width + 0.1
patch_pos_embed = patch_pos_embed.reshape(1, int(math.sqrt(num_positions)), int(math.sqrt(num_positions)), dim)
patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
target_dtype = patch_pos_embed.dtype
patch_pos_embed = nn.functional.interpolate(
patch_pos_embed.to(dtype=torch.float32),
scale_factor=(float(height / math.sqrt(num_positions)), float(width / math.sqrt(num_positions))),
mode="bicubic",
align_corners=False,
).to(dtype=target_dtype)
if int(height) != patch_pos_embed.shape[-2] or int(width) != patch_pos_embed.shape[-1]:
raise ValueError("Width or height does not match with the interpolated position embeddings")
patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1)
def forward(self, pixel_values: torch.Tensor, bool_masked_pos: Optional[torch.Tensor] = None) -> torch.Tensor:
batch_size, _, height, width = pixel_values.shape
target_dtype = self.patch_embeddings.projection.weight.dtype
embeddings = self.patch_embeddings(pixel_values.to(dtype=target_dtype))
if bool_masked_pos is not None:
embeddings = torch.where(
bool_masked_pos.unsqueeze(-1), self.mask_token.to(embeddings.dtype).unsqueeze(0), embeddings
)
cls_tokens = self.cls_token.expand(batch_size, -1, -1)
embeddings = torch.cat((cls_tokens, embeddings), dim=1)
embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
embeddings = self.dropout(embeddings)
return embeddings
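A small shape-tracing sketch for `Dinov2Embeddings` under the default configuration (224x224 input, patch size 16): 14x14 = 196 patch tokens plus the prepended CLS token give a sequence length of 197. This assumes the module can be instantiated standalone with random weights:

```
import torch
from transformers import Dinov2Config
from transformers.models.dinov2.modeling_dinov2 import Dinov2Embeddings

config = Dinov2Config()                  # image_size=224, patch_size=16, hidden_size=768
embeddings = Dinov2Embeddings(config)

pixel_values = torch.randn(1, 3, 224, 224)
out = embeddings(pixel_values)
print(out.shape)                         # torch.Size([1, 197, 768]) -> 14*14 patches + CLS token
```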
class Dinov2PatchEmbeddings(nn.Module):
"""
This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
`hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
Transformer.
"""
def __init__(self, config):
super().__init__()
image_size, patch_size = config.image_size, config.patch_size
num_channels, hidden_size = config.num_channels, config.hidden_size
image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
self.image_size = image_size
self.patch_size = patch_size
self.num_channels = num_channels
self.num_patches = num_patches
self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)
def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
num_channels = pixel_values.shape[1]
if num_channels != self.num_channels:
raise ValueError(
"Make sure that the channel dimension of the pixel values match with the one set in the configuration."
f" Expected {self.num_channels} but got {num_channels}."
)
embeddings = self.projection(pixel_values).flatten(2).transpose(1, 2)
return embeddings
class Dinov2SelfAttention(nn.Module):
def __init__(self, config: Dinov2Config) -> None:
super().__init__()
if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
raise ValueError(
f"The hidden size {config.hidden_size,} is not a multiple of the number of attention "
f"heads {config.num_attention_heads}."
)
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
x = x.view(new_x_shape)
return x.permute(0, 2, 1, 3)
def forward(
self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False
) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
mixed_query_layer = self.query(hidden_states)
key_layer = self.transpose_for_scores(self.key(hidden_states))
value_layer = self.transpose_for_scores(self.value(hidden_states))
query_layer = self.transpose_for_scores(mixed_query_layer)
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
attention_scores = attention_scores / math.sqrt(self.attention_head_size)
attention_probs = nn.functional.softmax(attention_scores, dim=-1)
attention_probs = self.dropout(attention_probs)
if head_mask is not None:
attention_probs = attention_probs * head_mask
context_layer = torch.matmul(attention_probs, value_layer)
context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
context_layer = context_layer.view(new_context_layer_shape)
outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
return outputs
class Dinov2SelfOutput(nn.Module):
"""
The residual connection is defined in Dinov2Layer instead of here (as is the case with other models), due to the
layernorm applied before each block.
"""
def __init__(self, config: Dinov2Config) -> None:
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
return hidden_states
class Dinov2Attention(nn.Module):
def __init__(self, config: Dinov2Config) -> None:
super().__init__()
self.attention = Dinov2SelfAttention(config)
self.output = Dinov2SelfOutput(config)
self.pruned_heads = set()
def prune_heads(self, heads: Set[int]) -> None:
if len(heads) == 0:
return
heads, index = find_pruneable_heads_and_indices(
heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads
)
self.attention.query = prune_linear_layer(self.attention.query, index)
self.attention.key = prune_linear_layer(self.attention.key, index)
self.attention.value = prune_linear_layer(self.attention.value, index)
self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads)
self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads
self.pruned_heads = self.pruned_heads.union(heads)
def forward(
self,
hidden_states: torch.Tensor,
head_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
self_outputs = self.attention(hidden_states, head_mask, output_attentions)
attention_output = self.output(self_outputs[0], hidden_states)
outputs = (attention_output,) + self_outputs[1:]
return outputs
def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
"""
Implements the DropPath (stochastic depth) operation: randomly drops entire residual paths per sample during training to improve generalization.
"""
if drop_prob == 0.0 or not training:
return input
keep_prob = 1 - drop_prob
shape = (input.shape[0],) + (1,) * (input.ndim - 1)
random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
random_tensor.floor_()
output = input.div(keep_prob) * random_tensor
return output
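`drop_path` zeroes whole samples with probability `drop_prob` and rescales the survivors by `1 / keep_prob`, so the expected activation is unchanged during training. A quick numerical check of that expectation (a sketch over a large synthetic batch):

```
import torch

torch.manual_seed(0)
x = torch.ones(10_000, 8)               # 10k "samples", feature dim 8
drop_prob, keep_prob = 0.3, 0.7

# 1 with probability keep_prob, 0 otherwise (same trick as drop_path above)
mask = (keep_prob + torch.rand(x.shape[0], 1)).floor_()
out = x.div(keep_prob) * mask

print(mask.mean().item())               # ~0.7: fraction of kept samples
print(out.mean().item())                # ~1.0: expectation matches the input
```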
class Dinov2DropPath(nn.Module):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
def __init__(self, drop_prob: Optional[float] = None) -> None:
super().__init__()
self.drop_prob = drop_prob
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
return drop_path(hidden_states, self.drop_prob, self.training)
def extra_repr(self) -> str:
return "p={}".format(self.drop_prob)
class Dinov2MLP(nn.Module):
def __init__(self, config) -> None:
super().__init__()
in_features = out_features = config.hidden_size
hidden_features = int(config.hidden_size * config.mlp_ratio)
self.fc1 = nn.Linear(in_features, hidden_features, bias=True)
if isinstance(config.hidden_act, str):
self.activation = ACT2FN[config.hidden_act]
else:
self.activation = config.hidden_act
self.fc2 = nn.Linear(hidden_features, out_features, bias=True)
def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
hidden_state = self.fc1(hidden_state)
hidden_state = self.activation(hidden_state)
hidden_state = self.fc2(hidden_state)
return hidden_state
class Dinov2SwiGLUFFN(nn.Module):
def __init__(self, config) -> None:
super().__init__()
in_features = out_features = config.hidden_size
hidden_features = int(config.hidden_size * config.mlp_ratio)
hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8
self.weights_in = nn.Linear(in_features, 2 * hidden_features, bias=True)
self.weights_out = nn.Linear(hidden_features, out_features, bias=True)
def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
hidden_state = self.weights_in(hidden_state)
x1, x2 = hidden_state.chunk(2, dim=-1)
hidden = nn.functional.silu(x1) * x2
return self.weights_out(hidden)
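For the default `hidden_size=768` and `mlp_ratio=4`, the SwiGLU branch shrinks the intermediate width to roughly two thirds of 3072 and rounds up to a multiple of 8, so `weights_in` maps 768 -> 2 * 2048 and `weights_out` maps 2048 -> 768. A worked check of that arithmetic using the same formula as above:

```
hidden_size, mlp_ratio = 768, 4
hidden_features = int(hidden_size * mlp_ratio)             # 3072
hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8

print(hidden_features)                                      # 2048
```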
class Dinov2Layer(nn.Module):
"""This corresponds to the Block class in the original implementation."""
def __init__(self, config: Dinov2Config) -> None:
super().__init__()
self.norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.attention = Dinov2Attention(config)
self.layer_scale1 = Dinov2LayerScale(config)
self.drop_path = Dinov2DropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity()
self.norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
if config.use_swiglu_ffn:
self.mlp = Dinov2SwiGLUFFN(config)
else:
self.mlp = Dinov2MLP(config)
self.layer_scale2 = Dinov2LayerScale(config)
def forward(
self,
hidden_states: torch.Tensor,
head_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
self_attention_outputs = self.attention(
self.norm1(hidden_states),
head_mask,
output_attentions=output_attentions,
)
attention_output = self_attention_outputs[0]
attention_output = self.layer_scale1(attention_output)
outputs = self_attention_outputs[1:]
hidden_states = self.drop_path(attention_output) + hidden_states
layer_output = self.norm2(hidden_states)
layer_output = self.mlp(layer_output)
layer_output = self.layer_scale2(layer_output)
layer_output = self.drop_path(layer_output) + hidden_states
outputs = (layer_output,) + outputs
return outputs
class Dinov2Encoder(nn.Module):
def __init__(self, config: Dinov2Config) -> None:
super().__init__()
self.config = config
self.layer = nn.ModuleList([Dinov2Layer(config) for _ in range(config.num_hidden_layers)])
self.gradient_checkpointing = False
def forward(
self,
hidden_states: torch.Tensor,
head_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
) -> Union[tuple, BaseModelOutput]:
all_hidden_states = () if output_hidden_states else None
all_self_attentions = () if output_attentions else None
for i, layer_module in enumerate(self.layer):
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
layer_head_mask = head_mask[i] if head_mask is not None else None
if self.gradient_checkpointing and self.training:
layer_outputs = self._gradient_checkpointing_func(
layer_module.__call__,
hidden_states,
layer_head_mask,
output_attentions,
)
else:
layer_outputs = layer_module(hidden_states, layer_head_mask, output_attentions)
hidden_states = layer_outputs[0]
if output_attentions:
all_self_attentions = all_self_attentions + (layer_outputs[1],)
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
return BaseModelOutput(
last_hidden_state=hidden_states,
hidden_states=all_hidden_states,
attentions=all_self_attentions,
)
class Dinov2PreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = Dinov2Config
base_model_prefix = "dinov2"
main_input_name = "pixel_values"
supports_gradient_checkpointing = True
def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None:
"""Initialize the weights"""
if isinstance(module, (nn.Linear, nn.Conv2d)):
module.weight.data = nn.init.trunc_normal_(
module.weight.data.to(torch.float32), mean=0.0, std=self.config.initializer_range
).to(module.weight.dtype)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
elif isinstance(module, Dinov2Embeddings):
module.position_embeddings.data = nn.init.trunc_normal_(
module.position_embeddings.data.to(torch.float32),
mean=0.0,
std=self.config.initializer_range,
).to(module.position_embeddings.dtype)
module.cls_token.data = nn.init.trunc_normal_(
module.cls_token.data.to(torch.float32),
mean=0.0,
std=self.config.initializer_range,
).to(module.cls_token.dtype)
DINOV2_BASE_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`BitImageProcessor.preprocess`] for details.
bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, sequence_length)`):
Boolean masked positions. Indicates which patches are masked (1) and which are not (0). Only relevant for pre-training.
head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
DINOV2_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`BitImageProcessor.preprocess`] for details.
head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@add_start_docstrings(
"The bare DINOv2 Model transformer outputting raw hidden-states without any specific head on top.",
DINOV2_START_DOCSTRING,
)
class Dinov2Model(Dinov2PreTrainedModel):
"""
DINOv2 模型类,继承自预训练模型基类 Dinov2PreTrainedModel。
"""
def __init__(self, config: Dinov2Config):
"""
初始化方法,设置模型配置信息。
Args:
config (Dinov2Config): 模型的配置对象。
"""
super().__init__(config)
self.config = config
self.embeddings = Dinov2Embeddings(config)
self.encoder = Dinov2Encoder(config)
self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.post_init()
def get_input_embeddings(self) -> Dinov2PatchEmbeddings:
"""
返回输入嵌入层对象。
Returns:
Dinov2PatchEmbeddings: 输入嵌入层对象。
"""
return self.embeddings.patch_embeddings
def _prune_heads(self, heads_to_prune: Dict[int, List[int]]) -> None:
"""
对模型的注意力头进行修剪。
Args:
heads_to_prune (Dict[int, List[int]]): 要在每层修剪的注意力头的字典。
See base class PreTrainedModel
"""
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
@add_start_docstrings_to_model_forward(DINOV2_BASE_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPooling,
config_class=_CONFIG_FOR_DOC,
modality="vision",
expected_output=_EXPECTED_OUTPUT_SHAPE,
)
def forward(
self,
pixel_values: Optional[torch.Tensor] = None,
bool_masked_pos: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPooling]:
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if pixel_values is None:
raise ValueError("You have to specify pixel_values")
head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
embedding_output = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos)
encoder_outputs = self.encoder(
embedding_output,
head_mask=head_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = encoder_outputs[0]
sequence_output = self.layernorm(sequence_output)
pooled_output = sequence_output[:, 0, :]
if not return_dict:
head_outputs = (sequence_output, pooled_output)
return head_outputs + encoder_outputs[1:]
return BaseModelOutputWithPooling(
last_hidden_state=sequence_output,
pooler_output=pooled_output,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
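A usage sketch for the bare model, mirroring the sample that `add_code_sample_docstrings` generates for the `facebook/dinov2-base` checkpoint (needs network access to download the weights and the test image):

```
import torch
import requests
from PIL import Image
from transformers import AutoImageProcessor, Dinov2Model

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

processor = AutoImageProcessor.from_pretrained("facebook/dinov2-base")
model = Dinov2Model.from_pretrained("facebook/dinov2-base")

inputs = processor(images=image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

print(outputs.last_hidden_state.shape)  # expected torch.Size([1, 257, 768]) for a 224x224 crop
print(outputs.pooler_output.shape)      # torch.Size([1, 768]) -- the CLS token after layernorm
```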
@add_start_docstrings(
"""
Dinov2 Model transformer with an image classification head on top (a linear layer on top of the final hidden state
of the [CLS] token) e.g. for ImageNet.
""",
DINOV2_START_DOCSTRING,
)
class Dinov2ForImageClassification(Dinov2PreTrainedModel):
def __init__(self, config: Dinov2Config) -> None:
super().__init__(config)
self.num_labels = config.num_labels
self.dinov2 = Dinov2Model(config)
self.classifier = (
nn.Linear(config.hidden_size * 2, config.num_labels) if config.num_labels > 0 else nn.Identity()
)
self.post_init()
@add_start_docstrings_to_model_forward(DINOV2_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=ImageClassifierOutput,
config_class=_CONFIG_FOR_DOC,
expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
)
def forward(
self,
pixel_values: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[tuple, ImageClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.dinov2(
pixel_values,
head_mask=head_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
cls_token = sequence_output[:, 0]
patch_tokens = sequence_output[:, 1:]
linear_input = torch.cat([cls_token, patch_tokens.mean(dim=1)], dim=1)
logits = self.classifier(linear_input)
loss = None
if labels is not None:
labels = labels.to(logits.device)
if self.config.problem_type is None:
if self.num_labels == 1:
self.config.problem_type = "regression"
elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
self.config.problem_type = "single_label_classification"
else:
self.config.problem_type = "multi_label_classification"
if self.config.problem_type == "regression":
loss_fct = MSELoss()
if self.num_labels == 1:
loss = loss_fct(logits.squeeze(), labels.squeeze())
else:
loss = loss_fct(logits, labels)
elif self.config.problem_type == "single_label_classification":
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(logits, labels)
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return ImageClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
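Note that the classification head receives the concatenation of the CLS token and the mean of the patch tokens, which is why it was constructed as `nn.Linear(config.hidden_size * 2, config.num_labels)`. A toy shape check of that concatenation (random tensors, no real forward pass):

```
import torch

batch_size, seq_len, hidden_size = 2, 257, 768
sequence_output = torch.randn(batch_size, seq_len, hidden_size)

cls_token = sequence_output[:, 0]        # (2, 768)
patch_tokens = sequence_output[:, 1:]    # (2, 256, 768)
linear_input = torch.cat([cls_token, patch_tokens.mean(dim=1)], dim=1)

print(linear_input.shape)                # torch.Size([2, 1536]) == hidden_size * 2
```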
@add_start_docstrings(
"""
Dinov2 backbone, to be used with frameworks like DETR and MaskFormer.
""",
DINOV2_START_DOCSTRING,
)
class Dinov2Backbone(Dinov2PreTrainedModel, BackboneMixin):
def __init__(self, config):
super().__init__(config)
super()._init_backbone(config)
self.num_features = [config.hidden_size for _ in range(config.num_hidden_layers + 1)]
self.embeddings = Dinov2Embeddings(config)
self.encoder = Dinov2Encoder(config)
self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.post_init()
def get_input_embeddings(self) -> Dinov2PatchEmbeddings:
return self.embeddings.patch_embeddings
@add_start_docstrings_to_model_forward(DINOV2_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=BackboneOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
pixel_values: torch.Tensor,
output_hidden_states: Optional[bool] = None,
output_attentions: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> BackboneOutput:
"""
定义方法签名和返回类型注解,方法返回类型为 BackboneOutput。
返回方法的输出结果,通常用于说明方法的功能。
Examples: 示例用法,展示如何使用此方法的代码片段。
```
>>> from transformers import AutoImageProcessor, AutoBackbone
>>> import torch
>>> from PIL import Image
>>> import requests
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> processor = AutoImageProcessor.from_pretrained("facebook/dinov2-base")
>>> model = AutoBackbone.from_pretrained(
... "facebook/dinov2-base", out_features=["stage2", "stage5", "stage8", "stage11"]
... )
>>> inputs = processor(image, return_tensors="pt")
>>> outputs = model(**inputs)
>>> feature_maps = outputs.feature_maps
>>> list(feature_maps[-1].shape)
[1, 768, 16, 16]
```"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
embedding_output = self.embeddings(pixel_values)
outputs = self.encoder(
embedding_output, output_hidden_states=True, output_attentions=output_attentions, return_dict=return_dict
)
hidden_states = outputs.hidden_states if return_dict else outputs[1]
feature_maps = ()
for stage, hidden_state in zip(self.stage_names, hidden_states):
if stage in self.out_features:
if self.config.apply_layernorm:
hidden_state = self.layernorm(hidden_state)
if self.config.reshape_hidden_states:
hidden_state = hidden_state[:, 1:]
batch_size, _, height, width = pixel_values.shape
patch_size = self.config.patch_size
hidden_state = hidden_state.reshape(batch_size, height // patch_size, width // patch_size, -1)
hidden_state = hidden_state.permute(0, 3, 1, 2).contiguous()
feature_maps += (hidden_state,)
if not return_dict:
if output_hidden_states:
output = (feature_maps,) + outputs[1:]
else:
output = (feature_maps,) + outputs[2:]
return output
return BackboneOutput(
feature_maps=feature_maps,
hidden_states=outputs.hidden_states if output_hidden_states else None,
attentions=outputs.attentions if output_attentions else None,
)
.\models\dinov2\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_torch_available,
)
_import_structure = {
"configuration_dinov2": ["DINOV2_PRETRAINED_CONFIG_ARCHIVE_MAP", "Dinov2Config", "Dinov2OnnxConfig"]
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_dinov2"] = [
"DINOV2_PRETRAINED_MODEL_ARCHIVE_LIST",
"Dinov2ForImageClassification",
"Dinov2Model",
"Dinov2PreTrainedModel",
"Dinov2Backbone",
]
if TYPE_CHECKING:
from .configuration_dinov2 import DINOV2_PRETRAINED_CONFIG_ARCHIVE_MAP, Dinov2Config, Dinov2OnnxConfig
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_dinov2 import (
DINOV2_PRETRAINED_MODEL_ARCHIVE_LIST,
Dinov2Backbone,
Dinov2ForImageClassification,
Dinov2Model,
Dinov2PreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\distilbert\configuration_distilbert.py
from collections import OrderedDict
from typing import Mapping
from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfig
from ...utils import logging
logger = logging.get_logger(__name__)
DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"distilbert-base-uncased": "https://huggingface.co/distilbert-base-uncased/resolve/main/config.json",
"distilbert-base-uncased-distilled-squad": (
"https://huggingface.co/distilbert-base-uncased-distilled-squad/resolve/main/config.json"
),
"distilbert-base-cased": "https://huggingface.co/distilbert-base-cased/resolve/main/config.json",
"distilbert-base-cased-distilled-squad": (
"https://huggingface.co/distilbert-base-cased-distilled-squad/resolve/main/config.json"
),
"distilbert-base-german-cased": "https://huggingface.co/distilbert-base-german-cased/resolve/main/config.json",
"distilbert-base-multilingual-cased": (
"https://huggingface.co/distilbert-base-multilingual-cased/resolve/main/config.json"
),
"distilbert-base-uncased-finetuned-sst-2-english": (
"https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english/resolve/main/config.json"
),
}
class DistilBertConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`DistilBertModel`] or a [`TFDistilBertModel`]. It
is used to instantiate a DistilBERT model according to the specified arguments, defining the model architecture.
Instantiating a configuration with the defaults will yield a similar configuration to that of the DistilBERT
[distilbert-base-uncased](https://huggingface.co/distilbert-base-uncased) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
"""
pass
Args:
vocab_size (`int`, *optional*, defaults to 30522):
Vocabulary size of the DistilBERT model. Defines the number of different tokens that can be represented when calling [`DistilBertModel`] or [`TFDistilBertModel`].
max_position_embeddings (`int`, *optional*, defaults to 512):
The maximum sequence length that this model might ever be used with. Typically set this to something large (e.g., 512, 1024 or 2048) just in case.
sinusoidal_pos_embds (`boolean`, *optional*, defaults to `False`):
Whether to use sinusoidal positional embeddings.
n_layers (`int`, *optional*, defaults to 6):
Number of hidden layers in the Transformer encoder.
n_heads (`int`, *optional*, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
dim (`int`, *optional*, defaults to 768):
Dimensionality of the encoder layers and the pooler layer.
hidden_dim (`int`, *optional*, defaults to 3072):
The size of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
dropout (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_dropout (`float`, *optional*, defaults to 0.1):
The dropout ratio for the attention probabilities.
activation (`str` or `Callable`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. `"gelu"`, `"relu"`, `"silu"` and `"gelu_new"` are supported.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated normal initializer for initializing all weight matrices.
qa_dropout (`float`, *optional*, defaults to 0.1):
The dropout probability used in the question answering model [`DistilBertForQuestionAnswering`].
seq_classif_dropout (`float`, *optional*, defaults to 0.2):
The dropout probability used in the sequence classification and multiple choice model [`DistilBertForSequenceClassification`].
Examples:
```
>>> from transformers import DistilBertConfig, DistilBertModel
>>> # Initializing a DistilBERT configuration
>>> configuration = DistilBertConfig()
>>> # Initializing a model (with random weights) from the configuration
>>> model = DistilBertModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
model_type = "distilbert"
attribute_map = {
"hidden_size": "dim",
"num_attention_heads": "n_heads",
"num_hidden_layers": "n_layers",
}
def __init__(
self,
vocab_size=30522,
max_position_embeddings=512,
sinusoidal_pos_embds=False,
n_layers=6,
n_heads=12,
dim=768,
hidden_dim=4 * 768,
dropout=0.1,
attention_dropout=0.1,
activation="gelu",
initializer_range=0.02,
qa_dropout=0.1,
seq_classif_dropout=0.2,
pad_token_id=0,
**kwargs,
):
super().__init__(**kwargs, pad_token_id=pad_token_id)
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.sinusoidal_pos_embds = sinusoidal_pos_embds
self.n_layers = n_layers
self.n_heads = n_heads
self.dim = dim
self.hidden_dim = hidden_dim
self.dropout = dropout
self.attention_dropout = attention_dropout
self.activation = activation
self.initializer_range = initializer_range
self.qa_dropout = qa_dropout
self.seq_classif_dropout = seq_classif_dropout
class DistilBertOnnxConfig(OnnxConfig):
@property
def inputs(self) -> Mapping[str, Mapping[int, str]]:
if self.task == "multiple-choice":
dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"}
else:
dynamic_axis = {0: "batch", 1: "sequence"}
return OrderedDict(
[
("input_ids", dynamic_axis),
("attention_mask", dynamic_axis),
]
)
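Thanks to `attribute_map`, the DistilBERT-specific names (`dim`, `n_heads`, `n_layers`) are also readable through the generic `hidden_size` / `num_attention_heads` / `num_hidden_layers` properties used elsewhere in the library. A quick sketch:

```
from transformers import DistilBertConfig

config = DistilBertConfig()
print(config.dim, config.hidden_size)              # 768 768
print(config.n_heads, config.num_attention_heads)  # 12 12
print(config.n_layers, config.num_hidden_layers)   # 6 6
```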
.\models\distilbert\modeling_distilbert.py
def _get_unpad_data(attention_mask):
seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
max_seqlen_in_batch = seqlens_in_batch.max().item()
cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
return (
indices,
cu_seqlens,
max_seqlen_in_batch,
)
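A toy illustration of what `_get_unpad_data` computes for a padded batch of two sequences with real lengths 3 and 1 (the same operations as the helper above, spelled out on concrete tensors):

```
import torch
import torch.nn.functional as F

attention_mask = torch.tensor([[1, 1, 1, 0],
                               [1, 0, 0, 0]])

seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)              # tensor([3, 1])
indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()   # tensor([0, 1, 2, 4])
max_seqlen_in_batch = seqlens_in_batch.max().item()                           # 3
cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))  # tensor([0, 3, 4])

print(indices, cu_seqlens, max_seqlen_in_batch)
```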
def create_sinusoidal_embeddings(n_pos: int, dim: int, out: torch.Tensor):
if is_deepspeed_zero3_enabled():
import deepspeed
with deepspeed.zero.GatheredParameters(out, modifier_rank=0):
if torch.distributed.get_rank() == 0:
_create_sinusoidal_embeddings(n_pos=n_pos, dim=dim, out=out)
else:
_create_sinusoidal_embeddings(n_pos=n_pos, dim=dim, out=out)
def _create_sinusoidal_embeddings(n_pos: int, dim: int, out: torch.Tensor):
position_enc = np.array([[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for pos in range(n_pos)])
out.requires_grad = False
out[:, 0::2] = torch.FloatTensor(np.sin(position_enc[:, 0::2]))
out[:, 1::2] = torch.FloatTensor(np.cos(position_enc[:, 1::2]))
out.detach_()
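A quick sanity check of the sinusoidal table built by `_create_sinusoidal_embeddings`: even columns hold sines and odd columns hold cosines, so position 0 comes out as alternating 0 and 1. A self-contained sketch that repeats the same construction on a tiny table:

```
import numpy as np
import torch

n_pos, dim = 4, 6
out = torch.empty(n_pos, dim)

position_enc = np.array(
    [[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for pos in range(n_pos)]
)
out[:, 0::2] = torch.FloatTensor(np.sin(position_enc[:, 0::2]))
out[:, 1::2] = torch.FloatTensor(np.cos(position_enc[:, 1::2]))

print(out[0])  # tensor([0., 1., 0., 1., 0., 1.]) -- sin(0) and cos(0) alternate
```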
class Embeddings(nn.Module):
def __init__(self, config: PretrainedConfig):
super().__init__()
self.word_embeddings = nn.Embedding(config.vocab_size, config.dim, padding_idx=config.pad_token_id)
self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.dim)
if config.sinusoidal_pos_embds:
create_sinusoidal_embeddings(
n_pos=config.max_position_embeddings, dim=config.dim, out=self.position_embeddings.weight
)
self.LayerNorm = nn.LayerNorm(config.dim, eps=1e-12)
self.dropout = nn.Dropout(config.dropout)
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
def forward(self, input_ids: torch.Tensor, input_embeds: Optional[torch.Tensor] = None) -> torch.Tensor:
"""
Parameters:
input_ids (torch.Tensor):
torch.tensor(bs, max_seq_length) The token ids to embed.
input_embeds (*optional*, torch.Tensor):
The pre-computed word embeddings. Can only be passed if the input ids are `None`.
Returns:
torch.tensor(bs, max_seq_length, dim) The embedded tokens (plus position embeddings, no token_type embeddings)
"""
if input_ids is not None:
input_embeds = self.word_embeddings(input_ids)
seq_length = input_embeds.size(1)
if hasattr(self, "position_ids"):
position_ids = self.position_ids[:, :seq_length]
else:
position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device)
position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
position_embeddings = self.position_embeddings(position_ids)
embeddings = input_embeds + position_embeddings
embeddings = self.LayerNorm(embeddings)
embeddings = self.dropout(embeddings)
return embeddings
class MultiHeadSelfAttention(nn.Module):
def __init__(self, config: PretrainedConfig):
super().__init__()
self.config = config
self.n_heads = config.n_heads
self.dim = config.dim
self.dropout = nn.Dropout(p=config.attention_dropout)
self.is_causal = False
if self.dim % self.n_heads != 0:
raise ValueError(f"self.n_heads: {self.n_heads} must divide self.dim: {self.dim} evenly")
self.q_lin = nn.Linear(in_features=config.dim, out_features=config.dim)
self.k_lin = nn.Linear(in_features=config.dim, out_features=config.dim)
self.v_lin = nn.Linear(in_features=config.dim, out_features=config.dim)
self.out_lin = nn.Linear(in_features=config.dim, out_features=config.dim)
self.pruned_heads: Set[int] = set()
self.attention_head_size = self.dim // self.n_heads
def prune_heads(self, heads: List[int]):
if len(heads) == 0:
return
heads, index = find_pruneable_heads_and_indices(
heads, self.n_heads, self.attention_head_size, self.pruned_heads
)
self.q_lin = prune_linear_layer(self.q_lin, index)
self.k_lin = prune_linear_layer(self.k_lin, index)
self.v_lin = prune_linear_layer(self.v_lin, index)
self.out_lin = prune_linear_layer(self.out_lin, index, dim=1)
self.n_heads = self.n_heads - len(heads)
self.dim = self.attention_head_size * self.n_heads
self.pruned_heads = self.pruned_heads.union(heads)
def forward(
self,
query: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor,
mask: torch.Tensor,
head_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
) -> Tuple[torch.Tensor, ...]:
"""
Parameters:
query: torch.tensor(bs, seq_length, dim)
key: torch.tensor(bs, seq_length, dim)
value: torch.tensor(bs, seq_length, dim)
mask: torch.tensor(bs, seq_length)
Returns:
weights: torch.tensor(bs, n_heads, seq_length, seq_length) Attention weights
context: torch.tensor(bs, seq_length, dim) Contextualized layer.
Optional: only if `output_attentions=True`
"""
bs, q_length, dim = query.size()
k_length = key.size(1)
dim_per_head = self.dim // self.n_heads
mask_reshp = (bs, 1, 1, k_length)
def shape(x: torch.Tensor) -> torch.Tensor:
"""将输入张量重塑以便多头注意力"""
return x.view(bs, -1, self.n_heads, dim_per_head).transpose(1, 2)
def unshape(x: torch.Tensor) -> torch.Tensor:
"""将多头注意力结果合并"""
return x.transpose(1, 2).contiguous().view(bs, -1, self.n_heads * dim_per_head)
q = shape(self.q_lin(query))
k = shape(self.k_lin(key))
v = shape(self.v_lin(value))
q = q / math.sqrt(dim_per_head)
scores = torch.matmul(q, k.transpose(2, 3))
mask = (mask == 0).view(mask_reshp).expand_as(scores)
scores = scores.masked_fill(mask, torch.tensor(torch.finfo(scores.dtype).min))
weights = nn.functional.softmax(scores, dim=-1)
weights = self.dropout(weights)
if head_mask is not None:
weights = weights * head_mask
context = torch.matmul(weights, v)
context = unshape(context)
context = self.out_lin(context)
if output_attentions:
return (context, weights)
else:
return (context,)
class DistilBertFlashAttention2(MultiHeadSelfAttention):
"""
DistilBERT flash-attention module. It inherits from `MultiHeadSelfAttention`, so the weights stay untouched; only
the forward pass changes to call the Flash Attention API.
"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
def forward(
self,
query: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor,
mask: torch.Tensor,
head_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
) -> Tuple[torch.Tensor, ...]:
"""
Parameters:
query: torch.tensor(bs, seq_length, dim)  # Query tensor of shape (batch_size, seq_length, dim)
key: torch.tensor(bs, seq_length, dim)  # Key tensor of shape (batch_size, seq_length, dim)
value: torch.tensor(bs, seq_length, dim)  # Value tensor of shape (batch_size, seq_length, dim)
mask: torch.tensor(bs, seq_length)  # Padding mask of shape (batch_size, seq_length)
Returns:
weights: torch.tensor(bs, n_heads, seq_length, seq_length)  # Attention weights, only returned if `output_attentions=True`
context: torch.tensor(bs, seq_length, dim)  # Contextualized layer
"""
batch_size, q_length, dim = query.size()
dim_per_head = self.dim // self.n_heads
def reshape(x: torch.Tensor) -> torch.Tensor:
"""将张量重新形状为(batch_size, seq_length, n_heads, dim_per_head)"""
return x.view(batch_size, -1, self.n_heads, dim_per_head)
query_states = reshape(self.q_lin(query))
key_states = reshape(self.k_lin(key))
value_states = reshape(self.v_lin(value))
attn_dropout = self.config.attention_dropout if self.training else 0.0
if query_states.dtype == torch.float32:
if torch.is_autocast_enabled():
target_dtype = torch.get_autocast_gpu_dtype()
elif hasattr(self.config, "_pre_quantization_dtype"):
target_dtype = self.config._pre_quantization_dtype
else:
target_dtype = self.q_lin.weight.dtype
logger.warning_once(
f"The input hidden states seems to be silently casted in float32, this might be related to"
f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
f" {target_dtype}."
)
query_states = query_states.to(target_dtype)
key_states = key_states.to(target_dtype)
value_states = value_states.to(target_dtype)
attn_weights = self._flash_attention_forward(
query_states, key_states, value_states, mask, q_length, dropout=attn_dropout
)
attn_weights_reshaped = attn_weights.reshape(batch_size, q_length, self.n_heads * dim_per_head)
attn_output = self.out_lin(attn_weights_reshaped)
if output_attentions:
return (attn_output, attn_weights)
else:
return (attn_output,)
def _flash_attention_forward(
self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None
):
"""
Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
first unpad the input, then computes the attention scores and pad the final attention scores.
Args:
query_states (`torch.Tensor`):
Input query states to be passed to Flash Attention API
key_states (`torch.Tensor`):
Input key states to be passed to Flash Attention API
value_states (`torch.Tensor`):
Input value states to be passed to Flash Attention API
attention_mask (`torch.Tensor`):
The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
position of padding tokens and 1 for the position of non-padding tokens.
dropout (`float`):
Attention dropout
softmax_scale (`float`, *optional*):
The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
"""
if not self._flash_attn_uses_top_left_mask:
causal = self.is_causal
else:
causal = self.is_causal and query_length != 1
if attention_mask is not None:
batch_size = query_states.shape[0]
query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
query_states, key_states, value_states, attention_mask, query_length
)
cu_seqlens_q, cu_seqlens_k = cu_seq_lens
max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
attn_output_unpad = flash_attn_varlen_func(
query_states,
key_states,
value_states,
cu_seqlens_q=cu_seqlens_q,
cu_seqlens_k=cu_seqlens_k,
max_seqlen_q=max_seqlen_in_batch_q,
max_seqlen_k=max_seqlen_in_batch_k,
dropout_p=dropout,
softmax_scale=softmax_scale,
causal=causal,
)
attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
else:
attn_output = flash_attn_func(
query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal
)
return attn_output
def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape
key_layer = index_first_axis(
key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
)
value_layer = index_first_axis(
value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
)
if query_length == kv_seq_len:
query_layer = index_first_axis(
query_layer.reshape(batch_size * kv_seq_len, self.n_heads, head_dim), indices_k
)
cu_seqlens_q = cu_seqlens_k
max_seqlen_in_batch_q = max_seqlen_in_batch_k
indices_q = indices_k
elif query_length == 1:
max_seqlen_in_batch_q = 1
cu_seqlens_q = torch.arange(
batch_size + 1, dtype=torch.int32, device=query_layer.device
)
indices_q = cu_seqlens_q[:-1]
query_layer = query_layer.squeeze(1)
else:
attention_mask = attention_mask[:, -query_length:]
query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)
return (
query_layer,
key_layer,
value_layer,
indices_q,
(cu_seqlens_q, cu_seqlens_k),
(max_seqlen_in_batch_q, max_seqlen_in_batch_k),
)
class FFN(nn.Module):
def __init__(self, config: PretrainedConfig):
super().__init__()
self.dropout = nn.Dropout(p=config.dropout)
self.chunk_size_feed_forward = config.chunk_size_feed_forward
self.seq_len_dim = 1
self.lin1 = nn.Linear(in_features=config.dim, out_features=config.hidden_dim)
self.lin2 = nn.Linear(in_features=config.hidden_dim, out_features=config.dim)
self.activation = get_activation(config.activation)
def forward(self, input: torch.Tensor) -> torch.Tensor:
return apply_chunking_to_forward(self.ff_chunk, self.chunk_size_feed_forward, self.seq_len_dim, input)
def ff_chunk(self, input: torch.Tensor) -> torch.Tensor:
x = self.lin1(input)
x = self.activation(x)
x = self.lin2(x)
x = self.dropout(x)
return x
DISTILBERT_ATTENTION_CLASSES = {
"eager": MultiHeadSelfAttention,
"flash_attention_2": DistilBertFlashAttention2,
}
class TransformerBlock(nn.Module):
def __init__(self, config: PretrainedConfig):
super().__init__()
if config.dim % config.n_heads != 0:
raise ValueError(f"config.n_heads {config.n_heads} must divide config.dim {config.dim} evenly")
self.attention = DISTILBERT_ATTENTION_CLASSES[config._attn_implementation](config)
self.sa_layer_norm = nn.LayerNorm(normalized_shape=config.dim, eps=1e-12)
self.ffn = FFN(config)
self.output_layer_norm = nn.LayerNorm(normalized_shape=config.dim, eps=1e-12)
def forward(
self,
x: torch.Tensor,
attn_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
) -> Tuple[torch.Tensor, ...]:
"""
Parameters:
x: torch.tensor(bs, seq_length, dim)
attn_mask: torch.tensor(bs, seq_length)
Returns:
sa_weights: torch.tensor(bs, n_heads, seq_length, seq_length) 注意力权重
ffn_output: torch.tensor(bs, seq_length, dim) Transformer 块的输出
"""
sa_output = self.attention(
query=x,
key=x,
value=x,
mask=attn_mask,
head_mask=head_mask,
output_attentions=output_attentions,
)
if output_attentions:
sa_output, sa_weights = sa_output
else:
if type(sa_output) != tuple:
raise TypeError(f"sa_output must be a tuple but it is {type(sa_output)} type")
sa_output = sa_output[0]
sa_output = self.sa_layer_norm(sa_output + x)
ffn_output = self.ffn(sa_output)
ffn_output: torch.Tensor = self.output_layer_norm(ffn_output + sa_output)
output = (ffn_output,)
if output_attentions:
output = (sa_weights,) + output
return output
class Transformer(nn.Module):
def __init__(self, config: PretrainedConfig):
super().__init__()
self.n_layers = config.n_layers
self.layer = nn.ModuleList([TransformerBlock(config) for _ in range(config.n_layers)])
self.gradient_checkpointing = False
def forward(
self,
x: torch.Tensor,
attn_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: Optional[bool] = None,
) -> Union[BaseModelOutput, Tuple[torch.Tensor, ...]]:
"""
Parameters:
x: torch.tensor(bs, seq_length, dim) Input sequence embedded.
输入的嵌入序列张量,形状为 (bs, seq_length, dim)
attn_mask: torch.tensor(bs, seq_length) Attention mask on the sequence.
序列的注意力掩码张量,形状为 (bs, seq_length)
Returns:
hidden_state: torch.tensor(bs, seq_length, dim) Sequence of hidden states in the last (top) layer
最后(顶部)层的隐藏状态序列,形状为 (bs, seq_length, dim)
all_hidden_states: Tuple[torch.tensor(bs, seq_length, dim)]
一个包含每层隐藏状态的元组,形状为 (n_layers, bs, seq_length, dim)
可选项:仅在 output_hidden_states=True 时返回
all_attentions: Tuple[torch.tensor(bs, n_heads, seq_length, seq_length)]
每层的注意力权重张量的元组,形状为 (n_layers, bs, n_heads, seq_length, seq_length)
可选项:仅在 output_attentions=True 时返回
"""
all_hidden_states = () if output_hidden_states else None
all_attentions = () if output_attentions else None
hidden_state = x
for i, layer_module in enumerate(self.layer):
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_state,)
if self.gradient_checkpointing and self.training:
layer_outputs = self._gradient_checkpointing_func(
layer_module.__call__,
hidden_state,
attn_mask,
head_mask[i],
output_attentions,
)
else:
layer_outputs = layer_module(
hidden_state,
attn_mask,
head_mask[i],
output_attentions,
)
hidden_state = layer_outputs[-1]
if output_attentions:
if len(layer_outputs) != 2:
raise ValueError(f"The length of the layer_outputs should be 2, but it is {len(layer_outputs)}")
attentions = layer_outputs[0]
all_attentions = all_attentions + (attentions,)
else:
if len(layer_outputs) != 1:
raise ValueError(f"The length of the layer_outputs should be 1, but it is {len(layer_outputs)}")
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_state,)
if not return_dict:
return tuple(v for v in [hidden_state, all_hidden_states, all_attentions] if v is not None)
return BaseModelOutput(
last_hidden_state=hidden_state, hidden_states=all_hidden_states, attentions=all_attentions
)
class DistilBertPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = DistilBertConfig
load_tf_weights = None
base_model_prefix = "distilbert"
supports_gradient_checkpointing = True
_supports_flash_attn_2 = True
def _init_weights(self, module: nn.Module):
"""Initialize the weights."""
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
DISTILBERT_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`DistilBertConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
DISTILBERT_INPUTS_DOCSTRING = r"""
Describes the inputs to the DistilBERT model and how to prepare them.
Args:
input_ids (`torch.LongTensor` of shape `({0})`):
Indices of input sequence tokens in the vocabulary.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
model's internal embedding lookup matrix.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
# Define a DistilBERT model for encoding text using the Transformer architecture.
@add_start_docstrings(
"The bare DistilBERT encoder/transformer outputting raw hidden-states without any specific head on top.",
DISTILBERT_START_DOCSTRING,
)
class DistilBertModel(DistilBertPreTrainedModel):
def __init__(self, config: PretrainedConfig):
super().__init__(config)
self.embeddings = Embeddings(config) # Embeddings
self.transformer = Transformer(config) # Encoder
self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
# Initialize weights and apply final processing
self.post_init()
def get_position_embeddings(self) -> nn.Embedding:
"""
Returns the position embeddings
"""
return self.embeddings.position_embeddings
"""
def resize_position_embeddings(self, new_num_position_embeddings: int):
"""
Resizes position embeddings of the model if `new_num_position_embeddings != config.max_position_embeddings`.
Arguments:
new_num_position_embeddings (`int`):
The number of new position embedding matrix. If position embeddings are learned, increasing the size
will add newly initialized vectors at the end, whereas reducing the size will remove vectors from the
end. If position embeddings are not learned (*e.g.* sinusoidal position embeddings), increasing the
size will add correct vectors at the end following the position encoding algorithm, whereas reducing
the size will remove vectors from the end.
"""
# 计算新旧位置嵌入矩阵长度之差
num_position_embeds_diff = new_num_position_embeddings - self.config.max_position_embeddings
# 如果长度没有变化,则无需调整
if num_position_embeds_diff == 0:
return
# 记录信息:设置 `config.max_position_embeddings` 的新值
logger.info(f"Setting `config.max_position_embeddings={new_num_position_embeddings}`...")
self.config.max_position_embeddings = new_num_position_embeddings
# 备份旧的位置嵌入权重
old_position_embeddings_weight = self.embeddings.position_embeddings.weight.clone()
# 根据新的 `max_position_embeddings` 大小重新创建位置嵌入层
self.embeddings.position_embeddings = nn.Embedding(self.config.max_position_embeddings, self.config.dim)
# 如果使用正弦位置嵌入,根据新的大小重新创建正弦位置嵌入
if self.config.sinusoidal_pos_embds:
create_sinusoidal_embeddings(
n_pos=self.config.max_position_embeddings, dim=self.config.dim, out=self.position_embeddings.weight
)
else:
with torch.no_grad():
# 根据位置嵌入大小的变化,重新设置位置嵌入权重
if num_position_embeds_diff > 0:
self.embeddings.position_embeddings.weight[:-num_position_embeds_diff] = nn.Parameter(
old_position_embeddings_weight
)
else:
self.embeddings.position_embeddings.weight = nn.Parameter(
old_position_embeddings_weight[:num_position_embeds_diff]
)
# 将更新后的位置嵌入层移动到正确的设备上
self.embeddings.position_embeddings.to(self.device)
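根据上面的实现,`resize_position_embeddings` 在扩大位置嵌入时保留旧权重、在末尾追加新初始化的向量,并同步更新 `config.max_position_embeddings`。下面是一个假设性的用法示例(使用随机初始化的模型,仅演示形状与配置的变化):

```python
from transformers import DistilBertConfig, DistilBertModel

config = DistilBertConfig(max_position_embeddings=512, sinusoidal_pos_embds=False)
model = DistilBertModel(config)  # 随机权重,仅作演示

print(model.get_position_embeddings().weight.shape)  # torch.Size([512, 768])

# 扩大位置嵌入到 1024:前 512 行保留旧权重,后 512 行为新初始化的向量
model.resize_position_embeddings(1024)
print(model.get_position_embeddings().weight.shape)  # torch.Size([1024, 768])
print(model.config.max_position_embeddings)          # 1024
```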
def get_input_embeddings(self) -> nn.Embedding:
return self.embeddings.word_embeddings
def set_input_embeddings(self, new_embeddings: nn.Embedding):
# 设置输入词嵌入层的新权重
self.embeddings.word_embeddings = new_embeddings
def _prune_heads(self, heads_to_prune: Dict[int, List[List[int]]]):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
# 遍历需要修剪的层和头部,进行修剪操作
for layer, heads in heads_to_prune.items():
self.transformer.layer[layer].attention.prune_heads(heads)
@add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING.format("batch_size, num_choices"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC, # 添加代码示例文档字符串,指定检查点为给定的文档检查点
output_type=BaseModelOutput, # 指定输出类型为BaseModelOutput类
config_class=_CONFIG_FOR_DOC, # 指定配置类为给定的配置类
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None, # 输入的token IDs张量,可选
attention_mask: Optional[torch.Tensor] = None, # 注意力掩码张量,可选
head_mask: Optional[torch.Tensor] = None, # 头部掩码张量,可选
inputs_embeds: Optional[torch.Tensor] = None, # 嵌入输入张量,可选
output_attentions: Optional[bool] = None, # 是否输出注意力权重,可选
output_hidden_states: Optional[bool] = None, # 是否输出隐藏状态,可选
return_dict: Optional[bool] = None, # 是否返回字典格式的输出,可选
) -> Union[BaseModelOutput, Tuple[torch.Tensor, ...]]: # 返回值可以是BaseModelOutput或者张量元组
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
# 如果未指定output_attentions,则使用self.config中的默认值
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
# 如果未指定output_hidden_states,则使用self.config中的默认值
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# 如果未指定return_dict,则使用self.config中的默认值
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
elif input_ids is not None:
self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
input_shape = input_ids.size()
elif inputs_embeds is not None:
input_shape = inputs_embeds.size()[:-1]
else:
raise ValueError("You have to specify either input_ids or inputs_embeds")
# 检查输入参数的有效性,确保只能同时指定input_ids或inputs_embeds,并获取输入的形状
device = input_ids.device if input_ids is not None else inputs_embeds.device
# 获取输入所在的设备
# Prepare head mask if needed
head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
# 如果需要,准备头部掩码
embeddings = self.embeddings(input_ids, inputs_embeds) # (bs, seq_length, dim)
# 生成输入的嵌入表示
if self._use_flash_attention_2:
attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
else:
if attention_mask is None:
attention_mask = torch.ones(input_shape, device=device) # (bs, seq_length)
# 根据self._use_flash_attention_2的条件设置注意力掩码,如果未提供则使用全1的默认掩码
return self.transformer(
x=embeddings,
attn_mask=attention_mask,
head_mask=head_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# 调用transformer模块进行前向传播,传入嵌入表示、注意力掩码、头部掩码等参数,并返回结果
# 使用装饰器添加文档字符串,描述此类是在DistilBert模型基础上增加了遮盖语言建模头部的模型
@add_start_docstrings(
"""DistilBert Model with a `masked language modeling` head on top.""",
DISTILBERT_START_DOCSTRING,
)
# 定义DistilBertForMaskedLM类,继承自DistilBertPreTrainedModel
class DistilBertForMaskedLM(DistilBertPreTrainedModel):
# 定义_tied_weights_keys属性,指定需要绑定权重的键名
_tied_weights_keys = ["vocab_projector.weight"]
# 初始化函数,接收一个PretrainedConfig类型的config对象作为参数
def __init__(self, config: PretrainedConfig):
# 调用父类的初始化函数
super().__init__(config)
# 根据配置中指定的激活函数名称,获取对应的激活函数
self.activation = get_activation(config.activation)
# 创建DistilBertModel模型对象,并赋值给self.distilbert
self.distilbert = DistilBertModel(config)
# 创建一个线性层,用于词汇转换,输入和输出维度均为config.dim
self.vocab_transform = nn.Linear(config.dim, config.dim)
# 创建一个LayerNorm层,用于词汇层的归一化,输入维度为config.dim
self.vocab_layer_norm = nn.LayerNorm(config.dim, eps=1e-12)
# 创建一个线性层,用于将模型的输出映射到词汇表大小的向量
self.vocab_projector = nn.Linear(config.dim, config.vocab_size)
# 执行初始化权重操作和最终处理
self.post_init()
# 定义模型的损失函数为交叉熵损失函数
self.mlm_loss_fct = nn.CrossEntropyLoss()
# 获取位置嵌入的方法,返回DistilBert模型中的位置嵌入
def get_position_embeddings(self) -> nn.Embedding:
"""
Returns the position embeddings
"""
return self.distilbert.get_position_embeddings()
# 调整位置嵌入的方法,根据新的位置嵌入数量调整模型的位置嵌入矩阵
def resize_position_embeddings(self, new_num_position_embeddings: int):
"""
Resizes position embeddings of the model if `new_num_position_embeddings != config.max_position_embeddings`.
Arguments:
new_num_position_embeddings (`int`):
The number of new position embedding matrix. If position embeddings are learned, increasing the size
will add newly initialized vectors at the end, whereas reducing the size will remove vectors from the
end. If position embeddings are not learned (*e.g.* sinusoidal position embeddings), increasing the
size will add correct vectors at the end following the position encoding algorithm, whereas reducing
the size will remove vectors from the end.
"""
self.distilbert.resize_position_embeddings(new_num_position_embeddings)
# 获取输出嵌入的方法,返回词汇投影层对象
def get_output_embeddings(self) -> nn.Module:
return self.vocab_projector
# 设置输出嵌入的方法,用新的嵌入层对象替换词汇投影层
def set_output_embeddings(self, new_embeddings: nn.Module):
self.vocab_projector = new_embeddings
# 使用装饰器添加文档字符串到模型前向传播方法,描述输入参数和输出类型
@add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING.format("batch_size, num_choices"))
# 使用代码示例装饰器添加文档字符串,提供模型前向传播的示例和其他相关信息
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=MaskedLMOutput,
config_class=_CONFIG_FOR_DOC,
)
# 模型的前向传播方法,接收多个输入参数,返回一个输出对象或字典
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[MaskedLMOutput, Tuple[torch.Tensor, ...]]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
"""
# 根据函数声明,定义了输入参数和返回类型,包括可选的标签用于计算MLM损失
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# 调用DistilBERT模型,获取输出结果
dlbrt_output = self.distilbert(
input_ids=input_ids,
attention_mask=attention_mask,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# 提取DistilBERT模型的隐藏状态
hidden_states = dlbrt_output[0] # (bs, seq_length, dim)
# 将隐藏状态转换为预测的对数概率
prediction_logits = self.vocab_transform(hidden_states) # (bs, seq_length, dim)
# 应用激活函数到预测的对数概率
prediction_logits = self.activation(prediction_logits) # (bs, seq_length, dim)
# 对预测的对数概率进行层归一化
prediction_logits = self.vocab_layer_norm(prediction_logits) # (bs, seq_length, dim)
# 使用投影层将预测的对数概率映射到词汇表大小的空间
prediction_logits = self.vocab_projector(prediction_logits) # (bs, seq_length, vocab_size)
mlm_loss = None
# 如果提供了标签,计算MLM损失
if labels is not None:
mlm_loss = self.mlm_loss_fct(prediction_logits.view(-1, prediction_logits.size(-1)), labels.view(-1))
# 如果不要求返回字典格式的输出,构建输出元组
if not return_dict:
output = (prediction_logits,) + dlbrt_output[1:]
return ((mlm_loss,) + output) if mlm_loss is not None else output
# 返回MaskedLMOutput对象,包括损失、预测的对数概率、隐藏状态和注意力权重
return MaskedLMOutput(
loss=mlm_loss,
logits=prediction_logits,
hidden_states=dlbrt_output.hidden_states,
attentions=dlbrt_output.attentions,
)
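上面的 MLM 头把隐藏状态依次经过 vocab_transform、激活函数、LayerNorm 和 vocab_projector,得到词表大小的 logits。下面是一个最小的掩码填充示例(假设可以下载 `distilbert-base-uncased` 检查点):

```python
import torch
from transformers import AutoTokenizer, DistilBertForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertForMaskedLM.from_pretrained("distilbert-base-uncased")

text = f"The capital of France is {tokenizer.mask_token}."
inputs = tokenizer(text, return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs).logits  # (bs, seq_length, vocab_size)

# 找到 [MASK] 的位置,取该位置 logits 的 argmax 作为预测 token
mask_index = (inputs["input_ids"] == tokenizer.mask_token_id).nonzero(as_tuple=True)[1]
predicted_id = logits[0, mask_index].argmax(dim=-1)
print(tokenizer.decode(predicted_id))  # 期望输出类似 "paris"
```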
"""
DistilBert模型转换器,顶部带有序列分类/回归头(即顶部的线性层,用于池化输出),例如用于GLUE任务。
"""
@add_start_docstrings(
"""
DistilBert Model transformer with a sequence classification/regression head on top (a linear layer on top of the
pooled output) e.g. for GLUE tasks.
""",
DISTILBERT_START_DOCSTRING,
)
class DistilBertForSequenceClassification(DistilBertPreTrainedModel):
def __init__(self, config: PretrainedConfig):
"""
初始化方法,配置DistilBert序列分类/回归模型。
Arguments:
config (:class:`~transformers.PretrainedConfig`):
包含模型配置信息的预训练配置对象。
"""
super().__init__(config)
self.num_labels = config.num_labels
self.config = config
# DistilBert模型实例化
self.distilbert = DistilBertModel(config)
# 预分类器,线性层
self.pre_classifier = nn.Linear(config.dim, config.dim)
# 分类器,线性层
self.classifier = nn.Linear(config.dim, config.num_labels)
# Dropout层
self.dropout = nn.Dropout(config.seq_classif_dropout)
# 初始化权重并应用最终处理
self.post_init()
def get_position_embeddings(self) -> nn.Embedding:
"""
返回位置嵌入
"""
return self.distilbert.get_position_embeddings()
def resize_position_embeddings(self, new_num_position_embeddings: int):
"""
如果`new_num_position_embeddings != config.max_position_embeddings`,调整模型的位置嵌入。
Arguments:
new_num_position_embeddings (`int`):
新的位置嵌入矩阵数量。如果位置嵌入是学习的,则增加大小将在末尾添加新初始化的向量,
而减小大小将从末尾删除向量。如果位置嵌入不是学习的(例如正弦位置嵌入),
增加大小将按照位置编码算法在末尾添加正确的向量,而减小大小将从末尾删除向量。
"""
self.distilbert.resize_position_embeddings(new_num_position_embeddings)
@add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=SequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[SequenceClassifierOutput, Tuple[torch.Tensor, ...]]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
# 根据是否需要返回字典来确定返回值
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# 将输入传递给DistilBERT模型,获取输出
distilbert_output = self.distilbert(
input_ids=input_ids,
attention_mask=attention_mask,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# 提取DistilBERT输出的隐藏状态
hidden_state = distilbert_output[0] # (bs, seq_len, dim)
# 提取池化后的输出,取每个序列的第一个标记
pooled_output = hidden_state[:, 0] # (bs, dim)
# 应用预分类器(一个线性层)到池化输出
pooled_output = self.pre_classifier(pooled_output) # (bs, dim)
# 应用ReLU激活函数到预分类器输出
pooled_output = nn.ReLU()(pooled_output) # (bs, dim)
# 应用dropout操作到ReLU输出
pooled_output = self.dropout(pooled_output) # (bs, dim)
# 将池化后的输出传递给分类器,得到logits
logits = self.classifier(pooled_output) # (bs, num_labels)
# 初始化损失值为None
loss = None
# 如果有标签输入
if labels is not None:
# 如果问题类型尚未确定
if self.config.problem_type is None:
# 根据标签数量确定问题类型为回归或分类
if self.num_labels == 1:
self.config.problem_type = "regression"
elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
self.config.problem_type = "single_label_classification"
else:
self.config.problem_type = "multi_label_classification"
# 根据问题类型计算损失值
if self.config.problem_type == "regression":
# 使用均方误差损失函数
loss_fct = MSELoss()
if self.num_labels == 1:
loss = loss_fct(logits.squeeze(), labels.squeeze())
else:
loss = loss_fct(logits, labels)
elif self.config.problem_type == "single_label_classification":
# 使用交叉熵损失函数
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
# 使用带logits的二元交叉熵损失函数
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(logits, labels)
# 如果不需要返回字典,则只返回logits和可能的其他输出
if not return_dict:
output = (logits,) + distilbert_output[1:]
return ((loss,) + output) if loss is not None else output
# 如果需要返回字典,则返回包含损失、logits和其他输出的SequenceClassifierOutput对象
return SequenceClassifierOutput(
loss=loss,
logits=logits,
hidden_states=distilbert_output.hidden_states,
attentions=distilbert_output.attentions,
)
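上面的损失计算根据 `config.problem_type` 在均方误差、交叉熵和带 logits 的二元交叉熵之间切换。下面用几个小张量示意这三种情形各自期望的 logits / labels 形状(数值均为随机,仅作演示):

```python
import torch
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

bs, num_labels = 4, 3
logits = torch.randn(bs, num_labels)

# regression:num_labels == 1 时,logits 与 labels squeeze 后都是 (bs,)
reg_logits = torch.randn(bs, 1)
reg_labels = torch.randn(bs)
reg_loss = MSELoss()(reg_logits.squeeze(), reg_labels.squeeze())

# single_label_classification:labels 为类别索引,形状 (bs,)
cls_labels = torch.randint(0, num_labels, (bs,))
cls_loss = CrossEntropyLoss()(logits.view(-1, num_labels), cls_labels.view(-1))

# multi_label_classification:labels 为 0/1 浮点多热向量,形状 (bs, num_labels)
multi_labels = torch.randint(0, 2, (bs, num_labels)).float()
multi_loss = BCEWithLogitsLoss()(logits, multi_labels)

print(reg_loss.item(), cls_loss.item(), multi_loss.item())
```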
@add_start_docstrings(
"""
DistilBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a
linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
DISTILBERT_START_DOCSTRING,
)
class DistilBertForQuestionAnswering(DistilBertPreTrainedModel):
def __init__(self, config: PretrainedConfig):
super().__init__(config)
# 初始化 DistilBERT 模型
self.distilbert = DistilBertModel(config)
# 线性层用于输出 span 开始和结束的逻辑回归
self.qa_outputs = nn.Linear(config.dim, config.num_labels)
# 检查标签数是否为2,否则引发错误
if config.num_labels != 2:
raise ValueError(f"config.num_labels should be 2, but it is {config.num_labels}")
# Dropout 层
self.dropout = nn.Dropout(config.qa_dropout)
# 初始化权重并进行最终处理
self.post_init()
def get_position_embeddings(self) -> nn.Embedding:
"""
Returns the position embeddings
"""
return self.distilbert.get_position_embeddings()
def resize_position_embeddings(self, new_num_position_embeddings: int):
"""
Resizes position embeddings of the model if `new_num_position_embeddings != config.max_position_embeddings`.
Arguments:
new_num_position_embeddings (`int`):
The number of new position embedding matrix. If position embeddings are learned, increasing the size
will add newly initialized vectors at the end, whereas reducing the size will remove vectors from the
end. If position embeddings are not learned (*e.g.* sinusoidal position embeddings), increasing the
size will add correct vectors at the end following the position encoding algorithm, whereas reducing
the size will remove vectors from the end.
"""
self.distilbert.resize_position_embeddings(new_num_position_embeddings)
@add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING.format("batch_size, num_choices"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=QuestionAnsweringModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
start_positions: Optional[torch.Tensor] = None,
end_positions: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
@add_start_docstrings(
"""
DistilBert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g.
for Named-Entity-Recognition (NER) tasks.
""",
DISTILBERT_START_DOCSTRING,
)
class DistilBertForTokenClassification(DistilBertPreTrainedModel):
def __init__(self, config: PretrainedConfig):
super().__init__(config)
self.num_labels = config.num_labels # 从配置中获取标签数量
self.distilbert = DistilBertModel(config) # 初始化 DistilBERT 模型
self.dropout = nn.Dropout(config.dropout) # 根据配置添加 dropout 层
self.classifier = nn.Linear(config.hidden_size, config.num_labels) # 添加线性分类器
# 初始化权重并应用最终处理
self.post_init()
def get_position_embeddings(self) -> nn.Embedding:
"""
Returns the position embeddings
"""
return self.distilbert.get_position_embeddings() # 返回 DistilBERT 模型的位置嵌入
def resize_position_embeddings(self, new_num_position_embeddings: int):
"""
Resizes position embeddings of the model if `new_num_position_embeddings != config.max_position_embeddings`.
Arguments:
new_num_position_embeddings (`int`):
The number of new position embedding matrix. If position embeddings are learned, increasing the size
will add newly initialized vectors at the end, whereas reducing the size will remove vectors from the
end. If position embeddings are not learned (*e.g.* sinusoidal position embeddings), increasing the
size will add correct vectors at the end following the position encoding algorithm, whereas reducing
the size will remove vectors from the end.
"""
self.distilbert.resize_position_embeddings(new_num_position_embeddings) # 调整 DistilBERT 模型的位置嵌入
@add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TokenClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[TokenClassifierOutput, Tuple[torch.Tensor, ...]]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
# 根据函数定义,返回值可以是 TokenClassifierOutput 对象或者元组形式的 Tensor
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# 使用 DistilBERT 模型进行前向传播,获取输出结果
outputs = self.distilbert(
input_ids,
attention_mask=attention_mask,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# 从输出结果中提取序列输出
sequence_output = outputs[0]
# 应用 dropout 操作
sequence_output = self.dropout(sequence_output)
# 使用分类器对序列输出进行分类,得到分类 logits
logits = self.classifier(sequence_output)
# 初始化损失为 None
loss = None
# 如果有标签输入,则计算交叉熵损失
if labels is not None:
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
# 如果不需要返回字典,则按元组形式返回输出结果
if not return_dict:
output = (logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
# 如果需要返回字典形式的输出,构造 TokenClassifierOutput 对象返回
return TokenClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
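`DistilBertForTokenClassification` 对每个 token 输出 `num_labels` 维的 logits,并在给定 labels 时计算交叉熵。下面是一个最小示例(使用随机初始化的权重,`num_labels=5` 与全零标签均为假设,仅演示形状):

```python
import torch
from transformers import AutoTokenizer, DistilBertConfig, DistilBertForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
config = DistilBertConfig(num_labels=5)
model = DistilBertForTokenClassification(config)  # 随机权重,仅演示形状

inputs = tokenizer("HuggingFace is based in New York City", return_tensors="pt")
labels = torch.zeros_like(inputs["input_ids"])  # 假设每个 token 都标为类别 0

outputs = model(**inputs, labels=labels)
print(outputs.logits.shape)  # (1, sequence_length, 5)
print(outputs.loss.item())   # 交叉熵损失(标量)
```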
"""
DistilBert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and
a softmax) e.g. for RocStories/SWAG tasks.
"""
# 导入必要的库和模块
import torch
import torch.nn as nn
from .configuration_distilbert import DistilBertConfig
from .modeling_distilbert import DistilBertModel, DistilBertPreTrainedModel
from .file_utils import add_start_docstrings, add_start_docstrings_to_model_forward, replace_return_docstrings
from typing import Optional
from transformers.file_utils import ModelOutput, PretrainedConfig
# 定义 DistilBertForMultipleChoice 类,继承自 DistilBertPreTrainedModel
@add_start_docstrings(
"""
DistilBert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and
a softmax) e.g. for RocStories/SWAG tasks.
""",
DISTILBERT_START_DOCSTRING,
)
class DistilBertForMultipleChoice(DistilBertPreTrainedModel):
def __init__(self, config: PretrainedConfig):
super().__init__(config)
# 初始化 DistilBert 模型
self.distilbert = DistilBertModel(config)
# 多选分类任务的预分类器
self.pre_classifier = nn.Linear(config.dim, config.dim)
# 用于二分类的线性层
self.classifier = nn.Linear(config.dim, 1)
# Dropout 层,用于防止过拟合
self.dropout = nn.Dropout(config.seq_classif_dropout)
# 初始化权重并应用最终处理
self.post_init()
def get_position_embeddings(self) -> nn.Embedding:
"""
Returns the position embeddings
"""
# 调用 DistilBertModel 的方法获取位置嵌入
return self.distilbert.get_position_embeddings()
def resize_position_embeddings(self, new_num_position_embeddings: int):
"""
Resizes position embeddings of the model if `new_num_position_embeddings != config.max_position_embeddings`.
Arguments:
new_num_position_embeddings (`int`)
The number of new position embeddings. If position embeddings are learned, increasing the size will add
newly initialized vectors at the end, whereas reducing the size will remove vectors from the end. If
position embeddings are not learned (*e.g.* sinusoidal position embeddings), increasing the size will
add correct vectors at the end following the position encoding algorithm, whereas reducing the size
will remove vectors from the end.
"""
# 调整 DistilBertModel 的位置嵌入
self.distilbert.resize_position_embeddings(new_num_position_embeddings)
@add_start_docstrings_to_model_forward(
DISTILBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")
)
@replace_return_docstrings(output_type=MultipleChoiceModelOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
**kwargs,
) -> ModelOutput:
"""
Forward pass for DistilBertForMultipleChoice.
Args:
input_ids (Optional[torch.Tensor], optional):
Indices of input sequence tokens in the vocabulary.
attention_mask (Optional[torch.Tensor], optional):
Mask to avoid performing attention on padding token indices.
head_mask (Optional[torch.Tensor], optional):
Mask to nullify selected heads of the self-attention modules.
inputs_embeds (Optional[torch.Tensor], optional):
Optionally, instead of passing `input_ids`, you can directly pass an embedded representation.
labels (Optional[torch.LongTensor], optional):
Labels for computing the multiple choice classification loss.
output_attentions (Optional[bool], optional):
Whether to return attentions weights.
output_hidden_states (Optional[bool], optional):
Whether to return hidden states.
return_dict (Optional[bool], optional):
Whether to return a dictionary instead of a tuple.
**kwargs:
Additional keyword arguments for the DistilBertModel forward method.
Returns:
ModelOutput: A namedtuple with the model outputs: last_hidden_state, (optional) hidden_states, (optional) attentions.
"""
# 调用 DistilBertModel 的 forward 方法进行前向传播
return self.distilbert(
input_ids,
attention_mask=attention_mask,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
**kwargs,
)
.\models\distilbert\modeling_flax_distilbert.py
import math
from typing import Callable, Optional, Tuple
import flax.linen as nn
import jax
import jax.numpy as jnp
import numpy as np
from flax.core.frozen_dict import FrozenDict, freeze, unfreeze
from flax.traverse_util import flatten_dict, unflatten_dict
from jax import lax
from ...modeling_flax_outputs import (
FlaxBaseModelOutput,
FlaxMaskedLMOutput,
FlaxMultipleChoiceModelOutput,
FlaxQuestionAnsweringModelOutput,
FlaxSequenceClassifierOutput,
FlaxTokenClassifierOutput,
)
from ...modeling_flax_utils import ACT2FN, FlaxPreTrainedModel, append_call_sample_docstring, overwrite_call_docstring
from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging
from .configuration_distilbert import DistilBertConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "distilbert-base-uncased"
_CONFIG_FOR_DOC = "DistilBertConfig"
FLAX_DISTILBERT_START_DOCSTRING = r"""
This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading, saving and converting weights from PyTorch models)
This model is also a
[flax.linen.Module](https://flax.readthedocs.io/en/latest/api_reference/flax.linen/module.html) subclass. Use it as
a regular Flax linen Module and refer to the Flax documentation for all matter related to general usage and
behavior.
Finally, this model supports inherent JAX features such as:
- [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
- [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
- [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
- [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)
Parameters:
config ([`DistilBertConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
DISTILBERT_INPUTS_DOCSTRING = r"""
Args:
input_ids (`numpy.ndarray` of shape `({0})`):
# 输入序列中的token索引数组,索引对应词汇表中的token。
# 可以使用`AutoTokenizer`获取这些索引。参见`PreTrainedTokenizer.encode`和`PreTrainedTokenizer.__call__`获取详细信息。
# [什么是input IDs?](../glossary#input-ids)
attention_mask (`numpy.ndarray` of shape `({0})`, *optional*):
# 避免对填充token索引执行注意力计算的掩码。掩码的取值范围为`[0, 1]`:
# - 1表示**不被掩盖**的token,
# - 0表示**被掩盖**的token。
# [什么是attention masks?](../glossary#attention-mask)
output_attentions (`bool`, *optional*):
# 是否返回所有注意力层的注意力张量。查看返回的张量中`attentions`获取更多细节。
output_hidden_states (`bool`, *optional*):
# 是否返回所有层的隐藏状态。查看返回的张量中`hidden_states`获取更多细节。
return_dict (`bool`, *optional*):
# 是否返回`~utils.ModelOutput`而不是普通的元组。
"""
def get_angles(pos, i, d_model):
angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
return pos * angle_rates
"""
# 根据位置、索引和模型维度计算角度率,用于位置编码中的角度计算
"""
def positional_encoding(position, d_model):
angle_rads = get_angles(np.arange(position)[:, np.newaxis], np.arange(d_model)[np.newaxis, :], d_model)
angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])
pos_encoding = angle_rads[np.newaxis, ...]
return jnp.array(pos_encoding)
"""
# 根据位置和模型维度生成位置编码的正弦和余弦模式
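下面用纯 NumPy 复述上面的正弦位置编码逻辑,演示输出形状以及"偶数维为 sin、奇数维为 cos"的模式(仅作示意,与上面的 JAX 版本逻辑一致):

```python
import numpy as np

def get_angles(pos, i, d_model):
    # 角度率:不同维度使用不同频率
    angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
    return pos * angle_rates

def positional_encoding(position, d_model):
    angle_rads = get_angles(np.arange(position)[:, np.newaxis], np.arange(d_model)[np.newaxis, :], d_model)
    angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])  # 偶数维取 sin
    angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])  # 奇数维取 cos
    return angle_rads[np.newaxis, ...]

pe = positional_encoding(position=50, d_model=16)
print(pe.shape)        # (1, 50, 16)
print(pe[0, 0, 0::2])  # pos=0 时偶数维为 sin(0)=0
print(pe[0, 0, 1::2])  # pos=0 时奇数维为 cos(0)=1
```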
class FlaxEmbeddings(nn.Module):
"""Construct the embeddings from word, position and token_type embeddings."""
config: DistilBertConfig
dtype: jnp.dtype = jnp.float32 # the dtype of the computation
def setup(self):
self.word_embeddings = nn.Embed(
self.config.vocab_size,
self.config.dim,
embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
)
if not self.config.sinusoidal_pos_embds:
self.position_embeddings = nn.Embed(
self.config.max_position_embeddings,
self.config.dim,
embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
)
else:
self.pos_encoding = positional_encoding(self.config.max_position_embeddings, self.config.dim)
self.LayerNorm = nn.LayerNorm(epsilon=1e-12, dtype=self.dtype)
self.dropout = nn.Dropout(rate=self.config.dropout)
def __call__(self, input_ids, deterministic: bool = True):
# Embed
batch_size, seq_length = input_ids.shape
inputs_embeds = self.word_embeddings(input_ids.astype("i4"))
if not self.config.sinusoidal_pos_embds:
position_ids = jnp.arange(seq_length).astype("i4")
position_ids = jnp.broadcast_to(position_ids, shape=(batch_size, seq_length))
position_embeds = self.position_embeddings(position_ids.astype("i4"))
else:
position_embeds = self.pos_encoding[:, :seq_length, :]
# explicitly cast the positions here, since self.embed_positions are not registered as parameters
position_embeds = position_embeds.astype(inputs_embeds.dtype)
# Sum all embeddings
hidden_states = inputs_embeds + position_embeds
# Layer Norm
hidden_states = self.LayerNorm(hidden_states)
hidden_states = self.dropout(hidden_states, deterministic=deterministic)
return hidden_states
class FlaxMultiHeadSelfAttention(nn.Module):
config: DistilBertConfig
dtype: jnp.dtype = jnp.float32 # the dtype of the computation
def setup(self):
self.n_heads = self.config.n_heads # 从配置中获取注意力头的数量
self.dim = self.config.dim # 从配置中获取模型维度
self.dropout = nn.Dropout(rate=self.config.attention_dropout) # 根据配置设置注意力机制中的dropout
if not (self.dim % self.n_heads == 0):
raise ValueError(f"Hidden size {self.dim} not dividable by number of heads {self.n_heads}") # 检查隐藏层大小是否可以被注意力头的数量整除
self.q_lin = nn.Dense(
self.dim,
dtype=self.dtype,
kernel_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
) # 初始化用于query的线性层,输入维度为dim,输出维度为dim
self.k_lin = nn.Dense(
self.dim,
dtype=self.dtype,
kernel_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
) # 初始化用于key的线性层,输入维度为dim,输出维度为dim
self.v_lin = nn.Dense(
self.dim,
dtype=self.dtype,
kernel_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
) # 初始化用于value的线性层,输入维度为dim,输出维度为dim
self.out_lin = nn.Dense(
self.dim,
dtype=self.dtype,
kernel_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
) # 初始化输出层线性层,输入维度为dim,输出维度为dim
def __call__(
self,
query,
key,
value,
mask,
deterministic: bool = True,
output_attentions: bool = False,
):
bs, q_len, dim = query.shape # 获取query的形状信息,bs为batch size,q_len为query的长度,dim为维度
k_len = key.shape[1] # 获取key的长度
dim_per_head = self.dim // self.n_heads # 计算每个注意力头的维度
mask_reshp = (bs, 1, 1, k_len) # 重塑mask的形状用于后续操作
def shape(x):
"""分离头部"""
return x.reshape(bs, -1, self.n_heads, dim_per_head).transpose(0, 2, 1, 3) # 重塑张量x以分离注意力头
def unshape(x):
"""合并头部"""
return x.transpose(0, 2, 1, 3).reshape(bs, -1, self.n_heads * dim_per_head) # 重塑张量x以合并注意力头
q = shape(self.q_lin(query)) # 通过query的线性层进行形状分离,得到 (bs, n_heads, q_len, dim_per_head)
k = shape(self.k_lin(key)) # 通过key的线性层进行形状分离,得到 (bs, n_heads, k_len, dim_per_head)
v = shape(self.v_lin(value)) # 通过value的线性层进行形状分离,得到 (bs, n_heads, k_len, dim_per_head)
q = q / math.sqrt(dim_per_head) # 对query进行缩放,以便更好地计算注意力权重 (bs, n_heads, q_len, dim_per_head)
scores = jnp.matmul(q, k.transpose(0, 1, 3, 2)) # 计算注意力分数,形状为 (bs, n_heads, q_len, k_len)
mask = jnp.reshape(mask, mask_reshp) # 调整mask的形状以匹配注意力分数
mask = mask.astype(scores.dtype) # 将mask转换为与scores相同的数据类型
scores = scores - 1e30 * (1.0 - mask) # 将mask应用于scores,增加无效位置的大负数
weights = nn.softmax(scores, axis=-1) # 计算注意力权重,形状为 (bs, n_heads, q_len, k_len)
weights = self.dropout(weights, deterministic=deterministic) # 应用dropout到注意力权重
context = jnp.matmul(weights, v) # 计算上下文向量,形状为 (bs, n_heads, q_len, dim_per_head)
context = unshape(context) # 合并注意力头,形状为 (bs, q_len, dim)
context = self.out_lin(context) # 应用输出层线性层,形状为 (bs, q_len, dim)
if output_attentions:
return (context, weights) # 如果需要输出注意力权重,返回上下文向量和权重
else:
return (context,) # 否则只返回上下文向量
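`shape` / `unshape` 负责在注意力头维度上拆分与合并张量,mask 则通过减去一个大数(1e30)把无效位置的注意力权重压到接近 0。下面是一个独立的小示例(形状均为假设),验证拆分/合并互逆以及加性掩码的效果:

```python
import jax
import jax.numpy as jnp
import numpy as np

bs, seq_len, n_heads, dim_per_head = 2, 5, 4, 8
dim = n_heads * dim_per_head
x = jnp.asarray(np.random.randn(bs, seq_len, dim))

def shape(t):
    # (bs, len, dim) -> (bs, n_heads, len, dim_per_head):拆分注意力头
    return t.reshape(bs, -1, n_heads, dim_per_head).transpose(0, 2, 1, 3)

def unshape(t):
    # (bs, n_heads, len, dim_per_head) -> (bs, len, dim):合并注意力头
    return t.transpose(0, 2, 1, 3).reshape(bs, -1, n_heads * dim_per_head)

print(jnp.allclose(unshape(shape(x)), x))  # True:拆分与合并互逆

# 加性掩码:对无效位置减去 1e30,softmax 后这些位置的权重约等于 0
scores = jnp.zeros((bs, n_heads, seq_len, seq_len))
mask = jnp.array([[1, 1, 1, 0, 0], [1, 1, 1, 1, 0]])          # (bs, k_len)
mask = mask.reshape(bs, 1, 1, seq_len).astype(scores.dtype)
weights = jax.nn.softmax(scores - 1e30 * (1.0 - mask), axis=-1)
print(weights[0, 0, 0])  # 前 3 个位置约为 1/3,其余约为 0
```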
class FlaxFFN(nn.Module):
config: DistilBertConfig
dtype: jnp.dtype = jnp.float32 # 计算的数据类型
def setup(self):
self.dropout = nn.Dropout(rate=self.config.dropout) # 设置dropout层
self.chunk_size_feed_forward = self.config.chunk_size_feed_forward # 前馈层的块大小
self.seq_len_dim = 1 # 序列长度维度为1
self.lin1 = nn.Dense(
self.config.hidden_dim,
dtype=self.dtype,
kernel_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
) # 第一个全连接层,使用正态分布初始化权重
self.lin2 = nn.Dense(
self.config.dim,
dtype=self.dtype,
kernel_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
) # 第二个全连接层,使用正态分布初始化权重
self.activation = ACT2FN[self.config.activation] # 激活函数
def __call__(self, hidden_states, deterministic: bool = True):
hidden_states = self.lin1(hidden_states) # 第一个全连接层的计算
hidden_states = self.activation(hidden_states) # 激活函数的应用
hidden_states = self.lin2(hidden_states) # 第二个全连接层的计算
hidden_states = self.dropout(hidden_states, deterministic=deterministic) # dropout操作
return hidden_states
class FlaxTransformerBlock(nn.Module):
config: DistilBertConfig
dtype: jnp.dtype = jnp.float32 # 计算的数据类型
def setup(self):
assert (
self.config.dim % self.config.n_heads == 0
), f"Hidden size {self.config.dim} not dividable by number of heads {self.config.n_heads}" # 断言,确保隐藏大小可以被头数整除
self.attention = FlaxMultiHeadSelfAttention(self.config, dtype=self.dtype) # 多头自注意力机制
self.sa_layer_norm = nn.LayerNorm(epsilon=1e-12, dtype=self.dtype) # 自注意力层的LayerNorm
self.ffn = FlaxFFN(self.config, dtype=self.dtype) # 前馈网络
self.output_layer_norm = nn.LayerNorm(epsilon=1e-12, dtype=self.dtype) # 输出层的LayerNorm
def __call__(
self,
hidden_states,
attn_mask,
output_attentions: bool = False,
deterministic: bool = True,
):
# 自注意力
sa_output = self.attention(
query=hidden_states,
key=hidden_states,
value=hidden_states,
mask=attn_mask,
output_attentions=output_attentions,
deterministic=deterministic,
)
if output_attentions:
sa_output, sa_weights = sa_output # 如果需要输出注意力权重,则获取权重
else:
assert type(sa_output) == tuple
sa_output = sa_output[0] # 否则,获取自注意力的输出
sa_output = self.sa_layer_norm(sa_output + hidden_states) # 应用LayerNorm
# 前馈网络
ffn_output = self.ffn(sa_output, deterministic=deterministic) # 前馈网络的计算
ffn_output = self.output_layer_norm(ffn_output + sa_output) # 应用LayerNorm
output = (ffn_output,) # 输出结果为元组
if output_attentions:
output = (sa_weights,) + output # 如果需要输出注意力权重,则将权重添加到输出中
return output
class FlaxTransformer(nn.Module):
config: DistilBertConfig
dtype: jnp.dtype = jnp.float32 # 计算的数据类型
def setup(self):
self.layers = [
FlaxTransformerBlock(self.config, name=str(i), dtype=self.dtype) for i in range(self.config.n_layers)
] # 创建多个TransformerBlock层的列表
# 定义一个可调用的方法,用于执行模型的前向传播
def __call__(
self,
hidden_states,
attention_mask,
output_attentions: bool = False,
output_hidden_states: bool = False,
deterministic: bool = True,
return_dict: bool = False,
):
# 如果输出隐藏状态,初始化存储所有隐藏状态的元组,否则为None
all_hidden_states = () if output_hidden_states else None
# 如果输出注意力权重,初始化存储所有注意力权重的元组,否则为None
all_attentions = () if output_attentions else None
# 遍历所有的层模块
for layer_module in self.layers:
# 如果需要输出隐藏状态,将当前的隐藏状态添加到所有隐藏状态的元组中
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
# 调用当前层模块的前向传播方法,获取该层的输出
layer_outputs = layer_module(
hidden_states=hidden_states,
attn_mask=attention_mask,
output_attentions=output_attentions,
deterministic=deterministic,
)
# 更新隐藏状态为当前层的输出的最后一个值
hidden_states = layer_outputs[-1]
# 如果需要输出注意力权重
if output_attentions:
# 确保当前层的输出包含两个元素(注意力权重和其他)
assert len(layer_outputs) == 2
# 获取注意力权重,并添加到所有注意力权重的元组中
attentions = layer_outputs[0]
all_attentions = all_attentions + (attentions,)
else:
# 确保当前层的输出只包含一个元素(隐藏状态)
assert len(layer_outputs) == 1
# 添加最后一层的隐藏状态到所有隐藏状态的元组中
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
# 如果不需要返回字典,则返回包含非None值的元组
if not return_dict:
return tuple(v for v in [hidden_states, all_attentions, all_hidden_states] if v is not None)
# 如果需要返回字典,则创建并返回FlaxBaseModelOutput对象
return FlaxBaseModelOutput(
last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions
)
class FlaxTransformerEncoder(nn.Module):
config: DistilBertConfig
dtype: jnp.dtype = jnp.float32 # the dtype of the computation
def setup(self):
self.layer = FlaxTransformer(self.config, dtype=self.dtype)
# 初始化 FlaxTransformer 层,使用给定的配置和数据类型
def __call__(
self,
hidden_states,
attention_mask,
output_attentions: bool = False,
output_hidden_states: bool = False,
deterministic: bool = True,
return_dict: bool = False,
):
return self.layer(
hidden_states=hidden_states,
attention_mask=attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
deterministic=deterministic,
return_dict=return_dict,
)
# 调用 FlaxTransformer 层,传递输入参数并返回结果
class FlaxDistilBertLMDecoder(nn.Module):
config: DistilBertConfig
dtype: jnp.dtype = jnp.float32 # the dtype of the computation
bias_init: Callable[..., np.ndarray] = jax.nn.initializers.zeros
def setup(self):
self.bias = self.param("bias", self.bias_init, (self.config.vocab_size,))
# 初始化偏置参数 self.bias,大小为词汇表大小,使用 bias_init 初始化器
def __call__(self, inputs, kernel):
inputs = jnp.asarray(inputs, self.dtype)
kernel = jnp.asarray(kernel, self.dtype)
y = lax.dot_general(inputs, kernel, (((inputs.ndim - 1,), (0,)), ((), ())))
# 执行高效的矩阵乘法操作,inputs 和 kernel 是输入张量
bias = jnp.asarray(self.bias, self.dtype)
y = y + bias
# 将偏置加到输出 y 上
return y
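`FlaxDistilBertLMDecoder` 用 `lax.dot_general` 把隐藏状态与转置后的共享词嵌入相乘,再加上偏置。上述维度约定等价于普通矩阵乘法,下面用随机小张量验证这一点(形状均为假设):

```python
import numpy as np
import jax.numpy as jnp
from jax import lax

bs, seq_len, dim, vocab_size = 2, 4, 8, 11
inputs = jnp.asarray(np.random.randn(bs, seq_len, dim))
# 共享词嵌入矩阵形状为 (vocab_size, dim),解码时使用其转置 (dim, vocab_size)
kernel = jnp.asarray(np.random.randn(vocab_size, dim)).T

# 与 FlaxDistilBertLMDecoder 相同的维度约定:收缩 inputs 的最后一维与 kernel 的第 0 维
y_dot_general = lax.dot_general(inputs, kernel, (((inputs.ndim - 1,), (0,)), ((), ())))
y_matmul = inputs @ kernel  # 等价的普通矩阵乘法

print(y_dot_general.shape)                    # (2, 4, 11)
print(jnp.allclose(y_dot_general, y_matmul))  # True
```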
class FlaxDistilBertPreTrainedModel(FlaxPreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = DistilBertConfig
base_model_prefix = "distilbert"
module_class: nn.Module = None
def __init__(
self,
config: DistilBertConfig,
input_shape: Tuple = (1, 1),
seed: int = 0,
dtype: jnp.dtype = jnp.float32,
_do_init: bool = True,
**kwargs,
):
module = self.module_class(config=config, dtype=dtype, **kwargs)
# 使用给定的配置和数据类型初始化模块
super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init)
def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple, params: FrozenDict = None) -> FrozenDict:
# 初始化权重函数
input_ids = jnp.zeros(input_shape, dtype="i4")
attention_mask = jnp.ones_like(input_ids)
# 创建输入张量和注意力掩码,使用默认值
params_rng, dropout_rng = jax.random.split(rng)
rngs = {"params": params_rng, "dropout": dropout_rng}
# 分割随机数生成器以用于参数初始化和 dropout
random_params = self.module.init(rngs, input_ids, attention_mask, return_dict=False)["params"]
# 使用随机数初始化模块的参数
if params is not None:
random_params = flatten_dict(unfreeze(random_params))
params = flatten_dict(unfreeze(params))
for missing_key in self._missing_keys:
params[missing_key] = random_params[missing_key]
self._missing_keys = set()
return freeze(unflatten_dict(params))
# 如果有指定的参数,则将缺失的键补充为随机初始化的参数,并返回完整的参数字典
else:
return random_params
# 否则,直接返回随机初始化的参数
# 添加模型调用的前向传播文档字符串,描述输入参数为批大小和序列长度
@add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
# 定义调用方法,接受多个输入参数用于模型推理
def __call__(
self,
input_ids, # 输入的token IDs序列
attention_mask=None, # 注意力掩码,指示哪些位置是有效的
head_mask=None, # 头掩码,控制不同的注意力头的掩码
params: dict = None, # 参数字典,用于加载模型参数
dropout_rng: jax.random.PRNGKey = None, # 随机数生成器密钥,用于Dropout操作
train: bool = False, # 指示是否为训练模式
output_attentions: Optional[bool] = None, # 是否输出注意力权重
output_hidden_states: Optional[bool] = None, # 是否输出隐藏状态
return_dict: Optional[bool] = None, # 是否返回字典格式的输出
):
# 如果未提供attention_mask,则默认为全1,即所有位置都是有效的
if attention_mask is None:
attention_mask = jnp.ones_like(input_ids)
# 处理可能需要的任何随机数生成器
rngs = {}
if dropout_rng is not None:
rngs["dropout"] = dropout_rng
# 调用模型的apply方法进行前向传播
return self.module.apply(
{"params": params or self.params}, # 使用给定的参数或默认的模型参数
jnp.array(input_ids, dtype="i4"), # 转换输入token IDs为JAX数组
jnp.array(attention_mask, dtype="i4"), # 转换注意力掩码为JAX数组
not train, # 转换训练标志为相反值,用于控制模型是否在推理模式下运行
output_attentions, # 是否输出注意力权重
output_hidden_states, # 是否输出隐藏状态
return_dict, # 是否返回字典格式的输出
rngs=rngs, # 传递随机数生成器密钥到模型的apply方法中
)
class FlaxDistilBertModule(nn.Module):
config: DistilBertConfig
dtype: jnp.dtype = jnp.float32 # 计算时使用的数据类型
def setup(self):
# 初始化嵌入层对象,使用给定的配置和数据类型
self.embeddings = FlaxEmbeddings(self.config, dtype=self.dtype)
# 初始化变换器编码器对象,使用给定的配置和数据类型
self.transformer = FlaxTransformerEncoder(self.config, dtype=self.dtype)
def __call__(
self,
input_ids,
attention_mask,
deterministic: bool = True,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
):
# 如果输出注意力权重未指定,则使用配置中的默认设置
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
# 如果输出隐藏状态未指定,则使用配置中的默认设置
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
# 如果返回字典未指定,则使用配置中的默认设置
return_dict = return_dict if return_dict is not None else self.config.return_dict
# 获取输入的嵌入表示
input_embeds = self.embeddings(input_ids, deterministic=deterministic)
# 调用变换器编码器进行处理
return self.transformer(
hidden_states=input_embeds,
attention_mask=attention_mask,
deterministic=deterministic,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
@add_start_docstrings(
"输出原始隐藏状态的DistilBert模型变换器,没有特定的输出头部。",
FLAX_DISTILBERT_START_DOCSTRING,
)
class FlaxDistilBertModel(FlaxDistilBertPreTrainedModel):
module_class = FlaxDistilBertModule
append_call_sample_docstring(FlaxDistilBertModel, _CHECKPOINT_FOR_DOC, None, _CONFIG_FOR_DOC)
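下面是 `FlaxDistilBertModel` 的一个最小调用示例(假设可以下载 `distilbert-base-uncased` 的 Flax 权重):

```python
from transformers import AutoTokenizer, FlaxDistilBertModel

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = FlaxDistilBertModel.from_pretrained("distilbert-base-uncased")

# return_tensors="np" 生成 numpy 数组,可直接传给 Flax 模型
inputs = tokenizer("Hello, my dog is cute", return_tensors="np")
outputs = model(**inputs)

print(outputs.last_hidden_state.shape)  # (1, sequence_length, 768)
```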
class FlaxDistilBertForMaskedLMModule(nn.Module):
config: DistilBertConfig
dtype: jnp.dtype = jnp.float32 # 计算时使用的数据类型
def setup(self):
# 初始化DistilBert模型对象,使用给定的配置和数据类型
self.distilbert = FlaxDistilBertModule(self.config, dtype=self.dtype)
# 初始化词汇变换层,使用给定的维度和正态分布的初始化方式
self.vocab_transform = nn.Dense(
self.config.dim,
dtype=self.dtype,
kernel_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
)
# 初始化词汇层归一化,设定epsilon为1e-12,使用给定的数据类型
self.vocab_layer_norm = nn.LayerNorm(epsilon=1e-12, dtype=self.dtype)
# 如果需要绑定词嵌入,则初始化DistilBert语言模型解码器
if self.config.tie_word_embeddings:
self.vocab_projector = FlaxDistilBertLMDecoder(
self.config,
dtype=self.dtype,
)
else:
# 否则初始化普通的Dense层作为词汇投影器
self.vocab_projector = nn.Dense(
self.config.vocab_size,
dtype=self.dtype,
kernel_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
)
def __call__(
self,
input_ids,
attention_mask,
deterministic: bool = True,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
):
# 如果 return_dict 为 None,则根据配置决定是否使用 self.config.use_return_dict
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# 使用 DistilBERT 模型处理输入,获取输出
dlbrt_output = self.distilbert(
input_ids=input_ids, # 输入的 token IDs
attention_mask=attention_mask, # 注意力掩码
output_attentions=output_attentions, # 是否输出注意力权重
output_hidden_states=output_hidden_states, # 是否输出隐藏状态
deterministic=deterministic, # 是否确定性运行
return_dict=return_dict, # 是否返回字典形式的输出
)
# 获取隐藏状态作为预测的 logits
hidden_states = dlbrt_output[0]
# 使用 vocab_transform 对隐藏状态进行转换得到预测 logits
prediction_logits = self.vocab_transform(hidden_states)
# 根据配置中的激活函数对 logits 进行激活
prediction_logits = ACT2FN[self.config.activation](prediction_logits)
# 对激活后的 logits 进行 layer normalization
prediction_logits = self.vocab_layer_norm(prediction_logits)
# 如果配置指定共享词嵌入,则使用 distilbert 中的词嵌入与 logits 进行投影
if self.config.tie_word_embeddings:
shared_embedding = self.distilbert.variables["params"]["embeddings"]["word_embeddings"]["embedding"]
prediction_logits = self.vocab_projector(prediction_logits, shared_embedding.T)
else:
prediction_logits = self.vocab_projector(prediction_logits)
# 如果不需要以字典形式返回结果,则返回 logits 与其它输出
if not return_dict:
output = (prediction_logits,) + dlbrt_output[1:] # 构建输出元组
return output
# 以 FlaxMaskedLMOutput 类型返回输出结果,包含 logits、隐藏状态和注意力权重
return FlaxMaskedLMOutput(
logits=prediction_logits, # 预测 logits
hidden_states=dlbrt_output.hidden_states, # 隐藏状态
attentions=dlbrt_output.attentions, # 注意力权重
)
@add_start_docstrings("""DistilBert Model with a `language modeling` head on top.""", FLAX_DISTILBERT_START_DOCSTRING)
class FlaxDistilBertForMaskedLM(FlaxDistilBertPreTrainedModel):
module_class = FlaxDistilBertForMaskedLMModule
# 定义了一个基于 FlaxDistilBertPreTrainedModel 的 FlaxDistilBertForMaskedLM 类,它带有一个 `language modeling` 头部。
append_call_sample_docstring(FlaxDistilBertForMaskedLM, _CHECKPOINT_FOR_DOC, FlaxMaskedLMOutput, _CONFIG_FOR_DOC)
# 调用 append_call_sample_docstring 函数,为 FlaxDistilBertForMaskedLM 类添加文档字符串示例。
class FlaxDistilBertForSequenceClassificationModule(nn.Module):
config: DistilBertConfig
dtype: jnp.dtype = jnp.float32
def setup(self):
self.distilbert = FlaxDistilBertModule(config=self.config, dtype=self.dtype)
self.pre_classifier = nn.Dense(
self.config.dim,
dtype=self.dtype,
kernel_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
)
self.dropout = nn.Dropout(rate=self.config.seq_classif_dropout)
self.classifier = nn.Dense(
self.config.num_labels,
dtype=self.dtype,
)
# 定义了一个 FlaxDistilBertForSequenceClassificationModule 类,继承自 nn.Module,用于序列分类任务。在 setup 方法中初始化了 DistilBERT 模块、预分类器、Dropout 和分类器。
def __call__(
self,
input_ids,
attention_mask,
deterministic: bool = True,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
):
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# 定义了 __call__ 方法,实现了对输入数据的处理和前向传播,支持不同的返回格式选项。
distilbert_output = self.distilbert(
input_ids,
attention_mask,
deterministic=deterministic,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# 调用 self.distilbert 对输入进行处理,得到 DistilBERT 模型的输出。
hidden_state = distilbert_output[0] # (bs, seq_len, dim)
pooled_output = hidden_state[:, 0] # (bs, dim)
pooled_output = self.pre_classifier(pooled_output) # (bs, dim)
pooled_output = ACT2FN["relu"](pooled_output)
pooled_output = self.dropout(pooled_output, deterministic=deterministic)
logits = self.classifier(pooled_output) # (bs, num_labels)
# 对DistilBERT模型的输出进行处理:提取池化输出、通过预分类器和ReLU激活、应用Dropout,最后由分类器得到logits。
if not return_dict:
return (logits,) + distilbert_output[1:]
return FlaxSequenceClassifierOutput(
logits=logits,
hidden_states=distilbert_output.hidden_states,
attentions=distilbert_output.attentions,
)
# 根据 return_dict 的设置决定返回的结果格式,可以选择返回元组或者包含 logits、隐藏状态和注意力的 FlaxSequenceClassifierOutput 对象。
@add_start_docstrings(
"""
DistilBert Model transformer with a sequence classification/regression head on top (a linear layer on top of the
pooled output) e.g. for GLUE tasks.
""",
FLAX_DISTILBERT_START_DOCSTRING,
)
class FlaxDistilBertForSequenceClassification(FlaxDistilBertPreTrainedModel):
module_class = FlaxDistilBertForSequenceClassificationModule
# 定义了一个 FlaxDistilBertForSequenceClassification 类,继承自 FlaxDistilBertPreTrainedModel,是带有序列分类/回归头部的 DistilBERT 模型。
append_call_sample_docstring(
FlaxDistilBertForSequenceClassification,
_CHECKPOINT_FOR_DOC,
FlaxSequenceClassifierOutput,
_CONFIG_FOR_DOC,
)
# 调用 append_call_sample_docstring 函数,为 FlaxDistilBertForSequenceClassification 类添加文档字符串示例。
class FlaxDistilBertForMultipleChoiceModule(nn.Module):
config: DistilBertConfig
dtype: jnp.dtype = jnp.float32
# 定义了一个 FlaxDistilBertForMultipleChoiceModule 类,继承自 nn.Module,用于多项选择任务。
# 初始化模型的各个组件,包括DistilBERT模块、预分类器、Dropout层和分类器
def setup(self):
self.distilbert = FlaxDistilBertModule(config=self.config, dtype=self.dtype)
self.pre_classifier = nn.Dense(
self.config.dim,
dtype=self.dtype,
kernel_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
)
self.dropout = nn.Dropout(rate=self.config.seq_classif_dropout)
self.classifier = nn.Dense(
1,
dtype=self.dtype,
)
# 模型的调用方法,接收输入的token IDs和attention mask,并返回多项选择任务的结果
def __call__(
self,
input_ids,
attention_mask,
deterministic: bool = True,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
):
# 根据参数设定是否使用配置中指定的返回字典方式
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# 计算多项选择的数量
num_choices = input_ids.shape[1]
# 将输入的token IDs重新调整形状以便传递给模型
input_ids = input_ids.reshape(-1, input_ids.shape[-1]) if input_ids is not None else None
# 将输入的attention mask重新调整形状以便传递给模型
attention_mask = attention_mask.reshape(-1, attention_mask.shape[-1]) if attention_mask is not None else None
# 使用DistilBERT模型处理输入,返回模型的输出
outputs = self.distilbert(
input_ids,
attention_mask,
deterministic=deterministic,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# 获取模型的隐藏状态
hidden_state = outputs[0]
# 从隐藏状态中提取池化输出,一般是第一个位置的隐藏状态
pooled_output = hidden_state[:, 0]
# 通过预分类器处理池化输出
pooled_output = self.pre_classifier(pooled_output)
# 应用ReLU激活函数到处理后的池化输出
pooled_output = ACT2FN["relu"](pooled_output)
# 使用Dropout层对处理后的池化输出进行随机失活
pooled_output = self.dropout(pooled_output, deterministic=deterministic)
# 使用分类器计算最终的logits
logits = self.classifier(pooled_output)
# 将logits重新调整形状以适应多项选择的格式
reshaped_logits = logits.reshape(-1, num_choices)
# 如果不使用返回字典的方式,则返回调整形状后的logits和额外的隐藏状态
if not return_dict:
return (reshaped_logits,) + outputs[2:]
# 如果使用返回字典的方式,则返回FlaxMultipleChoiceModelOutput对象
return FlaxMultipleChoiceModelOutput(
logits=reshaped_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
@add_start_docstrings(
"""
DistilBert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and
a softmax) e.g. for RocStories/SWAG tasks.
""",
FLAX_DISTILBERT_START_DOCSTRING,
)
class FlaxDistilBertForMultipleChoice(FlaxDistilBertPreTrainedModel):
module_class = FlaxDistilBertForMultipleChoiceModule
overwrite_call_docstring(
FlaxDistilBertForMultipleChoice, DISTILBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")
)
append_call_sample_docstring(
FlaxDistilBertForMultipleChoice,
_CHECKPOINT_FOR_DOC,
FlaxMultipleChoiceModelOutput,
_CONFIG_FOR_DOC,
)
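多项选择任务要求输入形状为 (batch_size, num_choices, sequence_length)。下面的示意(检查点与文本均为演示假设)展示了如何构造这一输入:
```
import numpy as np
from transformers import AutoTokenizer, FlaxDistilBertForMultipleChoice

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = FlaxDistilBertForMultipleChoice.from_pretrained("distilbert-base-uncased")

prompt = "The cat sat on"
choices = ["the mat.", "the moon."]
encoding = tokenizer([prompt, prompt], choices, return_tensors="np", padding=True)
# 补出 num_choices 维度: (batch_size=1, num_choices=2, seq_length)
inputs = {k: np.expand_dims(v, 0) for k, v in encoding.items()}
logits = model(**inputs).logits  # 形状为 (batch_size, num_choices)
```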
class FlaxDistilBertForTokenClassificationModule(nn.Module):
config: DistilBertConfig
dtype: jnp.dtype = jnp.float32
def setup(self):
self.distilbert = FlaxDistilBertModule(config=self.config, dtype=self.dtype)
self.dropout = nn.Dropout(rate=self.config.dropout)
self.classifier = nn.Dense(self.config.num_labels, dtype=self.dtype)
def __call__(
self,
input_ids,
attention_mask,
deterministic: bool = True,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
):
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Model
outputs = self.distilbert(
input_ids,
attention_mask,
deterministic=deterministic,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
hidden_states = outputs[0]
hidden_states = self.dropout(hidden_states, deterministic=deterministic)
logits = self.classifier(hidden_states)
if not return_dict:
return (logits,) + outputs[1:]
return FlaxTokenClassifierOutput(
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
@add_start_docstrings(
"""
DistilBert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g.
for Named-Entity-Recognition (NER) tasks.
""",
FLAX_DISTILBERT_START_DOCSTRING,
)
class FlaxDistilBertForTokenClassification(FlaxDistilBertPreTrainedModel):
module_class = FlaxDistilBertForTokenClassificationModule
append_call_sample_docstring(
FlaxDistilBertForTokenClassification,
_CHECKPOINT_FOR_DOC,
FlaxTokenClassifierOutput,
_CONFIG_FOR_DOC,
)
class FlaxDistilBertForQuestionAnsweringModule(nn.Module):
config: DistilBertConfig
dtype: jnp.dtype = jnp.float32
# 初始化模型的方法,设置各个组件
def setup(self):
# 创建一个 DistilBERT 模型实例,使用给定的配置和数据类型
self.distilbert = FlaxDistilBertModule(config=self.config, dtype=self.dtype)
# 创建一个全连接层,输出维度为 num_labels,用于计算答案起始/结束位置的 logits
self.qa_outputs = nn.Dense(self.config.num_labels, dtype=self.dtype)
# 断言模型需要输出的类别数为2
assert self.config.num_labels == 2
# 创建一个 Dropout 层,用于在训练过程中随机丢弃部分输入以防止过拟合
self.dropout = nn.Dropout(rate=self.config.qa_dropout)
# 模型调用方法,接受输入并返回模型预测结果
def __call__(
self,
input_ids,
attention_mask,
deterministic: bool = True,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
):
# 根据参数设置是否返回字典形式的输出
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# 调用 DistilBERT 模型进行前向传播
distilbert_output = self.distilbert(
input_ids,
attention_mask,
deterministic=deterministic,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# 获取模型输出中的隐藏状态
hidden_states = distilbert_output[0]
# 使用 Dropout 层对隐藏状态进行随机丢弃
hidden_states = self.dropout(hidden_states, deterministic=deterministic)
# 将处理后的隐藏状态输入到全连接层中,得到最终的分类 logits
logits = self.qa_outputs(hidden_states)
# 将 logits 按照类别数目分割成起始和结束 logits
start_logits, end_logits = logits.split(self.config.num_labels, axis=-1)
# 去除不必要的维度
start_logits = start_logits.squeeze(-1)
end_logits = end_logits.squeeze(-1)
# 根据是否需要返回字典形式的输出进行处理并返回
if not return_dict:
# 如果不返回字典,则返回元组形式的输出
return (start_logits, end_logits) + distilbert_output[1:]
# 返回 FlaxQuestionAnsweringModelOutput 类的实例,包含起始 logits、结束 logits、隐藏状态和注意力权重
return FlaxQuestionAnsweringModelOutput(
start_logits=start_logits,
end_logits=end_logits,
hidden_states=distilbert_output.hidden_states,
attentions=distilbert_output.attentions,
)
@add_start_docstrings(
"""
DistilBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a
linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
FLAX_DISTILBERT_START_DOCSTRING,
)
这部分代码是一个装饰器函数调用,用于给 `FlaxDistilBertForQuestionAnswering` 类添加文档字符串。文档字符串描述了该类的作用,说明它是基于 DistilBert 模型的,具有用于提取式问答任务(如 SQuAD)的分类头部(在隐藏状态输出的基础上进行线性层计算,生成 `span start logits` 和 `span end logits`)。
class FlaxDistilBertForQuestionAnswering(FlaxDistilBertPreTrainedModel):
module_class = FlaxDistilBertForQuestionAnsweringModule
定义了一个新的类 `FlaxDistilBertForQuestionAnswering`,继承自 `FlaxDistilBertPreTrainedModel`。`module_class` 被设置为 `FlaxDistilBertForQuestionAnsweringModule`,用于模型内部的模块处理。
append_call_sample_docstring(
FlaxDistilBertForQuestionAnswering,
_CHECKPOINT_FOR_DOC,
FlaxQuestionAnsweringModelOutput,
_CONFIG_FOR_DOC,
)
这是一个函数调用,用于向 `FlaxDistilBertForQuestionAnswering` 类添加调用示例的文档字符串。它会附加一个关于模型如何调用的示例文档字符串,包括 `_CHECKPOINT_FOR_DOC`(用于模型检查点)、`FlaxQuestionAnsweringModelOutput`(模型输出)和 `_CONFIG_FOR_DOC`(模型配置)。
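下面给出一个抽取式问答的用法示意(检查点与问答文本均为演示假设):
```
from transformers import AutoTokenizer, FlaxDistilBertForQuestionAnswering

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = FlaxDistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")

question = "Who wrote Hamlet?"
context = "Hamlet is a tragedy written by William Shakespeare."
inputs = tokenizer(question, context, return_tensors="np")
outputs = model(**inputs)
start_index = outputs.start_logits.argmax(-1)  # 答案起始位置
end_index = outputs.end_logits.argmax(-1)      # 答案结束位置
```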
.\models\distilbert\modeling_tf_distilbert.py
import warnings
from typing import Optional, Tuple, Union
import numpy as np
import tensorflow as tf
from ...activations_tf import get_tf_activation
from ...modeling_tf_outputs import (
TFBaseModelOutput,
TFMaskedLMOutput,
TFMultipleChoiceModelOutput,
TFQuestionAnsweringModelOutput,
TFSequenceClassifierOutput,
TFTokenClassifierOutput,
)
from ...modeling_tf_utils import (
TFMaskedLanguageModelingLoss,
TFModelInputType,
TFMultipleChoiceLoss,
TFPreTrainedModel,
TFQuestionAnsweringLoss,
TFSequenceClassificationLoss,
TFTokenClassificationLoss,
get_initializer,
keras,
keras_serializable,
unpack_inputs,
)
from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax
from ...utils import (
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
)
from .configuration_distilbert import DistilBertConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "distilbert-base-uncased"
_CONFIG_FOR_DOC = "DistilBertConfig"
TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"distilbert-base-uncased",
"distilbert-base-uncased-distilled-squad",
"distilbert-base-cased",
"distilbert-base-cased-distilled-squad",
"distilbert-base-multilingual-cased",
"distilbert-base-uncased-finetuned-sst-2-english",
]
class TFEmbeddings(keras.layers.Layer):
"""构建由单词、位置和标记类型嵌入组成的嵌入层。"""
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.config = config
self.dim = config.dim
self.initializer_range = config.initializer_range
self.max_position_embeddings = config.max_position_embeddings
self.LayerNorm = keras.layers.LayerNormalization(epsilon=1e-12, name="LayerNorm")
self.dropout = keras.layers.Dropout(rate=config.dropout)
def build(self, input_shape=None):
with tf.name_scope("word_embeddings"):
self.weight = self.add_weight(
name="weight",
shape=[self.config.vocab_size, self.dim],
initializer=get_initializer(initializer_range=self.initializer_range),
)
with tf.name_scope("position_embeddings"):
self.position_embeddings = self.add_weight(
name="embeddings",
shape=[self.max_position_embeddings, self.dim],
initializer=get_initializer(initializer_range=self.initializer_range),
)
if self.built:
return
self.built = True
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.dim])
def call(self, input_ids=None, position_ids=None, inputs_embeds=None, training=False):
"""
Applies embedding based on inputs tensor.
Returns:
final_embeddings (`tf.Tensor`): output embedding tensor.
"""
assert not (input_ids is None and inputs_embeds is None)
if input_ids is not None:
check_embeddings_within_bounds(input_ids, self.config.vocab_size)
inputs_embeds = tf.gather(params=self.weight, indices=input_ids)
input_shape = shape_list(inputs_embeds)[:-1]
if position_ids is None:
position_ids = tf.expand_dims(tf.range(start=0, limit=input_shape[-1]), axis=0)
position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids)
final_embeddings = inputs_embeds + position_embeds
final_embeddings = self.LayerNorm(inputs=final_embeddings)
final_embeddings = self.dropout(inputs=final_embeddings, training=training)
return final_embeddings
class TFMultiHeadSelfAttention(keras.layers.Layer):
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.n_heads = config.n_heads
self.dim = config.dim
self.dropout = keras.layers.Dropout(config.attention_dropout)
self.output_attentions = config.output_attentions
assert self.dim % self.n_heads == 0, f"Hidden size {self.dim} not dividable by number of heads {self.n_heads}"
self.q_lin = keras.layers.Dense(
config.dim, kernel_initializer=get_initializer(config.initializer_range), name="q_lin"
)
self.k_lin = keras.layers.Dense(
config.dim, kernel_initializer=get_initializer(config.initializer_range), name="k_lin"
)
self.v_lin = keras.layers.Dense(
config.dim, kernel_initializer=get_initializer(config.initializer_range), name="v_lin"
)
self.out_lin = keras.layers.Dense(
config.dim, kernel_initializer=get_initializer(config.initializer_range), name="out_lin"
)
self.pruned_heads = set()
self.config = config
def prune_heads(self, heads):
raise NotImplementedError
def call(self, query, key, value, mask, head_mask, output_attentions, training=False):
"""
Parameters:
query: tf.Tensor(bs, seq_length, dim)
key: tf.Tensor(bs, seq_length, dim)
value: tf.Tensor(bs, seq_length, dim)
mask: tf.Tensor(bs, seq_length)
Returns:
weights: tf.Tensor(bs, n_heads, seq_length, seq_length) Attention weights context: tf.Tensor(bs,
seq_length, dim) Contextualized layer. Optional: only if `output_attentions=True`
"""
bs, q_length, dim = shape_list(query)
k_length = shape_list(key)[1]
dim_per_head = int(self.dim / self.n_heads)
dim_per_head = tf.cast(dim_per_head, dtype=tf.int32)
mask_reshape = [bs, 1, 1, k_length]
def shape(x):
"""将张量按照注意力头分离"""
return tf.transpose(tf.reshape(x, (bs, -1, self.n_heads, dim_per_head)), perm=(0, 2, 1, 3))
def unshape(x):
"""将张量的注意力头重新组合"""
return tf.reshape(tf.transpose(x, perm=(0, 2, 1, 3)), (bs, -1, self.n_heads * dim_per_head))
q = shape(self.q_lin(query))
k = shape(self.k_lin(key))
v = shape(self.v_lin(value))
q = tf.cast(q, dtype=tf.float32)
q = tf.multiply(q, tf.math.rsqrt(tf.cast(dim_per_head, dtype=tf.float32)))
k = tf.cast(k, dtype=q.dtype)
scores = tf.matmul(q, k, transpose_b=True)
mask = tf.reshape(mask, mask_reshape)
mask = tf.cast(mask, dtype=scores.dtype)
scores = scores - 1e30 * (1.0 - mask)
weights = stable_softmax(scores, axis=-1)
weights = self.dropout(weights, training=training)
if head_mask is not None:
weights = weights * head_mask
context = tf.matmul(weights, v)
context = unshape(context)
context = self.out_lin(context)
if output_attentions:
return (context, weights)
else:
return (context,)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "q_lin", None) is not None:
with tf.name_scope(self.q_lin.name):
self.q_lin.build([None, None, self.config.dim])
if getattr(self, "k_lin", None) is not None:
with tf.name_scope(self.k_lin.name):
self.k_lin.build([None, None, self.config.dim])
if getattr(self, "v_lin", None) is not None:
with tf.name_scope(self.v_lin.name):
self.v_lin.build([None, None, self.config.dim])
if getattr(self, "out_lin", None) is not None:
with tf.name_scope(self.out_lin.name):
self.out_lin.build([None, None, self.config.dim])
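为了直观理解上面 shape()/unshape() 两个辅助函数对注意力头的拆分与合并,这里给出一个独立的小示例(其中的张量尺寸均为演示假设):
```
import tensorflow as tf

bs, seq_len, dim, n_heads = 2, 5, 12, 3
dim_per_head = dim // n_heads
x = tf.random.normal((bs, seq_len, dim))

# 对应 shape():拆分出注意力头 -> (bs, n_heads, seq_len, dim_per_head)
split = tf.transpose(tf.reshape(x, (bs, -1, n_heads, dim_per_head)), perm=(0, 2, 1, 3))
# 对应 unshape():重新合并注意力头 -> (bs, seq_len, dim)
merged = tf.reshape(tf.transpose(split, perm=(0, 2, 1, 3)), (bs, -1, n_heads * dim_per_head))
print(split.shape, merged.shape)  # (2, 3, 5, 4) (2, 5, 12)
```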
class TFFFN(keras.layers.Layer):
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.dropout = keras.layers.Dropout(config.dropout)
self.lin1 = keras.layers.Dense(
config.hidden_dim, kernel_initializer=get_initializer(config.initializer_range), name="lin1"
)
self.lin2 = keras.layers.Dense(
config.dim, kernel_initializer=get_initializer(config.initializer_range), name="lin2"
)
self.activation = get_tf_activation(config.activation)
self.config = config
def call(self, input, training=False):
x = self.lin1(input)
x = self.activation(x)
x = self.lin2(x)
x = self.dropout(x, training=training)
return x
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "lin1", None) is not None:
with tf.name_scope(self.lin1.name):
self.lin1.build([None, None, self.config.dim])
if getattr(self, "lin2", None) is not None:
with tf.name_scope(self.lin2.name):
self.lin2.build([None, None, self.config.hidden_dim])
class TFTransformerBlock(keras.layers.Layer):
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.n_heads = config.n_heads
self.dim = config.dim
self.hidden_dim = config.hidden_dim
self.dropout = keras.layers.Dropout(config.dropout)
self.activation = config.activation
self.output_attentions = config.output_attentions
assert (
config.dim % config.n_heads == 0
), f"Hidden size {config.dim} not dividable by number of heads {config.n_heads}"
self.attention = TFMultiHeadSelfAttention(config, name="attention")
self.sa_layer_norm = keras.layers.LayerNormalization(epsilon=1e-12, name="sa_layer_norm")
self.ffn = TFFFN(config, name="ffn")
self.output_layer_norm = keras.layers.LayerNormalization(epsilon=1e-12, name="output_layer_norm")
self.config = config
def call(self, x, attn_mask, head_mask, output_attentions, training=False):
"""
Parameters:
x: tf.Tensor(bs, seq_length, dim)
输入张量,形状为(batch_size, 序列长度, 维度)
attn_mask: tf.Tensor(bs, seq_length)
注意力掩码张量,形状为(batch_size, 序列长度),用于屏蔽无效位置的注意力
head_mask: Not used in this function
该参数在本函数中未使用
output_attentions: bool
是否输出注意力权重张量
training: bool, optional
是否处于训练模式,默认为False
Outputs:
sa_weights: tf.Tensor(bs, n_heads, seq_length, seq_length)
注意力权重张量,形状为(batch_size, 注意力头数, 序列长度, 序列长度)
ffn_output: tf.Tensor(bs, seq_length, dim)
变换器块的输出张量,形状为(batch_size, 序列长度, 维度)
"""
sa_output = self.attention(x, x, x, attn_mask, head_mask, output_attentions, training=training)
if output_attentions:
sa_output, sa_weights = sa_output
else:
sa_output = sa_output[0]
sa_output = self.sa_layer_norm(sa_output + x)
ffn_output = self.ffn(sa_output, training=training)
ffn_output = self.output_layer_norm(ffn_output + sa_output)
output = (ffn_output,)
if output_attentions:
output = (sa_weights,) + output
return output
def build(self, input_shape=None):
"""
构建模型的方法,用于初始化相关层的参数和变量。
Parameters:
input_shape: Not used in this function
该参数在本函数中未使用
"""
if self.built:
return
self.built = True
if getattr(self, "attention", None) is not None:
with tf.name_scope(self.attention.name):
self.attention.build(None)
if getattr(self, "sa_layer_norm", None) is not None:
with tf.name_scope(self.sa_layer_norm.name):
self.sa_layer_norm.build([None, None, self.config.dim])
if getattr(self, "ffn", None) is not None:
with tf.name_scope(self.ffn.name):
self.ffn.build(None)
if getattr(self, "output_layer_norm", None) is not None:
with tf.name_scope(self.output_layer_norm.name):
self.output_layer_norm.build([None, None, self.config.dim])
class TFTransformer(keras.layers.Layer):
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.n_layers = config.n_layers
self.output_hidden_states = config.output_hidden_states
self.output_attentions = config.output_attentions
self.layer = [TFTransformerBlock(config, name=f"layer_._{i}") for i in range(config.n_layers)]
def call(self, x, attn_mask, head_mask, output_attentions, output_hidden_states, return_dict, training=False):
"""
Parameters:
x: tf.Tensor(bs, seq_length, dim) 输入序列的嵌入表示
attn_mask: tf.Tensor(bs, seq_length) 序列的注意力掩码
Returns:
hidden_state: tf.Tensor(bs, seq_length, dim)
最后(顶层)层的隐藏状态序列
all_hidden_states: Tuple[tf.Tensor(bs, seq_length, dim)]
长度为n_layers的元组,包含每一层的隐藏状态序列
可选:仅在output_hidden_states=True时返回
all_attentions: Tuple[tf.Tensor(bs, n_heads, seq_length, seq_length)]
长度为n_layers的元组,包含每一层的注意力权重
可选:仅在output_attentions=True时返回
"""
all_hidden_states = () if output_hidden_states else None
all_attentions = () if output_attentions else None
hidden_state = x
for i, layer_module in enumerate(self.layer):
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_state,)
layer_outputs = layer_module(hidden_state, attn_mask, head_mask[i], output_attentions, training=training)
hidden_state = layer_outputs[-1]
if output_attentions:
assert len(layer_outputs) == 2
attentions = layer_outputs[0]
all_attentions = all_attentions + (attentions,)
else:
assert len(layer_outputs) == 1, f"Incorrect number of outputs {len(layer_outputs)} instead of 1"
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_state,)
if not return_dict:
return tuple(v for v in [hidden_state, all_hidden_states, all_attentions] if v is not None)
return TFBaseModelOutput(
last_hidden_state=hidden_state, hidden_states=all_hidden_states, attentions=all_attentions
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "layer", None) is not None:
for layer in self.layer:
with tf.name_scope(layer.name):
layer.build(None)
@keras_serializable
class TFDistilBertMainLayer(keras.layers.Layer):
config_class = DistilBertConfig
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.config = config
self.num_hidden_layers = config.num_hidden_layers
self.output_attentions = config.output_attentions
self.output_hidden_states = config.output_hidden_states
self.return_dict = config.use_return_dict
self.embeddings = TFEmbeddings(config, name="embeddings")
self.transformer = TFTransformer(config, name="transformer")
def get_input_embeddings(self):
return self.embeddings
def set_input_embeddings(self, value):
self.embeddings.weight = value
self.embeddings.vocab_size = value.shape[0]
def _prune_heads(self, heads_to_prune):
raise NotImplementedError
@unpack_inputs
def call(
self,
input_ids=None,
attention_mask=None,
head_mask=None,
inputs_embeds=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
training=False,
):
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
elif input_ids is not None:
input_shape = shape_list(input_ids)
elif inputs_embeds is not None:
input_shape = shape_list(inputs_embeds)[:-1]
else:
raise ValueError("You have to specify either input_ids or inputs_embeds")
if attention_mask is None:
attention_mask = tf.ones(input_shape)
attention_mask = tf.cast(attention_mask, dtype=tf.float32)
if head_mask is not None:
raise NotImplementedError
else:
head_mask = [None] * self.num_hidden_layers
embedding_output = self.embeddings(input_ids, inputs_embeds=inputs_embeds)
tfmr_output = self.transformer(
embedding_output,
attention_mask,
head_mask,
output_attentions,
output_hidden_states,
return_dict,
training=training,
)
return tfmr_output
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embeddings", None) is not None:
with tf.name_scope(self.embeddings.name):
self.embeddings.build(None)
if getattr(self, "transformer", None) is not None:
with tf.name_scope(self.transformer.name):
self.transformer.build(None)
class TFDistilBertPreTrainedModel(TFPreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = DistilBertConfig
base_model_prefix = "distilbert"
DISTILBERT_START_DOCSTRING = r"""
This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
behavior.
<Tip>
TensorFlow models and layers in `transformers` accept two formats as input:
- having all inputs as keyword arguments (like PyTorch models), or
- having all inputs as a list, tuple or dict in the first positional argument.
The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
positional argument:
- a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
- a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
`model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
- a dictionary with one or several input Tensors associated to the input names given in the docstring:
`model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
Note that when creating models and layers with
[subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
about any of this, as you can just pass inputs like you would to any other Python function!
</Tip>
Parameters:
config ([`DistilBertConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
DISTILBERT_INPUTS_DOCSTRING = r"""
# Args: 输入参数说明开始
input_ids (`Numpy array` or `tf.Tensor` of shape `({0})`):
# 输入序列标记在词汇表中的索引
Indices of input sequence tokens in the vocabulary.
# 通过 [`AutoTokenizer`] 可以获取输入的索引。参见 [`PreTrainedTokenizer.__call__`] 和 [`PreTrainedTokenizer.encode`] 获取详细信息。
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and
[`PreTrainedTokenizer.encode`] for details.
# [What are input IDs?](../glossary#input-ids) 输入 ID 是什么?
attention_mask (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*):
# 注意力掩码,用于避免对填充的标记索引执行注意力操作。掩码值在 `[0, 1]` 之间:
# - 1 表示**未掩码**的标记,
# - 0 表示**已掩码**的标记。
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
# [What are attention masks?](../glossary#attention-mask) 注意力掩码是什么?
head_mask (`Numpy array` or `tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
# 自注意力模块中要屏蔽的头部的掩码。掩码值在 `[0, 1]` 之间:
# - 1 表示**未掩码**的头部,
# - 0 表示**已掩码**的头部。
Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
inputs_embeds (`tf.Tensor` of shape `({0}, hidden_size)`, *optional*):
# 可选项,可以直接传递嵌入表示而不是传递 `input_ids`。如果希望更好地控制如何将 `input_ids` 索引转换为关联向量,这很有用。
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
model's internal embedding lookup matrix.
output_attentions (`bool`, *optional*):
# 是否返回所有注意力层的注意力张量。查看返回张量中的 `attentions` 获取更多细节。此参数仅在 eager 模式下使用,在图模式下将使用配置中的值。
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
config will be used instead.
output_hidden_states (`bool`, *optional*):
# 是否返回所有层的隐藏状态。查看返回张量中的 `hidden_states` 获取更多细节。此参数仅在 eager 模式下使用,在图模式下将使用配置中的值。
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
used instead.
return_dict (`bool`, *optional*):
# 是否返回 [`~utils.ModelOutput`] 而不是普通元组。此参数可以在 eager 模式下使用,在图模式下该值将始终为 True。
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in
eager mode, in graph mode the value will always be set to True.
training (`bool`, *optional*, defaults to `False`):
# 是否使用模型处于训练模式(某些模块如 dropout 在训练和评估之间有不同的行为)。
Whether or not to use the model in training mode (some modules like dropout modules have different
behaviors between training and evaluation).
"""
TFDistilBertModel 类定义了一个基于 DistilBERT 模型的编码器/转换器,不添加特定的输出头部。
@parameters:config - DistilBERT 模型的配置
*inputs - 输入参数
**kwargs - 额外的关键字参数
@returns:DistilBERT 模型的输出结果
"""
@add_start_docstrings(
"The bare DistilBERT encoder/transformer outputting raw hidden-states without any specific head on top.",
DISTILBERT_START_DOCSTRING,
)
class TFDistilBertModel(TFDistilBertPreTrainedModel):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.distilbert = TFDistilBertMainLayer(config, name="distilbert") # Embeddings
@unpack_inputs
@add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFBaseModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: Optional[bool] = False,
) -> Union[TFBaseModelOutput, Tuple[tf.Tensor]]:
outputs = self.distilbert(
input_ids=input_ids,
attention_mask=attention_mask,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "distilbert", None) is not None:
with tf.name_scope(self.distilbert.name):
self.distilbert.build(None)
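下面是 TFDistilBertModel 的一个最小用法示意(检查点与输入文本均为演示假设):
```
from transformers import AutoTokenizer, TFDistilBertModel

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = TFDistilBertModel.from_pretrained("distilbert-base-uncased")

inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
outputs = model(**inputs)
last_hidden_state = outputs.last_hidden_state  # 形状为 (batch_size, seq_length, dim)
```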
class TFDistilBertLMHead(keras.layers.Layer):
"""
TFDistilBertLMHead 类定义了 DistilBERT 的语言模型头部。
@parameters:config - DistilBERT 的配置
input_embeddings - 输入的嵌入层
"""
def __init__(self, config, input_embeddings, **kwargs):
super().__init__(**kwargs)
self.config = config
self.dim = config.dim
# The output weights are the same as the input embeddings, but there is
# an output-only bias for each token.
self.input_embeddings = input_embeddings
def build(self, input_shape):
"""
建立语言模型头部的权重。
@parameters:input_shape - 输入形状
"""
self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias")
super().build(input_shape)
def get_output_embeddings(self):
"""
获取输出嵌入层。
@returns:输入嵌入层
"""
return self.input_embeddings
def set_output_embeddings(self, value):
"""
设置输出嵌入层。
@parameters:value - 新的嵌入层权重
"""
self.input_embeddings.weight = value
self.input_embeddings.vocab_size = shape_list(value)[0]
def get_bias(self):
"""
获取偏置项。
@returns:偏置项字典
"""
return {"bias": self.bias}
def set_bias(self, value):
"""
设置偏置项。
@parameters:value - 新的偏置项值
"""
self.bias = value["bias"]
self.config.vocab_size = shape_list(value["bias"])[0]
# 定义一个方法 `call`,接受 `hidden_states` 参数
def call(self, hidden_states):
# 获取 `hidden_states` 张量的序列长度
seq_length = shape_list(tensor=hidden_states)[1]
# 将 `hidden_states` 张量重塑为二维张量,形状为 [-1, self.dim]
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.dim])
# 对重塑后的张量 `hidden_states` 与模型的输入嵌入权重矩阵进行矩阵乘法,转置模型的输入嵌入权重矩阵
hidden_states = tf.matmul(a=hidden_states, b=self.input_embeddings.weight, transpose_b=True)
# 将矩阵乘法的结果重新塑形为三维张量,形状为 [-1, seq_length, self.config.vocab_size]
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.config.vocab_size])
# 在张量 `hidden_states` 上添加偏置项,偏置项为模型的偏置
hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias)
# 返回处理后的张量 `hidden_states`
return hidden_states
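下面用一个独立的小例子演示上述 call 中"与输入嵌入矩阵共享权重"的投影方式(vocab_size、dim 等数值仅为演示假设):
```
import tensorflow as tf

vocab_size, dim, seq_len = 10, 4, 3
embedding_weight = tf.random.normal((vocab_size, dim))  # 对应 input_embeddings.weight
bias = tf.zeros((vocab_size,))                          # 对应 LM 头部的 bias
hidden_states = tf.random.normal((2, seq_len, dim))

flat = tf.reshape(hidden_states, [-1, dim])
logits = tf.matmul(flat, embedding_weight, transpose_b=True)  # 与嵌入矩阵的转置相乘
logits = tf.reshape(logits, [-1, seq_len, vocab_size])
logits = tf.nn.bias_add(logits, bias)
print(logits.shape)  # (2, 3, 10)
```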
# 添加模型文档字符串,描述该类为带有 `masked language modeling` 头部的 DistilBERT 模型
@add_start_docstrings(
"""DistilBert Model with a `masked language modeling` head on top.""",
DISTILBERT_START_DOCSTRING,
)
class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel, TFMaskedLanguageModelingLoss):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.config = config
# 初始化 DistilBERT 主层
self.distilbert = TFDistilBertMainLayer(config, name="distilbert")
# 创建词汇转换层,用于预测词汇的分布
self.vocab_transform = keras.layers.Dense(
config.dim, kernel_initializer=get_initializer(config.initializer_range), name="vocab_transform"
)
# 获取激活函数并应用于模型
self.act = get_tf_activation(config.activation)
# 添加词汇层归一化层
self.vocab_layer_norm = keras.layers.LayerNormalization(epsilon=1e-12, name="vocab_layer_norm")
# 初始化 DistilBERT 语言模型头部
self.vocab_projector = TFDistilBertLMHead(config, self.distilbert.embeddings, name="vocab_projector")
def get_lm_head(self):
# 返回语言模型头部
return self.vocab_projector
def get_prefix_bias_name(self):
# 警告:方法 get_prefix_bias_name 已废弃,请使用 `get_bias` 替代
warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning)
# 返回带有语言模型头部名字的前缀
return self.name + "/" + self.vocab_projector.name
@unpack_inputs
@add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFMaskedLMOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: np.ndarray | tf.Tensor | None = None,
training: Optional[bool] = False,
# 神经网络模型的前向传播函数,用于执行推断或训练步骤
) -> Union[TFMaskedLMOutput, Tuple[tf.Tensor]]:
r"""
labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
"""
distilbert_output = self.distilbert(
input_ids=input_ids,
attention_mask=attention_mask,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
# 提取DistilBERT模型的输出隐藏状态
hidden_states = distilbert_output[0] # (bs, seq_length, dim)
# 先通过 vocab_transform 对隐藏状态做线性变换(输出仍为 dim 维,尚未映射到词汇表大小)
prediction_logits = self.vocab_transform(hidden_states) # (bs, seq_length, dim)
# 应用激活函数到预测的logits
prediction_logits = self.act(prediction_logits) # (bs, seq_length, dim)
# 对预测的logits进行层归一化
prediction_logits = self.vocab_layer_norm(prediction_logits) # (bs, seq_length, dim)
# 投影到词汇表维度的空间
prediction_logits = self.vocab_projector(prediction_logits)
# 如果没有提供标签,则损失为None;否则使用预测的logits计算损失
loss = None if labels is None else self.hf_compute_loss(labels, prediction_logits)
# 如果不要求返回字典,则返回一组输出
if not return_dict:
output = (prediction_logits,) + distilbert_output[1:]
return ((loss,) + output) if loss is not None else output
# 返回带有命名属性的TFMaskedLMOutput对象,包括损失、logits、隐藏状态和注意力权重
return TFMaskedLMOutput(
loss=loss,
logits=prediction_logits,
hidden_states=distilbert_output.hidden_states,
attentions=distilbert_output.attentions,
)
def build(self, input_shape=None):
# 如果已经构建过模型,则直接返回
if self.built:
return
# 标记模型已经构建
self.built = True
# 如果存在DistilBERT模型,则构建它
if getattr(self, "distilbert", None) is not None:
with tf.name_scope(self.distilbert.name):
self.distilbert.build(None)
# 如果存在词汇转换层,则构建它
if getattr(self, "vocab_transform", None) is not None:
with tf.name_scope(self.vocab_transform.name):
self.vocab_transform.build([None, None, self.config.dim])
# 如果存在词汇层归一化,则构建它
if getattr(self, "vocab_layer_norm", None) is not None:
with tf.name_scope(self.vocab_layer_norm.name):
self.vocab_layer_norm.build([None, None, self.config.dim])
# 如果存在词汇投影层,则构建它
if getattr(self, "vocab_projector", None) is not None:
with tf.name_scope(self.vocab_projector.name):
self.vocab_projector.build(None)
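下面是 TFDistilBertForMaskedLM 的一个用法示意(检查点与文本均为演示假设,预测结果仅供参考):
```
import tensorflow as tf
from transformers import AutoTokenizer, TFDistilBertForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = TFDistilBertForMaskedLM.from_pretrained("distilbert-base-uncased")

inputs = tokenizer("Paris is the [MASK] of France.", return_tensors="tf")
logits = model(**inputs).logits
mask_index = tf.where(inputs["input_ids"][0] == tokenizer.mask_token_id)[0, 0]
predicted_id = int(tf.argmax(logits[0, mask_index]))
print(tokenizer.decode([predicted_id]))  # 预期输出类似 "capital"
```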
@add_start_docstrings(
"""
DistilBert Model transformer with a sequence classification/regression head on top (a linear layer on top of the
pooled output) e.g. for GLUE tasks.
""",
DISTILBERT_START_DOCSTRING,
)
class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel, TFSequenceClassificationLoss):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels # 初始化分类标签数量
self.distilbert = TFDistilBertMainLayer(config, name="distilbert") # 初始化 DistilBERT 主层
# 预分类器,用于准备输入特征
self.pre_classifier = keras.layers.Dense(
config.dim,
kernel_initializer=get_initializer(config.initializer_range),
activation="relu",
name="pre_classifier",
)
# 分类器,用于分类任务,输出为 num_labels 个类别
self.classifier = keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
# Dropout 层,用于防止过拟合
self.dropout = keras.layers.Dropout(config.seq_classif_dropout)
self.config = config # 保存配置信息
@unpack_inputs
@add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFSequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: np.ndarray | tf.Tensor | None = None,
training: Optional[bool] = False,
) -> Union[TFSequenceClassifierOutput, Tuple[tf.Tensor]]:
r"""
labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
# 调用 DistilBERT 模型处理输入数据,获取模型输出
distilbert_output = self.distilbert(
input_ids=input_ids, # 输入的 token IDs
attention_mask=attention_mask, # 注意力掩码,指示哪些 token 是填充的
head_mask=head_mask, # 头部掩码,用于控制哪些注意力头部是有效的
inputs_embeds=inputs_embeds, # 嵌入的输入张量
output_attentions=output_attentions, # 是否输出注意力权重
output_hidden_states=output_hidden_states, # 是否输出隐藏状态
return_dict=return_dict, # 是否以字典形式返回结果
training=training, # 是否处于训练模式
)
hidden_state = distilbert_output[0] # 获取 DistilBERT 输出的隐藏状态 (bs, seq_len, dim)
pooled_output = hidden_state[:, 0] # 获取池化的输出 (bs, dim)
pooled_output = self.pre_classifier(pooled_output) # 对池化输出进行预分类
pooled_output = self.dropout(pooled_output, training=training) # 对预分类输出进行 dropout 处理
logits = self.classifier(pooled_output) # 使用分类器获取 logits (bs, num_labels)
loss = None if labels is None else self.hf_compute_loss(labels, logits) # 计算损失,若无标签则为 None
if not return_dict:
output = (logits,) + distilbert_output[1:] # 如果不返回字典,则输出 logits 和其他 DistilBERT 输出
return ((loss,) + output) if loss is not None else output # 返回损失和输出或者仅输出
# 返回 TFSequenceClassifierOutput 对象,包括损失、logits、隐藏状态和注意力权重
return TFSequenceClassifierOutput(
loss=loss,
logits=logits,
hidden_states=distilbert_output.hidden_states,
attentions=distilbert_output.attentions,
)
def build(self, input_shape=None):
if self.built:
return # 如果模型已经构建过,则直接返回
self.built = True # 标记模型已经构建
if getattr(self, "distilbert", None) is not None:
with tf.name_scope(self.distilbert.name):
self.distilbert.build(None) # 构建 DistilBERT 模型
if getattr(self, "pre_classifier", None) is not None:
with tf.name_scope(self.pre_classifier.name):
self.pre_classifier.build([None, None, self.config.dim]) # 构建预分类器模型
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.dim]) # 构建分类器模型
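下面的用法示意使用了上面模型列表中给出的 SST-2 微调检查点(示例文本为演示假设):
```
import tensorflow as tf
from transformers import AutoTokenizer, TFDistilBertForSequenceClassification

ckpt = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(ckpt)
model = TFDistilBertForSequenceClassification.from_pretrained(ckpt)

inputs = tokenizer("I really enjoyed this film!", return_tensors="tf")
logits = model(**inputs).logits       # 形状为 (batch_size, num_labels)
pred = int(tf.argmax(logits, axis=-1)[0])
print(model.config.id2label[pred])    # 预期输出 "POSITIVE"
```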
@add_start_docstrings(
"""
DistilBert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g.
for Named-Entity-Recognition (NER) tasks.
""",
DISTILBERT_START_DOCSTRING,
)
class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel, TFTokenClassificationLoss):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
# 设置分类任务的标签数量
self.num_labels = config.num_labels
# 初始化 DistilBERT 主层
self.distilbert = TFDistilBertMainLayer(config, name="distilbert")
# Dropout 层,用于防止过拟合
self.dropout = keras.layers.Dropout(config.dropout)
# 分类器,输出层,用于将隐藏状态输出映射到标签空间
self.classifier = keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFTokenClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: np.ndarray | tf.Tensor | None = None,
training: Optional[bool] = False,
) -> Union[TFTokenClassifierOutput, Tuple[tf.Tensor]]:
r"""
labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
# 调用 DistilBERT 主层,获取模型的输出
outputs = self.distilbert(
input_ids=input_ids,
attention_mask=attention_mask,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
# 获取序列输出
sequence_output = outputs[0]
# 应用 Dropout 层以防止过拟合
sequence_output = self.dropout(sequence_output, training=training)
# 将 Dropout 后的序列输出传入分类器,得到预测的 logits
logits = self.classifier(sequence_output)
# 如果有标签,计算损失值
loss = None if labels is None else self.hf_compute_loss(labels, logits)
# 如果不要求返回字典,则返回元组形式的输出
if not return_dict:
output = (logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
# 如果要求返回字典,则返回 TFTokenClassifierOutput 格式的输出
return TFTokenClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
# 如果模型已经构建完成,则直接返回,不进行重复构建
if self.built:
return
# 标记模型为已构建状态
self.built = True
# 如果模型包含名为'distilbert'的属性且不为None,则构建distilbert部分
if getattr(self, "distilbert", None) is not None:
# 在TensorFlow中使用名称作用域来管理命名空间,这里创建distilbert的名称作用域
with tf.name_scope(self.distilbert.name):
# 调用distilbert的build方法来构建模型
self.distilbert.build(None)
# 如果模型包含名为'classifier'的属性且不为None,则构建classifier部分
if getattr(self, "classifier", None) is not None:
# 在TensorFlow中使用名称作用域来管理命名空间,这里创建classifier的名称作用域
with tf.name_scope(self.classifier.name):
# 调用classifier的build方法来构建模型,传入输入形状作为参数
self.classifier.build([None, None, self.config.hidden_size])
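下面是 TFDistilBertForTokenClassification 的一个用法示意(num_labels=9 仅为演示假设,该检查点的分类头未经微调,预测结果无实际意义):
```
import tensorflow as tf
from transformers import AutoTokenizer, TFDistilBertForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
# num_labels=9 仅为演示(例如 CoNLL-2003 的标签数),分类头权重为随机初始化
model = TFDistilBertForTokenClassification.from_pretrained("distilbert-base-uncased", num_labels=9)

inputs = tokenizer("HuggingFace is based in New York City", return_tensors="tf")
logits = model(**inputs).logits        # 形状为 (batch_size, seq_length, num_labels)
pred_ids = tf.argmax(logits, axis=-1)  # 每个 token 的预测标签 id
```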
@add_start_docstrings(
"""
DistilBert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and
a softmax) e.g. for RocStories/SWAG tasks.
""",
DISTILBERT_START_DOCSTRING,
)
class TFDistilBertForMultipleChoice(TFDistilBertPreTrainedModel, TFMultipleChoiceLoss):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
# 初始化 DistilBERT 主层,作为模型的主体部分
self.distilbert = TFDistilBertMainLayer(config, name="distilbert")
# Dropout 层,用于随机断开输入神经元,防止过拟合
self.dropout = keras.layers.Dropout(config.seq_classif_dropout)
# 预分类器层,包含一个 Dense 层用于降维和激活函数为 ReLU
self.pre_classifier = keras.layers.Dense(
config.dim,
kernel_initializer=get_initializer(config.initializer_range),
activation="relu",
name="pre_classifier",
)
# 分类器层,输出为单个值,用于多选题的分类
self.classifier = keras.layers.Dense(
1, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
# 存储配置信息
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(
DISTILBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")
)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFMultipleChoiceModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: np.ndarray | tf.Tensor | None = None,
training: Optional[bool] = False,
) -> Union[TFMultipleChoiceModelOutput, Tuple[tf.Tensor]]:
r"""
labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices]`
where `num_choices` is the size of the second dimension of the input tensors. (See `input_ids` above)
"""
# 如果提供了 input_ids,则获取 num_choices 和 seq_length
if input_ids is not None:
num_choices = shape_list(input_ids)[1] # 获取 input_ids 的第二维大小,即选项数
seq_length = shape_list(input_ids)[2] # 获取 input_ids 的第三维大小,即序列长度
else:
num_choices = shape_list(inputs_embeds)[1] # 获取 inputs_embeds 的第二维大小,即选项数
seq_length = shape_list(inputs_embeds)[2] # 获取 inputs_embeds 的第三维大小,即序列长度
# 将 input_ids 展开成二维张量,如果 input_ids 不为 None
flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) if input_ids is not None else None
# 将 attention_mask 展开成二维张量,如果 attention_mask 不为 None
flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None
# 将 inputs_embeds 展开成三维张量,如果 inputs_embeds 不为 None
flat_inputs_embeds = (
tf.reshape(inputs_embeds, (-1, seq_length, shape_list(inputs_embeds)[3]))
if inputs_embeds is not None
else None
)
# 调用 DistilBERT 模型进行前向传播,获取输出
distilbert_output = self.distilbert(
flat_input_ids,
flat_attention_mask,
head_mask,
flat_inputs_embeds,
output_attentions,
output_hidden_states,
return_dict=return_dict,
training=training,
)
hidden_state = distilbert_output[0] # 获取 DistilBERT 输出的隐藏状态 (bs, seq_len, dim)
pooled_output = hidden_state[:, 0] # 获取隐藏状态的首个位置,作为池化输出 (bs, dim)
pooled_output = self.pre_classifier(pooled_output) # 经过预分类器处理 (bs, dim)
pooled_output = self.dropout(pooled_output, training=training) # 应用 dropout (bs, dim)
logits = self.classifier(pooled_output) # 经过分类器处理,得到预测 logits
reshaped_logits = tf.reshape(logits, (-1, num_choices)) # 重新调整 logits 的形状为 (batch_size, num_choices)
# 计算损失,如果提供了 labels
loss = None if labels is None else self.hf_compute_loss(labels, reshaped_logits)
# 如果不需要返回字典形式的输出,则返回元组形式的输出
if not return_dict:
output = (reshaped_logits,) + distilbert_output[1:]
return ((loss,) + output) if loss is not None else output
# 如果需要返回字典形式的输出,则返回 TFMultipleChoiceModelOutput 对象
return TFMultipleChoiceModelOutput(
loss=loss,
logits=reshaped_logits,
hidden_states=distilbert_output.hidden_states,
attentions=distilbert_output.attentions,
)
def build(self, input_shape=None):
# 如果已经构建过模型,则直接返回
if self.built:
return
self.built = True
# 如果模型中包含 DistilBERT 层,则构建 DistilBERT 层
if getattr(self, "distilbert", None) is not None:
with tf.name_scope(self.distilbert.name):
self.distilbert.build(None)
# 如果模型中包含预分类器层,则构建预分类器层
if getattr(self, "pre_classifier", None) is not None:
with tf.name_scope(self.pre_classifier.name):
self.pre_classifier.build([None, None, self.config.dim])
# 如果模型中包含分类器层,则构建分类器层
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.dim])
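下面用一个独立的小示例说明多项选择任务中输入展开与 logits 还原的形状变化(数值均为演示假设):
```
import tensorflow as tf

batch_size, num_choices, seq_length = 2, 4, 7
input_ids = tf.zeros((batch_size, num_choices, seq_length), dtype=tf.int32)

# 对应 call 中的展开:每个选项作为一条独立序列送入 DistilBERT
flat_input_ids = tf.reshape(input_ids, (-1, seq_length))   # (8, 7)
# 分类器对每条序列输出一个分数,再还原成 (batch_size, num_choices)
logits = tf.zeros((batch_size * num_choices, 1))
reshaped_logits = tf.reshape(logits, (-1, num_choices))     # (2, 4)
print(flat_input_ids.shape, reshaped_logits.shape)
```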
@add_start_docstrings(
"""
DistilBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a
linear layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
DISTILBERT_START_DOCSTRING,
)
class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel, TFQuestionAnsweringLoss):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
# 初始化 DistilBERT 主层,使用给定的配置和名称
self.distilbert = TFDistilBertMainLayer(config, name="distilbert")
# 初始化输出层,一个全连接层用于预测起始和结束位置的 logits
self.qa_outputs = keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
)
# 断言确保标签数目为2,用于检查是否正确配置了模型
assert config.num_labels == 2, f"Incorrect number of labels {config.num_labels} instead of 2"
# 初始化 dropout 层,用于在训练时进行随机失活
self.dropout = keras.layers.Dropout(config.qa_dropout)
# 保存配置对象到实例中
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFQuestionAnsweringModelOutput,
config_class=_CONFIG_FOR_DOC,
)
# 模型的前向传播方法,接受多个输入参数并返回输出
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
start_positions: np.ndarray | tf.Tensor | None = None,
end_positions: np.ndarray | tf.Tensor | None = None,
training: Optional[bool] = False,
) -> Union[TFQuestionAnsweringModelOutput, Tuple[tf.Tensor]]:
r"""
start_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
end_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
"""
# 获取 DistilBERT 的输出,包括隐藏状态和注意力权重等
distilbert_output = self.distilbert(
input_ids=input_ids,
attention_mask=attention_mask,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
hidden_states = distilbert_output[0] # (bs, max_query_len, dim)
# 对隐藏状态应用 dropout,用于防止过拟合
hidden_states = self.dropout(hidden_states, training=training) # (bs, max_query_len, dim)
# 通过线性层计算起始和结束位置的 logits
logits = self.qa_outputs(hidden_states) # (bs, max_query_len, 2)
start_logits, end_logits = tf.split(logits, 2, axis=-1)
start_logits = tf.squeeze(start_logits, axis=-1)
end_logits = tf.squeeze(end_logits, axis=-1)
loss = None
# 如果给定了起始和结束位置,则计算损失
if start_positions is not None and end_positions is not None:
labels = {"start_position": start_positions}
labels["end_position"] = end_positions
loss = self.hf_compute_loss(labels, (start_logits, end_logits))
# 如果不要求返回字典,则根据是否存在损失返回相应的输出
if not return_dict:
output = (start_logits, end_logits) + distilbert_output[1:]
return ((loss,) + output) if loss is not None else output
# 返回 TFQuestionAnsweringModelOutput 类的对象,包含损失、起始和结束 logits、隐藏状态和注意力权重
return TFQuestionAnsweringModelOutput(
loss=loss,
start_logits=start_logits,
end_logits=end_logits,
hidden_states=distilbert_output.hidden_states,
attentions=distilbert_output.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
# 如果 DistilBERT 存在,则构建其层次结构
if getattr(self, "distilbert", None) is not None:
with tf.name_scope(self.distilbert.name):
self.distilbert.build(None)
# 如果 QA 输出层存在,则构建其层次结构
if getattr(self, "qa_outputs", None) is not None:
with tf.name_scope(self.qa_outputs.name):
self.qa_outputs.build([None, None, self.config.dim])
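最后给出 TFDistilBertForQuestionAnswering 的一个用法示意,使用上面模型列表中给出的 SQuAD 蒸馏检查点(问答文本为演示假设):
```
import tensorflow as tf
from transformers import AutoTokenizer, TFDistilBertForQuestionAnswering

ckpt = "distilbert-base-uncased-distilled-squad"
tokenizer = AutoTokenizer.from_pretrained(ckpt)
model = TFDistilBertForQuestionAnswering.from_pretrained(ckpt)

question = "Who wrote Hamlet?"
context = "Hamlet is a tragedy written by William Shakespeare."
inputs = tokenizer(question, context, return_tensors="tf")
outputs = model(**inputs)
start = int(tf.argmax(outputs.start_logits, axis=-1)[0])
end = int(tf.argmax(outputs.end_logits, axis=-1)[0])
print(tokenizer.decode(inputs["input_ids"][0, start : end + 1]))  # 预期输出类似 "william shakespeare"
```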