Transformers 源码解析（七十八）

`.\models\mobilevit\__init__.py`

# 引入类型检查工具，用于类型检查
from typing import TYPE_CHECKING

# 从当前包中的工具模块导入相关依赖
from ...utils import (
    OptionalDependencyNotAvailable,
    _LazyModule,
    is_tf_available,
    is_torch_available,
    is_vision_available,
)

# Maps each submodule name to the public names it exports. The mapping is handed to `_LazyModule` at the bottom of
# this file. The configuration entries are registered unconditionally.
_import_structure = {
    "configuration_mobilevit": ["MOBILEVIT_PRETRAINED_CONFIG_ARCHIVE_MAP", "MobileViTConfig", "MobileViTOnnxConfig"],
}

# Register the feature-extraction / image-processing submodules only when `is_vision_available()` is true.
# The raise/except/else pattern silently skips registration when the optional dependency is missing.
try:
    if not is_vision_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # Vision support is available: expose the feature extractor and the image processor.
    _import_structure["feature_extraction_mobilevit"] = ["MobileViTFeatureExtractor"]
    _import_structure["image_processing_mobilevit"] = ["MobileViTImageProcessor"]

# Register the PyTorch modeling submodule only when `is_torch_available()` is true.
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # PyTorch is available: expose the PyTorch model classes and archive list.
    _import_structure["modeling_mobilevit"] = [
        "MOBILEVIT_PRETRAINED_MODEL_ARCHIVE_LIST",
        "MobileViTForImageClassification",
        "MobileViTForSemanticSegmentation",
        "MobileViTModel",
        "MobileViTPreTrainedModel",
    ]

# Register the TensorFlow modeling submodule only when `is_tf_available()` is true.
try:
    if not is_tf_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # TensorFlow is available: expose the TensorFlow model classes and archive list.
    _import_structure["modeling_tf_mobilevit"] = [
        "TF_MOBILEVIT_PRETRAINED_MODEL_ARCHIVE_LIST",
        "TFMobileViTForImageClassification",
        "TFMobileViTForSemanticSegmentation",
        "TFMobileViTModel",
        "TFMobileViTPreTrainedModel",
    ]

# Under static type checking, import every public name eagerly (guarded by the same optional-dependency checks used
# to build `_import_structure`). At runtime the `else` branch installs a `_LazyModule` instead.
if TYPE_CHECKING:
    from .configuration_mobilevit import MOBILEVIT_PRETRAINED_CONFIG_ARCHIVE_MAP, MobileViTConfig, MobileViTOnnxConfig

    try:
        if not is_vision_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # Vision support is available: import the feature extractor and the image processor.
        from .feature_extraction_mobilevit import MobileViTFeatureExtractor
        from .image_processing_mobilevit import MobileViTImageProcessor

    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # PyTorch is available: import the PyTorch model classes and archive list.
        from .modeling_mobilevit import (
            MOBILEVIT_PRETRAINED_MODEL_ARCHIVE_LIST,
            MobileViTForImageClassification,
            MobileViTForSemanticSegmentation,
            MobileViTModel,
            MobileViTPreTrainedModel,
        )
    # Same guard for the TensorFlow implementation.
    try:
        if not is_tf_available():
            raise OptionalDependencyNotAvailable()
    # A missing TensorFlow install is not an error; the TF names are simply not imported.
    except OptionalDependencyNotAvailable:
        pass
    else:
        # TensorFlow is available: import the TensorFlow archive list, the two task heads
        # (image classification, semantic segmentation), the base model and the pretrained base class.
        from .modeling_tf_mobilevit import (
            TF_MOBILEVIT_PRETRAINED_MODEL_ARCHIVE_LIST,
            TFMobileViTForImageClassification,
            TFMobileViTForSemanticSegmentation,
            TFMobileViTModel,
            TFMobileViTPreTrainedModel,
        )
else:
    # `sys` is needed to swap this module's entry in `sys.modules`.
    import sys

    # Replace this module object with a `_LazyModule` built from `_import_structure`.
    # Presumably `_LazyModule` defers importing each submodule until one of its names is first accessed
    # (its implementation is not visible in this file).
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)

`.\models\mobilevitv2\configuration_mobilevitv2.py`

# coding=utf-8
# 文件编码声明，指定使用UTF-8编码格式

# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
# 版权声明，版权归HuggingFace Inc.团队所有

# Licensed under the Apache License, Version 2.0 (the "License");
# 按照Apache许可证版本2.0授权许可

# you may not use this file except in compliance with the License.
# 除非符合许可证要求，否则不得使用本文件

# You may obtain a copy of the License at
# 您可以在以下网址获取许可证副本

#     http://www.apache.org/licenses/LICENSE-2.0
# http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# 除非适用法律要求或书面同意，否则软件

# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# 根据"原样"分发许可证，无论是明示还是暗示，不带任何形式的担保或条件

# See the License for the specific language governing permissions and
# 请查阅许可证，获取详细的权限说明及限制条款

# limitations under the License.
# 许可证下的限制条件

""" MobileViTV2 model configuration"""
# 文档字符串，指明本代码是关于MobileViTV2模型配置的

from collections import OrderedDict
# 导入OrderedDict类，用于创建有序字典

from typing import Mapping
# 导入Mapping类型提示，用于类型注解

from packaging import version
# 导入version模块，用于处理版本信息

from ...configuration_utils import PretrainedConfig
# 从...configuration_utils中导入PretrainedConfig类，用于继承模型配置

from ...onnx import OnnxConfig
# 从...onnx中导入OnnxConfig类，用于处理ONNX配置

from ...utils import logging
# 从...utils中导入logging模块，用于日志记录

# Module-level logger named after this module.
logger = logging.get_logger(__name__)

# Maps a pretrained checkpoint name to the URL of its hosted `config.json`.
MOBILEVITV2_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "apple/mobilevitv2-1.0": "https://huggingface.co/apple/mobilevitv2-1.0/resolve/main/config.json",
}

class MobileViTV2Config(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`MobileViTV2Model`]. It is used to instantiate a
    MobileViTV2 model according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the MobileViTV2
    [apple/mobilevitv2-1.0](https://huggingface.co/apple/mobilevitv2-1.0) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        num_channels (`int`, *optional*, defaults to 3):
            The number of input channels.
        image_size (`int`, *optional*, defaults to 256):
            The size (resolution) of each image.
        patch_size (`int`, *optional*, defaults to 2):
            The size (resolution) of each patch.
        expand_ratio (`float`, *optional*, defaults to 2.0):
            Expansion factor for the MobileNetv2 layers.
        hidden_act (`str` or `function`, *optional*, defaults to `"swish"`):
            The non-linear activation function (function or string) in the Transformer encoder and convolution layers.
        conv_kernel_size (`int`, *optional*, defaults to 3):
            The size of the convolutional kernel in the MobileViTV2 layer.
        output_stride (`int`, *optional*, defaults to 32):
            The ratio of the spatial resolution of the output to the resolution of the input image.
        classifier_dropout_prob (`float`, *optional*, defaults to 0.1):
            The dropout ratio for the attached classifier.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the layer normalization layers.
        aspp_out_channels (`int`, *optional*, defaults to 512):
            Number of output channels used in the ASPP layer for semantic segmentation.
        atrous_rates (`List[int]`, *optional*, defaults to `[6, 12, 18]`):
            Dilation (atrous) factors used in the ASPP layer for semantic segmentation.
        aspp_dropout_prob (`float`, *optional*, defaults to 0.1):
            The dropout ratio for the ASPP layer for semantic segmentation.
        semantic_loss_ignore_index (`int`, *optional*, defaults to 255):
            The index that is ignored by the loss function of the semantic segmentation model.
        n_attn_blocks (`List[int]`, *optional*, defaults to `[2, 4, 3]`):
            The number of attention blocks in each MobileViTV2Layer.
        base_attn_unit_dims (`List[int]`, *optional*, defaults to `[128, 192, 256]`):
            The base multiplier for the dimensions of the attention blocks in each MobileViTV2Layer.
        width_multiplier (`float`, *optional*, defaults to 1.0):
            The width multiplier for MobileViTV2.
        ffn_multiplier (`int`, *optional*, defaults to 2):
            The FFN multiplier for MobileViTV2.
        attn_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio in the attention layer.
        ffn_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio between FFN layers.

    Example:

    ```python
    >>> from transformers import MobileViTV2Config, MobileViTV2Model

    >>> # Initializing a mobilevitv2-small style configuration
    >>> configuration = MobileViTV2Config()

    >>> # Initializing a model from the mobilevitv2-small style configuration
    >>> model = MobileViTV2Model(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "mobilevitv2"

    def __init__(
        self,
        num_channels=3,
        image_size=256,
        patch_size=2,
        expand_ratio=2.0,
        hidden_act="swish",
        conv_kernel_size=3,
        output_stride=32,
        classifier_dropout_prob=0.1,
        initializer_range=0.02,
        layer_norm_eps=1e-5,
        aspp_out_channels=512,
        atrous_rates=[6, 12, 18],
        aspp_dropout_prob=0.1,
        semantic_loss_ignore_index=255,
        n_attn_blocks=[2, 4, 3],
        base_attn_unit_dims=[128, 192, 256],
        width_multiplier=1.0,
        ffn_multiplier=2,
        attn_dropout=0.0,
        ffn_dropout=0.0,
        **kwargs,
    ):
        # Remaining keyword arguments are handled by the base configuration class.
        super().__init__(**kwargs)

        self.num_channels = num_channels
        self.image_size = image_size
        self.patch_size = patch_size
        self.expand_ratio = expand_ratio
        self.hidden_act = hidden_act
        self.conv_kernel_size = conv_kernel_size
        self.output_stride = output_stride
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
        # List-valued settings are copied so instances never share (or mutate) the default list objects.
        self.n_attn_blocks = list(n_attn_blocks)
        self.base_attn_unit_dims = list(base_attn_unit_dims)
        self.width_multiplier = width_multiplier
        self.ffn_multiplier = ffn_multiplier
        self.ffn_dropout = ffn_dropout
        self.attn_dropout = attn_dropout
        self.classifier_dropout_prob = classifier_dropout_prob

        # Decode head attributes for semantic segmentation
        self.aspp_out_channels = aspp_out_channels
        self.atrous_rates = list(atrous_rates)
        self.aspp_dropout_prob = aspp_dropout_prob
        self.semantic_loss_ignore_index = semantic_loss_ignore_index
# ONNX export configuration for MobileViTV2, built on top of `OnnxConfig`.
class MobileViTV2OnnxConfig(OnnxConfig):
    """Describes the input/output axes and validation tolerance used when exporting MobileViTV2 to ONNX."""

    # Minimum torch version required for the ONNX export.
    torch_onnx_minimum_version = version.parse("1.11")

    @property
    def inputs(self) -> Mapping[str, Mapping[int, str]]:
        """Ordered mapping from input name to its named axes."""
        axes = OrderedDict()
        axes["pixel_values"] = {0: "batch", 1: "num_channels", 2: "height", 3: "width"}
        return axes

    @property
    def outputs(self) -> Mapping[str, Mapping[int, str]]:
        """Ordered mapping from output name to its named axes; the layout depends on `self.task`."""
        axes = OrderedDict()
        if self.task == "image-classification":
            axes["logits"] = {0: "batch"}
            return axes

        # Any other task exposes the backbone outputs.
        axes["last_hidden_state"] = {0: "batch"}
        axes["pooler_output"] = {0: "batch"}
        return axes

    @property
    def atol_for_validation(self) -> float:
        """Absolute tolerance used when validating the exported model."""
        return 1e-4

`.\models\mobilevitv2\convert_mlcvnets_to_pytorch.py`

# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convert MobileViTV2 checkpoints from the ml-cvnets library."""


import argparse
import collections
import json
from pathlib import Path

import requests
import torch
import yaml
from huggingface_hub import hf_hub_download
from PIL import Image

from transformers import (
    MobileViTImageProcessor,
    MobileViTV2Config,
    MobileViTV2ForImageClassification,
    MobileViTV2ForSemanticSegmentation,
)
from transformers.utils import logging


# Raise the transformers logging verbosity to INFO for this script, then create the module-level logger.
logging.set_verbosity_info()
logger = logging.get_logger(__name__)


def load_orig_config_file(orig_cfg_file):
    """
    Load the original YAML config file and return it as a flat `argparse.Namespace`.

    Nested mappings are flattened into dotted attribute names (e.g. `model.classification.name`), which callers
    read back with `getattr(config, "a.b.c", default)`.

    Args:
        orig_cfg_file: Path to the original YAML configuration file.

    Returns:
        `argparse.Namespace`: one attribute per flattened key. The namespace is empty when the file cannot be
        parsed (the error is logged) or when the file contains no data.
    """
    # Imported explicitly: `import collections` alone does not guarantee that the `collections.abc`
    # submodule has been loaded.
    from collections.abc import MutableMapping

    print("Loading config file...")

    def flatten_yaml_as_dict(d, parent_key="", sep="."):
        """Recursively flatten nested mappings, joining the key path with `sep`."""
        flat = {}
        for k, v in d.items():
            new_key = parent_key + sep + k if parent_key else k
            if isinstance(v, MutableMapping):
                flat.update(flatten_yaml_as_dict(v, new_key, sep=sep))
            else:
                flat[new_key] = v
        return flat

    config = argparse.Namespace()
    with open(orig_cfg_file, "r", encoding="utf-8") as yaml_file:
        try:
            # NOTE(review): FullLoader is kept for compatibility with existing config files; it should only be
            # used on trusted files. Consider `yaml.safe_load` if the configs use plain YAML only.
            cfg = yaml.load(yaml_file, Loader=yaml.FullLoader)
        except yaml.YAMLError as exc:
            # Best effort: log the parse error and return the empty namespace.
            logger.error("Error while loading config file: %s. Error message: %s", orig_cfg_file, exc)
            return config

    # An empty YAML document parses to None; treat it as an empty mapping.
    for k, v in flatten_yaml_as_dict(cfg or {}).items():
        setattr(config, k, v)
    return config


def get_mobilevitv2_config(task_name, orig_cfg_file):
    """
    Build a `MobileViTV2Config` for `task_name` from the original config file.

    Args:
        task_name: Task identifier; its prefix (`imagenet1k_`, `imagenet21k_to_1k_`, `ade20k_`, `voc_`) selects
            the label count, image size and label file.
        orig_cfg_file: Path to the original YAML config file.

    Returns:
        `MobileViTV2Config` with task settings, width multiplier, activation, optional segmentation settings and
        the `id2label` / `label2id` mappings filled in.

    Raises:
        ValueError: If `task_name` matches none of the supported prefixes.
        AssertionError: If the original config is not a `mobilevit_v2` model using `layer_norm_2d`.
    """
    config = MobileViTV2Config()

    is_segmentation_model = False

    # Task-dependent settings: number of labels, input resolution and label file.
    if task_name.startswith("imagenet1k_"):
        config.num_labels = 1000
        # The trailing number of the task name is the image resolution (384, otherwise 256).
        config.image_size = 384 if int(task_name.strip().split("_")[-1]) == 384 else 256
        filename = "imagenet-1k-id2label.json"
    elif task_name.startswith("imagenet21k_to_1k_"):
        config.num_labels = 21000
        config.image_size = 384 if int(task_name.strip().split("_")[-1]) == 384 else 256
        filename = "imagenet-22k-id2label.json"
    elif task_name.startswith("ade20k_"):
        config.num_labels = 151
        config.image_size = 512
        filename = "ade20k-id2label.json"
        is_segmentation_model = True
    elif task_name.startswith("voc_"):
        config.num_labels = 21
        config.image_size = 512
        filename = "pascal-voc-id2label.json"
        is_segmentation_model = True
    else:
        # Fail early with a clear message instead of hitting an unbound `filename` further down.
        raise ValueError(f"Unsupported task name: {task_name!r}")

    # Read the flattened original config; keys are dotted paths, hence the getattr() lookups.
    orig_config = load_orig_config_file(orig_cfg_file)
    assert getattr(orig_config, "model.classification.name", -1) == "mobilevit_v2", "Invalid model"
    config.width_multiplier = getattr(orig_config, "model.classification.mitv2.width_multiplier", 1.0)
    assert (
        getattr(orig_config, "model.classification.mitv2.attn_norm_layer", -1) == "layer_norm_2d"
    ), "Norm layers other than layer_norm_2d is not supported"
    config.hidden_act = getattr(orig_config, "model.classification.activation.name", "swish")
    # Disabled in the original: the image size is not taken from the sampler crop width.
    # config.image_size == getattr(orig_config,  'sampler.bs.crop_size_width', 256)

    if is_segmentation_model:
        config.output_stride = getattr(orig_config, "model.segmentation.output_stride", 16)
        if "_deeplabv3" in task_name:
            # DeepLabv3 head settings, each with a fallback default.
            config.atrous_rates = getattr(orig_config, "model.segmentation.deeplabv3.aspp_rates", [12, 24, 36])
            config.aspp_out_channels = getattr(orig_config, "model.segmentation.deeplabv3.aspp_out_channels", 512)
            config.aspp_dropout_prob = getattr(orig_config, "model.segmentation.deeplabv3.aspp_dropout", 0.1)

    # Download the label file from the Hub and build the id <-> label mappings.
    repo_id = "huggingface/label-files"
    with open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r") as label_file:
        id2label = json.load(label_file)
    # JSON object keys are strings; the config uses integer ids.
    id2label = {int(k): v for k, v in id2label.items()}
    config.id2label = id2label
    config.label2id = {v: k for k, v in id2label.items()}

    return config
# Move the value stored under `old` to the key `new`, modifying `dct` in place.
def rename_key(dct, old, new):
    """Rename `old` to `new` in `dct`; raises `KeyError` if `old` is missing."""
    dct[new] = dct.pop(old)


def create_rename_keys(state_dict, base_model=False):
    """
    Return the list of key renames to apply to the original state dict.

    NOTE(review): in the code visible here no renames are produced and the list is always empty;
    `model_prefix` is computed but never used. The mapping entries may have been elided from this excerpt.
    """
    # No prefix for a bare base model, otherwise keys live under "mobilevitv2.".
    model_prefix = "" if base_model else "mobilevitv2."

    rename_keys = []
    return rename_keys


def remove_unused_keys(state_dict):
    """remove unused keys (e.g.: seg_head.aux_head)"""
    # Collect the matching keys first so the mapping is not mutated while it is being iterated.
    aux_head_keys = [name for name in state_dict.keys() if name.startswith("seg_head.aux_head.")]
    for name in aux_head_keys:
        state_dict.pop(name, None)


# We will verify our results on an image of cute cats
def prepare_img():
    """
    Download the COCO validation image used to sanity-check the converted model.

    Returns:
        `PIL.Image.Image` opened from the streamed HTTP response.
    """
    url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    # url = "https://cdn.britannica.com/86/141086-050-9D7C75EE/Gulfstream-G450-business-jet-passengers.jpg"
    # A timeout keeps the script from hanging forever if the server never responds.
    response = requests.get(url, stream=True, timeout=30)
    # The response is deliberately left open: PIL reads from the raw stream lazily.
    im = Image.open(response.raw)
    return im


@torch.no_grad()
def convert_mobilevitv2_checkpoint(task_name, checkpoint_path, orig_config_path, pytorch_dump_folder_path):
    """
    Copy/paste/tweak model's weights to our MobileViTV2 structure.

    Args:
        task_name: Task identifier; selects the model head and the verification that is run.
        checkpoint_path: Path to the original state dict.
        orig_config_path: Path to the original config file.
        pytorch_dump_folder_path: Output directory for the converted model and image processor. It is created
            (including missing parent directories) if it does not exist.
    """
    config = get_mobilevitv2_config(task_name, orig_config_path)

    # load original state_dict
    # NOTE(review): torch.load unpickles the file; only convert checkpoints from a trusted source.
    checkpoint = torch.load(checkpoint_path, map_location="cpu")

    # load huggingface model
    if task_name.startswith(("ade20k_", "voc_")):
        model = MobileViTV2ForSemanticSegmentation(config).eval()
    else:
        model = MobileViTV2ForImageClassification(config).eval()
    # Both branches build a model with a task head, so `base_model` is False either way.
    base_model = False

    # remove and rename some keys of load the original model
    state_dict = checkpoint
    remove_unused_keys(state_dict)
    rename_keys = create_rename_keys(state_dict, base_model=base_model)
    for rename_key_src, rename_key_dest in rename_keys:
        rename_key(state_dict, rename_key_src, rename_key_dest)

    # load modified state_dict
    model.load_state_dict(state_dict)

    # Check outputs on an image, prepared by MobileViTImageProcessor
    image_processor = MobileViTImageProcessor(crop_size=config.image_size, size=config.image_size + 32)
    encoding = image_processor(images=prepare_img(), return_tensors="pt")
    outputs = model(**encoding)

    # verify classification model
    if task_name.startswith("imagenet"):
        logits = outputs.logits
        predicted_class_idx = logits.argmax(-1).item()
        print("Predicted class:", model.config.id2label[predicted_class_idx])
        if task_name.startswith("imagenet1k_256") and config.width_multiplier == 1.0:
            # expected_logits for base variant
            expected_logits = torch.tensor([-1.6336e00, -7.3204e-02, -5.1883e-01])
            assert torch.allclose(logits[0, :3], expected_logits, atol=1e-4)

    # parents=True so a nested output path does not fail when intermediate directories are missing.
    Path(pytorch_dump_folder_path).mkdir(parents=True, exist_ok=True)
    print(f"Saving model {task_name} to {pytorch_dump_folder_path}")
    model.save_pretrained(pytorch_dump_folder_path)
    print(f"Saving image processor to {pytorch_dump_folder_path}")
    image_processor.save_pretrained(pytorch_dump_folder_path)
# Command-line entry point: parse the arguments and run the checkpoint conversion.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    # Task to convert; restricted to the listed choices and defaults to "imagenet1k_256".
    parser.add_argument(
        "--task",
        default="imagenet1k_256",
        type=str,
        help=(
            "Name of the task for which the MobileViTV2 model you'd like to convert is trained on . "
            """
                Classification (ImageNet-1k)
                    - MobileViTV2 (256x256) : imagenet1k_256
                    - MobileViTV2 (Trained on 256x256 and Finetuned on 384x384) : imagenet1k_384
                    - MobileViTV2 (Trained on ImageNet-21k and Finetuned on ImageNet-1k 256x256) :
                      imagenet21k_to_1k_256
                    - MobileViTV2 (Trained on ImageNet-21k, Finetuned on ImageNet-1k 256x256, and Finetuned on
                      ImageNet-1k 384x384) : imagenet21k_to_1k_384
                Segmentation
                    - ADE20K Dataset : ade20k_deeplabv3
                    - Pascal VOC 2012 Dataset: voc_deeplabv3
            """
        ),
        choices=[
            "imagenet1k_256",
            "imagenet1k_384",
            "imagenet21k_to_1k_256",
            "imagenet21k_to_1k_384",
            "ade20k_deeplabv3",
            "voc_deeplabv3",
        ],
    )

    # Required: path to the original checkpoint file.
    parser.add_argument(
        "--orig_checkpoint_path", required=True, type=str, help="Path to the original state dict (.pt file)."
    )

    # Required: path to the original config file.
    parser.add_argument("--orig_config_path", required=True, type=str, help="Path to the original config file.")

    # Required: directory that receives the converted PyTorch model.
    parser.add_argument(
        "--pytorch_dump_folder_path", required=True, type=str, help="Path to the output PyTorch model directory."
    )

    args = parser.parse_args()

    # Convert the MobileViTV2 checkpoint and save it to the output directory.
    convert_mobilevitv2_checkpoint(
        args.task, args.orig_checkpoint_path, args.orig_config_path, args.pytorch_dump_folder_path
    )

`.\models\mobilevitv2\modeling_mobilevitv2.py`

# coding=utf-8
# Copyright 2023 Apple Inc. and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Original license: https://github.com/apple/ml-cvnets/blob/main/LICENSE
""" PyTorch MobileViTV2 model."""


from typing import Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...modeling_outputs import (
    BaseModelOutputWithNoAttention,
    BaseModelOutputWithPoolingAndNoAttention,
    ImageClassifierOutputWithNoAttention,
    SemanticSegmenterOutput,
)
from ...modeling_utils import PreTrainedModel
from ...utils import (
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
from .configuration_mobilevitv2 import MobileViTV2Config


logger = logging.get_logger(__name__)


# Constants below are presumably consumed by the docstring decorators imported above; their use sites are not
# visible in this excerpt.

# General docstring
_CONFIG_FOR_DOC = "MobileViTV2Config"

# Base docstring
_CHECKPOINT_FOR_DOC = "apple/mobilevitv2-1.0-imagenet1k-256"
_EXPECTED_OUTPUT_SHAPE = [1, 512, 8, 8]

# Image classification docstring
_IMAGE_CLASS_CHECKPOINT = "apple/mobilevitv2-1.0-imagenet1k-256"
_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat"


# Names of the pretrained MobileViTV2 checkpoints listed by this module.
MOBILEVITV2_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "apple/mobilevitv2-1.0-imagenet1k-256"
    # See all MobileViTV2 models at https://huggingface.co/models?filter=mobilevitv2
]


# Copied from transformers.models.mobilevit.modeling_mobilevit.make_divisible
def make_divisible(value: int, divisor: int = 8, min_value: Optional[int] = None) -> int:
    """
    Round `value` to the nearest multiple of `divisor`, never going below `min_value` (which defaults to
    `divisor`) and never dropping more than 10% below `value`. This function is taken from the original
    TensorFlow repo. It can be seen here:
    https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
    """
    lower_bound = divisor if min_value is None else min_value
    nearest_multiple = int(value + divisor / 2) // divisor * divisor
    result = max(lower_bound, nearest_multiple)
    # Rounding down by more than 10% is not allowed; step up to the next multiple instead.
    if result < 0.9 * value:
        result += divisor
    return int(result)


def clip(value: float, min_val: float = float("-inf"), max_val: float = float("inf")) -> float:
    """
    Clamp `value` into the range [`min_val`, `max_val`]; by default the range is unbounded on both sides.
    """
    # Apply the upper bound first, then the lower bound (same order as a nested max(min(...))).
    capped = min(max_val, value)
    return max(min_val, capped)


# Copied from transformers.models.mobilevit.modeling_mobilevit.MobileViTConvLayer with MobileViT->MobileViTV2
class MobileViTV2ConvLayer(nn.Module):
    """
    Convolution block for MobileViTV2: a 2D convolution, optionally followed by batch normalization and an
    activation function.

    Args:
        config: Model configuration; only `config.hidden_act` is read, and only when `use_activation` is `True`.
        in_channels: Number of input channels.
        out_channels: Number of output channels.
        kernel_size: Convolution kernel size.
        stride: Convolution stride.
        groups: Number of groups for grouped convolution; must divide both channel counts.
        bias: Whether the convolution has a bias term.
        dilation: Dilation rate of the convolution.
        use_normalization: Whether to append a `BatchNorm2d` layer.
        use_activation: `False` for no activation, a string to select an entry of `ACT2FN`, or `True` to use
            `config.hidden_act`.

    Raises:
        ValueError: If `in_channels` or `out_channels` is not divisible by `groups`.
    """

    def __init__(
        self,
        config: MobileViTV2Config,
        in_channels: int,
        out_channels: int,
        kernel_size: int,
        stride: int = 1,
        groups: int = 1,
        bias: bool = False,
        dilation: int = 1,
        use_normalization: bool = True,
        use_activation: Union[bool, str] = True,
    ) -> None:
        super().__init__()

        # Padding derived from the kernel size, scaled by the dilation.
        padding = int((kernel_size - 1) / 2) * dilation

        # Grouped convolution requires both channel counts to be multiples of `groups`.
        if in_channels % groups != 0:
            raise ValueError(f"Input channels ({in_channels}) are not divisible by {groups} groups.")
        if out_channels % groups != 0:
            raise ValueError(f"Output channels ({out_channels}) are not divisible by {groups} groups.")

        self.convolution = nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            groups=groups,
            bias=bias,
            padding_mode="zeros",
        )

        # Optional batch normalization over the output channels.
        self.normalization = (
            nn.BatchNorm2d(
                num_features=out_channels,
                eps=1e-5,
                momentum=0.1,
                affine=True,
                track_running_stats=True,
            )
            if use_normalization
            else None
        )

        # Resolve the activation: disabled, named explicitly, or taken from the config.
        if not use_activation:
            activation = None
        elif isinstance(use_activation, str):
            activation = ACT2FN[use_activation]
        elif isinstance(config.hidden_act, str):
            activation = ACT2FN[config.hidden_act]
        else:
            activation = config.hidden_act
        self.activation = activation

    def forward(self, features: torch.Tensor) -> torch.Tensor:
        """Apply the convolution, then normalization and activation when they are configured."""
        hidden = self.convolution(features)
        for stage in (self.normalization, self.activation):
            if stage is not None:
                hidden = stage(hidden)
        return hidden
# Mirrors the MobileViT inverted-residual block, renamed for MobileViTV2.
class MobileViTV2InvertedResidual(nn.Module):
    """
    Inverted residual block (MobileNetv2): https://arxiv.org/abs/1801.04381
    """

    def __init__(
        self, config: MobileViTV2Config, in_channels: int, out_channels: int, stride: int, dilation: int = 1
    ) -> None:
        super().__init__()
        # Width of the expanded representation, made divisible by 8.
        hidden_channels = make_divisible(int(round(in_channels * config.expand_ratio)), 8)

        if stride not in [1, 2]:
            raise ValueError(f"Invalid stride {stride}.")

        # The skip connection is only valid when input and output tensors have the same shape.
        self.use_residual = stride == 1 and in_channels == out_channels

        # 1x1 convolution that expands the channel count.
        self.expand_1x1 = MobileViTV2ConvLayer(
            config,
            in_channels=in_channels,
            out_channels=hidden_channels,
            kernel_size=1,
        )

        # 3x3 convolution with one group per channel; carries the stride and dilation.
        self.conv_3x3 = MobileViTV2ConvLayer(
            config,
            in_channels=hidden_channels,
            out_channels=hidden_channels,
            kernel_size=3,
            stride=stride,
            groups=hidden_channels,
            dilation=dilation,
        )

        # 1x1 convolution that projects back down to `out_channels`, without activation.
        self.reduce_1x1 = MobileViTV2ConvLayer(
            config,
            in_channels=hidden_channels,
            out_channels=out_channels,
            kernel_size=1,
            use_activation=False,
        )

    def forward(self, features: torch.Tensor) -> torch.Tensor:
        """Run expand -> 3x3 -> reduce, adding the input back when the residual path is enabled."""
        transformed = self.reduce_1x1(self.conv_3x3(self.expand_1x1(features)))

        if self.use_residual:
            return features + transformed
        return transformed


# Mirrors the MobileViT MobileNet layer, renamed for MobileViTV2.
class MobileViTV2MobileNetLayer(nn.Module):
    """Stack of `num_stages` inverted residual blocks; only the first block applies `stride`."""

    def __init__(
        self, config: MobileViTV2Config, in_channels: int, out_channels: int, stride: int = 1, num_stages: int = 1
    ) -> None:
        super().__init__()

        blocks = []
        stage_in_channels = in_channels
        for stage_idx in range(num_stages):
            block_stride = stride if stage_idx == 0 else 1
            blocks.append(
                MobileViTV2InvertedResidual(
                    config,
                    in_channels=stage_in_channels,
                    out_channels=out_channels,
                    stride=block_stride,
                )
            )
            # Every block after the first consumes the previous block's output.
            stage_in_channels = out_channels
        self.layer = nn.ModuleList(blocks)

    def forward(self, features: torch.Tensor) -> torch.Tensor:
        """Apply the blocks in order."""
        hidden = features
        for block in self.layer:
            hidden = block(hidden)
        return hidden


class MobileViTV2LinearSelfAttention(nn.Module):
    """
    Self-attention with linear complexity, as described in the MobileViTV2 paper:
    https://arxiv.org/abs/2206.02680

    Args:
        config (`MobileVitv2Config`):
             Model configuration object
        embed_dim (`int`):
            Number of channels of the expected input, which has size (batch_size, input_channels, height, width)
    """

    def __init__(self, config: MobileViTV2Config, embed_dim: int) -> None:
        super().__init__()

        # One 1x1 projection yields the query (1 channel) plus key and value (embed_dim channels each).
        self.qkv_proj = MobileViTV2ConvLayer(
            config=config,
            in_channels=embed_dim,
            out_channels=1 + (2 * embed_dim),
            bias=True,
            kernel_size=1,
            use_normalization=False,
            use_activation=False,
        )

        # Dropout applied to the attention scores.
        self.attn_dropout = nn.Dropout(p=config.attn_dropout)

        # Output projection back to embed_dim channels.
        self.out_proj = MobileViTV2ConvLayer(
            config=config,
            in_channels=embed_dim,
            out_channels=embed_dim,
            bias=True,
            kernel_size=1,
            use_normalization=False,
            use_activation=False,
        )

        self.embed_dim = embed_dim

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """
        Compute linear self-attention. Per the original annotations, the input is laid out as
        (batch_size, embed_dim, num_pixels_in_patch, num_patches) and the output has the same shape.
        """
        # Project to 1 + 2 * embed_dim channels, then split along the channel dimension.
        projected = self.qkv_proj(hidden_states)
        channel_sections = [1, self.embed_dim, self.embed_dim]
        query, key, value = torch.split(projected, split_size_or_sections=channel_sections, dim=1)

        # Softmax over the last dimension turns the single-channel query into context scores.
        context_scores = self.attn_dropout(torch.nn.functional.softmax(query, dim=-1))

        # Weight the keys by the scores and reduce over the last dimension (kept as size 1).
        context_vector = torch.sum(key * context_scores, dim=-1, keepdim=True)

        # Broadcast the context vector over the rectified values, then project.
        attended = torch.nn.functional.relu(value) * context_vector.expand_as(value)
        return self.out_proj(attended)
# Feed-forward network of MobileViTV2, built from two 1x1 convolution layers.
class MobileViTV2FFN(nn.Module):
    """
    Feed-forward block: 1x1 convolution with activation, dropout, 1x1 convolution without activation, dropout.

    Args:
        config: Model configuration passed through to the convolution layers.
        embed_dim: Number of input and output channels.
        ffn_latent_dim: Number of channels between the two convolutions.
        ffn_dropout: Dropout probability applied after each convolution.
    """

    def __init__(
        self,
        config: MobileViTV2Config,
        embed_dim: int,
        ffn_latent_dim: int,
        ffn_dropout: float = 0.0,
    ) -> None:
        super().__init__()
        # Expansion: embed_dim -> ffn_latent_dim, with bias and activation, no normalization.
        self.conv1 = MobileViTV2ConvLayer(
            config=config,
            in_channels=embed_dim,
            out_channels=ffn_latent_dim,
            kernel_size=1,
            stride=1,
            bias=True,
            use_normalization=False,
            use_activation=True,
        )
        self.dropout1 = nn.Dropout(ffn_dropout)

        # Projection: ffn_latent_dim -> embed_dim, with bias, no normalization or activation.
        self.conv2 = MobileViTV2ConvLayer(
            config=config,
            in_channels=ffn_latent_dim,
            out_channels=embed_dim,
            kernel_size=1,
            stride=1,
            bias=True,
            use_normalization=False,
            use_activation=False,
        )
        self.dropout2 = nn.Dropout(ffn_dropout)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Apply conv1 -> dropout1 -> conv2 -> dropout2."""
        expanded = self.dropout1(self.conv1(hidden_states))
        return self.dropout2(self.conv2(expanded))


# One transformer layer: normalized linear self-attention and normalized FFN, each with a residual.
class MobileViTV2TransformerLayer(nn.Module):
    """Transformer layer combining `MobileViTV2LinearSelfAttention` and `MobileViTV2FFN`.

    Args:
        config: Model configuration; `layer_norm_eps` and `ffn_dropout` are read from it.
        embed_dim: Channel count of the hidden states.
        ffn_latent_dim: Inner channel count of the feed-forward network.
        dropout: Probability for the `dropout1` module.
    """

    def __init__(
        self,
        config: MobileViTV2Config,
        embed_dim: int,
        ffn_latent_dim: int,
        dropout: float = 0.0,
    ) -> None:
        super().__init__()
        norm_eps = config.layer_norm_eps

        # Normalization is a GroupNorm with a single group over all channels.
        self.layernorm_before = nn.GroupNorm(num_groups=1, num_channels=embed_dim, eps=norm_eps)
        self.attention = MobileViTV2LinearSelfAttention(config, embed_dim)
        # Registered here, but forward() below never calls it.
        self.dropout1 = nn.Dropout(p=dropout)
        self.layernorm_after = nn.GroupNorm(num_groups=1, num_channels=embed_dim, eps=norm_eps)
        self.ffn = MobileViTV2FFN(config, embed_dim, ffn_latent_dim, config.ffn_dropout)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Return `hidden_states` after the attention and feed-forward residual branches."""
        # Attention branch: normalize, attend, add the input back.
        attended = self.attention(self.layernorm_before(hidden_states)) + hidden_states
        # Feed-forward branch: normalize, transform, add the attention result back.
        return self.ffn(self.layernorm_after(attended)) + attended


# Sequential stack of MobileViTV2 transformer layers.
class MobileViTV2Transformer(nn.Module):
    """Stack of `n_layers` identical `MobileViTV2TransformerLayer`s operating at width `d_model`.

    Args:
        config: Model configuration; `ffn_multiplier` is read from it.
        n_layers: Number of transformer layers to build.
        d_model: Channel count of the hidden states.
    """

    def __init__(self, config: MobileViTV2Config, n_layers: int, d_model: int) -> None:
        super().__init__()

        # Every layer uses the same FFN width: ffn_multiplier * d_model rounded down to a multiple of 16.
        ffn_dims = [int((config.ffn_multiplier * d_model // 16) * 16)] * n_layers

        self.layer = nn.ModuleList()
        for latent_dim in ffn_dims:
            self.layer.append(MobileViTV2TransformerLayer(config, embed_dim=d_model, ffn_latent_dim=latent_dim))

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Feed `hidden_states` through every layer in order and return the final output."""
        output = hidden_states
        for block in self.layer:
            output = block(output)
        return output
class MobileViTV2Layer(nn.Module):
    """
    MobileViTV2 layer: https://arxiv.org/abs/2206.02680

    Optionally downsamples the input, applies a depthwise kxk conv and a 1x1 conv, runs the
    transformer over non-overlapping patches, and projects the folded result with a 1x1 conv.

    Args:
        config: Model configuration; `patch_size`, `conv_kernel_size` and `layer_norm_eps` are read here.
        in_channels: Channel count of the incoming feature map.
        out_channels: Channel count after the downsampling block (only used when `stride == 2`).
        attn_unit_dim: Channel count used by the transformer.
        n_attn_blocks: Number of transformer layers.
        dilation: Dilation factor; values above 1 replace the stride of the downsampling block.
        stride: A downsampling block is created only when this equals 2.
    """

    def __init__(
        self,
        config: MobileViTV2Config,
        in_channels: int,
        out_channels: int,
        attn_unit_dim: int,
        n_attn_blocks: int = 2,
        dilation: int = 1,
        stride: int = 2,
    ) -> None:
        super().__init__()
        # Patches are square: both sides come from config.patch_size.
        self.patch_width = config.patch_size
        self.patch_height = config.patch_size

        cnn_out_dim = attn_unit_dim

        if stride != 2:
            self.downsampling_layer = None
        else:
            # With dilation > 1 the block keeps the resolution (stride 1) and dilates by half the factor.
            block_stride = stride if dilation == 1 else 1
            block_dilation = dilation // 2 if dilation > 1 else 1
            self.downsampling_layer = MobileViTV2InvertedResidual(
                config,
                in_channels=in_channels,
                out_channels=out_channels,
                stride=block_stride,
                dilation=block_dilation,
            )
            # Everything after the downsampling block works on its output width.
            in_channels = out_channels

        # Local representation: depthwise kxk conv (groups == channels) ...
        self.conv_kxk = MobileViTV2ConvLayer(
            config,
            in_channels=in_channels,
            out_channels=in_channels,
            kernel_size=config.conv_kernel_size,
            groups=in_channels,
        )

        # ... followed by a plain 1x1 conv to the transformer width.
        self.conv_1x1 = MobileViTV2ConvLayer(
            config,
            in_channels=in_channels,
            out_channels=cnn_out_dim,
            kernel_size=1,
            use_normalization=False,
            use_activation=False,
        )

        # Global representation.
        self.transformer = MobileViTV2Transformer(config, d_model=attn_unit_dim, n_layers=n_attn_blocks)

        # Single-group GroupNorm applied to the transformer output.
        self.layernorm = nn.GroupNorm(num_groups=1, num_channels=attn_unit_dim, eps=config.layer_norm_eps)

        # Fusion: 1x1 conv back to the feature-map width, normalized but not activated.
        self.conv_projection = MobileViTV2ConvLayer(
            config,
            in_channels=cnn_out_dim,
            out_channels=in_channels,
            kernel_size=1,
            use_normalization=True,
            use_activation=False,
        )

    def unfolding(self, feature_map: torch.Tensor) -> Tuple[torch.Tensor, Tuple[int, int]]:
        """Cut `feature_map` into non-overlapping patches.

        Returns:
            The patches reshaped to (batch_size, channels, patch_height * patch_width, -1), and the
            original (height, width) needed later by `folding`.
        """
        batch_size, in_channels, img_height, img_width = feature_map.shape
        patch_hw = (self.patch_height, self.patch_width)
        # stride == kernel_size, so patches do not overlap.
        unfolded = nn.functional.unfold(feature_map, kernel_size=patch_hw, stride=patch_hw)
        patches = unfolded.reshape(batch_size, in_channels, self.patch_height * self.patch_width, -1)
        return patches, (img_height, img_width)

    def folding(self, patches: torch.Tensor, output_size: Tuple[int, int]) -> torch.Tensor:
        """Inverse of `unfolding`: reassemble `patches` into a feature map of spatial size `output_size`."""
        batch_size, in_dim, patch_size, n_patches = patches.shape
        # fold() expects the channel and in-patch axes merged into one.
        flat_patches = patches.reshape(batch_size, in_dim * patch_size, n_patches)
        patch_hw = (self.patch_height, self.patch_width)
        return nn.functional.fold(flat_patches, output_size=output_size, kernel_size=patch_hw, stride=patch_hw)

    def forward(self, features: torch.Tensor) -> torch.Tensor:
        """Run the optional downsampling, local convs, patch transformer and fusion projection."""
        # Reduce spatial dimensions if a downsampling block was created.
        if self.downsampling_layer:
            features = self.downsampling_layer(features)

        # Local representation.
        local_features = self.conv_1x1(self.conv_kxk(features))

        # Feature map -> patches.
        patches, output_size = self.unfolding(local_features)

        # Global representation.
        patches = self.layernorm(self.transformer(patches))

        # Patches -> feature map, then project to the output width.
        return self.conv_projection(self.folding(patches, output_size))
class MobileViTV2Encoder(nn.Module):
    """Encoder made of two `MobileViTV2MobileNetLayer` stages followed by three `MobileViTV2Layer` stages.

    Channel widths are derived from `config.width_multiplier`. `config.output_stride` of 8 or 16
    switches the last stage(s) from striding to dilation.
    """

    def __init__(self, config: MobileViTV2Config) -> None:
        super().__init__()
        self.config = config

        self.layer = nn.ModuleList()
        self.gradient_checkpointing = False

        # Segmentation architectures such as DeepLab and PSPNet modify the strides of the
        # classification backbone: output_stride 8 dilates stages 4 and 5, 16 dilates stage 5 only.
        dilate_layer_4 = config.output_stride == 8
        dilate_layer_5 = config.output_stride in (8, 16)

        # Channel widths of the stem output (layer 0) and of the five stages.
        width = config.width_multiplier
        layer_0_dim = make_divisible(clip(value=32 * width, min_val=16, max_val=64), divisor=8, min_value=16)
        layer_1_dim = make_divisible(64 * width, divisor=16)
        layer_2_dim = make_divisible(128 * width, divisor=8)
        layer_3_dim = make_divisible(256 * width, divisor=8)
        layer_4_dim = make_divisible(384 * width, divisor=8)
        layer_5_dim = make_divisible(512 * width, divisor=8)

        # Stages 1 and 2: MobileNet-style layers, given as (in, out, stride, num_stages).
        mobilenet_stages = (
            (layer_0_dim, layer_1_dim, 1, 1),
            (layer_1_dim, layer_2_dim, 2, 2),
        )
        for in_dim, out_dim, stride, num_stages in mobilenet_stages:
            self.layer.append(
                MobileViTV2MobileNetLayer(
                    config,
                    in_channels=in_dim,
                    out_channels=out_dim,
                    stride=stride,
                    num_stages=num_stages,
                )
            )

        # Stages 3 to 5: MobileViTV2 layers, given as (in, out, dilate?). The dilation factor
        # doubles each time a stage is marked for dilation and carries over to later stages.
        vit_stages = (
            (layer_2_dim, layer_3_dim, False),
            (layer_3_dim, layer_4_dim, dilate_layer_4),
            (layer_4_dim, layer_5_dim, dilate_layer_5),
        )
        dilation = 1
        for stage_idx, (in_dim, out_dim, dilate) in enumerate(vit_stages):
            if dilate:
                dilation *= 2
            self.layer.append(
                MobileViTV2Layer(
                    config,
                    in_channels=in_dim,
                    out_channels=out_dim,
                    attn_unit_dim=make_divisible(config.base_attn_unit_dims[stage_idx] * width, divisor=8),
                    n_attn_blocks=config.n_attn_blocks[stage_idx],
                    dilation=dilation,
                )
            )

    def forward(
        self,
        hidden_states: torch.Tensor,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ) -> Union[tuple, BaseModelOutputWithNoAttention]:
        """Run all stages in order.

        Args:
            hidden_states: Input to the first stage.
            output_hidden_states: When true, the output of every stage is collected and returned.
            return_dict: When true, return a `BaseModelOutputWithNoAttention`; otherwise a tuple
                holding the final hidden state and, if collected, the per-stage hidden states.
        """
        # An empty tuple when collecting per-stage outputs, otherwise None.
        collected_states = () if output_hidden_states else None

        for stage in self.layer:
            if self.gradient_checkpointing and self.training:
                # Route the call through the checkpointing function during training.
                hidden_states = self._gradient_checkpointing_func(
                    stage.__call__,
                    hidden_states,
                )
            else:
                hidden_states = stage(hidden_states)

            if output_hidden_states:
                collected_states = collected_states + (hidden_states,)

        if return_dict:
            return BaseModelOutputWithNoAttention(last_hidden_state=hidden_states, hidden_states=collected_states)

        # Tuple form: drop entries that are None.
        return tuple(item for item in (hidden_states, collected_states) if item is not None)
# Common base class of the MobileViTV2 models defined below.
class MobileViTV2PreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    # Class-level settings consumed by the `PreTrainedModel` machinery.
    config_class = MobileViTV2Config
    base_model_prefix = "mobilevitv2"
    main_input_name = "pixel_values"
    supports_gradient_checkpointing = True

    def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None:
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Conv2d)):
            # Normal(0, initializer_range) weights; bias, when present, starts at zero.
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
            return

        if isinstance(module, nn.LayerNorm):
            # Start as the identity transform: unit scale, zero shift.
            module.weight.data.fill_(1.0)
            module.bias.data.zero_()


# Shared docstring text passed to `add_start_docstrings` on the model classes below:
# a general usage note plus the description of the `config` parameter.
MOBILEVITV2_START_DOCSTRING = r"""
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
    as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`MobileViTV2Config`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

# Shared description of the `forward` arguments, passed to
# `add_start_docstrings_to_model_forward` on the model classes below.
MOBILEVITV2_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
            [`MobileViTImageProcessor.__call__`] for details.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""

# Bare backbone: convolutional stem + encoder, with optional global average pooling of the output.
@add_start_docstrings(
    "The bare MobileViTV2 model outputting raw hidden-states without any specific head on top.",
    MOBILEVITV2_START_DOCSTRING,
)
class MobileViTV2Model(MobileViTV2PreTrainedModel):
    def __init__(self, config: MobileViTV2Config, expand_output: bool = True):
        super().__init__(config)
        self.config = config
        # When true, forward() also returns a globally average-pooled output.
        self.expand_output = expand_output

        # Width of the stem output; MobileViTV2Encoder computes its layer-0 width the same way.
        stem_channels = make_divisible(
            clip(value=32 * config.width_multiplier, min_val=16, max_val=64), divisor=8, min_value=16
        )

        # Stem: 3x3 conv with stride 2, normalization and activation.
        self.conv_stem = MobileViTV2ConvLayer(
            config,
            in_channels=config.num_channels,
            out_channels=stem_channels,
            kernel_size=3,
            stride=2,
            use_normalization=True,
            use_activation=True,
        )
        self.encoder = MobileViTV2Encoder(config)

        # Initialize weights and apply final processing
        self.post_init()

    def _prune_heads(self, heads_to_prune):
        """Prunes heads of the model.
        heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base class PreTrainedModel
        """
        # NOTE(review): this calls `attention.prune_heads`, which is not visible in this part of
        # the file - confirm the attention module actually provides it.
        for layer_index, heads in heads_to_prune.items():
            stage = self.encoder.layer[layer_index]
            # Only MobileViTV2Layer stages contain transformer layers; other stages are skipped.
            if not isinstance(stage, MobileViTV2Layer):
                continue
            for transformer_layer in stage.transformer.layer:
                transformer_layer.attention.prune_heads(heads)

    @add_start_docstrings_to_model_forward(MOBILEVITV2_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=BaseModelOutputWithPoolingAndNoAttention,
        config_class=_CONFIG_FOR_DOC,
        modality="vision",
        expected_output=_EXPECTED_OUTPUT_SHAPE,
    )
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        """
        Performs forward pass of the MobileViTV2 model.
        pixel_values: Optional[torch.Tensor], input pixel values of shape (batch_size, num_channels, height, width)
        output_hidden_states: Optional[bool], whether to output hidden states
        return_dict: Optional[bool], whether to return a dictionary as output
        """
        # Arguments left as None fall back to the values stored in the config.
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states
        if return_dict is None:
            return_dict = self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        # Stem, then encoder.
        encoder_outputs = self.encoder(
            self.conv_stem(pixel_values),
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        last_hidden_state = encoder_outputs[0]

        # Global average pooling over the two spatial axes:
        # (batch_size, channels, height, width) -> (batch_size, channels)
        pooled_output = (
            torch.mean(last_hidden_state, dim=[-2, -1], keepdim=False) if self.expand_output else None
        )

        if return_dict:
            return BaseModelOutputWithPoolingAndNoAttention(
                last_hidden_state=last_hidden_state,
                pooler_output=pooled_output,
                hidden_states=encoder_outputs.hidden_states,
            )

        # Tuple form: the pooled output is included only when it was computed, followed by
        # whatever else the encoder returned.
        leading = (last_hidden_state,) if pooled_output is None else (last_hidden_state, pooled_output)
        return leading + encoder_outputs[1:]
# MobileViTV2 backbone with a linear classification head on the pooled features.
@add_start_docstrings(
    """
    MobileViTV2 model with an image classification head on top (a linear layer on top of the pooled features), e.g. for
    ImageNet.
    """,
    MOBILEVITV2_START_DOCSTRING,
)
class MobileViTV2ForImageClassification(MobileViTV2PreTrainedModel):
    def __init__(self, config: MobileViTV2Config) -> None:
        super().__init__(config)

        self.num_labels = config.num_labels
        self.mobilevitv2 = MobileViTV2Model(config)

        # Same formula as the encoder's layer-5 width, i.e. the channel count of the pooled features.
        out_channels = make_divisible(512 * config.width_multiplier, divisor=8)  # layer 5 output dimension

        # Classifier head; with no labels configured the pooled features pass through unchanged.
        if config.num_labels > 0:
            self.classifier = nn.Linear(in_features=out_channels, out_features=config.num_labels)
        else:
            self.classifier = nn.Identity()

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(MOBILEVITV2_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_IMAGE_CLASS_CHECKPOINT,
        output_type=ImageClassifierOutputWithNoAttention,
        config_class=_CONFIG_FOR_DOC,
        expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
    )
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        output_hidden_states: Optional[bool] = None,
        labels: Optional[torch.Tensor] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, ImageClassifierOutputWithNoAttention]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss). If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        if return_dict is None:
            return_dict = self.config.use_return_dict

        outputs = self.mobilevitv2(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict)

        # The pooled features are index 1 of the tuple form and `pooler_output` of the dict form.
        pooled_output = outputs.pooler_output if return_dict else outputs[1]
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            # Infer the problem type once from the label count and label dtype, and store it on the config.
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and labels.dtype in (torch.long, torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            problem_type = self.config.problem_type
            if problem_type == "regression":
                mse = MSELoss()
                # A single regression target is compared after squeezing both tensors.
                if self.num_labels == 1:
                    loss = mse(logits.squeeze(), labels.squeeze())
                else:
                    loss = mse(logits, labels)
            elif problem_type == "single_label_classification":
                loss = CrossEntropyLoss()(logits.view(-1, self.num_labels), labels.view(-1))
            elif problem_type == "multi_label_classification":
                loss = BCEWithLogitsLoss()(logits, labels)

        if return_dict:
            return ImageClassifierOutputWithNoAttention(
                loss=loss,
                logits=logits,
                hidden_states=outputs.hidden_states,
            )

        # Tuple form: optional loss, then logits, then everything after the pooled output.
        output = (logits,) + outputs[2:]
        return output if loss is None else (loss,) + output
# Adapted from transformers.models.mobilevit.modeling_mobilevit.MobileViTASPPPooling, renamed for MobileViTV2.
class MobileViTV2ASPPPooling(nn.Module):
    """Image-level ASPP branch: global average pool, 1x1 conv, then resize back to the input size."""

    def __init__(self, config: MobileViTV2Config, in_channels: int, out_channels: int) -> None:
        super().__init__()

        # Collapse each channel to a single 1x1 value.
        self.global_pool = nn.AdaptiveAvgPool2d(output_size=1)

        # 1x1 conv from in_channels to out_channels, with normalization and ReLU.
        self.conv_1x1 = MobileViTV2ConvLayer(
            config,
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=1,
            stride=1,
            use_normalization=True,
            use_activation="relu",
        )

    def forward(self, features: torch.Tensor) -> torch.Tensor:
        """Return the pooled and projected features, bilinearly resized to the input's spatial size."""
        # Remember (height, width) so the pooled result can be stretched back afterwards.
        target_size = features.shape[-2:]
        pooled = self.conv_1x1(self.global_pool(features))
        return nn.functional.interpolate(pooled, size=target_size, mode="bilinear", align_corners=False)


class MobileViTV2ASPP(nn.Module):
    """
    ASPP module defined in DeepLab papers: https://arxiv.org/abs/1606.00915, https://arxiv.org/abs/1706.05587

    Five parallel branches (one 1x1 conv, three dilated 3x3 convs, one image-level pooling branch)
    are concatenated along the channel axis and projected back to `config.aspp_out_channels`.
    """

    def __init__(self, config: MobileViTV2Config) -> None:
        super().__init__()

        # Input width uses the same formula as the encoder's layer-5 output dimension.
        in_channels = make_divisible(512 * config.width_multiplier, divisor=8)
        out_channels = config.aspp_out_channels

        # Exactly three dilated branches are expected; the projection below assumes five branches in total.
        if len(config.atrous_rates) != 3:
            raise ValueError("Expected 3 values for atrous_rates")

        # Branch 1: plain 1x1 projection.
        branches = [
            MobileViTV2ConvLayer(
                config,
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=1,
                use_activation="relu",
            )
        ]

        # Branches 2-4: 3x3 convs, one per configured atrous rate.
        for rate in config.atrous_rates:
            branches.append(
                MobileViTV2ConvLayer(
                    config,
                    in_channels=in_channels,
                    out_channels=out_channels,
                    kernel_size=3,
                    dilation=rate,
                    use_activation="relu",
                )
            )

        # Branch 5: image-level pooling.
        branches.append(MobileViTV2ASPPPooling(config, in_channels, out_channels))

        self.convs = nn.ModuleList(branches)

        # Fuse the five concatenated branch outputs back down to out_channels.
        self.project = MobileViTV2ConvLayer(
            config, in_channels=5 * out_channels, out_channels=out_channels, kernel_size=1, use_activation="relu"
        )

        self.dropout = nn.Dropout(p=config.aspp_dropout_prob)

    def forward(self, features: torch.Tensor) -> torch.Tensor:
        """Apply every branch to `features`, concatenate on the channel axis, project and apply dropout."""
        branch_outputs = [branch(features) for branch in self.convs]
        stacked = torch.cat(branch_outputs, dim=1)
        return self.dropout(self.project(stacked))
# Adapted from transformers.models.mobilevit.modeling_mobilevit.MobileViTDeepLabV3, with MobileViT renamed to MobileViTV2.
class MobileViTV2DeepLabV3(nn.Module):
    """
    DeepLabv3 architecture: https://arxiv.org/abs/1706.05587
    """

    def __init__(self, config: MobileViTV2Config) -> None:
        super().__init__()
        self.aspp = MobileViTV2ASPP(config)

        self.dropout = nn.Dropout2d(config.classifier_dropout_prob)

        # Per-pixel classifier: biased 1x1 conv from the ASPP width to one channel per label,
        # without normalization or activation.
        self.classifier = MobileViTV2ConvLayer(
            config,
            in_channels=config.aspp_out_channels,
            out_channels=config.num_labels,
            kernel_size=1,
            use_normalization=False,
            use_activation=False,
            bias=True,
        )

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Compute segmentation logits from the last entry of `hidden_states`.

        Only `hidden_states[-1]` is used; the caller in this file passes the backbone's collected
        hidden states.
        """
        pyramid_features = self.aspp(hidden_states[-1])
        return self.classifier(self.dropout(pyramid_features))


@add_start_docstrings(
    """
    MobileViTV2 模型，顶部带有语义分割头，例如用于 Pascal VOC 数据集。
    """,
    MOBILEVITV2_START_DOCSTRING,
)
class MobileViTV2ForSemanticSegmentation(MobileViTV2PreTrainedModel):
    def __init__(self, config: MobileViTV2Config) -> None:
        super().__init__(config)

        self.num_labels = config.num_labels
        # Backbone without the pooled output; the head only needs the hidden states.
        self.mobilevitv2 = MobileViTV2Model(config, expand_output=False)
        # DeepLabV3 segmentation head.
        self.segmentation_head = MobileViTV2DeepLabV3(config)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(MOBILEVITV2_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=SemanticSegmenterOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs,
    ) -> Union[SemanticSegmenterOutput, Tuple[torch.Tensor, ...]]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
            Ground truth semantic segmentation maps for computing the loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1`, a classification loss is computed (Cross-Entropy).

        Returns:

        """
        # NOTE: the bare `Returns:` line above is the placeholder that `replace_return_docstrings`
        # substitutes with the generated return documentation; keep it in the docstring.
        #
        # `**kwargs` is accepted for signature compatibility only. It is not forwarded, because
        # `MobileViTV2Model.forward` takes only pixel_values, output_hidden_states and return_dict.
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # The segmentation head consumes the intermediate hidden states, so they are always
        # requested from the backbone regardless of what the caller asked to get back.
        outputs = self.mobilevitv2(
            pixel_values,
            output_hidden_states=True,
            return_dict=return_dict,
        )

        # With expand_output=False the tuple form is (last_hidden_state, hidden_states).
        encoder_hidden_states = outputs.hidden_states if return_dict else outputs[1]

        logits = self.segmentation_head(encoder_hidden_states)

        loss = None
        if labels is not None:
            if self.config.num_labels == 1:
                raise ValueError("The number of labels should be greater than one")

            # Upsample the logits to the spatial size of the labels before computing the loss.
            upsampled_logits = nn.functional.interpolate(
                logits, size=labels.shape[-2:], mode="bilinear", align_corners=False
            )
            # Positions labelled with `semantic_loss_ignore_index` do not contribute to the loss.
            loss_fct = CrossEntropyLoss(ignore_index=self.config.semantic_loss_ignore_index)
            loss = loss_fct(upsampled_logits, labels)

        if not return_dict:
            # Hidden states are only returned when the caller asked for them.
            if output_hidden_states:
                output = (logits,) + outputs[1:]
            else:
                output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SemanticSegmenterOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states if output_hidden_states else None,
            attentions=None,
        )

`.\models\mobilevitv2\init.py`

# 版权声明和许可信息，标明代码版权和使用许可条件
from typing import TYPE_CHECKING

# 导入必要的异常和模块
from ...utils import (
    OptionalDependencyNotAvailable,
    _LazyModule,
    is_torch_available,
    is_vision_available,
)

# Lazy-import table: maps each submodule name to the public names it exports.
_import_structure = {
    "configuration_mobilevitv2": [
        "MOBILEVITV2_PRETRAINED_CONFIG_ARCHIVE_MAP",
        "MobileViTV2Config",
        "MobileViTV2OnnxConfig",
    ],
}

# Register the modeling names only when torch is available. A missing torch is
# signalled via OptionalDependencyNotAvailable and deliberately ignored here.
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # torch is available: add the PyTorch model classes to the import table.
    _import_structure["modeling_mobilevitv2"] = [
        "MOBILEVITV2_PRETRAINED_MODEL_ARCHIVE_LIST",
        "MobileViTV2ForImageClassification",
        "MobileViTV2ForSemanticSegmentation",
        "MobileViTV2Model",
        "MobileViTV2PreTrainedModel",
    ]

# Under static type checking, perform the real imports so the names resolve.
if TYPE_CHECKING:
    # Configuration classes and constants (no optional dependency needed).
    from .configuration_mobilevitv2 import (
        MOBILEVITV2_PRETRAINED_CONFIG_ARCHIVE_MAP,
        MobileViTV2Config,
        MobileViTV2OnnxConfig,
    )

    # Same torch availability guard as above, this time around the real import.
    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # Model classes and constants (require torch).
        from .modeling_mobilevitv2 import (
            MOBILEVITV2_PRETRAINED_MODEL_ARCHIVE_LIST,
            MobileViTV2ForImageClassification,
            MobileViTV2ForSemanticSegmentation,
            MobileViTV2Model,
            MobileViTV2PreTrainedModel,
        )

# At runtime, swap this module for a _LazyModule built from the table above.
else:
    import sys

    # Replace this module's entry in sys.modules with the _LazyModule wrapper
    # (presumably it resolves the listed names on first access — see _LazyModule).
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)

`.\models\mpnet\configuration_mpnet.py`

# coding=utf-8
# Copyright 2018 The HuggingFace Inc. team, Microsoft Corporation.
# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

""" MPNet model configuration"""

# 引入必要的模块和函数
from ...configuration_utils import PretrainedConfig
from ...utils import logging

# Module-level logger, keyed by this module's name.
logger = logging.get_logger(__name__)

# Maps each pretrained checkpoint name to the URL of its hosted config.json.
MPNET_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "microsoft/mpnet-base": "https://huggingface.co/microsoft/mpnet-base/resolve/main/config.json",
}


class MPNetConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`MPNetModel`] or a [`TFMPNetModel`]. It is used to
    instantiate a MPNet model according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the MPNet
    [microsoft/mpnet-base](https://huggingface.co/microsoft/mpnet-base) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 30527):
            Vocabulary size of the MPNet model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`MPNetModel`] or [`TFMPNetModel`].
        hidden_size (`int`, *optional*, defaults to 768):
            Dimensionality of the encoder layers and the pooler layer.
        num_hidden_layers (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 12):
            Number of attention heads for each attention layer in the Transformer encoder.
        intermediate_size (`int`, *optional*, defaults to 3072):
            Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
        hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"silu"` and `"gelu_new"` are supported.
        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
            The dropout ratio for the attention probabilities.
        max_position_embeddings (`int`, *optional*, defaults to 512):
            The maximum sequence length that this model might ever be used with. Typically set this to something large
            just in case (e.g., 512 or 1024 or 2048).
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
            The epsilon used by the layer normalization layers.
        relative_attention_num_buckets (`int`, *optional*, defaults to 32):
            The number of buckets to use for each attention layer.
        pad_token_id (`int`, *optional*, defaults to 1):
            Padding token id, forwarded to [`PretrainedConfig`].
        bos_token_id (`int`, *optional*, defaults to 0):
            Beginning-of-sequence token id, forwarded to [`PretrainedConfig`].
        eos_token_id (`int`, *optional*, defaults to 2):
            End-of-sequence token id, forwarded to [`PretrainedConfig`].

    Examples:

    ```python
    >>> from transformers import MPNetModel, MPNetConfig

    >>> # Initializing a MPNet mpnet-base style configuration
    >>> configuration = MPNetConfig()

    >>> # Initializing a model from the mpnet-base style configuration
    >>> model = MPNetModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    # Identifier for this configuration family.
    # NOTE(review): restored from the upstream file; confirm it is used for auto-class lookup/serialization.
    model_type = "mpnet"

    def __init__(
        self,
        vocab_size=30527,
        hidden_size=768,
        num_hidden_layers=12,
        num_attention_heads=12,
        intermediate_size=3072,
        hidden_act="gelu",
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        max_position_embeddings=512,
        initializer_range=0.02,
        layer_norm_eps=1e-12,
        relative_attention_num_buckets=32,
        pad_token_id=1,
        bos_token_id=0,
        eos_token_id=2,
        **kwargs,
    ):
        # The special-token ids and any extra keyword arguments are handled by the base class.
        super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)

        # Architecture hyper-parameters are stored verbatim as attributes.
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.hidden_act = hidden_act
        self.intermediate_size = intermediate_size
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.max_position_embeddings = max_position_embeddings
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
        self.relative_attention_num_buckets = relative_attention_num_buckets

`.\models\mpnet\modeling_mpnet.py`

# coding=utf-8
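# Copyright 2018 The HuggingFace Inc. team, Microsoft Corporation.
# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.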
"""PyTorch MPNet 模型。"""


import math  # 导入数学函数库
from typing import Optional, Tuple, Union  # 引入类型提示相关的库

import torch  # 导入 PyTorch 库
from torch import nn  # 导入 PyTorch 的神经网络模块
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss  # 导入损失函数

from ...activations import ACT2FN, gelu  # 导入激活函数
from ...modeling_outputs import (  # 导入模型输出相关的类
    BaseModelOutput,
    BaseModelOutputWithPooling,
    MaskedLMOutput,
    MultipleChoiceModelOutput,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutput,
    TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel  # 导入预训练模型基类
from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer  # 导入模型工具函数
from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging  # 导入工具函数和日志模块
from .configuration_mpnet import MPNetConfig  # 导入 MPNet 模型的配置文件


logger = logging.get_logger(__name__)  # module-level logger, keyed by this module's name

_CHECKPOINT_FOR_DOC = "microsoft/mpnet-base"  # checkpoint name passed to the docstring decorators below
_CONFIG_FOR_DOC = "MPNetConfig"  # configuration class name passed to the docstring decorators below


MPNET_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "microsoft/mpnet-base",  # the only checkpoint listed for MPNet
]


class MPNetPreTrainedModel(PreTrainedModel):
    """Common base for the MPNet models: binds the config class, the checkpoint list and weight init."""

    config_class = MPNetConfig
    pretrained_model_archive_map = MPNET_PRETRAINED_MODEL_ARCHIVE_LIST
    base_model_prefix = "mpnet"

    def _init_weights(self, module):
        """Initialize the weights of one sub-module in place, according to its type."""
        if isinstance(module, nn.Linear):
            # Normal(0, initializer_range) weights, zero bias when a bias exists.
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            bias = module.bias
            if bias is not None:
                bias.data.zero_()
            return

        if isinstance(module, nn.Embedding):
            # Same normal init as linear layers; the padding row (if any) is zeroed afterwards.
            table = module.weight.data
            table.normal_(mean=0.0, std=self.config.initializer_range)
            pad_row = module.padding_idx
            if pad_row is not None:
                table[pad_row].zero_()
            return

        if isinstance(module, nn.LayerNorm):
            # Zero shift and unit scale.
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)


class MPNetEmbeddings(nn.Module):
    """Word + position embeddings followed by LayerNorm and dropout."""

    def __init__(self, config):
        super().__init__()
        # The padding index is fixed to 1 and shared by both embedding tables.
        self.padding_idx = 1
        hidden = config.hidden_size
        max_positions = config.max_position_embeddings

        self.word_embeddings = nn.Embedding(config.vocab_size, hidden, padding_idx=self.padding_idx)
        self.position_embeddings = nn.Embedding(max_positions, hidden, padding_idx=self.padding_idx)

        self.LayerNorm = nn.LayerNorm(hidden, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # Non-persistent buffer holding 0..max_positions-1 with shape (1, max_positions).
        self.register_buffer("position_ids", torch.arange(max_positions).expand((1, -1)), persistent=False)

    def forward(self, input_ids=None, position_ids=None, inputs_embeds=None, **kwargs):
        """Embed `input_ids` (or take `inputs_embeds` as given), add position embeddings, normalize, drop out."""
        have_ids = input_ids is not None

        # Derive position ids when the caller supplied none.
        if position_ids is None:
            position_ids = (
                create_position_ids_from_input_ids(input_ids, self.padding_idx)
                if have_ids
                else self.create_position_ids_from_inputs_embeds(inputs_embeds)
            )

        input_shape = input_ids.size() if have_ids else inputs_embeds.size()[:-1]
        seq_length = input_shape[1]

        # Last-resort fallback: slice the pre-registered position buffer.
        if position_ids is None:
            position_ids = self.position_ids[:, :seq_length]

        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)

        combined = inputs_embeds + self.position_embeddings(position_ids)
        return self.dropout(self.LayerNorm(combined))

    def create_position_ids_from_inputs_embeds(self, inputs_embeds):
        """
        Build sequential position ids for embeddings passed in directly.

        Padding cannot be detected from embeddings, so every position gets a consecutive id starting at
        `padding_idx + 1`.

        Args:
            inputs_embeds: torch.Tensor

        Returns: torch.Tensor
        """
        leading_shape = inputs_embeds.size()[:-1]
        first_id = self.padding_idx + 1
        last_id_exclusive = leading_shape[1] + first_id

        ids = torch.arange(first_id, last_id_exclusive, dtype=torch.long, device=inputs_embeds.device)
        # Add a leading axis, then expand to the input's leading shape.
        return ids.unsqueeze(0).expand(leading_shape)
class MPNetSelfAttention(nn.Module):
    """Multi-head scaled dot-product self-attention with an optional additive position bias."""

    def __init__(self, config):
        super().__init__()
        divisible = config.hidden_size % config.num_attention_heads == 0
        # A non-divisible hidden size is only tolerated when the config carries `embedding_size`.
        if not divisible and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
                f"heads ({config.num_attention_heads})"
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        # Query / key / value projections, then the output projection.
        self.q = nn.Linear(config.hidden_size, self.all_head_size)
        self.k = nn.Linear(config.hidden_size, self.all_head_size)
        self.v = nn.Linear(config.hidden_size, self.all_head_size)
        self.o = nn.Linear(config.hidden_size, config.hidden_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

    def transpose_for_scores(self, x):
        """Split the last axis into (heads, head_size) and move the heads axis to position 1."""
        per_head = x.view(*x.size()[:-1], self.num_attention_heads, self.attention_head_size)
        return per_head.permute(0, 2, 1, 3)

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        position_bias=None,
        output_attentions=False,
        **kwargs,
    ):
        """Return `(output,)`, or `(output, attention_probs)` when `output_attentions` is truthy."""
        q, k, v = (self.transpose_for_scores(proj(hidden_states)) for proj in (self.q, self.k, self.v))

        # Scaled dot-product scores between every query and key.
        scores = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(self.attention_head_size)

        # The position bias is added in place; the attention mask is added out of place.
        if position_bias is not None:
            scores += position_bias
        if attention_mask is not None:
            scores = scores + attention_mask

        # Softmax over the key axis, then dropout on the probabilities.
        probs = self.dropout(nn.functional.softmax(scores, dim=-1))

        if head_mask is not None:
            probs = probs * head_mask

        # Weighted sum of values, then merge the heads back into the last axis.
        context = torch.matmul(probs, v).permute(0, 2, 1, 3).contiguous()
        context = context.view(*context.size()[:-2], self.all_head_size)

        o = self.o(context)
        return (o, probs) if output_attentions else (o,)


class MPNetAttention(nn.Module):
    """Self-attention sub-layer: attention, dropout, residual connection, LayerNorm; supports head pruning."""

    def __init__(self, config):
        super().__init__()
        self.attn = MPNetSelfAttention(config)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        # Heads removed so far; passed to the pruning helper on later calls.
        self.pruned_heads = set()

    def prune_heads(self, heads):
        """Remove the given attention heads from the q/k/v/o projections."""
        if len(heads) == 0:
            return

        attn = self.attn
        heads, index = find_pruneable_heads_and_indices(
            heads, attn.num_attention_heads, attn.attention_head_size, self.pruned_heads
        )

        # q/k/v are pruned with the helper's default dim; the output projection is pruned along dim=1.
        for proj_name in ("q", "k", "v"):
            setattr(attn, proj_name, prune_linear_layer(getattr(attn, proj_name), index))
        attn.o = prune_linear_layer(attn.o, index, dim=1)

        # Keep the bookkeeping in sync with the new projection sizes.
        attn.num_attention_heads = attn.num_attention_heads - len(heads)
        attn.all_head_size = attn.attention_head_size * attn.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        position_bias=None,
        output_attentions=False,
        **kwargs,
    ):
        """Return `(attention_output, *extras)`, where extras are whatever the inner attention returned after its output."""
        inner = self.attn(
            hidden_states,
            attention_mask,
            head_mask,
            position_bias,
            output_attentions=output_attentions,
        )
        # Residual connection around the dropped-out attention output, then LayerNorm.
        residual_sum = self.dropout(inner[0]) + hidden_states
        normalized = self.LayerNorm(residual_sum)
        return (normalized,) + inner[1:]
# Copied from transformers.models.bert.modeling_bert.BertIntermediate
class MPNetIntermediate(nn.Module):
    def __init__(self, config):
        super().__init__()
        # Linear projection from config.hidden_size up to config.intermediate_size.
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        if isinstance(config.hidden_act, str):
            # A string activation name is looked up in the ACT2FN table.
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            # Anything else is used directly as the activation callable.
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # Project, then apply the activation.
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)
        return hidden_states


# Copied from transformers.models.bert.modeling_bert.BertOutput
class MPNetOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        # Linear projection from config.intermediate_size back down to config.hidden_size.
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        # LayerNorm over the hidden dimension, using the configured epsilon.
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        # Dropout with probability config.hidden_dropout_prob.
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        # Project the intermediate activations back to the hidden size.
        hidden_states = self.dense(hidden_states)
        # Dropout is applied before the residual addition.
        hidden_states = self.dropout(hidden_states)
        # Residual connection with `input_tensor`, then LayerNorm.
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states


class MPNetLayer(nn.Module):
    """One encoder layer: attention sub-layer followed by the intermediate and output sub-layers."""

    def __init__(self, config):
        super().__init__()
        self.attention = MPNetAttention(config)
        self.intermediate = MPNetIntermediate(config)
        self.output = MPNetOutput(config)

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        position_bias=None,
        output_attentions=False,
        **kwargs,
    ):
        """Return `(layer_output, *extras)`, where extras are whatever the attention sub-layer returned after its output."""
        attn_result = self.attention(
            hidden_states,
            attention_mask,
            head_mask,
            position_bias=position_bias,
            output_attentions=output_attentions,
        )
        attention_output = attn_result[0]

        # The output sub-layer receives both the intermediate activations and the attention output.
        layer_output = self.output(self.intermediate(attention_output), attention_output)

        # Anything after the first attention result is passed through unchanged.
        return (layer_output,) + attn_result[1:]


class MPNetEncoder(nn.Module):
    """Stack of `MPNetLayer` modules that all receive the same relative-position attention bias."""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.n_heads = config.num_attention_heads
        # One MPNetLayer per hidden layer requested by the config.
        self.layer = nn.ModuleList([MPNetLayer(config) for _ in range(config.num_hidden_layers)])
        # Embedding table indexed by relative-position bucket, one value per attention head.
        self.relative_attention_bias = nn.Embedding(config.relative_attention_num_buckets, self.n_heads)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = False,
        **kwargs,
    ):
        """
        Run every layer in order.

        Args:
            hidden_states: input to the first layer.
            attention_mask: passed unchanged to every layer.
            head_mask: indexable with one entry per layer, or `None` to pass `None` to every layer.
            output_attentions: collect the second output of every layer.
            output_hidden_states: collect the input of every layer plus the final output.
            return_dict: return a `BaseModelOutput` instead of a tuple.

        Returns:
            A `BaseModelOutput`, or a tuple of the non-`None` values among
            `(last_hidden_state, all_hidden_states, all_attentions)`.
        """
        # The bias is computed once and passed to every layer.
        position_bias = self.compute_position_bias(hidden_states)

        all_hidden_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            # `head_mask` defaults to None, so guard the per-layer lookup instead of indexing None.
            layer_head_mask = head_mask[i] if head_mask is not None else None

            layer_outputs = layer_module(
                hidden_states,
                attention_mask,
                layer_head_mask,
                position_bias,
                output_attentions=output_attentions,
                **kwargs,
            )
            hidden_states = layer_outputs[0]

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        # Also record the output of the last layer.
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_attentions,
        )

    def compute_position_bias(self, x, position_ids=None, num_buckets=32):
        """
        Build the relative-position attention bias for `x`.

        Args:
            x: tensor whose first two sizes are read as batch size and sequence length.
            position_ids: optional 2-D `(batch, seq)` positions; when omitted, positions `0..seq-1` are used.
            num_buckets: number of relative-position buckets passed to `relative_position_bucket`.

        Returns:
            A contiguous tensor of shape `(batch, n_heads, seq, seq)`.
        """
        bsz, qlen, klen = x.size(0), x.size(1), x.size(1)

        if position_ids is not None:
            context_position = position_ids[:, :, None]
            memory_position = position_ids[:, None, :]
        else:
            # Create the default positions directly on the input's device.
            context_position = torch.arange(qlen, dtype=torch.long, device=x.device)[:, None]
            memory_position = torch.arange(klen, dtype=torch.long, device=x.device)[None, :]

        # Entry [q, k] is the key position minus the query position.
        relative_position = memory_position - context_position

        rp_bucket = self.relative_position_bucket(relative_position, num_buckets=num_buckets)
        rp_bucket = rp_bucket.to(x.device)

        # Embedding lookup appends a trailing heads axis.
        values = self.relative_attention_bias(rp_bucket)

        if values.dim() == 3:
            # Default positions: (q, k, heads) -> (1, heads, q, k).
            values = values.permute([2, 0, 1]).unsqueeze(0)
        else:
            # Explicit position_ids: (batch, q, k, heads) -> (batch, heads, q, k).
            values = values.permute([0, 3, 1, 2])

        values = values.expand((bsz, -1, qlen, klen)).contiguous()
        return values

    @staticmethod
    def relative_position_bucket(relative_position, num_buckets=32, max_distance=128):
        """
        Map signed relative positions to bucket indices in `[0, num_buckets)`.

        Half of the buckets are used for each sign of the offset. Within a half, offsets whose magnitude is
        below `max_exact` each get their own bucket; larger magnitudes share logarithmically spaced buckets,
        clamped to the last bucket of the half.

        Args:
            relative_position: integer tensor of relative offsets.
            num_buckets: total number of buckets across both signs.
            max_distance: magnitude used as the upper end of the logarithmic scale.

        Returns:
            Tensor of bucket indices with the same shape as `relative_position`.
        """
        ret = 0
        n = -relative_position

        # Split the buckets between the two signs; the upper half is used where n < 0.
        num_buckets //= 2
        ret += (n < 0).to(torch.long) * num_buckets
        n = torch.abs(n)

        # Magnitudes below max_exact map one-to-one onto buckets.
        max_exact = num_buckets // 2
        is_small = n < max_exact

        # Larger magnitudes are placed on a log scale between max_exact and max_distance.
        val_if_large = max_exact + (
            torch.log(n.float() / max_exact) / math.log(max_distance / max_exact) * (num_buckets - max_exact)
        ).to(torch.long)

        # Clamp to the last bucket of the half.
        val_if_large = torch.min(val_if_large, torch.full_like(val_if_large, num_buckets - 1))
        ret += torch.where(is_small, n, val_if_large)
        return ret
    Args:
        input_ids (`torch.LongTensor` of shape `({0})`):
            # 输入序列标记在词汇表中的索引。使用 AutoTokenizer 可获取这些索引。
            # 参见 PreTrainedTokenizer.encode 和 PreTrainedTokenizer.__call__ 获取更多细节。
            # 什么是输入 ID？请参阅 ../glossary#input-ids

        attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
            # 遮罩，用于避免在填充标记索引上执行注意力计算。
            # 遮罩值在 [0, 1] 范围内选择：
            # - 1 表示不遮罩的标记，
            # - 0 表示遮罩的标记。
            # 什么是注意力遮罩？请参阅 ../glossary#attention-mask

        position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            # 输入序列标记在位置嵌入中的位置索引。选择范围为 [0, config.max_position_embeddings - 1]。
            # 什么是位置 ID？请参阅 ../glossary#position-ids

        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            # 用于将自注意力模块的特定头部置零的遮罩。
            # 遮罩值在 [0, 1] 范围内选择：
            # - 1 表示不遮罩的头部，
            # - 0 表示遮罩的头部。

        inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
            # 可选，可以直接传递嵌入表示而不是传递 input_ids。如果需要对如何将 input_ids 索引转换为关联向量
            # 拥有更多控制权，这将非常有用，胜过于模型的内部嵌入查找矩阵。

        output_attentions (`bool`, *optional*):
            # 是否返回所有注意力层的注意力张量。查看返回张量中的 `attentions` 以获取更多细节。

        output_hidden_states (`bool`, *optional*):
            # 是否返回所有层的隐藏状态。查看返回张量中的 `hidden_states` 以获取更多细节。

        return_dict (`bool`, *optional*):
            # 是否返回 `utils.ModelOutput` 而不是普通元组。
# The decorator prepends the summary line and the shared start docstring to the class documentation.
@add_start_docstrings(
    "The bare MPNet Model transformer outputting raw hidden-states without any specific head on top.",
    MPNET_START_DOCSTRING,
)
class MPNetModel(MPNetPreTrainedModel):
    # Embeddings -> encoder -> optional pooler.
    def __init__(self, config, add_pooling_layer=True):
        super().__init__(config)
        self.config = config

        self.embeddings = MPNetEmbeddings(config)
        self.encoder = MPNetEncoder(config)
        # The pooler is optional; heads that only need per-token outputs can disable it.
        if add_pooling_layer:
            self.pooler = MPNetPooler(config)
        else:
            self.pooler = None

        # Hand over to the base class hook that finishes initialization.
        self.post_init()

    # Accessor for the word-embedding table.
    def get_input_embeddings(self):
        return self.embeddings.word_embeddings

    # Replace the word-embedding table.
    def set_input_embeddings(self, value):
        self.embeddings.word_embeddings = value

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer_index, head_ids in heads_to_prune.items():
            # Delegate to the attention module of the addressed encoder layer.
            self.encoder.layer[layer_index].attention.prune_heads(head_ids)

    # The decorators below assemble the forward documentation from the shared input docstring
    # and a generated code sample.
    @add_start_docstrings_to_model_forward(MPNET_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=BaseModelOutputWithPooling,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs,
    ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPooling]:
        # Unset output flags fall back to the values stored on the config.
        if output_attentions is None:
            output_attentions = self.config.output_attentions
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states
        if return_dict is None:
            return_dict = self.config.use_return_dict

        # Exactly one of input_ids / inputs_embeds must be provided.
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        if input_ids is None and inputs_embeds is None:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if input_ids is not None:
            # Give the base-class helper the chance to warn about padding without a mask.
            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
            input_shape = input_ids.size()
            device = input_ids.device
        else:
            # Drop the trailing embedding axis.
            input_shape = inputs_embeds.size()[:-1]
            device = inputs_embeds.device

        # Without a mask, attend to every position.
        if attention_mask is None:
            attention_mask = torch.ones(input_shape, device=device)
        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)

        # Normalize the head mask through the base-class helper, one entry per hidden layer.
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        embedding_output = self.embeddings(input_ids=input_ids, position_ids=position_ids, inputs_embeds=inputs_embeds)
        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask=extended_attention_mask,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = encoder_outputs[0]
        pooled_output = None if self.pooler is None else self.pooler(sequence_output)

        if return_dict:
            return BaseModelOutputWithPooling(
                last_hidden_state=sequence_output,
                pooler_output=pooled_output,
                hidden_states=encoder_outputs.hidden_states,
                attentions=encoder_outputs.attentions,
            )

        # Tuple form: sequence output, pooled output, then the remaining encoder outputs.
        return (sequence_output, pooled_output) + encoder_outputs[1:]
class MPNetForMaskedLM(MPNetPreTrainedModel):
    # Keys handed to the base class as tied weights.
    # NOTE(review): presumably tied to the input word embeddings — confirm in MPNetPreTrainedModel.
    _tied_weights_keys = ["lm_head.decoder"]

    def __init__(self, config):
        super().__init__(config)

        # Encoder without a pooling layer, followed by the vocabulary projection head.
        self.mpnet = MPNetModel(config, add_pooling_layer=False)
        self.lm_head = MPNetLMHead(config)

        # Initialize weights and apply final processing.
        self.post_init()

    def get_output_embeddings(self):
        # The output embedding layer is the decoder inside the LM head.
        return self.lm_head.decoder

    def set_output_embeddings(self, new_embeddings):
        # Swap in a replacement decoder for the LM head.
        self.lm_head.decoder = new_embeddings

    @add_start_docstrings_to_model_forward(MPNET_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=MaskedLMOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], MaskedLMOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        """
        # Fall back to the config default when the caller did not choose an output format.
        if return_dict is None:
            return_dict = self.config.use_return_dict

        encoder_outputs = self.mpnet(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # The first encoder output is the per-token hidden state; project it onto the vocabulary.
        logits = self.lm_head(encoder_outputs[0])

        # The loss is only computed when labels are supplied.
        loss = None
        if labels is not None:
            flat_logits = logits.view(-1, self.config.vocab_size)
            loss = CrossEntropyLoss()(flat_logits, labels.view(-1))

        if return_dict:
            return MaskedLMOutput(
                loss=loss,
                logits=logits,
                hidden_states=encoder_outputs.hidden_states,
                attentions=encoder_outputs.attentions,
            )

        # Tuple form: logits followed by whatever the encoder returned from index 2 on,
        # with the loss prepended when it exists.
        tuple_output = (logits,) + encoder_outputs[2:]
        if loss is None:
            return tuple_output
        return (loss,) + tuple_output
    """MPNet Head for masked and permuted language modeling."""
    
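    # NOTE(review): no `class` statement is visible above this docstring; the methods below look like the body of
    # the LM head instantiated elsewhere as `MPNetLMHead(config)` — confirm against the original file.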
    def __init__(self, config):
        """Create the head's layers: dense -> layer norm -> bias-free decoder with a separately held bias.

        Args:
            config: object providing `hidden_size`, `vocab_size` and `layer_norm_eps`.
        """
        super().__init__()
        hidden_size, vocab_size = config.hidden_size, config.vocab_size

        # Hidden-to-hidden transform and its normalization.
        self.dense = nn.Linear(hidden_size, hidden_size)
        self.layer_norm = nn.LayerNorm(hidden_size, eps=config.layer_norm_eps)

        # Projection to vocabulary size; the bias lives in its own parameter.
        self.decoder = nn.Linear(hidden_size, vocab_size, bias=False)
        self.bias = nn.Parameter(torch.zeros(vocab_size))

        # Link the two so the bias is resized together with the decoder in `resize_token_embeddings`.
        self.decoder.bias = self.bias
    
    # 定义前向传播方法，处理输入特征和其他关键字参数
    def forward(self, features, **kwargs):
        """Map `features` to vocabulary scores.

        Applies, in order: `self.dense`, `gelu`, `self.layer_norm`, then `self.decoder`
        (whose bias is the head's `bias` parameter). Extra keyword arguments are accepted and ignored.
        """
        normalized = self.layer_norm(gelu(self.dense(features)))

        # Project onto the vocabulary dimension.
        return self.decoder(normalized)
# Sequence classification/regression model: the summary below is combined with the shared MPNet docstring.
@add_start_docstrings(
    """
    MPNet Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
    output) e.g. for GLUE tasks.
    """,
    MPNET_START_DOCSTRING,
)
class MPNetForSequenceClassification(MPNetPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.num_labels = config.num_labels
        # Encoder without a pooling layer; the classification head reads the sequence output directly.
        self.mpnet = MPNetModel(config, add_pooling_layer=False)
        self.classifier = MPNetClassificationHead(config)

        # Initialize weights and apply final processing.
        self.post_init()

    @add_start_docstrings_to_model_forward(MPNET_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=SequenceClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        # Fall back to the config default when the caller did not choose an output format.
        if return_dict is None:
            return_dict = self.config.use_return_dict

        encoder_outputs = self.mpnet(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # The classification head consumes the sequence output (first encoder output).
        logits = self.classifier(encoder_outputs[0])

        loss = None
        if labels is not None:
            problem_type = self.config.problem_type
            if problem_type is None:
                # Infer the problem type from the label count and label dtype, then remember it on the config.
                if self.num_labels == 1:
                    problem_type = "regression"
                elif self.num_labels > 1 and labels.dtype in (torch.long, torch.int):
                    problem_type = "single_label_classification"
                else:
                    problem_type = "multi_label_classification"
                self.config.problem_type = problem_type

            if problem_type == "regression":
                mse = MSELoss()
                # A single regression target is compared after squeezing both tensors.
                if self.num_labels == 1:
                    loss = mse(logits.squeeze(), labels.squeeze())
                else:
                    loss = mse(logits, labels)
            elif problem_type == "single_label_classification":
                loss = CrossEntropyLoss()(logits.view(-1, self.num_labels), labels.view(-1))
            elif problem_type == "multi_label_classification":
                loss = BCEWithLogitsLoss()(logits, labels)

        if return_dict:
            return SequenceClassifierOutput(
                loss=loss,
                logits=logits,
                hidden_states=encoder_outputs.hidden_states,
                attentions=encoder_outputs.attentions,
            )

        # Tuple form: logits plus the encoder extras from index 2 on, loss first when present.
        tuple_output = (logits,) + encoder_outputs[2:]
        if loss is None:
            return tuple_output
        return (loss,) + tuple_output
@add_start_docstrings(
    """
    MPNet Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
    softmax) e.g. for RocStories/SWAG tasks.
    """,
    MPNET_START_DOCSTRING,
)
class MPNetForMultipleChoice(MPNetPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        # Encoder with its default pooling layer; the pooled output feeds the classifier.
        self.mpnet = MPNetModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # One score per (example, choice) row; the scores are regrouped per example in `forward`.
        self.classifier = nn.Linear(config.hidden_size, 1)

        # Initialize weights and apply final processing.
        self.post_init()

    @add_start_docstrings_to_model_forward(MPNET_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=MultipleChoiceModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], MultipleChoiceModelOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        """
        # `None` means "not chosen by the caller": only then does the config default apply.
        # (The flags default to None rather than False so this fallback can actually trigger.)
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # The number of choices is the size of the second dimension of whichever input was given.
        num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]

        # Merge the leading dimensions so every (example, choice) pair becomes one encoder row.
        flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
        flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
        flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
        flat_inputs_embeds = (
            inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
            if inputs_embeds is not None
            else None
        )

        outputs = self.mpnet(
            flat_input_ids,
            position_ids=flat_position_ids,
            attention_mask=flat_attention_mask,
            head_mask=head_mask,
            inputs_embeds=flat_inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Second encoder output is the pooled representation.
        pooled_output = self.dropout(outputs[1])
        logits = self.classifier(pooled_output)

        # Regroup the per-row scores into one row of `num_choices` scores per example.
        reshaped_logits = logits.view(-1, num_choices)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(reshaped_logits, labels)

        if not return_dict:
            # Tuple form: logits plus the encoder extras from index 2 on, loss first when present.
            output = (reshaped_logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return MultipleChoiceModelOutput(
            loss=loss,
            logits=reshaped_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
@add_start_docstrings(
    """
    MPNet Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
    Named-Entity-Recognition (NER) tasks.
    """,
    MPNET_START_DOCSTRING,
)
class MPNetForTokenClassification(MPNetPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        # Encoder without a pooling layer, then dropout and a per-token linear classifier.
        self.mpnet = MPNetModel(config, add_pooling_layer=False)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing.
        self.post_init()

    @add_start_docstrings_to_model_forward(MPNET_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TokenClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        """
        # Fall back to the config default when the caller did not choose an output format.
        if return_dict is None:
            return_dict = self.config.use_return_dict

        encoder_outputs = self.mpnet(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Dropout on the per-token hidden states, then one score per label for every token.
        token_states = self.dropout(encoder_outputs[0])
        logits = self.classifier(token_states)

        loss = None
        if labels is not None:
            # Flatten tokens so each one is scored as an independent classification example.
            loss = CrossEntropyLoss()(logits.view(-1, self.num_labels), labels.view(-1))

        if return_dict:
            return TokenClassifierOutput(
                loss=loss,
                logits=logits,
                hidden_states=encoder_outputs.hidden_states,
                attentions=encoder_outputs.attentions,
            )

        # Tuple form: logits plus the encoder extras from index 2 on, loss first when present.
        tuple_output = (logits,) + encoder_outputs[2:]
        if loss is None:
            return tuple_output
        return (loss,) + tuple_output
    def __init__(self, config):
        """Create the head's layers: a hidden-to-hidden dense layer, dropout, and the label projection.

        Args:
            config: object providing `hidden_size`, `hidden_dropout_prob` and `num_labels`.
        """
        # NOTE(review): no `class` statement is visible directly above this method; it looks like the body of the
        # head instantiated elsewhere as `MPNetClassificationHead(config)` — confirm against the original file.
        super().__init__()
        hidden_size = config.hidden_size

        self.dense = nn.Linear(hidden_size, hidden_size)
        # A single dropout module, applied twice in `forward`.
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.out_proj = nn.Linear(hidden_size, config.num_labels)

    def forward(self, features, **kwargs):
        """Score a sequence from the hidden state at position 0.

        Takes `features[:, 0, :]`, then applies dropout -> dense -> tanh -> dropout -> out_proj and returns the
        result. Extra keyword arguments are accepted and ignored.
        """
        # Position 0 along the second axis stands in for the whole sequence
        # (assumed to be the start-of-sequence token, analogous to BERT's [CLS] — TODO confirm).
        first_token = self.dropout(features[:, 0, :])
        activated = torch.tanh(self.dense(first_token))
        return self.out_proj(self.dropout(activated))
# MPNet question-answering model for extractive tasks such as SQuAD: a linear layer on top of the hidden states
# produces the span-start and span-end logits.
@add_start_docstrings(
    """
    MPNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
    layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
    """,
    MPNET_START_DOCSTRING,
)
class MPNetForQuestionAnswering(MPNetPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.num_labels = config.num_labels
        # Encoder without a pooling layer; every token position is scored.
        self.mpnet = MPNetModel(config, add_pooling_layer=False)
        # Per-token projection to `num_labels` values; `forward` splits them into start and end logits,
        # so `num_labels` is expected to be 2.
        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing.
        self.post_init()

    @add_start_docstrings_to_model_forward(MPNET_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=QuestionAnsweringModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        start_positions: Optional[torch.LongTensor] = None,
        end_positions: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], QuestionAnsweringModelOutput]:
        r"""
        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        """

        # Fall back to the config default when the caller did not choose an output format.
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.mpnet(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        # Score every token, then split the last dimension into the start and end logits.
        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        total_loss = None
        # The loss needs both span endpoints.
        if start_positions is not None and end_positions is not None:
            # Positions may arrive with an extra trailing dimension (e.g. under multi-GPU); drop it.
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)

            # Out-of-range positions are clamped to `ignored_index` (the sequence length),
            # which the loss below is told to ignore.
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        if not return_dict:
            # Tuple form: both logits plus the encoder extras from index 2 on, loss first when present.
            output = (start_logits, end_logits) + outputs[2:]
            return ((total_loss,) + output) if total_loss is not None else output

        return QuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
# Build position ids from token ids: padding tokens keep `padding_idx`, every other token gets its running
# position counted from `padding_idx + 1`. Adapted from fairseq's `utils.make_positions`.
def create_position_ids_from_input_ids(input_ids, padding_idx):
    """
    Replace non-padding symbols with their position numbers.

    Position numbers begin at `padding_idx + 1`; positions holding `padding_idx` are mapped to `padding_idx`.

    Args:
        input_ids: tensor of token ids; the running count is taken along dimension 1.
        padding_idx: id that marks padding positions.

    Returns:
        A `long` tensor with the same shape as `input_ids`.
    """
    # 1 where the token is real, 0 where it is padding.
    not_pad = input_ids.ne(padding_idx).int()
    # Running count of real tokens along each row, cast back to the mask dtype.
    running_count = torch.cumsum(not_pad, dim=1).type_as(not_pad)
    # Zero out the padding slots, then shift everything by `padding_idx`.
    masked_positions = running_count * not_pad
    return masked_positions.long() + padding_idx

`.\models\mpnet\modeling_tf_mpnet.py`

# 指定编码格式为 UTF-8
# 版权声明和许可信息，遵循 Apache License 2.0
# 导入必要的库和模块
# 定义 MPNet 模型的 Tensorflow 实现

from __future__ import annotations

import math  # 导入数学函数库
import warnings  # 导入警告模块，用于警告处理
from typing import Optional, Tuple, Union  # 导入类型提示相关的类和函数

import numpy as np  # 导入 NumPy 库
import tensorflow as tf  # 导入 TensorFlow 库

# 导入自定义的 Tensorflow 激活函数
from ...activations_tf import get_tf_activation
# 导入 Tensorflow 版本的模型输出相关类
from ...modeling_tf_outputs import (
    TFBaseModelOutput,
    TFBaseModelOutputWithPooling,
    TFMaskedLMOutput,
    TFMultipleChoiceModelOutput,
    TFQuestionAnsweringModelOutput,
    TFSequenceClassifierOutput,
    TFTokenClassifierOutput,
)
# 导入 Tensorflow 工具函数
from ...modeling_tf_utils import (
    TFMaskedLanguageModelingLoss,
    TFModelInputType,
    TFMultipleChoiceLoss,
    TFPreTrainedModel,
    TFQuestionAnsweringLoss,
    TFSequenceClassificationLoss,
    TFTokenClassificationLoss,
    get_initializer,
    keras,
    keras_serializable,
    unpack_inputs,
)
# 导入 Tensorflow 工具函数，用于检查嵌入是否在界限内
from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax
# 导入通用的工具函数和模块
from ...utils import (
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
)
# 导入 MPNet 的配置类
from .configuration_mpnet import MPNetConfig

# 获取日志记录器
logger = logging.get_logger(__name__)

# 用于文档的检查点和配置信息
_CHECKPOINT_FOR_DOC = "microsoft/mpnet-base"
_CONFIG_FOR_DOC = "MPNetConfig"

# 预训练模型存档列表
TF_MPNET_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "microsoft/mpnet-base",
]

class TFMPNetPreTrainedModel(TFPreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    # Configuration class associated with this model family.
    config_class = MPNetConfig
    # Attribute name / prefix under which the base model is stored.
    base_model_prefix = "mpnet"


class TFMPNetEmbeddings(keras.layers.Layer):
    """Construct the embeddings from word and position embeddings."""

    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)

        # Token id treated as padding when deriving position ids.
        self.padding_idx = 1
        self.config = config
        self.hidden_size = config.hidden_size
        self.max_position_embeddings = config.max_position_embeddings
        self.initializer_range = config.initializer_range
        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
        self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)

    def build(self, input_shape=None):
        """Create the word and position embedding tables and build the LayerNorm sub-layer.

        Does nothing when the layer is already built, so calling it again never re-creates the weights.
        """
        if self.built:
            return
        self.built = True

        # Each table is created inside its own name scope so the two variables get distinct name prefixes.
        with tf.name_scope("word_embeddings"):
            self.weight = self.add_weight(
                name="weight",
                shape=[self.config.vocab_size, self.hidden_size],
                initializer=get_initializer(initializer_range=self.initializer_range),
            )

        with tf.name_scope("position_embeddings"):
            self.position_embeddings = self.add_weight(
                name="embeddings",
                shape=[self.max_position_embeddings, self.hidden_size],
                initializer=get_initializer(initializer_range=self.initializer_range),
            )

        if getattr(self, "LayerNorm", None) is not None:
            with tf.name_scope(self.LayerNorm.name):
                self.LayerNorm.build([None, None, self.config.hidden_size])

    def create_position_ids_from_input_ids(self, input_ids):
        """
        Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding
        symbols are ignored. This is modified from fairseq's `utils.make_positions`.

        Args:
            input_ids: tf.Tensor of token ids.
        Returns: tf.Tensor of position ids with the dtype of `input_ids`.
        """
        # 1 for real tokens, 0 for padding, kept in the input dtype so the arithmetic below stays integral.
        mask = tf.cast(tf.math.not_equal(input_ids, self.padding_idx), dtype=input_ids.dtype)
        # Running count of real tokens along axis 1, zeroed at padding positions.
        incremental_indices = tf.math.cumsum(mask, axis=1) * mask

        return incremental_indices + self.padding_idx

    def call(self, input_ids=None, position_ids=None, inputs_embeds=None, training=False):
        """
        Applies embedding based on inputs tensor.

        At least one of `input_ids` / `inputs_embeds` must be given; when `input_ids` is given it takes precedence
        and `inputs_embeds` is looked up from the word embedding table.

        Returns:
            final_embeddings (`tf.Tensor`): output embedding tensor.
        """
        assert not (input_ids is None and inputs_embeds is None)

        if input_ids is not None:
            # Validate the ids against the vocabulary size before gathering.
            check_embeddings_within_bounds(input_ids, self.config.vocab_size)
            inputs_embeds = tf.gather(params=self.weight, indices=input_ids)

        input_shape = shape_list(inputs_embeds)[:-1]

        if position_ids is None:
            if input_ids is not None:
                # Derive positions from the token ids; padded tokens stay at the padding position.
                position_ids = self.create_position_ids_from_input_ids(input_ids=input_ids)
            else:
                # Without token ids padding cannot be detected: use consecutive positions
                # starting at padding_idx + 1, with a leading axis of size 1.
                position_ids = tf.expand_dims(
                    tf.range(start=self.padding_idx + 1, limit=input_shape[-1] + self.padding_idx + 1), axis=0
                )

        position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids)
        # Sum of token and position embeddings, then LayerNorm and dropout.
        final_embeddings = inputs_embeds + position_embeds
        final_embeddings = self.LayerNorm(inputs=final_embeddings)
        final_embeddings = self.dropout(inputs=final_embeddings, training=training)

        return final_embeddings
# Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler and adapted for MPNet
class TFMPNetPooler(keras.layers.Layer):
    def __init__(self, config: MPNetConfig, **kwargs):
        super().__init__(**kwargs)

        # Dense layer of width `hidden_size` with a tanh activation.
        self.dense = keras.layers.Dense(
            units=config.hidden_size,
            kernel_initializer=get_initializer(config.initializer_range),
            activation="tanh",
            name="dense",
        )
        self.config = config

    def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
        # "Pool" the model by taking the hidden state at index 0 of the second axis
        # and passing it through the dense layer.
        return self.dense(inputs=hidden_states[:, 0])

    def build(self, input_shape=None):
        # Build only once.
        if self.built:
            return
        self.built = True

        dense_layer = getattr(self, "dense", None)
        if dense_layer is None:
            return
        # Build the dense layer under its own name scope.
        with tf.name_scope(dense_layer.name):
            dense_layer.build([None, None, self.config.hidden_size])


class TFMPNetSelfAttention(keras.layers.Layer):
    """Multi-head scaled dot-product self-attention with an optional additive position bias.

    The dense layers `q`, `k` and `v` produce the queries, keys and values; `o` projects the concatenated
    per-head context back to `config.hidden_size` units.
    """

    def __init__(self, config, **kwargs):
        """Create the projection layers.

        Raises:
            ValueError: if `config.hidden_size` is not a multiple of `config.num_attention_heads`.
        """
        super().__init__(**kwargs)

        # The hidden size must split evenly across the attention heads.
        if config.hidden_size % config.num_attention_heads != 0:
            raise ValueError(
                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
                f"heads ({config.num_attention_heads})"
            )

        self.num_attention_heads = config.num_attention_heads
        # Exact, thanks to the divisibility check above.
        self.attention_head_size = config.hidden_size // config.num_attention_heads
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        # Query / key / value projections and the output projection.
        self.q = keras.layers.Dense(
            self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="q"
        )
        self.k = keras.layers.Dense(
            self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="k"
        )
        self.v = keras.layers.Dense(
            self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="v"
        )
        self.o = keras.layers.Dense(
            config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="o"
        )
        # Dropout applied to the attention probabilities.
        self.dropout = keras.layers.Dropout(config.attention_probs_dropout_prob)
        self.config = config

    def transpose_for_scores(self, x, batch_size):
        """Split the last axis of `x` into heads and move the head axis in front of the sequence axis.

        `x` is reshaped to `(batch_size, -1, num_attention_heads, attention_head_size)` and then transposed with
        permutation `[0, 2, 1, 3]`.
        """
        x = tf.reshape(x, (batch_size, -1, self.num_attention_heads, self.attention_head_size))

        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, hidden_states, attention_mask, head_mask, output_attentions, position_bias=None, training=False):
        """Run self-attention over `hidden_states`.

        Args:
            hidden_states: input tensor; its first dimension is used as the batch size.
            attention_mask: tensor added to the attention scores before the softmax, or `None`.
            head_mask: tensor multiplied with the attention probabilities after dropout, or `None`.
            output_attentions: when truthy, the attention probabilities are returned as well.
            position_bias: optional tensor added to the scaled attention scores.
            training: forwarded to the dropout layer.

        Returns:
            `(o, attention_probs)` if `output_attentions` else `(o,)`, where `o` is the output projection.
        """
        batch_size = shape_list(hidden_states)[0]

        # Linear projections.
        q = self.q(hidden_states)
        k = self.k(hidden_states)
        v = self.v(hidden_states)

        # Split into heads.
        q = self.transpose_for_scores(q, batch_size)
        k = self.transpose_for_scores(k, batch_size)
        v = self.transpose_for_scores(v, batch_size)

        # Dot-product scores, scaled by the square root of the last dimension of the keys.
        attention_scores = tf.matmul(q, k, transpose_b=True)
        dk = tf.cast(shape_list(k)[-1], attention_scores.dtype)
        attention_scores = attention_scores / tf.math.sqrt(dk)

        # Apply the relative position bias when one is supplied.
        if position_bias is not None:
            attention_scores += position_bias

        if attention_mask is not None:
            attention_scores = attention_scores + attention_mask

        attention_probs = stable_softmax(attention_scores, axis=-1)

        attention_probs = self.dropout(attention_probs, training=training)

        # The head mask is applied after dropout.
        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        # Weighted sum of the values, then merge the heads back into one axis.
        c = tf.matmul(attention_probs, v)
        c = tf.transpose(c, perm=[0, 2, 1, 3])
        c = tf.reshape(c, (batch_size, -1, self.all_head_size))

        o = self.o(c)

        outputs = (o, attention_probs) if output_attentions else (o,)
        return outputs

    def build(self, input_shape=None):
        """Build the four projection layers once, each under a name scope matching its name."""
        # Nothing to do when the layer has already been built.
        if self.built:
            return
        self.built = True

        # All four projections are built with the same input shape, in the order q, k, v, o.
        for proj_name in ("q", "k", "v", "o"):
            proj = getattr(self, proj_name, None)
            if proj is not None:
                with tf.name_scope(proj.name):
                    proj.build([None, None, self.config.hidden_size])
# Attention block: self-attention, then dropout, a residual connection and layer normalization.
class TFMPNetAttention(keras.layers.Layer):
    """Wrap `TFMPNetSelfAttention` with dropout, a residual add and `LayerNormalization`."""

    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)

        self.config = config
        self.attn = TFMPNetSelfAttention(config, name="attn")
        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
        self.dropout = keras.layers.Dropout(config.hidden_dropout_prob)

    def prune_heads(self, heads):
        """Head pruning is not implemented for this layer."""
        raise NotImplementedError

    def call(self, input_tensor, attention_mask, head_mask, output_attentions, position_bias=None, training=False):
        """Return `(attention_output, *extras)`, where `extras` are the additional outputs of `self.attn`."""
        attn_results = self.attn(
            input_tensor, attention_mask, head_mask, output_attentions, position_bias=position_bias, training=training
        )
        # Dropout is called without an explicit `training` flag here.
        residual = self.dropout(attn_results[0]) + input_tensor
        attention_output = self.LayerNorm(residual)
        # Append the attention probabilities when the self-attention layer returned them.
        return (attention_output,) + attn_results[1:]

    def build(self, input_shape=None):
        """Build the self-attention and layer-norm sub-layers once, each under its own name scope."""
        if self.built:
            return
        self.built = True

        attn = getattr(self, "attn", None)
        if attn is not None:
            with tf.name_scope(attn.name):
                attn.build(None)

        layer_norm = getattr(self, "LayerNorm", None)
        if layer_norm is not None:
            with tf.name_scope(layer_norm.name):
                layer_norm.build([None, None, self.config.hidden_size])


# Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->MPNet
class TFMPNetIntermediate(keras.layers.Layer):
    """First half of the feed-forward block: a dense layer with `config.intermediate_size` units plus activation."""

    def __init__(self, config: MPNetConfig, **kwargs):
        super().__init__(**kwargs)

        self.dense = keras.layers.Dense(
            units=config.intermediate_size,
            kernel_initializer=get_initializer(config.initializer_range),
            name="dense",
        )

        # `hidden_act` is either an activation name (resolved through `get_tf_activation`) or used directly.
        act = config.hidden_act
        self.intermediate_act_fn = get_tf_activation(act) if isinstance(act, str) else act
        self.config = config

    def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
        """Apply the dense projection followed by the configured activation."""
        projected = self.dense(inputs=hidden_states)
        return self.intermediate_act_fn(projected)

    def build(self, input_shape=None):
        """Build the dense sub-layer once, under a name scope matching its name."""
        if self.built:
            return
        self.built = True

        dense = getattr(self, "dense", None)
        if dense is not None:
            with tf.name_scope(dense.name):
                dense.build([None, None, self.config.hidden_size])


# Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput with Bert->MPNet
class TFMPNetOutput(keras.layers.Layer):
    """Second half of the feed-forward block: dense projection, dropout, residual add and layer normalization."""

    def __init__(self, config: MPNetConfig, **kwargs):
        super().__init__(**kwargs)

        self.config = config
        # Projection with `config.hidden_size` output units.
        self.dense = keras.layers.Dense(
            units=config.hidden_size,
            kernel_initializer=get_initializer(config.initializer_range),
            name="dense",
        )
        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
        self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)

    def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
        """Project `hidden_states`, apply dropout, add `input_tensor` and layer-normalize the sum."""
        projected = self.dense(inputs=hidden_states)
        projected = self.dropout(inputs=projected, training=training)
        # Residual connection followed by layer normalization.
        return self.LayerNorm(inputs=projected + input_tensor)

    def build(self, input_shape=None):
        """Build the dense and layer-norm sub-layers once, each under its own name scope."""
        if self.built:
            return
        self.built = True

        dense = getattr(self, "dense", None)
        if dense is not None:
            # The dense layer is built with `config.intermediate_size` as its last input dimension.
            with tf.name_scope(dense.name):
                dense.build([None, None, self.config.intermediate_size])

        layer_norm = getattr(self, "LayerNorm", None)
        if layer_norm is not None:
            with tf.name_scope(layer_norm.name):
                layer_norm.build([None, None, self.config.hidden_size])
# One encoder block: attention followed by the intermediate/output feed-forward pair.
class TFMPNetLayer(keras.layers.Layer):
    """Single encoder block made of `TFMPNetAttention`, `TFMPNetIntermediate` and `TFMPNetOutput`."""

    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)

        self.attention = TFMPNetAttention(config, name="attention")
        self.intermediate = TFMPNetIntermediate(config, name="intermediate")
        # Stored as `out`, but the Keras layer itself is named "output".
        self.out = TFMPNetOutput(config, name="output")

    def call(self, hidden_states, attention_mask, head_mask, output_attentions, position_bias=None, training=False):
        """Return `(layer_output, *extras)`, where `extras` are the additional outputs of the attention block."""
        attn_results = self.attention(
            hidden_states, attention_mask, head_mask, output_attentions, position_bias=position_bias, training=training
        )
        attention_output = attn_results[0]

        # Feed-forward: the output layer receives the attention output as its residual input.
        expanded = self.intermediate(attention_output)
        layer_output = self.out(expanded, attention_output, training=training)

        # Anything after index 0 of the attention result (the attention weights, if any) is passed through.
        return (layer_output,) + attn_results[1:]

    def build(self, input_shape=None):
        """Build the three sub-layers once, each under a name scope matching its name."""
        if self.built:
            return
        self.built = True

        # Same build order as the attributes were created: attention, intermediate, out.
        for attr_name in ("attention", "intermediate", "out"):
            sub_layer = getattr(self, attr_name, None)
            if sub_layer is not None:
                with tf.name_scope(sub_layer.name):
                    sub_layer.build(None)


# Encoder: a stack of TFMPNetLayer blocks sharing one bucketed relative position bias.
class TFMPNetEncoder(keras.layers.Layer):
    """Stack of `config.num_hidden_layers` `TFMPNetLayer` blocks.

    The relative position bias is computed once per forward pass and handed to every layer.
    """

    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)

        self.config = config
        self.n_heads = config.num_attention_heads
        self.output_attentions = config.output_attentions
        self.output_hidden_states = config.output_hidden_states
        # Number of rows in the learned bias table created in `build`.
        self.relative_attention_num_buckets = config.relative_attention_num_buckets
        self.initializer_range = config.initializer_range

        # The layer names are runtime strings; keep the "layer_._{i}" pattern unchanged.
        self.layer = [TFMPNetLayer(config, name=f"layer_._{i}") for i in range(config.num_hidden_layers)]

    def build(self, input_shape=None):
        """Create the relative attention bias table and build every layer of the stack (once)."""
        if self.built:
            return
        self.built = True

        # Bias table of shape (num_buckets, n_heads), created under the "relative_attention_bias" scope.
        with tf.name_scope("relative_attention_bias"):
            self.relative_attention_bias = self.add_weight(
                name="embeddings",
                shape=[self.relative_attention_num_buckets, self.n_heads],
                initializer=get_initializer(self.initializer_range),
            )

        if getattr(self, "layer", None) is not None:
            for layer in self.layer:
                with tf.name_scope(layer.name):
                    layer.build(None)

    def call(
        self,
        hidden_states,
        attention_mask,
        head_mask,
        output_attentions,
        output_hidden_states,
        return_dict,
        training=False,
    ):
        """Run the layer stack.

        Args:
            hidden_states: input tensor fed to the first layer.
            attention_mask: passed unchanged to every layer.
            head_mask: indexable with one entry per layer, or `None` to apply no head mask at all.
            output_attentions: when truthy, collect the second output of every layer.
            output_hidden_states: when truthy, collect the input of every layer plus the final output.
            return_dict: when falsy, return a plain tuple instead of a `TFBaseModelOutput`.
            training: forwarded to every layer.

        Returns:
            A `TFBaseModelOutput`, or a tuple of the non-`None` values among
            `(hidden_states, all_hidden_states, all_attentions)`.
        """
        position_bias = self.compute_position_bias(hidden_states)
        all_hidden_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        for i, layer_module in enumerate(self.layer):
            # Record the input of the current layer.
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_outputs = layer_module(
                hidden_states,
                attention_mask,
                # Tolerate a missing head mask instead of failing on `None[i]`.
                head_mask[i] if head_mask is not None else None,
                output_attentions,
                position_bias=position_bias,
                training=training,
            )
            hidden_states = layer_outputs[0]

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        # Record the output of the last layer.
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None)

        return TFBaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions
        )

    @staticmethod
    def _relative_position_bucket(relative_position, num_buckets=32, max_distance=128):
        """Map relative positions to bucket indices.

        With `n = -relative_position`, half of the buckets are used for `n < 0` and the other half for `n >= 0`.
        Within each half, `|n|` below `max_exact` keeps its own bucket, while larger values are spread
        logarithmically and clamped to the last bucket of the half.

        Args:
            relative_position: integer tensor of relative offsets.
            num_buckets: total number of buckets.
            max_distance: distance used as the upper end of the logarithmic range.

        Returns:
            A tensor of bucket indices with the dtype of `relative_position`.
        """
        ret = 0
        n = -relative_position

        # Entries with n < 0 are shifted into the second half of the bucket range.
        num_buckets //= 2
        ret += tf.cast(tf.math.less(n, 0), dtype=relative_position.dtype) * num_buckets
        n = tf.math.abs(n)

        # From here on n is non-negative.
        max_exact = num_buckets // 2
        is_small = tf.math.less(n, max_exact)

        # Logarithmic bucket for values that are not "small".
        val_if_large = max_exact + tf.cast(
            tf.math.log(n / max_exact) / math.log(max_distance / max_exact) * (num_buckets - max_exact),
            dtype=relative_position.dtype,
        )

        # Clamp to the last bucket of the half.
        val_if_large = tf.math.minimum(val_if_large, num_buckets - 1)
        # Small values use n itself as the bucket index, the others use the logarithmic bucket.
        ret += tf.where(is_small, n, val_if_large)
        return ret

    def compute_position_bias(self, x, position_ids=None):
        """Compute binned relative position bias"""
        input_shape = shape_list(x)
        # Query and key lengths are both taken from the second dimension of `x`.
        qlen, klen = input_shape[1], input_shape[1]

        if position_ids is not None:
            # NOTE(review): this path adds a leading axis to the positions, while the transpose below uses a
            # rank-3 permutation; `call` never passes `position_ids` — confirm before relying on it.
            context_position = position_ids[:, :, None]
            memory_position = position_ids[:, None, :]
        else:
            # Default positions 0..qlen-1 as a column and 0..klen-1 as a row.
            context_position = tf.range(qlen)[:, None]
            memory_position = tf.range(klen)[None, :]

        # Offset of every key position relative to every query position.
        relative_position = memory_position - context_position

        rp_bucket = self._relative_position_bucket(
            relative_position,
            num_buckets=self.relative_attention_num_buckets,
        )

        # Look up one bias value per head for every bucket index.
        values = tf.gather(self.relative_attention_bias, rp_bucket)

        # Move the head axis to the front and add a leading axis of size 1.
        values = tf.expand_dims(tf.transpose(values, [2, 0, 1]), axis=0)

        return values
# The main layer is decorated with `keras_serializable`.
@keras_serializable
class TFMPNetMainLayer(keras.layers.Layer):
    """Main MPNet layer holding the encoder, the pooler and the embeddings."""

    # Configuration class associated with this layer.
    config_class = MPNetConfig

    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)

        # Settings copied from the configuration.
        self.config = config
        self.num_hidden_layers = config.num_hidden_layers
        self.initializer_range = config.initializer_range
        self.output_attentions = config.output_attentions
        self.output_hidden_states = config.output_hidden_states
        self.return_dict = config.use_return_dict

        self.encoder = TFMPNetEncoder(config, name="encoder")
        self.pooler = TFMPNetPooler(config, name="pooler")
        # The embeddings MUST be declared last in order to keep the weights order.
        self.embeddings = TFMPNetEmbeddings(config, name="embeddings")

    # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.get_input_embeddings
    def get_input_embeddings(self) -> keras.layers.Layer:
        """Return the embeddings layer."""
        return self.embeddings

    # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.set_input_embeddings
    def set_input_embeddings(self, value: tf.Variable):
        """Replace the embedding weight and update `vocab_size` from the first dimension of `value`."""
        self.embeddings.weight = value
        self.embeddings.vocab_size = shape_list(value)[0]

    # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer._prune_heads
    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        raise NotImplementedError

    def call(
        self,
        input_ids=None,
        attention_mask=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        training=False,
    ):
        # NOTE(review): the body is elided (`...`) in this excerpt, so as written the method returns None.
        # The original comment mentions `@unpack_inputs`; presumably the real method is decorated with it —
        # confirm against the full implementation.
        ...

    def build(self, input_shape=None):
        """Build the encoder, the pooler and the embeddings once, each under its own name scope."""
        if self.built:
            return
        self.built = True

        # Same build order as before: encoder, pooler, embeddings.
        for attr_name in ("encoder", "pooler", "embeddings"):
            sub_layer = getattr(self, attr_name, None)
            if sub_layer is not None:
                with tf.name_scope(sub_layer.name):
                    sub_layer.build(None)


# Shared introduction for the docstrings of the public MPNet model classes. The tip body, the closing
# `</Tip>` tag and the `Args:` section belong inside the string; they had been split off into comments,
# which left the `<Tip>` block unclosed.
MPNET_START_DOCSTRING = r"""

    This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
    as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
    behavior.

    <Tip>

    TensorFlow models and layers in `transformers` accept two formats as input:

    - having all inputs as keyword arguments (like PyTorch models), or
    - having all inputs as a list, tuple or dict in the first positional argument.

    The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
    and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
    pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
    format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
    the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
    positional argument:

    - a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
    - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
    `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
    - a dictionary with one or several input Tensors associated to the input names given in the docstring:
    `model({"input_ids": input_ids, "token_type_ids": token_type_ids})`

    Note that when creating models and layers with
    [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
    about any of this, as you can just pass inputs like you would to any other Python function!

    </Tip>

    Args:
        config ([`MPNetConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
# Shared documentation of the `call` arguments. The `{0}` placeholders are filled with `str.format`
# (e.g. "batch_size, sequence_length") at the point where the docstring is attached to a `call` method.
MPNET_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`Numpy array` or `tf.Tensor` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and
            [`PreTrainedTokenizer.encode`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        position_ids (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        head_mask (`Numpy array` or `tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        inputs_embeds (`tf.Tensor` of shape `({0}, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
            config will be used instead.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
            used instead.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in
            eager mode, in graph mode the value will always be set to True.
        training (`bool`, *optional*, defaults to `False`):
            Whether or not to use the model in training mode (some modules like dropout modules have different
            behaviors between training and evaluation).
"""


@add_start_docstrings(
    "The bare MPNet Model transformer outputting raw hidden-states without any specific head on top.",
    MPNET_START_DOCSTRING,
)
class TFMPNetModel(TFMPNetPreTrainedModel):
    # Bare model: a thin wrapper that forwards every argument to the `TFMPNetMainLayer` stored in
    # `self.mpnet`. Documentation is attached through the decorators, so no literal docstrings are added here.

    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.mpnet = TFMPNetMainLayer(config, name="mpnet")

    @unpack_inputs
    @add_start_docstrings_to_model_forward(MPNET_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TFBaseModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        position_ids: np.ndarray | tf.Tensor | None = None,
        head_mask: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: bool = False,
    ) -> Union[TFBaseModelOutput, Tuple[tf.Tensor]]:
        # Every argument is forwarded unchanged to the main layer, whose result is returned as is.
        outputs = self.mpnet(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
        return outputs

    def build(self, input_shape=None):
        # Build the main layer once, under a name scope matching its name.
        if self.built:
            return
        self.built = True
        if getattr(self, "mpnet", None) is not None:
            with tf.name_scope(self.mpnet.name):
                self.mpnet.build(None)

class TFMPNetLMHead(keras.layers.Layer):
    """MPNet head for masked and permuted language modeling.

    Hidden states go through a dense layer, a GELU activation and a layer normalization, and are then multiplied
    with the transposed weight of `input_embeddings` and shifted by an output-only bias.
    """

    def __init__(self, config, input_embeddings, **kwargs):
        super().__init__(**kwargs)

        self.config = config
        self.hidden_size = config.hidden_size
        self.dense = keras.layers.Dense(
            config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
        )
        self.layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm")
        self.act = get_tf_activation("gelu")

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        self.decoder = input_embeddings

    def build(self, input_shape=None):
        """Create the output bias and build the dense and layer-norm sub-layers (once)."""
        # Guard first, so that a repeated call cannot register a second "bias" weight.
        if self.built:
            return
        self.built = True

        # One trainable bias value per vocabulary entry, initialized to zeros.
        self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias")

        if getattr(self, "dense", None) is not None:
            with tf.name_scope(self.dense.name):
                self.dense.build([None, None, self.config.hidden_size])

        if getattr(self, "layer_norm", None) is not None:
            with tf.name_scope(self.layer_norm.name):
                self.layer_norm.build([None, None, self.config.hidden_size])

    def get_output_embeddings(self):
        """Return the embeddings layer whose weight is used for the output projection."""
        return self.decoder

    def set_output_embeddings(self, value):
        """Replace the decoder weight and update its `vocab_size` from the first dimension of `value`."""
        self.decoder.weight = value
        self.decoder.vocab_size = shape_list(value)[0]

    def get_bias(self):
        """Return the output bias as `{"bias": self.bias}`."""
        return {"bias": self.bias}

    def set_bias(self, value):
        """Replace the output bias with `value["bias"]` and update `config.vocab_size` from its first dimension."""
        self.bias = value["bias"]
        self.config.vocab_size = shape_list(value["bias"])[0]

    def call(self, hidden_states):
        """Return the vocabulary scores for `hidden_states`, reshaped to `(-1, seq_length, vocab_size)`."""
        hidden_states = self.dense(hidden_states)
        hidden_states = self.act(hidden_states)
        hidden_states = self.layer_norm(hidden_states)

        # Project back to the size of the vocabulary with the (transposed) decoder weight, then add the bias.
        seq_length = shape_list(tensor=hidden_states)[1]
        hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.hidden_size])
        hidden_states = tf.matmul(a=hidden_states, b=self.decoder.weight, transpose_b=True)
        hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.config.vocab_size])
        hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias)

        return hidden_states
@add_start_docstrings("""MPNet Model with a `language modeling` head on top.""", MPNET_START_DOCSTRING)
class TFMPNetForMaskedLM(TFMPNetPreTrainedModel, TFMaskedLanguageModelingLoss):
    # Regex patterns for weight names; presumably consumed by the base class when loading checkpoints
    # (the loading code is not part of this section).
    _keys_to_ignore_on_load_missing = [r"pooler"]

    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)

        # Backbone layer; the LM head below receives the backbone's `embeddings` object.
        self.mpnet = TFMPNetMainLayer(config, name="mpnet")
        self.lm_head = TFMPNetLMHead(config, self.mpnet.embeddings, name="lm_head")

    def get_lm_head(self):
        """Return the language-modeling head layer."""
        return self.lm_head

    def get_prefix_bias_name(self):
        """Deprecated: emit a `FutureWarning`, then return `"<model name>/<lm head name>"`."""
        warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning)
        return "/".join((self.name, self.lm_head.name))

    @unpack_inputs
    @add_start_docstrings_to_model_forward(MPNET_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TFMaskedLMOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        position_ids: np.ndarray | tf.Tensor | None = None,
        head_mask: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: tf.Tensor | None = None,
        training: bool = False,
    ) -> Union[TFMaskedLMOutput, Tuple[tf.Tensor]]:
        r"""
        labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        """
        # Run the backbone with every input forwarded unchanged.
        backbone_outputs = self.mpnet(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
        # Element 0 of the backbone output is what the LM head scores.
        prediction_scores = self.lm_head(backbone_outputs[0])

        # A loss is only produced when labels were supplied.
        loss = self.hf_compute_loss(labels, prediction_scores) if labels is not None else None

        if return_dict:
            return TFMaskedLMOutput(
                loss=loss,
                logits=prediction_scores,
                hidden_states=backbone_outputs.hidden_states,
                attentions=backbone_outputs.attentions,
            )

        # Tuple form: scores followed by backbone outputs from index 2 on, with the loss prepended if present.
        output = (prediction_scores,) + backbone_outputs[2:]
        return output if loss is None else (loss,) + output

    def build(self, input_shape=None):
        """Build the `mpnet` and `lm_head` sub-layers once, each inside a name scope named after the layer."""
        if self.built:
            return
        self.built = True
        for attr_name in ("mpnet", "lm_head"):
            sublayer = getattr(self, attr_name, None)
            if sublayer is not None:
                with tf.name_scope(sublayer.name):
                    sublayer.build(None)
# 定义一个自定义的 Keras 层，用于处理文本序列级别的分类任务
class TFMPNetClassificationHead(keras.layers.Layer):
    """Head for sentence-level classification tasks."""

    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)
        self.config = config
        # tanh-activated projection that keeps the width at `config.hidden_size`.
        self.dense = keras.layers.Dense(
            config.hidden_size,
            kernel_initializer=get_initializer(config.initializer_range),
            activation="tanh",
            name="dense",
        )
        # One dropout layer, applied both before and after `dense` in `call`.
        self.dropout = keras.layers.Dropout(config.hidden_dropout_prob)
        # Final projection to `config.num_labels` outputs.
        self.out_proj = keras.layers.Dense(
            config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="out_proj"
        )

    def call(self, features, training=False):
        """Slice position 0 along axis 1 of `features`, then apply dropout -> dense -> dropout -> out_proj."""
        first_token = features[:, 0, :]  # take <s> token (equiv. to [CLS])
        first_token = self.dropout(first_token, training=training)
        projected = self.dense(first_token)
        projected = self.dropout(projected, training=training)
        return self.out_proj(projected)

    def build(self, input_shape=None):
        """Build `dense` and `out_proj` once, each under its own name scope, with a hidden-size last dimension."""
        if self.built:
            return
        self.built = True
        for attr_name in ("dense", "out_proj"):
            sublayer = getattr(self, attr_name, None)
            if sublayer is not None:
                with tf.name_scope(sublayer.name):
                    sublayer.build([None, None, self.config.hidden_size])


# 使用装饰器为 TFMPNetForSequenceClassification 类添加文档字符串
@add_start_docstrings(
    """
    MPNet Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
    output) e.g. for GLUE tasks.
    """,
    MPNET_START_DOCSTRING,
)
class TFMPNetForSequenceClassification(TFMPNetPreTrainedModel, TFSequenceClassificationLoss):
    # Regex patterns for weight names; presumably consumed by the base class when loading checkpoints
    # (the loading code is not part of this section).
    _keys_to_ignore_on_load_missing = [r"pooler"]

    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.num_labels = config.num_labels

        # Backbone layer followed by the classification head defined in this file.
        self.mpnet = TFMPNetMainLayer(config, name="mpnet")
        self.classifier = TFMPNetClassificationHead(config, name="classifier")

    @unpack_inputs
    @add_start_docstrings_to_model_forward(MPNET_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TFSequenceClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        # `np.ndarray` is the array type; `np.array` (used previously) is a factory function, not a type.
        attention_mask: np.ndarray | tf.Tensor | None = None,
        position_ids: np.ndarray | tf.Tensor | None = None,
        head_mask: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: tf.Tensor | None = None,
        training: bool = False,
    ) -> Union[TFSequenceClassifierOutput, Tuple[tf.Tensor]]:
        r"""
        labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        # Run the backbone with every input forwarded unchanged.
        outputs = self.mpnet(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        # Element 0 of the backbone output goes to the classification head.
        sequence_output = outputs[0]
        logits = self.classifier(sequence_output, training=training)

        # A loss is only produced when labels were supplied.
        loss = None if labels is None else self.hf_compute_loss(labels, logits)

        if not return_dict:
            # Tuple form: logits followed by backbone outputs from index 2 on, with the loss prepended if present.
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TFSequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def build(self, input_shape=None):
        """Build the `mpnet` and `classifier` sub-layers once, each inside a name scope named after the layer."""
        if self.built:
            return
        self.built = True

        if getattr(self, "mpnet", None) is not None:
            with tf.name_scope(self.mpnet.name):
                self.mpnet.build(None)

        if getattr(self, "classifier", None) is not None:
            with tf.name_scope(self.classifier.name):
                self.classifier.build(None)
"""
MPNet Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
softmax) e.g. for RocStories/SWAG tasks.
"""
@add_start_docstrings(
    """
    MPNet Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
    softmax) e.g. for RocStories/SWAG tasks.
    """,
    MPNET_START_DOCSTRING,
)
class TFMPNetForMultipleChoice(TFMPNetPreTrainedModel, TFMultipleChoiceLoss):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)

        # Backbone layer, dropout, and a single-output dense layer that scores each choice.
        self.mpnet = TFMPNetMainLayer(config, name="mpnet")
        self.dropout = keras.layers.Dropout(config.hidden_dropout_prob)
        self.classifier = keras.layers.Dense(
            1, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
        )
        self.config = config

    # NOTE: the parameter list must appear exactly once; repeating it (as an earlier revision of this
    # section did) is a SyntaxError because of the duplicated argument names.
    @unpack_inputs
    @add_start_docstrings_to_model_forward(MPNET_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TFMultipleChoiceModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        position_ids: np.ndarray | tf.Tensor | None = None,
        head_mask: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: tf.Tensor | None = None,
        training: bool = False,
    ) -> Union[TFMultipleChoiceModelOutput, Tuple[tf.Tensor]]:
        r"""
        labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices]`
            where `num_choices` is the size of the second dimension of the input tensors. (See `input_ids` above)
        """
        # Dimensions 1 and 2 of whichever input is present give the number of choices and the sequence length.
        if input_ids is not None:
            num_choices = shape_list(input_ids)[1]
            seq_length = shape_list(input_ids)[2]
        else:
            num_choices = shape_list(inputs_embeds)[1]
            seq_length = shape_list(inputs_embeds)[2]

        # Merge the leading dimensions so each choice becomes its own row for the backbone.
        flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) if input_ids is not None else None
        flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None
        flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None
        flat_inputs_embeds = (
            tf.reshape(inputs_embeds, (-1, seq_length, shape_list(inputs_embeds)[3]))
            if inputs_embeds is not None
            else None
        )
        outputs = self.mpnet(
            flat_input_ids,
            flat_attention_mask,
            flat_position_ids,
            head_mask,
            flat_inputs_embeds,
            output_attentions,
            output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
        # Element 1 of the backbone output is used as the pooled representation.
        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output, training=training)
        logits = self.classifier(pooled_output)
        # Regroup the per-row scores into one row of `num_choices` scores.
        reshaped_logits = tf.reshape(logits, (-1, num_choices))
        # A loss is only produced when labels were supplied.
        loss = None if labels is None else self.hf_compute_loss(labels, reshaped_logits)

        if not return_dict:
            # Tuple form: logits followed by backbone outputs from index 2 on, with the loss prepended if present.
            output = (reshaped_logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TFMultipleChoiceModelOutput(
            loss=loss,
            logits=reshaped_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def build(self, input_shape=None):
        """Build `mpnet` and `classifier` once, each inside a name scope named after the layer."""
        if self.built:
            return
        self.built = True
        if getattr(self, "mpnet", None) is not None:
            with tf.name_scope(self.mpnet.name):
                self.mpnet.build(None)
        if getattr(self, "classifier", None) is not None:
            with tf.name_scope(self.classifier.name):
                # The classifier is built for inputs whose last dimension is the hidden size.
                self.classifier.build([None, None, self.config.hidden_size])
# 定义一个 TFMPNetForTokenClassification 类，继承自 TFMPNetPreTrainedModel 和 TFTokenClassificationLoss，用于在 MPNet 模型的隐藏状态输出上添加一个标记分类头（即一个线性层），例如用于命名实体识别（NER）任务。
@add_start_docstrings(
    """
       MPNet Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
       Named-Entity-Recognition (NER) tasks.
       """,
    MPNET_START_DOCSTRING,
)
class TFMPNetForTokenClassification(TFMPNetPreTrainedModel, TFTokenClassificationLoss):
    # Regex patterns for weight names; presumably consumed by the base class when loading checkpoints
    # (the loading code is not part of this section).
    _keys_to_ignore_on_load_missing = [r"pooler"]

    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)

        self.num_labels = config.num_labels
        # Backbone layer, dropout, and a dense layer producing `config.num_labels` scores.
        self.mpnet = TFMPNetMainLayer(config, name="mpnet")
        self.dropout = keras.layers.Dropout(config.hidden_dropout_prob)
        self.classifier = keras.layers.Dense(
            config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
        )
        self.config = config

    @unpack_inputs
    @add_start_docstrings_to_model_forward(MPNET_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TFTokenClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        position_ids: np.ndarray | tf.Tensor | None = None,
        head_mask: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: tf.Tensor | None = None,
        training: bool = False,
    ) -> Union[TFTokenClassifierOutput, Tuple[tf.Tensor]]:
        r"""
        labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        """
        # Run the backbone with every input forwarded unchanged.
        outputs = self.mpnet(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
        # Element 0 of the backbone output is classified after dropout.
        sequence_output = outputs[0]

        sequence_output = self.dropout(sequence_output, training=training)
        logits = self.classifier(sequence_output)

        # A loss is only produced when labels were supplied.
        loss = None if labels is None else self.hf_compute_loss(labels, logits)

        if not return_dict:
            # Tuple form: logits followed by backbone outputs from index 2 on. Index 1 is skipped so the
            # tuple lines up with the dict output below (hidden states, attentions) and with the other
            # heads in this file; `TFMPNetForMultipleChoice` reads index 1 as the pooled output.
            # Slicing from 1 would leak that pooled output into the result.
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TFTokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def build(self, input_shape=None):
        """Build `mpnet` and `classifier` once, each inside a name scope named after the layer."""
        if self.built:
            return

        self.built = True

        if getattr(self, "mpnet", None) is not None:
            with tf.name_scope(self.mpnet.name):
                self.mpnet.build(None)

        if getattr(self, "classifier", None) is not None:
            with tf.name_scope(self.classifier.name):
                # The classifier is built for inputs whose last dimension is the hidden size.
                self.classifier.build([None, None, self.config.hidden_size])
# 使用装饰器添加文档字符串，描述了该类是基于 MPNet 模型，用于抽取式问答任务（如 SQuAD），在隐藏状态的基础上增加一个分类头部来计算“span start logits”和“span end logits”。
@add_start_docstrings(
    """
    MPNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
    layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
    """,
    MPNET_START_DOCSTRING,
)
class TFMPNetForQuestionAnswering(TFMPNetPreTrainedModel, TFQuestionAnsweringLoss):
    # Regex patterns for weight names; presumably consumed by the base class when loading checkpoints
    # (the loading code is not part of this section).
    _keys_to_ignore_on_load_missing = [r"pooler"]

    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.num_labels = config.num_labels

        # Backbone layer plus a dense layer producing `config.num_labels` scores per position.
        self.mpnet = TFMPNetMainLayer(config, name="mpnet")
        self.qa_outputs = keras.layers.Dense(
            config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
        )
        self.config = config

    @unpack_inputs
    @add_start_docstrings_to_model_forward(MPNET_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TFQuestionAnsweringModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        # `np.ndarray` is the array type; `np.array` (used previously) is a factory function, not a type.
        attention_mask: np.ndarray | tf.Tensor | None = None,
        position_ids: np.ndarray | tf.Tensor | None = None,
        head_mask: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        start_positions: tf.Tensor | None = None,
        end_positions: tf.Tensor | None = None,
        training: bool = False,
        **kwargs,
    ) -> Union[TFQuestionAnsweringModelOutput, Tuple[tf.Tensor]]:
        r"""
        start_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        """
        # Run the backbone with every input forwarded unchanged.
        outputs = self.mpnet(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
        sequence_output = outputs[0]

        # Split the last axis into two halves (start / end) and drop that axis from each half.
        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = tf.split(logits, 2, axis=-1)
        start_logits = tf.squeeze(start_logits, axis=-1)
        end_logits = tf.squeeze(end_logits, axis=-1)
        loss = None

        # The loss needs both span boundaries; with only one of them it stays None.
        if start_positions is not None and end_positions is not None:
            labels = {"start_position": start_positions, "end_position": end_positions}
            loss = self.hf_compute_loss(labels, (start_logits, end_logits))

        if not return_dict:
            # Tuple form: both logits followed by backbone outputs from index 2 on, loss prepended if present.
            output = (start_logits, end_logits) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TFQuestionAnsweringModelOutput(
            loss=loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def build(self, input_shape=None):
        """Build `mpnet` and `qa_outputs` once, each inside a name scope named after the layer."""
        if self.built:
            return
        self.built = True
        if getattr(self, "mpnet", None) is not None:
            with tf.name_scope(self.mpnet.name):
                self.mpnet.build(None)
        if getattr(self, "qa_outputs", None) is not None:
            with tf.name_scope(self.qa_outputs.name):
                # The output layer is built for inputs whose last dimension is the hidden size.
                self.qa_outputs.build([None, None, self.config.hidden_size])

`.\models\mpnet\tokenization_mpnet.py`

# 设置编码格式为 UTF-8
# 版权声明，包括 HuggingFace Inc. 团队和 Microsoft Corporation 的版权声明
# 版权声明，包括 NVIDIA CORPORATION 的版权声明
#
# 根据 Apache 许可证 2.0 版本授权，除非符合许可证的规定，否则不得使用此文件
# 可以在以下网址获得许可证的副本：http://www.apache.org/licenses/LICENSE-2.0
#
# 如果适用法律要求或书面同意，此软件是基于"原样"提供的，不附带任何明示或暗示的担保或条件
# 请参阅许可证了解特定语言的权限和限制
"""Tokenization classes for MPNet."""

# 导入必要的模块和函数
import collections
import os
import unicodedata
from typing import List, Optional, Tuple

# 从 tokenization_utils 模块中导入特定的函数和类
from ...tokenization_utils import AddedToken, PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
# 从 utils 模块中导入 logging 函数
from ...utils import logging

# Module-level logger obtained from the library's logging utilities.
logger = logging.get_logger(__name__)

# Maps the `vocab_file` argument name to the vocabulary file name.
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}

# For the `vocab_file` argument, maps a checkpoint name to the URL of its vocabulary file.
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "microsoft/mpnet-base": "https://huggingface.co/microsoft/mpnet-base/resolve/main/vocab.txt",
    }
}

# Per-checkpoint size; assigned to `MPNetTokenizer.max_model_input_sizes` below.
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "microsoft/mpnet-base": 512,
}

# Per-checkpoint settings; assigned to `MPNetTokenizer.pretrained_init_configuration` below.
PRETRAINED_INIT_CONFIGURATION = {
    "microsoft/mpnet-base": {"do_lower_case": True},
}


def load_vocab(vocab_file):
    """Read a vocabulary file (one token per line) into an ordered token -> line-index mapping."""
    with open(vocab_file, "r", encoding="utf-8") as reader:
        # Only trailing newlines are stripped, so any other whitespace stays part of the token.
        return collections.OrderedDict(
            (line.rstrip("\n"), position) for position, line in enumerate(reader)
        )


def whitespace_tokenize(text):
    """Strip surrounding whitespace from `text` and split it on whitespace; empty text yields `[]`."""
    cleaned = text.strip()
    return cleaned.split() if cleaned else []


class MPNetTokenizer(PreTrainedTokenizer):
    """
    WordPiece-based tokenizer for MPNet.

    This tokenizer inherits from [`PreTrainedTokenizer`]; users should refer to that superclass for more information
    regarding the generic tokenizer methods. Text is optionally pre-split by a `BasicTokenizer` and then split into
    word pieces by a `WordpieceTokenizer`.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file (one token per line).
        do_lower_case (`bool`, *optional*, defaults to `True`):
            Forwarded to the `BasicTokenizer`: whether to lowercase the input.
        do_basic_tokenize (`bool`, *optional*, defaults to `True`):
            Whether to run the `BasicTokenizer` before WordPiece.
        never_split (`Iterable`, *optional*):
            Tokens that the `BasicTokenizer` must never split.
        bos_token (`str`, *optional*, defaults to `"<s>"`):
            Beginning-of-sequence token, forwarded to the superclass.
        eos_token (`str`, *optional*, defaults to `"</s>"`):
            End-of-sequence token, forwarded to the superclass.
        sep_token (`str`, *optional*, defaults to `"</s>"`):
            Separator token; its id closes every sequence built by `build_inputs_with_special_tokens`.
        cls_token (`str`, *optional*, defaults to `"<s>"`):
            Classifier token; its id opens every sequence built by `build_inputs_with_special_tokens`.
        unk_token (`str`, *optional*, defaults to `"[UNK]"`):
            Token used for words that cannot be split into known word pieces and for unknown ids/tokens.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            Padding token, forwarded to the superclass.
        mask_token (`str`, *optional*, defaults to `"<mask>"`):
            Mask token, forwarded to the superclass (wrapped with `lstrip=True` when given as a string).
        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
            Forwarded to the `BasicTokenizer`: whether to put whitespace around CJK characters.
        strip_accents (`bool`, *optional*):
            Forwarded to the `BasicTokenizer`: whether to strip accents.
    """

    # Class-level configuration consumed by the `PreTrainedTokenizer` machinery
    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        do_lower_case=True,
        do_basic_tokenize=True,
        never_split=None,
        bos_token="<s>",
        eos_token="</s>",
        sep_token="</s>",
        cls_token="<s>",
        unk_token="[UNK]",
        pad_token="<pad>",
        mask_token="<mask>",
        tokenize_chinese_chars=True,
        strip_accents=None,
        **kwargs,
    ):
        # Special tokens given as plain strings are wrapped into special `AddedToken` objects;
        # anything else is passed through untouched.
        bos_token = AddedToken(bos_token, special=True) if isinstance(bos_token, str) else bos_token
        eos_token = AddedToken(eos_token, special=True) if isinstance(eos_token, str) else eos_token
        sep_token = AddedToken(sep_token, special=True) if isinstance(sep_token, str) else sep_token
        cls_token = AddedToken(cls_token, special=True) if isinstance(cls_token, str) else cls_token
        unk_token = AddedToken(unk_token, special=True) if isinstance(unk_token, str) else unk_token
        pad_token = AddedToken(pad_token, special=True) if isinstance(pad_token, str) else pad_token

        # The mask token behaves like a normal word, i.e. it includes the space before it (lstrip=True)
        mask_token = AddedToken(mask_token, lstrip=True, special=True) if isinstance(mask_token, str) else mask_token

        if not os.path.isfile(vocab_file):
            raise ValueError(
                f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
                " model use `tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
            )
        # token -> id mapping, plus the reverse id -> token mapping
        self.vocab = load_vocab(vocab_file)
        self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
        self.do_basic_tokenize = do_basic_tokenize
        # `basic_tokenizer` only exists when basic tokenization is enabled
        if do_basic_tokenize:
            self.basic_tokenizer = BasicTokenizer(
                do_lower_case=do_lower_case,
                never_split=never_split,
                tokenize_chinese_chars=tokenize_chinese_chars,
                strip_accents=strip_accents,
            )
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))

        # The sub-tokenizers and vocabulary are set up before the superclass initializer runs
        super().__init__(
            do_lower_case=do_lower_case,
            do_basic_tokenize=do_basic_tokenize,
            never_split=never_split,
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            sep_token=sep_token,
            cls_token=cls_token,
            pad_token=pad_token,
            mask_token=mask_token,
            tokenize_chinese_chars=tokenize_chinese_chars,
            strip_accents=strip_accents,
            **kwargs,
        )

    @property
    def do_lower_case(self):
        """Whether the underlying `BasicTokenizer` lowercases its input."""
        return self.basic_tokenizer.do_lower_case

    @property
    def vocab_size(self):
        """Number of entries in the base vocabulary loaded from the vocabulary file."""
        return len(self.vocab)

    def get_vocab(self):
        """
        Returns the full token -> id mapping.

        Returns:
            `Dict[str, int]`: A copy of `added_tokens_encoder` updated with the base vocabulary, so base-vocabulary
            ids take precedence for tokens present in both.
        """
        vocab = self.added_tokens_encoder.copy()
        vocab.update(self.vocab)
        return vocab

    def _tokenize(self, text):
        """
        Tokenizes a single text string into a list of tokens using both basic and wordpiece tokenization.

        Args:
            text (str): The input text to be tokenized.

        Returns:
            List[str]: List of tokens after tokenization.
        """
        split_tokens = []
        if self.do_basic_tokenize:
            for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens):
                # Tokens in the basic tokenizer's never_split set are kept whole
                if token in self.basic_tokenizer.never_split:
                    split_tokens.append(token)
                else:
                    split_tokens += self.wordpiece_tokenizer.tokenize(token)
        else:
            split_tokens = self.wordpiece_tokenizer.tokenize(text)
        return split_tokens

    def _convert_token_to_id(self, token):
        """
        Converts a token (str) to its corresponding ID using the vocabulary.

        Args:
            token (str): The token to be converted.

        Returns:
            int: The ID corresponding to the token. Defaults to the ID of unknown token if not found.
        """
        return self.vocab.get(token, self.vocab.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """
        Converts an index (integer) to its corresponding token (str) using the vocabulary.

        Args:
            index (int): The index to be converted.

        Returns:
            str: The token corresponding to the index. Defaults to the unknown token if not found.
        """
        return self.ids_to_tokens.get(index, self.unk_token)

    def convert_tokens_to_string(self, tokens):
        """
        Converts a sequence of tokens (list of str) into a single string, removing prefix '##'.

        Args:
            tokens (List[str]): List of tokens to be joined into a string.

        Returns:
            str: The concatenated string of tokens.
        """
        out_string = " ".join(tokens).replace(" ##", "").strip()
        return out_string

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Builds model inputs from a sequence or a pair of sequences by adding special tokens.

        The resulting layout is `cls + A + sep` for a single sequence and `cls + A + sep + sep + B + sep` for a pair.

        Args:
            token_ids_0 (`List[int]`):
                List of input IDs for the first sequence.
            token_ids_1 (`List[int]`, *optional*):
                Optional list of input IDs for the second sequence (for pair tasks).

        Returns:
            `List[int]`: List of input IDs with the special tokens added.
        """
        if token_ids_1 is None:
            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
        cls = [self.cls_token_id]
        sep = [self.sep_token_id]
        return cls + token_ids_0 + sep + sep + token_ids_1 + sep

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` methods.

        Args:
            token_ids_0 (`List[int]`):
                List of ids.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Set to True if the token list is already formatted with special tokens for the model

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        # Inputs that already contain special tokens are handled by the superclass implementation
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        # Otherwise mirror the layout produced by `build_inputs_with_special_tokens`
        if token_ids_1 is None:
            return [1] + ([0] * len(token_ids_0)) + [1]
        return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Creates a mask from the two sequences passed to be used in a sequence-pair classification task. MPNet does not
        make use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`List[int]`):
                List of ids.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of zeros.
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]

        # The length matches the layout produced by `build_inputs_with_special_tokens`
        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]
        return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Writes the base vocabulary to a text file, one token per line, ordered by token id.

        Args:
            save_directory (`str`):
                An existing directory in which to write the vocabulary file; when it is not a directory, the value
                is used as the file path itself.
            filename_prefix (`str`, *optional*):
                Prefix prepended (followed by `-`) to the file name.

        Returns:
            `Tuple[str]`: A one-element tuple holding the path of the written file.
        """
        index = 0
        if os.path.isdir(save_directory):
            vocab_file = os.path.join(
                save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
            )
        else:
            vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory
        with open(vocab_file, "w", encoding="utf-8") as writer:
            for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
                # A gap in the ids is only reported; the tokens are still written back to back
                if index != token_index:
                    logger.warning(
                        f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
                        " Please check that the vocabulary is not corrupted!"
                    )
                    index = token_index
                writer.write(token + "\n")
                index += 1
        return (vocab_file,)
# Copied from transformers.models.bert.tokenization_bert.BasicTokenizer
class BasicTokenizer:
    """
    Runs basic tokenization: text cleanup, optional spacing of CJK characters, lower casing / accent stripping and
    punctuation splitting.

    Args:
        do_lower_case (`bool`, *optional*, defaults to `True`):
            Whether or not to lowercase the input when tokenizing.
        never_split (`Iterable`, *optional*):
            Collection of tokens which will never be split during tokenization.
        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
            Whether or not to surround CJK characters with whitespace so that each becomes its own token.
        strip_accents (`bool`, *optional*):
            Whether or not to strip all accents. When left as `None`, accents are stripped exactly when
            `do_lower_case` is enabled.
        do_split_on_punc (`bool`, *optional*, defaults to `True`):
            Whether or not to split tokens on punctuation characters.
    """

    def __init__(
        self,
        do_lower_case=True,
        never_split=None,
        tokenize_chinese_chars=True,
        strip_accents=None,
        do_split_on_punc=True,
    ):
        # Stored as a set for fast membership tests; a missing value means "nothing is protected"
        self.never_split = set() if never_split is None else set(never_split)
        self.do_lower_case = do_lower_case
        self.tokenize_chinese_chars = tokenize_chinese_chars
        self.strip_accents = strip_accents
        self.do_split_on_punc = do_split_on_punc

    def tokenize(self, text, never_split=None):
        """
        Basic tokenization of a piece of text. For sub-word tokenization, see `WordpieceTokenizer`.

        Args:
            text (`str`):
                The text to tokenize.
            never_split (`List[str]`, *optional*):
                Extra tokens not to split, merged with the ones given at construction time.

        Returns:
            `List[str]`: The resulting tokens.
        """
        if never_split:
            protected = self.never_split | set(never_split)
        else:
            protected = self.never_split

        cleaned = self._clean_text(text)
        if self.tokenize_chinese_chars:
            cleaned = self._tokenize_chinese_chars(cleaned)
        # Compose characters (NFC) before splitting on whitespace
        normalized = unicodedata.normalize("NFC", cleaned)

        pieces = []
        for word in whitespace_tokenize(normalized):
            if word not in protected:
                if self.do_lower_case:
                    word = word.lower()
                    # When lower casing, accents are stripped unless explicitly disabled
                    wants_strip = self.strip_accents is not False
                else:
                    wants_strip = bool(self.strip_accents)
                if wants_strip:
                    word = self._run_strip_accents(word)
            pieces.extend(self._run_split_on_punc(word, protected))

        # Re-join and re-split so the output never contains empty or whitespace-carrying tokens
        return whitespace_tokenize(" ".join(pieces))

    def _run_strip_accents(self, text):
        """Return `text` decomposed to NFD with every nonspacing mark (category "Mn") removed."""
        decomposed = unicodedata.normalize("NFD", text)
        return "".join(ch for ch in decomposed if unicodedata.category(ch) != "Mn")

    def _run_split_on_punc(self, text, never_split=None):
        """Split `text` so that every punctuation character becomes a token of its own."""
        is_protected = never_split is not None and text in never_split
        if is_protected or not self.do_split_on_punc:
            return [text]

        pieces = []
        inside_word = False
        for ch in text:
            if _is_punctuation(ch):
                # Punctuation always stands alone and ends the current word
                pieces.append(ch)
                inside_word = False
            elif inside_word:
                pieces[-1] += ch
            else:
                pieces.append(ch)
                inside_word = True
        return pieces

    def _tokenize_chinese_chars(self, text):
        """Return `text` with a space inserted on both sides of every CJK character."""
        return "".join(f" {ch} " if self._is_chinese_char(ord(ch)) else ch for ch in text)

    def _is_chinese_char(self, cp):
        """Tell whether the code point `cp` lies in one of the CJK ranges handled by this tokenizer."""
        cjk_ranges = (
            (0x4E00, 0x9FFF),
            (0x3400, 0x4DBF),
            (0x20000, 0x2A6DF),
            (0x2A700, 0x2B73F),
            (0x2B740, 0x2B81F),
            (0x2B820, 0x2CEAF),
            (0xF900, 0xFAFF),
            (0x2F800, 0x2FA1F),
        )
        return any(low <= cp <= high for low, high in cjk_ranges)

    def _clean_text(self, text):
        """Drop NUL, U+FFFD and control characters from `text` and turn every whitespace character into a space."""
        kept = []
        for ch in text:
            if ord(ch) in (0, 0xFFFD) or _is_control(ch):
                continue
            kept.append(" " if _is_whitespace(ch) else ch)
        return "".join(kept)
# Copied from transformers.models.bert.tokenization_bert.WordpieceTokenizer
class WordpieceTokenizer:
    """Runs WordPiece tokenization."""

    def __init__(self, vocab, unk_token, max_input_chars_per_word=100):
        # vocab: container of known word pieces (only membership tests are performed on it)
        self.vocab = vocab
        # unk_token: emitted for any word that cannot be fully split into known pieces
        self.unk_token = unk_token
        # Words longer than this many characters are mapped to unk_token without being searched
        self.max_input_chars_per_word = max_input_chars_per_word

    def tokenize(self, text):
        """
        Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
        tokenization using the given vocabulary.

        For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`.

        Args:
            text: A single token or whitespace separated tokens. This should have
                already been passed through *BasicTokenizer*.

        Returns:
            A list of wordpiece tokens.
        """
        result = []
        for word in whitespace_tokenize(text):
            length = len(word)
            if length > self.max_input_chars_per_word:
                result.append(self.unk_token)
                continue

            word_pieces = []
            begin = 0
            while begin < length:
                # Try the longest remaining substring first, shrinking from the right
                for stop in range(length, begin, -1):
                    candidate = word[begin:stop]
                    if begin > 0:
                        # Pieces that do not start the word carry the continuation prefix
                        candidate = "##" + candidate
                    if candidate in self.vocab:
                        word_pieces.append(candidate)
                        begin = stop
                        break
                else:
                    # No prefix of the remainder is known: the whole word becomes unk_token
                    word_pieces = None
                    break

            if word_pieces is None:
                result.append(self.unk_token)
            else:
                result.extend(word_pieces)
        return result

Transformers-源码解析-七十八-

Transformers 源码解析（七十八）

.\models\mobilevit\__init__.py

.\models\mobilevitv2\configuration_mobilevitv2.py

.\models\mobilevitv2\convert_mlcvnets_to_pytorch.py

.\models\mobilevitv2\modeling_mobilevitv2.py

.\models\mobilevitv2\__init__.py

.\models\mpnet\configuration_mpnet.py

.\models\mpnet\modeling_mpnet.py

.\models\mpnet\modeling_tf_mpnet.py

.\models\mpnet\tokenization_mpnet.py

`.\models\mobilevit\__init__.py`

`.\models\mobilevitv2\configuration_mobilevitv2.py`

`.\models\mobilevitv2\convert_mlcvnets_to_pytorch.py`

`.\models\mobilevitv2\modeling_mobilevitv2.py`

`.\models\mobilevitv2\__init__.py`

`.\models\mpnet\configuration_mpnet.py`

`.\models\mpnet\modeling_mpnet.py`

`.\models\mpnet\modeling_tf_mpnet.py`

`.\models\mpnet\tokenization_mpnet.py`