Transformers Source Code Walkthrough (Part 31)
.\models\ctrl\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_tf_available, is_torch_available
_import_structure = {
"configuration_ctrl": ["CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP", "CTRLConfig"],
"tokenization_ctrl": ["CTRLTokenizer"],
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_ctrl"] = [
"CTRL_PRETRAINED_MODEL_ARCHIVE_LIST",
"CTRLForSequenceClassification",
"CTRLLMHeadModel",
"CTRLModel",
"CTRLPreTrainedModel",
]
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_tf_ctrl"] = [
"TF_CTRL_PRETRAINED_MODEL_ARCHIVE_LIST",
"TFCTRLForSequenceClassification",
"TFCTRLLMHeadModel",
"TFCTRLModel",
"TFCTRLPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_ctrl import CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP, CTRLConfig
from .tokenization_ctrl import CTRLTokenizer
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_ctrl import (
CTRL_PRETRAINED_MODEL_ARCHIVE_LIST,
CTRLForSequenceClassification,
CTRLLMHeadModel,
CTRLModel,
CTRLPreTrainedModel,
)
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_tf_ctrl import (
TF_CTRL_PRETRAINED_MODEL_ARCHIVE_LIST,
TFCTRLForSequenceClassification,
TFCTRLLMHeadModel,
TFCTRLModel,
TFCTRLPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
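# Usage sketch: thanks to the _LazyModule registration above, importing the package is cheap and the
# heavy modeling_ctrl / modeling_tf_ctrl submodules are only loaded on first attribute access. The tiny
# config values below are hypothetical and PyTorch is assumed to be installed.
import transformers  # importing the package does not yet import modeling_ctrl

config = transformers.CTRLConfig(n_embd=64, n_layer=2, n_head=2, dff=128)  # hypothetical tiny config
model = transformers.CTRLLMHeadModel(config)  # this attribute access triggers the lazy import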
.\models\cvt\configuration_cvt.py
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
CVT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"microsoft/cvt-13": "https://huggingface.co/microsoft/cvt-13/resolve/main/config.json",
}
class CvtConfig(PretrainedConfig):
r"""
    This is the configuration class to store the configuration of a [`CvtModel`]. It is used to instantiate a CvT
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with
    the defaults will yield a configuration similar to that of the CvT
    [microsoft/cvt-13](https://huggingface.co/microsoft/cvt-13) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Example:
```
>>> from transformers import CvtConfig, CvtModel
    >>> # Initializing a Cvt msft/cvt style configuration
    >>> configuration = CvtConfig()

    >>> # Initializing a model (with random weights) from the configuration
    >>> model = CvtModel(configuration)

    >>> # Accessing the model configuration
>>> configuration = model.config
```
"""
model_type = "cvt"
def __init__(
self,
num_channels=3,
patch_sizes=[7, 3, 3],
patch_stride=[4, 2, 2],
patch_padding=[2, 1, 1],
embed_dim=[64, 192, 384],
num_heads=[1, 3, 6],
depth=[1, 2, 10],
mlp_ratio=[4.0, 4.0, 4.0],
attention_drop_rate=[0.0, 0.0, 0.0],
drop_rate=[0.0, 0.0, 0.0],
drop_path_rate=[0.0, 0.0, 0.1],
qkv_bias=[True, True, True],
cls_token=[False, False, True],
qkv_projection_method=["dw_bn", "dw_bn", "dw_bn"],
kernel_qkv=[3, 3, 3],
padding_kv=[1, 1, 1],
stride_kv=[2, 2, 2],
padding_q=[1, 1, 1],
stride_q=[1, 1, 1],
initializer_range=0.02,
layer_norm_eps=1e-12,
**kwargs,
):
super().__init__(**kwargs)
self.num_channels = num_channels
self.patch_sizes = patch_sizes
self.patch_stride = patch_stride
self.patch_padding = patch_padding
self.embed_dim = embed_dim
self.num_heads = num_heads
self.depth = depth
self.mlp_ratio = mlp_ratio
self.attention_drop_rate = attention_drop_rate
self.drop_rate = drop_rate
self.drop_path_rate = drop_path_rate
self.qkv_bias = qkv_bias
self.cls_token = cls_token
self.qkv_projection_method = qkv_projection_method
self.kernel_qkv = kernel_qkv
self.padding_kv = padding_kv
self.stride_kv = stride_kv
self.padding_q = padding_q
self.stride_q = stride_q
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
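# Note that almost every CvtConfig argument is a per-stage list of length three. A short, hypothetical
# sketch of overriding one of them, e.g. to get the deeper CvT-21 layout used by the conversion script below:
from transformers import CvtConfig

config = CvtConfig(depth=[1, 4, 16])  # hypothetical override; the defaults correspond to CvT-13
print(config.depth)      # [1, 4, 16]
print(config.embed_dim)  # [64, 192, 384] -- one entry per stage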
.\models\cvt\convert_cvt_original_pytorch_checkpoint_to_pytorch.py
import argparse
import json
from collections import OrderedDict

import torch
from huggingface_hub import cached_download, hf_hub_url

from transformers import AutoImageProcessor, CvtConfig, CvtForImageClassification


def embeddings(idx):
"""
The function helps in renaming embedding layer weights.
Args:
idx: stage number in original model
"""
embed = []
embed.append(
(
f"cvt.encoder.stages.{idx}.embedding.convolution_embeddings.projection.weight",
f"stage{idx}.patch_embed.proj.weight",
)
)
embed.append(
(
f"cvt.encoder.stages.{idx}.embedding.convolution_embeddings.projection.bias",
f"stage{idx}.patch_embed.proj.bias",
)
)
embed.append(
(
f"cvt.encoder.stages.{idx}.embedding.convolution_embeddings.normalization.weight",
f"stage{idx}.patch_embed.norm.weight",
)
)
embed.append(
(
f"cvt.encoder.stages.{idx}.embedding.convolution_embeddings.normalization.bias",
f"stage{idx}.patch_embed.norm.bias",
)
)
return embed
def attention(idx, cnt):
"""
The function helps in renaming attention block layers weights.
Args:
idx: stage number in original model
cnt: count of blocks in each stage
"""
attention_weights = []
attention_weights.append(
(
f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_query.convolution_projection.convolution.weight",
f"stage{idx}.blocks.{cnt}.attn.conv_proj_q.conv.weight",
)
)
attention_weights.append(
(
f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_query.convolution_projection.normalization.weight",
f"stage{idx}.blocks.{cnt}.attn.conv_proj_q.bn.weight",
)
)
attention_weights.append(
(
f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_query.convolution_projection.normalization.bias",
f"stage{idx}.blocks.{cnt}.attn.conv_proj_q.bn.bias",
)
)
attention_weights.append(
(
f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_query.convolution_projection.normalization.running_mean",
f"stage{idx}.blocks.{cnt}.attn.conv_proj_q.bn.running_mean",
)
)
attention_weights.append(
(
f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_query.convolution_projection.normalization.running_var",
f"stage{idx}.blocks.{cnt}.attn.conv_proj_q.bn.running_var",
)
)
attention_weights.append(
(
f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_query.convolution_projection.normalization.num_batches_tracked",
f"stage{idx}.blocks.{cnt}.attn.conv_proj_q.bn.num_batches_tracked",
)
)
attention_weights.append(
(
f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_key.convolution_projection.convolution.weight",
f"stage{idx}.blocks.{cnt}.attn.conv_proj_k.conv.weight",
)
)
attention_weights.append(
(
f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_key.convolution_projection.normalization.weight",
f"stage{idx}.blocks.{cnt}.attn.conv_proj_k.bn.weight",
)
)
attention_weights.append(
(
f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_key.convolution_projection.normalization.bias",
f"stage{idx}.blocks.{cnt}.attn.conv_proj_k.bn.bias",
)
)
attention_weights.append(
(
f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_key.convolution_projection.normalization.running_mean",
f"stage{idx}.blocks.{cnt}.attn.conv_proj_k.bn.running_mean",
)
)
attention_weights.append(
(
f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_key.convolution_projection.normalization.running_var",
f"stage{idx}.blocks.{cnt}.attn.conv_proj_k.bn.running_var",
)
)
attention_weights.append(
(
f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_key.convolution_projection.normalization.num_batches_tracked",
f"stage{idx}.blocks.{cnt}.attn.conv_proj_k.bn.num_batches_tracked",
)
)
attention_weights.append(
(
f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_value.convolution_projection.convolution.weight",
f"stage{idx}.blocks.{cnt}.attn.conv_proj_v.conv.weight",
)
)
attention_weights.append(
(
f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_value.convolution_projection.normalization.weight",
f"stage{idx}.blocks.{cnt}.attn.conv_proj_v.bn.weight",
)
)
attention_weights.append(
(
f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_value.convolution_projection.normalization.bias",
f"stage{idx}.blocks.{cnt}.attn.conv_proj_v.bn.bias",
)
)
attention_weights.append(
(
f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_value.convolution_projection.normalization.running_mean",
f"stage{idx}.blocks.{cnt}.attn.conv_proj_v.bn.running_mean",
)
)
attention_weights.append(
(
f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_value.convolution_projection.normalization.running_var",
f"stage{idx}.blocks.{cnt}.attn.conv_proj_v.bn.running_var",
)
)
attention_weights.append(
(
f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_value.convolution_projection.normalization.num_batches_tracked",
f"stage{idx}.blocks.{cnt}.attn.conv_proj_v.bn.num_batches_tracked",
)
)
attention_weights.append(
(
f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.projection_query.weight",
f"stage{idx}.blocks.{cnt}.attn.proj_q.weight",
)
)
attention_weights.append(
(
f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.projection_query.bias",
f"stage{idx}.blocks.{cnt}.attn.proj_q.bias",
)
)
attention_weights.append(
(
f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.projection_key.weight",
f"stage{idx}.blocks.{cnt}.attn.proj_k.weight",
)
)
attention_weights.append(
(
f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.projection_key.bias",
f"stage{idx}.blocks.{cnt}.attn.proj_k.bias",
)
)
attention_weights.append(
(
f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.projection_value.weight",
f"stage{idx}.blocks.{cnt}.attn.proj_v.weight",
)
)
attention_weights.append(
(
f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.projection_value.bias",
f"stage{idx}.blocks.{cnt}.attn.proj_v.bias",
)
)
attention_weights.append(
(
f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.output.dense.weight",
f"stage{idx}.blocks.{cnt}.attn.proj.weight",
)
)
attention_weights.append(
(
f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.output.dense.bias",
f"stage{idx}.blocks.{cnt}.attn.proj.bias",
)
)
attention_weights.append(
(
f"cvt.encoder.stages.{idx}.layers.{cnt}.intermediate.dense.weight",
f"stage{idx}.blocks.{cnt}.mlp.fc1.weight"
)
)
attention_weights.append(
(
f"cvt.encoder.stages.{idx}.layers.{cnt}.intermediate.dense.bias",
f"stage{idx}.blocks.{cnt}.mlp.fc1.bias"
)
)
attention_weights.append(
(
f"cvt.encoder.stages.{idx}.layers.{cnt}.output.dense.weight",
f"stage{idx}.blocks.{cnt}.mlp.fc2.weight"
)
)
attention_weights.append(
(
f"cvt.encoder.stages.{idx}.layers.{cnt}.output.dense.bias",
f"stage{idx}.blocks.{cnt}.mlp.fc2.bias"
)
)
attention_weights.append(
(
f"cvt.encoder.stages.{idx}.layers.{cnt}.layernorm_before.weight",
f"stage{idx}.blocks.{cnt}.norm1.weight"
)
)
attention_weights.append(
(
f"cvt.encoder.stages.{idx}.layers.{cnt}.layernorm_before.bias",
f"stage{idx}.blocks.{cnt}.norm1.bias"
)
)
attention_weights.append(
(
f"cvt.encoder.stages.{idx}.layers.{cnt}.layernorm_after.weight",
f"stage{idx}.blocks.{cnt}.norm2.weight"
)
)
attention_weights.append(
(
f"cvt.encoder.stages.{idx}.layers.{cnt}.layernorm_after.bias",
f"stage{idx}.blocks.{cnt}.norm2.bias"
)
)
return attention_weights
def cls_token(idx):
token = []
token.append((f"cvt.encoder.stages.{idx}.cls_token", "stage2.cls_token"))
return token
def final():
head = []
head.append(("layernorm.weight", "norm.weight"))
head.append(("layernorm.bias", "norm.bias"))
head.append(("classifier.weight", "head.weight"))
head.append(("classifier.bias", "head.bias"))
return head
def convert_cvt_checkpoint(cvt_model, image_size, cvt_file_name, pytorch_dump_folder):
img_labels_file = "imagenet-1k-id2label.json"
num_labels = 1000
repo_id = "huggingface/label-files"
id2label = json.load(open(cached_download(hf_hub_url(repo_id, img_labels_file, repo_type="dataset")), "r"))
id2label = {int(k): v for k, v in id2label.items()}
label2id = {v: k for k, v in id2label.items()}
config = CvtConfig(num_labels=num_labels, id2label=id2label, label2id=label2id)
if cvt_model.rsplit("/", 1)[-1][4:6] == "13":
config.depth = [1, 2, 10]
elif cvt_model.rsplit("/", 1)[-1][4:6] == "21":
config.depth = [1, 4, 16]
else:
config.depth = [2, 2, 20]
config.num_heads = [3, 12, 16]
config.embed_dim = [192, 768, 1024]
model = CvtForImageClassification(config)
image_processor = AutoImageProcessor.from_pretrained("facebook/convnext-base-224-22k-1k")
image_processor.size["shortest_edge"] = image_size
original_weights = torch.load(cvt_file_name, map_location=torch.device("cpu"))
huggingface_weights = OrderedDict()
list_of_state_dict = []
for idx in range(len(config.depth)):
if config.cls_token[idx]:
list_of_state_dict = list_of_state_dict + cls_token(idx)
list_of_state_dict = list_of_state_dict + embeddings(idx)
for cnt in range(config.depth[idx]):
list_of_state_dict = list_of_state_dict + attention(idx, cnt)
list_of_state_dict = list_of_state_dict + final()
for i in range(len(list_of_state_dict)):
huggingface_weights[list_of_state_dict[i][0]] = original_weights[list_of_state_dict[i][1]]
model.load_state_dict(huggingface_weights)
model.save_pretrained(pytorch_dump_folder)
image_processor.save_pretrained(pytorch_dump_folder)
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--cvt_model",
        default="cvt-w24",
        type=str,
        help="Name of the cvt model you'd like to convert.",
    )
    parser.add_argument(
        "--image_size",
        default=384,
        type=int,
        help="Input Image Size",
    )
    parser.add_argument(
        "--cvt_file_name",
        default=r"cvtmodels\CvT-w24-384x384-IN-22k.pth",
        type=str,
        help="Path to the original (non-HF) CvT checkpoint file.",
    )
    parser.add_argument(
        "--pytorch_dump_folder_path",
        default=None,
        type=str,
        help="Path to the output PyTorch model directory.",
    )
    args = parser.parse_args()
    convert_cvt_checkpoint(args.cvt_model, args.image_size, args.cvt_file_name, args.pytorch_dump_folder_path)
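# The argument parser above is only a thin wrapper; the conversion can also be driven directly from
# Python. A hypothetical invocation, assuming the original Microsoft checkpoint has already been
# downloaded next to the script:
convert_cvt_checkpoint(
    cvt_model="cvt-13",
    image_size=384,
    cvt_file_name="CvT-13-384x384-IN-22k.pth",  # assumed local path to the original checkpoint
    pytorch_dump_folder="./cvt-13-converted",
)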
.\models\cvt\modeling_cvt.py
""" PyTorch CvT model."""
import collections.abc
from dataclasses import dataclass
from typing import Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward
from ...modeling_outputs import ImageClassifierOutputWithNoAttention, ModelOutput
from ...modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import logging
from .configuration_cvt import CvtConfig
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "CvtConfig"
_CHECKPOINT_FOR_DOC = "microsoft/cvt-13"
_EXPECTED_OUTPUT_SHAPE = [1, 384, 14, 14]
_IMAGE_CLASS_CHECKPOINT = "microsoft/cvt-13"
_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat"
CVT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"microsoft/cvt-13",
"microsoft/cvt-13-384",
"microsoft/cvt-13-384-22k",
"microsoft/cvt-21",
"microsoft/cvt-21-384",
"microsoft/cvt-21-384-22k",
]
@dataclass
class BaseModelOutputWithCLSToken(ModelOutput):
"""
Base class for model's outputs, with potential hidden states and attentions.
Args:
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
cls_token_value (`torch.FloatTensor` of shape `(batch_size, 1, hidden_size)`):
Classification token at the output of the last layer of the model.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
plus the initial embedding outputs.
"""
last_hidden_state: torch.FloatTensor = None
cls_token_value: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
"""
Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
argument.
"""
if drop_prob == 0.0 or not training:
return input
keep_prob = 1 - drop_prob
shape = (input.shape[0],) + (1,) * (input.ndim - 1)
random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
random_tensor.floor_()
output = input.div(keep_prob) * random_tensor
return output
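# Quick sanity sketch of the stochastic-depth behaviour implemented above: whole samples are either
# zeroed or rescaled by 1 / keep_prob, so the output equals the input in expectation. The shapes here
# are arbitrary.
x = torch.ones(4, 3, 8, 8)
out = drop_path(x, drop_prob=0.2, training=True)
print(out.view(4, -1).amax(dim=1))  # each entry is either 0.0 or 1 / 0.8 == 1.25, varying run to run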
class CvtDropPath(nn.Module):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
def __init__(self, drop_prob: Optional[float] = None) -> None:
super().__init__()
self.drop_prob = drop_prob
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
return drop_path(hidden_states, self.drop_prob, self.training)
def extra_repr(self) -> str:
return "p={}".format(self.drop_prob)
class CvtEmbeddings(nn.Module):
"""
Construct the CvT embeddings.
"""
def __init__(self, patch_size, num_channels, embed_dim, stride, padding, dropout_rate):
super().__init__()
self.convolution_embeddings = CvtConvEmbeddings(
patch_size=patch_size, num_channels=num_channels, embed_dim=embed_dim, stride=stride, padding=padding
)
self.dropout = nn.Dropout(dropout_rate)
def forward(self, pixel_values):
hidden_state = self.convolution_embeddings(pixel_values)
hidden_state = self.dropout(hidden_state)
return hidden_state
class CvtConvEmbeddings(nn.Module):
"""
Image to Conv Embedding.
"""
def __init__(self, patch_size, num_channels, embed_dim, stride, padding):
super().__init__()
patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
self.patch_size = patch_size
self.projection = nn.Conv2d(num_channels, embed_dim, kernel_size=patch_size, stride=stride, padding=padding)
self.normalization = nn.LayerNorm(embed_dim)
def forward(self, pixel_values):
pixel_values = self.projection(pixel_values)
batch_size, num_channels, height, width = pixel_values.shape
hidden_size = height * width
pixel_values = pixel_values.view(batch_size, num_channels, hidden_size).permute(0, 2, 1)
if self.normalization:
pixel_values = self.normalization(pixel_values)
pixel_values = pixel_values.permute(0, 2, 1).view(batch_size, num_channels, height, width)
return pixel_values
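# Shape walk-through for stage 0 of the default configuration (a sketch assuming a 224x224 input): the
# 7x7 / stride-4 convolution maps (1, 3, 224, 224) to (1, 64, 56, 56); the feature map is flattened to
# (1, 3136, 64) for the LayerNorm and then reshaped back.
embeddings = CvtConvEmbeddings(patch_size=7, num_channels=3, embed_dim=64, stride=4, padding=2)
print(embeddings(torch.randn(1, 3, 224, 224)).shape)  # torch.Size([1, 64, 56, 56])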
class CvtSelfAttentionConvProjection(nn.Module):
def __init__(self, embed_dim, kernel_size, padding, stride):
super().__init__()
self.convolution = nn.Conv2d(
embed_dim,
embed_dim,
kernel_size=kernel_size,
padding=padding,
stride=stride,
bias=False,
groups=embed_dim,
)
self.normalization = nn.BatchNorm2d(embed_dim)
def forward(self, hidden_state):
hidden_state = self.convolution(hidden_state)
hidden_state = self.normalization(hidden_state)
return hidden_state
class CvtSelfAttentionLinearProjection(nn.Module):
def forward(self, hidden_state):
batch_size, num_channels, height, width = hidden_state.shape
hidden_size = height * width
hidden_state = hidden_state.view(batch_size, num_channels, hidden_size).permute(0, 2, 1)
return hidden_state
class CvtSelfAttentionProjection(nn.Module):
def __init__(self, embed_dim, kernel_size, padding, stride, projection_method="dw_bn"):
super().__init__()
if projection_method == "dw_bn":
self.convolution_projection = CvtSelfAttentionConvProjection(embed_dim, kernel_size, padding, stride)
self.linear_projection = CvtSelfAttentionLinearProjection()
def forward(self, hidden_state):
hidden_state = self.convolution_projection(hidden_state)
hidden_state = self.linear_projection(hidden_state)
return hidden_state
class CvtSelfAttention(nn.Module):
def __init__(
self,
num_heads,
embed_dim,
kernel_size,
padding_q,
padding_kv,
stride_q,
stride_kv,
qkv_projection_method,
qkv_bias,
attention_drop_rate,
with_cls_token=True,
**kwargs,
):
super().__init__()
self.scale = embed_dim**-0.5
self.with_cls_token = with_cls_token
self.embed_dim = embed_dim
self.num_heads = num_heads
self.convolution_projection_query = CvtSelfAttentionProjection(
embed_dim,
kernel_size,
padding_q,
stride_q,
projection_method="linear" if qkv_projection_method == "avg" else qkv_projection_method,
)
self.convolution_projection_key = CvtSelfAttentionProjection(
embed_dim, kernel_size, padding_kv, stride_kv, projection_method=qkv_projection_method
)
self.convolution_projection_value = CvtSelfAttentionProjection(
embed_dim, kernel_size, padding_kv, stride_kv, projection_method=qkv_projection_method
)
self.projection_query = nn.Linear(embed_dim, embed_dim, bias=qkv_bias)
self.projection_key = nn.Linear(embed_dim, embed_dim, bias=qkv_bias)
self.projection_value = nn.Linear(embed_dim, embed_dim, bias=qkv_bias)
self.dropout = nn.Dropout(attention_drop_rate)
def rearrange_for_multi_head_attention(self, hidden_state):
batch_size, hidden_size, _ = hidden_state.shape
head_dim = self.embed_dim // self.num_heads
return hidden_state.view(batch_size, hidden_size, self.num_heads, head_dim).permute(0, 2, 1, 3)
def forward(self, hidden_state, height, width):
if self.with_cls_token:
cls_token, hidden_state = torch.split(hidden_state, [1, height * width], 1)
batch_size, hidden_size, num_channels = hidden_state.shape
hidden_state = hidden_state.permute(0, 2, 1).view(batch_size, num_channels, height, width)
key = self.convolution_projection_key(hidden_state)
query = self.convolution_projection_query(hidden_state)
value = self.convolution_projection_value(hidden_state)
if self.with_cls_token:
query = torch.cat((cls_token, query), dim=1)
key = torch.cat((cls_token, key), dim=1)
value = torch.cat((cls_token, value), dim=1)
head_dim = self.embed_dim // self.num_heads
query = self.rearrange_for_multi_head_attention(self.projection_query(query))
key = self.rearrange_for_multi_head_attention(self.projection_key(key))
value = self.rearrange_for_multi_head_attention(self.projection_value(value))
attention_score = torch.einsum("bhlk,bhtk->bhlt", [query, key]) * self.scale
attention_probs = torch.nn.functional.softmax(attention_score, dim=-1)
attention_probs = self.dropout(attention_probs)
context = torch.einsum("bhlt,bhtv->bhlv", [attention_probs, value])
_, _, hidden_size, _ = context.shape
context = context.permute(0, 2, 1, 3).contiguous().view(batch_size, hidden_size, self.num_heads * head_dim)
return context
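# The two einsum calls above are ordinary batched matrix products over the head dimension:
# "bhlk,bhtk->bhlt" is query @ key.transpose(-1, -2) and "bhlt,bhtv->bhlv" is attention_probs @ value.
# A small standalone equivalence check with arbitrary shapes:
q = torch.randn(2, 4, 10, 16)  # (batch, heads, tokens, head_dim)
k = torch.randn(2, 4, 10, 16)
print(torch.allclose(torch.einsum("bhlk,bhtk->bhlt", [q, k]), q @ k.transpose(-1, -2), atol=1e-6))  # True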
class CvtSelfOutput(nn.Module):
"""
The residual connection is defined in CvtLayer instead of here (as is the case with other models), due to the
layernorm applied before each block.
"""
def __init__(self, embed_dim, drop_rate):
super().__init__()
self.dense = nn.Linear(embed_dim, embed_dim)
self.dropout = nn.Dropout(drop_rate)
def forward(self, hidden_state, input_tensor):
hidden_state = self.dense(hidden_state)
hidden_state = self.dropout(hidden_state)
return hidden_state
class CvtAttention(nn.Module):
def __init__(
self,
num_heads,
embed_dim,
kernel_size,
padding_q,
padding_kv,
stride_q,
stride_kv,
qkv_projection_method,
qkv_bias,
attention_drop_rate,
drop_rate,
with_cls_token=True,
):
super().__init__()
self.attention = CvtSelfAttention(
num_heads,
embed_dim,
kernel_size,
padding_q,
padding_kv,
stride_q,
stride_kv,
qkv_projection_method,
qkv_bias,
attention_drop_rate,
with_cls_token,
)
self.output = CvtSelfOutput(embed_dim, drop_rate)
self.pruned_heads = set()
def prune_heads(self, heads):
if len(heads) == 0:
return
heads, index = find_pruneable_heads_and_indices(
heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads
)
self.attention.query = prune_linear_layer(self.attention.query, index)
self.attention.key = prune_linear_layer(self.attention.key, index)
self.attention.value = prune_linear_layer(self.attention.value, index)
self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads)
self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads
self.pruned_heads = self.pruned_heads.union(heads)
def forward(self, hidden_state, height, width):
self_output = self.attention(hidden_state, height, width)
attention_output = self.output(self_output, hidden_state)
return attention_output
class CvtIntermediate(nn.Module):
def __init__(self, embed_dim, mlp_ratio):
super().__init__()
self.dense = nn.Linear(embed_dim, int(embed_dim * mlp_ratio))
self.activation = nn.GELU()
def forward(self, hidden_state):
hidden_state = self.dense(hidden_state)
hidden_state = self.activation(hidden_state)
return hidden_state
class CvtOutput(nn.Module):
def __init__(self, embed_dim, mlp_ratio, drop_rate):
super().__init__()
self.dense = nn.Linear(int(embed_dim * mlp_ratio), embed_dim)
self.dropout = nn.Dropout(drop_rate)
def forward(self, hidden_state, input_tensor):
hidden_state = self.dense(hidden_state)
hidden_state = self.dropout(hidden_state)
hidden_state = hidden_state + input_tensor
return hidden_state
class CvtLayer(nn.Module):
"""
CvtLayer composed by attention layers, normalization and multi-layer perceptrons (mlps).
"""
def __init__(
self,
num_heads,
embed_dim,
kernel_size,
padding_q,
padding_kv,
stride_q,
stride_kv,
qkv_projection_method,
qkv_bias,
attention_drop_rate,
drop_rate,
mlp_ratio,
drop_path_rate,
with_cls_token=True,
):
super().__init__()
self.attention = CvtAttention(
num_heads,
embed_dim,
kernel_size,
padding_q,
padding_kv,
stride_q,
stride_kv,
qkv_projection_method,
qkv_bias,
attention_drop_rate,
drop_rate,
with_cls_token,
)
self.intermediate = CvtIntermediate(embed_dim, mlp_ratio)
self.output = CvtOutput(embed_dim, mlp_ratio, drop_rate)
self.drop_path = CvtDropPath(drop_prob=drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
self.layernorm_before = nn.LayerNorm(embed_dim)
self.layernorm_after = nn.LayerNorm(embed_dim)
def forward(self, hidden_state, height, width):
self_attention_output = self.attention(
self.layernorm_before(hidden_state),
height,
width,
)
attention_output = self_attention_output
attention_output = self.drop_path(attention_output)
hidden_state = attention_output + hidden_state
layer_output = self.layernorm_after(hidden_state)
layer_output = self.intermediate(layer_output)
layer_output = self.output(layer_output, hidden_state)
layer_output = self.drop_path(layer_output)
return layer_output
class CvtStage(nn.Module):
    """
    CvT stage (encoder block). Each stage consists of a Convolutional Token Embedding layer followed by a stack of
    Convolutional Transformer Blocks (`CvtLayer`). The classification token is added only in the last stage.
    """

    def __init__(self, config, stage):
super().__init__()
self.config = config
self.stage = stage
if self.config.cls_token[self.stage]:
self.cls_token = nn.Parameter(torch.randn(1, 1, self.config.embed_dim[-1]))
self.embedding = CvtEmbeddings(
patch_size=config.patch_sizes[self.stage],
stride=config.patch_stride[self.stage],
num_channels=config.num_channels if self.stage == 0 else config.embed_dim[self.stage - 1],
embed_dim=config.embed_dim[self.stage],
padding=config.patch_padding[self.stage],
dropout_rate=config.drop_rate[self.stage],
)
drop_path_rates = [x.item() for x in torch.linspace(0, config.drop_path_rate[self.stage], config.depth[stage])]
self.layers = nn.Sequential(
*[
CvtLayer(
num_heads=config.num_heads[self.stage],
embed_dim=config.embed_dim[self.stage],
kernel_size=config.kernel_qkv[self.stage],
padding_q=config.padding_q[self.stage],
padding_kv=config.padding_kv[self.stage],
stride_kv=config.stride_kv[self.stage],
stride_q=config.stride_q[self.stage],
qkv_projection_method=config.qkv_projection_method[self.stage],
qkv_bias=config.qkv_bias[self.stage],
attention_drop_rate=config.attention_drop_rate[self.stage],
drop_rate=config.drop_rate[self.stage],
drop_path_rate=drop_path_rates[self.stage],
mlp_ratio=config.mlp_ratio[self.stage],
with_cls_token=config.cls_token[self.stage],
)
for _ in range(config.depth[self.stage])
]
)
def forward(self, hidden_state):
cls_token = None
hidden_state = self.embedding(hidden_state)
batch_size, num_channels, height, width = hidden_state.shape
hidden_state = hidden_state.view(batch_size, num_channels, height * width).permute(0, 2, 1)
if self.config.cls_token[self.stage]:
cls_token = self.cls_token.expand(batch_size, -1, -1)
hidden_state = torch.cat((cls_token, hidden_state), dim=1)
for layer in self.layers:
layer_outputs = layer(hidden_state, height, width)
hidden_state = layer_outputs
if self.config.cls_token[self.stage]:
cls_token, hidden_state = torch.split(hidden_state, [1, height * width], 1)
hidden_state = hidden_state.permute(0, 2, 1).view(batch_size, num_channels, height, width)
return hidden_state, cls_token
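# With the default configuration each stage shrinks the spatial resolution while widening the channels:
# roughly (1, 64, 56, 56) after stage 0, (1, 192, 28, 28) after stage 1 and (1, 384, 14, 14) after
# stage 2, and only the last stage carries a classification token. A minimal sketch for stage 0,
# assuming a 224x224 input:
config = CvtConfig()
hidden_state, cls_token = CvtStage(config, 0)(torch.randn(1, 3, 224, 224))
print(hidden_state.shape, cls_token)  # torch.Size([1, 64, 56, 56]) None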
"""
# CVTEncoder 类,用于实现一个可变形视觉 Transformer 编码器模型
def __init__(self, config):
# 初始化函数,继承自 nn.Module
super().__init__()
# 保存模型配置信息
self.config = config
# 初始化模型的多个编码阶段
self.stages = nn.ModuleList([])
# 根据配置中的深度信息,逐个创建并添加 CvtStage 实例到 stages 中
for stage_idx in range(len(config.depth)):
self.stages.append(CvtStage(config, stage_idx))
def forward(self, pixel_values, output_hidden_states=False, return_dict=True):
# 初始化隐藏状态和额外输出
all_hidden_states = () if output_hidden_states else None
hidden_state = pixel_values
# 初始化分类标记
cls_token = None
# 遍历每个阶段的模块
for _, (stage_module) in enumerate(self.stages):
# 通过阶段模块处理隐藏状态,同时获取分类标记
hidden_state, cls_token = stage_module(hidden_state)
# 如果需要输出所有隐藏状态,则将当前隐藏状态添加到 all_hidden_states 中
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_state,)
# 如果不返回字典形式的输出,则返回非空的隐藏状态、分类标记和所有隐藏状态
if not return_dict:
return tuple(v for v in [hidden_state, cls_token, all_hidden_states] if v is not None)
# 返回包含分类标记、最终隐藏状态和所有隐藏状态的 BaseModelOutputWithCLSToken 对象
return BaseModelOutputWithCLSToken(
last_hidden_state=hidden_state,
cls_token_value=cls_token,
hidden_states=all_hidden_states,
)
"""
CVT_PRETRAINED_MODEL_DOCSTRING = r"""
This model is an abstract class for handling weights initialization and providing a simple interface for downloading
and loading pretrained models.
Configuration:
config_class (`CvtConfig`): The configuration class holding all parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. To load weights, use the `~PreTrainedModel.from_pretrained` method.
Attributes:
base_model_prefix (`str`): Prefix applied to base model attributes.
main_input_name (`str`): Name of the main input attribute for the model, typically 'pixel_values'.
"""
CVT_INIT_WEIGHTS_DOCSTRING = r"""
Initialize the weights of the model module.
Args:
module (`nn.Module`): The PyTorch module for which weights need to be initialized.
"""
CVT_INIT_WEIGHTS_FUNCTION_DOCSTRING = r"""
Initialize the weights of the provided module.
Args:
module (`nn.Module`): The PyTorch module for which weights need to be initialized.
"""
CVT_INIT_WEIGHTS_LINEAR_CONV2D_DOCSTRING = r"""
Initialize the weights of a Linear or Conv2d module.
Args:
module (`nn.Module`): The PyTorch Linear or Conv2d module.
"""
CVT_INIT_WEIGHTS_LAYERNORM_DOCSTRING = r"""
Initialize the weights of a LayerNorm module.
Args:
module (`nn.Module`): The PyTorch LayerNorm module.
"""
CVT_INIT_WEIGHTS_CVTSTAGE_DOCSTRING = r"""
Initialize the weights of a CvtStage module.
Args:
module (`CvtStage`): The CvtStage module.
"""
CVT_START_DOCSTRING,
CVT_INPUTS_DOCSTRING,
CVT_PRETRAINED_MODEL_DOCSTRING,
CVT_INIT_WEIGHTS_DOCSTRING,
CVT_INIT_WEIGHTS_FUNCTION_DOCSTRING,
CVT_INIT_WEIGHTS_LINEAR_CONV2D_DOCSTRING,
CVT_INIT_WEIGHTS_LAYERNORM_DOCSTRING,
CVT_INIT_WEIGHTS_CVTSTAGE_DOCSTRING```
"""
# CVTEncoder 类,用于实现一个可变形视觉 Transformer 编码器模型
def __init__(self, config):
# 初始化函数,继承自 nn.Module
super().__init__()
# 保存模型配置信息
self.config = config
# 初始化模型的多个编码阶段
self.stages = nn.ModuleList([])
# 根据配置中的深度信息,逐个创建并添加 CvtStage 实例到 stages 中
for stage_idx in range(len(config.depth)):
self.stages.append(CvtStage(config, stage_idx))
def forward(self, pixel_values, output_hidden_states=False, return_dict=True):
# 初始化隐藏状态和额外输出
all_hidden_states = () if output_hidden_states else None
hidden_state = pixel_values
# 初始化分类标记
cls_token = None
# 遍历每个阶段的模块
for _, (stage_module) in enumerate(self.stages):
# 通过阶段模块处理隐藏状态,同时获取分类标记
hidden_state, cls_token = stage_module(hidden_state)
# 如果需要输出所有隐藏状态,则将当前隐藏状态添加到 all_hidden_states 中
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_state,)
# 如果不返回字典形式的输出,则返回非空的隐藏状态、分类标记和所有隐藏状态
if not return_dict:
return tuple(v for v in [hidden_state, cls_token, all_hidden_states] if v is not None)
# 返回包含分类标记、最终隐藏状态和所有隐藏状态的 BaseModelOutputWithCLSToken 对象
return BaseModelOutputWithCLSToken(
last_hidden_state=hidden_state,
cls_token_value=cls_token,
hidden_states=all_hidden_states,
)
"""
CVT_PRETRAINED_MODEL_DOCSTRING = r"""
This model is an abstract class for handling weights initialization and providing a simple interface for downloading
and loading pretrained models.
Configuration:
config_class (`CvtConfig`): The configuration class holding all parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. To load weights, use the `~PreTrainedModel.from_pretrained` method.
Attributes:
base_model_prefix (`str`): Prefix applied to base model attributes.
main_input_name (`str`): Name of the main input attribute for the model, typically 'pixel_values'.
"""
CVT_INIT_WEIGHTS_DOCSTRING = r"""
Initialize the weights of the model module.
Args:
module (`nn.Module`): The PyTorch module for which weights need to be initialized.
"""
CVT_INIT_WEIGHTS_FUNCTION_DOCSTRING = r"""
Initialize the weights of the provided module.
Args:
module (`nn.Module`): The PyTorch module for which weights need to be initialized.
"""
CVT_INIT_WEIGHTS_LINEAR_CONV2D_DOCSTRING = r"""
Initialize the weights of a Linear or Conv2d module.
Args:
module (`nn.Module`): The PyTorch Linear or Conv2d module.
"""
CVT_INIT_WEIGHTS_LAYERNORM_DOCSTRING = r"""
Initialize the weights of a LayerNorm module.
Args:
module (`nn.Module`): The PyTorch LayerNorm module.
"""
CVT_INIT_WEIGHTS_CVTSTAGE_DOCSTRING = r"""
Initialize the weights of a CvtStage module.
Args:
module (`CvtStage`): The CvtStage module.
"""
CVT_START_DOCSTRING,
CVT_INPUTS_DOCSTRING,
CVT_PRETRAINED_MODEL_DOCSTRING,
CVT_INIT_WEIGHTS_DOCSTRING,
CVT_INIT_WEIGHTS_FUNCTION_DOCSTRING,
CVT_INIT_WEIGHTS_LINEAR_CONV2D_DOCSTRING,
CVT_INIT_WEIGHTS_LAYERNORM_DOCSTRING,
CVT_INIT_WEIGHTS_CVTSTAGE_DOCSTRING
CVT_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`CvtImageProcessor.__call__`]
            for details.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""


@add_start_docstrings(
    "Cvt Model transformer outputting raw hidden-states without any specific head on top.",
    CVT_START_DOCSTRING,
)
class CvtModel(CvtPreTrainedModel):
def __init__(self, config, add_pooling_layer=True):
super().__init__(config)
self.config = config
self.encoder = CvtEncoder(config)
self.post_init()
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model.
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
See base class PreTrainedModel
"""
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
@add_start_docstrings_to_model_forward(CVT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithCLSToken,
config_class=_CONFIG_FOR_DOC,
modality="vision",
expected_output=_EXPECTED_OUTPUT_SHAPE,
)
def forward(
self,
pixel_values: Optional[torch.Tensor] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithCLSToken]:
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if pixel_values is None:
raise ValueError("You have to specify pixel_values")
encoder_outputs = self.encoder(
pixel_values,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = encoder_outputs[0]
if not return_dict:
return (sequence_output,) + encoder_outputs[1:]
return BaseModelOutputWithCLSToken(
last_hidden_state=sequence_output,
cls_token_value=encoder_outputs.cls_token_value,
hidden_states=encoder_outputs.hidden_states,
)
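# Minimal end-to-end usage sketch for the bare model, assuming network access to download the
# microsoft/cvt-13 checkpoint and the usual COCO test image; the output shape matches
# _EXPECTED_OUTPUT_SHAPE defined above.
import requests
import torch
from PIL import Image
from transformers import AutoImageProcessor, CvtModel

image = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
processor = AutoImageProcessor.from_pretrained("microsoft/cvt-13")
model = CvtModel.from_pretrained("microsoft/cvt-13")
with torch.no_grad():
    outputs = model(**processor(images=image, return_tensors="pt"))
print(outputs.last_hidden_state.shape)  # torch.Size([1, 384, 14, 14])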
"""
Cvt Model transformer with an image classification head on top (a linear layer on top of the final hidden state of
the [CLS] token) e.g. for ImageNet.
"""
class CvtForImageClassification(CvtPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.cvt = CvtModel(config, add_pooling_layer=False)
self.layernorm = nn.LayerNorm(config.embed_dim[-1])
self.classifier = (
nn.Linear(config.embed_dim[-1], config.num_labels) if config.num_labels > 0 else nn.Identity()
)
self.post_init()
@add_start_docstrings_to_model_forward(CVT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=ImageClassifierOutputWithNoAttention,
config_class=_CONFIG_FOR_DOC,
expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
)
def forward(
self,
pixel_values: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, ImageClassifierOutputWithNoAttention]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.cvt(
pixel_values,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
cls_token = outputs[1]
if self.config.cls_token[-1]:
sequence_output = self.layernorm(cls_token)
else:
batch_size, num_channels, height, width = sequence_output.shape
sequence_output = sequence_output.view(batch_size, num_channels, height * width).permute(0, 2, 1)
sequence_output = self.layernorm(sequence_output)
sequence_output_mean = sequence_output.mean(dim=1)
logits = self.classifier(sequence_output_mean)
loss = None
if labels is not None:
if self.config.problem_type is None:
if self.config.num_labels == 1:
self.config.problem_type = "regression"
elif self.config.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
self.config.problem_type = "single_label_classification"
else:
self.config.problem_type = "multi_label_classification"
if self.config.problem_type == "regression":
loss_fct = MSELoss()
if self.config.num_labels == 1:
loss = loss_fct(logits.squeeze(), labels.squeeze())
else:
loss = loss_fct(logits, labels)
elif self.config.problem_type == "single_label_classification":
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(logits, labels)
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return ImageClassifierOutputWithNoAttention(loss=loss, logits=logits, hidden_states=outputs.hidden_states)
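# The classification head in use (again a sketch assuming network access): with the microsoft/cvt-13
# checkpoint the predicted ImageNet class for the COCO cats image is the documented "tabby, tabby cat".
import requests
import torch
from PIL import Image
from transformers import AutoImageProcessor, CvtForImageClassification

image = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
processor = AutoImageProcessor.from_pretrained("microsoft/cvt-13")
model = CvtForImageClassification.from_pretrained("microsoft/cvt-13")
with torch.no_grad():
    logits = model(**processor(images=image, return_tensors="pt")).logits
print(model.config.id2label[logits.argmax(-1).item()])  # "tabby, tabby cat"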
.\models\cvt\modeling_tf_cvt.py
from __future__ import annotations

import collections.abc
from dataclasses import dataclass
from typing import Optional, Tuple, Union
import tensorflow as tf
from ...modeling_tf_outputs import TFImageClassifierOutputWithNoAttention
from ...modeling_tf_utils import (
TFModelInputType,
TFPreTrainedModel,
TFSequenceClassificationLoss,
get_initializer,
keras,
keras_serializable,
unpack_inputs,
)
from ...tf_utils import shape_list, stable_softmax
from ...utils import (
ModelOutput,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_cvt import CvtConfig
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "CvtConfig"
TF_CVT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"microsoft/cvt-13",
"microsoft/cvt-13-384",
"microsoft/cvt-13-384-22k",
"microsoft/cvt-21",
"microsoft/cvt-21-384",
"microsoft/cvt-21-384-22k",
]
@dataclass
class TFBaseModelOutputWithCLSToken(ModelOutput):
"""
    Base class for model's outputs.

    Args:
        last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        cls_token_value (`tf.Tensor` of shape `(batch_size, 1, hidden_size)`):
            Classification token at the output of the last layer of the model.
        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus
            the initial embedding outputs.
"""
last_hidden_state: tf.Tensor = None
cls_token_value: tf.Tensor = None
hidden_states: Tuple[tf.Tensor, ...] | None = None
class TFCvtDropPath(keras.layers.Layer):
"""在残差块的主路径上对每个样本进行辍学(随机深度)。
参考:(1) github.com:rwightman/pytorch-image-models
"""
def __init__(self, drop_prob: float, **kwargs):
super().__init__(**kwargs)
self.drop_prob = drop_prob
def call(self, x: tf.Tensor, training=None):
if self.drop_prob == 0.0 or not training:
return x
keep_prob = 1 - self.drop_prob
shape = (tf.shape(x)[0],) + (1,) * (len(tf.shape(x)) - 1)
random_tensor = keep_prob + tf.random.uniform(shape, 0, 1, dtype=self.compute_dtype)
random_tensor = tf.floor(random_tensor)
return (x / keep_prob) * random_tensor
class TFCvtEmbeddings(keras.layers.Layer):
"""Construct the Convolutional Token Embeddings."""
def __init__(
self,
config: CvtConfig,
patch_size: int,
num_channels: int,
embed_dim: int,
stride: int,
padding: int,
dropout_rate: float,
**kwargs,
):
super().__init__(**kwargs)
self.convolution_embeddings = TFCvtConvEmbeddings(
config,
patch_size=patch_size,
num_channels=num_channels,
embed_dim=embed_dim,
stride=stride,
padding=padding,
name="convolution_embeddings",
)
self.dropout = keras.layers.Dropout(dropout_rate)
def call(self, pixel_values: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_state = self.convolution_embeddings(pixel_values)
hidden_state = self.dropout(hidden_state, training=training)
return hidden_state
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "convolution_embeddings", None) is not None:
with tf.name_scope(self.convolution_embeddings.name):
self.convolution_embeddings.build(None)
class TFCvtConvEmbeddings(keras.layers.Layer):
"""Image to Convolution Embeddings. This convolutional operation aims to model local spatial contexts."""
def __init__(
self,
config: CvtConfig,
patch_size: int,
num_channels: int,
embed_dim: int,
stride: int,
padding: int,
**kwargs,
):
super().__init__(**kwargs)
self.padding = keras.layers.ZeroPadding2D(padding=padding)
self.patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
self.projection = keras.layers.Conv2D(
filters=embed_dim,
kernel_size=patch_size,
strides=stride,
padding="valid",
data_format="channels_last",
kernel_initializer=get_initializer(config.initializer_range),
name="projection",
)
self.normalization = keras.layers.LayerNormalization(epsilon=1e-5, name="normalization")
self.num_channels = num_channels
self.embed_dim = embed_dim
def call(self, pixel_values: tf.Tensor) -> tf.Tensor:
if isinstance(pixel_values, dict):
pixel_values = pixel_values["pixel_values"]
pixel_values = self.projection(self.padding(pixel_values))
batch_size, height, width, num_channels = shape_list(pixel_values)
hidden_size = height * width
pixel_values = tf.reshape(pixel_values, shape=(batch_size, hidden_size, num_channels))
pixel_values = self.normalization(pixel_values)
pixel_values = tf.reshape(pixel_values, shape=(batch_size, height, width, num_channels))
return pixel_values
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "projection", None) is not None:
with tf.name_scope(self.projection.name):
self.projection.build([None, None, None, self.num_channels])
if getattr(self, "normalization", None) is not None:
with tf.name_scope(self.normalization.name):
self.normalization.build([None, None, self.embed_dim])
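# Unlike the PyTorch CvtConvEmbeddings (NCHW), the TF layer works in channels_last (NHWC) layout, so
# the stage-0 output for a 224x224 input is (1, 56, 56, 64) rather than (1, 64, 56, 56). A small
# hypothetical shape check:
config = CvtConfig()
embeddings = TFCvtConvEmbeddings(config, patch_size=7, num_channels=3, embed_dim=64, stride=4, padding=2)
print(embeddings(tf.random.normal((1, 224, 224, 3))).shape)  # (1, 56, 56, 64)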
class TFCvtSelfAttentionConvProjection(keras.layers.Layer):
"""Convolutional projection layer."""
def __init__(self, config: CvtConfig, embed_dim: int, kernel_size: int, stride: int, padding: int, **kwargs):
super().__init__(**kwargs)
self.padding = keras.layers.ZeroPadding2D(padding=padding)
self.convolution = keras.layers.Conv2D(
filters=embed_dim,
kernel_size=kernel_size,
kernel_initializer=get_initializer(config.initializer_range),
padding="valid",
strides=stride,
use_bias=False,
name="convolution",
groups=embed_dim,
)
self.normalization = keras.layers.BatchNormalization(epsilon=1e-5, momentum=0.9, name="normalization")
self.embed_dim = embed_dim
def call(self, hidden_state: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_state = self.convolution(self.padding(hidden_state))
hidden_state = self.normalization(hidden_state, training=training)
return hidden_state
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "convolution", None) is not None:
with tf.name_scope(self.convolution.name):
self.convolution.build([None, None, None, self.embed_dim])
if getattr(self, "normalization", None) is not None:
with tf.name_scope(self.normalization.name):
self.normalization.build([None, None, None, self.embed_dim])
class TFCvtSelfAttentionLinearProjection(keras.layers.Layer):
"""Linear projection layer used to flatten tokens into 1D."""
def call(self, hidden_state: tf.Tensor) -> tf.Tensor:
batch_size, height, width, num_channels = shape_list(hidden_state)
hidden_size = height * width
hidden_state = tf.reshape(hidden_state, shape=(batch_size, hidden_size, num_channels))
return hidden_state
class TFCvtSelfAttentionProjection(keras.layers.Layer):
"""Convolutional Projection for Attention."""
def __init__(
self,
config: CvtConfig,
embed_dim: int,
kernel_size: int,
stride: int,
padding: int,
projection_method: str = "dw_bn",
**kwargs,
):
super().__init__(**kwargs)
if projection_method == "dw_bn":
self.convolution_projection = TFCvtSelfAttentionConvProjection(
config, embed_dim, kernel_size, stride, padding, name="convolution_projection"
)
self.linear_projection = TFCvtSelfAttentionLinearProjection()
def call(self, hidden_state: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_state = self.convolution_projection(hidden_state, training=training)
hidden_state = self.linear_projection(hidden_state)
return hidden_state
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "convolution_projection", None) is not None:
with tf.name_scope(self.convolution_projection.name):
self.convolution_projection.build(None)
class TFCvtSelfAttention(keras.layers.Layer):
"""
Self-attention layer. A depth-wise separable convolution operation (Convolutional Projection), is applied for
query, key, and value embeddings.
"""
def __init__(
self,
config: CvtConfig,
num_heads: int,
embed_dim: int,
kernel_size: int,
stride_q: int,
stride_kv: int,
padding_q: int,
padding_kv: int,
qkv_projection_method: str,
qkv_bias: bool,
attention_drop_rate: float,
with_cls_token: bool = True,
**kwargs,
):
super().__init__(**kwargs)
self.scale = embed_dim**-0.5
self.with_cls_token = with_cls_token
self.embed_dim = embed_dim
self.num_heads = num_heads
self.convolution_projection_query = TFCvtSelfAttentionProjection(
config,
embed_dim,
kernel_size,
stride_q,
padding_q,
projection_method="linear" if qkv_projection_method == "avg" else qkv_projection_method,
name="convolution_projection_query",
)
self.convolution_projection_key = TFCvtSelfAttentionProjection(
config,
embed_dim,
kernel_size,
stride_kv,
padding_kv,
projection_method=qkv_projection_method,
name="convolution_projection_key",
)
self.convolution_projection_value = TFCvtSelfAttentionProjection(
config,
embed_dim,
kernel_size,
stride_kv,
padding_kv,
projection_method=qkv_projection_method,
name="convolution_projection_value",
)
self.projection_query = keras.layers.Dense(
units=embed_dim,
kernel_initializer=get_initializer(config.initializer_range),
use_bias=qkv_bias,
bias_initializer="zeros",
name="projection_query",
)
self.projection_key = keras.layers.Dense(
units=embed_dim,
kernel_initializer=get_initializer(config.initializer_range),
use_bias=qkv_bias,
bias_initializer="zeros",
name="projection_key",
)
self.projection_value = keras.layers.Dense(
units=embed_dim,
kernel_initializer=get_initializer(config.initializer_range),
use_bias=qkv_bias,
bias_initializer="zeros",
name="projection_value",
)
self.dropout = keras.layers.Dropout(attention_drop_rate)
def rearrange_for_multi_head_attention(self, hidden_state: tf.Tensor) -> tf.Tensor:
batch_size, hidden_size, _ = shape_list(hidden_state)
head_dim = self.embed_dim // self.num_heads
hidden_state = tf.reshape(hidden_state, shape=(batch_size, hidden_size, self.num_heads, head_dim))
hidden_state = tf.transpose(hidden_state, perm=(0, 2, 1, 3))
return hidden_state
def call(self, hidden_state: tf.Tensor, height: int, width: int, training: bool = False) -> tf.Tensor:
if self.with_cls_token:
cls_token, hidden_state = tf.split(hidden_state, [1, height * width], 1)
batch_size, hidden_size, num_channels = shape_list(hidden_state)
hidden_state = tf.reshape(hidden_state, shape=(batch_size, height, width, num_channels))
key = self.convolution_projection_key(hidden_state, training=training)
query = self.convolution_projection_query(hidden_state, training=training)
value = self.convolution_projection_value(hidden_state, training=training)
if self.with_cls_token:
query = tf.concat((cls_token, query), axis=1)
key = tf.concat((cls_token, key), axis=1)
value = tf.concat((cls_token, value), axis=1)
head_dim = self.embed_dim // self.num_heads
query = self.rearrange_for_multi_head_attention(self.projection_query(query))
key = self.rearrange_for_multi_head_attention(self.projection_key(key))
value = self.rearrange_for_multi_head_attention(self.projection_value(value))
attention_score = tf.matmul(query, key, transpose_b=True) * self.scale
attention_probs = stable_softmax(logits=attention_score, axis=-1)
attention_probs = self.dropout(attention_probs, training=training)
context = tf.matmul(attention_probs, value)
_, _, hidden_size, _ = shape_list(context)
context = tf.transpose(context, perm=(0, 2, 1, 3))
context = tf.reshape(context, (batch_size, hidden_size, self.num_heads * head_dim))
return context
    def build(self, input_shape=None):
        if self.built:
return
self.built = True
if getattr(self, "convolution_projection_query", None) is not None:
with tf.name_scope(self.convolution_projection_query.name):
self.convolution_projection_query.build(None)
if getattr(self, "convolution_projection_key", None) is not None:
with tf.name_scope(self.convolution_projection_key.name):
self.convolution_projection_key.build(None)
if getattr(self, "convolution_projection_value", None) is not None:
with tf.name_scope(self.convolution_projection_value.name):
self.convolution_projection_value.build(None)
if getattr(self, "projection_query", None) is not None:
with tf.name_scope(self.projection_query.name):
self.projection_query.build([None, None, self.embed_dim])
if getattr(self, "projection_key", None) is not None:
with tf.name_scope(self.projection_key.name):
self.projection_key.build([None, None, self.embed_dim])
if getattr(self, "projection_value", None) is not None:
with tf.name_scope(self.projection_value.name):
self.projection_value.build([None, None, self.embed_dim])
class TFCvtSelfOutput(keras.layers.Layer):
"""Output of the Attention layer ."""
def __init__(self, config: CvtConfig, embed_dim: int, drop_rate: float, **kwargs):
super().__init__(**kwargs)
self.dense = keras.layers.Dense(
units=embed_dim, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
self.dropout = keras.layers.Dropout(drop_rate)
self.embed_dim = embed_dim
def call(self, hidden_state: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_state = self.dense(inputs=hidden_state)
hidden_state = self.dropout(inputs=hidden_state, training=training)
return hidden_state
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.embed_dim])
class TFCvtAttention(keras.layers.Layer):
"""Attention layer. First chunk of the convolutional transformer block."""
def __init__(
self,
config: CvtConfig,
num_heads: int,
embed_dim: int,
kernel_size: int,
stride_q: int,
stride_kv: int,
padding_q: int,
padding_kv: int,
qkv_projection_method: str,
qkv_bias: bool,
attention_drop_rate: float,
drop_rate: float,
with_cls_token: bool = True,
**kwargs,
):
super().__init__(**kwargs)
self.attention = TFCvtSelfAttention(
config,
num_heads,
embed_dim,
kernel_size,
stride_q,
stride_kv,
padding_q,
padding_kv,
qkv_projection_method,
qkv_bias,
attention_drop_rate,
with_cls_token,
name="attention",
)
self.dense_output = TFCvtSelfOutput(config, embed_dim, drop_rate, name="output")
def prune_heads(self, heads):
raise NotImplementedError
def call(self, hidden_state: tf.Tensor, height: int, width: int, training: bool = False):
self_output = self.attention(hidden_state, height, width, training=training)
attention_output = self.dense_output(self_output, training=training)
return attention_output
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "attention", None) is not None:
with tf.name_scope(self.attention.name):
self.attention.build(None)
if getattr(self, "dense_output", None) is not None:
with tf.name_scope(self.dense_output.name):
self.dense_output.build(None)
class TFCvtIntermediate(keras.layers.Layer):
"""Intermediate dense layer. Second chunk of the convolutional transformer block."""
def __init__(self, config: CvtConfig, embed_dim: int, mlp_ratio: int, **kwargs):
super().__init__(**kwargs)
self.dense = keras.layers.Dense(
units=int(embed_dim * mlp_ratio),
kernel_initializer=get_initializer(config.initializer_range),
activation="gelu",
name="dense",
)
self.embed_dim = embed_dim
def call(self, hidden_state: tf.Tensor) -> tf.Tensor:
hidden_state = self.dense(hidden_state)
return hidden_state
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.embed_dim])
class TFCvtOutput(keras.layers.Layer):
    """
    Output of the Convolutional Transformer Block (last chunk). It consists of a MLP and a residual connection.
    """

    def __init__(self, config: CvtConfig, embed_dim: int, mlp_ratio: int, drop_rate: float, **kwargs):
        super().__init__(**kwargs)
        self.dense = keras.layers.Dense(
            units=embed_dim, kernel_initializer=get_initializer(config.initializer_range), name="dense"
        )
        self.dropout = keras.layers.Dropout(drop_rate)
        self.embed_dim = embed_dim
        self.mlp_ratio = mlp_ratio

    def call(self, hidden_state: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
        hidden_state = self.dense(inputs=hidden_state)
        hidden_state = self.dropout(inputs=hidden_state, training=training)
        hidden_state = hidden_state + input_tensor
        return hidden_state

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        if getattr(self, "dense", None) is not None:
            with tf.name_scope(self.dense.name):
                self.dense.build([None, None, int(self.embed_dim * self.mlp_ratio)])


class TFCvtLayer(keras.layers.Layer):
    """
    Convolutional Transformer Block composed by attention layers, normalization and multi-layer perceptrons (mlps).
    It consists of 3 chunks: an attention layer, an intermediate dense layer and an output layer.
    """

    def __init__(
        self,
        config: CvtConfig,
        num_heads: int,
        embed_dim: int,
        kernel_size: int,
        stride_q: int,
        stride_kv: int,
        padding_q: int,
        padding_kv: int,
        qkv_projection_method: str,
        qkv_bias: bool,
        attention_drop_rate: float,
        drop_rate: float,
        mlp_ratio: float,
        drop_path_rate: float,
        with_cls_token: bool = True,
        **kwargs,
    ):
super().__init__(**kwargs)
self.attention = TFCvtAttention(
config,
num_heads,
embed_dim,
kernel_size,
stride_q,
stride_kv,
padding_q,
padding_kv,
qkv_projection_method,
qkv_bias,
attention_drop_rate,
drop_rate,
with_cls_token,
name="attention",
)
self.intermediate = TFCvtIntermediate(config, embed_dim, mlp_ratio, name="intermediate")
self.dense_output = TFCvtOutput(config, embed_dim, mlp_ratio, drop_rate, name="output")
self.drop_path = (
TFCvtDropPath(drop_path_rate, name="drop_path")
if drop_path_rate > 0.0
else keras.layers.Activation("linear", name="drop_path")
)
self.layernorm_before = keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_before")
self.layernorm_after = keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_after")
self.embed_dim = embed_dim
def call(self, hidden_state: tf.Tensor, height: int, width: int, training: bool = False) -> tf.Tensor:
attention_output = self.attention(self.layernorm_before(hidden_state), height, width, training=training)
attention_output = self.drop_path(attention_output, training=training)
hidden_state = attention_output + hidden_state
layer_output = self.layernorm_after(hidden_state)
layer_output = self.intermediate(layer_output)
layer_output = self.dense_output(layer_output, hidden_state)
layer_output = self.drop_path(layer_output, training=training)
return layer_output
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "attention", None) is not None:
with tf.name_scope(self.attention.name):
self.attention.build(None)
if getattr(self, "intermediate", None) is not None:
with tf.name_scope(self.intermediate.name):
self.intermediate.build(None)
if getattr(self, "dense_output", None) is not None:
with tf.name_scope(self.dense_output.name):
self.dense_output.build(None)
if getattr(self, "drop_path", None) is not None:
with tf.name_scope(self.drop_path.name):
self.drop_path.build(None)
if getattr(self, "layernorm_before", None) is not None:
with tf.name_scope(self.layernorm_before.name):
self.layernorm_before.build([None, None, self.embed_dim])
if getattr(self, "layernorm_after", None) is not None:
with tf.name_scope(self.layernorm_after.name):
self.layernorm_after.build([None, None, self.embed_dim])
"""
Cvt stage (encoder block). Each stage has 2 parts :
- (1) A Convolutional Token Embedding layer
- (2) A Convolutional Transformer Block (layer).
The classification token is added only in the last stage.
Args:
config ([`CvtConfig`]): Model configuration class.
stage (`int`): Stage number.
"""
def __init__(self, config: CvtConfig, stage: int, **kwargs):
super().__init__(**kwargs)
self.config = config
self.stage = stage
if self.config.cls_token[self.stage]:
self.cls_token = self.add_weight(
shape=(1, 1, self.config.embed_dim[-1]),
initializer=get_initializer(self.config.initializer_range),
trainable=True,
name="cvt.encoder.stages.2.cls_token",
)
self.embedding = TFCvtEmbeddings(
self.config,
patch_size=config.patch_sizes[self.stage],
num_channels=config.num_channels if self.stage == 0 else config.embed_dim[self.stage - 1],
stride=config.patch_stride[self.stage],
embed_dim=config.embed_dim[self.stage],
padding=config.patch_padding[self.stage],
dropout_rate=config.drop_rate[self.stage],
name="embedding",
)
drop_path_rates = tf.linspace(0.0, config.drop_path_rate[self.stage], config.depth[stage])
drop_path_rates = [x.numpy().item() for x in drop_path_rates]
self.layers = [
TFCvtLayer(
config,
num_heads=config.num_heads[self.stage],
embed_dim=config.embed_dim[self.stage],
kernel_size=config.kernel_qkv[self.stage],
stride_q=config.stride_q[self.stage],
stride_kv=config.stride_kv[self.stage],
padding_q=config.padding_q[self.stage],
padding_kv=config.padding_kv[self.stage],
qkv_projection_method=config.qkv_projection_method[self.stage],
qkv_bias=config.qkv_bias[self.stage],
attention_drop_rate=config.attention_drop_rate[self.stage],
drop_rate=config.drop_rate[self.stage],
mlp_ratio=config.mlp_ratio[self.stage],
drop_path_rate=drop_path_rates[self.stage],
with_cls_token=config.cls_token[self.stage],
name=f"layers.{j}",
)
for j in range(config.depth[self.stage])
]
def call(self, hidden_state: tf.Tensor, training: bool = False):
cls_token = None
hidden_state = self.embedding(hidden_state, training)
batch_size, height, width, num_channels = shape_list(hidden_state)
hidden_size = height * width
hidden_state = tf.reshape(hidden_state, shape=(batch_size, hidden_size, num_channels))
if self.config.cls_token[self.stage]:
cls_token = tf.repeat(self.cls_token, repeats=batch_size, axis=0)
hidden_state = tf.concat((cls_token, hidden_state), axis=1)
for layer in self.layers:
layer_outputs = layer(hidden_state, height, width, training=training)
hidden_state = layer_outputs
if self.config.cls_token[self.stage]:
cls_token, hidden_state = tf.split(hidden_state, [1, height * width], 1)
hidden_state = tf.reshape(hidden_state, shape=(batch_size, height, width, num_channels))
return hidden_state, cls_token
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embedding", None) is not None:
with tf.name_scope(self.embedding.name):
self.embedding.build(None)
if getattr(self, "layers", None) is not None:
for layer in self.layers:
with tf.name_scope(layer.name):
layer.build(None)
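上面 TFCvtStage 中的 `drop_path_rates` 由 `tf.linspace` 在 0 到 `config.drop_path_rate[stage]` 之间等距取 `depth` 个值;注意随后构造 `TFCvtLayer` 时索引的是 `drop_path_rates[self.stage]`。下面是一个最小示意(数值为假设的示例,并非真实配置),仅演示这段取值逻辑:
```
import tensorflow as tf

depth, stage_drop_path_rate, stage = 10, 0.1, 2  # 假设:第 3 个 stage,depth=10,drop_path_rate=0.1
drop_path_rates = [x.numpy().item() for x in tf.linspace(0.0, stage_drop_path_rate, depth)]
print([round(r, 3) for r in drop_path_rates])  # 从 0.0 线性递增到 0.1 的 10 个值
print(drop_path_rates[stage])                  # 按上面代码的写法,本 stage 的每一层都取这个值
```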
class TFCvtEncoder(keras.layers.Layer):
"""
Convolutional Vision Transformer encoder. CVT has 3 stages of encoder blocks with their respective number of layers
(depth) being 1, 2 and 10.
Args:
config ([`CvtConfig`]): Model configuration class.
"""
config_class = CvtConfig
def __init__(self, config: CvtConfig, **kwargs):
super().__init__(**kwargs)
self.config = config
self.stages = [
TFCvtStage(config, stage_idx, name=f"stages.{stage_idx}") for stage_idx in range(len(config.depth))
]
def call(
self,
pixel_values: TFModelInputType,
output_hidden_states: Optional[bool] = False,
return_dict: Optional[bool] = True,
training: Optional[bool] = False,
) -> Union[TFBaseModelOutputWithCLSToken, Tuple[tf.Tensor]]:
all_hidden_states = () if output_hidden_states else None
hidden_state = pixel_values
hidden_state = tf.transpose(hidden_state, perm=(0, 2, 3, 1))
cls_token = None
for _, (stage_module) in enumerate(self.stages):
hidden_state, cls_token = stage_module(hidden_state, training=training)
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_state,)
hidden_state = tf.transpose(hidden_state, perm=(0, 3, 1, 2))
if output_hidden_states:
all_hidden_states = tuple([tf.transpose(hs, perm=(0, 3, 1, 2)) for hs in all_hidden_states])
if not return_dict:
return tuple(v for v in [hidden_state, cls_token, all_hidden_states] if v is not None)
return TFBaseModelOutputWithCLSToken(
last_hidden_state=hidden_state,
cls_token_value=cls_token,
hidden_states=all_hidden_states,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "stages", None) is not None:
for layer in self.stages:
with tf.name_scope(layer.name):
layer.build(None)
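TFCvtEncoder 依次调用三个 stage:每个 stage 先用卷积 token embedding 按 `patch_stride` 下采样,再经过若干个卷积 Transformer block。下面以 CvT-13 的默认配置为例做一个粗略估算(假设 `patch_stride=[4, 2, 2]`、输入边长为 224,这里忽略 padding/kernel 的细节),各 stage 输出特征图的边长如下:
```
input_size, strides = 224, [4, 2, 2]  # 假设的默认输入尺寸与各 stage 的 patch_stride
sizes = []
for s in strides:
    input_size //= s
    sizes.append(input_size)
print(sizes)  # [56, 28, 14]:空间分辨率逐级减小,而通道数(embed_dim)逐级增大
```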
@keras_serializable
class TFCvtMainLayer(keras.layers.Layer):
"""Construct the Cvt model."""
config_class = CvtConfig
def __init__(self, config: CvtConfig, **kwargs):
super().__init__(**kwargs)
self.config = config
self.encoder = TFCvtEncoder(config, name="encoder")
@unpack_inputs
def call(
self,
pixel_values: TFModelInputType | None = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: Optional[bool] = False,
) -> Union[TFBaseModelOutputWithCLSToken, Tuple[tf.Tensor]]:
if pixel_values is None:
raise ValueError("You have to specify pixel_values")
encoder_outputs = self.encoder(
pixel_values,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
sequence_output = encoder_outputs[0]
if not return_dict:
return (sequence_output,) + encoder_outputs[1:]
return TFBaseModelOutputWithCLSToken(
last_hidden_state=sequence_output,
cls_token_value=encoder_outputs.cls_token_value,
hidden_states=encoder_outputs.hidden_states,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
class TFCvtPreTrainedModel(TFPreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = CvtConfig
base_model_prefix = "cvt"
main_input_name = "pixel_values"
TFCVT_START_DOCSTRING = r"""
This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
behavior.
<Tip>
TF 2.0 models accept two formats as inputs:
- having all inputs as keyword arguments (like PyTorch models), or
- having all inputs as a list, tuple or dict in the first positional arguments.
This second option is useful when using [`keras.Model.fit`] method which currently requires having all the
tensors in the first argument of the model call function: `model(inputs)`.
</Tip>
Args:
config ([`CvtConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights.
"""
TFCVT_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]`, `Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]`, and each example must have the shape `(batch_size, num_channels, height, width)`):
Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`CvtImageProcessor.__call__`]
for details.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
used instead.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in
eager mode, in graph mode the value will always be set to True.
training (`bool`, *optional*, defaults to `False`):
Whether or not to use the model in training mode (some modules like dropout modules have different
behaviors between training and evaluation).
"""
@add_start_docstrings(
"The bare Cvt Model transformer outputting raw hidden-states without any specific head on top.",
TFCVT_START_DOCSTRING,
)
class TFCvtModel(TFCvtPreTrainedModel):
def __init__(self, config: CvtConfig, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.cvt = TFCvtMainLayer(config, name="cvt")
@unpack_inputs
@add_start_docstrings_to_model_forward(TFCVT_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=TFBaseModelOutputWithCLSToken, config_class=_CONFIG_FOR_DOC)
def call(
self,
pixel_values: tf.Tensor | None = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: Optional[bool] = False,
) -> Union[TFBaseModelOutputWithCLSToken, Tuple[tf.Tensor]]:
r"""
模型的前向传播方法,接收输入像素值和一些可选参数,返回模型输出。
Args:
pixel_values (tf.Tensor | None): 输入像素值的张量,可以为 None。
output_hidden_states (Optional[bool]): 是否输出隐藏状态,默认为 None。
return_dict (Optional[bool]): 是否返回字典格式的输出,默认为 None。
training (Optional[bool]): 是否在训练模式下,默认为 False。
Returns:
Union[TFBaseModelOutputWithCLSToken, Tuple[tf.Tensor]]: 根据 return_dict 参数返回不同类型的输出。
Examples:
```
>>> from transformers import AutoImageProcessor, TFCvtModel
>>> from PIL import Image
>>> import requests
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> image_processor = AutoImageProcessor.from_pretrained("microsoft/cvt-13")
>>> model = TFCvtModel.from_pretrained("microsoft/cvt-13")
>>> inputs = image_processor(images=image, return_tensors="tf")
>>> outputs = model(**inputs)
>>> last_hidden_states = outputs.last_hidden_state
```"""
if pixel_values is None:
raise ValueError("You have to specify pixel_values")
outputs = self.cvt(
pixel_values=pixel_values,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
if not return_dict:
return (outputs[0],) + outputs[1:]
return TFBaseModelOutputWithCLSToken(
last_hidden_state=outputs.last_hidden_state,
cls_token_value=outputs.cls_token_value,
hidden_states=outputs.hidden_states,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "cvt", None) is not None:
with tf.name_scope(self.cvt.name):
self.cvt.build(None)
"""
Cvt Model transformer with an image classification head on top (a linear layer on top of the final hidden state of
the [CLS] token) e.g. for ImageNet.
"""
TFCvtForImageClassification(TFCvtPreTrainedModel, TFSequenceClassificationLoss):
def __init__(self, config: CvtConfig, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
self.cvt = TFCvtMainLayer(config, name="cvt")
self.layernorm = keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm")
self.classifier = keras.layers.Dense(
units=config.num_labels,
kernel_initializer=get_initializer(config.initializer_range),
use_bias=True,
bias_initializer="zeros",
name="classifier",
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(TFCVT_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=TFImageClassifierOutputWithNoAttention, config_class=_CONFIG_FOR_DOC)
def call(
self,
pixel_values: tf.Tensor | None = None,
labels: tf.Tensor | None = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: Optional[bool] = False,
) -> Union[TFImageClassifierOutputWithNoAttention, Tuple[tf.Tensor]]:
outputs = self.cvt(
pixel_values,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
sequence_output = outputs[0]
cls_token = outputs[1]
if self.config.cls_token[-1]:
sequence_output = self.layernorm(cls_token)
else:
batch_size, num_channels, height, width = shape_list(sequence_output)
sequence_output = tf.reshape(sequence_output, shape=(batch_size, num_channels, height * width))
sequence_output = tf.transpose(sequence_output, perm=(0, 2, 1))
sequence_output = self.layernorm(sequence_output)
sequence_output_mean = tf.reduce_mean(sequence_output, axis=1)
logits = self.classifier(sequence_output_mean)
loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=logits)
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return TFImageClassifierOutputWithNoAttention(loss=loss, logits=logits, hidden_states=outputs.hidden_states)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "cvt", None) is not None:
with tf.name_scope(self.cvt.name):
self.cvt.build(None)
if getattr(self, "layernorm", None) is not None:
with tf.name_scope(self.layernorm.name):
self.layernorm.build([None, None, self.config.embed_dim[-1]])
if getattr(self, "classifier", None) is not None:
if hasattr(self.classifier, "name"):
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.embed_dim[-1]])
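与上面 `TFCvtModel` 文档字符串中的示例类似,下面给出一个示意性的图像分类用法,假设使用在 ImageNet-1k 上微调过的 `microsoft/cvt-13` 检查点;按前面 `call` 的逻辑,模型取最后一个 stage 的 cls token,经 LayerNorm 后送入线性分类头:
```
import tensorflow as tf
import requests
from PIL import Image
from transformers import AutoImageProcessor, TFCvtForImageClassification

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

image_processor = AutoImageProcessor.from_pretrained("microsoft/cvt-13")
model = TFCvtForImageClassification.from_pretrained("microsoft/cvt-13")

inputs = image_processor(images=image, return_tensors="tf")
logits = model(**inputs).logits                # 形状为 (batch_size, num_labels)
predicted_label = int(tf.math.argmax(logits, axis=-1))
print(model.config.id2label[predicted_label])  # 预期输出某个 ImageNet 类别名
```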
.\models\cvt\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_tf_available, is_torch_available
_import_structure = {"configuration_cvt": ["CVT_PRETRAINED_CONFIG_ARCHIVE_MAP", "CvtConfig"]}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_cvt"] = [
"CVT_PRETRAINED_MODEL_ARCHIVE_LIST",
"CvtForImageClassification",
"CvtModel",
"CvtPreTrainedModel",
]
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_tf_cvt"] = [
"TF_CVT_PRETRAINED_MODEL_ARCHIVE_LIST",
"TFCvtForImageClassification",
"TFCvtModel",
"TFCvtPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_cvt import CVT_PRETRAINED_CONFIG_ARCHIVE_MAP, CvtConfig
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_cvt import (
CVT_PRETRAINED_MODEL_ARCHIVE_LIST,
CvtForImageClassification,
CvtModel,
CvtPreTrainedModel,
)
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_tf_cvt import (
TF_CVT_PRETRAINED_MODEL_ARCHIVE_LIST,
TFCvtForImageClassification,
TFCvtModel,
TFCvtPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\data2vec\configuration_data2vec_audio.py
""" Data2VecText configuration"""
import math
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
DATA2VEC_AUDIO_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"facebook/data2vec-base-960h": "https://huggingface.co/facebook/data2vec-audio-base-960h/resolve/main/config.json",
}
class Data2VecAudioConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`Data2VecAudioModel`]. It is used to instantiate
an Data2VecAudio model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the Data2VecAudio
[facebook/data2vec-audio-base-960h](https://huggingface.co/facebook/data2vec-audio-base-960h) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Example:
```
>>> from transformers import Data2VecAudioConfig, Data2VecAudioModel
>>> # Initializing a Data2VecAudio facebook/data2vec-audio-base-960h style configuration
>>> configuration = Data2VecAudioConfig()
>>> # Initializing a model (with random weights) from the facebook/data2vec-audio-base-960h style configuration
>>> model = Data2VecAudioModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "data2vec-audio"
def __init__(
self,
vocab_size=32,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act="gelu",
hidden_dropout=0.1,
activation_dropout=0.1,
attention_dropout=0.1,
feat_proj_dropout=0.0,
final_dropout=0.1,
layerdrop=0.1,
initializer_range=0.02,
layer_norm_eps=1e-5,
feat_extract_activation="gelu",
conv_dim=(512, 512, 512, 512, 512, 512, 512),
conv_stride=(5, 2, 2, 2, 2, 2, 2),
conv_kernel=(10, 3, 3, 3, 3, 2, 2),
conv_bias=False,
num_conv_pos_embedding_groups=16,
conv_pos_kernel_size=19,
num_conv_pos_embeddings=5,
mask_time_prob=0.05,
mask_time_length=10,
mask_time_min_masks=2,
mask_feature_prob=0.0,
mask_feature_length=10,
mask_feature_min_masks=0,
ctc_loss_reduction="sum",
ctc_zero_infinity=False,
use_weighted_layer_sum=False,
classifier_proj_size=256,
tdnn_dim=(512, 512, 512, 512, 1500),
tdnn_kernel=(5, 3, 3, 1, 1),
tdnn_dilation=(1, 2, 3, 1, 1),
xvector_output_dim=512,
pad_token_id=0,
bos_token_id=1,
eos_token_id=2,
add_adapter=False,
adapter_kernel_size=3,
adapter_stride=2,
num_adapter_layers=3,
output_hidden_size=None,
**kwargs,
):
@property
def inputs_to_logits_ratio(self):
return math.prod(self.conv_stride)
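`inputs_to_logits_ratio` 返回所有卷积步幅的乘积,表示多少个原始音频采样点对应一帧编码器输出(也即一帧 CTC logits)。按上面默认的 `conv_stride=(5, 2, 2, 2, 2, 2, 2)` 做一个简单的数值演算(假设音频采样率为 16kHz):
```
import math

conv_stride = (5, 2, 2, 2, 2, 2, 2)  # 默认配置中的卷积步幅
ratio = math.prod(conv_stride)
print(ratio)                          # 320:每 320 个采样点产生 1 帧输出
print(ratio / 16000 * 1000)           # 20.0:在 16kHz 采样率下约等于每帧 20 毫秒
```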
.\models\data2vec\configuration_data2vec_text.py
""" Data2VecText configuration"""
from collections import OrderedDict
from typing import Mapping
from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfig
from ...utils import logging
logger = logging.get_logger(__name__)
DATA2VEC_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"facebook/data2vec-text-base": "https://huggingface.co/data2vec/resolve/main/config.json",
}
class Data2VecTextConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`Data2VecTextModel`] and [`Data2VecTextModel`]. It
is used to instantiate a Data2VecText model according to the specified arguments, defining the model architecture.
Instantiating a configuration with the defaults will yield a similar configuration to that of the Data2VecText
[facebook/data2vec-text-base](https://huggingface.co/facebook/data2vec-text-base) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
"""
model_type = "data2vec-text"
def __init__(
self,
vocab_size=30522,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=2,
initializer_range=0.02,
layer_norm_eps=1e-12,
pad_token_id=1,
bos_token_id=0,
eos_token_id=2,
position_embedding_type="absolute",
use_cache=True,
classifier_dropout=None,
**kwargs,
):
super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.hidden_act = hidden_act
self.intermediate_size = intermediate_size
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.type_vocab_size = type_vocab_size
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.position_embedding_type = position_embedding_type
self.use_cache = use_cache
self.classifier_dropout = classifier_dropout
class Data2VecTextOnnxConfig(OnnxConfig):
@property
def inputs(self) -> Mapping[str, Mapping[int, str]]:
if self.task == "multiple-choice":
dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"}
else:
dynamic_axis = {0: "batch", 1: "sequence"}
return OrderedDict(
[
("input_ids", dynamic_axis),
("attention_mask", dynamic_axis),
]
)
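`Data2VecTextOnnxConfig.inputs` 根据任务类型给出导出 ONNX 时各输入张量的动态维度。下面是一个最小示意(输出内容按上面的实现推断):
```
from transformers import Data2VecTextConfig
from transformers.models.data2vec.configuration_data2vec_text import Data2VecTextOnnxConfig

onnx_config = Data2VecTextOnnxConfig(Data2VecTextConfig())
print(onnx_config.inputs)
# 预期形如:OrderedDict([('input_ids', {0: 'batch', 1: 'sequence'}),
#                        ('attention_mask', {0: 'batch', 1: 'sequence'})])
```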
.\models\data2vec\configuration_data2vec_vision.py
""" Data2VecVision 模型配置 """
from collections import OrderedDict
from typing import Mapping
from packaging import version
from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfig
from ...utils import logging
logger = logging.get_logger(__name__)
DATA2VEC_VISION_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"facebook/data2vec-vision-base-ft": (
"https://huggingface.co/facebook/data2vec-vision-base-ft/resolve/main/config.json"
),
}
class Data2VecVisionConfig(PretrainedConfig):
r"""
这是用于存储 [`Data2VecVisionModel`] 配置的配置类。根据指定的参数实例化 Data2VecVision 模型,定义模型架构。
使用默认值实例化配置将产生类似于 Data2VecVision [facebook/data2vec-vision-base](https://huggingface.co/facebook/data2vec-vision-base) 架构的配置。
示例:
```
>>> from transformers import Data2VecVisionConfig, Data2VecVisionModel
>>> # 初始化一个 Data2VecVision data2vec_vision-base-patch16-224-in22k 风格的配置
>>> configuration = Data2VecVisionConfig()
>>> # 从上述配置初始化一个(带有随机权重)模型
>>> model = Data2VecVisionModel(configuration)
>>> # 访问模型配置
>>> configuration = model.config
```
"""
model_type = "data2vec-vision"
def __init__(
self,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act="gelu",
hidden_dropout_prob=0.0,
attention_probs_dropout_prob=0.0,
initializer_range=0.02,
layer_norm_eps=1e-12,
image_size=224,
patch_size=16,
num_channels=3,
use_mask_token=False,
use_absolute_position_embeddings=False,
use_relative_position_bias=False,
use_shared_relative_position_bias=False,
layer_scale_init_value=0.1,
drop_path_rate=0.1,
use_mean_pooling=True,
out_indices=[3, 5, 7, 11],
pool_scales=[1, 2, 3, 6],
use_auxiliary_head=True,
auxiliary_loss_weight=0.4,
auxiliary_channels=256,
auxiliary_num_convs=1,
auxiliary_concat_input=False,
semantic_loss_ignore_index=255,
**kwargs,
):
super().__init__(**kwargs)
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.hidden_act = hidden_act
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.image_size = image_size
self.patch_size = patch_size
self.num_channels = num_channels
self.use_mask_token = use_mask_token
self.use_absolute_position_embeddings = use_absolute_position_embeddings
self.use_relative_position_bias = use_relative_position_bias
self.use_shared_relative_position_bias = use_shared_relative_position_bias
self.layer_scale_init_value = layer_scale_init_value
self.drop_path_rate = drop_path_rate
self.use_mean_pooling = use_mean_pooling
self.out_indices = out_indices
self.pool_scales = pool_scales
self.use_auxiliary_head = use_auxiliary_head
self.auxiliary_loss_weight = auxiliary_loss_weight
self.auxiliary_channels = auxiliary_channels
self.auxiliary_num_convs = auxiliary_num_convs
self.auxiliary_concat_input = auxiliary_concat_input
self.semantic_loss_ignore_index = semantic_loss_ignore_index
class Data2VecVisionOnnxConfig(OnnxConfig):
torch_onnx_minimum_version = version.parse("1.11")
@property
def inputs(self) -> Mapping[str, Mapping[int, str]]:
return OrderedDict(
[
("pixel_values", {0: "batch", 1: "num_channels", 2: "height", 3: "width"}),
]
)
@property
def atol_for_validation(self) -> float:
return 1e-4
.\models\data2vec\convert_data2vec_audio_original_pytorch_checkpoint_to_pytorch.py
"""Convert Wav2Vec2 checkpoint."""
import argparse
import os
from functools import reduce
import fairseq
import torch
from datasets import load_dataset
from transformers import Wav2Vec2Processor, logging
from transformers.models.data2vec.configuration_data2vec_audio import Data2VecAudioConfig
from transformers.models.data2vec.data2vec_audio import Data2VecAudioModel as Dummy
from transformers.models.data2vec.modeling_data2vec_audio import Data2VecAudioForCTC, Data2VecAudioModel
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
MAPPING = {
"post_extract_proj": "feature_projection.projection",
"models.0.layer_norm": "feature_projection.layer_norm",
"self_attn.k_proj": "encoder.layers.*.attention.k_proj",
"self_attn.v_proj": "encoder.layers.*.attention.v_proj",
"self_attn.q_proj": "encoder.layers.*.attention.q_proj",
"self_attn.out_proj": "encoder.layers.*.attention.out_proj",
"self_attn_layer_norm": "encoder.layers.*.layer_norm",
"fc1": "encoder.layers.*.feed_forward.intermediate_dense",
"fc2": "encoder.layers.*.feed_forward.output_dense",
"final_layer_norm": "encoder.layers.*.final_layer_norm",
"encoder.layer_norm": "encoder.layer_norm",
"w2v_model.layer_norm": "feature_projection.layer_norm",
"w2v_encoder.proj": "lm_head",
"mask_emb": "masked_spec_embed",
}
TOP_LEVEL_KEYS = [
"lm_head",
]
def set_recursively(hf_pointer, key, value, full_name, weight_type):
for attribute in key.split("."):
hf_pointer = getattr(hf_pointer, attribute)
if weight_type is not None:
hf_shape = getattr(hf_pointer, weight_type).shape
else:
hf_shape = hf_pointer.shape
if hf_shape != value.shape:
raise ValueError(
f"Shape of hf {key + '.' + weight_type if weight_type is not None else ''} is {hf_shape}, but should be"
f" {value.shape} for {full_name}"
)
if weight_type == "weight":
hf_pointer.weight.data = value
elif weight_type == "weight_g":
hf_pointer.weight_g.data = value
elif weight_type == "weight_v":
hf_pointer.weight_v.data = value
elif weight_type == "bias":
hf_pointer.bias.data = value
else:
hf_pointer.data = value
logger.info(f"{key + '.' + weight_type if weight_type is not None else ''} was initialized from {full_name}.")
def recursively_load_weights(fairseq_model, hf_model, is_headless):
unused_weights = []
fairseq_dict = fairseq_model.state_dict()
if not is_headless:
feature_extractor = hf_model.data2vec_audio.feature_extractor
pos_conv_embedding = hf_model.data2vec_audio.encoder.pos_conv_embed
else:
feature_extractor = hf_model.feature_extractor
pos_conv_embedding = hf_model.encoder.pos_conv_embed
for name, value in fairseq_dict.items():
is_used = False
if "conv_layers" in name:
load_conv_layer(
name,
value,
feature_extractor,
unused_weights,
)
is_used = True
elif "pos_conv" in name:
load_pos_conv_layer(
name,
value,
pos_conv_embedding,
unused_weights,
)
is_used = True
else:
for key, mapped_key in MAPPING.items():
if not is_headless:
mapped_key = "data2vec_audio." + mapped_key if mapped_key not in TOP_LEVEL_KEYS else mapped_key
if key in name or key.split("w2v_model.")[-1] == name.split(".")[0]:
is_used = True
if "*" in mapped_key:
layer_index = name.split(key)[0].split(".")[-2]
mapped_key = mapped_key.replace("*", layer_index)
if "weight_g" in name:
weight_type = "weight_g"
elif "weight_v" in name:
weight_type = "weight_v"
elif "bias" in name:
weight_type = "bias"
elif "weight" in name:
weight_type = "weight"
else:
weight_type = None
set_recursively(hf_model, mapped_key, value, name, weight_type)
continue
if not is_used:
unused_weights.append(name)
logger.warning(f"Unused weights: {unused_weights}")
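上面 `recursively_load_weights` 的核心是 `MAPPING` 中带 `*` 通配符的键名替换:先取 fairseq 权重名中位于映射键之前、倒数第二段的数字作为层号,再替换 `*`。下面用一个假设的权重名演示这段字符串处理逻辑:
```
name = "w2v_encoder.w2v_model.encoder.layers.3.self_attn.k_proj.weight"  # 假设的 fairseq 权重名
key, mapped_key = "self_attn.k_proj", "data2vec_audio.encoder.layers.*.attention.k_proj"

layer_index = name.split(key)[0].split(".")[-2]  # -> "3"
print(mapped_key.replace("*", layer_index))      # data2vec_audio.encoder.layers.3.attention.k_proj
```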
def access_by_string(module, path):
names = path.split(".")
return reduce(getattr, names, module)
def set_weights(full_name, module, fsq_value, hf_weight_path):
hf_weight = access_by_string(module, hf_weight_path)
hf_value = hf_weight.data
if fsq_value.shape != hf_value.shape:
raise ValueError(f"{full_name} has size {fsq_value.shape}, but {hf_value.shape} was found.")
hf_weight.data = fsq_value
logger.info(f"{full_name} was correctly initialized from {hf_weight_path}.")
def load_conv_layer(full_name, value, feature_extractor, unused_weights):
name = full_name.split("conv_layers.")[-1]
items = name.split(".")
layer_id = int(items[0])
type_id = int(items[1])
weight_type = name.split(".")[-1]
if type_id == 0:
layer_type = "conv"
elif type_id == 2:
layer_type = "layer_norm"
else:
unused_weights.append(full_name)
return
set_weights(full_name, feature_extractor, value, f"conv_layers.{layer_id}.{layer_type}.{weight_type}")
def load_pos_conv_layer(full_name, value, pos_conv_embeddings, unused_weights):
name = full_name.split("pos_conv.")[-1]
items = name.split(".")
layer_id = int(items[0])
type_id = int(items[1])
weight_type = name.split(".")[-1]
if type_id != 0:
unused_weights.append(full_name)
return
else:
layer_type = "conv"
set_weights(full_name, pos_conv_embeddings, value, f"layers.{layer_id}.{layer_type}.{weight_type}")
@torch.no_grad()
def convert_wav2vec2_checkpoint(
checkpoint_path, pytorch_dump_folder_path, config_path=None, dict_path=None, is_finetuned=True
):
"""
将模型的权重复制/粘贴/调整为transformers设计。
"""
if config_path is not None:
config = Data2VecAudioConfig.from_pretrained(config_path)
else:
config = Data2VecAudioConfig()
if not is_finetuned:
hf_wav2vec = Data2VecAudioModel(config)
data2vec_checkpoint_dir = os.path.dirname(checkpoint_path)
state_dict = torch.load(checkpoint_path)
state_dict["model"]["final_proj.weight"] = state_dict["model"].pop("final_proj.0.weight")
state_dict["model"]["final_proj.bias"] = state_dict["model"].pop("final_proj.0.bias")
converted_ckpt = os.path.join(data2vec_checkpoint_dir, "converted.pt")
torch.save(state_dict, converted_ckpt)
else:
hf_wav2vec = Data2VecAudioForCTC(config)
converted_ckpt = checkpoint_path
def load_data2vec(path):
model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task([path])
return model[0].eval()
model = load_data2vec(converted_ckpt)
recursively_load_weights(model, hf_wav2vec, not is_finetuned)
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-lv60")
ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")
input_audio = [x["array"] for x in ds[:4]["audio"]]
inputs = processor(input_audio, return_tensors="pt", padding=True)
input_values = inputs.input_values
attention_mask = inputs.attention_mask
hf_wav2vec.eval()
model.eval()
if is_finetuned:
their_output = model(source=input_values, padding_mask=(1 - attention_mask), mask=False, features_only=True)["encoder_out"].transpose(0, 1)
our_output = hf_wav2vec(input_values, attention_mask=attention_mask)["logits"]
pred_ids = torch.argmax(our_output, dim=-1)
output_string = processor.batch_decode(pred_ids)
print(f"Expected Output: {ds[:4]['text']}, Pred: {output_string}")
else:
their_output = model(
source=input_values,
padding_mask=(1 - attention_mask),
mask=False,
features_only=True
)["layer_results"][-1][0].transpose(0, 1)
our_output = hf_wav2vec(input_values, attention_mask=attention_mask)["last_hidden_state"]
print(our_output.shape, their_output.shape)
max_absolute_diff = torch.max(torch.abs(our_output - their_output)).item()
print(f"max_absolute_diff = {max_absolute_diff}")
success = torch.allclose(our_output, their_output, atol=1e-3)
print("Do both models output the same tensors?", "🔥" if success else "💩")
if not success:
raise Exception("Something went wRoNg")
hf_wav2vec.save_pretrained(pytorch_dump_folder_path)
if is_finetuned:
processor.save_pretrained(pytorch_dump_folder_path)
else:
processor.feature_extractor.save_pretrained(pytorch_dump_folder_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.")
parser.add_argument("--checkpoint_path", default=None, type=str, help="Path to fairseq checkpoint")
parser.add_argument("--dict_path", default=None, type=str, help="Path to dict of fine-tuned model")
parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert")
parser.add_argument(
"--not_finetuned", action="store_true", help="Whether the model to convert is a fine-tuned model or not"
)
args = parser.parse_args()
convert_wav2vec2_checkpoint(
args.checkpoint_path, args.pytorch_dump_folder_path, args.config_path, args.dict_path, not args.not_finetuned
)
.\models\data2vec\convert_data2vec_text_original_pytorch_checkpoint_to_pytorch.py
import argparse
import os
import pathlib
import fairseq
import torch
from fairseq.modules import TransformerSentenceEncoderLayer
from packaging import version
from transformers import (
Data2VecTextConfig,
Data2VecTextForMaskedLM,
Data2VecTextForSequenceClassification,
Data2VecTextModel,
)
from transformers.models.bert.modeling_bert import (
BertIntermediate,
BertLayer,
BertOutput,
BertSelfAttention,
BertSelfOutput,
)
from transformers.utils import logging
if version.parse(fairseq.__version__) < version.parse("0.9.0"):
raise Exception("requires fairseq >= 0.9.0")
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
SAMPLE_TEXT = "Hello world! cécé herlolip"
def convert_data2vec_checkpoint_to_pytorch(
data2vec_checkpoint_path: str, pytorch_dump_folder_path: str, classification_head: bool
):
"""
将data2vec的检查点权重复制/粘贴/调整到我们的BERT结构中。
"""
data2vec_checkpoint_dir, data2vec_checkpoint_file_name = os.path.split(data2vec_checkpoint_path)
data2vec = Data2VecTextModel.from_pretrained(
data2vec_checkpoint_dir, checkpoint_file=data2vec_checkpoint_file_name
)
data2vec.eval()
data2vec_model = data2vec.models[0]
data2vec_sent_encoder = data2vec_model.encoder.sentence_encoder
config = Data2VecTextConfig(
vocab_size=data2vec_sent_encoder.embed_tokens.num_embeddings,
hidden_size=data2vec_model.args.encoder_embed_dim,
num_hidden_layers=data2vec_model.args.encoder_layers,
num_attention_heads=data2vec_model.args.encoder_attention_heads,
intermediate_size=data2vec_model.args.encoder_ffn_embed_dim,
max_position_embeddings=514,
type_vocab_size=1,
layer_norm_eps=1e-5,
)
if classification_head:
config.num_labels = data2vec.model.classification_heads["mnli"].out_proj.weight.shape[0]
print("Our BERT config:", config)
model = Data2VecTextForSequenceClassification(config) if classification_head else Data2VecTextForMaskedLM(config)
model.eval()
model.data2vec_text.embeddings.word_embeddings.weight = data2vec_sent_encoder.embed_tokens.weight
model.data2vec_text.embeddings.position_embeddings.weight = data2vec_sent_encoder.embed_positions.weight
model.data2vec_text.embeddings.token_type_embeddings.weight.data = torch.zeros_like(
model.data2vec_text.embeddings.token_type_embeddings.weight
)
model.data2vec_text.embeddings.LayerNorm.weight = data2vec_sent_encoder.layernorm_embedding.weight
model.data2vec_text.embeddings.LayerNorm.bias = data2vec_sent_encoder.layernorm_embedding.bias
if classification_head:
model.classifier.dense.weight = data2vec.model.classification_heads["mnli"].dense.weight
model.classifier.dense.bias = data2vec.model.classification_heads["mnli"].dense.bias
model.classifier.out_proj.weight = data2vec.model.classification_heads["mnli"].out_proj.weight
model.classifier.out_proj.bias = data2vec.model.classification_heads["mnli"].out_proj.bias
else:
model.lm_head.dense.weight = data2vec_model.encoder.lm_head.dense.weight
model.lm_head.dense.bias = data2vec_model.encoder.lm_head.dense.bias
model.lm_head.layer_norm.weight = data2vec_model.encoder.lm_head.layer_norm.weight
model.lm_head.layer_norm.bias = data2vec_model.encoder.lm_head.layer_norm.bias
model.lm_head.decoder.weight = data2vec_model.encoder.lm_head.weight
model.lm_head.decoder.bias = data2vec_model.encoder.lm_head.bias
input_ids: torch.Tensor = data2vec.encode(SAMPLE_TEXT).unsqueeze(0)
our_output = model(input_ids)[0]
if classification_head:
their_output = data2vec.model.classification_heads["mnli"](data2vec.extract_features(input_ids))
else:
their_output = data2vec_model(input_ids)[0]
print(our_output.shape, their_output.shape)
max_absolute_diff = torch.max(torch.abs(our_output - their_output)).item()
print(f"max_absolute_diff = {max_absolute_diff}")
success = torch.allclose(our_output, their_output, atol=1e-3)
print("Do both models output the same tensors?", "🔥" if success else "💩")
if not success:
raise Exception("Something went wRoNg")
pathlib.Path(pytorch_dump_folder_path).mkdir(parents=True, exist_ok=True)
print(f"Saving model to {pytorch_dump_folder_path}")
model.save_pretrained(pytorch_dump_folder_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--checkpoint_path", default=None, type=str, required=True, help="Path the official PyTorch dump."
)
parser.add_argument(
"--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
)
parser.add_argument(
"--classification_head", action="store_true", help="Whether to convert a final classification head."
)
args = parser.parse_args()
convert_data2vec_checkpoint_to_pytorch(
args.checkpoint_path, args.pytorch_dump_folder_path, args.classification_head
)
这段代码是一个典型的命令行工具的入口点,它使用 argparse 模块解析命令行参数,并调用一个函数来处理这些参数指定的任务。
.\models\data2vec\convert_data2vec_vision_original_pytorch_checkpoint_to_pytorch.py
import argparse
import json
import torch
from huggingface_hub import hf_hub_download
from PIL import Image
from timm.models import create_model
from transformers import (
BeitImageProcessor,
Data2VecVisionConfig,
Data2VecVisionForImageClassification,
Data2VecVisionModel,
)
def create_rename_keys(config, has_lm_head=False, is_semantic=False, hf_prefix="data2vec."):
prefix = "backbone." if is_semantic else ""
rename_keys = []
for i in range(config.num_hidden_layers):
rename_keys.append(
(f"{prefix}blocks.{i}.norm1.weight", f"{hf_prefix}encoder.layer.{i}.layernorm_before.weight")
)
rename_keys.append((f"{prefix}blocks.{i}.norm1.bias", f"{hf_prefix}encoder.layer.{i}.layernorm_before.bias"))
rename_keys.append(
(f"{prefix}blocks.{i}.attn.proj.weight", f"{hf_prefix}encoder.layer.{i}.attention.output.dense.weight")
)
rename_keys.append(
(f"{prefix}blocks.{i}.attn.proj.bias", f"{hf_prefix}encoder.layer.{i}.attention.output.dense.bias")
)
rename_keys.append(
(f"{prefix}blocks.{i}.norm2.weight", f"{hf_prefix}encoder.layer.{i}.layernorm_after.weight")
)
rename_keys.append((f"{prefix}blocks.{i}.norm2.bias", f"{hf_prefix}encoder.layer.{i}.layernorm_after.bias"))
rename_keys.append(
(f"{prefix}blocks.{i}.mlp.fc1.weight", f"{hf_prefix}encoder.layer.{i}.intermediate.dense.weight")
)
rename_keys.append(
(f"{prefix}blocks.{i}.mlp.fc1.bias", f"{hf_prefix}encoder.layer.{i}.intermediate.dense.bias")
)
rename_keys.append((f"{prefix}blocks.{i}.mlp.fc2.weight", f"{hf_prefix}encoder.layer.{i}.output.dense.weight"))
rename_keys.append((f"{prefix}blocks.{i}.mlp.fc2.bias", f"{hf_prefix}encoder.layer.{i}.output.dense.bias"))
rename_keys.extend(
[
(f"{prefix}cls_token", f"{hf_prefix}embeddings.cls_token"),
(f"{prefix}patch_embed.proj.weight", f"{hf_prefix}embeddings.patch_embeddings.projection.weight"),
(f"{prefix}patch_embed.proj.bias", f"{hf_prefix}embeddings.patch_embeddings.projection.bias"),
]
)
if has_lm_head:
rename_keys.extend(
[
("mask_token", f"{hf_prefix}embeddings.mask_token"),
(
"rel_pos_bias.relative_position_bias_table",
f"{hf_prefix}encoder.relative_position_bias.relative_position_bias_table",
),
(
"rel_pos_bias.relative_position_index",
f"{hf_prefix}encoder.relative_position_bias.relative_position_index",
),
("norm.weight", "layernorm.weight"),
("norm.bias", "layernorm.bias"),
]
)
elif is_semantic:
rename_keys.extend(
[
("decode_head.conv_seg.weight", "decode_head.classifier.weight"),
("decode_head.conv_seg.bias", "decode_head.classifier.bias"),
("auxiliary_head.conv_seg.weight", "auxiliary_head.classifier.weight"),
("auxiliary_head.conv_seg.bias", "auxiliary_head.classifier.bias"),
]
)
else:
rename_keys.extend(
[
("fc_norm.weight", f"{hf_prefix}pooler.layernorm.weight"),
("fc_norm.bias", f"{hf_prefix}pooler.layernorm.bias"),
("head.weight", "classifier.weight"),
("head.bias", "classifier.bias"),
]
)
return rename_keys
def read_in_q_k_v(state_dict, config, has_lm_head=False, is_semantic=False, hf_prefix="data2vec_vision."):
for i in range(config.num_hidden_layers):
prefix = "backbone." if is_semantic else ""
in_proj_weight = state_dict.pop(f"{prefix}blocks.{i}.attn.qkv.weight")
q_bias = state_dict.pop(f"{prefix}blocks.{i}.attn.q_bias")
v_bias = state_dict.pop(f"{prefix}blocks.{i}.attn.v_bias")
state_dict[f"{hf_prefix}encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[
: config.hidden_size, :
]
state_dict[f"{hf_prefix}encoder.layer.{i}.attention.attention.query.bias"] = q_bias
state_dict[f"{hf_prefix}encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[
config.hidden_size : config.hidden_size * 2, :
]
state_dict[f"{hf_prefix}encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[
-config.hidden_size :, :
]
state_dict[f"{hf_prefix}encoder.layer.{i}.attention.attention.value.bias"] = v_bias
gamma_1 = state_dict.pop(f"{prefix}blocks.{i}.gamma_1")
gamma_2 = state_dict.pop(f"{prefix}blocks.{i}.gamma_2")
state_dict[f"{hf_prefix}encoder.layer.{i}.lambda_1"] = gamma_1
state_dict[f"{hf_prefix}encoder.layer.{i}.lambda_2"] = gamma_2
if not has_lm_head:
table = state_dict.pop(f"{prefix}blocks.{i}.attn.relative_position_bias_table")
index = state_dict.pop(f"{prefix}blocks.{i}.attn.relative_position_index")
state_dict[
f"{hf_prefix}encoder.layer.{i}.attention.attention.relative_position_bias.relative_position_bias_table"
] = table
state_dict[
f"{hf_prefix}encoder.layer.{i}.attention.attention.relative_position_bias.relative_position_index"
] = index
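`read_in_q_k_v` 把原始检查点里合并存放的 `qkv.weight`(形状为 `(3 * hidden_size, hidden_size)`)按行均分为 query/key/value 三份。下面用随机张量做一个最小示意(`hidden_size` 取示例数值):
```
import torch

hidden_size = 8                                         # 示例数值,真实模型为 768/1024
in_proj_weight = torch.randn(3 * hidden_size, hidden_size)

q_w = in_proj_weight[:hidden_size, :]                   # 前 1/3 行 -> query
k_w = in_proj_weight[hidden_size : hidden_size * 2, :]  # 中间 1/3 行 -> key
v_w = in_proj_weight[-hidden_size:, :]                  # 后 1/3 行 -> value
print(q_w.shape, k_w.shape, v_w.shape)                  # 均为 torch.Size([8, 8])
```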
def get_args():
parser = argparse.ArgumentParser(
"Convert Data2VecVision to HF for image classification and pretraining", add_help=False
)
parser.add_argument("--hf_checkpoint_name", type=str)
parser.add_argument("--input_size", default=224, type=int, help="images input size")
parser.add_argument("--beit_checkpoint", default="", help="beit checkpoint")
return parser.parse_args()
def load_beit_model(args, is_finetuned, is_large):
def load_state_dict(model, state_dict, prefix="", ignore_missing="relative_position_index"):
missing_keys = []
unexpected_keys = []
error_msgs = []
metadata = getattr(state_dict, "_metadata", None)
state_dict = state_dict.copy()
if metadata is not None:
state_dict._metadata = metadata
def load(module, prefix=""):
local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
module._load_from_state_dict(
state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs
)
for name, child in module._modules.items():
if child is not None:
load(child, prefix + name + ".")
load(model, prefix=prefix)
warn_missing_keys = []
ignore_missing_keys = []
for key in missing_keys:
keep_flag = True
for ignore_key in ignore_missing.split("|"):
if ignore_key in key:
keep_flag = False
break
if keep_flag:
warn_missing_keys.append(key)
else:
ignore_missing_keys.append(key)
missing_keys = warn_missing_keys
if len(missing_keys) > 0:
print(
"Weights of {} not initialized from pretrained model: {}".format(
model.__class__.__name__, missing_keys
)
)
if len(unexpected_keys) > 0:
print("Weights from pretrained model not used in {}: {}".format(model.__class__.__name__, unexpected_keys))
if len(ignore_missing_keys) > 0:
print(
"Ignored weights of {} not initialized from pretrained model: {}".format(
model.__class__.__name__, ignore_missing_keys
)
)
if len(error_msgs) > 0:
print("\n".join(error_msgs))
model_kwargs = {
"pretrained": False,
"use_shared_rel_pos_bias": True,
"use_abs_pos_emb": False,
"init_values": 0.1,
}
if is_finetuned:
model_kwargs.update(
{
"num_classes": 1000,
"use_mean_pooling": True,
"init_scale": 0.001,
"use_rel_pos_bias": True,
}
)
model = create_model(
"beit_large_patch16_224" if is_large else "beit_base_patch16_224",
**model_kwargs,
)
patch_size = model.patch_embed.patch_size
args.window_size = (args.input_size // patch_size[0], args.input_size // patch_size[1])
checkpoint = torch.load(args.beit_checkpoint, map_location="cpu")
print(f"Load ckpt from {args.beit_checkpoint}")
checkpoint_model = None
for model_key in ("model", "module"):
if model_key in checkpoint:
checkpoint_model = checkpoint[model_key]
print(f"Load state_dict by model_key = {model_key}")
break
all_keys = list(checkpoint_model.keys())
for key in all_keys:
if "relative_position_index" in key:
checkpoint_model.pop(key)
if "relative_position_bias_table" in key:
rel_pos_bias = checkpoint_model[key]
src_num_pos, num_attn_heads = rel_pos_bias.size()
dst_num_pos, _ = model.state_dict()[key].size()
dst_patch_shape = model.patch_embed.patch_shape
if dst_patch_shape[0] != dst_patch_shape[1]:
raise NotImplementedError()
load_state_dict(model, checkpoint_model, prefix="")
return model
def main():
args = get_args()
is_finetuned = "ft1k" in args.hf_checkpoint_name
is_large = "large" in args.hf_checkpoint_name
if is_finetuned:
import modeling_finetune
else:
import modeling_cyclical
config = Data2VecVisionConfig()
if is_finetuned:
config.use_relative_position_bias = True
config.use_shared_relative_position_bias = False
config.use_mean_pooling = True
config.num_labels = 1000
repo_id = "huggingface/label-files"
filename = "imagenet-1k-id2label.json"
id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r"))
id2label = {int(k): v for k, v in id2label.items()}
config.id2label = id2label
config.label2id = {v: k for k, v in id2label.items()}
else:
config.use_relative_position_bias = False
config.use_shared_relative_position_bias = True
config.use_mean_pooling = False
if is_large:
config.hidden_size = 1024
config.intermediate_size = 4096
config.num_hidden_layers = 24
config.num_attention_heads = 16
orig_model = load_beit_model(args, is_finetuned, is_large)
orig_model.eval()
image_processor = BeitImageProcessor(size=config.image_size, do_center_crop=False)
image = Image.open("../../../../tests/fixtures/tests_samples/COCO/000000039769.png")
encoding = image_processor(images=image, return_tensors="pt")
pixel_values = encoding["pixel_values"]
orig_args = (pixel_values,) if is_finetuned else (pixel_values, None)
with torch.no_grad():
orig_model_output = orig_model(*orig_args)
if is_finetuned:
hf_model = Data2VecVisionForImageClassification(config)
hf_model.eval()
has_lm_head = False
hf_prefix = "data2vec_vision."
else:
hf_model = Data2VecVisionModel(config)
hf_model.eval()
has_lm_head = True
hf_prefix = ""
rename_keys = create_rename_keys(config, hf_prefix=hf_prefix, has_lm_head=has_lm_head)
state_dict = orig_model.state_dict()
for src, dest in rename_keys:
val = state_dict.pop(src)
state_dict[dest] = val
read_in_q_k_v(state_dict, config, hf_prefix=hf_prefix, has_lm_head=has_lm_head)
missing_keys, unexpected_keys = hf_model.load_state_dict(state_dict, strict=False)
print("HF missing", missing_keys)
print("HF unexpected_keys", unexpected_keys)
with torch.no_grad():
hf_model_output = hf_model(pixel_values)
hf_output = hf_model_output.logits if is_finetuned else hf_model_output.last_hidden_state
max_absolute_diff = torch.max(torch.abs(hf_output - orig_model_output)).item()
print(f"max_absolute_diff = {max_absolute_diff}")
success = torch.allclose(hf_output, orig_model_output, atol=1e-3)
print("Do both models output the same tensors?", "🔥" if success else "💩")
if not success:
raise Exception("Something went wRoNg")
print(f"Saving to {args.hf_checkpoint_name}")
hf_model.save_pretrained(args.hf_checkpoint_name)
image_processor.save_pretrained(args.hf_checkpoint_name)
if __name__ == "__main__":
main()
.\models\data2vec\modeling_data2vec_audio.py
""" PyTorch Data2VecAudio model. """
import math
import warnings
from typing import Optional, Tuple, Union
import numpy as np
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import CrossEntropyLoss
from ...activations import ACT2FN
from ...integrations.deepspeed import is_deepspeed_zero3_enabled
from ...modeling_outputs import (
BaseModelOutput,
CausalLMOutput,
SequenceClassifierOutput,
TokenClassifierOutput,
Wav2Vec2BaseModelOutput,
XVectorOutput,
)
from ...modeling_utils import PreTrainedModel
from ...utils import (
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
is_peft_available,
logging,
)
from .configuration_data2vec_audio import Data2VecAudioConfig
logger = logging.get_logger(__name__)
_HIDDEN_STATES_START_POSITION = 2
_CONFIG_FOR_DOC = "Data2VecAudioConfig"
_CHECKPOINT_FOR_DOC = "facebook/data2vec-audio-base-960h"
_EXPECTED_OUTPUT_SHAPE = [1, 292, 768]
_CTC_EXPECTED_OUTPUT = "'MISTER QUILTER IS THE APOSTLE OF THE MIDDLE CLASSES AND WE ARE GLAD TO WELCOME HIS GOSPEL'"
_CTC_EXPECTED_LOSS = 66.95
DATA2VEC_AUDIO_PRETRAINED_MODEL_ARCHIVE_LIST = [
"facebook/data2vec-audio-base",
"facebook/data2vec-audio-base-10m",
"facebook/data2vec-audio-base-100h",
"facebook/data2vec-audio-base-960h",
]
def _compute_mask_indices(
shape: Tuple[int, int],
mask_prob: float,
mask_length: int,
attention_mask: Optional[torch.LongTensor] = None,
min_masks: int = 0,
) -> np.ndarray:
"""
计算给定形状的随机掩码间隔。用于实现ASR的数据增强方法[SpecAugment](https://arxiv.org/abs/1904.08779)。
注意:此方法不适合在TPU上运行,应该在训练期间作为预处理步骤在CPU上运行。
"""
Args:
shape: The shape for which to compute masks. This should be of a tuple of size 2 where
the first element is the batch size and the second element is the length of the axis to span.
mask_prob: The percentage of the whole axis (between 0 and 1) which will be masked. The number of
independently generated mask spans of length `mask_length` is computed by
`mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
actual percentage will be smaller.
mask_length: size of the mask
min_masks: minimum number of masked spans
attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
each batch dimension.
"""
# 解包 shape 元组,得到 batch_size 和 sequence_length
batch_size, sequence_length = shape
# 检查 mask_length 是否合法(大于0)
if mask_length < 1:
raise ValueError("`mask_length` has to be bigger than 0.")
# 检查 mask_length 是否小于 sequence_length
if mask_length > sequence_length:
raise ValueError(
f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length}"
f" and `sequence_length`: {sequence_length}`"
)
# epsilon 用于概率舍入
epsilon = np.random.rand(1).item()
def compute_num_masked_span(input_length):
"""Given input length, compute how many spans should be masked"""
# 计算应该被mask的span的数量
num_masked_span = int(mask_prob * input_length / mask_length + epsilon)
# 确保 num_masked_span 不小于 min_masks
num_masked_span = max(num_masked_span, min_masks)
# 确保 num_masked_span * mask_length 不大于 sequence_length
if num_masked_span * mask_length > sequence_length:
num_masked_span = sequence_length // mask_length
# 确保 num_masked_span 不大于 input_length - (mask_length - 1)
if input_length - (mask_length - 1) < num_masked_span:
num_masked_span = max(input_length - (mask_length - 1), 0)
return num_masked_span
# 计算每个样本的输入长度
input_lengths = (
attention_mask.sum(-1).detach().tolist()
if attention_mask is not None
else [sequence_length for _ in range(batch_size)]
)
# 创建一个全为 False 的布尔数组作为 spec_aug_mask 的初始值
spec_aug_mask = np.zeros((batch_size, sequence_length), dtype=bool)
# 保存 spec_aug_mask 的索引
spec_aug_mask_idxs = []
# 计算最大允许的 masked span 数量
max_num_masked_span = compute_num_masked_span(sequence_length)
# 如果 max_num_masked_span 为 0,则直接返回空的 spec_aug_mask
if max_num_masked_span == 0:
return spec_aug_mask
这段代码主要是用于生成一种名为 SpecAugment 的遮罩技术,用于语音识别和其他序列数据处理中。
# 对于每个输入长度计算需要掩码的片段数量
for input_length in input_lengths:
# 计算当前输入长度下的掩码片段数目
num_masked_span = compute_num_masked_span(input_length)
# 随机选择要掩码的索引
spec_aug_mask_idx = np.random.choice(
np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False
)
# 选择第一个样本索引作为填充向量的虚拟索引,以确保所有批次具有相同的维度,因为概率舍入
# 选择第一个样本索引是为了简化向量的填充操作
if len(spec_aug_mask_idx) == 0:
# 当 `input_length` 严格小于 `sequence_length` 时会出现这种情况,
# 此时最后一个标记应该是填充标记,可以用作虚拟掩码 ID
dummy_mask_idx = sequence_length - 1
else:
dummy_mask_idx = spec_aug_mask_idx[0]
# 将虚拟掩码索引添加到掩码索引数组中,确保数组长度达到 `max_num_masked_span`
spec_aug_mask_idx = np.concatenate(
[spec_aug_mask_idx, np.ones(max_num_masked_span - num_masked_span, dtype=np.int32) * dummy_mask_idx]
)
spec_aug_mask_idxs.append(spec_aug_mask_idx)
# 将列表转换为 NumPy 数组
spec_aug_mask_idxs = np.array(spec_aug_mask_idxs)
# 将掩码索引扩展为掩码片段
spec_aug_mask_idxs = np.broadcast_to(
spec_aug_mask_idxs[:, :, None], (batch_size, max_num_masked_span, mask_length)
)
spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)
# 添加偏移量到起始索引,使索引创建一个掩码片段
offsets = np.arange(mask_length)[None, None, :]
offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
batch_size, max_num_masked_span * mask_length
)
spec_aug_mask_idxs = spec_aug_mask_idxs + offsets
# 确保索引不超过 `sequence_length - 1`
if spec_aug_mask_idxs.max() > sequence_length - 1:
spec_aug_mask_idxs[spec_aug_mask_idxs > sequence_length - 1] = sequence_length - 1
# 使用索引散布掩码
np.put_along_axis(spec_aug_mask, spec_aug_mask_idxs, 1, -1)
# 返回最终的掩码张量
return spec_aug_mask
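下面是对上面 `_compute_mask_indices` 的一个最小调用示意(参数为便于观察的假设值,需在本模块上下文中运行):
```
import numpy as np

np.random.seed(0)  # 仅为演示结果可复现
mask = _compute_mask_indices(shape=(2, 10), mask_prob=0.5, mask_length=2)
print(mask.astype(int))
# 预期得到形如 [[0 1 1 0 0 1 1 0 0 0], ...] 的布尔遮罩:
# 每行约有 int(0.5 * 10 / 2 + ε) 个长度为 2 的连续遮罩片段
```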
# 定义一个用于处理音频转换的卷积层的类,继承自 nn.Module
class Data2VecAudioConvLayer(nn.Module):
def __init__(self, config, layer_id=0):
super().__init__()
# 从配置中获取输入和输出的卷积维度
self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
self.out_conv_dim = config.conv_dim[layer_id]
# 创建一个一维卷积层,指定输入和输出维度、卷积核大小、步长和是否使用偏置
self.conv = nn.Conv1d(
self.in_conv_dim,
self.out_conv_dim,
kernel_size=config.conv_kernel[layer_id],
stride=config.conv_stride[layer_id],
bias=config.conv_bias,
)
# 创建一个用于层归一化的 LayerNorm 层,参数为输出卷积维度,启用元素级仿射变换
self.layer_norm = nn.LayerNorm(self.out_conv_dim, elementwise_affine=True)
# 获取指定的激活函数
self.activation = ACT2FN[config.feat_extract_activation]
# 前向传播函数,接收隐藏状态并返回转换后的隐藏状态
def forward(self, hidden_states):
# 对输入的隐藏状态进行一维卷积处理
hidden_states = self.conv(hidden_states)
# 将卷积输出的维度进行转置,交换倒数第二和倒数第一维度
hidden_states = hidden_states.transpose(-2, -1)
# 对转置后的隐藏状态进行层归一化处理
hidden_states = self.layer_norm(hidden_states)
# 再次对隐藏状态进行维度转置,还原初始维度排列
hidden_states = hidden_states.transpose(-2, -1)
# 使用预定义的激活函数处理隐藏状态并返回
hidden_states = self.activation(hidden_states)
return hidden_states
# 从 transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2SamePadLayer 复制而来,修改类名为 Data2VecAudioPadLayer
class Data2VecAudioPadLayer(nn.Module):
def __init__(self, num_conv_pos_embeddings):
super().__init__()
# 根据给定的卷积位置嵌入数量计算需要移除的填充数量
self.num_pad_remove = 1 if num_conv_pos_embeddings % 2 == 0 else 0
# 前向传播函数,根据需要移除的填充数量截断隐藏状态的最后一维
def forward(self, hidden_states):
if self.num_pad_remove > 0:
hidden_states = hidden_states[:, :, : -self.num_pad_remove]
return hidden_states
# 定义一个用于位置卷积的类,继承自 nn.Module
class Data2VecAudioPositionalConvLayer(nn.Module):
def __init__(self, config):
super().__init__()
# 创建一个一维卷积层,指定输入和输出维度相同,卷积核大小、填充方式和卷积组数
self.conv = nn.Conv1d(
config.hidden_size,
config.hidden_size,
kernel_size=config.conv_pos_kernel_size,
padding=config.conv_pos_kernel_size // 2,
groups=config.num_conv_pos_embedding_groups,
)
# 创建一个用于处理填充的 Data2VecAudioPadLayer 类的实例
self.padding = Data2VecAudioPadLayer(config.conv_pos_kernel_size)
# 获取指定的激活函数
self.activation = ACT2FN[config.feat_extract_activation]
# 创建一个 LayerNorm 层,用于层归一化,参数为隐藏大小,禁用元素级仿射变换
self.layer_norm = nn.LayerNorm(config.hidden_size, elementwise_affine=False)
# 前向传播函数,接收隐藏状态并返回转换后的隐藏状态
def forward(self, hidden_states):
# 对输入的隐藏状态进行一维卷积处理
hidden_states = self.conv(hidden_states)
# 使用 padding 层处理卷积输出,截断最后一维的填充
hidden_states = self.padding(hidden_states)
# 将隐藏状态的维度进行转置,交换第一和第二维度
hidden_states = hidden_states.transpose(1, 2)
# 对转置后的隐藏状态进行层归一化处理
hidden_states = self.layer_norm(hidden_states)
# 再次对隐藏状态进行维度转置,还原初始维度排列
hidden_states = hidden_states.transpose(1, 2)
# 使用预定义的激活函数处理隐藏状态并返回
hidden_states = self.activation(hidden_states)
return hidden_states
# 定义一个用于位置卷积嵌入的类,继承自 nn.Module
class Data2VecAudioPositionalConvEmbedding(nn.Module):
def __init__(self, config):
super().__init__()
# 使用列表推导式创建包含多个 Data2VecAudioPositionalConvLayer 实例的 ModuleList
self.layers = nn.ModuleList(
[Data2VecAudioPositionalConvLayer(config) for _ in range(config.num_conv_pos_embeddings)]
)
# 定义一个方法,用于前向传播神经网络模型中的隐藏状态
def forward(self, hidden_states):
# 转置隐藏状态张量的第一维和第二维,以便适配网络层期望的输入格式
hidden_states = hidden_states.transpose(1, 2)
# 依次通过每一层网络层处理隐藏状态张量
for layer in self.layers:
hidden_states = layer(hidden_states)
# 再次转置隐藏状态张量的第一维和第二维,使其恢复原始输入的维度
hidden_states = hidden_states.transpose(1, 2)
# 返回经过所有网络层处理后的隐藏状态张量
return hidden_states
class Data2VecAudioFeatureEncoder(nn.Module):
"""Construct the features from raw audio waveform"""
def __init__(self, config):
super().__init__()
# 初始化卷积层列表,每层都是 Data2VecAudioConvLayer 类的实例,根据配置创建对应数量的层
self.conv_layers = nn.ModuleList(
[Data2VecAudioConvLayer(config, layer_id=i) for i in range(config.num_feat_extract_layers)]
)
# 梯度检查点功能默认关闭
self.gradient_checkpointing = False
# 默认需要计算梯度
self._requires_grad = True
# 从 transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureEncoder._freeze_parameters 复制而来
def _freeze_parameters(self):
# 将所有参数的梯度计算关闭
for param in self.parameters():
param.requires_grad = False
# 同时将类属性 _requires_grad 设置为 False
self._requires_grad = False
# 从 transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureEncoder.forward 复制而来
def forward(self, input_values):
# 将输入的波形数据增加一个维度,以符合模型的输入要求
hidden_states = input_values[:, None]
# 如果当前模型需要计算梯度并且处于训练状态,则设置 hidden_states 变量需要计算梯度
if self._requires_grad and self.training:
hidden_states.requires_grad = True
# 遍历所有卷积层,逐层计算特征表示
for conv_layer in self.conv_layers:
# 如果开启了梯度检查点并且模型在训练阶段,则使用梯度检查点函数来计算卷积层的输出
if self._requires_grad and self.gradient_checkpointing and self.training:
hidden_states = self._gradient_checkpointing_func(
conv_layer.__call__,
hidden_states,
)
else:
hidden_states = conv_layer(hidden_states)
# 返回最终的隐藏状态表示
return hidden_states
# 从 transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureProjection 复制而来,修改了类名和部分参数
class Data2VecAudioFeatureProjection(nn.Module):
def __init__(self, config):
super().__init__()
# 层归一化层,用于标准化卷积层的输出
self.layer_norm = nn.LayerNorm(config.conv_dim[-1], eps=config.layer_norm_eps)
# 线性投影层,将卷积层的输出映射到隐藏状态的维度上
self.projection = nn.Linear(config.conv_dim[-1], config.hidden_size)
# Dropout 层,用于随机丢弃部分神经元,防止过拟合
self.dropout = nn.Dropout(config.feat_proj_dropout)
def forward(self, hidden_states):
# 对隐藏状态进行层归一化处理
norm_hidden_states = self.layer_norm(hidden_states)
# 将归一化后的隐藏状态进行线性投影,映射到目标维度
hidden_states = self.projection(norm_hidden_states)
# 对投影后的隐藏状态进行 Dropout 处理,随机丢弃部分神经元
hidden_states = self.dropout(hidden_states)
# 返回投影后的隐藏状态和未投影的归一化隐藏状态,用于量化操作
return hidden_states, norm_hidden_states
# 从 transformers.models.bart.modeling_bart.BartAttention 复制而来,修改了类名和部分参数
class Data2VecAudioAttention(nn.Module):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
def __init__(
self,
embed_dim: int,
num_heads: int,
dropout: float = 0.0,
is_decoder: bool = False,
bias: bool = True,
is_causal: bool = False,
config: Optional[Data2VecAudioConfig] = None,
):
# 调用父类初始化方法
super().__init__()
# 设置注意力机制的嵌入维度
self.embed_dim = embed_dim
# 设置注意力头的数量
self.num_heads = num_heads
# 设置dropout率
self.dropout = dropout
# 计算每个注意力头的维度
self.head_dim = embed_dim // num_heads
# 保存配置信息
self.config = config
# 检查嵌入维度是否能被注意力头的数量整除
if (self.head_dim * num_heads) != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
f" and `num_heads`: {num_heads})."
)
# 缩放系数,用于缩放注意力得分
self.scaling = self.head_dim**-0.5
# 是否为解码器
self.is_decoder = is_decoder
# 是否使用因果(causal)注意力
self.is_causal = is_causal
# 初始化键(key)的线性投影层
self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
# 初始化值(value)的线性投影层
self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
# 初始化查询(query)的线性投影层
self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
# 初始化输出的线性投影层
self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
# 重新形状张量,以便为多头注意力做准备
return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
def forward(
self,
hidden_states: torch.Tensor,
key_value_states: Optional[torch.Tensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
attention_mask: Optional[torch.Tensor] = None,
layer_head_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
# The body of the standard multi-head attention forward pass is omitted here.
...
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeedForward with Wav2Vec2->Data2VecAudio
# 定义一个名为Data2VecAudioFeedForward的神经网络模块,继承自nn.Module
class Data2VecAudioFeedForward(nn.Module):
def __init__(self, config):
super().__init__()
# 中间层的dropout操作,使用配置中的激活函数的dropout率
self.intermediate_dropout = nn.Dropout(config.activation_dropout)
# 中间层的全连接层,输入维度为config.hidden_size,输出维度为config.intermediate_size
self.intermediate_dense = nn.Linear(config.hidden_size, config.intermediate_size)
# 根据配置选择或者初始化激活函数
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
# 输出层的全连接层,输入维度为config.intermediate_size,输出维度为config.hidden_size
self.output_dense = nn.Linear(config.intermediate_size, config.hidden_size)
# 输出层的dropout操作,使用配置中的dropout率
self.output_dropout = nn.Dropout(config.hidden_dropout)
# 定义前向传播函数,接受隐藏状态hidden_states作为输入
def forward(self, hidden_states):
# 中间层的全连接操作
hidden_states = self.intermediate_dense(hidden_states)
# 中间层的激活函数操作
hidden_states = self.intermediate_act_fn(hidden_states)
# 中间层的dropout操作
hidden_states = self.intermediate_dropout(hidden_states)
# 输出层的全连接操作
hidden_states = self.output_dense(hidden_states)
# 输出层的dropout操作
hidden_states = self.output_dropout(hidden_states)
# 返回处理后的隐藏状态作为输出
return hidden_states
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2EncoderLayer with Wav2Vec2->Data2VecAudio
# 定义一个名为Data2VecAudioEncoderLayer的神经网络模块,继承自nn.Module
class Data2VecAudioEncoderLayer(nn.Module):
def __init__(self, config):
super().__init__()
# 使用Data2VecAudioAttention作为注意力机制
self.attention = Data2VecAudioAttention(
embed_dim=config.hidden_size,
num_heads=config.num_attention_heads,
dropout=config.attention_dropout,
is_decoder=False,
)
# dropout操作,使用配置中的隐藏层dropout率
self.dropout = nn.Dropout(config.hidden_dropout)
# LayerNorm操作,输入维度为config.hidden_size,epsilon值为config.layer_norm_eps
self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
# 使用Data2VecAudioFeedForward作为前馈神经网络
self.feed_forward = Data2VecAudioFeedForward(config)
# 最终的LayerNorm操作,输入维度为config.hidden_size,epsilon值为config.layer_norm_eps
self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
# 定义前向传播函数,接受隐藏状态hidden_states、注意力掩码attention_mask(可选)、是否输出注意力权重output_attentions(可选)
def forward(self, hidden_states, attention_mask=None, output_attentions=False):
# 复制隐藏状态用于后续加法残差连接
attn_residual = hidden_states
# 使用注意力机制处理隐藏状态
hidden_states, attn_weights, _ = self.attention(
hidden_states, attention_mask=attention_mask, output_attentions=output_attentions
)
# dropout操作
hidden_states = self.dropout(hidden_states)
# 加法残差连接
hidden_states = attn_residual + hidden_states
# LayerNorm操作
hidden_states = self.layer_norm(hidden_states)
# 前馈神经网络操作
hidden_states = hidden_states + self.feed_forward(hidden_states)
# 最终LayerNorm操作
hidden_states = self.final_layer_norm(hidden_states)
outputs = (hidden_states,)
# 如果输出注意力权重,将注意力权重加入输出元组
if output_attentions:
outputs += (attn_weights,)
# 返回输出元组
return outputs
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2Encoder with Wav2Vec2->Data2VecAudio
# 定义一个名为Data2VecAudioEncoder的神经网络模块,继承自nn.Module
class Data2VecAudioEncoder(nn.Module):
# 初始化方法,接受一个配置参数对象作为参数
def __init__(self, config):
# 调用父类的初始化方法
super().__init__()
# 将传入的配置对象保存到实例变量中
self.config = config
# 创建一个位置卷积嵌入对象,并保存到实例变量中
self.pos_conv_embed = Data2VecAudioPositionalConvEmbedding(config)
# 创建一个 LayerNorm 层,并设置其参数
self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
# 创建一个 Dropout 层,并设置其参数
self.dropout = nn.Dropout(config.hidden_dropout)
# 创建一个由多个音频编码器层组成的模块列表,数量由配置中的 num_hidden_layers 决定
self.layers = nn.ModuleList([Data2VecAudioEncoderLayer(config) for _ in range(config.num_hidden_layers)])
# 初始化梯度检查点标志为 False
self.gradient_checkpointing = False
# 前向传播方法,接受输入的隐藏状态张量和其他可选参数
def forward(
self,
hidden_states: torch.tensor,
attention_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
):
all_hidden_states = () if output_hidden_states else None
all_self_attentions = () if output_attentions else None
if attention_mask is not None:
# 确保填充的 token 输出为 0
expand_attention_mask = attention_mask.unsqueeze(-1).repeat(1, 1, hidden_states.shape[2])
hidden_states[~expand_attention_mask] = 0
# 扩展 attention_mask
attention_mask = 1.0 - attention_mask[:, None, None, :].to(dtype=hidden_states.dtype)
attention_mask = attention_mask * torch.finfo(hidden_states.dtype).min
attention_mask = attention_mask.expand(
attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1]
)
# 计算位置嵌入
position_embeddings = self.pos_conv_embed(hidden_states)
# 将位置嵌入加到隐藏状态上
hidden_states = hidden_states + position_embeddings
# Layer Normalization
hidden_states = self.layer_norm(hidden_states)
# Dropout
hidden_states = self.dropout(hidden_states)
# 检查是否启用了 DeepSpeed Zero3
deepspeed_zero3_is_enabled = is_deepspeed_zero3_enabled()
# 遍历每个 Transformer 层
for layer in self.layers:
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
# 添加 LayerDrop(参见 https://arxiv.org/abs/1909.11556 进行描述)
dropout_probability = torch.rand([])
# 判断是否跳过当前层
skip_the_layer = True if self.training and (dropout_probability < self.config.layerdrop) else False
if not skip_the_layer or deepspeed_zero3_is_enabled:
# 如果启用了梯度检查点和处于训练阶段,则调用梯度检查点函数
if self.gradient_checkpointing and self.training:
layer_outputs = self._gradient_checkpointing_func(
layer.__call__,
hidden_states,
attention_mask,
output_attentions,
)
else:
# 否则直接调用 Transformer 层
layer_outputs = layer(
hidden_states, attention_mask=attention_mask, output_attentions=output_attentions
)
hidden_states = layer_outputs[0]
if skip_the_layer:
layer_outputs = (None, None)
if output_attentions:
all_self_attentions = all_self_attentions + (layer_outputs[1],)
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
# 根据 return_dict 决定返回的数据结构
if not return_dict:
return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
return BaseModelOutput(
last_hidden_state=hidden_states,
hidden_states=all_hidden_states,
attentions=all_self_attentions,
)
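The mask handling above first zeroes hidden states at padded positions and then converts the `(batch, seq_len)` padding mask into a large negative additive bias of shape `(batch, 1, seq_len, seq_len)`. A standalone sketch of that conversion with toy sizes:
```
import torch

hidden_states = torch.randn(2, 4, 8)                          # (batch, seq, hidden)
attention_mask = torch.tensor([[1, 1, 1, 0], [1, 1, 0, 0]])   # 1 = real token, 0 = padding

# zero out the outputs of padded positions
expand = attention_mask.unsqueeze(-1).repeat(1, 1, hidden_states.shape[2]).bool()
hidden_states[~expand] = 0

# build the additive bias: 0 for real tokens, a very large negative value for padding
bias = (1.0 - attention_mask[:, None, None, :].to(hidden_states.dtype)) * torch.finfo(hidden_states.dtype).min
bias = bias.expand(bias.shape[0], 1, bias.shape[-1], bias.shape[-1])
print(bias.shape)                                             # torch.Size([2, 1, 4, 4])
```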
# 从transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2Adapter复制而来,将Wav2Vec2改为Data2VecAudio
class Data2VecAudioAdapter(nn.Module):
def __init__(self, config):
super().__init__()
# 如果配置中的输出隐藏大小不等于隐藏大小,则可能需要降维特征维度
if config.output_hidden_size != config.hidden_size:
# 创建线性投影层,将隐藏状态维度降至输出隐藏大小
self.proj = nn.Linear(config.hidden_size, config.output_hidden_size)
# 创建LayerNorm层,用于规范投影后的隐藏状态
self.proj_layer_norm = nn.LayerNorm(config.output_hidden_size)
else:
self.proj = self.proj_layer_norm = None
# 创建一个包含多个Data2VecAudioAdapterLayer模块的层列表
self.layers = nn.ModuleList(Data2VecAudioAdapterLayer(config) for _ in range(config.num_adapter_layers))
# 设置层的随机丢弃率
self.layerdrop = config.layerdrop
def forward(self, hidden_states):
# 如果存在投影层和LayerNorm层,则将隐藏状态投影至输出隐藏大小并规范化
if self.proj is not None and self.proj_layer_norm is not None:
hidden_states = self.proj(hidden_states)
hidden_states = self.proj_layer_norm(hidden_states)
# 调换维度顺序,将时间维度置于第二位
hidden_states = hidden_states.transpose(1, 2)
# 对每一层进行循环处理
for layer in self.layers:
# 计算当前层是否被丢弃的概率
layerdrop_prob = np.random.random()
# 如果非训练状态或者未丢弃该层,则通过当前层处理隐藏状态
if not self.training or (layerdrop_prob > self.layerdrop):
hidden_states = layer(hidden_states)
# 恢复原始维度顺序,将时间维度放回第三位
hidden_states = hidden_states.transpose(1, 2)
return hidden_states
# 从transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2AdapterLayer复制而来,将Wav2Vec2改为Data2VecAudio
class Data2VecAudioAdapterLayer(nn.Module):
def __init__(self, config):
super().__init__()
# 创建一个一维卷积层,用于处理隐藏状态
self.conv = nn.Conv1d(
config.output_hidden_size,
2 * config.output_hidden_size,
config.adapter_kernel_size,
stride=config.adapter_stride,
padding=1,
)
def forward(self, hidden_states):
# 将隐藏状态通过一维卷积层处理
hidden_states = self.conv(hidden_states)
# 对卷积结果进行门控线性单元(GLU)操作
hidden_states = nn.functional.glu(hidden_states, dim=1)
return hidden_states
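`nn.functional.glu` splits the channel dimension in half and gates one half with the sigmoid of the other, which is why the adapter convolution produces `2 * output_hidden_size` channels. A minimal shape check with assumed sizes:
```
import torch
import torch.nn as nn

hidden, seq = 6, 5
conv = nn.Conv1d(hidden, 2 * hidden, kernel_size=3, stride=2, padding=1)

x = torch.randn(1, hidden, seq)          # (batch, channels, time)
y = nn.functional.glu(conv(x), dim=1)    # gating halves the channel dimension again
print(conv(x).shape, y.shape)            # torch.Size([1, 12, 3]) torch.Size([1, 6, 3])
```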
class Data2VecAudioPreTrainedModel(PreTrainedModel):
"""
一个抽象类,处理权重初始化以及下载和加载预训练模型的简单接口。
"""
# 使用Data2VecAudioConfig作为配置类
config_class = Data2VecAudioConfig
# 模型的基本名称前缀
base_model_prefix = "data2vec_audio"
# 主要输入名称
main_input_name = "input_values"
# 支持梯度检查点
supports_gradient_checkpointing = True
# 初始化模型权重的方法,根据模块类型不同采取不同的初始化方式
def _init_weights(self, module):
"""Initialize the weights"""
# 如果模块是 Data2VecAudioFeatureProjection 类型
if isinstance(module, Data2VecAudioFeatureProjection):
# 计算均匀分布的上下界 k
k = math.sqrt(1 / module.projection.in_features)
# 对投影层的权重进行均匀初始化
nn.init.uniform_(module.projection.weight, a=-k, b=k)
# 对投影层的偏置进行均匀初始化
nn.init.uniform_(module.projection.bias, a=-k, b=k)
# 如果模块是 Data2VecAudioPositionalConvLayer 类型
elif isinstance(module, Data2VecAudioPositionalConvLayer):
# 对卷积层的偏置进行常数初始化(设置为0)
nn.init.constant_(module.conv.bias, 0)
# 如果模块是 nn.Linear 类型
elif isinstance(module, nn.Linear):
# 对线性层的权重进行正态分布初始化
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
# 如果有偏置项,则将偏置项初始化为零
if module.bias is not None:
module.bias.data.zero_()
# 如果模块是 nn.LayerNorm 或 nn.GroupNorm 类型
elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)):
# 如果有偏置项,则将偏置项初始化为零
if module.bias is not None:
module.bias.data.zero_()
# 如果有权重项,则将权重项初始化为全1
if module.weight is not None:
module.weight.data.fill_(1.0)
# 如果模块是 nn.Conv1d 类型
elif isinstance(module, nn.Conv1d):
# 对卷积层的权重进行 Kaiming 正态分布初始化
nn.init.kaiming_normal_(module.weight)
# 如果有偏置项,则计算均匀分布的上下界 k 并进行均匀初始化
if module.bias is not None:
k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0]))
nn.init.uniform_(module.bias, a=-k, b=k)
# 从 Wav2Vec2PreTrainedModel 类的方法 _get_feat_extract_output_lengths 复制而来
def _get_feat_extract_output_lengths(
self, input_lengths: Union[torch.LongTensor, int], add_adapter: Optional[bool] = None
):
"""
Computes the output length of the convolutional layers
"""
# 根据需要是否添加适配器的标志
add_adapter = self.config.add_adapter if add_adapter is None else add_adapter
# 定义计算卷积层输出长度的内部函数
def _conv_out_length(input_length, kernel_size, stride):
# 使用 PyTorch 文档中描述的公式计算 1D 卷积层输出长度
return torch.div(input_length - kernel_size, stride, rounding_mode="floor") + 1
# 根据配置中的卷积核大小和步长循环计算输出长度
for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride):
input_lengths = _conv_out_length(input_lengths, kernel_size, stride)
# 如果需要添加适配器,根据配置中的适配器层数循环计算输出长度
if add_adapter:
for _ in range(self.config.num_adapter_layers):
input_lengths = _conv_out_length(input_lengths, 1, self.config.adapter_stride)
return input_lengths
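As a standalone illustration of the same formula: assuming wav2vec2-style feature-extractor defaults (`conv_kernel = (10, 3, 3, 3, 3, 2, 2)`, `conv_stride = (5, 2, 2, 2, 2, 2, 2)`; these values are an assumption here, not read from the code above), one second of 16 kHz audio is reduced to 49 frames.
```
def conv_out_length(input_length: int, kernel_size: int, stride: int) -> int:
    # same formula as in _get_feat_extract_output_lengths, 1D conv without padding
    return (input_length - kernel_size) // stride + 1

conv_kernel = (10, 3, 3, 3, 3, 2, 2)   # assumed default kernels
conv_stride = (5, 2, 2, 2, 2, 2, 2)    # assumed default strides

length = 16_000                        # one second of 16 kHz audio
for k, s in zip(conv_kernel, conv_stride):
    length = conv_out_length(length, k, s)
print(length)                          # 49
```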
# 从 Wav2Vec2PreTrainedModel 类的方法 _get_feature_vector_attention_mask 复制而来
def _get_feature_vector_attention_mask(
self, feature_vector_length: int, attention_mask: torch.LongTensor, add_adapter=None
):
"""
Computes the attention mask for the feature vector
"""
# 计算未填充部分的长度,即 attention_mask.sum(-1),但不是原地操作以便在推理模式下运行。
non_padded_lengths = attention_mask.cumsum(dim=-1)[:, -1]
# 根据非填充长度获取特征提取器的输出长度,可选择添加适配器
output_lengths = self._get_feat_extract_output_lengths(non_padded_lengths, add_adapter=add_adapter)
output_lengths = output_lengths.to(torch.long)
# 获取批次大小
batch_size = attention_mask.shape[0]
# 重新初始化 attention_mask,确保所有值在输出长度之前都是被注意的
attention_mask = torch.zeros(
(batch_size, feature_vector_length), dtype=attention_mask.dtype, device=attention_mask.device
)
# 在输出长度的索引之前的所有位置设置为1,以确保这些位置被注意到
attention_mask[(torch.arange(attention_mask.shape[0], device=attention_mask.device), output_lengths - 1)] = 1
# 反转 attention_mask,累积求和,再次反转,并转换为布尔类型
attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).bool()
# 返回处理后的 attention_mask
return attention_mask
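The flip/cumsum/flip trick turns a single 1 written at index `length - 1` into a mask that is True for every position up to and including that index. A toy illustration:
```
import torch

feature_vector_length = 6
output_lengths = torch.tensor([4, 2])                 # valid frames per batch element

mask = torch.zeros((2, feature_vector_length), dtype=torch.long)
mask[torch.arange(2), output_lengths - 1] = 1         # mark the last valid frame
mask = mask.flip([-1]).cumsum(-1).flip([-1]).bool()   # everything up to that frame becomes True
print(mask.int())
# tensor([[1, 1, 1, 1, 0, 0],
#         [1, 1, 0, 0, 0, 0]])
```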
# DATA2VEC_AUDIO_START_DOCSTRING 的值是一个多行字符串,用于说明 Data2VecAudio 模型的背景和基本信息
DATA2VEC_AUDIO_START_DOCSTRING = r"""
Data2VecAudio was proposed in [data2vec: A General Framework for Self-supervised Learning in Speech, Vision and
Language](https://arxiv.org/pdf/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu and
Michael Auli.
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving etc.).
This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html) sub-class. Use it as a regular
PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
Parameters:
config ([`Data2VecAudioConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
# DATA2VEC_AUDIO_INPUTS_DOCSTRING documents the inputs shared by the forward methods of the Data2VecAudio models
DATA2VEC_AUDIO_INPUTS_DOCSTRING = r"""
Args:
input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
Float values of the raw speech waveform. Values can be obtained by loading a *.flac* or *.wav* audio file
into an array of type *List[float]* or *numpy.ndarray*, e.g. via the soundfile library (*pip install
soundfile*). To prepare the array into *input_values*, the [`AutoProcessor`] should be used for padding and
conversion into a tensor of type *torch.FloatTensor*. See [`Wav2Vec2Processor.__call__`] for details.
attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing convolution and attention on padding token indices. Mask values selected in `[0, 1]`:

- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.

[What are attention masks?](../glossary#attention-mask)

<Tip warning={true}>

`attention_mask` should be passed if the corresponding processor has `config.return_attention_mask == True`,
which is the case for all pre-trained Data2Vec Audio models. Be aware that, even with `attention_mask`,
zero-padded inputs will have slightly different outputs compared to non-padded inputs because there are
several convolutional layers in the positional encodings. For a more detailed explanation, see
[here](https://github.com/huggingface/transformers/issues/25621#issuecomment-1713759349).

</Tip>

output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@add_start_docstrings(
"The bare Data2VecAudio Model transformer outputting raw hidden-states without any specific head on top.",
DATA2VEC_AUDIO_START_DOCSTRING,
)
# 定义 Data2VecAudioModel 类,继承自 Data2VecAudioPreTrainedModel 类,表示数据向量音频模型
class Data2VecAudioModel(Data2VecAudioPreTrainedModel):
def __init__(self, config: Data2VecAudioConfig):
# 调用父类构造函数,初始化模型配置
super().__init__(config)
# 保存模型配置
self.config = config
# 创建音频特征提取器对象
self.feature_extractor = Data2VecAudioFeatureEncoder(config)
# 创建音频特征投影对象
self.feature_projection = Data2VecAudioFeatureProjection(config)
# 如果配置中的掩码时间概率大于 0 或者掩码特征概率大于 0,则需要掩码特征嵌入
if config.mask_time_prob > 0.0 or config.mask_feature_prob > 0.0:
# 初始化掩码特征嵌入参数
self.masked_spec_embed = nn.Parameter(torch.FloatTensor(config.hidden_size).uniform_())
# 创建音频编码器对象
self.encoder = Data2VecAudioEncoder(config)
# 如果配置中包含适配器,则创建适配器对象;否则适配器对象为 None
self.adapter = Data2VecAudioAdapter(config) if config.add_adapter else None
# 初始化权重并应用最终处理
self.post_init()
# 冻结特征编码器函数,禁止特征编码器的梯度计算,以防止其在训练过程中更新参数
def freeze_feature_encoder(self):
"""
Calling this function will disable the gradient computation for the feature encoder so that its parameter will
not be updated during training.
"""
self.feature_extractor._freeze_parameters()
# Helper that masks the hidden states along the time and/or feature axis (SpecAugment)
def _mask_hidden_states(
self,
hidden_states: torch.FloatTensor,
mask_time_indices: Optional[torch.FloatTensor] = None,
attention_mask: Optional[torch.LongTensor] = None,
):
"""
Masks extracted features along time axis and/or along feature axis according to
[SpecAugment](https://arxiv.org/abs/1904.08779).
"""
# `config.apply_spec_augment` can set masking to False
if not getattr(self.config, "apply_spec_augment", True):
return hidden_states
# generate indices & apply SpecAugment along time axis
batch_size, sequence_length, hidden_size = hidden_states.size()
if mask_time_indices is not None:
# apply SpecAugment along time axis with given mask_time_indices
hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
elif self.config.mask_time_prob > 0 and self.training:
# Calculate mask indices for time axis based on configuration parameters
mask_time_indices = _compute_mask_indices(
(batch_size, sequence_length),
mask_prob=self.config.mask_time_prob,
mask_length=self.config.mask_time_length,
attention_mask=attention_mask,
min_masks=self.config.mask_time_min_masks,
)
mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool)
hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
if self.config.mask_feature_prob > 0 and self.training:
# generate indices & apply SpecAugment along feature axis
mask_feature_indices = _compute_mask_indices(
(batch_size, hidden_size),
mask_prob=self.config.mask_feature_prob,
mask_length=self.config.mask_feature_length,
min_masks=self.config.mask_feature_min_masks,
)
mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool)
mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1)
hidden_states[mask_feature_indices] = 0
return hidden_states
@add_start_docstrings_to_model_forward(DATA2VEC_AUDIO_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=Wav2Vec2BaseModelOutput,
config_class=_CONFIG_FOR_DOC,
modality="audio",
expected_output=_EXPECTED_OUTPUT_SHAPE,
)
def forward(
self,
input_values: Optional[torch.Tensor],
attention_mask: Optional[torch.Tensor] = None,
mask_time_indices: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, Wav2Vec2BaseModelOutput]:
# 如果未指定output_attentions,则使用配置中的默认设置
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
# 如果未指定output_hidden_states,则使用配置中的默认设置
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
# 如果未指定return_dict,则使用配置中的默认设置
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# 提取输入特征向量
extract_features = self.feature_extractor(input_values)
# 调整特征向量的维度顺序
extract_features = extract_features.transpose(1, 2)
if attention_mask is not None:
# 计算适应特征向量的减少后的attention_mask
attention_mask = self._get_feature_vector_attention_mask(
extract_features.shape[1], attention_mask, add_adapter=False
)
# 对特征向量进行特征投影
hidden_states, extract_features = self.feature_projection(extract_features)
# 在计算中对隐藏状态进行屏蔽处理
hidden_states = self._mask_hidden_states(
hidden_states, mask_time_indices=mask_time_indices, attention_mask=attention_mask
)
# 编码器的输出
encoder_outputs = self.encoder(
hidden_states,
attention_mask=attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# 获取编码器的最后隐藏状态
hidden_states = encoder_outputs[0]
# 如果存在适配器,应用适配器
if self.adapter is not None:
hidden_states = self.adapter(hidden_states)
# 如果不返回字典,则返回元组形式的输出
if not return_dict:
return (hidden_states, extract_features) + encoder_outputs[1:]
# 如果返回字典,则创建Wav2Vec2BaseModelOutput对象并返回
return Wav2Vec2BaseModelOutput(
last_hidden_state=hidden_states,
extract_features=extract_features,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
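A minimal usage sketch of the bare model; the checkpoint name `facebook/data2vec-audio-base-960h` and the random one-second input are assumptions made for illustration:
```
import torch
from transformers import AutoProcessor, Data2VecAudioModel

processor = AutoProcessor.from_pretrained("facebook/data2vec-audio-base-960h")  # assumed checkpoint
model = Data2VecAudioModel.from_pretrained("facebook/data2vec-audio-base-960h")

waveform = torch.randn(16_000)  # one second of fake 16 kHz audio
inputs = processor(waveform.numpy(), sampling_rate=16_000, return_tensors="pt")

with torch.no_grad():
    outputs = model(**inputs)
print(outputs.last_hidden_state.shape)  # (1, ~49, hidden_size)
```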
"""
Data2VecAudio Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).
Inherited documentation and configuration are added from `DATA2VEC_AUDIO_START_DOCSTRING`.
Args:
config (:class:`~transformers.Data2VecAudioConfig`):
The model configuration class that specifies the model's architecture and parameters.
Attributes:
data2vec_audio (:class:`~transformers.Data2VecAudioModel`):
The base Data2VecAudioModel instance.
dropout (:obj:`torch.nn.Dropout`):
Dropout module with a dropout probability as specified in `config.final_dropout`.
lm_head (:obj:`torch.nn.Linear`):
The linear layer for the language modeling head with output size `config.vocab_size`.
Raises:
ValueError: If `config.vocab_size` is not defined in the model configuration.
Notes:
This class extends `Data2VecAudioPreTrainedModel` and adds a language modeling head on top for CTC.
"""
def __init__(self, config):
super().__init__(config)
# Initialize base Data2VecAudioModel and dropout layer
self.data2vec_audio = Data2VecAudioModel(config)
self.dropout = nn.Dropout(config.final_dropout)
# Check if vocab_size is defined in the configuration
if config.vocab_size is None:
raise ValueError(
f"You are trying to instantiate {self.__class__} with a configuration that "
"does not define the vocabulary size of the language model head. Please "
"instantiate the model as follows: `Data2VecAudioForCTC.from_pretrained(..., vocab_size=vocab_size)`. "
"or define `vocab_size` of your model's configuration."
)
# Determine the output size of the linear layer based on model configuration
output_hidden_size = (
config.output_hidden_size if hasattr(config, "add_adapter") and config.add_adapter else config.hidden_size
)
self.lm_head = nn.Linear(output_hidden_size, config.vocab_size)
# Initialize weights and apply final processing
self.post_init()
def freeze_feature_extractor(self):
"""
Calling this function will disable the gradient computation for the feature encoder so that its parameter will
not be updated during training.
Deprecated:
The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5.
Please use the equivalent `freeze_feature_encoder` method instead.
"""
warnings.warn(
"The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
"Please use the equivalent `freeze_feature_encoder` method instead.",
FutureWarning,
)
self.freeze_feature_encoder()
def freeze_feature_encoder(self):
"""
Calling this function will disable the gradient computation for the feature encoder so that its parameter will
not be updated during training.
"""
self.data2vec_audio.feature_extractor._freeze_parameters()
@add_start_docstrings_to_model_forward(DATA2VEC_AUDIO_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=CausalLMOutput,
config_class=_CONFIG_FOR_DOC,
expected_output=_CTC_EXPECTED_OUTPUT,
expected_loss=_CTC_EXPECTED_LOSS,
)
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForCTC.forward with wav2vec2->data2vec_audio
def forward(
self,
input_values: Optional[torch.Tensor],
attention_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: Optional[torch.Tensor] = None,
"""
Forward pass of the Data2VecAudioForCTC model.
Args:
input_values (torch.Tensor, optional):
Input tensor of shape `(batch_size, sequence_length, feature_dim)` containing audio features.
attention_mask (torch.Tensor, optional):
Mask to avoid performing attention on padding tokens.
output_attentions (bool, optional):
Whether to return attentions weights of all attention layers.
output_hidden_states (bool, optional):
Whether to return hidden states of all layers.
return_dict (bool, optional):
Whether to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
labels (torch.Tensor, optional):
Labels for computing the CTC loss.
Returns:
:class:`~transformers.modeling_outputs.CausalLMOutput`: A :class:`~transformers.modeling_outputs.CausalLMOutput` containing:
- loss (`torch.FloatTensor`, optional):
CTC loss if :obj:`labels` is provided.
- logits (`torch.FloatTensor`):
The logits output tensor of the language modeling head.
Examples:
For examples on usage, please see the documentation and code samples provided.
Warnings:
This method is adapted from `transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForCTC.forward`
with modifications for Data2VecAudio models.
"""
) -> Union[Tuple, CausalLMOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to
the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`.
All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
config.vocab_size - 1]`.
"""
# Decide whether to return results as a dictionary based on the provided argument or configuration
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Convert input audio features into vector representations
outputs = self.data2vec_audio(
input_values,
attention_mask=attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# Extract hidden states and apply dropout regularization
hidden_states = outputs[0]
hidden_states = self.dropout(hidden_states)
# Generate logits from the language model head
logits = self.lm_head(hidden_states)
# Initialize loss as None
loss = None
if labels is not None:
# Check if any label index exceeds the vocabulary size, which is invalid
if labels.max() >= self.config.vocab_size:
raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}")
# Retrieve the lengths of input features using attention mask
attention_mask = (
attention_mask if attention_mask is not None else torch.ones_like(input_values, dtype=torch.long)
)
input_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1)).to(torch.long)
# Identify valid labels by creating a mask and calculate their lengths
labels_mask = labels >= 0
target_lengths = labels_mask.sum(-1)
flattened_targets = labels.masked_select(labels_mask)
# Apply log softmax to logits and transpose for CTC loss calculation
log_probs = nn.functional.log_softmax(logits, dim=-1, dtype=torch.float32).transpose(0, 1)
# Disable CuDNN optimizations to ensure reproducibility in loss calculation
with torch.backends.cudnn.flags(enabled=False):
# Compute CTC loss using provided parameters
loss = nn.functional.ctc_loss(
log_probs,
flattened_targets,
input_lengths,
target_lengths,
blank=self.config.pad_token_id,
reduction=self.config.ctc_loss_reduction,
zero_infinity=self.config.ctc_zero_infinity,
)
# If return_dict is False, return outputs as a tuple
if not return_dict:
output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
return ((loss,) + output) if loss is not None else output
# If return_dict is True, return outputs wrapped in CausalLMOutput
return CausalLMOutput(
loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions
)
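An end-to-end inference sketch for the CTC head; the checkpoint name and the random input are illustrative assumptions, and real usage would feed an actual 16 kHz recording:
```
import torch
from transformers import AutoProcessor, Data2VecAudioForCTC

processor = AutoProcessor.from_pretrained("facebook/data2vec-audio-base-960h")  # assumed checkpoint
model = Data2VecAudioForCTC.from_pretrained("facebook/data2vec-audio-base-960h")

speech = torch.randn(16_000).numpy()  # placeholder for a real 16 kHz waveform
inputs = processor(speech, sampling_rate=16_000, return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs).logits                 # (batch, frames, vocab_size)

predicted_ids = torch.argmax(logits, dim=-1)        # greedy CTC decoding
transcription = processor.batch_decode(predicted_ids)
print(transcription)
```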
@add_start_docstrings(
"""
Data2VecAudio Model with a sequence classification head on top (a linear layer over the pooled output) for tasks
like SUPERB Keyword Spotting.
""",
DATA2VEC_AUDIO_START_DOCSTRING,
)
class Data2VecAudioForSequenceClassification(Data2VecAudioPreTrainedModel):
def __init__(self, config):
super().__init__(config)
# 检查配置中是否存在并且允许使用适配器,如果是,则引发值错误
if hasattr(config, "add_adapter") and config.add_adapter:
raise ValueError(
"Sequence classification does not support the use of Data2VecAudio adapters (config.add_adapter=True)"
)
# 初始化 Data2VecAudioModel 模型
self.data2vec_audio = Data2VecAudioModel(config)
# 计算层数:transformer 层数加上输入嵌入层
num_layers = config.num_hidden_layers + 1
# 如果配置中使用加权层求和,则初始化层权重参数
if config.use_weighted_layer_sum:
self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers)
# 线性投影层,将隐藏状态映射到分类器投影尺寸
self.projector = nn.Linear(config.hidden_size, config.classifier_proj_size)
# 分类器层,将投影后的特征映射到类别数量
self.classifier = nn.Linear(config.classifier_proj_size, config.num_labels)
# 初始化权重并进行后处理
self.post_init()
def freeze_feature_extractor(self):
"""
Calling this function will disable the gradient computation for the feature encoder so that its parameters will
not be updated during training.
"""
# 发出警告,方法即将弃用,请改用 `freeze_feature_encoder`
warnings.warn(
"The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
"Please use the equivalent `freeze_feature_encoder` method instead.",
FutureWarning,
)
# 调用 `freeze_feature_encoder` 方法冻结特征编码器的参数
self.freeze_feature_encoder()
def freeze_feature_encoder(self):
"""
Calling this function will disable the gradient computation for the feature encoder so that its parameter will
not be updated during training.
"""
# 冻结特征编码器的参数,禁止其梯度计算
self.data2vec_audio.feature_extractor._freeze_parameters()
def freeze_base_model(self):
"""
Calling this function will disable the gradient computation for the base model so that its parameters will not
be updated during training. Only the classification head will be updated.
"""
# 冻结基础模型的参数,禁止其梯度计算,只有分类头部会被更新
for param in self.data2vec_audio.parameters():
param.requires_grad = False
@add_start_docstrings_to_model_forward(DATA2VEC_AUDIO_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=SequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
modality="audio",
)
# 从 transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForSequenceClassification.forward 复制,并将 wav2vec2 改为 data2vec_audio
def forward(
self,
input_values: Optional[torch.Tensor], # 输入的张量数据,可选
attention_mask: Optional[torch.Tensor] = None, # 注意力掩码张量,可选
output_attentions: Optional[bool] = None, # 是否输出注意力权重,可选
output_hidden_states: Optional[bool] = None, # 是否输出隐藏状态,可选
return_dict: Optional[bool] = None, # 是否返回字典形式的输出,可选
labels: Optional[torch.Tensor] = None, # 分类/回归任务的标签张量,可选
) -> Union[Tuple, SequenceClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict # 决定是否返回字典格式的输出,根据传入的参数或配置决定
output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states # 如果配置指定使用加权层求和,则强制输出隐藏状态
outputs = self.data2vec_audio(
input_values,
attention_mask=attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
if self.config.use_weighted_layer_sum:
hidden_states = outputs[_HIDDEN_STATES_START_POSITION] # 获取输出中的隐藏状态
hidden_states = torch.stack(hidden_states, dim=1) # 在指定维度上堆叠隐藏状态张量
norm_weights = nn.functional.softmax(self.layer_weights, dim=-1) # 对层权重进行softmax归一化
hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1) # 使用加权求和隐藏状态
else:
hidden_states = outputs[0] # 获取普通的隐藏状态输出
hidden_states = self.projector(hidden_states) # 投影隐藏状态
if attention_mask is None:
pooled_output = hidden_states.mean(dim=1) # 如果没有注意力掩码,则对隐藏状态进行平均池化
else:
padding_mask = self._get_feature_vector_attention_mask(hidden_states.shape[1], attention_mask) # 根据注意力掩码生成填充掩码
hidden_states[~padding_mask] = 0.0 # 使用填充掩码将非填充位置置零
pooled_output = hidden_states.sum(dim=1) / padding_mask.sum(dim=1).view(-1, 1) # 对填充位置求和并进行平均池化
logits = self.classifier(pooled_output) # 使用分类器得到logits
loss = None
if labels is not None:
loss_fct = CrossEntropyLoss() # 使用交叉熵损失函数
loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1)) # 计算损失
if not return_dict:
output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:] # 如果不返回字典,则组合输出
return ((loss,) + output) if loss is not None else output # 返回包含损失的输出或者普通输出
return SequenceClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
) # 返回字典格式的分类器输出
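The weighted layer-sum branch learns one softmax-normalized weight per layer and mixes all hidden states before pooling. A standalone sketch with toy shapes (a plain mean pooling stands in for the masked pooling above):
```
import torch
import torch.nn as nn

num_layers, batch, seq, hidden = 13, 2, 50, 32
# stand-in for the tuple of per-layer hidden states returned with output_hidden_states=True
hidden_states = torch.stack([torch.randn(batch, seq, hidden) for _ in range(num_layers)], dim=1)

layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers)
norm_weights = torch.softmax(layer_weights, dim=-1)
mixed = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1)   # (batch, seq, hidden)
pooled = mixed.mean(dim=1)                                          # simple mean pooling (no padding here)
print(mixed.shape, pooled.shape)
```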
@add_start_docstrings(
"""
Data2VecAudio Model with a frame classification head on top for tasks like Speaker Diarization.
""",
DATA2VEC_AUDIO_START_DOCSTRING,
)
class Data2VecAudioForAudioFrameClassification(Data2VecAudioPreTrainedModel):
def __init__(self, config):
super().__init__(config)
if hasattr(config, "add_adapter") and config.add_adapter:
raise ValueError(
"Audio frame classification does not support the use of Data2VecAudio adapters"
" (config.add_adapter=True)"
)
# 初始化 Data2VecAudioModel,并设置层数
self.data2vec_audio = Data2VecAudioModel(config)
num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings
# 如果设置了使用加权层求和,则初始化层权重
if config.use_weighted_layer_sum:
self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers)
# 初始化分类器线性层
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.num_labels = config.num_labels
# 初始化模型权重
self.init_weights()
def freeze_feature_extractor(self):
"""
Calling this function will disable the gradient computation for the feature encoder so that its parameter will
not be updated during training.
"""
warnings.warn(
"The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
"Please use the equivalent `freeze_feature_encoder` method instead.",
FutureWarning,
)
# 调用 freeze_feature_encoder 方法来冻结特征编码器的参数
self.freeze_feature_encoder()
def freeze_feature_encoder(self):
"""
Calling this function will disable the gradient computation for the feature encoder so that its parameter will
not be updated during training.
"""
# 冻结特征编码器的参数,使其在训练过程中不会更新
self.data2vec_audio.feature_extractor._freeze_parameters()
def freeze_base_model(self):
"""
Calling this function will disable the gradient computation for the base model so that its parameters will not
be updated during training. Only the classification head will be updated.
"""
# 冻结基础模型的参数,使其在训练过程中不会更新,只有分类头会更新
for param in self.data2vec_audio.parameters():
param.requires_grad = False
@add_start_docstrings_to_model_forward(DATA2VEC_AUDIO_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TokenClassifierOutput,
config_class=_CONFIG_FOR_DOC,
modality="audio",
)
# 从 transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForAudioFrameClassification.forward 复制,将 wav2vec2 替换为 data2vec_audio
def forward(
self,
input_values: Optional[torch.Tensor],
attention_mask: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
**kwargs,
) -> Union[Tuple, TokenClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
# 初始化是否返回字典,默认为配置中的设置
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# 根据权重层求和的配置决定是否输出隐藏状态
output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states
# 调用data2vec_audio方法处理音频输入,获取输出
outputs = self.data2vec_audio(
input_values,
attention_mask=attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# 如果配置为使用加权层求和,则对隐藏状态进行加权求和处理
if self.config.use_weighted_layer_sum:
hidden_states = outputs[_HIDDEN_STATES_START_POSITION] # 获取隐藏状态的起始位置
hidden_states = torch.stack(hidden_states, dim=1) # 在指定维度上堆叠隐藏状态张量
norm_weights = nn.functional.softmax(self.layer_weights, dim=-1) # 对层权重进行softmax归一化
hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1) # 加权求和隐藏状态
else:
hidden_states = outputs[0] # 否则直接使用第一个输出作为隐藏状态
logits = self.classifier(hidden_states) # 使用分类器对隐藏状态进行分类预测
loss = None
if labels is not None:
loss_fct = CrossEntropyLoss() # 使用交叉熵损失函数
# 计算分类器的损失,labels需要转换成合适的形状和类型
loss = loss_fct(logits.view(-1, self.num_labels), torch.argmax(labels.view(-1, self.num_labels), axis=1))
# 如果不返回字典形式的输出,则返回元组形式的输出
if not return_dict:
output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:] # 组合输出结果
return output
# 返回TokenClassifierOutput对象,包含损失、logits、隐藏状态和注意力
return TokenClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.AMSoftmaxLoss
# 定义了一个 AMSoftmaxLoss 类,用于计算带有 AM-Softmax 的损失函数
class AMSoftmaxLoss(nn.Module):
def __init__(self, input_dim, num_labels, scale=30.0, margin=0.4):
super(AMSoftmaxLoss, self).__init__()
self.scale = scale # 缩放因子,用于调整角度余弦值的范围
self.margin = margin # AM-Softmax 中的 margin 参数
self.num_labels = num_labels # 标签的数量
self.weight = nn.Parameter(torch.randn(input_dim, num_labels), requires_grad=True) # 权重矩阵,初始化为随机值
self.loss = nn.CrossEntropyLoss() # 使用交叉熵作为损失函数
def forward(self, hidden_states, labels):
labels = labels.flatten() # 将标签展平,以便计算损失
weight = nn.functional.normalize(self.weight, dim=0) # 对权重进行 L2 归一化
hidden_states = nn.functional.normalize(hidden_states, dim=1) # 对隐藏状态进行 L2 归一化
cos_theta = torch.mm(hidden_states, weight) # 计算余弦相似度
psi = cos_theta - self.margin # 计算带有 margin 的调整值
onehot = nn.functional.one_hot(labels, self.num_labels) # 将标签转为 one-hot 格式
logits = self.scale * torch.where(onehot.bool(), psi, cos_theta) # 根据标签应用 AM-Softmax 运算得到最终 logits
loss = self.loss(logits, labels) # 计算损失
return loss
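A small numeric sketch of the AM-Softmax margin trick: the cosine similarity of the target class is reduced by `margin` before scaling, which tightens the decision boundary. The numbers are made up for illustration:
```
import torch
import torch.nn as nn

scale, margin = 30.0, 0.4
cos_theta = torch.tensor([[0.7, 0.5, 0.1]])   # cosine similarity to 3 class centers
labels = torch.tensor([0])                    # the true class

psi = cos_theta - margin                      # penalized similarity for the target class
onehot = nn.functional.one_hot(labels, num_classes=3).bool()
logits = scale * torch.where(onehot, psi, cos_theta)
print(logits)                                 # tensor([[ 9., 15.,  3.]]) -> margin shrinks the target logit
loss = nn.CrossEntropyLoss()(logits, labels)
print(loss)
```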
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.TDNNLayer
# 定义了 TDNNLayer 类,用于实现时延神经网络层
class TDNNLayer(nn.Module):
def __init__(self, config, layer_id=0):
super().__init__()
self.in_conv_dim = config.tdnn_dim[layer_id - 1] if layer_id > 0 else config.tdnn_dim[layer_id] # 输入通道维度
self.out_conv_dim = config.tdnn_dim[layer_id] # 输出通道维度
self.kernel_size = config.tdnn_kernel[layer_id] # 卷积核大小
self.dilation = config.tdnn_dilation[layer_id] # 空洞卷积的扩展率
self.kernel = nn.Linear(self.in_conv_dim * self.kernel_size, self.out_conv_dim) # 使用线性层作为卷积核
self.activation = nn.ReLU() # 激活函数为 ReLU
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
if is_peft_available():
from peft.tuners.lora import LoraLayer
if isinstance(self.kernel, LoraLayer):
warnings.warn(
"Detected LoRA on TDNNLayer. LoRA weights won't be applied due to optimization. "
"You should exclude TDNNLayer from LoRA's target modules.",
)
# for backward compatibility, we keep nn.Linear but call F.conv1d for speed up
hidden_states = hidden_states.transpose(1, 2) # 转置张量以适应卷积操作的输入要求
weight = self.kernel.weight.view(self.out_conv_dim, self.kernel_size, self.in_conv_dim).transpose(1, 2) # 调整权重形状
hidden_states = nn.functional.conv1d(hidden_states, weight, self.kernel.bias, dilation=self.dilation) # 执行卷积操作
hidden_states = hidden_states.transpose(1, 2) # 还原张量形状
hidden_states = self.activation(hidden_states) # 应用 ReLU 激活函数
return hidden_states
@add_start_docstrings(
"""
Data2VecAudio Model with an XVector feature extraction head on top for tasks like Speaker Verification.
""",
DATA2VEC_AUDIO_START_DOCSTRING,
)
# 定义了 Data2VecAudioForXVector 类,扩展自 Data2VecAudioPreTrainedModel,用于 XVector 特征提取
class Data2VecAudioForXVector(Data2VecAudioPreTrainedModel):
def __init__(self, config):
# 调用父类的初始化方法,传递配置参数
super().__init__(config)
# 创建音频数据转换模型对象
self.data2vec_audio = Data2VecAudioModel(config)
# 计算层数,包括Transformer层和输入嵌入层
num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings
# 如果配置指定使用加权层求和,则初始化层权重参数
if config.use_weighted_layer_sum:
self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers)
# 创建线性映射层,将隐藏状态映射到TDNN的输入维度
self.projector = nn.Linear(config.hidden_size, config.tdnn_dim[0])
# 创建多个TDNN层的列表
tdnn_layers = [TDNNLayer(config, i) for i in range(len(config.tdnn_dim))]
self.tdnn = nn.ModuleList(tdnn_layers)
# 创建特征提取器的线性层,用于生成x-vector的输出维度
self.feature_extractor = nn.Linear(config.tdnn_dim[-1] * 2, config.xvector_output_dim)
# 创建分类器的线性层,将x-vector映射到分类数目的维度
self.classifier = nn.Linear(config.xvector_output_dim, config.xvector_output_dim)
# 初始化AMSoftmax损失函数,使用指定的输出维度和类别数目
self.objective = AMSoftmaxLoss(config.xvector_output_dim, config.num_labels)
# 执行权重初始化的函数
self.init_weights()
def freeze_feature_extractor(self):
"""
Calling this function will disable the gradient computation for the feature encoder so that its parameter will
not be updated during training.
"""
# 发出警告,表明此函数将被弃用,建议使用freeze_feature_encoder代替
warnings.warn(
"The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
"Please use the equivalent `freeze_feature_encoder` method instead.",
FutureWarning,
)
# 调用freeze_feature_encoder方法,冻结特征编码器的参数
self.freeze_feature_encoder()
def freeze_feature_encoder(self):
"""
Calling this function will disable the gradient computation for the feature encoder so that its parameter will
not be updated during training.
"""
# 冻结特征编码器的参数,使其在训练期间不会更新梯度
self.data2vec_audio.feature_extractor._freeze_parameters()
def freeze_base_model(self):
"""
Calling this function will disable the gradient computation for the base model so that its parameters will not
be updated during training. Only the classification head will be updated.
"""
# 遍历数据转换模型的所有参数,并将requires_grad设置为False,以禁用它们的梯度计算
for param in self.data2vec_audio.parameters():
param.requires_grad = False
def _get_tdnn_output_lengths(self, input_lengths: Union[torch.LongTensor, int]):
"""
Computes the output length of the TDNN layers
"""
def _conv_out_length(input_length, kernel_size, stride):
# 从PyTorch文档中获取的1D卷积层输出长度的计算公式
return (input_length - kernel_size) // stride + 1
# 计算每个TDNN层的输出长度
for kernel_size in self.config.tdnn_kernel:
input_lengths = _conv_out_length(input_lengths, kernel_size, 1)
return input_lengths
@add_start_docstrings_to_model_forward(DATA2VEC_AUDIO_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=XVectorOutput,
config_class=_CONFIG_FOR_DOC,
modality="audio",
)
# 从transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForXVector.forward复制而来,修改为使用data2vec_audio
# 定义一个前向传播方法,用于模型推断或训练时的正向处理
def forward(
self,
# 输入数据张量,可以为空
input_values: Optional[torch.Tensor],
# 注意力掩码张量,用于指定模型关注的部分,可以为空
attention_mask: Optional[torch.Tensor] = None,
# 是否输出注意力权重,可选参数,默认为空
output_attentions: Optional[bool] = None,
# 是否输出隐藏状态,可选参数,默认为空
output_hidden_states: Optional[bool] = None,
# 是否返回结果字典形式,可选参数,默认为空
return_dict: Optional[bool] = None,
# 标签数据张量,用于训练时指定真实标签,可以为空
labels: Optional[torch.Tensor] = None,
) -> Union[Tuple, XVectorOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
# 初始化 return_dict,如果未提供则使用 self.config.use_return_dict
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# 如果配置中指定了使用加权层求和的隐藏状态,则设置 output_hidden_states 为 True
output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states
# 将输入数据通过 data2vec_audio 方法处理,获取模型的输出
outputs = self.data2vec_audio(
input_values,
attention_mask=attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# 如果配置中使用了加权层求和的隐藏状态,对隐藏状态进行加权求和操作
if self.config.use_weighted_layer_sum:
hidden_states = outputs[_HIDDEN_STATES_START_POSITION]
hidden_states = torch.stack(hidden_states, dim=1)
norm_weights = nn.functional.softmax(self.layer_weights, dim=-1)
hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1)
else:
# 否则直接取模型输出的第一个元素作为隐藏状态
hidden_states = outputs[0]
# 将隐藏状态通过 projector 方法进行投影处理
hidden_states = self.projector(hidden_states)
# 对每个 TDNN 层依次处理隐藏状态
for tdnn_layer in self.tdnn:
hidden_states = tdnn_layer(hidden_states)
# 统计池化操作
if attention_mask is None:
# 如果没有提供 attention_mask,则对隐藏状态在第一维度(batch 维度)上进行均值和标准差计算
mean_features = hidden_states.mean(dim=1)
std_features = hidden_states.std(dim=1)
else:
# 根据 attention_mask 计算特征提取的输出长度和 TDNN 层的输出长度
feat_extract_output_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(dim=1))
tdnn_output_lengths = self._get_tdnn_output_lengths(feat_extract_output_lengths)
mean_features = []
std_features = []
# 对每个序列长度进行遍历,计算每个序列的均值和标准差
for i, length in enumerate(tdnn_output_lengths):
mean_features.append(hidden_states[i, :length].mean(dim=0))
std_features.append(hidden_states[i, :length].std(dim=0))
mean_features = torch.stack(mean_features)
std_features = torch.stack(std_features)
# 将均值和标准差拼接在一起作为统计池化的结果
statistic_pooling = torch.cat([mean_features, std_features], dim=-1)
# 将统计池化的结果通过 feature_extractor 进行特征提取
output_embeddings = self.feature_extractor(statistic_pooling)
# 将特征提取的结果通过 classifier 进行分类器预测得到 logits
logits = self.classifier(output_embeddings)
# 初始化损失为 None
loss = None
# 如果提供了 labels,则计算损失
if labels is not None:
loss = self.objective(logits, labels)
# 如果不要求返回字典形式的输出,则直接返回元组形式的输出
if not return_dict:
output = (logits, output_embeddings) + outputs[_HIDDEN_STATES_START_POSITION:]
return ((loss,) + output) if loss is not None else output
# 如果要求返回字典形式的输出,则构建 XVectorOutput 对象并返回
return XVectorOutput(
loss=loss,
logits=logits,
embeddings=output_embeddings,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
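Statistic pooling concatenates the per-utterance mean and standard deviation over time, doubling the feature dimension before the x-vector projection. A toy sketch of the unmasked branch:
```
import torch

batch, frames, dim = 2, 40, 512
tdnn_out = torch.randn(batch, frames, dim)

mean_features = tdnn_out.mean(dim=1)                       # (batch, dim)
std_features = tdnn_out.std(dim=1)                         # (batch, dim)
statistic_pooling = torch.cat([mean_features, std_features], dim=-1)
print(statistic_pooling.shape)                             # torch.Size([2, 1024])
```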