Transformers Source Code Walkthrough (103)
.\models\sew\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
_import_structure = {"configuration_sew": ["SEW_PRETRAINED_CONFIG_ARCHIVE_MAP", "SEWConfig"]}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_sew"] = [
"SEW_PRETRAINED_MODEL_ARCHIVE_LIST",
"SEWForCTC",
"SEWForSequenceClassification",
"SEWModel",
"SEWPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_sew import SEW_PRETRAINED_CONFIG_ARCHIVE_MAP, SEWConfig
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_sew import (
SEW_PRETRAINED_MODEL_ARCHIVE_LIST,
SEWForCTC,
SEWForSequenceClassification,
SEWModel,
SEWPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
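# Quick illustration (not part of the original file) of what the `_LazyModule` wiring above buys us:
# importing the package is cheap, and a heavy submodule such as `modeling_sew` is only imported the
# first time one of its attributes is accessed. Assumes `transformers` and `torch` are installed.
import importlib
sew = importlib.import_module("transformers.models.sew")
config_cls = sew.SEWConfig  # the first attribute access triggers the real import of `configuration_sew`
print(config_cls.__module__)  # "transformers.models.sew.configuration_sew"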
.\models\sew_d\configuration_sew_d.py
""" SEW-D model configuration"""
import functools
import operator
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
SEW_D_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"asapp/sew-d-tiny-100k": "https://huggingface.co/asapp/sew-d-tiny-100k/resolve/main/config.json",
}
class SEWDConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`SEWDModel`]. It is used to instantiate a SEW-D
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the SEW-D
[asapp/sew-d-tiny-100k](https://huggingface.co/asapp/sew-d-tiny-100k) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Example:
```
>>> from transformers import SEWDConfig, SEWDModel
>>> # Initializing a SEW-D asapp/sew-d-tiny-100k style configuration
>>> configuration = SEWDConfig()
>>> # Initializing a model (with random weights) from the asapp/sew-d-tiny-100k style configuration
>>> model = SEWDModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "sew-d"
def __init__(
self,
vocab_size=32,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
squeeze_factor=2,
max_position_embeddings=512,
position_buckets=256,
share_att_key=True,
relative_attention=True,
pos_att_type=("p2c", "c2p"),
norm_rel_ebd="layer_norm",
hidden_act="gelu_python",
hidden_dropout=0.1,
activation_dropout=0.1,
attention_dropout=0.1,
feat_proj_dropout=0.0,
final_dropout=0.1,
initializer_range=0.02,
layer_norm_eps=1e-7,
feature_layer_norm_eps=1e-5,
feat_extract_norm="group",
feat_extract_activation="gelu",
conv_dim=(64, 128, 128, 128, 128, 256, 256, 256, 256, 512, 512, 512, 512),
conv_stride=(5, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1),
conv_kernel=(10, 3, 1, 3, 1, 3, 1, 3, 1, 2, 1, 2, 1),
conv_bias=False,
num_conv_pos_embeddings=128,
num_conv_pos_embedding_groups=16,
apply_spec_augment=True,
mask_time_prob=0.05,
mask_time_length=10,
mask_time_min_masks=2,
mask_feature_prob=0.0,
mask_feature_length=10,
mask_feature_min_masks=0,
ctc_loss_reduction="mean",
ctc_zero_infinity=False,
use_weighted_layer_sum=False,
classifier_proj_size=256,
pad_token_id=0,
bos_token_id=1,
eos_token_id=2,
**kwargs,
):
@property
def inputs_to_logits_ratio(self):
return functools.reduce(operator.mul, self.conv_stride, 1)
@property
def hidden_dropout(self):
logger.warning_once("hidden_dropout is not used by the model and will be removed as config attribute in v4.35")
return self._hidden_dropout
def to_dict(self):
"""
Serializes this instance to a Python dictionary.
"""
output = super().to_dict()
output["hidden_dropout"] = output.pop("_hidden_dropout")
return output
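# Small sanity check (illustrative, not part of the library code): with the default `conv_stride`
# (5, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1) the feature extractor downsamples the waveform by
# 5 * 2**6 = 320, which is exactly what the `inputs_to_logits_ratio` property computes.
from transformers import SEWDConfig

config = SEWDConfig()
print(config.inputs_to_logits_ratio)  # 320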
.\models\sew_d\convert_sew_d_original_pytorch_checkpoint_to_pytorch.py
"""Convert SEW checkpoint."""
import argparse
import json
import os
import fairseq
import torch
from fairseq.data import Dictionary
from sew_asapp import tasks
from transformers import (
SEWDConfig,
SEWDForCTC,
SEWDModel,
Wav2Vec2CTCTokenizer,
Wav2Vec2FeatureExtractor,
Wav2Vec2Processor,
logging,
)
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
MAPPING = {
"post_extract_proj": "feature_projection",
"encoder.pos_conv.0": "encoder.pos_conv_embed.conv",
"attention.self.query_proj": "encoder.encoder.layer.*.attention.self.query_proj",
"attention.self.key_proj": "encoder.encoder.layer.*.attention.self.key_proj",
"attention.self.value_proj": "encoder.encoder.layer.*.attention.self.value_proj",
"attention.output.dense": "encoder.encoder.layer.*.attention.output.dense",
"attention.output.LayerNorm": "encoder.encoder.layer.*.attention.output.LayerNorm",
"intermediate.dense": "encoder.encoder.layer.*.intermediate.dense",
"output.dense": "encoder.encoder.layer.*.output.dense",
"output.LayerNorm": "encoder.encoder.layer.*.output.LayerNorm",
"encoder.encoder.rel_embeddings": "encoder.encoder.rel_embeddings",
"encoder.encoder.LayerNorm": "encoder.encoder.LayerNorm",
"encoder.upsample.0": "encoder.upsample.projection",
"encoder.layer_norm": "encoder.layer_norm",
"w2v_model.layer_norm": "layer_norm",
"w2v_encoder.proj": "lm_head",
"mask_emb": "masked_spec_embed",
}
def set_recursively(hf_pointer, key, value, full_name, weight_type):
for attribute in key.split("."):
hf_pointer = getattr(hf_pointer, attribute)
if weight_type is not None:
hf_shape = getattr(hf_pointer, weight_type).shape
else:
hf_shape = hf_pointer.shape
assert hf_shape == value.shape, (
f"Shape of hf {key + '.' + weight_type if weight_type is not None else ''} is {hf_shape}, but should be"
f" {value.shape} for {full_name}"
)
if weight_type == "weight":
hf_pointer.weight.data = value
elif weight_type == "weight_g":
hf_pointer.weight_g.data = value
elif weight_type == "weight_v":
hf_pointer.weight_v.data = value
elif weight_type == "bias":
hf_pointer.bias.data = value
else:
hf_pointer.data = value
logger.info(f"{key + '.' + weight_type if weight_type is not None else ''} was initialized from {full_name}.")
def recursively_load_weights(fairseq_model, hf_model, is_finetuned):
unused_weights = []
fairseq_dict = fairseq_model.state_dict()
feature_extractor = hf_model.sew_d.feature_extractor if is_finetuned else hf_model.feature_extractor
for name, value in fairseq_dict.items():
is_used = False
if "conv_layers" in name:
load_conv_layer(
name,
value,
feature_extractor,
unused_weights,
hf_model.config.feat_extract_norm == "group",
)
is_used = True
else:
for key, mapped_key in MAPPING.items():
mapped_key = "sew_d." + mapped_key if (is_finetuned and mapped_key != "lm_head") else mapped_key
if key in name or key.split("w2v_model.")[-1] == name.split(".")[0]:
is_used = True
if "*" in mapped_key:
layer_index = name.split(key)[0].split(".")[-2]
if not layer_index.isnumeric():
continue
mapped_key = mapped_key.replace("*", layer_index)
if "weight_g" in name:
weight_type = "weight_g"
elif "weight_v" in name:
weight_type = "weight_v"
elif "weight" in name:
weight_type = "weight"
elif "bias" in name:
weight_type = "bias"
else:
weight_type = None
set_recursively(hf_model, mapped_key, value, name, weight_type)
continue
if not is_used:
unused_weights.append(name)
logger.warning(f"Unused weights: {unused_weights}")
def load_conv_layer(full_name, value, feature_extractor, unused_weights, use_group_norm):
name = full_name.split("conv_layers.")[-1]
items = name.split(".")
layer_id = int(items[0])
type_id = int(items[1])
if type_id == 0:
if "bias" in name:
assert value.shape == feature_extractor.conv_layers[layer_id].conv.bias.data.shape, (
f"{full_name} has size {value.shape}, but"
f" {feature_extractor.conv_layers[layer_id].conv.bias.data.shape} was found."
)
feature_extractor.conv_layers[layer_id].conv.bias.data = value
logger.info(f"Feat extract conv layer {layer_id} was initialized from {full_name}.")
elif "weight" in name:
assert value.shape == feature_extractor.conv_layers[layer_id].conv.weight.data.shape, (
f"{full_name} has size {value.shape}, but"
f" {feature_extractor.conv_layers[layer_id].conv.weight.data.shape} was found."
)
feature_extractor.conv_layers[layer_id].conv.weight.data = value
logger.info(f"Feat extract conv layer {layer_id} was initialized from {full_name}.")
elif (type_id == 2 and not use_group_norm) or (type_id == 2 and layer_id == 0 and use_group_norm):
if "bias" in name:
assert value.shape == feature_extractor.conv_layers[layer_id].layer_norm.bias.data.shape, (
f"{full_name} has size {value.shape}, but {feature_extractor[layer_id].layer_norm.bias.data.shape} was"
" found."
)
feature_extractor.conv_layers[layer_id].layer_norm.bias.data = value
logger.info(f"Feat extract layer norm weight of layer {layer_id} was initialized from {full_name}.")
elif "weight" in name:
assert value.shape == feature_extractor.conv_layers[layer_id].layer_norm.weight.data.shape, (
f"{full_name} has size {value.shape}, but"
f" {feature_extractor[layer_id].layer_norm.weight.data.shape} was found."
)
feature_extractor.conv_layers[layer_id].layer_norm.weight.data = value
logger.info(f"Feat extract layer norm weight of layer {layer_id} was initialized from {full_name}.")
else:
unused_weights.append(full_name)
def convert_config(model, is_finetuned):
config = SEWDConfig()
if is_finetuned:
fs_config = model.w2v_encoder.w2v_model.cfg
else:
fs_config = model.cfg
config.conv_bias = fs_config.conv_bias
conv_layers = eval(fs_config.conv_feature_layers)
config.conv_dim = [x[0] for x in conv_layers]
config.conv_kernel = [x[1] for x in conv_layers]
config.conv_stride = [x[2] for x in conv_layers]
config.feat_extract_activation = "gelu"
config.feat_extract_norm = "layer" if fs_config.extractor_mode == "layer_norm" else "group"
config.final_dropout = 0.0
config.hidden_act = fs_config.activation_fn.name
config.hidden_size = fs_config.encoder_embed_dim
config.initializer_range = 0.02
config.intermediate_size = fs_config.encoder_ffn_embed_dim
config.layer_norm_eps = 1e-5
config.layerdrop = fs_config.encoder_layerdrop
config.num_attention_heads = fs_config.encoder_attention_heads
config.num_conv_pos_embedding_groups = fs_config.conv_pos_groups
config.num_conv_pos_embeddings = fs_config.conv_pos
config.num_feat_extract_layers = len(conv_layers)
config.num_hidden_layers = fs_config.encoder_layers
config.squeeze_factor = fs_config.squeeze_factor
config.max_position_embeddings = fs_config.max_position_embeddings
config.position_buckets = fs_config.position_buckets
config.share_att_key = fs_config.share_att_key
config.relative_attention = fs_config.relative_attention
config.position_biased_input = fs_config.position_biased_input
config.pos_att_type = tuple(fs_config.pos_att_type.split("|"))
config.norm_rel_ebd = fs_config.norm_rel_ebd
if is_finetuned:
fs_config = model.cfg
config.final_dropout = fs_config.final_dropout
config.layerdrop = fs_config.layerdrop
config.activation_dropout = fs_config.activation_dropout
config.apply_spec_augment = fs_config.mask_prob > 0 or fs_config.mask_channel_prob > 0
config.attention_dropout = fs_config.attention_dropout
config.feat_proj_dropout = fs_config.dropout_input
config.hidden_dropout = fs_config.dropout
config.mask_feature_length = fs_config.mask_channel_length
config.mask_feature_prob = fs_config.mask_channel_prob
config.mask_time_length = fs_config.mask_length
config.mask_time_prob = fs_config.mask_prob
config.feature_extractor_type = "Wav2Vec2FeatureExtractor"
config.tokenizer_class = "Wav2Vec2CTCTokenizer"
return config
@torch.no_grad()
def convert_sew_checkpoint(
checkpoint_path, pytorch_dump_folder_path, config_path=None, dict_path=None, is_finetuned=True
):
"""
Copy/paste/tweak model's weights to transformers design.
"""
if is_finetuned:
model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task(
[checkpoint_path], arg_overrides={"data": "/".join(dict_path.split("/")[:-1])}
)
else:
model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task([checkpoint_path])
if config_path is not None:
config = SEWDConfig.from_pretrained(config_path)
else:
config = convert_config(model[0], is_finetuned)
model = model[0].eval()
return_attention_mask = True if config.feat_extract_norm == "layer" else False
feature_extractor = Wav2Vec2FeatureExtractor(
feature_size=1,
sampling_rate=16000,
padding_value=0,
do_normalize=True,
return_attention_mask=return_attention_mask,
)
if is_finetuned:
if dict_path:
target_dict = Dictionary.load(dict_path)
target_dict.indices[target_dict.bos_word] = target_dict.pad_index
target_dict.indices[target_dict.pad_word] = target_dict.bos_index
config.bos_token_id = target_dict.pad_index
config.pad_token_id = target_dict.bos_index
config.eos_token_id = target_dict.eos_index
config.vocab_size = len(target_dict.symbols)
vocab_path = os.path.join(pytorch_dump_folder_path, "vocab.json")
if not os.path.isdir(pytorch_dump_folder_path):
logger.error("--pytorch_dump_folder_path ({}) should be a directory".format(pytorch_dump_folder_path))
return
os.makedirs(pytorch_dump_folder_path, exist_ok=True)
with open(vocab_path, "w", encoding="utf-8") as vocab_handle:
json.dump(target_dict.indices, vocab_handle)
tokenizer = Wav2Vec2CTCTokenizer(
vocab_path,
unk_token=target_dict.unk_word,
pad_token=target_dict.pad_word,
bos_token=target_dict.bos_word,
eos_token=target_dict.eos_word,
word_delimiter_token="|",
do_lower_case=False,
)
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
processor.save_pretrained(pytorch_dump_folder_path)
hf_model = SEWDForCTC(config)
else:
hf_model = SEWDModel(config)
feature_extractor.save_pretrained(pytorch_dump_folder_path)
recursively_load_weights(model, hf_model, is_finetuned)
hf_model.save_pretrained(pytorch_dump_folder_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.")
parser.add_argument("--checkpoint_path", default=None, type=str, help="Path to fairseq checkpoint")
parser.add_argument("--dict_path", default=None, type=str, help="Path to dict of fine-tuned model")
parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert")
parser.add_argument(
"--is_finetuned", action="store_true", help="Whether the model to convert is a fine-tuned model or not"
)
args = parser.parse_args()
convert_sew_checkpoint(
args.checkpoint_path, args.pytorch_dump_folder_path, args.config_path, args.dict_path, args.is_finetuned
)
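# Example invocation (paths are placeholders; the flags mirror the argparse arguments defined above):
# python convert_sew_d_original_pytorch_checkpoint_to_pytorch.py \
#     --checkpoint_path /path/to/fairseq_checkpoint.pt \
#     --dict_path /path/to/dict.ltr.txt \
#     --pytorch_dump_folder_path ./sew-d-converted \
#     --is_finetuned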
.\models\sew_d\modeling_sew_d.py
""" PyTorch SEW model."""
import math
import warnings
from collections.abc import Sequence
from typing import Optional, Tuple, Union
import numpy as np
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import CrossEntropyLoss, LayerNorm
from ...activations import ACT2FN
from ...integrations.deepspeed import is_deepspeed_zero3_enabled
from ...modeling_outputs import BaseModelOutput, CausalLMOutput, SequenceClassifierOutput
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import softmax_backward_data
from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
from .configuration_sew_d import SEWDConfig
logger = logging.get_logger(__name__)
_HIDDEN_STATES_START_POSITION = 1
_CONFIG_FOR_DOC = "SEWDConfig"
_CHECKPOINT_FOR_DOC = "asapp/sew-d-tiny-100k-ft-ls100h"
_EXPECTED_OUTPUT_SHAPE = [1, 292, 384]
_CTC_EXPECTED_OUTPUT = "'MISTER QUILTER IS THE APOSTIL OF THE MIDDLE CLASSES AND WE ARE GLAD TO WELCOME HIS GOSPEL'"
_CTC_EXPECTED_LOSS = 0.21
_SEQ_CLASS_CHECKPOINT = "anton-l/sew-d-mid-400k-ft-keyword-spotting"
_SEQ_CLASS_EXPECTED_OUTPUT = "'_unknown_'"
_SEQ_CLASS_EXPECTED_LOSS = 3.16
SEW_D_PRETRAINED_MODEL_ARCHIVE_LIST = [
"asapp/sew-d-tiny-100k",
"asapp/sew-d-small-100k",
"asapp/sew-d-mid-100k",
"asapp/sew-d-mid-k127-100k",
"asapp/sew-d-base-100k",
"asapp/sew-d-base-plus-100k",
"asapp/sew-d-mid-400k",
"asapp/sew-d-mid-k127-400k",
"asapp/sew-d-base-plus-400k",
]
def _compute_mask_indices(
shape: Tuple[int, int],
mask_prob: float,
mask_length: int,
attention_mask: Optional[torch.LongTensor] = None,
min_masks: int = 0,
) -> np.ndarray:
"""
Computes random mask spans for a given shape. Used to implement "SpecAugment: A Simple Data
Augmentation Method for ASR". https://arxiv.org/abs/1904.08779
Note that this method is not optimized to run on TPU and should be run on CPU as part of the
preprocessing during training.
"""
batch_size, sequence_length = shape
# raise an error if the mask length is smaller than 1
if mask_length < 1:
raise ValueError("`mask_length` has to be bigger than 0.")
# raise an error if the mask length is larger than the sequence length
if mask_length > sequence_length:
raise ValueError(
f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length}"
f" and `sequence_length`: {sequence_length}`"
)
# epsilon is used for probabilistic rounding
epsilon = np.random.rand(1).item()
def compute_num_masked_span(input_length):
"""Given an input length, compute how many spans should be masked"""
num_masked_span = int(mask_prob * input_length / mask_length + epsilon)
num_masked_span = max(num_masked_span, min_masks)
# make sure the number of masked spans is <= sequence_length
if num_masked_span * mask_length > sequence_length:
num_masked_span = sequence_length // mask_length
# make sure the number of masked spans is <= input_length - (mask_length - 1)
if input_length - (mask_length - 1) < num_masked_span:
num_masked_span = max(input_length - (mask_length - 1), 0)
return num_masked_span
# compute the number of masked spans for every example in the batch
input_lengths = (
attention_mask.sum(-1).detach().tolist()
if attention_mask is not None
else [sequence_length for _ in range(batch_size)]
)
# SpecAugment mask to fill
spec_aug_mask = np.zeros((batch_size, sequence_length), dtype=bool)
spec_aug_mask_idxs = []
# maximum number of masked spans, computed for the full sequence length
max_num_masked_span = compute_num_masked_span(sequence_length)
# if no span can be masked, return the all-False mask directly
if max_num_masked_span == 0:
return spec_aug_mask
for input_length in input_lengths:
# number of masked spans for this example
num_masked_span = compute_num_masked_span(input_length)
# randomly pick the start indices to mask
spec_aug_mask_idx = np.random.choice(
np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False
)
# pick the first sampled index as a dummy index used to pad the vector so that every example in the
# batch has the same number of indices; due to probabilistic rounding the dummy simply gets picked twice
if len(spec_aug_mask_idx) == 0:
# this can only happen if `input_length` is strictly smaller than `sequence_length`, in which case
# the last token has to be a padding token that can be used as the dummy mask id
dummy_mask_idx = sequence_length - 1
else:
dummy_mask_idx = spec_aug_mask_idx[0]
# pad the randomly drawn indices with the dummy index
spec_aug_mask_idx = np.concatenate(
[spec_aug_mask_idx, np.ones(max_num_masked_span - num_masked_span, dtype=np.int32) * dummy_mask_idx]
)
spec_aug_mask_idxs.append(spec_aug_mask_idx)
# convert the list of index vectors into a numpy array
spec_aug_mask_idxs = np.array(spec_aug_mask_idxs)
# expand the start indices into full mask spans
spec_aug_mask_idxs = np.broadcast_to(
spec_aug_mask_idxs[:, :, None], (batch_size, max_num_masked_span, mask_length)
)
spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)
# add offsets to the start indices so that each span covers `mask_length` consecutive positions
offsets = np.arange(mask_length)[None, None, :]
offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
batch_size, max_num_masked_span * mask_length
)
spec_aug_mask_idxs = spec_aug_mask_idxs + offsets
# make sure the indices do not exceed the sequence length
if spec_aug_mask_idxs.max() > sequence_length - 1:
spec_aug_mask_idxs[spec_aug_mask_idxs > sequence_length - 1] = sequence_length - 1
# scatter ones at the selected indices to create the final mask
np.put_along_axis(spec_aug_mask, spec_aug_mask_idxs, 1, -1)
# return the final mask
return spec_aug_mask
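# Illustrative call (not part of the original file): mask a batch of 2 sequences of 100 time steps with
# spans of length 10 and at least 2 spans per example. The result is a boolean numpy array of shape
# (batch_size, sequence_length).
example_mask = _compute_mask_indices(shape=(2, 100), mask_prob=0.05, mask_length=10, min_masks=2)
print(example_mask.shape, example_mask.dtype)  # (2, 100) bool
print(example_mask.sum(axis=-1))  # number of masked time steps per example (at most 20 here)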
# Copied from transformers.models.deberta_v2.modeling_deberta_v2.make_log_bucket_position
def make_log_bucket_position(relative_pos, bucket_size, max_position):
# sign of the relative position
sign = torch.sign(relative_pos)
# middle of the bucket range
mid = bucket_size // 2
# absolute position; positions inside the middle range are clamped to mid - 1
abs_pos = torch.where(
(relative_pos < mid) & (relative_pos > -mid),
torch.tensor(mid - 1).type_as(relative_pos),
torch.abs(relative_pos),
)
# logarithmic position used for positions outside the middle range
log_pos = (
torch.ceil(torch.log(abs_pos / mid) / torch.log(torch.tensor((max_position - 1) / mid)) * (mid - 1)) + mid
)
# keep the exact relative position inside the middle range, otherwise use the signed log position
bucket_pos = torch.where(abs_pos <= mid, relative_pos.type_as(log_pos), log_pos * sign)
return bucket_pos
# Copied from transformers.models.deberta_v2.modeling_deberta_v2.build_relative_position
def build_relative_position(query_size, key_size, bucket_size=-1, max_position=-1, device=None):
"""
Build relative position according to the query and key.
Assume the absolute position of query \\(P_q\\) is in range (0, query_size) and the absolute position of key
\\(P_k\\) is in range (0, key_size). The relative position from query to key is
\\(R_{q \\rightarrow k} = P_q - P_k\\).
Args:
query_size (int): the length of the query
key_size (int): the length of the key
bucket_size (int): the size of the position buckets
max_position (int): the maximum allowed absolute position
device (`torch.device`): the device on which the tensors will be created
Return:
`torch.LongTensor`: a tensor of shape [1, query_size, key_size]
"""
# build the query and key position id sequences
q_ids = torch.arange(0, query_size, device=device)
k_ids = torch.arange(0, key_size, device=device)
# relative position ids: query position minus key position
rel_pos_ids = q_ids[:, None] - k_ids[None, :]
# apply log bucketing if a bucket size and a maximum position are given
if bucket_size > 0 and max_position > 0:
rel_pos_ids = make_log_bucket_position(rel_pos_ids, bucket_size, max_position)
rel_pos_ids = rel_pos_ids.to(torch.long)
# truncate to the query length and add a batch dimension
rel_pos_ids = rel_pos_ids[:query_size, :]
rel_pos_ids = rel_pos_ids.unsqueeze(0)
return rel_pos_ids
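# Quick sanity check (illustrative only): without bucketing, entry (q, k) of the returned tensor is
# simply the difference of the absolute positions, q - k.
rel = build_relative_position(query_size=4, key_size=4)
print(rel.shape)  # torch.Size([1, 4, 4])
print(rel[0, 0])  # tensor([ 0, -1, -2, -3])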
@torch.jit.script
# Copied from transformers.models.deberta.modeling_deberta.c2p_dynamic_expand
def c2p_dynamic_expand(c2p_pos, query_layer, relative_pos):
return c2p_pos.expand([query_layer.size(0), query_layer.size(1), query_layer.size(2), relative_pos.size(-1)])
@torch.jit.script
# Copied from transformers.models.deberta.modeling_deberta.p2c_dynamic_expand
def p2c_dynamic_expand(c2p_pos, query_layer, key_layer):
return c2p_pos.expand([query_layer.size(0), query_layer.size(1), key_layer.size(-2), key_layer.size(-2)])
@torch.jit.script
# Copied from transformers.models.deberta.modeling_deberta.pos_dynamic_expand
def pos_dynamic_expand(pos_index, p2c_att, key_layer):
return pos_index.expand(p2c_att.size()[:2] + (pos_index.size(-2), key_layer.size(-2)))
# Copied from transformers.models.deberta.modeling_deberta.get_mask
def get_mask(input, local_context):
if not isinstance(local_context, DropoutContext):
# if local_context is not a DropoutContext, it is simply the dropout probability and no mask is reused
dropout = local_context
mask = None
# otherwise, read the settings from the DropoutContext
else:
# dropout probability stored in the context
dropout = local_context.dropout
# scale it by the context's scale factor
dropout *= local_context.scale
# reuse the stored mask if the context asks for it, otherwise start from scratch
mask = local_context.mask if local_context.reuse_mask else None
# sample a fresh Bernoulli mask when dropout is active and no mask is available yet
if dropout > 0 and mask is None:
mask = (1 - torch.empty_like(input).bernoulli_(1 - dropout)).to(torch.bool)
# store the mask back into the DropoutContext so that it can be reused later
if isinstance(local_context, DropoutContext):
if local_context.mask is None:
local_context.mask = mask
# return the computed mask and the dropout probability
return mask, dropout
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2NoLayerNormConvLayer with Wav2Vec2->SEWD
class SEWDNoLayerNormConvLayer(nn.Module):
def __init__(self, config, layer_id=0):
super().__init__()
# 如果layer_id大于0,则设置输入卷积维度为config.conv_dim[layer_id - 1],否则设置为1
self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
# 设置输出卷积维度为config.conv_dim[layer_id]
self.out_conv_dim = config.conv_dim[layer_id]
# 创建一个一维卷积层
self.conv = nn.Conv1d(
self.in_conv_dim,
self.out_conv_dim,
kernel_size=config.conv_kernel[layer_id], # 使用config中的卷积核大小
stride=config.conv_stride[layer_id], # 使用config中的步幅大小
bias=config.conv_bias, # 使用config中的偏置
)
# 设置激活函数为ACT2FN[config.feat_extract_activation]
self.activation = ACT2FN[config.feat_extract_activation]
# 定义前向传播函数
def forward(self, hidden_states):
# 将输入hidden_states通过卷积层self.conv
hidden_states = self.conv(hidden_states)
# 将卷积后的hidden_states应用激活函数self.activation
hidden_states = self.activation(hidden_states)
return hidden_states
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2LayerNormConvLayer with Wav2Vec2->SEWD
class SEWDLayerNormConvLayer(nn.Module):
def __init__(self, config, layer_id=0):
super().__init__()
# 如果layer_id大于0,则设置输入卷积维度为config.conv_dim[layer_id - 1],否则设置为1
self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
# 设置输出卷积维度为config.conv_dim[layer_id]
self.out_conv_dim = config.conv_dim[layer_id]
# 创建一个一维卷积层
self.conv = nn.Conv1d(
self.in_conv_dim,
self.out_conv_dim,
kernel_size=config.conv_kernel[layer_id], # 使用config中的卷积核大小
stride=config.conv_stride[layer_id], # 使用config中的步幅大小
bias=config.conv_bias, # 使用config中的偏置
)
# 创建一个LayerNorm层,对输出卷积维度进行归一化
self.layer_norm = nn.LayerNorm(self.out_conv_dim, elementwise_affine=True)
# 设置激活函数为ACT2FN[config.feat_extract_activation]
self.activation = ACT2FN[config.feat_extract_activation]
# 定义前向传播函数
def forward(self, hidden_states):
# 将输入hidden_states通过卷积层self.conv
hidden_states = self.conv(hidden_states)
# 将hidden_states的维度进行转置,将倒数第二维与倒数第一维交换
hidden_states = hidden_states.transpose(-2, -1)
# 将转置后的hidden_states通过LayerNorm层self.layer_norm进行归一化
hidden_states = self.layer_norm(hidden_states)
# 再次将hidden_states的维度进行转置,将倒数第二维与倒数第一维交换回来
hidden_states = hidden_states.transpose(-2, -1)
# 将归一化后的hidden_states应用激活函数self.activation
hidden_states = self.activation(hidden_states)
return hidden_states
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2GroupNormConvLayer with Wav2Vec2->SEWD
class SEWDGroupNormConvLayer(nn.Module):
def __init__(self, config, layer_id=0):
super().__init__()
# 如果layer_id大于0,则设置输入卷积维度为config.conv_dim[layer_id - 1],否则设置为1
self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
# 设置输出卷积维度为config.conv_dim[layer_id]
self.out_conv_dim = config.conv_dim[layer_id]
# 创建一个一维卷积层
self.conv = nn.Conv1d(
self.in_conv_dim,
self.out_conv_dim,
kernel_size=config.conv_kernel[layer_id], # 使用config中的卷积核大小
stride=config.conv_stride[layer_id], # 使用config中的步幅大小
bias=config.conv_bias, # 使用config中的偏置
)
# 设置激活函数为ACT2FN[config.feat_extract_activation]
self.activation = ACT2FN[config.feat_extract_activation]
# 创建一个GroupNorm层,对输出卷积维度进行分组归一化
self.layer_norm = nn.GroupNorm(num_groups=self.out_conv_dim, num_channels=self.out_conv_dim, affine=True)
# 定义前向传播函数
def forward(self, hidden_states):
# 将输入hidden_states通过卷积层self.conv
hidden_states = self.conv(hidden_states)
# 将卷积后的hidden_states通过GroupNorm层self.layer_norm进行归一化
hidden_states = self.layer_norm(hidden_states)
# 将归一化后的hidden_states应用激活函数self.activation
hidden_states = self.activation(hidden_states)
return hidden_states
# Copied from transformers.models.sew.modeling_sew.SEWPositionalConvEmbedding with SEW->SEWD
class SEWDPositionalConvEmbedding(nn.Module):
def __init__(self, config):
super().__init__()
# 定义一个一维卷积层,用于位置编码的卷积
self.conv = nn.Conv1d(
config.hidden_size,
config.hidden_size,
kernel_size=config.num_conv_pos_embeddings,
padding=config.num_conv_pos_embeddings // 2,
groups=config.num_conv_pos_embedding_groups,
stride=config.squeeze_factor,
)
# 如果启用了deepspeed的zero3功能
if is_deepspeed_zero3_enabled():
import deepspeed
# 使用zero3的gathered parameters将权重进行分布式处理
with deepspeed.zero.GatheredParameters(self.conv.weight, modifier_rank=0):
self.conv = nn.utils.weight_norm(self.conv, name="weight", dim=2)
# 注册卷积层的权重变量给deepspeed.zero
deepspeed.zero.register_external_parameter(self, self.conv.weight_v)
deepspeed.zero.register_external_parameter(self, self.conv.weight_g)
else:
# 对卷积层的权重进行权重归一化处理
self.conv = nn.utils.weight_norm(self.conv, name="weight", dim=2)
# 创建一个用于卷积后padding的层
self.padding = SEWDSamePadLayer(config.num_conv_pos_embeddings)
# 激活函数选择,根据配置选择不同的激活函数
self.activation = ACT2FN[config.feat_extract_activation]
def forward(self, hidden_states):
# 进行一维卷积
hidden_states = self.conv(hidden_states)
# 进行padding处理
hidden_states = self.padding(hidden_states)
# 使用选择的激活函数进行激活
hidden_states = self.activation(hidden_states)
return hidden_states
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2SamePadLayer with Wav2Vec2->SEW
class SEWDSamePadLayer(nn.Module):
def __init__(self, num_conv_pos_embeddings):
super().__init__()
# 根据卷积位置编码数目确定是否需要移除的padding数量
self.num_pad_remove = 1 if num_conv_pos_embeddings % 2 == 0 else 0
def forward(self, hidden_states):
# 如果需要移除padding,则进行裁剪
if self.num_pad_remove > 0:
hidden_states = hidden_states[:, :, : -self.num_pad_remove]
return hidden_states
# Copied from transformers.models.sew.modeling_sew.SEWUpsampling with SEW->SEWD
class SEWDUpsampling(nn.Module):
def __init__(self, config):
super().__init__()
# 定义一个线性层,用于上采样投影
self.projection = nn.Linear(config.hidden_size, config.hidden_size * config.squeeze_factor)
# 根据配置选择不同的激活函数
self.activation = ACT2FN[config.feat_extract_activation]
# 保存下采样倍数
self.squeeze_factor = config.squeeze_factor
def forward(self, hidden_states):
# 进行线性投影
hidden_states = self.projection(hidden_states)
# 使用选择的激活函数进行激活
hidden_states = self.activation(hidden_states)
# 如果下采样因子大于1
if self.squeeze_factor > 1:
# 将嵌入通道转换为序列长度
bsz, src_len, src_embed_dim = hidden_states.size()
tgt_len = src_len * self.squeeze_factor
tgt_embed_dim = src_embed_dim // self.squeeze_factor
hidden_states = hidden_states.reshape(bsz, src_len, self.squeeze_factor, tgt_embed_dim)
hidden_states = hidden_states.reshape(bsz, tgt_len, tgt_embed_dim)
return hidden_states
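# Illustrative shape trace (assuming squeeze_factor=2 and hidden_size=4): after the projection the
# channel dimension holds `squeeze_factor` groups of `hidden_size` features, and the two reshapes trade
# those groups for a sequence that is `squeeze_factor` times longer.
projected = torch.randn(1, 5, 8)  # (batch, src_len, hidden_size * squeeze_factor)
upsampled = projected.reshape(1, 5, 2, 4).reshape(1, 10, 4)
print(upsampled.shape)  # torch.Size([1, 10, 4]) -> (batch, src_len * squeeze_factor, hidden_size)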
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureEncoder with Wav2Vec2->SEWD
class SEWDFeatureEncoder(nn.Module):
"""Construct the features from raw audio waveform"""
def __init__(self, config):
super().__init__()
# 根据配置选择特征提取的归一化方式
if config.feat_extract_norm == "group":
# 如果是group归一化,则创建一系列卷积层
conv_layers = [SEWDGroupNormConvLayer(config, layer_id=0)] + [
SEWDNoLayerNormConvLayer(config, layer_id=i + 1) for i in range(config.num_feat_extract_layers - 1)
]
elif config.feat_extract_norm == "layer":
# 如果是layer归一化,则创建一系列卷积层
conv_layers = [SEWDLayerNormConvLayer(config, layer_id=i) for i in range(config.num_feat_extract_layers)]
else:
# 若配置不匹配则抛出异常
raise ValueError(
f"`config.feat_extract_norm` is {config.feat_extract_norm}, but has to be one of ['group', 'layer']"
)
# 将卷积层列表转换为ModuleList
self.conv_layers = nn.ModuleList(conv_layers)
self.gradient_checkpointing = False
self._requires_grad = True
def _freeze_parameters(self):
# 冻结所有参数,使其不需要梯度更新
for param in self.parameters():
param.requires_grad = False
self._requires_grad = False
def forward(self, input_values):
# 将输入值添加一个维度,用于处理
hidden_states = input_values[:, None]
# 如果需要梯度并且正在训练,确保hidden_states需要梯度
if self._requires_grad and self.training:
hidden_states.requires_grad = True
# 遍历所有卷积层进行前向传播
for conv_layer in self.conv_layers:
if self._requires_grad and self.gradient_checkpointing and self.training:
# 如果开启了梯度检查点功能,使用梯度检查点函数进行前向传播
hidden_states = self._gradient_checkpointing_func(
conv_layer.__call__,
hidden_states,
)
else:
# 否则直接通过卷积层进行前向传播
hidden_states = conv_layer(hidden_states)
# 返回处理后的隐藏状态
return hidden_states
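# Hedged shape check (illustrative, assuming the default SEWDConfig from the library): one second of
# 16 kHz audio per example leaves the convolutional stack as roughly 49 frames with conv_dim[-1] = 512
# channels.
feature_encoder = SEWDFeatureEncoder(SEWDConfig())
features = feature_encoder(torch.randn(2, 16000))
print(features.shape)  # torch.Size([2, 512, 49])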
class SEWDFeatureExtractor(SEWDFeatureEncoder):
def __init__(self, config):
super().__init__(config)
# warn that this class is deprecated and will be removed in a future version; use the base class instead
warnings.warn(
f"The class `{self.__class__.__name__}` has been deprecated "
"and will be removed in Transformers v5. "
f"Use `{self.__class__.__bases__[0].__name__}` instead.",
FutureWarning,
)
# 从transformers.models.deberta.modeling_deberta.ContextPooler复制而来
class ContextPooler(nn.Module):
def __init__(self, config):
super().__init__()
# 创建线性层和稳定的dropout层
self.dense = nn.Linear(config.pooler_hidden_size, config.pooler_hidden_size)
self.dropout = StableDropout(config.pooler_dropout)
self.config = config
def forward(self, hidden_states):
# 通过简单地获取第一个token的隐藏状态来“池化”模型
context_token = hidden_states[:, 0]
context_token = self.dropout(context_token)
pooled_output = self.dense(context_token)
pooled_output = ACT2FN[self.config.pooler_hidden_act](pooled_output)
# 返回池化后的输出
return pooled_output
@property
def output_dim(self):
# 返回输出维度大小,与隐藏大小相同
return self.config.hidden_size
# 从transformers.models.deberta.modeling_deberta.XSoftmax复制而来
class XSoftmax(torch.autograd.Function):
"""
Masked Softmax which is optimized for saving memory
Args:
input (`torch.tensor`): The input tensor that will apply softmax.
mask (`torch.IntTensor`):
The mask matrix where 0 indicate that element will be ignored in the softmax calculation.
dim (int): The dimension that will apply softmax
Example:
```
>>> import torch
>>> from transformers.models.deberta_v2.modeling_deberta_v2 import XSoftmax
>>>
>>> x = torch.randn([4, 20, 100])
>>>
>>> mask = (x > 0).int()
>>>
>>> dim = -1
>>> y = XSoftmax.apply(x, mask, dim)
```"""
@staticmethod
def forward(self, input, mask, dim):
# remember the softmax dimension on the autograd context
self.dim = dim
# build the inverse mask: True marks positions that should be ignored
rmask = ~(mask.to(torch.bool))
# fill the masked positions with the most negative representable value
output = input.masked_fill(rmask, torch.tensor(torch.finfo(input.dtype).min))
# apply softmax along the requested dimension
output = torch.softmax(output, self.dim)
# zero out the masked positions again
output.masked_fill_(rmask, 0)
# save the output for the backward pass
self.save_for_backward(output)
return output
@staticmethod
def backward(self, grad_output):
# retrieve the saved softmax output
(output,) = self.saved_tensors
# compute the input gradient with the dedicated softmax backward helper
inputGrad = softmax_backward_data(self, grad_output, output, self.dim, output)
return inputGrad, None, None
@staticmethod
def symbolic(g, self, mask, dim):
import torch.onnx.symbolic_helper as sym_help
from torch.onnx.symbolic_opset9 import masked_fill, softmax
# cast the mask to int64
mask_cast_value = g.op("Cast", mask, to_i=sym_help.cast_pytorch_to_onnx["Long"])
# build the inverse mask with ONNX operators
r_mask = g.op(
"Cast",
g.op("Sub", g.op("Constant", value_t=torch.tensor(1, dtype=torch.int64)), mask_cast_value),
to_i=sym_help.cast_pytorch_to_onnx["Bool"],
)
# fill the masked positions of the input with the minimum value, using ONNX operators
output = masked_fill(
g, self, r_mask, g.op("Constant", value_t=torch.tensor(torch.finfo(self.type().dtype()).min))
)
# apply softmax along the requested dimension, then zero out the masked positions again
output = softmax(g, output, dim)
return masked_fill(g, output, r_mask, g.op("Constant", value_t=torch.tensor(0, dtype=torch.bool)))
# Copied from transformers.models.deberta.modeling_deberta.DropoutContext
class DropoutContext(object):
def __init__(self):
self.dropout = 0
self.mask = None
self.scale = 1
self.reuse_mask = True
# Copied from transformers.models.deberta.modeling_deberta.XDropout
class XDropout(torch.autograd.Function):
"""Optimized dropout function to save computation and memory by using mask operation instead of multiplication."""
@staticmethod
def forward(ctx, input, local_ctx):
# get the dropout mask and the dropout probability
mask, dropout = get_mask(input, local_ctx)
# compute the rescaling factor
ctx.scale = 1.0 / (1 - dropout)
if dropout > 0:
# save the mask for the backward pass
ctx.save_for_backward(mask)
# zero out the masked positions and rescale
return input.masked_fill(mask, 0) * ctx.scale
else:
return input
# static method computing the gradient in the backward pass
@staticmethod
def backward(ctx, grad_output):
# only rescale if dropout was actually applied (scale > 1)
if ctx.scale > 1:
# retrieve the mask saved in the forward pass
(mask,) = ctx.saved_tensors
# zero out the gradient at the masked positions, rescale and return
return grad_output.masked_fill(mask, 0) * ctx.scale, None
else:
# otherwise pass the gradient through unchanged
return grad_output, None
# static method used to emit the Dropout node in the symbolic ONNX graph
@staticmethod
def symbolic(g: torch._C.Graph, input: torch._C.Value, local_ctx: Union[float, DropoutContext]) -> torch._C.Value:
from torch.onnx import symbolic_opset12
# if local_ctx is a DropoutContext, read the dropout probability from it
dropout_p = local_ctx
if isinstance(local_ctx, DropoutContext):
dropout_p = local_ctx.dropout
# StableDropout only calls this function when training, so train is hard-coded during export
train = True
# TODO: we should check opset_version > 12 here, but there is currently no good way to do so.
# As things stand, export fails with a CheckerError if opset_version < 12.
# Once https://github.com/pytorch/pytorch/issues/78391 is resolved, this could become:
# if opset_version < 12:
#     return torch.onnx.symbolic_opset9.dropout(g, input, dropout_p, train)
# emit the dropout node using opset 12
return symbolic_opset12.dropout(g, input, dropout_p, train)
# Copied from transformers.models.deberta.modeling_deberta.StableDropout
class StableDropout(nn.Module):
"""
Optimized dropout module for stabilizing the training
Args:
drop_prob (float): the dropout probabilities
"""
def __init__(self, drop_prob):
super().__init__()
# 初始化稳定的dropout模块
self.drop_prob = drop_prob # 设置dropout概率
self.count = 0 # 上下文堆栈计数
self.context_stack = None # 上下文堆栈初始化为空
def forward(self, x):
"""
Call the module
Args:
x (`torch.tensor`): The input tensor to apply dropout
"""
if self.training and self.drop_prob > 0:
return XDropout.apply(x, self.get_context()) # 如果处于训练状态且dropout概率大于0,则应用自定义的dropout操作
return x # 否则直接返回输入
def clear_context(self):
# 清空上下文堆栈
self.count = 0
self.context_stack = None
def init_context(self, reuse_mask=True, scale=1):
if self.context_stack is None:
self.context_stack = [] # 如果上下文堆栈为空,则初始化为空列表
self.count = 0 # 计数器归零
for c in self.context_stack:
c.reuse_mask = reuse_mask # 设置重用掩码标志
c.scale = scale # 设置比例
def get_context(self):
if self.context_stack is not None:
if self.count >= len(self.context_stack):
self.context_stack.append(DropoutContext()) # 如果计数超过堆栈长度,则添加新的dropout上下文
ctx = self.context_stack[self.count] # 获取当前计数对应的dropout上下文
ctx.dropout = self.drop_prob # 设置dropout概率
self.count += 1 # 计数器加一
return ctx # 返回dropout上下文
else:
return self.drop_prob # 如果上下文堆栈为空,则返回dropout概率本身
# Copied from transformers.models.deberta.modeling_deberta.DebertaSelfOutput with DebertaV2->SEWD, DebertaLayerNorm->LayerNorm, hidden_dropout_prob->activation_dropout
class SEWDSelfOutput(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size) # 使用线性层变换隐藏状态
self.LayerNorm = LayerNorm(config.hidden_size, config.layer_norm_eps) # 应用LayerNorm进行归一化
self.dropout = StableDropout(config.activation_dropout) # 使用稳定的dropout模块
def forward(self, hidden_states, input_tensor):
hidden_states = self.dense(hidden_states) # 线性变换隐藏状态
hidden_states = self.dropout(hidden_states) # 应用稳定的dropout
hidden_states = self.LayerNorm(hidden_states + input_tensor) # 使用LayerNorm对变换后的隐藏状态进行归一化
return hidden_states # 返回处理后的隐藏状态
# Copied from transformers.models.deberta_v2.modeling_deberta_v2.DisentangledSelfAttention with attention_probs_dropout_prob->attention_dropout, hidden_dropout_prob->activation_dropout
class DisentangledSelfAttention(nn.Module):
"""
Disentangled self-attention module
Parameters:
config (`DebertaV2Config`):
A model config class instance with the configuration to build a new model. The schema is similar to
*BertConfig*, for more details, please refer [`DebertaV2Config`]
"""
# 定义类和其中的初始化方法,包含Transformer注意力机制相关参数和组件
def __init__(self, config):
# 调用基类初始化方法,默认调用具有模型特定特征的方法
super().__init__()
# 验证隐藏维度是否是 attention head 的倍数,否则会抛出错误
if config.hidden_size % config.num_attention_heads != 0:
raise ValueError(
f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
f"heads ({config.num_attention_heads})"
)
# 初始化注意力头数量
self.num_attention_heads = config.num_attention_heads
_attention_head_size = config.hidden_size // config.num_attention_heads # 默认计算每个头的大小
# 根据配置中self.attention_head_size的设置进行可能的调整
self.attention_head_size = getattr(config, "attention_head_size", _attention_head_size)
# 计算头数乘以每个头的大小,用于计算总头大小
self.all_head_size = self.num_attention_heads * self.attention_head_size
# 创建线性投影层以将输入映射到所需的输出维度
self.query_proj = nn.Linear(config.hidden_size, self.all_head_size, bias=True)
self.key_proj = nn.Linear(config.hidden_size, self.all_head_size, bias=True)
self.value_proj = nn.Linear(config.hidden_size, self.all_head_size, bias=True)
# 检查是否共享注意力键
self.share_att_key = getattr(config, "share_att_key", False)
# 设置注意力类型的参数列表
self.pos_att_type = config.pos_att_type if config.pos_att_type is not None else []
# 检查是否使用相对注意力机制
self.relative_attention = getattr(config, "relative_attention", False)
# 使用相对注意力时,将 position_buckets 和 max_relative_positions 等参数的默认值设定
if self.relative_attention:
self.position_buckets = getattr(config, "position_buckets", -1)
self.max_relative_positions = getattr(config, "max_relative_positions", -1)
# 设置 max_relative_positions 初始值为 max_position_embeddings,除非使用 position_buckets 或者其小于 1
if self.max_relative_positions < 1:
self.max_relative_positions = config.max_position_embeddings
# 计算实际的相对位置嵌入大小
self.pos_ebd_size = self.max_relative_positions
# 如果 position_buckets 参数已配置,调整 pos_ebd_size 大小
if self.position_buckets > 0:
self.pos_ebd_size = self.position_buckets
# 初始化位置 dropout 层
self.pos_dropout = StableDropout(config.activation_dropout)
# 如果不共享attention键,则创建额外的线性投影层用于处理位置相关的输入
if "c2p" in self.pos_att_type:
self.pos_key_proj = nn.Linear(config.hidden_size, self.all_head_size, bias=True)
if "p2c" in self.pos_att_type:
self.pos_query_proj = nn.Linear(config.hidden_size, self.all_head_size)
# 初始化模型的下垂dropout层
self.dropout = StableDropout(config.attention_dropout)
# 随后定义了 batch 处理数据的内部步骤 x 转换函数
def transpose_for_scores(self, x, attention_heads):
# 获取数据和头数维度形状
new_x_shape = x.size()[:-1] + (attention_heads, -1)
# 重塑数据的形状以准备在循环过程中使用
x = x.view(new_x_shape)
# 转置以将数据按注意力头划分
return x.permute(0, 2, 1, 3).contiguous().view(-1, x.size(1), x.size(-1))
# 随后定义了前向传播方法,处理输入数据
def forward(
self,
hidden_states,
attention_mask,
output_attentions=False,
query_states=None,
relative_pos=None,
rel_embeddings=None,
# Copied from transformers.models.deberta.modeling_deberta.DebertaAttention with Deberta->SEWD
class SEWDAttention(nn.Module):
def __init__(self, config):
super().__init__()
# 初始化自注意力层,使用SEWD版本的DisentangledSelfAttention
self.self = DisentangledSelfAttention(config)
# 初始化自注意力层输出层,使用SEWD版本的SEWDSelfOutput
self.output = SEWDSelfOutput(config)
self.config = config
def forward(
self,
hidden_states,
attention_mask,
output_attentions=False,
query_states=None,
relative_pos=None,
rel_embeddings=None,
):
# 执行自注意力计算,调用SEWD版本的DisentangledSelfAttention模型
self_output = self.self(
hidden_states,
attention_mask,
output_attentions,
query_states=query_states,
relative_pos=relative_pos,
rel_embeddings=rel_embeddings,
)
if output_attentions:
self_output, att_matrix = self_output
if query_states is None:
query_states = hidden_states
# 执行自注意力输出层计算,调用SEWD版本的SEWDSelfOutput模型
attention_output = self.output(self_output, query_states)
if output_attentions:
return (attention_output, att_matrix) # 返回注意力输出和注意力矩阵(如果有的话)
else:
return attention_output # 返回注意力输出结果
# Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->SEWD
class SEWDIntermediate(nn.Module):
def __init__(self, config):
super().__init__()
# 使用线性层将隐藏状态的大小转换为中间状态大小
self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
# 根据配置选择激活函数
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
# 使用线性层进行转换
hidden_states = self.dense(hidden_states)
# 应用中间激活函数
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states # 返回转换后的中间状态
# Copied from transformers.models.deberta.modeling_deberta.DebertaOutput with DebertaLayerNorm->LayerNorm, hidden_dropout_prob->activation_dropout
class SEWDOutput(nn.Module):
def __init__(self, config):
super().__init__()
# 使用线性层将中间状态大小转换为隐藏状态大小
self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
# 使用SEWD版本的LayerNorm,初始化LayerNorm层
self.LayerNorm = LayerNorm(config.hidden_size, config.layer_norm_eps)
# 使用SEWD版本的StableDropout,初始化稳定Dropout层
self.dropout = StableDropout(config.activation_dropout)
self.config = config
def forward(self, hidden_states, input_tensor):
# 使用线性层进行转换
hidden_states = self.dense(hidden_states)
# 应用稳定Dropout
hidden_states = self.dropout(hidden_states)
# 应用LayerNorm
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states # 返回处理后的隐藏状态
# Copied from transformers.models.deberta.modeling_deberta.DebertaLayer with Deberta->SEWD
class SEWDLayer(nn.Module):
def __init__(self, config):
super().__init__()
# 初始化SEWD版本的注意力层、中间层和输出层
self.attention = SEWDAttention(config)
self.intermediate = SEWDIntermediate(config)
self.output = SEWDOutput(config)
# 定义神经网络模型中的前向传播函数,用于计算每个层的输出
def forward(
self,
hidden_states, # 输入的隐藏状态,通常是模型中前一层的输出
attention_mask, # 注意力掩码,指定哪些位置需要进行注意力计算
query_states=None, # 查询状态,用于多头注意力机制中的查询
relative_pos=None, # 相对位置编码,用于自注意力机制中的位置编码
rel_embeddings=None, # 相对位置嵌入,用于计算相对位置偏移
output_attentions=False, # 是否输出注意力矩阵
):
# 调用注意力层计算注意力输出
attention_output = self.attention(
hidden_states, # 输入的隐藏状态
attention_mask, # 注意力掩码
output_attentions=output_attentions, # 是否输出注意力矩阵的标志
query_states=query_states, # 查询状态
relative_pos=relative_pos, # 相对位置编码
rel_embeddings=rel_embeddings, # 相对位置嵌入
)
# 如果需要输出注意力矩阵,则解包注意力输出
if output_attentions:
attention_output, att_matrix = attention_output
# 将注意力输出传入中间层进行处理
intermediate_output = self.intermediate(attention_output)
# 将中间层的输出传入输出层,生成最终层的输出
layer_output = self.output(intermediate_output, attention_output)
# 如果需要输出注意力矩阵,则返回输出层的输出和注意力矩阵
if output_attentions:
return (layer_output, att_matrix)
else:
# 否则,仅返回输出层的输出
return layer_output
# Copied from transformers.models.deberta_v2.modeling_deberta_v2.ConvLayer
# 定义一个名为 ConvLayer 的类,继承自 nn.Module
class ConvLayer(nn.Module):
# 初始化方法,接受一个 config 对象作为参数
def __init__(self, config):
# 调用父类的初始化方法
super().__init__()
# 从 config 中获取卷积核大小,默认为 3
kernel_size = getattr(config, "conv_kernel_size", 3)
# 从 config 中获取卷积的分组数,默认为 1
groups = getattr(config, "conv_groups", 1)
# 从 config 中获取卷积激活函数,默认为 "tanh"
self.conv_act = getattr(config, "conv_act", "tanh")
# 创建一个 1 维卷积层,输入和输出通道数都为 config.hidden_size,卷积核大小为 kernel_size
# padding 设置为 (kernel_size - 1) // 2 保证卷积后维度不变
# groups 参数控制分组卷积
self.conv = nn.Conv1d(
config.hidden_size, config.hidden_size, kernel_size, padding=(kernel_size - 1) // 2, groups=groups
)
# 创建一个 LayerNorm 层,输入维度为 config.hidden_size,eps 参数为 config.layer_norm_eps
self.LayerNorm = LayerNorm(config.hidden_size, config.layer_norm_eps)
# 创建一个稳定 Dropout 层,dropout 概率为 config.hidden_dropout_prob
self.dropout = StableDropout(config.hidden_dropout_prob)
# 将 config 对象保存到当前对象的 config 属性中
self.config = config
# 前向传播方法,接受 hidden_states、residual_states 和 input_mask 作为输入
def forward(self, hidden_states, residual_states, input_mask):
# 对 hidden_states 进行维度变换,将第二维和第三维交换,然后做卷积操作
out = self.conv(hidden_states.permute(0, 2, 1).contiguous()).permute(0, 2, 1).contiguous()
# 创建一个逻辑张量 rmask,标识 input_mask 为 0 的位置
rmask = (1 - input_mask).bool()
# 将 out 张量中 rmask 为 True 的位置置为 0
out.masked_fill_(rmask.unsqueeze(-1).expand(out.size()), 0)
# 对 out 张量应用指定的激活函数 ACT2FN[self.conv_act],然后加上 dropout 处理
out = ACT2FN[self.conv_act](self.dropout(out))
# 计算 layer_norm_input,即 residual_states 和 out 的和
layer_norm_input = residual_states + out
# 对 layer_norm_input 应用 LayerNorm 层,然后赋值给 output
output = self.LayerNorm(layer_norm_input).to(layer_norm_input)
# 如果 input_mask 为 None,则直接将 output 赋值给 output_states
if input_mask is None:
output_states = output
else:
# 如果 input_mask 的维度与 layer_norm_input 的维度不同,进行维度调整
if input_mask.dim() != layer_norm_input.dim():
if input_mask.dim() == 4:
input_mask = input_mask.squeeze(1).squeeze(1)
input_mask = input_mask.unsqueeze(2)
# 将 input_mask 转换为与 output 相同的数据类型,并与 output 相乘,得到 output_states
input_mask = input_mask.to(output.dtype)
output_states = output * input_mask
# 返回 output_states
return output_states
# Copied from transformers.models.deberta_v2.modeling_deberta_v2.DebertaV2Encoder with DebertaV2->SEWD
# 定义一个名为 SEWDTransformerEncoder 的类,继承自 nn.Module
class SEWDTransformerEncoder(nn.Module):
"""Modified BertEncoder with relative position bias support"""
# 初始化方法,接受一个 config 对象作为参数
def __init__(self, config):
# 调用父类的初始化方法
super().__init__()
# 创建一个包含多个 SEWDLayer 的 ModuleList,层数为 config.num_hidden_layers
self.layer = nn.ModuleList([SEWDLayer(config) for _ in range(config.num_hidden_layers)])
# 从 config 中获取是否支持相对位置偏置的标志,默认为 False
self.relative_attention = getattr(config, "relative_attention", False)
# 如果支持相对位置偏置
if self.relative_attention:
# 从 config 中获取最大相对位置的范围,默认为 -1
self.max_relative_positions = getattr(config, "max_relative_positions", -1)
# 如果最大相对位置小于 1,则设置为 config.max_position_embeddings
if self.max_relative_positions < 1:
self.max_relative_positions = config.max_position_embeddings
# 从 config 中获取位置桶的数量,默认为 -1
self.position_buckets = getattr(config, "position_buckets", -1)
# 计算位置嵌入的尺寸
pos_ebd_size = self.max_relative_positions * 2
# 如果指定了位置桶的数量,则重新计算位置嵌入的尺寸
if self.position_buckets > 0:
pos_ebd_size = self.position_buckets * 2
# 创建一个 nn.Embedding 层用于存储相对位置嵌入
self.rel_embeddings = nn.Embedding(pos_ebd_size, config.hidden_size)
# 从 config 中获取并解析 norm_rel_ebd 字符串,设置是否使用 LayerNorm 进行相对位置嵌入的归一化
self.norm_rel_ebd = [x.strip() for x in getattr(config, "norm_rel_ebd", "none").lower().split("|")]
# 如果设置了 "layer_norm",则创建一个 LayerNorm 层,用于相对位置嵌入的归一化
if "layer_norm" in self.norm_rel_ebd:
self.LayerNorm = LayerNorm(config.hidden_size, config.layer_norm_eps, elementwise_affine=True)
# 如果 config 中指定了卷积核大小大于 0,则创建一个 ConvLayer
self.conv = ConvLayer(config) if getattr(config, "conv_kernel_size", 0) > 0 else None
# 默认关闭梯度检查点
self.gradient_checkpointing = False
# Return the relative position embeddings if relative attention is enabled, otherwise None
def get_rel_embedding(self):
rel_embeddings = self.rel_embeddings.weight if self.relative_attention else None
# optionally layer-normalize the relative position embeddings
if rel_embeddings is not None and ("layer_norm" in self.norm_rel_ebd):
rel_embeddings = self.LayerNorm(rel_embeddings)
return rel_embeddings
# Expand the attention mask to 4D, depending on its number of dimensions
def get_attention_mask(self, attention_mask):
if attention_mask.dim() <= 2:
# expand a 2D padding mask to (batch, 1, 1, seq_len) ...
extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
# ... and combine it with its transpose to obtain a (batch, 1, seq_len, seq_len) mask
attention_mask = extended_attention_mask * extended_attention_mask.squeeze(-2).unsqueeze(-1)
elif attention_mask.dim() == 3:
# a 3D mask only needs an extra head dimension
attention_mask = attention_mask.unsqueeze(1)
return attention_mask
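# Illustrative shape check (not in the original source): a 2D padding mask of shape (batch, seq_len)
# becomes a (batch, 1, seq_len, seq_len) mask in which position (i, j) is attendable only when both
# token i and token j are real (non-padding) tokens.
mask_2d = torch.tensor([[1, 1, 1, 0]])
extended = mask_2d.unsqueeze(1).unsqueeze(2)  # (1, 1, 1, 4)
mask_4d = extended * extended.squeeze(-2).unsqueeze(-1)  # (1, 1, 4, 4)
print(mask_4d.shape)  # torch.Size([1, 1, 4, 4])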
# Build the relative positions if relative attention is enabled and none were provided
def get_rel_pos(self, hidden_states, query_states=None, relative_pos=None):
if self.relative_attention and relative_pos is None:
# derive the query length from `query_states` if given, otherwise from `hidden_states`
q = query_states.size(-2) if query_states is not None else hidden_states.size(-2)
relative_pos = build_relative_position(
q,
hidden_states.size(-2),
bucket_size=self.position_buckets,
max_position=self.max_relative_positions,
device=hidden_states.device,
)
return relative_pos
# 前向传播函数,接收输入的隐藏状态和注意力遮罩等参数,并返回模型的输出
def forward(
self,
hidden_states,
attention_mask,
output_hidden_states=True,
output_attentions=False,
query_states=None,
relative_pos=None,
return_dict=True,
):
# 如果注意力掩码的维度小于等于2,直接使用作为输入掩码
if attention_mask.dim() <= 2:
input_mask = attention_mask
else:
# 否则,将注意力掩码在倒数第二个维度上求和,并检查大于0的部分作为输入掩码
input_mask = attention_mask.sum(-2) > 0
# 获取注意力掩码,根据模型定义的方法
attention_mask = self.get_attention_mask(attention_mask)
# 获取相对位置编码,用于当前层的注意力计算
relative_pos = self.get_rel_pos(hidden_states, query_states, relative_pos)
# 初始化用于存储所有隐藏状态和注意力权重的变量,根据输出设置决定是否需要存储
all_hidden_states = () if output_hidden_states else None
all_attentions = () if output_attentions else None
# 如果隐藏状态是一个序列,取第一个作为下一步的键值对
if isinstance(hidden_states, Sequence):
next_kv = hidden_states[0]
else:
next_kv = hidden_states
# 获取相对位置编码矩阵
rel_embeddings = self.get_rel_embedding()
# 初始化输出状态为当前的键值对
output_states = next_kv
# 遍历每一层的神经网络模块
for i, layer_module in enumerate(self.layer):
# 如果需要输出隐藏状态,则将当前状态加入到所有隐藏状态中
if output_hidden_states:
all_hidden_states = all_hidden_states + (output_states,)
# 如果开启了梯度检查点且正在训练阶段,使用梯度检查点函数计算当前层输出状态
if self.gradient_checkpointing and self.training:
output_states = self._gradient_checkpointing_func(
layer_module.__call__,
next_kv,
attention_mask,
query_states,
relative_pos,
rel_embeddings,
output_attentions,
)
else:
# 否则,正常调用当前层的前向传播函数
output_states = layer_module(
next_kv,
attention_mask,
query_states=query_states,
relative_pos=relative_pos,
rel_embeddings=rel_embeddings,
output_attentions=output_attentions,
)
# 如果需要输出注意力权重,从输出状态中提取注意力权重
if output_attentions:
output_states, att_m = output_states
# 如果是第一层且存在卷积模块,将当前隐藏状态与输入掩码传递给卷积模块
if i == 0 and self.conv is not None:
output_states = self.conv(hidden_states, output_states, input_mask)
# 如果有查询状态,更新为当前输出状态,并更新下一步的键值对
if query_states is not None:
query_states = output_states
if isinstance(hidden_states, Sequence):
next_kv = hidden_states[i + 1] if i + 1 < len(self.layer) else None
else:
next_kv = output_states
# 如果需要输出注意力权重,将当前层计算得到的注意力权重加入到所有注意力中
if output_attentions:
all_attentions = all_attentions + (att_m,)
# 如果需要输出隐藏状态,将最后一层的输出状态加入到所有隐藏状态中
if output_hidden_states:
all_hidden_states = all_hidden_states + (output_states,)
# 如果不需要以字典形式返回结果,则返回元组,过滤掉值为None的项
if not return_dict:
return tuple(v for v in [output_states, all_hidden_states, all_attentions] if v is not None)
# 否则,以BaseModelOutput形式返回结果,包括最后隐藏状态、所有隐藏状态和所有注意力权重
return BaseModelOutput(
last_hidden_state=output_states, hidden_states=all_hidden_states, attentions=all_attentions
)
# 定义 SEWDEncoder 类,继承自 nn.Module,用于实现一个自定义的编码器模型
class SEWDEncoder(nn.Module):
# 初始化方法,接受一个 config 参数
def __init__(self, config):
super().__init__()
self.config = config
# 初始化位置卷积嵌入层对象
self.pos_conv_embed = SEWDPositionalConvEmbedding(config)
# 初始化一维平均池化层
self.pool = nn.AvgPool1d(config.squeeze_factor, config.squeeze_factor)
# 初始化 SEWDTransformerEncoder 编码器
self.encoder = SEWDTransformerEncoder(config)
# 初始化 SEWDUpsampling 上采样层
self.upsample = SEWDUpsampling(config)
# 梯度检查点设置为 False
self.gradient_checkpointing = False
# 前向传播方法,接受多个参数
def forward(
self,
hidden_states: torch.tensor,
attention_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
):
# 计算最大编码器长度
max_encoder_length = hidden_states.shape[1] // self.config.squeeze_factor
# 如果没有给定注意力掩码,则创建一个全为 1 的张量作为默认注意力掩码
if attention_mask is None:
attention_mask = torch.ones(
(hidden_states.shape[0], max_encoder_length), dtype=torch.long, device=hidden_states.device
)
else:
# 将注意力掩码为 False 的位置对应的隐藏状态设为 0
hidden_states[~attention_mask.bool()] = 0.0
# 计算输入长度并应用池化公式以获取真实的输出长度
input_lengths = (attention_mask.long()).sum(-1)
output_lengths = input_lengths // self.config.squeeze_factor
# 生成注意力掩码,限制注意力范围在有效输出长度内
attention_ids = (
torch.arange(0, max_encoder_length, device=output_lengths.device)
.view(1, -1)
.expand(output_lengths.shape[0], -1)
)
attention_mask = (attention_ids < output_lengths.view(-1, 1)).long()
# 记录输入时间步数
n_input_timesteps = hidden_states.shape[1]
# 将隐藏状态维度转置,以适应位置嵌入计算
hidden_states = hidden_states.transpose(1, 2)
# 计算位置嵌入
position_embeddings = self.pos_conv_embed(hidden_states)
# 对隐藏状态进行池化操作
pooled_hidden_states = self.pool(hidden_states)
# 选择较小的长度作为最终的隐藏状态长度
min_length = min(position_embeddings.size(-1), pooled_hidden_states.size(-1))
# 将池化后的隐藏状态和位置嵌入相加得到最终的隐藏状态表示
hidden_states = pooled_hidden_states[..., :min_length] + position_embeddings[..., :min_length]
# 将隐藏状态维度再次转置为输出形状
hidden_states = hidden_states.transpose(1, 2)
# 将最终隐藏状态传入编码器进行编码,获取编码器输出
encoder_outputs = self.encoder(hidden_states, attention_mask, output_hidden_states, output_attentions)
# 对编码器输出进行上采样操作
hidden_states = self.upsample(encoder_outputs.last_hidden_state)
# 如果上采样后的长度小于输入长度,则进行填充操作
if hidden_states.shape[1] < n_input_timesteps:
hidden_states = nn.functional.pad(hidden_states, (0, 0, 0, n_input_timesteps - hidden_states.shape[1]))
# 如果 return_dict 为 False,则返回非空的元组
if not return_dict:
return tuple(
v for v in [hidden_states, encoder_outputs.hidden_states, encoder_outputs.attentions] if v is not None
)
# 返回 BaseModelOutput 对象,包含最终的隐藏状态、编码器的隐藏状态和注意力权重
return BaseModelOutput(
last_hidden_state=hidden_states,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
# SEWDPreTrainedModel is an abstract class, derived from PreTrainedModel, that handles weight initialization and provides a simple interface for downloading and loading pretrained models
class SEWDPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
# 定义 SEWDConfig 类作为配置类
config_class = SEWDConfig
# 设置基础模型前缀为 "sew-d"
base_model_prefix = "sew-d"
# 设置主输入名称为 "input_values"
main_input_name = "input_values"
# 支持梯度检查点
supports_gradient_checkpointing = True
def _init_weights(self, module):
"""初始化权重"""
# 如果模块是 SEWDPositionalConvEmbedding 的实例
if isinstance(module, SEWDPositionalConvEmbedding):
# 初始化卷积层的权重为正态分布
nn.init.normal_(
module.conv.weight,
mean=0,
std=2 * math.sqrt(1 / (module.conv.kernel_size[0] * module.conv.in_channels)),
)
# 初始化卷积层的偏置为常数0
nn.init.constant_(module.conv.bias, 0)
elif isinstance(module, nn.Linear):
# 对线性层的权重进行初始化,使用正态分布,标准差为配置中的初始化范围
# 这里与 TensorFlow 版本略有不同,后者使用截断正态分布进行初始化
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)):
# 对层归一化和分组归一化的偏置初始化为零
module.bias.data.zero_()
# 对层归一化和分组归一化的权重初始化为1
module.weight.data.fill_(1.0)
elif isinstance(module, nn.Conv1d):
# 如果启用了 DeepSpeed Zero3
if is_deepspeed_zero3_enabled():
import deepspeed
# 如果模块有权重分布,使用 GatheredParameters 进行初始化
if hasattr(module, "weight_v") and hasattr(module, "weight_g"):
with deepspeed.zero.GatheredParameters([module.weight_v, module.weight_g], modifier_rank=0):
# 使用 Kaiming 正态分布初始化权重
nn.init.kaiming_normal_(module.weight.data)
else:
with deepspeed.zero.GatheredParameters(module.weight, modifier_rank=0):
# 使用 Kaiming 正态分布初始化权重
nn.init.kaiming_normal_(module.weight.data)
else:
# 使用 Kaiming 正态分布初始化权重
nn.init.kaiming_normal_(module.weight.data)
elif isinstance(module, nn.Embedding):
# 使用正态分布初始化嵌入层的权重,标准差为配置中的初始化范围
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
# 如果设置了填充索引,将对应索引的权重初始化为零
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
# 如果模块是线性层或卷积层且有偏置,则将偏置初始化为零
if isinstance(module, (nn.Linear, nn.Conv1d)) and module.bias is not None:
module.bias.data.zero_()
def _get_feat_extract_output_lengths(self, input_lengths: Union[torch.LongTensor, int]):
"""
计算卷积层的输出长度
"""
def _conv_out_length(input_length, kernel_size, stride):
# 从 https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html 取得的一维卷积层输出长度公式
return torch.div(input_length - kernel_size, stride, rounding_mode="floor") + 1
# 遍历配置中的卷积核大小和步长,计算每一层卷积的输出长度
for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride):
input_lengths = _conv_out_length(input_lengths, kernel_size, stride)
# 返回最终的输入长度
return input_lengths
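# Illustrative sketch (not part of the library): applying the formula above with the
# default SEW-D conv_kernel / conv_stride values gives the number of feature frames
# produced for a raw-audio input, e.g. one second of 16 kHz audio.
import torch

conv_kernel = (10, 3, 1, 3, 1, 3, 1, 3, 1, 2, 1, 2, 1)
conv_stride = (5, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1)

def conv_out_length(length, kernel, stride):
    return torch.div(length - kernel, stride, rounding_mode="floor") + 1

length = torch.tensor(16000)  # 1 s of audio at 16 kHz
for k, s in zip(conv_kernel, conv_stride):
    length = conv_out_length(length, k, s)
print(length)  # tensor(49): roughly one feature frame every 20 ms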
def _get_feature_vector_attention_mask(self, feature_vector_length: int, attention_mask: torch.LongTensor):
# compute the number of valid output frames for every sample from the input attention mask
output_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1)).to(torch.long)
batch_size = attention_mask.shape[0]
# start from an all-zero mask of shape (batch_size, feature_vector_length) on the same dtype/device
attention_mask = torch.zeros(
(batch_size, feature_vector_length), dtype=attention_mask.dtype, device=attention_mask.device
)
# set a 1 at the last valid frame of every sample ...
attention_mask[(torch.arange(attention_mask.shape[0], device=attention_mask.device), output_lengths - 1)] = 1
# ... then flip, cumsum and flip again so that every position up to and including it becomes True
attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).bool()
return attention_mask
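# Illustrative sketch: the flip-cumsum-flip trick above turns "a single 1 at the last valid
# frame" into "1s everywhere up to and including that frame".
import torch

output_lengths = torch.tensor([3, 5])
mask = torch.zeros(2, 6, dtype=torch.long)
mask[(torch.arange(2), output_lengths - 1)] = 1
mask = mask.flip([-1]).cumsum(-1).flip([-1]).bool()
print(mask.long())
# tensor([[1, 1, 1, 0, 0, 0],
#         [1, 1, 1, 1, 1, 0]])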
@add_start_docstrings(
"The bare SEW-D Model transformer outputting raw hidden-states without any specific head on top.",
SEWD_START_DOCSTRING,
)
# Copied from transformers.models.sew.modeling_sew.SEWModel with SEW->SEWD, layer_norm_eps->feature_layer_norm_eps
class SEWDModel(SEWDPreTrainedModel):
def __init__(self, config: SEWDConfig):
super().__init__(config)
self.config = config
# convolutional feature encoder followed by a layer norm over the last conv dimension
self.feature_extractor = SEWDFeatureEncoder(config)
self.layer_norm = nn.LayerNorm(config.conv_dim[-1], eps=config.feature_layer_norm_eps)
# project the extracted features to hidden_size only when the dimensions differ
self.project_features = config.conv_dim[-1] != config.hidden_size
if self.project_features:
self.feature_projection = nn.Linear(config.conv_dim[-1], config.hidden_size)
self.feature_dropout = nn.Dropout(config.feat_proj_dropout)
# learnable embedding used to replace masked frames when SpecAugment is enabled
if config.mask_time_prob > 0.0 or config.mask_feature_prob > 0.0:
self.masked_spec_embed = nn.Parameter(torch.FloatTensor(config.hidden_size).uniform_())
self.encoder = SEWDEncoder(config)
# initialize weights and apply final processing
self.post_init()
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2Model._mask_hidden_states
def _mask_hidden_states(
self,
hidden_states: torch.FloatTensor,
mask_time_indices: Optional[torch.FloatTensor] = None,
attention_mask: Optional[torch.LongTensor] = None,
):
"""
Masks extracted features along time axis and/or along feature axis according to
[SpecAugment](https://arxiv.org/abs/1904.08779).
"""
# `config.apply_spec_augment` can set masking to False
if not getattr(self.config, "apply_spec_augment", True):
return hidden_states
# generate indices & apply SpecAugment along time axis
batch_size, sequence_length, hidden_size = hidden_states.size()
if mask_time_indices is not None:
# apply SpecAugment along time axis with given mask_time_indices
hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
elif self.config.mask_time_prob > 0 and self.training:
# compute mask indices if not provided and apply SpecAugment along time axis
mask_time_indices = _compute_mask_indices(
(batch_size, sequence_length),
mask_prob=self.config.mask_time_prob,
mask_length=self.config.mask_time_length,
attention_mask=attention_mask,
min_masks=self.config.mask_time_min_masks,
)
mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool)
hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
if self.config.mask_feature_prob > 0 and self.training:
# generate indices & apply SpecAugment along feature axis
mask_feature_indices = _compute_mask_indices(
(batch_size, hidden_size),
mask_prob=self.config.mask_feature_prob,
mask_length=self.config.mask_feature_length,
min_masks=self.config.mask_feature_min_masks,
)
mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool)
mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1)
hidden_states[mask_feature_indices] = 0
return hidden_states
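# Illustrative sketch (toy stand-in, not the library's _compute_mask_indices): SpecAugment-style
# time masking as used above -- boolean indices select frames whose hidden states are replaced
# by a single learned `masked_spec_embed` vector.
import torch

batch_size, seq_len, hidden_size = 2, 10, 4
hidden_states = torch.randn(batch_size, seq_len, hidden_size)
masked_spec_embed = torch.zeros(hidden_size)  # stands in for the learned parameter

# pick one random span of length 3 per example
mask = torch.zeros(batch_size, seq_len, dtype=torch.bool)
starts = torch.randint(0, seq_len - 3, (batch_size,))
for b, s in enumerate(starts):
    mask[b, s : s + 3] = True

hidden_states[mask] = masked_spec_embed  # broadcast over the masked positions
print(mask.sum(dim=1))  # 3 masked frames per example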
@add_start_docstrings_to_model_forward(SEWD_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutput,
config_class=_CONFIG_FOR_DOC,
modality="audio",
expected_output=_EXPECTED_OUTPUT_SHAPE,
)
def forward(
self,
input_values: Optional[torch.Tensor],
attention_mask: Optional[torch.Tensor] = None,
mask_time_indices: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutput]:
# fall back to the config defaults when the output options are not given explicitly
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# extract the convolutional features, normalize them and (optionally) project them to hidden_size
extract_features = self.feature_extractor(input_values)
extract_features = extract_features.transpose(1, 2)
extract_features = self.layer_norm(extract_features)
if self.project_features:
extract_features = self.feature_projection(extract_features)
hidden_states = self.feature_dropout(extract_features)
# reduce the attention mask to the feature frame rate
if attention_mask is not None:
attention_mask = self._get_feature_vector_attention_mask(hidden_states.shape[1], attention_mask)
# apply SpecAugment masking to the hidden states
hidden_states = self._mask_hidden_states(hidden_states, mask_time_indices=mask_time_indices)
encoder_outputs = self.encoder(
hidden_states,
attention_mask=attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
hidden_states = encoder_outputs[0]
if not return_dict:
return (hidden_states,) + encoder_outputs[1:]
# return a BaseModelOutput with the last hidden state, all hidden states and attentions
return BaseModelOutput(
last_hidden_state=hidden_states,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
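# Illustrative usage sketch (assuming the `asapp/sew-d-tiny-100k` checkpoint ships a feature
# extractor config; adjust the name to whatever SEW-D checkpoint you actually use): running
# raw 16 kHz audio through SEWDModel and inspecting the output shape.
import torch
from transformers import AutoFeatureExtractor, SEWDModel

feature_extractor = AutoFeatureExtractor.from_pretrained("asapp/sew-d-tiny-100k")
model = SEWDModel.from_pretrained("asapp/sew-d-tiny-100k")

waveform = torch.randn(16000)  # 1 s of fake audio at 16 kHz
inputs = feature_extractor(waveform.numpy(), sampling_rate=16000, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)
print(outputs.last_hidden_state.shape)  # (1, num_frames, hidden_size)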
@add_start_docstrings(
"""SEW-D Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).""",
SEWD_START_DOCSTRING,
)
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForCTC with Wav2Vec2->SEWD, wav2vec2->sew_d, WAV_2_VEC_2->SEWD
class SEWDForCTC(SEWDPreTrainedModel):
def __init__(self, config, target_lang: Optional[str] = None):
super().__init__(config)
# SEW-D backbone followed by dropout and a linear CTC head
self.sew_d = SEWDModel(config)
self.dropout = nn.Dropout(config.final_dropout)
self.target_lang = target_lang
# the CTC head needs a vocabulary size to be defined on the config
if config.vocab_size is None:
raise ValueError(
f"You are trying to instantiate {self.__class__} with a configuration that "
"does not define the vocabulary size of the language model head. Please "
"instantiate the model as follows: `SEWDForCTC.from_pretrained(..., vocab_size=vocab_size)`. "
"or define `vocab_size` of your model's configuration."
)
# the head operates on the adapter output size when an adapter is configured, otherwise on hidden_size
output_hidden_size = (
config.output_hidden_size if hasattr(config, "add_adapter") and config.add_adapter else config.hidden_size
)
self.lm_head = nn.Linear(output_hidden_size, config.vocab_size)
# Initialize weights and apply final processing
self.post_init()
def tie_weights(self):
"""
This method overwrites [`~PreTrainedModel.tie_weights`] so that adapter weights can be correctly loaded when
passing `target_lang=...` to `from_pretrained(...)`.
This method is **not** supposed to be called by the user and is prone to be changed in the future.
"""
# Note that `tie_weights` is usually used to tie input and output embedding weights. It is re-purposed here
# so that adapter weights can be loaded for SEWD without introducing a new API to `PreTrainedModel`:
# - if `target_lang` is set but `config.adapter_attn_dim` is not defined, a ValueError is raised;
# - if `target_lang` is None while `config.adapter_attn_dim` is defined, only an info message is logged;
# - otherwise the requested adapter layers are force-loaded.
target_lang = self.target_lang
if target_lang is not None and getattr(self.config, "adapter_attn_dim", None) is None:
raise ValueError(f"Cannot pass `target_lang`: {target_lang} if `config.adapter_attn_dim` is not defined.")
elif target_lang is None and getattr(self.config, "adapter_attn_dim", None) is not None:
logger.info("By default `target_lang` is set to 'eng'.")
elif target_lang is not None:
self.load_adapter(target_lang, force_load=True)
def freeze_feature_extractor(self):
"""
Calling this function will disable the gradient computation for the feature encoder so that its parameters
will not be updated during training. Deprecated in favor of `freeze_feature_encoder`.
"""
warnings.warn(
"The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
"Please use the equivalent `freeze_feature_encoder` method instead.",
FutureWarning,
)
self.freeze_feature_encoder()
def freeze_feature_encoder(self):
"""
Calling this function will disable the gradient computation for the feature encoder so that its parameters
will not be updated during training.
"""
self.sew_d.feature_extractor._freeze_parameters()
def freeze_base_model(self):
"""
Calling this function will disable the gradient computation for the base model so that its parameters will
not be updated during training. Only the classification head will be updated.
"""
for param in self.sew_d.parameters():
param.requires_grad = False
@add_start_docstrings_to_model_forward(SEWD_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=CausalLMOutput,
config_class=_CONFIG_FOR_DOC,
expected_output=_CTC_EXPECTED_OUTPUT,
expected_loss=_CTC_EXPECTED_LOSS,
)
# forward pass of the CTC model
def forward(
self,
input_values: Optional[torch.Tensor],
attention_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: Optional[torch.Tensor] = None,
) -> Union[Tuple, CausalLMOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to
the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`.
All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
config.vocab_size - 1]`.
"""
# Determine if return_dict is explicitly provided; otherwise, use the default from model config
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Perform the forward pass through the model's sequence to sequence decoder
outputs = self.sew_d(
input_values,
attention_mask=attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# Extract hidden states from the model outputs and apply dropout
hidden_states = outputs[0]
hidden_states = self.dropout(hidden_states)
# Compute logits from the language model head
logits = self.lm_head(hidden_states)
# Initialize loss as None
loss = None
# Calculate loss only if labels are provided
if labels is not None:
# Check if any label value exceeds the vocabulary size, which would be invalid
if labels.max() >= self.config.vocab_size:
raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}")
# Retrieve input lengths from attention_mask, defaulting to all ones if mask is None
attention_mask = (
attention_mask if attention_mask is not None else torch.ones_like(input_values, dtype=torch.long)
)
input_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1)).to(torch.long)
# Determine target lengths and flatten the targets tensor
labels_mask = labels >= 0
target_lengths = labels_mask.sum(-1)
flattened_targets = labels.masked_select(labels_mask)
# Compute log probabilities using log_softmax for the logits
log_probs = nn.functional.log_softmax(logits, dim=-1, dtype=torch.float32).transpose(0, 1)
# Disable cudnn for this section due to compatibility issues with fp16
with torch.backends.cudnn.flags(enabled=False):
# Compute the connectionist temporal classification (CTC) loss
loss = nn.functional.ctc_loss(
log_probs,
flattened_targets,
input_lengths,
target_lengths,
blank=self.config.pad_token_id,
reduction=self.config.ctc_loss_reduction,
zero_infinity=self.config.ctc_zero_infinity,
)
# If return_dict is False, return output tuple without loss
if not return_dict:
output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
return ((loss,) + output) if loss is not None else output
# If return_dict is True, return CausalLMOutput object with all relevant outputs
return CausalLMOutput(
loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions
)
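# Illustrative sketch of the CTC loss computation above, outside the model: log-probs over
# the vocabulary are laid out as (time, batch, vocab), labels padded with -100 are dropped,
# and the blank token is the pad token id.
import torch
import torch.nn as nn

batch_size, time_steps, vocab_size, pad_token_id = 2, 50, 32, 0
logits = torch.randn(batch_size, time_steps, vocab_size)
labels = torch.tensor([[5, 6, 7, -100], [3, 4, -100, -100]])  # -100 marks label padding

input_lengths = torch.full((batch_size,), time_steps, dtype=torch.long)
labels_mask = labels >= 0
target_lengths = labels_mask.sum(-1)
flattened_targets = labels.masked_select(labels_mask)

log_probs = nn.functional.log_softmax(logits, dim=-1, dtype=torch.float32).transpose(0, 1)
loss = nn.functional.ctc_loss(
    log_probs, flattened_targets, input_lengths, target_lengths,
    blank=pad_token_id, reduction="mean", zero_infinity=False,
)
print(loss)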
@add_start_docstrings(
"""
SEWD Model with a sequence classification head on top (a linear layer over the pooled output) for tasks like SUPERB
Keyword Spotting.
""",
SEWD_START_DOCSTRING,
)
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForSequenceClassification with Wav2Vec2->SEWD, wav2vec2->sew_d, WAV_2_VEC_2->SEWD
class SEWDForSequenceClassification(SEWDPreTrainedModel):
def __init__(self, config):
super().__init__(config)
if hasattr(config, "add_adapter") and config.add_adapter:
raise ValueError(
"Sequence classification does not support the use of SEWD adapters (config.add_adapter=True)"
)
self.sew_d = SEWDModel(config)
num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings
if config.use_weighted_layer_sum:
self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers)
self.projector = nn.Linear(config.hidden_size, config.classifier_proj_size)
self.classifier = nn.Linear(config.classifier_proj_size, config.num_labels)
# Initialize weights and apply final processing
self.post_init()
def freeze_feature_extractor(self):
"""
Calling this function will disable the gradient computation for the feature encoder so that its parameters will
not be updated during training.
"""
warnings.warn(
"The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
"Please use the equivalent `freeze_feature_encoder` method instead.",
FutureWarning,
)
self.freeze_feature_encoder()
def freeze_feature_encoder(self):
"""
Calling this function will disable the gradient computation for the feature encoder so that its parameter will
not be updated during training.
"""
self.sew_d.feature_extractor._freeze_parameters()
def freeze_base_model(self):
"""
Calling this function will disable the gradient computation for the base model so that its parameters will not
be updated during training. Only the classification head will be updated.
"""
for param in self.sew_d.parameters():
param.requires_grad = False
@add_start_docstrings_to_model_forward(SEWD_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_SEQ_CLASS_CHECKPOINT,
output_type=SequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
modality="audio",
expected_output=_SEQ_CLASS_EXPECTED_OUTPUT,
expected_loss=_SEQ_CLASS_EXPECTED_LOSS,
)
def forward(
self,
input_values: Optional[torch.Tensor],
attention_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: Optional[torch.Tensor] = None,
) -> Union[Tuple, SequenceClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# the weighted layer sum needs all hidden states, so force them on in that case
output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states
outputs = self.sew_d(
input_values,
attention_mask=attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
if self.config.use_weighted_layer_sum:
# stack all layer outputs and combine them with softmax-normalized learned weights
hidden_states = outputs[_HIDDEN_STATES_START_POSITION]
hidden_states = torch.stack(hidden_states, dim=1)
norm_weights = nn.functional.softmax(self.layer_weights, dim=-1)
hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1)
else:
hidden_states = outputs[0]
hidden_states = self.projector(hidden_states)
# mean-pool over time; with an attention mask, average only over the valid frames
if attention_mask is None:
pooled_output = hidden_states.mean(dim=1)
else:
padding_mask = self._get_feature_vector_attention_mask(hidden_states.shape[1], attention_mask)
hidden_states[~padding_mask] = 0.0
pooled_output = hidden_states.sum(dim=1) / padding_mask.sum(dim=1).view(-1, 1)
logits = self.classifier(pooled_output)
loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))
if not return_dict:
output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
return ((loss,) + output) if loss is not None else output
# return a SequenceClassifierOutput with loss, logits, hidden states and attentions
return SequenceClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
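# Illustrative sketch of the two pooling steps above: a softmax-weighted sum over the per-layer
# hidden states, followed by a masked mean pool over the time axis.
import torch
import torch.nn.functional as F

num_layers, batch, time, hidden = 13, 2, 6, 8
layer_outputs = torch.randn(num_layers, batch, time, hidden).unbind(0)
layer_weights = torch.ones(num_layers) / num_layers  # learned in the real model

stacked = torch.stack(layer_outputs, dim=1)                          # (batch, layers, time, hidden)
norm_weights = F.softmax(layer_weights, dim=-1)
hidden_states = (stacked * norm_weights.view(-1, 1, 1)).sum(dim=1)   # (batch, time, hidden)

padding_mask = torch.tensor([[1, 1, 1, 1, 0, 0], [1, 1, 1, 1, 1, 1]]).bool()
hidden_states[~padding_mask] = 0.0
pooled = hidden_states.sum(dim=1) / padding_mask.sum(dim=1).view(-1, 1)
print(pooled.shape)  # torch.Size([2, 8])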
.\models\sew_d\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
_import_structure = {"configuration_sew_d": ["SEW_D_PRETRAINED_CONFIG_ARCHIVE_MAP", "SEWDConfig"]}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_sew_d"] = [
"SEW_D_PRETRAINED_MODEL_ARCHIVE_LIST",
"SEWDForCTC",
"SEWDForSequenceClassification",
"SEWDModel",
"SEWDPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_sew_d import SEW_D_PRETRAINED_CONFIG_ARCHIVE_MAP, SEWDConfig
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_sew_d import (
SEW_D_PRETRAINED_MODEL_ARCHIVE_LIST,
SEWDForCTC,
SEWDForSequenceClassification,
SEWDModel,
SEWDPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\siglip\configuration_siglip.py
""" Siglip model configuration"""
import os
from typing import Union
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
SIGLIP_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"google/siglip-base-patch16-224": "https://huggingface.co/google/siglip-base-patch16-224/resolve/main/config.json",
}
class SiglipTextConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`SiglipTextModel`]. It is used to instantiate a
Siglip text encoder according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the text encoder of the Siglip
[google/siglip-base-patch16-224](https://huggingface.co/google/siglip-base-patch16-224) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
"""
model_type = "siglip_text_model"
def __init__(
self,
vocab_size=32000,
hidden_size=768,
intermediate_size=3072,
num_hidden_layers=12,
num_attention_heads=12,
max_position_embeddings=64,
hidden_act="gelu_pytorch_tanh",
layer_norm_eps=1e-6,
attention_dropout=0.0,
pad_token_id=1,
bos_token_id=49406,
eos_token_id=49407,
**kwargs,
):
super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.max_position_embeddings = max_position_embeddings
self.layer_norm_eps = layer_norm_eps
self.hidden_act = hidden_act
self.attention_dropout = attention_dropout
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
cls._set_token_in_kwargs(kwargs)
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
if config_dict.get("model_type") == "siglip":
config_dict = config_dict["text_config"]
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
logger.warning(
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
)
return cls.from_dict(config_dict, **kwargs)
class SiglipVisionConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`SiglipVisionModel`]. It is used to instantiate a
Siglip vision encoder according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the vision encoder of the Siglip
[google/siglip-base-patch16-224](https://huggingface.co/google/siglip-base-patch16-224) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
hidden_size (`int`, *optional*, defaults to 768):
Dimensionality of the encoder layers and the pooler layer.
intermediate_size (`int`, *optional*, defaults to 3072):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
num_hidden_layers (`int`, *optional*, defaults to 12):
Number of hidden layers in the Transformer encoder.
num_attention_heads (`int`, *optional*, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
num_channels (`int`, *optional*, defaults to 3):
Number of channels in the input images.
image_size (`int`, *optional*, defaults to 224):
The size (resolution) of each image.
patch_size (`int`, *optional*, defaults to 16):
The size (resolution) of each patch.
hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
layer_norm_eps (`float`, *optional*, defaults to 1e-06):
The epsilon used by the layer normalization layers.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
Example:
```
>>> from transformers import SiglipVisionConfig, SiglipVisionModel
>>> # Initializing a SiglipVisionConfig with google/siglip-base-patch16-224 style configuration
>>> configuration = SiglipVisionConfig()
>>> # Initializing a SiglipVisionModel (with random weights) from the google/siglip-base-patch16-224 style configuration
>>> model = SiglipVisionModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "siglip_vision_model"
def __init__(
self,
hidden_size=768,
intermediate_size=3072,
num_hidden_layers=12,
num_attention_heads=12,
num_channels=3,
image_size=224,
patch_size=16,
hidden_act="gelu_pytorch_tanh",
layer_norm_eps=1e-6,
attention_dropout=0.0,
**kwargs,
):
super().__init__(**kwargs)
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.num_channels = num_channels
self.patch_size = patch_size
self.image_size = image_size
self.attention_dropout = attention_dropout
self.layer_norm_eps = layer_norm_eps
self.hidden_act = hidden_act
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
cls._set_token_in_kwargs(kwargs)
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
if config_dict.get("model_type") == "siglip":
config_dict = config_dict["vision_config"]
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
logger.warning(
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
)
return cls.from_dict(config_dict, **kwargs)
class SiglipConfig(PretrainedConfig):
r"""
[`SiglipConfig`] is the configuration class to store the configuration of a [`SiglipModel`]. It is used to
instantiate a Siglip model according to the specified arguments, defining the text model and vision model
configs. Instantiating a configuration with the defaults will yield a configuration similar to that of the Siglip
[google/siglip-base-patch16-224](https://huggingface.co/google/siglip-base-patch16-224) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
text_config (`dict`, *optional*):
Dictionary of configuration options used to initialize [`SiglipTextConfig`].
vision_config (`dict`, *optional*):
Dictionary of configuration options used to initialize [`SiglipVisionConfig`].
kwargs (*optional*):
Dictionary of keyword arguments.
Example:
```
>>> from transformers import SiglipConfig, SiglipModel
>>> # Initializing a SiglipConfig with google/siglip-base-patch16-224 style configuration
>>> configuration = SiglipConfig()
>>> # Initializing a SiglipModel (with random weights) from the google/siglip-base-patch16-224 style configuration
>>> model = SiglipModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
>>> # We can also initialize a SiglipConfig from a SiglipTextConfig and a SiglipVisionConfig
>>> from transformers import SiglipTextConfig, SiglipVisionConfig
>>> # Initializing SiglipText and SiglipVision configurations
>>> config_text = SiglipTextConfig()
>>> config_vision = SiglipVisionConfig()
>>> config = SiglipConfig.from_text_vision_configs(config_text, config_vision)
```"""
model_type = "siglip"
def __init__(self, text_config=None, vision_config=None, **kwargs):
super().__init__(**kwargs)
if text_config is None:
text_config = {}
logger.info("`text_config` is `None`. Initializing the `SiglipTextConfig` with default values.")
if vision_config is None:
vision_config = {}
logger.info("`vision_config` is `None`. initializing the `SiglipVisionConfig` with default values.")
self.text_config = SiglipTextConfig(**text_config)
self.vision_config = SiglipVisionConfig(**vision_config)
self.initializer_factor = 1.0
@classmethod
def from_text_vision_configs(cls, text_config: SiglipTextConfig, vision_config: SiglipVisionConfig, **kwargs):
r"""
Instantiate a [`SiglipConfig`] (or a derived class) from siglip text model configuration and siglip vision
model configuration.
Returns:
[`SiglipConfig`]: An instance of a configuration object
"""
return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs)
.\models\siglip\convert_siglip_to_hf.py
def get_siglip_config(model_name):
config = SiglipConfig()
vocab_size = 250000 if "i18n" in model_name else 32000
image_size = model_name_to_image_size[model_name]
patch_size = 16 if "patch16" in model_name else 14
config.vision_config.image_size = image_size
config.vision_config.patch_size = patch_size
config.text_config.vocab_size = vocab_size
if "base" in model_name:
pass
elif "large" in model_name:
config.text_config.hidden_size = 1024
config.text_config.intermediate_size = 4096
config.text_config.num_hidden_layers = 24
config.text_config.num_attention_heads = 16
config.vision_config.hidden_size = 1024
config.vision_config.intermediate_size = 4096
config.vision_config.num_hidden_layers = 24
config.vision_config.num_attention_heads = 16
elif "so400m" in model_name:
config.text_config.hidden_size = 1152
config.text_config.intermediate_size = 4304
config.text_config.num_hidden_layers = 27
config.text_config.num_attention_heads = 16
config.vision_config.hidden_size = 1152
config.vision_config.intermediate_size = 4304
config.vision_config.num_hidden_layers = 27
config.vision_config.num_attention_heads = 16
else:
raise ValueError("Model not supported")
return config
def create_rename_keys(config):
rename_keys = []
rename_keys.append(("params/img/embedding/kernel", "vision_model.embeddings.patch_embedding.weight"))
rename_keys.append(("params/img/embedding/bias", "vision_model.embeddings.patch_embedding.bias"))
rename_keys.append(("params/img/pos_embedding", "vision_model.embeddings.position_embedding.weight"))
for i in range(config.vision_config.num_hidden_layers):
rename_keys.append((f"params/img/Transformer/encoderblock_{i}/LayerNorm_0/scale", f"vision_model.encoder.layers.{i}.layer_norm1.weight"))
rename_keys.append((f"params/img/Transformer/encoderblock_{i}/LayerNorm_0/bias", f"vision_model.encoder.layers.{i}.layer_norm1.bias"))
rename_keys.append((f"params/img/Transformer/encoderblock_{i}/LayerNorm_1/scale", f"vision_model.encoder.layers.{i}.layer_norm2.weight"))
rename_keys.append((f"params/img/Transformer/encoderblock_{i}/LayerNorm_1/bias", f"vision_model.encoder.layers.{i}.layer_norm2.bias"))
rename_keys.append((f"params/img/Transformer/encoderblock_{i}/MlpBlock_0/Dense_0/kernel", f"vision_model.encoder.layers.{i}.mlp.fc1.weight"))
rename_keys.append((f"params/img/Transformer/encoderblock_{i}/MlpBlock_0/Dense_0/bias", f"vision_model.encoder.layers.{i}.mlp.fc1.bias"))
rename_keys.append((f"params/img/Transformer/encoderblock_{i}/MlpBlock_0/Dense_1/kernel", f"vision_model.encoder.layers.{i}.mlp.fc2.weight"))
rename_keys.append((f"params/img/Transformer/encoderblock_{i}/MlpBlock_0/Dense_1/bias", f"vision_model.encoder.layers.{i}.mlp.fc2.bias"))
rename_keys.append((f"params/img/Transformer/encoderblock_{i}/MultiHeadDotProductAttention_0/key/kernel", f"vision_model.encoder.layers.{i}.self_attn.k_proj.weight"))
rename_keys.append((f"params/img/Transformer/encoderblock_{i}/MultiHeadDotProductAttention_0/key/bias", f"vision_model.encoder.layers.{i}.self_attn.k_proj.bias"))
rename_keys.append((f"params/img/Transformer/encoderblock_{i}/MultiHeadDotProductAttention_0/value/kernel", f"vision_model.encoder.layers.{i}.self_attn.v_proj.weight"))
rename_keys.append((f"params/img/Transformer/encoderblock_{i}/MultiHeadDotProductAttention_0/value/bias", f"vision_model.encoder.layers.{i}.self_attn.v_proj.bias"))
rename_keys.append((f"params/img/Transformer/encoderblock_{i}/MultiHeadDotProductAttention_0/query/kernel", f"vision_model.encoder.layers.{i}.self_attn.q_proj.weight"))
rename_keys.append((f"params/img/Transformer/encoderblock_{i}/MultiHeadDotProductAttention_0/query/bias", f"vision_model.encoder.layers.{i}.self_attn.q_proj.bias"))
rename_keys.append((f"params/img/Transformer/encoderblock_{i}/MultiHeadDotProductAttention_0/out/kernel", f"vision_model.encoder.layers.{i}.self_attn.out_proj.weight"))
rename_keys.append((f"params/img/Transformer/encoderblock_{i}/MultiHeadDotProductAttention_0/out/bias", f"vision_model.encoder.layers.{i}.self_attn.out_proj.bias"))
rename_keys.append(("params/img/Transformer/encoder_norm/scale", "vision_model.post_layernorm.weight"))
rename_keys.append(("params/img/Transformer/encoder_norm/bias", "vision_model.post_layernorm.bias"))
rename_keys.append(("params/img/MAPHead_0/probe", "vision_model.head.probe"))
rename_keys.append(("params/img/MAPHead_0/LayerNorm_0/scale", "vision_model.head.layernorm.weight"))
rename_keys.append(("params/img/MAPHead_0/LayerNorm_0/bias", "vision_model.head.layernorm.bias"))
rename_keys.append(("params/img/MAPHead_0/MlpBlock_0/Dense_0/kernel", "vision_model.head.mlp.fc1.weight"))
rename_keys.append(("params/img/MAPHead_0/MlpBlock_0/Dense_0/bias", "vision_model.head.mlp.fc1.bias"))
rename_keys.append(("params/img/MAPHead_0/MlpBlock_0/Dense_1/kernel", "vision_model.head.mlp.fc2.weight"))
rename_keys.append(("params/img/MAPHead_0/MlpBlock_0/Dense_1/bias", "vision_model.head.mlp.fc2.bias"))
rename_keys.append(("params/img/MAPHead_0/MultiHeadDotProductAttention_0/out/kernel", "vision_model.head.attention.out_proj.weight"))
rename_keys.append(("params/img/MAPHead_0/MultiHeadDotProductAttention_0/out/bias", "vision_model.head.attention.out_proj.bias"))
rename_keys.append(("params/txt/Embed_0/embedding", "text_model.embeddings.token_embedding.weight"))
rename_keys.append(("params/txt/pos_embedding", "text_model.embeddings.position_embedding.weight"))
for i in range(config.text_config.num_hidden_layers):
rename_keys.append((f"params/txt/Encoder_0/encoderblock_{i}/LayerNorm_0/scale", f"text_model.encoder.layers.{i}.layer_norm1.weight"))
rename_keys.append((f"params/txt/Encoder_0/encoderblock_{i}/LayerNorm_0/bias", f"text_model.encoder.layers.{i}.layer_norm1.bias"))
rename_keys.append((f"params/txt/Encoder_0/encoderblock_{i}/LayerNorm_1/scale", f"text_model.encoder.layers.{i}.layer_norm2.weight"))
rename_keys.append((f"params/txt/Encoder_0/encoderblock_{i}/LayerNorm_1/bias", f"text_model.encoder.layers.{i}.layer_norm2.bias"))
rename_keys.append((f"params/txt/Encoder_0/encoderblock_{i}/MlpBlock_0/Dense_0/kernel", f"text_model.encoder.layers.{i}.mlp.fc1.weight"))
rename_keys.append((f"params/txt/Encoder_0/encoderblock_{i}/MlpBlock_0/Dense_0/bias", f"text_model.encoder.layers.{i}.mlp.fc1.bias"))
rename_keys.append((f"params/txt/Encoder_0/encoderblock_{i}/MlpBlock_0/Dense_1/kernel", f"text_model.encoder.layers.{i}.mlp.fc2.weight"))
rename_keys.append((f"params/txt/Encoder_0/encoderblock_{i}/MlpBlock_0/Dense_1/bias", f"text_model.encoder.layers.{i}.mlp.fc2.bias"))
rename_keys.append((f"params/txt/Encoder_0/encoderblock_{i}/MultiHeadDotProductAttention_0/key/kernel", f"text_model.encoder.layers.{i}.self_attn.k_proj.weight"))
rename_keys.append((f"params/txt/Encoder_0/encoderblock_{i}/MultiHeadDotProductAttention_0/key/bias", f"text_model.encoder.layers.{i}.self_attn.k_proj.bias"))
rename_keys.append((f"params/txt/Encoder_0/encoderblock_{i}/MultiHeadDotProductAttention_0/value/kernel", f"text_model.encoder.layers.{i}.self_attn.v_proj.weight"))
rename_keys.append((f"params/txt/Encoder_0/encoderblock_{i}/MultiHeadDotProductAttention_0/value/bias", f"text_model.encoder.layers.{i}.self_attn.v_proj.bias"))
rename_keys.append((f"params/txt/Encoder_0/encoderblock_{i}/MultiHeadDotProductAttention_0/query/kernel", f"text_model.encoder.layers.{i}.self_attn.q_proj.weight"))
rename_keys.append((f"params/txt/Encoder_0/encoderblock_{i}/MultiHeadDotProductAttention_0/query/bias", f"text_model.encoder.layers.{i}.self_attn.q_proj.bias"))
rename_keys.append((f"params/txt/Encoder_0/encoderblock_{i}/MultiHeadDotProductAttention_0/out/kernel", f"text_model.encoder.layers.{i}.self_attn.out_proj.weight"))
rename_keys.append((f"params/txt/Encoder_0/encoderblock_{i}/MultiHeadDotProductAttention_0/out/bias", f"text_model.encoder.layers.{i}.self_attn.out_proj.bias"))
rename_keys.append(("params/txt/Encoder_0/encoder_norm/scale", "text_model.final_layer_norm.weight"))
rename_keys.append(("params/txt/Encoder_0/encoder_norm/bias", "text_model.final_layer_norm.bias"))
rename_keys.append(("params/txt/head/kernel", "text_model.head.weight"))
rename_keys.append(("params/txt/head/bias", "text_model.head.bias"))
rename_keys.append(("params/t", "logit_scale"))
rename_keys.append(("params/b", "logit_bias"))
return rename_keys
def rename_key(dct, old, new, config):
val = dct.pop(old)
if ("out_proj" in new or "v_proj" in new or "k_proj" in new or "q_proj" in new) and "vision" in new:
val = val.reshape(-1, config.vision_config.hidden_size)
if ("out_proj" in new or "v_proj" in new or "k_proj" in new or "q_proj" in new) and "text" in new:
val = val.reshape(-1, config.text_config.hidden_size)
if "patch_embedding.weight" in new:
val = val.transpose(3, 2, 0, 1)
elif new.endswith("weight") and "position_embedding" not in new and "token_embedding" not in new:
val = val.T
if "position_embedding" in new and "vision" in new:
val = val.reshape(-1, config.vision_config.hidden_size)
if "position_embedding" in new and "text" in new:
val = val.reshape(-1, config.text_config.hidden_size)
if new.endswith("bias"):
val = val.reshape(-1)
dct[new] = torch.from_numpy(val)
def read_in_q_k_v_head(state_dict, config):
key_proj_weight = (
state_dict.pop("params/img/MAPHead_0/MultiHeadDotProductAttention_0/key/kernel")
.reshape(-1, config.vision_config.hidden_size)
.T
)
key_proj_bias = state_dict.pop("params/img/MAPHead_0/MultiHeadDotProductAttention_0/key/bias").reshape(-1)
value_proj_weight = (
state_dict.pop("params/img/MAPHead_0/MultiHeadDotProductAttention_0/value/kernel")
.reshape(-1, config.vision_config.hidden_size)
.T
)
value_proj_bias = state_dict.pop("params/img/MAPHead_0/MultiHeadDotProductAttention_0/value/bias").reshape(-1)
query_proj_weight = (
state_dict.pop("params/img/MAPHead_0/MultiHeadDotProductAttention_0/query/kernel")
.reshape(-1, config.vision_config.hidden_size)
.T
)
query_proj_bias = state_dict.pop("params/img/MAPHead_0/MultiHeadDotProductAttention_0/query/bias").reshape(-1)
state_dict["vision_model.head.attention.in_proj_weight"] = torch.from_numpy(
np.concatenate([query_proj_weight, key_proj_weight, value_proj_weight], axis=0)
)
state_dict["vision_model.head.attention.in_proj_bias"] = torch.from_numpy(
np.concatenate([query_proj_bias, key_proj_bias, value_proj_bias], axis=0)
)
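# Illustrative sketch: the target of the conversion above is a torch.nn.MultiheadAttention,
# which stores its query/key/value projections concatenated into a single `in_proj_weight`
# of shape (3 * embed_dim, embed_dim) in the order query, key, value -- hence the
# concatenation order used in read_in_q_k_v_head.
import torch
import torch.nn as nn

embed_dim, num_heads = 8, 2
mha = nn.MultiheadAttention(embed_dim, num_heads, batch_first=True)
print(mha.in_proj_weight.shape)  # torch.Size([24, 8]) == (3 * embed_dim, embed_dim)
print(mha.in_proj_bias.shape)    # torch.Size([24])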
def convert_siglip_checkpoint(model_name, pytorch_dump_folder_path, verify_logits=True, push_to_hub=False):
"""
Copy/paste/tweak model's weights to our SigLIP structure.
"""
config = get_siglip_config(model_name)
checkpoint = model_name_to_checkpoint[model_name]
if "i18n" in model_name:
vocab_file = "/Users/nielsrogge/Documents/SigLIP/multilingual_vocab/sentencepiece.model"
else:
vocab_file = "/Users/nielsrogge/Documents/SigLIP/english_vocab/sentencepiece.model"
data = load(checkpoint)
state_dict = flatten_nested_dict(data)
rename_keys = create_rename_keys(config)
for src, dest in rename_keys:
rename_key(state_dict, src, dest, config)
read_in_q_k_v_head(state_dict, config)
model = SiglipModel(config).eval()
model.load_state_dict(state_dict)
image_size = config.vision_config.image_size
size = {"height": image_size, "width": image_size}
image_processor = SiglipImageProcessor(size=size)
tokenizer = SiglipTokenizer(vocab_file=vocab_file, model_input_names=["input_ids"])
processor = SiglipProcessor(image_processor=image_processor, tokenizer=tokenizer)
url_1 = "https://cdn.openai.com/multimodal-neurons/assets/apple/apple-ipod.jpg"
image_1 = Image.open(requests.get(url_1, stream=True).raw).convert("RGB")
url_2 = "https://cdn.openai.com/multimodal-neurons/assets/apple/apple-blank.jpg"
image_2 = Image.open(requests.get(url_2, stream=True).raw).convert("RGB")
texts = ["an apple", "a picture of an apple"]
inputs = processor(images=[image_1, image_2], text=texts, return_tensors="pt", padding="max_length")
if image_size == 224:
filename = "siglip_pixel_values.pt"
elif image_size == 256:
filename = "siglip_pixel_values_256.pt"
elif image_size == 384:
filename = "siglip_pixel_values_384.pt"
elif image_size == 512:
filename = "siglip_pixel_values_512.pt"
else:
raise ValueError("Image size not supported")
filepath = hf_hub_download(repo_id="nielsr/test-image", filename=filename, repo_type="dataset")
original_pixel_values = torch.load(filepath)
filepath = hf_hub_download(repo_id="nielsr/test-image", filename="siglip_input_ids.pt", repo_type="dataset")
original_input_ids = torch.load(filepath)
if "i18n" not in model_name:
assert inputs.input_ids.tolist() == original_input_ids.tolist()
print("Mean of original pixel values:", original_pixel_values.mean())
print("Mean of new pixel values:", inputs.pixel_values.mean())
with torch.no_grad():
outputs = model(input_ids=inputs.input_ids, pixel_values=original_pixel_values)
print(outputs.logits_per_image[:3, :3])
probs = torch.sigmoid(outputs.logits_per_image)
print(f"{probs[0][0]:.1%} that image 0 is '{texts[0]}'")
print(f"{probs[0][1]:.1%} that image 0 is '{texts[1]}'")
if verify_logits:
if model_name == "siglip-base-patch16-224":
expected_slice = torch.tensor(
[[-2.9621, -2.1672], [-0.2713, 0.2910]],
)
elif model_name == "siglip-base-patch16-256":
expected_slice = torch.tensor(
[[-3.1146, -1.9894], [-0.7312, 0.6387]],
)
elif model_name == "siglip-base-patch16-384":
expected_slice = torch.tensor(
[[-2.8098, -2.1891], [-0.4242, 0.4102]],
)
elif model_name == "siglip-base-patch16-512":
expected_slice = torch.tensor(
[[-2.7899, -2.2668], [-0.4295, -0.0735]],
)
elif model_name == "siglip-large-patch16-256":
expected_slice = torch.tensor(
[[-1.5827, -0.5801], [-0.9153, 0.1363]],
)
elif model_name == "siglip-large-patch16-384":
expected_slice = torch.tensor(
[[-2.1523, -0.2899], [-0.2959, 0.7884]],
)
elif model_name == "siglip-so400m-patch14-384":
expected_slice = torch.tensor([[-1.2441, -0.6649], [-0.7060, 0.7374]])
elif model_name == "siglip-base-patch16-256-i18n":
expected_slice = torch.tensor(
[[-0.9064, 0.1073], [-0.0299, 0.5304]],
)
assert torch.allclose(outputs.logits_per_image[:3, :3], expected_slice, atol=1e-4)
print("Looks ok!")
if pytorch_dump_folder_path is not None:
Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
print(f"Saving model {model_name} to {pytorch_dump_folder_path}")
model.save_pretrained(pytorch_dump_folder_path)
print(f"Saving processor to {pytorch_dump_folder_path}")
processor.save_pretrained(pytorch_dump_folder_path)
if push_to_hub:
model.push_to_hub(f"nielsr/{model_name}")
processor.push_to_hub(f"nielsr/{model_name}")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--model_name",
default="siglip-base-patch16-224",
type=str,
choices=model_name_to_checkpoint.keys(),
help="Name of the model you'd like to convert.",
)
parser.add_argument(
"--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory."
)
parser.add_argument(
"--verify_logits",
action="store_false",
help="Whether to verify logits against the original implementation.",
)
parser.add_argument(
"--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
)
args = parser.parse_args()
convert_siglip_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.verify_logits, args.push_to_hub)
.\models\siglip\image_processing_siglip.py
from typing import Dict, List, Optional, Union
from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
from ...image_transforms import (
resize,
to_channel_dimension_format,
)
from ...image_utils import (
IMAGENET_STANDARD_MEAN,
IMAGENET_STANDARD_STD,
ChannelDimension,
ImageInput,
PILImageResampling,
infer_channel_dimension_format,
is_scaled_image,
make_list_of_images,
to_numpy_array,
valid_images,
validate_kwargs,
validate_preprocess_arguments,
)
from ...utils import TensorType, is_vision_available, logging
logger = logging.get_logger(__name__)
if is_vision_available():
import PIL
class SiglipImageProcessor(BaseImageProcessor):
r"""
Constructs a SigLIP image processor.
"""
model_input_names = ["pixel_values"]
def __init__(
self,
do_resize: bool = True,
size: Dict[str, int] = None,
resample: PILImageResampling = PILImageResampling.BICUBIC,
do_rescale: bool = True,
rescale_factor: Union[int, float] = 1 / 255,
do_normalize: bool = True,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
**kwargs,
) -> None:
super().__init__(**kwargs)
size = size if size is not None else {"height": 224, "width": 224}
image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN
image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD
self.do_resize = do_resize
self.size = size
self.resample = resample
self.do_rescale = do_rescale
self.rescale_factor = rescale_factor
self.do_normalize = do_normalize
self.image_mean = image_mean
self.image_std = image_std
self._valid_processor_keys = [
"images",
"do_resize",
"size",
"resample",
"do_rescale",
"rescale_factor",
"do_normalize",
"image_mean",
"image_std",
"return_tensors",
"data_format",
"input_data_format",
]
def preprocess(
self,
images: ImageInput,
do_resize: bool = None,
size: Dict[str, int] = None,
resample: PILImageResampling = None,
do_rescale: bool = None,
rescale_factor: float = None,
do_normalize: bool = None,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
.\models\siglip\modeling_siglip.py
import math
import warnings
from dataclasses import dataclass
from typing import Any, Optional, Tuple, Union
import numpy as np
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from torch.nn.init import _calculate_fan_in_and_fan_out
from ...activations import ACT2FN
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput
from ...modeling_utils import PreTrainedModel
from ...utils import (
ModelOutput,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "SiglipConfig"
_CHECKPOINT_FOR_DOC = "google/siglip-base-patch16-224"
_IMAGE_CLASS_CHECKPOINT = "google/siglip-base-patch16-224"
_IMAGE_CLASS_EXPECTED_OUTPUT = "LABEL_1"
SIGLIP_PRETRAINED_MODEL_ARCHIVE_LIST = [
"google/siglip-base-patch16-224",
]
def _trunc_normal_(tensor, mean, std, a, b):
def norm_cdf(x):
return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0
if (mean < a - 2 * std) or (mean > b + 2 * std):
warnings.warn(
"mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
"The distribution of values may be incorrect.",
stacklevel=2,
)
l = norm_cdf((a - mean) / std)
u = norm_cdf((b - mean) / std)
tensor.uniform_(2 * l - 1, 2 * u - 1)
tensor.erfinv_()
tensor.mul_(std * math.sqrt(2.0))
tensor.add_(mean)
tensor.clamp_(min=a, max=b)
def trunc_normal_tf_(
tensor: torch.Tensor, mean: float = 0.0, std: float = 1.0, a: float = -2.0, b: float = 2.0
) -> torch.Tensor:
"""Fills the input Tensor with values drawn from a truncated
normal distribution. The values are effectively drawn from the
normal distribution :math:`\\mathcal{N}(\text{mean}, \text{std}^2)`
with values outside :math:`[a, b]` redrawn until they are within
the bounds. The method used for generating the random values works
best when :math:`a \\leq \text{mean} \\leq b`.
NOTE: this 'tf' variant behaves closer to Tensorflow / JAX impl where the
bounds [a, b] are applied when sampling the normal distribution with mean=0, std=1.0
and the result is subsequently scaled and shifted by the mean and std args.
Args:
tensor: an n-dimensional `torch.Tensor`
mean: the mean of the normal distribution
std: the standard deviation of the normal distribution
a: the minimum cutoff value
b: the maximum cutoff value
"""
with torch.no_grad():
_trunc_normal_(tensor, 0, 1.0, a, b)
tensor.mul_(std).add_(mean)
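# Illustrative check of the 'tf' truncated-normal initializer defined above (run inside this
# module, since it uses trunc_normal_tf_): the [-2, 2] cutoffs are applied in unit-normal
# space before scaling, so the result lands in [mean - 2*std, mean + 2*std]. This makes
# trunc_normal_tf_(t, mean=m, std=s) behave like nn.init.trunc_normal_(t, m, s, a=m-2*s, b=m+2*s).
import torch

t = torch.empty(10000)
trunc_normal_tf_(t, mean=1.0, std=0.5, a=-2.0, b=2.0)
print(t.min().item() >= 0.0, t.max().item() <= 2.0)  # True True
print(round(t.mean().item(), 2))                     # close to 1.0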
def variance_scaling_(tensor, scale=1.0, mode="fan_in", distribution="normal"):
fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor)
if mode == "fan_in":
denom = fan_in
elif mode == "fan_out":
denom = fan_out
elif mode == "fan_avg":
denom = (fan_in + fan_out) / 2
variance = scale / denom
if distribution == "truncated_normal":
trunc_normal_tf_(tensor, std=math.sqrt(variance) / 0.87962566103423978)
elif distribution == "normal":
with torch.no_grad():
tensor.normal_(std=math.sqrt(variance))
elif distribution == "uniform":
bound = math.sqrt(3 * variance)
with torch.no_grad():
tensor.uniform_(-bound, bound)
else:
raise ValueError(f"invalid distribution {distribution}")
def lecun_normal_(tensor):
variance_scaling_(tensor, mode="fan_in", distribution="truncated_normal")
def default_flax_embed_init(tensor):
variance_scaling_(tensor, mode="fan_in", distribution="normal")
@dataclass
class SiglipVisionModelOutput(ModelOutput):
"""
Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.
Args:
image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
The image embeddings obtained by applying the projection layer to the pooled output.
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer,
plus one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attention weights after the softmax, used to compute the weighted average in the self-attention heads.
"""
image_embeds: Optional[torch.FloatTensor] = None
last_hidden_state: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
@dataclass
class SiglipTextModelOutput(ModelOutput):
"""
Base class for text model outputs that also contains a pooling of the last hidden states.
Args:
text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when model is initialized with `with_projection=True`):
The text embeddings obtained by applying the projection layer to the pooled output.
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer,
plus one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attention weights after the softmax, used to compute the weighted average in the self-attention heads.
"""
text_embeds: Optional[torch.FloatTensor] = None
last_hidden_state: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
@dataclass
class SiglipOutput(ModelOutput):
"""
Siglip 输出的基类。
"""
Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
Contrastive loss for image-text similarity.
logits_per_image: (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
similarity scores.
logits_per_text: (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
similarity scores.
text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
The text embeddings obtained by applying the projection layer to the pooled output of `SiglipTextModel`.
image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
The image embeddings obtained by applying the projection layer to the pooled output of `SiglipVisionModel`.
text_model_output (`BaseModelOutputWithPooling`):
The output of the `SiglipTextModel`.
vision_model_output (`BaseModelOutputWithPooling`):
The output of the `SiglipVisionModel`.
"""
# 定义一个类,用于封装对比损失和模型输出
loss: Optional[torch.FloatTensor] = None
logits_per_image: torch.FloatTensor = None
logits_per_text: torch.FloatTensor = None
text_embeds: torch.FloatTensor = None
image_embeds: torch.FloatTensor = None
text_model_output: BaseModelOutputWithPooling = None
vision_model_output: BaseModelOutputWithPooling = None
def to_tuple(self) -> Tuple[Any]:
# return a tuple of the fields; `text_model_output` and `vision_model_output` are themselves converted to tuples
return tuple(
self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
for k in self.keys()
)
class SiglipVisionEmbeddings(nn.Module):
def __init__(self, config: SiglipVisionConfig):
super().__init__()
self.config = config
self.embed_dim = config.hidden_size
self.image_size = config.image_size
self.patch_size = config.patch_size
# non-overlapping patch embedding: a convolution with kernel size == stride == patch size
self.patch_embedding = nn.Conv2d(
in_channels=config.num_channels,
out_channels=self.embed_dim,
kernel_size=self.patch_size,
stride=self.patch_size,
padding="valid",
)
# one position per patch; the position ids buffer is registered so it moves with the module
self.num_patches = (self.image_size // self.patch_size) ** 2
self.num_positions = self.num_patches
self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)
def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
# embed the patches, flatten the spatial grid into a sequence and add learned position embeddings
patch_embeds = self.patch_embedding(pixel_values)
embeddings = patch_embeds.flatten(2).transpose(1, 2)
embeddings = embeddings + self.position_embedding(self.position_ids)
return embeddings
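# Illustrative sketch of the shapes produced above with the default 224x224 image / patch 16
# setup: (224 / 16)^2 = 196 patches, so the vision tower sees a sequence of 196 patch embeddings.
import torch
import torch.nn as nn

image_size, patch_size, embed_dim = 224, 16, 768
patch_embedding = nn.Conv2d(3, embed_dim, kernel_size=patch_size, stride=patch_size)

pixel_values = torch.randn(1, 3, image_size, image_size)
patch_embeds = patch_embedding(pixel_values)          # (1, 768, 14, 14)
embeddings = patch_embeds.flatten(2).transpose(1, 2)  # (1, 196, 768)
print(embeddings.shape)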
# Copied from transformers.models.clip.modeling_clip.CLIPTextEmbeddings with CLIP->Siglip
class SiglipTextEmbeddings(nn.Module):
def __init__(self, config: SiglipTextConfig):
super().__init__()
embed_dim = config.hidden_size
self.token_embedding = nn.Embedding(config.vocab_size, embed_dim)
self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim)
# position_ids (1, len position emb) is contiguous in memory and exported when serialized
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
) -> torch.Tensor:
seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2]
# default to the registered position ids, truncated to the sequence length
if position_ids is None:
position_ids = self.position_ids[:, :seq_length]
# embed the input ids unless embeddings were passed in directly
if inputs_embeds is None:
inputs_embeds = self.token_embedding(input_ids)
# token embeddings and position embeddings are simply summed
position_embeddings = self.position_embedding(position_ids)
embeddings = inputs_embeds + position_embeddings
return embeddings
class SiglipAttention(nn.Module):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
# Copied from transformers.models.clip.modeling_clip.CLIPAttention.__init__
def __init__(self, config):
super().__init__()
self.config = config
self.embed_dim = config.hidden_size
self.num_heads = config.num_attention_heads
self.head_dim = self.embed_dim // self.num_heads
# the embedding dimension must be evenly split across the attention heads
if self.head_dim * self.num_heads != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
f" {self.num_heads})."
)
# scores are scaled by 1/sqrt(head_dim)
self.scale = self.head_dim**-0.5
self.dropout = config.attention_dropout
# separate linear projections for keys, values, queries and the output
self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)
# 前向传播函数,执行输入张量的注意力计算
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
"""Input shape: Batch x Time x Channel"""
# shapes of the hidden states
batch_size, q_len, _ = hidden_states.size()
# project the hidden states to queries, keys and values
query_states = self.q_proj(hidden_states)
key_states = self.k_proj(hidden_states)
value_states = self.v_proj(hidden_states)
# reshape to (batch_size, num_heads, q_len, head_dim)
query_states = query_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2)
key_states = key_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2)
value_states = value_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2)
# key/value sequence length
k_v_seq_len = key_states.shape[-2]
# attention weights: scaled dot product of queries and keys
attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) * self.scale
# sanity-check the attention weight shape
if attn_weights.size() != (batch_size, self.num_heads, q_len, k_v_seq_len):
raise ValueError(
f"Attention weights should be of size {(batch_size, self.num_heads, q_len, k_v_seq_len)}, but is"
f" {attn_weights.size()}"
)
# if an attention mask is given, add it to the attention weights
if attention_mask is not None:
if attention_mask.size() != (batch_size, 1, q_len, k_v_seq_len):
raise ValueError(
f"Attention mask should be of size {(batch_size, 1, q_len, k_v_seq_len)}, but is {attention_mask.size()}"
)
attn_weights = attn_weights + attention_mask
# softmax in float32 for numerical stability, then cast back and apply dropout
attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
# weighted sum of the value vectors
attn_output = torch.matmul(attn_weights, value_states)
# sanity-check the attention output shape
if attn_output.size() != (batch_size, self.num_heads, q_len, self.head_dim):
raise ValueError(
f"`attn_output` should be of size {(batch_size, self.num_heads, q_len, self.head_dim)}, but is"
f" {attn_output.size()}"
)
# move the head dimension back and make the tensor contiguous
attn_output = attn_output.transpose(1, 2).contiguous()
# merge the heads into a (batch_size, q_len, embed_dim) tensor
attn_output = attn_output.reshape(batch_size, q_len, self.embed_dim)
# final output projection
attn_output = self.out_proj(attn_output)
# return the attention output and the attention weights
return attn_output, attn_weights
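# Illustrative aside (not part of the library source): the same scaled-dot-product computation on
# toy tensors, to make the shapes used above concrete.
import torch

batch, heads, q_len, head_dim = 2, 4, 5, 8
q = torch.randn(batch, heads, q_len, head_dim)
k = torch.randn(batch, heads, q_len, head_dim)
v = torch.randn(batch, heads, q_len, head_dim)

weights = torch.softmax(torch.matmul(q, k.transpose(2, 3)) * head_dim**-0.5, dim=-1)  # (batch, heads, q_len, q_len)
out = torch.matmul(weights, v)                                                        # (batch, heads, q_len, head_dim)
out = out.transpose(1, 2).reshape(batch, q_len, heads * head_dim)                     # (batch, q_len, embed_dim)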
# Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->Siglip
class SiglipMLP(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
self.activation_fn = ACT2FN[config.hidden_act]  # activation function looked up from the config
self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)  # first fully connected layer
self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)  # second fully connected layer
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.fc1(hidden_states)  # expand to the intermediate size
hidden_states = self.activation_fn(hidden_states)  # apply the activation
hidden_states = self.fc2(hidden_states)  # project back to the hidden size
return hidden_states
# Copied from transformers.models.clip.modeling_clip.CLIPEncoderLayer with CLIP->Siglip
class SiglipEncoderLayer(nn.Module):
def __init__(self, config: SiglipConfig):
super().__init__()
self.embed_dim = config.hidden_size  # embedding dimension
self.self_attn = SiglipAttention(config)  # self-attention block
self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)  # first layer norm
self.mlp = SiglipMLP(config)  # feed-forward block
self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)  # second layer norm
# Ignore copy
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: torch.Tensor,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.FloatTensor]:
"""
Args:
hidden_states (`torch.FloatTensor`):
Input tensor of shape `(batch, seq_len, embed_dim)`.
attention_mask (`torch.FloatTensor`):
Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values.
output_attentions (`bool`, *optional*, defaults to `False`):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more detail.
"""
residual = hidden_states  # keep the input for the residual connection
hidden_states = self.layer_norm1(hidden_states)  # pre-norm before attention
hidden_states, attn_weights = self.self_attn(
hidden_states=hidden_states,
attention_mask=attention_mask,
output_attentions=output_attentions,
)  # self-attention block
hidden_states = residual + hidden_states  # residual connection
residual = hidden_states  # new residual for the MLP block
hidden_states = self.layer_norm2(hidden_states)  # pre-norm before the MLP
hidden_states = self.mlp(hidden_states)  # feed-forward block
hidden_states = residual + hidden_states  # residual connection
outputs = (hidden_states,)  # main output
if output_attentions:
outputs += (attn_weights,)  # optionally also return the attention weights
return outputs
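# Illustrative aside (not part of the library source): the pre-norm residual pattern used above,
# with plain linear layers standing in for the attention and MLP sub-layers.
import torch
import torch.nn as nn

embed_dim = 16
norm1, norm2 = nn.LayerNorm(embed_dim), nn.LayerNorm(embed_dim)
sublayer1 = nn.Linear(embed_dim, embed_dim)  # stand-in for self-attention
sublayer2 = nn.Linear(embed_dim, embed_dim)  # stand-in for the MLP

x = torch.randn(2, 5, embed_dim)
x = x + sublayer1(norm1(x))  # normalize first, apply the sub-layer, then add the residual
x = x + sublayer2(norm2(x))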
class SiglipPreTrainedModel(PreTrainedModel):
"""
An abstract class that handles weight initialization and provides a simple interface for downloading and loading pretrained models.
"""
config_class = SiglipConfig  # configuration class
base_model_prefix = "siglip"  # prefix of the base model
supports_gradient_checkpointing = True  # gradient checkpointing is supported
def _init_weights(self, module):
"""Initialize the weights"""
# vision embeddings: position embeddings get a normal init scaled by the hidden width
if isinstance(module, SiglipVisionEmbeddings):
width = (
self.config.vision_config.hidden_size
if isinstance(self.config, SiglipConfig)
else self.config.hidden_size
)
nn.init.normal_(module.position_embedding.weight, std=1 / np.sqrt(width))
# plain embeddings: use the default Flax embedding initializer
elif isinstance(module, nn.Embedding):
default_flax_embed_init(module.weight)
# attention projections: Xavier-uniform weights, zero biases
elif isinstance(module, SiglipAttention):
nn.init.xavier_uniform_(module.q_proj.weight)
nn.init.xavier_uniform_(module.k_proj.weight)
nn.init.xavier_uniform_(module.v_proj.weight)
nn.init.xavier_uniform_(module.out_proj.weight)
nn.init.zeros_(module.q_proj.bias)
nn.init.zeros_(module.k_proj.bias)
nn.init.zeros_(module.v_proj.bias)
nn.init.zeros_(module.out_proj.bias)
# MLP: Xavier-uniform weights, biases drawn from a tiny normal distribution
elif isinstance(module, SiglipMLP):
nn.init.xavier_uniform_(module.fc1.weight)
nn.init.xavier_uniform_(module.fc2.weight)
nn.init.normal_(module.fc1.bias, std=1e-6)
nn.init.normal_(module.fc2.bias, std=1e-6)
# attention pooling head: Xavier-uniform probe and in-projection weights, zero bias
elif isinstance(module, SiglipMultiheadAttentionPoolingHead):
nn.init.xavier_uniform_(module.probe.data)
nn.init.xavier_uniform_(module.attention.in_proj_weight.data)
nn.init.zeros_(module.attention.in_proj_bias.data)
# full SigLIP model: logit_scale starts at log(1.0), logit_bias at zero
elif isinstance(module, SiglipModel):
logit_scale_init = torch.log(torch.tensor(1.0))
module.logit_scale.data.fill_(logit_scale_init)
module.logit_bias.data.zero_()
# remaining linear and convolution layers: LeCun-normal weights, zero biases
elif isinstance(module, (nn.Linear, nn.Conv2d)):
lecun_normal_(module.weight)
if module.bias is not None:
nn.init.zeros_(module.bias)
# layer norm: zero bias, unit weight
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
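# Illustrative aside (not part of the library source): `lecun_normal_` is a helper defined earlier in
# modeling_siglip.py. A sketch of what a LeCun-normal initializer of this kind typically does, under
# the assumption that it draws from a truncated normal with std = sqrt(1 / fan_in):
import math
import torch.nn as nn

def lecun_normal_sketch(weight):
    fan_in = weight[0].numel()  # in_features (Linear) or in_channels * kh * kw (Conv2d)
    nn.init.trunc_normal_(weight, std=math.sqrt(1.0 / fan_in))

linear = nn.Linear(64, 32)
lecun_normal_sketch(linear.weight)
nn.init.zeros_(linear.bias)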
# SIGLIP_START_DOCSTRING is a raw string with the generic model introduction: the model inherits from
# PreTrainedModel, and the superclass documentation covers common methods (downloading or saving a model,
# resizing the input embeddings, pruning heads, etc.).
SIGLIP_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`SiglipConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
# SIGLIP_TEXT_INPUTS_DOCSTRING is a raw string documenting the text inputs (argument names and types).
SIGLIP_TEXT_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
config.max_position_embeddings - 1]`.
[What are position IDs?](../glossary#position-ids)
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
# SIGLIP_VISION_INPUTS_DOCSTRING documents the vision inputs; the per-argument descriptions are omitted in this excerpt.
SIGLIP_VISION_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
output_attentions (`bool`, *optional*):
output_hidden_states (`bool`, *optional*):
return_dict (`bool`, *optional*):
"""
SIGLIP_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide it.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
[What are position IDs?](../glossary#position-ids)
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
return_loss (`bool`, *optional*):
Whether or not to return the contrastive loss.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
# Copied from transformers.models.clip.modeling_clip.CLIPEncoder with CLIP->Siglip
class SiglipEncoder(nn.Module):
"""
Transformer encoder consisting of `config.num_hidden_layers` self-attention layers. Each layer is a [`SiglipEncoderLayer`].
Args:
config: SiglipConfig
"""
def __init__(self, config: SiglipConfig):
super().__init__()
self.config = config
# stack of `config.num_hidden_layers` SiglipEncoderLayer modules
self.layers = nn.ModuleList([SiglipEncoderLayer(config) for _ in range(config.num_hidden_layers)])
# gradient checkpointing is disabled by default
self.gradient_checkpointing = False
# Ignore copy
def forward(
self,
inputs_embeds,
attention_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
class SiglipTextTransformer(nn.Module):
def __init__(self, config: SiglipTextConfig):
super().__init__()
self.config = config
embed_dim = config.hidden_size
self.embeddings = SiglipTextEmbeddings(config)  # text embedding layer
self.encoder = SiglipEncoder(config)  # transformer encoder
self.final_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)  # final layer norm
self.head = nn.Linear(embed_dim, embed_dim)  # linear head applied to the pooled output
@add_start_docstrings_to_model_forward(SIGLIP_TEXT_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=SiglipTextConfig)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPooling]:
r"""
Returns:
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if input_ids is None:
raise ValueError("You have to specify input_ids")  # input ids are required
input_shape = input_ids.size()
input_ids = input_ids.view(-1, input_shape[-1])  # flatten the input ids to two dimensions
hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids)  # embed the input tokens
# note: SigLIP's text model does not use a causal mask, unlike the original CLIP model.
# expand attention_mask
if attention_mask is not None:
# [batch_size, seq_len] -> [batch_size, 1, tgt_seq_len, src_seq_len]
attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_states.dtype)  # expand to a 4D additive mask
encoder_outputs = self.encoder(
inputs_embeds=hidden_states,
attention_mask=attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)  # run the encoder
last_hidden_state = encoder_outputs[0]  # last hidden state of the encoder
last_hidden_state = self.final_layer_norm(last_hidden_state)  # apply the final layer norm
# Assuming "sticky" EOS tokenization, last token is always EOS.
pooled_output = last_hidden_state[:, -1, :]  # pool by taking the last token
pooled_output = self.head(pooled_output)  # project the pooled output through the linear head
if not return_dict:
return (last_hidden_state, pooled_output) + encoder_outputs[1:]  # tuple output
return BaseModelOutputWithPooling(
last_hidden_state=last_hidden_state,
pooler_output=pooled_output,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)  # dict-style output with the last hidden state, pooled output, hidden states and attentions
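# Illustrative aside (not part of the library source): a conceptual sketch of what expanding the
# attention mask to 4D does, i.e. turning a (batch, seq_len) 0/1 mask into an additive
# (batch, 1, seq_len, seq_len) mask with large negative values at padded positions. The helper
# `_prepare_4d_attention_mask` used above is the library's; this function is only an approximation.
import torch

def expand_mask_sketch(mask, dtype):
    batch, src_len = mask.shape
    expanded = mask[:, None, None, :].expand(batch, 1, src_len, src_len).to(dtype)
    return (1.0 - expanded) * torch.finfo(dtype).min

mask = torch.tensor([[1, 1, 1, 0], [1, 1, 0, 0]])
print(expand_mask_sketch(mask, torch.float32).shape)  # torch.Size([2, 1, 4, 4])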
# The text model from SigLIP, without any extra head or projection on top
@add_start_docstrings(
"""The text model from SigLIP without any head or projection on top.""",
SIGLIP_START_DOCSTRING,  # prepend the generic SigLIP docstring
)
class SiglipTextModel(SiglipPreTrainedModel):
config_class = SiglipTextConfig  # configuration class
_no_split_modules = ["SiglipTextEmbeddings", "SiglipEncoderLayer"]  # modules that must not be split across devices
def __init__(self, config: SiglipTextConfig):
super().__init__(config)
self.text_model = SiglipTextTransformer(config)  # the underlying SiglipTextTransformer
# initialize the weights and apply final processing
self.post_init()
def get_input_embeddings(self) -> nn.Module:
return self.text_model.embeddings.token_embedding  # return the input embedding layer
def set_input_embeddings(self, value):
self.text_model.embeddings.token_embedding = value  # replace the input embedding layer
@add_start_docstrings_to_model_forward(SIGLIP_TEXT_INPUTS_DOCSTRING)  # document the forward inputs
@replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=SiglipTextConfig)  # document the return type
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPooling]:
r"""
Returns:
Examples:
```
>>> from transformers import AutoTokenizer, SiglipTextModel
>>> model = SiglipTextModel.from_pretrained("google/siglip-base-patch16-224")
>>> tokenizer = AutoTokenizer.from_pretrained("google/siglip-base-patch16-224")
>>>
>>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding="max_length", return_tensors="pt")
>>> outputs = model(**inputs)
>>> last_hidden_state = outputs.last_hidden_state
>>> pooled_output = outputs.pooler_output
```"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
return self.text_model(
input_ids=input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
class SiglipVisionTransformer(nn.Module):
def __init__(self, config: SiglipVisionConfig):
super().__init__()
self.config = config
embed_dim = config.hidden_size
self.embeddings = SiglipVisionEmbeddings(config)  # vision embedding layer
self.encoder = SiglipEncoder(config)  # transformer encoder
self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)  # layer norm applied after the encoder
self.head = SiglipMultiheadAttentionPoolingHead(config)  # multi-head attention pooling head
@add_start_docstrings_to_model_forward(SIGLIP_VISION_INPUTS_DOCSTRING)  # document the forward inputs
@replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=SiglipVisionConfig)
# the decorator fills in the Returns section from BaseModelOutputWithPooling and SiglipVisionConfig
def forward(
self,
pixel_values,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPooling]:
r"""
Returns:
The Returns section is generated by the `replace_return_docstrings` decorator from `BaseModelOutputWithPooling`.
"""
# fall back to the config defaults when the flags are not passed explicitly
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# embed the pixel values into patch tokens
hidden_states = self.embeddings(pixel_values)
# run the encoder on the embedded patches
encoder_outputs = self.encoder(
inputs_embeds=hidden_states,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# take the encoder's last hidden state and apply the post layer norm
last_hidden_state = encoder_outputs[0]
last_hidden_state = self.post_layernorm(last_hidden_state)
# pool the patch tokens with the attention pooling head
pooled_output = self.head(last_hidden_state)
# tuple output when return_dict is False
if not return_dict:
return (last_hidden_state, pooled_output) + encoder_outputs[1:]
# otherwise return a BaseModelOutputWithPooling
return BaseModelOutputWithPooling(
last_hidden_state=last_hidden_state,
pooler_output=pooled_output,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
class SiglipMultiheadAttentionPoolingHead(nn.Module):
"""Multihead Attention Pooling."""
def __init__(self, config: SiglipVisionConfig):
super().__init__()
self.probe = nn.Parameter(torch.randn(1, 1, config.hidden_size))
self.attention = torch.nn.MultiheadAttention(config.hidden_size, config.num_attention_heads, batch_first=True)
self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.mlp = SiglipMLP(config)
def forward(self, hidden_state):
batch_size = hidden_state.shape[0]
probe = self.probe.repeat(batch_size, 1, 1)
# attend with the learned probe as the query and the hidden states as keys and values
hidden_state = self.attention(probe, hidden_state, hidden_state)[0]
residual = hidden_state
# layer norm followed by an MLP with a residual connection
hidden_state = self.layernorm(hidden_state)
hidden_state = residual + self.mlp(hidden_state)
# return the single pooled token (the probe position) for each batch element
return hidden_state[:, 0]
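# Illustrative aside (not part of the library source): attention pooling with a learned probe on toy
# tensors; a single query token attends over the whole sequence and its output is the pooled vector.
import torch
import torch.nn as nn

hidden_size, num_heads, batch, seq_len = 16, 4, 2, 9
probe = nn.Parameter(torch.randn(1, 1, hidden_size))
attention = nn.MultiheadAttention(hidden_size, num_heads, batch_first=True)

tokens = torch.randn(batch, seq_len, hidden_size)
pooled, _ = attention(probe.repeat(batch, 1, 1), tokens, tokens)  # (batch, 1, hidden_size)
pooled = pooled[:, 0]                                             # (batch, hidden_size)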
@add_start_docstrings(
"""The vision model from SigLIP without any head or projection on top.""",
SIGLIP_START_DOCSTRING,
)
class SiglipVisionModel(SiglipPreTrainedModel):
config_class = SiglipVisionConfig
main_input_name = "pixel_values"
def __init__(self, config: SiglipVisionConfig):
super().__init__(config)
self.vision_model = SiglipVisionTransformer(config)
# initialize the weights and apply final processing
self.post_init()
def get_input_embeddings(self) -> nn.Module:
# return the patch embedding layer of the vision model
return self.vision_model.embeddings.patch_embedding
@add_start_docstrings_to_model_forward(SIGLIP_VISION_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=SiglipVisionConfig)
def forward(
self,
pixel_values,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPooling]:
r"""
Returns:
Examples:
```
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, SiglipVisionModel
>>> model = SiglipVisionModel.from_pretrained("google/siglip-base-patch16-224")
>>> processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224")
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> inputs = processor(images=image, return_tensors="pt")
>>> outputs = model(**inputs)
>>> last_hidden_state = outputs.last_hidden_state
>>> pooled_output = outputs.pooler_output
```"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
return self.vision_model(
pixel_values=pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# attach the generic SigLIP docstring to the class
@add_start_docstrings(SIGLIP_START_DOCSTRING)
class SiglipModel(SiglipPreTrainedModel):
config_class = SiglipConfig  # configuration class
def __init__(self, config: SiglipConfig):
super().__init__(config)
# the text config must be a SiglipTextConfig
if not isinstance(config.text_config, SiglipTextConfig):
raise ValueError(
"config.text_config is expected to be of type SiglipTextConfig but is of type"
f" {type(config.text_config)}."
)
# the vision config must be a SiglipVisionConfig
if not isinstance(config.vision_config, SiglipVisionConfig):
raise ValueError(
"config.vision_config is expected to be of type SiglipVisionConfig but is of type"
f" {type(config.vision_config)}."
)
# unpack the text and vision configs
text_config = config.text_config
vision_config = config.vision_config
# build the text and vision towers
self.text_model = SiglipTextTransformer(text_config)
self.vision_model = SiglipVisionTransformer(vision_config)
# learnable scale and bias applied to the image-text logits
self.logit_scale = nn.Parameter(torch.randn(1))
self.logit_bias = nn.Parameter(torch.randn(1))
# initialize the weights and apply final processing
self.post_init()
# attach the text-inputs docstring to the method below
@add_start_docstrings_to_model_forward(SIGLIP_TEXT_INPUTS_DOCSTRING)
def get_text_features(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> torch.FloatTensor:
r"""
Returns:
text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by
applying the projection layer to the pooled output of [`SiglipTextModel`].
Examples:
```
>>> from transformers import AutoTokenizer, AutoModel
>>> import torch
>>> model = AutoModel.from_pretrained("google/siglip-base-patch16-224")
>>> tokenizer = AutoTokenizer.from_pretrained("google/siglip-base-patch16-224")
>>>
>>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding="max_length", return_tensors="pt")
>>> with torch.no_grad():
... text_features = model.get_text_features(**inputs)
```"""
# fall back to the config defaults for attentions, hidden states and return type
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# run the SigLIP text model on the inputs
text_outputs = self.text_model(
input_ids=input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# the pooled output of the text model is used as the text features
pooled_output = text_outputs[1]
return pooled_output
@add_start_docstrings_to_model_forward(SIGLIP_VISION_INPUTS_DOCSTRING)
def get_image_features(
self,
pixel_values: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> torch.FloatTensor:
r"""
Returns:
image_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The image embeddings obtained by
applying the projection layer to the pooled output of [`SiglipVisionModel`].
Examples:
```
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, AutoModel
>>> import torch
>>> model = AutoModel.from_pretrained("google/siglip-base-patch16-224")
>>> processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224")
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> inputs = processor(images=image, return_tensors="pt")
>>> with torch.no_grad():
... image_features = model.get_image_features(**inputs)
```"""
# Use SiglipModel's config for some fields (if specified) instead of those of vision & text components.
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# run the vision model on the pixel values
vision_outputs = self.vision_model(
pixel_values=pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# the pooled output of the vision model is used as the image features
pooled_output = vision_outputs[1]
return pooled_output
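# Illustrative aside (not part of the library source): SiglipModel.forward is not shown in this
# excerpt, but the SigLIP recipe pairs the two pooled feature sets with the learnable logit_scale and
# logit_bias and a pairwise sigmoid loss. A toy sketch of that pairing (shapes and formula only; the
# real forward also handles normalization, devices and optional outputs):
import torch
import torch.nn.functional as F

batch, dim = 4, 8
image_embeds = F.normalize(torch.randn(batch, dim), dim=-1)
text_embeds = F.normalize(torch.randn(batch, dim), dim=-1)
logit_scale, logit_bias = torch.tensor(0.0), torch.tensor(-2.0)  # toy values

logits = image_embeds @ text_embeds.t() * logit_scale.exp() + logit_bias  # (batch, batch)
labels = 2 * torch.eye(batch) - 1                                         # +1 for matching pairs, -1 otherwise
loss = -F.logsigmoid(labels * logits).mean()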
# SigLIP vision encoder with an image classification head on top (a linear layer over the pooled patch-token hidden states), e.g. for ImageNet.
@add_start_docstrings(
"""
SigLIP vision encoder with an image classification head on top (a linear layer on top of the pooled final hidden states of
the patch tokens) e.g. for ImageNet.
""",
SIGLIP_START_DOCSTRING,
)
class SiglipForImageClassification(SiglipPreTrainedModel):
main_input_name = "pixel_values"  # the main input is the pixel values
def __init__(self, config: SiglipConfig) -> None:
super().__init__(config)
self.num_labels = config.num_labels  # number of classification labels
self.vision_model = SiglipVisionTransformer(config.vision_config)  # vision backbone
# classifier head (identity when there are no labels)
self.classifier = (
nn.Linear(config.vision_config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity()
)
# initialize the weights and apply final processing
self.post_init()
# forward pass of the classification model
@add_start_docstrings_to_model_forward(SIGLIP_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=ImageClassifierOutput,
config_class=_CONFIG_FOR_DOC,
expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
)
def forward(
self,
pixel_values: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[tuple, ImageClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
# fall back to the config defaults when the flags are not passed explicitly
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# run the vision backbone on the pixel values
outputs = self.vision_model(
pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# sequence of patch-token hidden states
sequence_output = outputs[0]
# average pool the patch tokens
sequence_output = torch.mean(sequence_output[:, 1:, :], dim=1)
# classify the pooled representation
logits = self.classifier(sequence_output)
loss = None
if labels is not None:
# move labels to the same device as the logits to enable model parallelism
labels = labels.to(logits.device)
# determine the problem type (regression, single-label or multi-label classification)
if self.config.problem_type is None:
if self.num_labels == 1:
self.config.problem_type = "regression"
elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
self.config.problem_type = "single_label_classification"
else:
self.config.problem_type = "multi_label_classification"
# pick the loss function that matches the problem type
if self.config.problem_type == "regression":
loss_fct = MSELoss()
if self.num_labels == 1:
loss = loss_fct(logits.squeeze(), labels.squeeze())
else:
loss = loss_fct(logits, labels)
elif self.config.problem_type == "single_label_classification":
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(logits, labels)
# tuple output when return_dict is False
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
# otherwise return an ImageClassifierOutput
return ImageClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
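# Illustrative aside (not part of the library source): the classification head above on toy tensors,
# i.e. mean-pooling the patch tokens and applying a linear layer plus cross-entropy.
import torch
import torch.nn as nn

batch, num_patches, hidden_size, num_labels = 2, 9, 16, 3
classifier = nn.Linear(hidden_size, num_labels)

patch_tokens = torch.randn(batch, num_patches, hidden_size)
pooled = patch_tokens.mean(dim=1)                           # (batch, hidden_size)
logits = classifier(pooled)                                 # (batch, num_labels)
loss = nn.CrossEntropyLoss()(logits, torch.tensor([0, 2]))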
.\models\siglip\processing_siglip.py
"""
Image/Text processor class for SigLIP.
"""
from typing import List, Optional, Union
from ...feature_extraction_utils import BatchFeature
from ...image_utils import ImageInput
from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
from ...utils import TensorType
class SiglipProcessor(ProcessorMixin):
r"""
Constructs a Siglip processor which wraps a Siglip image processor and a Siglip tokenizer into a single processor.
[`SiglipProcessor`] offers all the functionalities of [`SiglipImageProcessor`] and [`SiglipTokenizer`]. See the
[`~SiglipProcessor.__call__`] and [`~SiglipProcessor.decode`] for more information.
Args:
image_processor ([`SiglipImageProcessor`]):
The image processor is a required input.
tokenizer ([`SiglipTokenizer`]):
The tokenizer is a required input.
"""
attributes = ["image_processor", "tokenizer"]
image_processor_class = "SiglipImageProcessor"
tokenizer_class = "SiglipTokenizer"
def __init__(self, image_processor, tokenizer):
super().__init__(image_processor, tokenizer)
def __call__(
self,
text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
images: ImageInput = None,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = None,
max_length: int = None,
return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
):
"""
This method combines the functionalities of both image processing and tokenization. It processes input text
and/or images according to specified padding, truncation, and max length parameters, and returns processed data
in a format based on the return_tensors argument.
"""
raise NotImplementedError
def decode(self, *args, **kwargs):
"""
This method forwards all its arguments to SiglipTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to
the docstring of this method for more information.
"""
return self.tokenizer.decode(*args, **kwargs)
def batch_decode(self, *args, **kwargs):
"""
This method forwards all its arguments to SiglipTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please
refer to the docstring of this method for more information.
"""
return self.tokenizer.batch_decode(*args, **kwargs)
@property
def model_input_names(self):
"""
Property that lists the model input names by merging those of the tokenizer and the image processor.
Copied from transformers.models.clip.processing_clip.CLIPProcessor.model_input_names with CLIP->Siglip.
"""
tokenizer_input_names = self.tokenizer.model_input_names
image_processor_input_names = self.image_processor.model_input_names
return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
.\models\siglip\tokenization_siglip.py
""" Tokenization class for SigLIP model."""
import os
import re
import string
import warnings
from shutil import copyfile
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
import sentencepiece as spm
from ...convert_slow_tokenizer import import_protobuf
from ...tokenization_utils import PreTrainedTokenizer
from ...tokenization_utils_base import AddedToken
if TYPE_CHECKING:
from ...tokenization_utils_base import TextInput
from ...utils import logging, requires_backends
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"google/siglip-base-patch16-224": "https://huggingface.co/google/siglip-base-patch16-224/resolve/main/spiece.model",
}
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"google/siglip-base-patch16-224": 256,
}
SPIECE_UNDERLINE = "▁"
class SiglipTokenizer(PreTrainedTokenizer):
"""
Construct a Siglip tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).
This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
this superclass for more information regarding those methods.
"""
def __init__(
self,
vocab_file=None,
tokenizer_file=None,
do_lower_case=False,
remove_space=True,
keep_accents=False,
unk_token="[UNK]",
sep_token="[SEP]",
pad_token="[PAD]",
cls_token="[CLS]",
mask_token="[MASK]",
**kwargs
):
pass
"""
Args:
vocab_file (`str`):
[SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
contains the vocabulary necessary to instantiate a tokenizer.
eos_token (`str`, *optional*, defaults to `"</s>"`):
The end of sequence token.
unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
pad_token (`str`, *optional*, defaults to `"</s>"`):
The token used for padding, for example when batching sequences of different lengths.
additional_special_tokens (`List[str]`, *optional*):
Additional special tokens used by the tokenizer.
sp_model_kwargs (`dict`, *optional*):
Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
to set:
- `enable_sampling`: Enable subword regularization.
- `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
- `nbest_size = {0,1}`: No sampling is performed.
- `nbest_size > 1`: samples from the nbest_size results.
- `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
using forward-filtering-and-backward-sampling algorithm.
- `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
BPE-dropout.
model_max_length (`int`, *optional*, defaults to 64):
The maximum length (in number of tokens) for model inputs.
do_lower_case (`bool`, *optional*, defaults to `True`):
Whether or not to lowercase the input when tokenizing.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["input_ids", "attention_mask"]
def __init__(
self,
vocab_file,
eos_token="</s>",
unk_token="<unk>",
pad_token="</s>",
additional_special_tokens=None,
sp_model_kwargs: Optional[Dict[str, Any]] = None,
model_max_length=64,
do_lower_case=True,
**kwargs,
):
@property
def vocab_size(self):
return self.sp_model.get_piece_size()
def get_vocab(self):
vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
vocab.update(self.added_tokens_encoder)
return vocab
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` method.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
return super().get_special_tokens_mask(
token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
)
if token_ids_1 is None:
return ([0] * len(token_ids_0)) + [1]
else:
return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
def _add_eos_if_not_present(self, token_ids: List[int]) -> List[int]:
"""Do not add eos again if user already added it."""
if len(token_ids) > 0 and token_ids[-1] == self.eos_token_id:
warnings.warn(
f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated"
" eos tokens being added."
)
return token_ids
else:
return token_ids + [self.eos_token_id]
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. T5 does not make
use of token type ids, therefore a list of zeros is returned.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of zeros.
"""
eos = [self.eos_token_id]
if token_ids_1 is None:
return len(token_ids_0 + eos) * [0]
else:
return len(token_ids_0 + eos + token_ids_1 + eos) * [0]
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. A sequence has the following format:
- single sequence: `X </s>`
- pair of sequences: `A </s> B </s>`
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
token_ids_0 = self._add_eos_if_not_present(token_ids_0)
if token_ids_1 is None:
return token_ids_0
else:
token_ids_1 = self._add_eos_if_not_present(token_ids_1)
return token_ids_0 + token_ids_1
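# Illustrative aside (not part of the library source): the `X </s>` / `A </s> B </s>` format on
# made-up token IDs, with eos_token_id assumed to be 1 purely for illustration.
eos_token_id = 1

def add_eos(ids):
    return ids if ids and ids[-1] == eos_token_id else ids + [eos_token_id]

ids_a, ids_b = [284, 93, 12], [77, 45]
print(add_eos(ids_a))                   # single sequence: [284, 93, 12, 1]
print(add_eos(ids_a) + add_eos(ids_b))  # pair of sequences: [284, 93, 12, 1, 77, 45, 1]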
def __getstate__(self):
state = self.__dict__.copy()
state["sp_model"] = None
return state
def __setstate__(self, d):
self.__dict__ = d
if not hasattr(self, "sp_model_kwargs"):
self.sp_model_kwargs = {}
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(self.vocab_file)
def remove_punctuation(self, text: str) -> str:
return text.translate(str.maketrans("", "", string.punctuation))
def canonicalize_text(self, text, *, keep_punctuation_exact_string=None):
"""Returns canonicalized `text` (puncuation removed).
Args:
text (`str`):
String to be canonicalized.
keep_punctuation_exact_string (`str`, *optional*):
If provided, then this exact string is kept. For example providing '{}' will keep any occurrences of '{}'
(but will still remove '{' and '}' that appear separately).
"""
if keep_punctuation_exact_string:
text = keep_punctuation_exact_string.join(
self.remove_punctuation(part) for part in text.split(keep_punctuation_exact_string)
)
else:
text = self.remove_punctuation(text)
text = re.sub(r"\s+", " ", text)
text = text.strip()
return text
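# Illustrative aside (not part of the library source): a standalone version of the same
# canonicalization logic, showing the effect of `keep_punctuation_exact_string`.
import re
import string

def canonicalize(text, keep_punctuation_exact_string=None):
    remove = lambda s: s.translate(str.maketrans("", "", string.punctuation))
    if keep_punctuation_exact_string:
        text = keep_punctuation_exact_string.join(
            remove(part) for part in text.split(keep_punctuation_exact_string)
        )
    else:
        text = remove(text)
    return re.sub(r"\s+", " ", text).strip()

print(canonicalize("Fill in the {} please!"))                                      # "Fill in the please"
print(canonicalize("Fill in the {} please!", keep_punctuation_exact_string="{}"))  # "Fill in the {} please"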
def tokenize(self, text: "TextInput", add_special_tokens=False, **kwargs) -> List[str]:
"""
Converts a string to a list of tokens.
"""
tokens = super().tokenize(SPIECE_UNDERLINE + text.replace(SPIECE_UNDERLINE, " "), **kwargs)
if len(tokens) > 1 and tokens[0] == SPIECE_UNDERLINE and tokens[1] in self.all_special_tokens:
tokens = tokens[1:]
return tokens
@property
def unk_token_length(self):
return len(self.sp_model.encode(str(self.unk_token)))
def _tokenize(self, text, **kwargs):
"""
Returns a tokenized string.
We de-activated the `add_dummy_prefix` option, thus the sentencepiece internals will always strip any
SPIECE_UNDERLINE.
For example: `self.sp_model.encode(f"{SPIECE_UNDERLINE}Hey", out_type = str)` will give `['H', 'e', 'y']` instead of `['▁He', 'y']`.
Thus we always encode `f"{unk_token}text"` and strip the `unk_token`. Here is an example with `unk_token = "<unk>"` and `unk_token_length = 4`.
`self.tokenizer.sp_model.encode("<unk> Hey", out_type = str)[4:]`.
"""
text = self.canonicalize_text(text, keep_punctuation_exact_string=None)
tokens = self.sp_model.encode(text, out_type=str)
tokens = self.sp_model.encode(self.unk_token + text, out_type=str)
return tokens[self.unk_token_length :] if len(tokens) >= self.unk_token_length else tokens
def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
return self.sp_model.piece_to_id(token)
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
token = self.sp_model.IdToPiece(index)
return token
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (string) in a single string."""
current_sub_tokens = []
out_string = ""
prev_is_special = False
for token in tokens:
if token in self.all_special_tokens:
if not prev_is_special:
out_string += " "
out_string += self.sp_model.decode(current_sub_tokens) + token
prev_is_special = True
current_sub_tokens = []
else:
current_sub_tokens.append(token)
prev_is_special = False
out_string += self.sp_model.decode(current_sub_tokens)
return out_string.strip()
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
out_vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
copyfile(self.vocab_file, out_vocab_file)
elif not os.path.isfile(self.vocab_file):
with open(out_vocab_file, "wb") as fi:
content_spiece_model = self.sp_model.serialized_model_proto()
fi.write(content_spiece_model)
return (out_vocab_file,)