Transformers Source Code Analysis (Part 129)
.\models\xmod\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
_import_structure = {
"configuration_xmod": [
"XMOD_PRETRAINED_CONFIG_ARCHIVE_MAP",
"XmodConfig",
"XmodOnnxConfig",
],
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_xmod"] = [
"XMOD_PRETRAINED_MODEL_ARCHIVE_LIST",
"XmodForCausalLM",
"XmodForMaskedLM",
"XmodForMultipleChoice",
"XmodForQuestionAnswering",
"XmodForSequenceClassification",
"XmodForTokenClassification",
"XmodModel",
"XmodPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_xmod import XMOD_PRETRAINED_CONFIG_ARCHIVE_MAP, XmodConfig, XmodOnnxConfig
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_xmod import (
XMOD_PRETRAINED_MODEL_ARCHIVE_LIST,
XmodForCausalLM,
XmodForMaskedLM,
XmodForMultipleChoice,
XmodForQuestionAnswering,
XmodForSequenceClassification,
XmodForTokenClassification,
XmodModel,
XmodPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
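The `_LazyModule` indirection keeps `import transformers` cheap: `modeling_xmod` is only imported the first time one of its attributes is accessed. A minimal usage sketch (assuming a working PyTorch install; the default `XmodConfig` builds a randomly initialized model, no download needed):

```python
# Accessing the names below triggers the lazy import of configuration_xmod /
# modeling_xmod; merely importing the transformers package does not.
from transformers import XmodConfig, XmodModel

config = XmodConfig()        # default X-MOD configuration
model = XmodModel(config)    # randomly initialized weights
print(type(model).__name__)  # XmodModel
```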
.\models\x_clip\configuration_x_clip.py
import os
from typing import Union
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
XCLIP_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"microsoft/xclip-base-patch32": "https://huggingface.co/microsoft/xclip-base-patch32/resolve/main/config.json",
}
class XCLIPTextConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`XCLIPModel`]. It is used to instantiate an X-CLIP
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the X-CLIP
[microsoft/xclip-base-patch32](https://huggingface.co/microsoft/xclip-base-patch32) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
"""
# Model type identifier for the X-CLIP text model
model_type = "xclip_text_model"
# Constructor setting the various parameters of the X-CLIP text model
def __init__(
self,
vocab_size=49408,  # vocabulary size; number of distinct tokens the model can represent
hidden_size=512,  # dimensionality of the encoder layers and the pooler layer
intermediate_size=2048,  # dimensionality of the "intermediate" (i.e. feed-forward) layer in the Transformer encoder
num_hidden_layers=12,  # number of hidden layers in the Transformer encoder
num_attention_heads=8,  # number of attention heads for each attention layer in the Transformer encoder
max_position_embeddings=77,  # maximum sequence length the model may be used with
hidden_act="quick_gelu",  # non-linear activation in encoder and pooler; "gelu", "relu", "selu" and "quick_gelu" are supported
layer_norm_eps=1e-5,  # epsilon used by the layer normalization layers
attention_dropout=0.0,  # dropout ratio for the attention probabilities
initializer_range=0.02,  # standard deviation of the truncated normal initializer for all weight matrices
initializer_factor=1.0,  # factor for initializing all weight matrices (keep at 1, used internally for initialization testing)
pad_token_id=1,  # id of the padding token
bos_token_id=0,  # id of the beginning-of-sequence token
eos_token_id=2,  # id of the end-of-sequence token
**kwargs,  # additional keyword arguments
):
# Call the parent constructor, passing the padding, beginning-of-sequence and end-of-sequence token ids
super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
# Store vocabulary size, hidden size, intermediate size, layer and head counts, maximum position embeddings,
# layer-norm epsilon, activation function, initializer range/factor and attention dropout on the instance
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.max_position_embeddings = max_position_embeddings
self.layer_norm_eps = layer_norm_eps
self.hidden_act = hidden_act
self.initializer_range = initializer_range
self.initializer_factor = initializer_factor
self.attention_dropout = attention_dropout
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
# Propagate token-related arguments into kwargs
cls._set_token_in_kwargs(kwargs)
# Fetch the config dict and updated kwargs from the pretrained model name or path
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
# If the loaded config is a full "xclip" config, use its nested text config
if config_dict.get("model_type") == "xclip":
config_dict = config_dict["text_config"]
# Warn if the config declares a model type different from this class's model_type
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
logger.warning(
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
)
# Build an instance of this class from the config dict and the remaining kwargs
return cls.from_dict(config_dict, **kwargs)
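Because `from_pretrained` special-cases `model_type == "xclip"`, the text configuration can be loaded straight from a full X-CLIP checkpoint; the nested `text_config` dict is extracted automatically. A small sketch (downloads only the config file from the Hub):

```python
from transformers import XCLIPTextConfig

text_config = XCLIPTextConfig.from_pretrained("microsoft/xclip-base-patch32")
print(text_config.hidden_size, text_config.max_position_embeddings)  # 512 77
```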
# XCLIPVisionConfig, a subclass of PretrainedConfig, stores the configuration of the X-CLIP vision model
class XCLIPVisionConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`XCLIPModel`]. It is used to instantiate an X-CLIP
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the X-CLIP
[microsoft/xclip-base-patch32](https://huggingface.co/microsoft/xclip-base-patch32) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
hidden_size (`int`, *optional*, defaults to 768):
Dimensionality of the encoder layers and the pooler layer.
intermediate_size (`int`, *optional*, defaults to 3072):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
num_hidden_layers (`int`, *optional*, defaults to 12):
Number of hidden layers in the Transformer encoder.
num_attention_heads (`int`, *optional*, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
mit_hidden_size (`int`, *optional*, defaults to 512):
Dimensionality of the encoder layers of the Multiframe Integration Transformer (MIT).
mit_intermediate_size (`int`, *optional*, defaults to 2048):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Multiframe Integration Transformer (MIT).
mit_num_hidden_layers (`int`, *optional*, defaults to 1):
Number of hidden layers in the Multiframe Integration Transformer (MIT).
mit_num_attention_heads (`int`, *optional*, defaults to 8):
Number of attention heads for each attention layer in the Multiframe Integration Transformer (MIT).
image_size (`int`, *optional*, defaults to 224):
The size (resolution) of each image.
patch_size (`int`, *optional*, defaults to 32):
The size (resolution) of each patch.
hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"` are supported.
layer_norm_eps (`float`, *optional*, defaults to 1e-5):
The epsilon used by the layer normalization layers.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
initializer_factor (`float`, *optional*, defaults to 1):
A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
testing).
drop_path_rate (`float`, *optional*, defaults to 0.0):
Stochastic depth rate.
Example:
```
>>> from transformers import XCLIPVisionModel, XCLIPVisionConfig
>>> # Initializing a XCLIPVisionConfig with the microsoft/xclip-base-patch32 style configuration
>>> configuration = XCLIPVisionConfig()
>>> # Initializing a XCLIPVisionModel (with random weights) from that configuration
>>> model = XCLIPVisionModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
# Model type identifier for the X-CLIP vision model
model_type = "xclip_vision_model"
# Constructor setting the various model parameters
def __init__(
self,
hidden_size=768,
intermediate_size=3072,
num_hidden_layers=12,
num_attention_heads=12,
mit_hidden_size=512,
mit_intermediate_size=2048,
mit_num_hidden_layers=1,
mit_num_attention_heads=8,
num_channels=3,
image_size=224,
patch_size=32,
num_frames=8,
hidden_act="quick_gelu",
layer_norm_eps=1e-5,
attention_dropout=0.0,
initializer_range=0.02,
initializer_factor=1.0,
drop_path_rate=0.0,
**kwargs,
):
# Call the parent constructor with the remaining keyword arguments
super().__init__(**kwargs)
# Store the model parameters on the instance
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.mit_hidden_size = mit_hidden_size
self.mit_intermediate_size = mit_intermediate_size
self.mit_num_hidden_layers = mit_num_hidden_layers
self.mit_num_attention_heads = mit_num_attention_heads
self.num_channels = num_channels
self.patch_size = patch_size
self.num_frames = num_frames
self.image_size = image_size
self.initializer_range = initializer_range
self.initializer_factor = initializer_factor
self.attention_dropout = attention_dropout
self.layer_norm_eps = layer_norm_eps
self.hidden_act = hidden_act
self.drop_path_rate = drop_path_rate
@classmethod
# Load the configuration from a pretrained model and return a PretrainedConfig instance
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
# Propagate token-related arguments into kwargs
cls._set_token_in_kwargs(kwargs)
# Fetch the config dict and updated kwargs
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
# If the loaded config is a full "xclip" config, use its nested vision config
if config_dict.get("model_type") == "xclip":
config_dict = config_dict["vision_config"]
# Warn if the config declares a model type different from this class's model_type
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
logger.warning(
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
)
# Build an instance of this class from the config dict and the remaining kwargs
return cls.from_dict(config_dict, **kwargs)
# XCLIPConfig, a subclass of PretrainedConfig, stores the configuration of the full X-CLIP model.
# It bundles everything needed to initialize X-CLIP, i.e. the text-model and vision-model configurations.
# Instantiating a configuration with the defaults yields a configuration similar to the microsoft/xclip-base-patch32 architecture.
# Configuration objects inherit from PretrainedConfig and can be used to control the model outputs; see the PretrainedConfig documentation for details.
class XCLIPConfig(PretrainedConfig):
r"""
[`XCLIPConfig`] is the configuration class to store the configuration of a [`XCLIPModel`]. It is used to
instantiate X-CLIP model according to the specified arguments, defining the text model and vision model configs.
Instantiating a configuration with the defaults will yield a similar configuration to that of the X-CLIP
[microsoft/xclip-base-patch32](https://huggingface.co/microsoft/xclip-base-patch32) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
text_config (`dict`, *optional*):
Dictionary of configuration options used to initialize [`XCLIPTextConfig`].
vision_config (`dict`, *optional*):
Dictionary of configuration options used to initialize [`XCLIPVisionConfig`].
projection_dim (`int`, *optional*, defaults to 512):
Dimensionality of text and vision projection layers.
prompt_layers (`int`, *optional*, defaults to 2):
Number of layers in the video specific prompt generator.
prompt_alpha (`float`, *optional*, defaults to 0.1):
Alpha value to use in the video specific prompt generator.
prompt_hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
The non-linear activation function (function or string) in the video specific prompt generator. If string,
`"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
prompt_num_attention_heads (`int`, *optional*, defaults to 8):
Number of attention heads in the cross-attention of the video specific prompt generator.
prompt_attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout probability for the attention layers in the video specific prompt generator.
prompt_projection_dropout (`float`, *optional*, defaults to 0.0):
The dropout probability for the projection layers in the video specific prompt generator.
logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
The initial value of the *logit_scale* parameter. Default is used as per the original XCLIP implementation.
kwargs (*optional*):
Dictionary of keyword arguments.
"""
model_type = "xclip"
def __init__(
self,
text_config=None,
vision_config=None,
projection_dim=512,
prompt_layers=2,
prompt_alpha=0.1,
prompt_hidden_act="quick_gelu",
prompt_num_attention_heads=8,
prompt_attention_dropout=0.0,
prompt_projection_dropout=0.0,
logit_scale_init_value=2.6592,
**kwargs,
):
...
# Instantiate an XCLIPConfig (or a derived class) from a text model configuration and a vision model configuration
@classmethod
def from_text_vision_configs(cls, text_config: XCLIPTextConfig, vision_config: XCLIPVisionConfig, **kwargs):
r"""
Instantiate a [`XCLIPConfig`] (or a derived class) from an X-CLIP text model configuration and an X-CLIP vision
model configuration.
Returns:
[`XCLIPConfig`]: An instance of a configuration object
"""
# Build the config from the dict representations of text_config and vision_config, forwarding extra keyword arguments
return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs)
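A short sketch of composing a full config from the two sub-configs, using defaults except for an overridden patch size:

```python
from transformers import XCLIPConfig, XCLIPTextConfig, XCLIPVisionConfig

text_config = XCLIPTextConfig()                   # defaults of the text tower
vision_config = XCLIPVisionConfig(patch_size=16)  # override a single field
config = XCLIPConfig.from_text_vision_configs(text_config, vision_config)
print(config.model_type, config.vision_config.patch_size)  # xclip 16
```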
.\models\x_clip\convert_x_clip_original_pytorch_to_hf.py
import argparse
import gdown
import numpy as np
import torch
from huggingface_hub import hf_hub_download
from transformers import (
CLIPTokenizer,
CLIPTokenizerFast,
VideoMAEImageProcessor,
XCLIPConfig,
XCLIPModel,
XCLIPProcessor,
XCLIPTextConfig,
XCLIPVisionConfig,
)
def get_xclip_config(model_name, num_frames):
text_config = XCLIPTextConfig()
start_idx = model_name.find("patch")
patch_size = int(model_name[start_idx + len("patch"): start_idx + len("patch") + 2])
vision_config = XCLIPVisionConfig(patch_size=patch_size, num_frames=num_frames)
if "large" in model_name:
text_config.hidden_size = 768
text_config.intermediate_size = 3072
text_config.num_attention_heads = 12
vision_config.hidden_size = 1024
vision_config.intermediate_size = 4096
vision_config.num_attention_heads = 16
vision_config.num_hidden_layers = 24
vision_config.mit_hidden_size = 768
vision_config.mit_intermediate_size = 3072
if model_name == "xclip-large-patch14-16-frames":
vision_config.image_size = 336
config = XCLIPConfig.from_text_vision_configs(text_config, vision_config)
if "large" in model_name:
config.projection_dim = 768
return config
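`get_xclip_config` recovers the patch size purely by slicing the two digits that follow "patch" in the checkpoint name; for example:

```python
model_name = "xclip-base-patch32-16-frames"
start_idx = model_name.find("patch")
patch_size = int(model_name[start_idx + len("patch"): start_idx + len("patch") + 2])
print(patch_size)  # 32
```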
def rename_key(name):
if name == "token_embedding.weight":
name = name.replace("token_embedding.weight", "text_model.embeddings.token_embedding.weight")
if name == "positional_embedding":
name = name.replace("positional_embedding", "text_model.embeddings.position_embedding.weight")
if "ln_1" in name:
name = name.replace("ln_1", "layer_norm1")
if "ln_2" in name:
name = name.replace("ln_2", "layer_norm2")
if "c_fc" in name:
name = name.replace("c_fc", "fc1")
if "c_proj" in name:
name = name.replace("c_proj", "fc2")
if name.startswith("transformer.resblocks"):
name = name.replace("transformer.resblocks", "text_model.encoder.layers")
if "attn.out_proj" in name and "message" not in name:
name = name.replace("attn.out_proj", "self_attn.out_proj")
if "ln_final" in name:
name = name.replace("ln_final", "text_model.final_layer_norm")
if name == "visual.class_embedding":
name = name.replace("visual.class_embedding", "vision_model.embeddings.class_embedding")
if name == "visual.positional_embedding":
name = name.replace("visual.positional_embedding", "vision_model.embeddings.position_embedding.weight")
if name.startswith("visual.transformer.resblocks"):
name = name.replace("visual.transformer.resblocks", "vision_model.encoder.layers")
if "visual.conv1" in name:
name = name.replace("visual.conv1", "vision_model.embeddings.patch_embedding")
if "visual.ln_pre" in name:
name = name.replace("visual.ln_pre", "vision_model.pre_layernorm")
if "visual.ln_post" in name:
name = name.replace("visual.ln_post", "vision_model.post_layernorm")
if "visual.proj" in name:
name = name.replace("visual.proj", "visual_projection.weight")
if "text_projection" in name:
name = name.replace("text_projection", "text_projection.weight")
if "prompts_visual_proj" in name:
name = name.replace("prompts_visual_proj", "prompts_visual_projection")
if "prompts_visual_ln" in name:
name = name.replace("prompts_visual_ln", "prompts_visual_layernorm")
if name == "mit.positional_embedding":
name = name.replace("positional", "position")
if name.startswith("mit.resblocks"):
name = name.replace("mit.resblocks", "mit.encoder.layers")
if name.startswith("prompts_generator.norm"):
name = name.replace("prompts_generator.norm", "prompts_generator.layernorm")
return name
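For instance, running `rename_key` on one key of the original checkpoint (assuming the function above is in scope) maps the OpenAI-CLIP-style name to the Hugging Face name:

```python
# "ln_1" -> "layer_norm1", then "visual.transformer.resblocks" -> "vision_model.encoder.layers"
print(rename_key("visual.transformer.resblocks.0.ln_1.weight"))
# vision_model.encoder.layers.0.layer_norm1.weight
```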
def convert_state_dict(orig_state_dict, config):
return orig_state_dict
def prepare_video(num_frames):
if num_frames == 8:
filename = "eating_spaghetti_8_frames.npy"
elif num_frames == 16:
filename = "eating_spaghetti.npy"
elif num_frames == 32:
filename = "eating_spaghetti_32_frames.npy"
file = hf_hub_download(
repo_id="hf-internal-testing/spaghetti-video",
filename=filename,
repo_type="dataset",
)
video = np.load(file)
return list(video)
def convert_xclip_checkpoint(model_name, pytorch_dump_folder_path=None, push_to_hub=False):
model_to_url = {
"xclip-base-patch32": "https://github.com/nbl97/X-CLIP_Model_Zoo/releases/download/v1.0/k400_32_8.pth",
"xclip-base-patch32-16-frames": (
"https://github.com/nbl97/X-CLIP_Model_Zoo/releases/download/v1.0/k400_32_16.pth"
),
"xclip-base-patch16": "https://github.com/nbl97/X-CLIP_Model_Zoo/releases/download/v1.0/k400_16_8.pth",
"xclip-base-patch16-16-frames": (
"https://github.com/nbl97/X-CLIP_Model_Zoo/releases/download/v1.0/k400_16_16.pth"
),
"xclip-large-patch14": "https://drive.google.com/u/0/uc?id=1NUOImq0o5DlQTST17iIP3vG7DgmHQuCx&export=download&confirm=t&uuid=b26caedc-88e2-473e-830a-9d158b653cdb",
"xclip-large-patch14-16-frames": "https://drive.google.com/u/0/uc?id=1FOYgnJc097OJ4lGwtRCCydQyVPJEOH7d&export=download&confirm=t&uuid=538fa810-e671-4050-b385-9a623f89804f",
"xclip-base-patch16-kinetics-600": (
"https://github.com/nbl97/X-CLIP_Model_Zoo/releases/download/v1.0/k600_16_8.pth"
),
"xclip-base-patch16-kinetics-600-16-frames": (
"https://github.com/nbl97/X-CLIP_Model_Zoo/releases/download/v1.0/k600_16_16.pth"
),
"xclip-large-patch14-kinetics-600": "https://drive.google.com/u/0/uc?id=1FV8C1INuM91sLAN4ImjzePLIlpMSihwV&export=download&confirm=t&uuid=141d4977-4a65-44ae-864f-4b0c19f838be",
"xclip-base-patch16-hmdb-2-shot": (
"https://github.com/nbl97/X-CLIP_Model_Zoo/releases/download/v1.0/few_hmdb_2.pth"
),
"xclip-base-patch16-hmdb-4-shot": (
"https://github.com/nbl97/X-CLIP_Model_Zoo/releases/download/v1.0/few_hmdb_4.pth"
),
"xclip-base-patch16-hmdb-8-shot": (
"https://github.com/nbl97/X-CLIP_Model_Zoo/releases/download/v1.0/few_hmdb_8.pth"
),
"xclip-base-patch16-hmdb-16-shot": (
"https://github.com/nbl97/X-CLIP_Model_Zoo/releases/download/v1.0/few_hmdb_16.pth"
),
"xclip-base-patch16-ucf-2-shot": (
"https://github.com/nbl97/X-CLIP_Model_Zoo/releases/download/v1.0/few_ucf_2.pth"
),
"xclip-base-patch16-ucf-4-shot": (
"https://github.com/nbl97/X-CLIP_Model_Zoo/releases/download/v1.0/few_ucf_4.pth"
),
"xclip-base-patch16-ucf-8-shot": (
"https://github.com/nbl97/X-CLIP_Model_Zoo/releases/download/v1.0/few_ucf_8.pth"
),
"xclip-base-patch16-ucf-16-shot": (
"https://github.com/nbl97/X-CLIP_Model_Zoo/releases/download/v1.0/few_ucf_16.pth"
),
"xclip-base-patch16-zero-shot": "https://github.com/nbl97/X-CLIP_Model_Zoo/releases/download/v1.0/zero.pth",
}
checkpoint_url = model_to_url[model_name]
num_frames = 8
if "16-frames" in model_name:
num_frames = 16
elif "shot" in model_name:
num_frames = 32
config = get_xclip_config(model_name, num_frames)
model = XCLIPModel(config)
model.eval()
if "drive" in checkpoint_url:
output = "pytorch_model.bin"
gdown.cached_download(checkpoint_url, output, quiet=False)
state_dict = torch.load(output, map_location="cpu")["model"]
else:
state_dict = torch.hub.load_state_dict_from_url(checkpoint_url)["model"]
state_dict = convert_state_dict(state_dict, config)
model = XCLIPModel(config)
missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)
assert missing_keys == ["text_model.embeddings.position_ids", "vision_model.embeddings.position_ids"]
model.eval()
size = 336 if model_name == "xclip-large-patch14-16-frames" else 224
image_processor = VideoMAEImageProcessor(size=size)
slow_tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")
fast_tokenizer = CLIPTokenizerFast.from_pretrained("openai/clip-vit-base-patch32")
processor = XCLIPProcessor(image_processor=image_processor, tokenizer=fast_tokenizer)
video = prepare_video(num_frames)
inputs = processor(
text=["playing sports", "eating spaghetti", "go shopping"], videos=video, return_tensors="pt", padding=True
)
print("Shape of pixel values:", inputs.pixel_values.shape)
with torch.no_grad():
outputs = model(**inputs)
logits_per_video = outputs.logits_per_video
probs = logits_per_video.softmax(dim=1)
print("Probs:", probs)
if model_name == "xclip-base-patch32":
expected_probs = torch.tensor([[0.0019, 0.9951, 0.0030]])
elif model_name == "xclip-base-patch32-16-frames":
expected_probs = torch.tensor([[7.0999e-04, 9.9883e-01, 4.5580e-04]])
elif model_name == "xclip-base-patch16":
expected_probs = torch.tensor([[0.0083, 0.9681, 0.0236]])
elif model_name == "xclip-base-patch16-16-frames":
expected_probs = torch.tensor([[7.6937e-04, 9.9728e-01, 1.9473e-03]])
elif model_name == "xclip-large-patch14":
expected_probs = torch.tensor([[0.0062, 0.9864, 0.0075]])
elif model_name == "xclip-large-patch14-16-frames":
expected_probs = torch.tensor([[3.3877e-04, 9.9937e-01, 2.8888e-04]])
elif model_name == "xclip-base-patch16-kinetics-600":
expected_probs = torch.tensor([[0.0555, 0.8914, 0.0531]])
elif model_name == "xclip-base-patch16-kinetics-600-16-frames":
expected_probs = torch.tensor([[3.8554e-04, 9.9929e-01, 3.2754e-04]])
elif model_name == "xclip-large-patch14-kinetics-600":
expected_probs = torch.tensor([[0.0036, 0.9920, 0.0045]])
elif model_name == "xclip-base-patch16-hmdb-2-shot":
expected_probs = torch.tensor([[7.1890e-06, 9.9994e-01, 5.6559e-05]])
elif model_name == "xclip-base-patch16-hmdb-4-shot":
expected_probs = torch.tensor([[1.0320e-05, 9.9993e-01, 6.2435e-05]])
elif model_name == "xclip-base-patch16-hmdb-8-shot":
expected_probs = torch.tensor([[4.1377e-06, 9.9990e-01, 9.8386e-05]])
elif model_name == "xclip-base-patch16-hmdb-16-shot":
expected_probs = torch.tensor([[4.1347e-05, 9.9962e-01, 3.3411e-04]])
elif model_name == "xclip-base-patch16-ucf-2-shot":
expected_probs = torch.tensor([[8.5857e-05, 9.9928e-01, 6.3291e-04]])
elif model_name == "xclip-base-patch16-ucf-4-shot":
expected_probs = torch.tensor([[8.5857e-05, 9.9928e-01, 6.3291e-04]])
elif model_name == "xclip-base-patch16-ucf-8-shot":
expected_probs = torch.tensor([[0.0027, 0.9904, 0.0070]])
elif model_name == "xclip-base-patch16-ucf-16-shot":
expected_probs = torch.tensor([[9.8219e-04, 9.9593e-01, 3.0863e-03]])
elif model_name == "xclip-base-patch16-zero-shot":
expected_probs = torch.tensor([[3.5082e-04, 9.9785e-01, 1.7966e-03]])
else:
raise ValueError(f"Model name {model_name} not supported")
assert torch.allclose(probs, expected_probs, atol=1e-3)
print("Looks ok!")
if pytorch_dump_folder_path is not None:
print(f"Saving model {model_name} to {pytorch_dump_folder_path}")
model.save_pretrained(pytorch_dump_folder_path)
if push_to_hub:
print("Pushing model, processor and slow tokenizer files to the hub...")
model.push_to_hub(model_name, organization="nielsr")
processor.push_to_hub(model_name, organization="nielsr")
slow_tokenizer.push_to_hub(model_name, organization="nielsr")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--model_name",
default="xclip-base-patch32",
type=str,
help="Name of the model.",
)
parser.add_argument(
"--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory."
)
parser.add_argument(
"--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
)
args = parser.parse_args()
convert_xclip_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub)
.\models\x_clip\modeling_x_clip.py
""" PyTorch X-CLIP model."""
from copy import copy
from dataclasses import dataclass
from typing import Any, Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from ...activations import ACT2FN
from ...modeling_attn_mask_utils import _create_4d_causal_attention_mask, _prepare_4d_attention_mask
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
from ...modeling_utils import PreTrainedModel
from ...utils import (
ModelOutput,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "microsoft/xclip-base-patch32"
XCLIP_PRETRAINED_MODEL_ARCHIVE_LIST = [
"microsoft/xclip-base-patch32",
]
def contrastive_loss(logits: torch.Tensor) -> torch.Tensor:
"""
Computes the contrastive loss given logits.
Args:
logits (torch.Tensor): The logits from the model.
Returns:
torch.Tensor: The computed contrastive loss.
"""
return nn.functional.cross_entropy(logits, torch.arange(len(logits), device=logits.device))
def x_clip_loss(similarity: torch.Tensor) -> torch.Tensor:
"""
Computes the X-CLIP loss given similarity scores.
Args:
similarity (torch.Tensor): The similarity scores between captions and images.
Returns:
torch.Tensor: The computed X-CLIP loss.
"""
caption_loss = contrastive_loss(similarity)
image_loss = contrastive_loss(similarity.t())
return (caption_loss + image_loss) / 2.0
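The loss is the symmetric cross-entropy over a square similarity matrix whose diagonal holds the matching video/text pairs. A toy sketch (assuming `x_clip_loss` above is in scope):

```python
import torch

# 3 videos vs. 3 texts; the diagonal already dominates, so the loss is small.
similarity = torch.tensor([[4.0, 0.1, 0.2],
                           [0.0, 3.5, 0.3],
                           [0.1, 0.2, 5.0]])
print(x_clip_loss(similarity))  # small positive scalar
```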
@dataclass
class XCLIPOutput(ModelOutput):
"""
Placeholder class for model outputs specific to X-CLIP.
"""
Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
Contrastive loss for video-text similarity.
logits_per_video (`torch.FloatTensor` of shape `(video_batch_size, text_batch_size)`):
The scaled dot product scores between `video_embeds` and `text_embeds`. This represents the video-text
similarity scores.
logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, video_batch_size)`):
The scaled dot product scores between `text_embeds` and `video_embeds`. This represents the text-video
similarity scores.
text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
The text embeddings obtained by applying the projection layer to the pooled output of [`XCLIPTextModel`].
video_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
The video embeddings obtained by applying the projection layer to the pooled output of
[`XCLIPVisionModel`].
text_model_output (`BaseModelOutputWithPooling`):
The output of the [`XCLIPTextModel`].
vision_model_output (`BaseModelOutputWithPooling`):
The output of the [`XCLIPVisionModel`].
mit_output (`BaseModelOutputWithPooling`):
The output of `XCLIPMultiframeIntegrationTransformer` (MIT for short).
"""
# Optional attributes initialized to None
loss: Optional[torch.FloatTensor] = None
logits_per_video: torch.FloatTensor = None
logits_per_text: torch.FloatTensor = None
text_embeds: torch.FloatTensor = None
video_embeds: torch.FloatTensor = None
text_model_output: BaseModelOutputWithPooling = None
vision_model_output: BaseModelOutputWithPooling = None
mit_output: BaseModelOutputWithPooling = None
# Method to convert instance to a tuple, handling special cases for complex types
def to_tuple(self) -> Tuple[Any]:
return tuple(
# Return self[k] for basic attributes, or convert and return tuple for complex attributes
self[k]
if k not in ["text_model_output", "vision_model_output", "mit_output"]
else getattr(self, k).to_tuple() # Convert complex attribute to tuple
for k in self.keys() # Iterate through all attribute keys
)
# Copied from transformers.models.clip.modeling_clip.CLIPVisionEmbeddings with CLIP->XCLIP
class XCLIPVisionEmbeddings(nn.Module):
def __init__(self, config: XCLIPVisionConfig):
super().__init__()
self.config = config
self.embed_dim = config.hidden_size
self.image_size = config.image_size
self.patch_size = config.patch_size
# Learnable class (CLS) embedding vector
self.class_embedding = nn.Parameter(torch.randn(self.embed_dim))
# 2D convolution that extracts patch features from the image
self.patch_embedding = nn.Conv2d(
in_channels=config.num_channels,
out_channels=self.embed_dim,
kernel_size=self.patch_size,
stride=self.patch_size,
bias=False,
)
# Number of patches in the image
self.num_patches = (self.image_size // self.patch_size) ** 2
self.num_positions = self.num_patches + 1
# Position embedding, one vector per position (patches + class token)
self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
# Buffer holding the position ids for every position in the sequence
self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)
def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
batch_size = pixel_values.shape[0]
target_dtype = self.patch_embedding.weight.dtype
# Extract patch features with the convolution
patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))  # shape = [*, width, grid, grid]
patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
# Expand the class embedding to every sample in the batch
class_embeds = self.class_embedding.expand(batch_size, 1, -1)
# Concatenate the class embedding and the patch features into the visual embedding
embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
# Add the position embeddings
embeddings = embeddings + self.position_embedding(self.position_ids)
return embeddings
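With the default vision config (224x224 images, 32x32 patches) each image yields 7x7 = 49 patches plus one class token, i.e. 50 positions. A quick shape check, as a sketch; `XCLIPVisionEmbeddings` is imported via the module's full path since it is not exported at the top level:

```python
import torch
from transformers import XCLIPVisionConfig
from transformers.models.x_clip.modeling_x_clip import XCLIPVisionEmbeddings

embeddings = XCLIPVisionEmbeddings(XCLIPVisionConfig())
pixel_values = torch.randn(2, 3, 224, 224)
print(embeddings(pixel_values).shape)  # torch.Size([2, 50, 768])
```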
# Copied from transformers.models.clip.modeling_clip.CLIPTextEmbeddings with CLIP->XCLIP
class XCLIPTextEmbeddings(nn.Module):
def __init__(self, config: XCLIPTextConfig):
super().__init__()
embed_dim = config.hidden_size
# Token (word) embedding layer
self.token_embedding = nn.Embedding(config.vocab_size, embed_dim)
# Position embedding layer
self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim)
# Buffer holding the position ids for every position in the sequence
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
) -> torch.Tensor:
seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2]
# Fall back to the pre-registered position ids when none are provided
if position_ids is None:
position_ids = self.position_ids[:, :seq_length]
# Look up the token embeddings from input_ids when no embeddings are provided
if inputs_embeds is None:
inputs_embeds = self.token_embedding(input_ids)
# Look up the position embeddings
position_embeddings = self.position_embedding(position_ids)
# The text embedding is the sum of token and position embeddings
embeddings = inputs_embeds + position_embeddings
return embeddings
# Copied from transformers.models.clip.modeling_clip.CLIPAttention with CLIP->XCLIP
# XCLIPAttention implements the multi-head attention mechanism from 'Attention Is All You Need'
class XCLIPAttention(nn.Module):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
def __init__(self, config):
super().__init__()
self.config = config
self.embed_dim = config.hidden_size  # embedding dimension (hidden size)
self.num_heads = config.num_attention_heads  # number of attention heads
self.head_dim = self.embed_dim // self.num_heads  # dimension of each attention head
if self.head_dim * self.num_heads != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
f" {self.num_heads})."
)
self.scale = self.head_dim**-0.5  # attention scaling factor
self.dropout = config.attention_dropout  # dropout probability applied to the attention weights
# Linear projections for query, key, value and output
self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)
# Reshape a tensor for multi-head attention computation
def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
# Forward pass performing multi-head attention
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
causal_attention_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = False,
# XCLIPMLP, copied from the CLIP model, implements the feed-forward (MLP) block
class XCLIPMLP(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
self.activation_fn = ACT2FN[config.hidden_act]  # activation function
self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)  # first fully connected layer
self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)  # second fully connected layer
# Forward pass through the two fully connected layers
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.fc1(hidden_states)  # first linear layer
hidden_states = self.activation_fn(hidden_states)  # apply the activation
hidden_states = self.fc2(hidden_states)  # second linear layer
return hidden_states
# XCLIPEncoderLayer, copied from the CLIP model, implements one Transformer encoder layer
class XCLIPEncoderLayer(nn.Module):
def __init__(self, config: XCLIPConfig):
super().__init__()
self.embed_dim = config.hidden_size  # embedding dimension (hidden size)
self.self_attn = XCLIPAttention(config)  # self-attention layer
self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)  # first layer norm
self.mlp = XCLIPMLP(config)  # feed-forward (MLP) block
self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)  # second layer norm
# Forward pass through the encoder layer
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: torch.Tensor,
causal_attention_mask: torch.Tensor,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.FloatTensor]:
"""
Args:
hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
attention_mask (`torch.FloatTensor`): attention mask of shape `(batch, 1, tgt_len, src_len)` where
padding elements are indicated by very large negative values
output_attentions (`bool`, *optional*): whether or not to return the attentions tensors of all attention
layers. See `attentions` under returned tensors for more detail.
"""
residual = hidden_states  # keep the input for the residual connection
hidden_states = self.layer_norm1(hidden_states)  # first layer norm
# Run self-attention; returns the new hidden states and, optionally, the attention weights
hidden_states, attn_weights = self.self_attn(
hidden_states=hidden_states,
attention_mask=attention_mask,
causal_attention_mask=causal_attention_mask,
output_attentions=output_attentions,
)
hidden_states = residual + hidden_states  # residual connection after attention
residual = hidden_states  # new residual base
hidden_states = self.layer_norm2(hidden_states)  # second layer norm
hidden_states = self.mlp(hidden_states)  # feed-forward block
hidden_states = residual + hidden_states  # residual connection after the MLP
outputs = (hidden_states,)
if output_attentions:
outputs += (attn_weights,)  # append the attention weights when requested
return outputs
# Copied from transformers.models.beit.modeling_beit.drop_path
def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
"""
Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
argument.
"""
if drop_prob == 0.0 or not training:
return input
keep_prob = 1 - drop_prob
shape = (input.shape[0],) + (1,) * (input.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
random_tensor.floor_() # binarize
output = input.div(keep_prob) * random_tensor
return output
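A quick sanity check of the behavior (assuming `drop_path` above is in scope): in eval mode the input passes through unchanged, while in training mode each sample is either zeroed or rescaled by `1 / keep_prob`, so the expectation is preserved:

```python
import torch

x = torch.ones(4, 3)
print(torch.equal(drop_path(x, drop_prob=0.5, training=False), x))  # True
out = drop_path(x, drop_prob=0.5, training=True)
print(out[:, 0])  # each entry is 0.0 or 2.0, decided per sample
```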
# Copied from transformers.models.beit.modeling_beit.BeitDropPath with Beit->XCLIP
class XCLIPDropPath(nn.Module):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
def __init__(self, drop_prob: Optional[float] = None) -> None:
super().__init__()
self.drop_prob = drop_prob
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
return drop_path(hidden_states, self.drop_prob, self.training)
def extra_repr(self) -> str:
return "p={}".format(self.drop_prob)
class XCLIPVisionEncoderLayer(nn.Module):
"""
This corresponds to the `CrossFramelAttentionBlock` class in the original implementation.
"""
def __init__(self, config: XCLIPConfig):
super().__init__()
self.num_frames = config.num_frames
self.embed_dim = config.hidden_size
# Message passing components
self.message_fc = nn.Linear(self.embed_dim, self.embed_dim) # Linear transformation for message passing
self.message_ln = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) # Layer normalization for messages
self.message_attn = XCLIPAttention(config) # Attention mechanism for message passing
# Drop path implementation
self.drop_path = XCLIPDropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity()
# Self-attention mechanism
self.self_attn = XCLIPAttention(config) # Self-attention mechanism
self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) # Layer normalization after self-attention
# MLP (Feedforward neural network)
self.mlp = XCLIPMLP(config) # Multilayer perceptron for feature transformation
self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) # Layer normalization after MLP
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: torch.Tensor,
causal_attention_mask: torch.Tensor,
output_attentions: Optional[bool] = False,
"""
Args:
hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
输入到层的隐藏状态张量,形状为 `(batch, seq_len, embed_dim)`
attention_mask (`torch.FloatTensor`): attention mask of size
`(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
`(config.encoder_attention_heads,)`.
注意力掩码张量,大小为 `(batch, 1, tgt_len, src_len)`,其中填充元素由非常大的负值表示。
causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Causal mask for the text model. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary
文本模型的因果关注掩码。掩码值选定在 `[0, 1]` 之间:
- 1 表示 **未屏蔽** 的标记,
- 0 表示 **屏蔽** 的标记。
[什么是注意力掩码?](../glossary
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
是否返回所有注意力层的注意力张量。详细信息请参见返回的张量下的 `attentions`。
"""
# Dimensions of the input tensor
batch_time, seq_length, hidden_size = hidden_states.size()
# Recover the batch size from the combined batch*frames dimension
batch_size = batch_time // self.num_frames
# Build the message token from the CLS (first) token of every frame
msg_token = self.message_fc(hidden_states[:, 0, :])
# Reshape so that frames of the same video sit along one dimension
msg_token = msg_token.view(batch_size, self.num_frames, hidden_size)
# Let the message tokens attend to each other across frames, with drop-path regularization
msg_token = msg_token + self.drop_path(self.message_attn(self.message_ln(msg_token))[0])
# Add a dummy sequence dimension
msg_token = msg_token.view(-1, 1, hidden_size)
# Append the message token to the original sequence
hidden_states = torch.cat([hidden_states, msg_token], dim=1)
# Keep the residual
residual = hidden_states
# Layer norm before self-attention
hidden_states = self.layer_norm1(hidden_states)
# Self-attention, optionally returning attention weights
hidden_states, attn_weights = self.self_attn(
hidden_states=hidden_states,
attention_mask=attention_mask,
causal_attention_mask=causal_attention_mask,
output_attentions=output_attentions,
)
# Residual connection after attention
hidden_states = residual + hidden_states
# Drop the appended message token, restoring the original sequence length
hidden_states = hidden_states[:, :seq_length, :]
# Keep the residual
residual = hidden_states
# Layer norm before the MLP
hidden_states = self.layer_norm2(hidden_states)
# Feed-forward block
hidden_states = self.mlp(hidden_states)
# Residual connection after the MLP
hidden_states = residual + hidden_states
# Output tuple containing the processed hidden states
outputs = (hidden_states,)
# Append the attention weights when requested
if output_attentions:
outputs += (attn_weights,)
return outputs
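A shape-only sketch of the message-token bookkeeping above (attention and MLP omitted), with hypothetical sizes batch_size=2, num_frames=8, 50 tokens per frame and hidden size 768:

```python
import torch

batch_size, num_frames, seq_length, hidden_size = 2, 8, 50, 768
hidden_states = torch.randn(batch_size * num_frames, seq_length, hidden_size)

# The CLS token of every frame becomes a "message" token, regrouped per video ...
msg_token = hidden_states[:, 0, :].view(batch_size, num_frames, hidden_size)
# ... exchanged across frames (attention omitted here), then appended as one extra token per frame.
msg_token = msg_token.view(-1, 1, hidden_size)
extended = torch.cat([hidden_states, msg_token], dim=1)
print(extended.shape)                     # torch.Size([16, 51, 768])
print(extended[:, :seq_length, :].shape)  # sliced back to torch.Size([16, 50, 768])
```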
# XCLIPPreTrainedModel handles weight initialization and provides a simple interface for downloading and loading pretrained models
class XCLIPPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
# Configuration class used by this model
config_class = XCLIPConfig
# Prefix identifying the base model inside the state dict
base_model_prefix = "x_clip"
# The model supports gradient checkpointing during training
supports_gradient_checkpointing = True
def _init_weights(self, module):
"""Initialize the weights"""
# Initialization factor from the config
factor = self.config.initializer_factor
if isinstance(module, XCLIPTextEmbeddings):
# Initialize the token and position embedding weights
module.token_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02)
module.position_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02)
elif isinstance(module, XCLIPVisionEmbeddings):
factor = self.config.initializer_factor
# Initialize the class, patch and position embeddings
nn.init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor)
nn.init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor)
nn.init.normal_(module.position_embedding.weight, std=module.config.initializer_range * factor)
elif isinstance(module, XCLIPAttention):
factor = self.config.initializer_factor
# Standard deviations for the input and output projections
in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
out_proj_std = (module.embed_dim**-0.5) * factor
# Initialize the query, key, value and output projections
nn.init.normal_(module.q_proj.weight, std=in_proj_std)
nn.init.normal_(module.k_proj.weight, std=in_proj_std)
nn.init.normal_(module.v_proj.weight, std=in_proj_std)
nn.init.normal_(module.out_proj.weight, std=out_proj_std)
elif isinstance(module, XCLIPMLP):
factor = self.config.initializer_factor
# Standard deviations for the projection and fully connected layers
in_proj_std = (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
fc_std = (2 * module.config.hidden_size) ** -0.5 * factor
# Initialize fc1 and fc2
nn.init.normal_(module.fc1.weight, std=fc_std)
nn.init.normal_(module.fc2.weight, std=in_proj_std)
elif isinstance(module, XCLIPModel):
factor = self.config.initializer_factor
# Initialize the text projection
nn.init.normal_(
module.text_projection.weight,
std=module.text_embed_dim**-0.5 * factor,
)
# Initialize the visual projection
nn.init.normal_(
module.visual_projection.weight,
std=module.vision_embed_dim**-0.5 * factor,
)
# Initialize the visual prompts projection
nn.init.normal_(module.prompts_visual_projection, mean=0.0, std=module.vision_embed_dim**-0.5 * factor)
elif isinstance(module, XCLIPMultiframeIntegrationTransformer):
# Initialize the frame position embedding
nn.init.normal_(module.position_embedding, std=self.config.initializer_factor)
if isinstance(module, nn.LayerNorm):
# Zero the bias and set the weight to 1.0
module.bias.data.zero_()
module.weight.data.fill_(1.0)
if isinstance(module, nn.Linear):
# Initialize the weight and zero the bias (if any)
module.weight.data.normal_(mean=0.0, std=self.config.initializer_factor)
if module.bias is not None:
module.bias.data.zero_()
# X_CLIP_START_DOCSTRING is the shared raw docstring describing the model class
X_CLIP_START_DOCSTRING = r"""
This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
Parameters:
config ([`XCLIPConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
# X_CLIP_TEXT_INPUTS_DOCSTRING documents the text inputs
X_CLIP_TEXT_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
config.max_position_embeddings - 1]`.
[What are position IDs?](../glossary#position-ids)
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
# X_CLIP_VISION_INPUTS_DOCSTRING documents the vision inputs
X_CLIP_VISION_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
[`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
X_CLIP_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
config.max_position_embeddings - 1]`.
[What are position IDs?](../glossary#position-ids)
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
[`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
return_loss (`bool`, *optional*):
Whether or not to return the contrastive loss.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
class XCLIPEncoder(nn.Module):
"""
Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
[`XCLIPEncoderLayer`].
Args:
config: XCLIPConfig
"""
def __init__(self, config: XCLIPConfig):
super().__init__()
self.config = config
self.layers = nn.ModuleList([XCLIPEncoderLayer(config) for _ in range(config.num_hidden_layers)])
self.gradient_checkpointing = False
def forward(
self,
inputs_embeds,
attention_mask: Optional[torch.Tensor] = None,
causal_attention_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
class XCLIPTextTransformer(nn.Module):
def __init__(self, config: XCLIPTextConfig):
super().__init__()
self.config = config
embed_dim = config.hidden_size
self.embeddings = XCLIPTextEmbeddings(config)
self.encoder = XCLIPEncoder(config)
self.final_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
@add_start_docstrings_to_model_forward(X_CLIP_TEXT_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=XCLIPTextConfig)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPooling]:
r"""
Returns:
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if input_ids is None:
raise ValueError("You have to specify input_ids")
input_shape = input_ids.size()
input_ids = input_ids.view(-1, input_shape[-1])
hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids)
causal_attention_mask = _create_4d_causal_attention_mask(
input_shape, hidden_states.dtype, device=hidden_states.device
)
if attention_mask is not None:
attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_states.dtype)
encoder_outputs = self.encoder(
inputs_embeds=hidden_states,
attention_mask=attention_mask,
causal_attention_mask=causal_attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
last_hidden_state = encoder_outputs[0]
last_hidden_state = self.final_layer_norm(last_hidden_state)
pooled_output = last_hidden_state[torch.arange(last_hidden_state.shape[0]), input_ids.argmax(dim=-1)]
if not return_dict:
return (last_hidden_state, pooled_output) + encoder_outputs[1:]
return BaseModelOutputWithPooling(
last_hidden_state=last_hidden_state,
pooler_output=pooled_output,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
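The pooled output picks, for every sequence, the hidden state at `input_ids.argmax(dim=-1)`: in the CLIP vocabulary the end-of-text token has the highest id (49407), so the argmax locates the EOS position. A small sketch with dummy values (the trailing 0 ids stand in for padding, purely for illustration):

```python
import torch

input_ids = torch.tensor([[49406, 320, 1125, 539, 320, 2368, 49407, 0, 0]])
last_hidden_state = torch.randn(1, 9, 512)
eos_positions = input_ids.argmax(dim=-1)                   # tensor([6])
pooled = last_hidden_state[torch.arange(1), eos_positions]
print(pooled.shape)                                        # torch.Size([1, 512])
```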
class XCLIPTextModel(XCLIPPreTrainedModel):
config_class = XCLIPTextConfig
def __init__(self, config: XCLIPTextConfig):
super().__init__(config)
self.text_model = XCLIPTextTransformer(config)
self.post_init()
def get_input_embeddings(self) -> nn.Module:
return self.text_model.embeddings.token_embedding
def set_input_embeddings(self, value):
self.text_model.embeddings.token_embedding = value
@add_start_docstrings_to_model_forward(X_CLIP_TEXT_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=XCLIPTextConfig)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPooling]:
r"""
Returns:
Examples:
```
>>> from transformers import AutoTokenizer, XCLIPTextModel
>>> model = XCLIPTextModel.from_pretrained("microsoft/xclip-base-patch32")
>>> tokenizer = AutoTokenizer.from_pretrained("microsoft/xclip-base-patch32")
>>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
>>> outputs = model(**inputs)
>>> last_hidden_state = outputs.last_hidden_state
>>> pooled_output = outputs.pooler_output # pooled (EOS token) states
```
Run the text model on the given inputs and return its outputs.
"""
return self.text_model(
input_ids=input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
class XCLIPVisionEncoder(nn.Module):
"""
Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
[`XCLIPVisionEncoderLayer`].
Args:
config: XCLIPConfig
"""
def __init__(self, config: XCLIPConfig):
super().__init__()
self.config = config
self.layers = nn.ModuleList([XCLIPVisionEncoderLayer(config) for _ in range(config.num_hidden_layers)])
self.gradient_checkpointing = False
def forward(
self,
inputs_embeds,
attention_mask: Optional[torch.Tensor] = None,
causal_attention_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
pass
class XCLIPVisionTransformer(nn.Module):
"""
This corresponds to the `CrossFrameCommunicationTransformer` class in the original implementation.
"""
def __init__(self, config: XCLIPVisionConfig):
super().__init__()
self.config = config
embed_dim = config.hidden_size
self.embeddings = XCLIPVisionEmbeddings(config)
self.pre_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
self.encoder = XCLIPVisionEncoder(config)
self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
@add_start_docstrings_to_model_forward(X_CLIP_VISION_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=XCLIPVisionConfig)
def forward(
self,
pixel_values: torch.FloatTensor,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPooling]:
r"""
Returns:
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
hidden_states = self.embeddings(pixel_values)
hidden_states = self.pre_layernorm(hidden_states)
encoder_outputs = self.encoder(
inputs_embeds=hidden_states,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
last_hidden_state = encoder_outputs[0]
pooled_output = last_hidden_state[:, 0, :]
pooled_output = self.post_layernorm(pooled_output)
if not return_dict:
return (last_hidden_state, pooled_output) + encoder_outputs[1:]
return BaseModelOutputWithPooling(
last_hidden_state=last_hidden_state,
pooler_output=pooled_output,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
class XCLIPVisionModel(XCLIPPreTrainedModel):
config_class = XCLIPVisionConfig
main_input_name = "pixel_values"
def __init__(self, config: XCLIPVisionConfig):
super().__init__(config)
self.vision_model = XCLIPVisionTransformer(config)
self.post_init()
def get_input_embeddings(self) -> nn.Module:
return self.vision_model.embeddings.patch_embedding
@add_start_docstrings_to_model_forward(X_CLIP_VISION_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=XCLIPVisionConfig)
def forward(
self,
pixel_values: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPooling]:
...
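A usage sketch of the vision tower with random weights: the vision model treats every frame as an independent image, so pixel values are passed with shape `(batch_size * num_frames, 3, H, W)` (the cross-frame message tokens inside each layer require the first dimension to be a multiple of `num_frames`):

```python
import torch
from transformers import XCLIPVisionConfig, XCLIPVisionModel

config = XCLIPVisionConfig(num_frames=8)
model = XCLIPVisionModel(config).eval()
pixel_values = torch.randn(2 * config.num_frames, 3, config.image_size, config.image_size)
with torch.no_grad():
    outputs = model(pixel_values)
print(outputs.last_hidden_state.shape)  # torch.Size([16, 50, 768])
print(outputs.pooler_output.shape)      # torch.Size([16, 768])
```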
class XCLIPMultiframeIntegrationTransformer(nn.Module):
"""
This corresponds to the `MultiframeIntegrationTransformer` class in the original implementation.
"""
def __init__(self, config: XCLIPVisionConfig):
super().__init__()
self.position_embedding = nn.Parameter(torch.empty(1, config.num_frames, config.hidden_size))
self.encoder = XCLIPEncoder(config)
def forward(
self,
hidden_states,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutput]:
residual = hidden_states
hidden_states = hidden_states + self.position_embedding
encoder_outputs = self.encoder(
inputs_embeds=hidden_states,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
last_hidden_state = encoder_outputs[0]
last_hidden_state = last_hidden_state.type(hidden_states.dtype) + residual
pooled_output = last_hidden_state.mean(dim=1, keepdim=False)
if not return_dict:
return (last_hidden_state, pooled_output) + encoder_outputs[1:]
return BaseModelOutputWithPooling(
last_hidden_state=last_hidden_state,
pooler_output=pooled_output,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
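To make the multi-frame integration step concrete, here is a minimal sketch that feeds random per-frame CLS features through the module. It is illustrative only: it assumes `torch` and a `transformers` version with X-CLIP support are installed, uses the private module path `transformers.models.x_clip.modeling_x_clip` as an implementation detail, and picks small made-up hyperparameters.
```
import torch
from transformers import XCLIPVisionConfig
from transformers.models.x_clip.modeling_x_clip import XCLIPMultiframeIntegrationTransformer

# Small, made-up hyperparameters purely for illustration.
config = XCLIPVisionConfig(hidden_size=64, intermediate_size=128, num_hidden_layers=1, num_attention_heads=4, num_frames=8)
mit = XCLIPMultiframeIntegrationTransformer(config)
# The temporal position embedding is allocated with torch.empty, so initialize it by hand
# when using the module outside of XCLIPModel (XCLIPPreTrainedModel normally does this).
mit.position_embedding.data.normal_(mean=0.0, std=0.02)

frame_features = torch.randn(2, config.num_frames, config.hidden_size)  # (batch, num_frames, hidden)
last_hidden_state, pooled_output = mit(frame_features, return_dict=False)
print(last_hidden_state.shape, pooled_output.shape)  # torch.Size([2, 8, 64]) torch.Size([2, 64])
```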
class XCLIPCrossAttention(nn.Module):
"""来自'Attention Is All You Need'论文的多头注意力"""
def __init__(self, config):
super().__init__()
self.num_heads = config.prompt_num_attention_heads
dim = config.projection_dim
self.head_dim = dim // self.num_heads
self.scale = self.head_dim**-0.5
self.q_proj = nn.Linear(dim, dim, bias=False)
self.k_proj = nn.Linear(dim, dim, bias=False)
self.v_proj = nn.Linear(dim, dim, bias=False)
self.attn_drop = nn.Dropout(config.prompt_attention_dropout)
self.proj = nn.Linear(dim, dim)
self.proj_drop = nn.Dropout(config.prompt_projection_dropout)
def _shape(self, tensor: torch.Tensor, seq_len: int, batch_size: int):
"""调整张量形状以便注意力计算"""
return tensor.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
def forward(self, queries, keys, values):
"""模型的前向传播方法"""
batch_size, query_seq_len, hidden_size = queries.shape
batch_size, key_seq_len, hidden_size = keys.shape
queries = (
self.q_proj(queries)
.reshape(batch_size, query_seq_len, self.num_heads, hidden_size // self.num_heads)
.permute(0, 2, 1, 3)
)
keys = (
self.k_proj(keys)
.reshape(batch_size, key_seq_len, self.num_heads, hidden_size // self.num_heads)
.permute(0, 2, 1, 3)
)
values = (
self.v_proj(values)
.reshape(batch_size, key_seq_len, self.num_heads, hidden_size // self.num_heads)
.permute(0, 2, 1, 3)
)
attn = (queries @ keys.transpose(-2, -1)) * self.scale
attn = attn.softmax(dim=-1)
attn = self.attn_drop(attn)
x = (attn @ values).transpose(1, 2).reshape(batch_size, query_seq_len, hidden_size)
x = self.proj(x)
x = self.proj_drop(x)
return x
class PromptGeneratorLayer(nn.Module):
def __init__(self, config):
super().__init__()
embed_dim = config.projection_dim
self.cross_attn = XCLIPCrossAttention(config)
self.norm1 = nn.LayerNorm(embed_dim, eps=config.text_config.layer_norm_eps)
self.norm3 = nn.LayerNorm(embed_dim, eps=config.text_config.layer_norm_eps)
self.mlp = nn.Sequential(
nn.Linear(embed_dim, embed_dim * 4),
ACT2FN[config.prompt_hidden_act],
nn.Dropout(config.prompt_attention_dropout),
nn.Linear(embed_dim * 4, embed_dim),
)
def forward(self, x, visual):
x = x + self.cross_attn(self.norm1(x), visual, visual)
x = x + self.mlp(self.norm3(x))
return x
class XCLIPPromptGenerator(nn.Module):
"""This corresponds to the `VideoSpecificPrompt` class in the original implementation."""
def __init__(self, config):
super().__init__()
embed_dim = config.projection_dim
self.layernorm = nn.LayerNorm(embed_dim, eps=config.vision_config.layer_norm_eps)
self.decoder = nn.ModuleList([PromptGeneratorLayer(config) for _ in range(config.prompt_layers)])
self.alpha = nn.Parameter(torch.ones(embed_dim) * config.prompt_alpha)
def forward(self, text, visual):
visual = self.layernorm(visual)
for layer in self.decoder:
text = layer(text, visual)
return self.alpha * text
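As a rough illustration of the shapes involved, the sketch below runs randomly initialized text and frame features through the prompt generator. It assumes a `transformers` version with X-CLIP support and imports from the private module path as an implementation detail; the hyperparameters are made up.
```
import torch
from transformers import XCLIPConfig
from transformers.models.x_clip.modeling_x_clip import XCLIPPromptGenerator

config = XCLIPConfig(projection_dim=64, prompt_layers=1, prompt_num_attention_heads=4)
prompt_generator = XCLIPPromptGenerator(config)

text_embeds = torch.randn(2, 5, config.projection_dim)   # (batch, num_text_prompts, projection_dim)
frame_embeds = torch.randn(2, 8, config.projection_dim)  # (batch, num_frames, projection_dim)
video_specific_prompts = prompt_generator(text_embeds, frame_embeds)
print(video_specific_prompts.shape)  # torch.Size([2, 5, 64])
```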
@add_start_docstrings(X_CLIP_START_DOCSTRING)
class XCLIPModel(XCLIPPreTrainedModel):
config_class = XCLIPConfig
def __init__(self, config: XCLIPConfig):
super().__init__(config)
if not isinstance(config.text_config, XCLIPTextConfig):
raise ValueError(
"config.text_config is expected to be of type XCLIPTextConfig but is of type"
f" {type(config.text_config)}."
)
if not isinstance(config.vision_config, XCLIPVisionConfig):
raise ValueError(
"config.vision_config is expected to be of type XCLIPVisionConfig but is of type"
f" {type(config.vision_config)}."
)
text_config = config.text_config
vision_config = config.vision_config
self.projection_dim = config.projection_dim
self.text_embed_dim = text_config.hidden_size
self.vision_embed_dim = vision_config.hidden_size
self.text_model = XCLIPTextTransformer(text_config)
self.vision_model = XCLIPVisionTransformer(vision_config)
self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)
self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))
self.prompts_visual_layernorm = nn.LayerNorm(self.vision_embed_dim, eps=config.vision_config.layer_norm_eps)
self.prompts_visual_projection = nn.Parameter(torch.randn(self.vision_embed_dim, self.projection_dim))
mit_config = copy(vision_config)
mit_config.hidden_size = vision_config.mit_hidden_size
mit_config.intermediate_size = vision_config.mit_intermediate_size
mit_config.num_hidden_layers = vision_config.mit_num_hidden_layers
mit_config.num_attention_heads = vision_config.mit_num_attention_heads
self.mit = XCLIPMultiframeIntegrationTransformer(mit_config)
self.prompts_generator = XCLIPPromptGenerator(config)
self.post_init()
@add_start_docstrings_to_model_forward(X_CLIP_TEXT_INPUTS_DOCSTRING)
def get_text_features(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> torch.FloatTensor:
r"""
Returns:
text_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The text embeddings obtained by
applying the projection layer to the pooled output of [`XCLIPTextModel`].
Examples:
```
>>> from transformers import AutoTokenizer, AutoModel
>>> tokenizer = AutoTokenizer.from_pretrained("microsoft/xclip-base-patch32")
>>> model = AutoModel.from_pretrained("microsoft/xclip-base-patch32")
>>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
>>> text_features = model.get_text_features(**inputs)
```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
text_outputs = self.text_model(
input_ids=input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
text_embeds = text_outputs[1]
text_embeds = self.text_projection(text_embeds)
return text_embeds
@add_start_docstrings_to_model_forward(X_CLIP_VISION_INPUTS_DOCSTRING)
def get_video_features(
self,
pixel_values: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> torch.FloatTensor:
r"""
Returns:
video_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The video embeddings obtained by
applying the projection layer to the pooled output of [`XCLIPVisionModel`].
Examples:
```
>>> import torch
>>> from transformers import AutoModel
>>> model = AutoModel.from_pretrained("microsoft/xclip-base-patch32")
>>> # Dummy clip of 8 frames: (batch_size, num_frames, num_channels, height, width)
>>> video_inputs = torch.randn(1, 8, 3, 224, 224)
>>> video_features = model.get_video_features(pixel_values=video_inputs)
```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
video_outputs = self.vision_model(
pixel_values=pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
video_embeds = video_outputs[1]
video_embeds = self.visual_projection(video_embeds)
return video_embeds
@add_start_docstrings_to_model_forward(X_CLIP_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=XCLIPOutput, config_class=XCLIPConfig)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
pixel_values: Optional[torch.FloatTensor] = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
return_loss: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[XCLIPOutput, Tuple[torch.FloatTensor]]:
r"""
Returns:
[`XCLIPOutput`] or `tuple(torch.FloatTensor)`: an [`XCLIPOutput`] when `return_dict=True` (or when
`config.use_return_dict=True`), otherwise a tuple of `torch.FloatTensor` elements.
"""
.\models\x_clip\processing_x_clip.py
"""
Image/Text processor class for XCLIP
"""
import warnings
from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import BatchEncoding
class XCLIPProcessor(ProcessorMixin):
r"""
Constructs an X-CLIP processor which wraps a VideoMAE image processor and a CLIP tokenizer into a single processor.
[`XCLIPProcessor`] offers all the functionalities of [`VideoMAEImageProcessor`] and [`CLIPTokenizerFast`]. See the
[`~XCLIPProcessor.__call__`] and [`~XCLIPProcessor.decode`] for more information.
Args:
image_processor ([`VideoMAEImageProcessor`], *optional*):
The image processor is a required input.
tokenizer ([`CLIPTokenizerFast`], *optional*):
The tokenizer is a required input.
"""
attributes = ["image_processor", "tokenizer"]
image_processor_class = "VideoMAEImageProcessor"
tokenizer_class = ("CLIPTokenizer", "CLIPTokenizerFast")
def __init__(self, image_processor=None, tokenizer=None, **kwargs):
feature_extractor = None
if "feature_extractor" in kwargs:
warnings.warn(
"The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
" instead.",
FutureWarning,
)
feature_extractor = kwargs.pop("feature_extractor")
image_processor = image_processor if image_processor is not None else feature_extractor
if image_processor is None:
raise ValueError("You need to specify an `image_processor`.")
if tokenizer is None:
raise ValueError("You need to specify a `tokenizer`.")
super().__init__(image_processor, tokenizer)
self.current_processor = self.image_processor
def batch_decode(self, *args, **kwargs):
"""
This method forwards all its arguments to CLIPTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
refer to the docstring of this method for more information.
"""
return self.tokenizer.batch_decode(*args, **kwargs)
def decode(self, *args, **kwargs):
"""
This method forwards all its arguments to CLIPTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
the docstring of this method for more information.
"""
return self.tokenizer.decode(*args, **kwargs)
@property
def model_input_names(self):
return ["input_ids", "attention_mask", "position_ids", "pixel_values"]
@property
def feature_extractor_class(self):
warnings.warn(
"`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.",
FutureWarning,
)
return self.image_processor_class
@property
def feature_extractor(self):
warnings.warn(
"`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.",
FutureWarning,
)
return self.image_processor
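A minimal usage sketch of the processor (it assumes network access to fetch the `microsoft/xclip-base-patch32` processor files and uses random frames in place of a decoded video clip):
```
import numpy as np
from transformers import XCLIPProcessor

processor = XCLIPProcessor.from_pretrained("microsoft/xclip-base-patch32")

# Eight dummy RGB frames standing in for one decoded 8-frame video clip.
video = [np.random.randint(0, 256, (224, 224, 3), dtype=np.uint8) for _ in range(8)]
inputs = processor(text=["playing guitar", "dancing"], videos=video, return_tensors="pt", padding=True)
print({name: tuple(tensor.shape) for name, tensor in inputs.items()})
# e.g. input_ids / attention_mask of shape (2, sequence_length) and pixel_values of shape (1, 8, 3, 224, 224)
```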
.\models\x_clip\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
_import_structure = {
"configuration_x_clip": [
"XCLIP_PRETRAINED_CONFIG_ARCHIVE_MAP",
"XCLIPConfig",
"XCLIPTextConfig",
"XCLIPVisionConfig",
],
"processing_x_clip": ["XCLIPProcessor"],
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_x_clip"] = [
"XCLIP_PRETRAINED_MODEL_ARCHIVE_LIST",
"XCLIPModel",
"XCLIPPreTrainedModel",
"XCLIPTextModel",
"XCLIPVisionModel",
]
if TYPE_CHECKING:
from .configuration_x_clip import (
XCLIP_PRETRAINED_CONFIG_ARCHIVE_MAP,
XCLIPConfig,
XCLIPTextConfig,
XCLIPVisionConfig,
)
from .processing_x_clip import XCLIPProcessor
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_x_clip import (
XCLIP_PRETRAINED_MODEL_ARCHIVE_LIST,
XCLIPModel,
XCLIPPreTrainedModel,
XCLIPTextModel,
XCLIPVisionModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\yolos\configuration_yolos.py
""" YOLOS 模型配置"""
from collections import OrderedDict
from typing import Mapping
from packaging import version
from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfig
from ...utils import logging
logger = logging.get_logger(__name__)
YOLOS_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"hustvl/yolos-small": "https://huggingface.co/hustvl/yolos-small/resolve/main/config.json",
}
class YolosConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`YolosModel`]. It is used to instantiate a YOLOS
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the YOLOS
[hustvl/yolos-base](https://huggingface.co/hustvl/yolos-base) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Example:
```
>>> from transformers import YolosConfig, YolosModel
>>> # Initializing a YOLOS hustvl/yolos-base style configuration
>>> configuration = YolosConfig()
>>> # Initializing a model (with random weights) from the hustvl/yolos-base style configuration
>>> model = YolosModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
model_type = "yolos"
def __init__(
self,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act="gelu",
hidden_dropout_prob=0.0,
attention_probs_dropout_prob=0.0,
initializer_range=0.02,
layer_norm_eps=1e-12,
image_size=[512, 864],
patch_size=16,
num_channels=3,
qkv_bias=True,
num_detection_tokens=100,
use_mid_position_embeddings=True,
auxiliary_loss=False,
class_cost=1,
bbox_cost=5,
giou_cost=2,
bbox_loss_coefficient=5,
giou_loss_coefficient=2,
eos_coefficient=0.1,
**kwargs,
):
super().__init__(**kwargs)
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.hidden_act = hidden_act
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.image_size = image_size
self.patch_size = patch_size
self.num_channels = num_channels
self.qkv_bias = qkv_bias
self.num_detection_tokens = num_detection_tokens
self.use_mid_position_embeddings = use_mid_position_embeddings
self.auxiliary_loss = auxiliary_loss
self.class_cost = class_cost
self.bbox_cost = bbox_cost
self.giou_cost = giou_cost
self.bbox_loss_coefficient = bbox_loss_coefficient
self.giou_loss_coefficient = giou_loss_coefficient
self.eos_coefficient = eos_coefficient
class YolosOnnxConfig(OnnxConfig):
torch_onnx_minimum_version = version.parse("1.11")
@property
def inputs(self) -> Mapping[str, Mapping[int, str]]:
return OrderedDict(
[
("pixel_values", {0: "batch", 1: "num_channels", 2: "height", 3: "width"}),
]
)
@property
def atol_for_validation(self) -> float:
return 1e-4
@property
def default_onnx_opset(self) -> int:
return 12
.\models\yolos\convert_yolos_to_pytorch.py
"""Convert YOLOS checkpoints from the original repository. URL: https://github.com/hustvl/YOLOS"""
import argparse
import json
from pathlib import Path
import requests
import torch
from huggingface_hub import hf_hub_download
from PIL import Image
from transformers import YolosConfig, YolosForObjectDetection, YolosImageProcessor
from transformers.utils import logging
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
def get_yolos_config(yolos_name: str) -> YolosConfig:
config = YolosConfig()
if "yolos_ti" in yolos_name:
config.hidden_size = 192
config.intermediate_size = 768
config.num_hidden_layers = 12
config.num_attention_heads = 3
config.image_size = [800, 1333]
config.use_mid_position_embeddings = False
elif yolos_name == "yolos_s_dWr":
config.hidden_size = 330
config.num_hidden_layers = 14
config.num_attention_heads = 6
config.intermediate_size = 1320
elif "yolos_s" in yolos_name:
config.hidden_size = 384
config.intermediate_size = 1536
config.num_hidden_layers = 12
config.num_attention_heads = 6
elif "yolos_b" in yolos_name:
config.image_size = [800, 1344]
config.num_labels = 91
repo_id = "huggingface/label-files"
filename = "coco-detection-id2label.json"
id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r"))
id2label = {int(k): v for k, v in id2label.items()}
config.id2label = id2label
config.label2id = {v: k for k, v in id2label.items()}
return config
def read_in_q_k_v(state_dict: dict, config: YolosConfig, base_model: bool = False):
for i in range(config.num_hidden_layers):
in_proj_weight = state_dict.pop(f"blocks.{i}.attn.qkv.weight")
in_proj_bias = state_dict.pop(f"blocks.{i}.attn.qkv.bias")
state_dict[f"encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[: config.hidden_size, :]
state_dict[f"encoder.layer.{i}.attention.attention.query.bias"] = in_proj_bias[: config.hidden_size]
state_dict[f"encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[
config.hidden_size : config.hidden_size * 2, :
]
state_dict[f"encoder.layer.{i}.attention.attention.key.bias"] = in_proj_bias[
config.hidden_size : config.hidden_size * 2
]
state_dict[f"encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[-config.hidden_size :, :]
state_dict[f"encoder.layer.{i}.attention.attention.value.bias"] = in_proj_bias[-config.hidden_size :]
def rename_key(name: str) -> str:
if "backbone" in name:
name = name.replace("backbone", "vit")
if "cls_token" in name:
name = name.replace("cls_token", "embeddings.cls_token")
if "det_token" in name:
name = name.replace("det_token", "embeddings.detection_tokens")
if "mid_pos_embed" in name:
name = name.replace("mid_pos_embed", "encoder.mid_position_embeddings")
if "pos_embed" in name:
name = name.replace("pos_embed", "embeddings.position_embeddings")
if "patch_embed.proj" in name:
name = name.replace("patch_embed.proj", "embeddings.patch_embeddings.projection")
if "blocks" in name:
name = name.replace("blocks", "encoder.layer")
if "attn.proj" in name:
name = name.replace("attn.proj", "attention.output.dense")
if "attn" in name:
name = name.replace("attn", "attention.self")
if "norm1" in name:
name = name.replace("norm1", "layernorm_before")
if "norm2" in name:
name = name.replace("norm2", "layernorm_after")
if "mlp.fc1" in name:
name = name.replace("mlp.fc1", "intermediate.dense")
if "mlp.fc2" in name:
name = name.replace("mlp.fc2", "output.dense")
if "class_embed" in name:
name = name.replace("class_embed", "class_labels_classifier")
if "bbox_embed" in name:
name = name.replace("bbox_embed", "bbox_predictor")
if "vit.norm" in name:
name = name.replace("vit.norm", "vit.layernorm")
return name
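For reference, two hypothetical checkpoint keys (assuming `rename_key` above is in scope) map onto the Transformers naming scheme as follows:
```
>>> rename_key("backbone.blocks.0.attn.proj.weight")
'vit.encoder.layer.0.attention.output.dense.weight'
>>> rename_key("backbone.patch_embed.proj.weight")
'vit.embeddings.patch_embeddings.projection.weight'
```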
def convert_state_dict(orig_state_dict: dict, model: YolosForObjectDetection) -> dict:
for key in orig_state_dict.copy().keys():
val = orig_state_dict.pop(key)
if "qkv" in key:
key_split = key.split(".")
layer_num = int(key_split[2])
dim = model.vit.encoder.layer[layer_num].attention.attention.all_head_size
if "weight" in key:
orig_state_dict[f"vit.encoder.layer.{layer_num}.attention.attention.query.weight"] = val[:dim, :]
orig_state_dict[f"vit.encoder.layer.{layer_num}.attention.attention.key.weight"] = val[dim:dim * 2, :]
orig_state_dict[f"vit.encoder.layer.{layer_num}.attention.attention.value.weight"] = val[-dim:, :]
else:
orig_state_dict[f"vit.encoder.layer.{layer_num}.attention.attention.query.bias"] = val[:dim]
orig_state_dict[f"vit.encoder.layer.{layer_num}.attention.attention.key.bias"] = val[dim:dim * 2]
orig_state_dict[f"vit.encoder.layer.{layer_num}.attention.attention.value.bias"] = val[-dim:]
else:
orig_state_dict[rename_key(key)] = val
return orig_state_dict
def prepare_img() -> torch.Tensor:
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
im = Image.open(requests.get(url, stream=True).raw)
return im
@torch.no_grad()
def convert_yolos_checkpoint(
yolos_name: str, checkpoint_path: str, pytorch_dump_folder_path: str, push_to_hub: bool = False
):
"""
Copy/paste/tweak model's weights to our YOLOS structure.
"""
config = get_yolos_config(yolos_name)
state_dict = torch.load(checkpoint_path, map_location="cpu")["model"]
model = YolosForObjectDetection(config)
model.eval()
new_state_dict = convert_state_dict(state_dict, model)
model.load_state_dict(new_state_dict)
size = 800 if yolos_name != "yolos_ti" else 512
image_processor = YolosImageProcessor(format="coco_detection", size=size)
encoding = image_processor(images=prepare_img(), return_tensors="pt")
outputs = model(**encoding)
logits, pred_boxes = outputs.logits, outputs.pred_boxes
expected_slice_logits, expected_slice_boxes = None, None
if yolos_name == "yolos_ti":
expected_slice_logits = torch.tensor(
[[-39.5022, -11.9820, -17.6888], [-29.9574, -9.9769, -17.7691], [-42.3281, -20.7200, -30.6294]]
)
expected_slice_boxes = torch.tensor(
[[0.4021, 0.0836, 0.7979], [0.0184, 0.2609, 0.0364], [0.1781, 0.2004, 0.2095]]
)
elif yolos_name == "yolos_s_200_pre":
expected_slice_logits = torch.tensor(
[[-24.0248, -10.3024, -14.8290], [-42.0392, -16.8200, -27.4334], [-27.2743, -11.8154, -18.7148]]
)
expected_slice_boxes = torch.tensor(
[[0.2559, 0.5455, 0.4706], [0.2989, 0.7279, 0.1875], [0.7732, 0.4017, 0.4462]]
)
elif yolos_name == "yolos_s_300_pre":
expected_slice_logits = torch.tensor(
[[-36.2220, -14.4385, -23.5457], [-35.6970, -14.7583, -21.3935], [-31.5939, -13.6042, -16.8049]]
)
expected_slice_boxes = torch.tensor(
[[0.7614, 0.2316, 0.4728], [0.7168, 0.4495, 0.3855], [0.4996, 0.1466, 0.9996]]
)
elif yolos_name == "yolos_s_dWr":
expected_slice_logits = torch.tensor(
[[-42.8668, -24.1049, -41.1690], [-34.7456, -14.1274, -24.9194], [-33.7898, -12.1946, -25.6495]]
)
expected_slice_boxes = torch.tensor(
[[0.5587, 0.2773, 0.0605], [0.5004, 0.3014, 0.9994], [0.4999, 0.1548, 0.9994]]
)
elif yolos_name == "yolos_base":
expected_slice_logits = torch.tensor(
[[-40.6064, -24.3084, -32.6447], [-55.1990, -30.7719, -35.5877], [-51.4311, -33.3507, -35.6462]]
)
expected_slice_boxes = torch.tensor(
[[0.5555, 0.2794, 0.0655], [0.9049, 0.2664, 0.1894], [0.9183, 0.1984, 0.1635]]
)
else:
raise ValueError(f"Unknown yolos_name: {yolos_name}")
assert torch.allclose(logits[0, :3, :3], expected_slice_logits, atol=1e-4)
assert torch.allclose(pred_boxes[0, :3, :3], expected_slice_boxes, atol=1e-4)
Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
print(f"Saving model {yolos_name} to {pytorch_dump_folder_path}")
model.save_pretrained(pytorch_dump_folder_path)
print(f"Saving image processor to {pytorch_dump_folder_path}")
image_processor.save_pretrained(pytorch_dump_folder_path)
if push_to_hub:
model_mapping = {
"yolos_ti": "yolos-tiny",
"yolos_s_200_pre": "yolos-small",
"yolos_s_300_pre": "yolos-small-300",
"yolos_s_dWr": "yolos-small-dwr",
"yolos_base": "yolos-base",
}
print("Pushing to the hub...")
model_name = model_mapping[yolos_name]
image_processor.push_to_hub(model_name, organization="hustvl")
model.push_to_hub(model_name, organization="hustvl")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--yolos_name",
default="yolos_s_200_pre",
type=str,
help=(
"Name of the YOLOS model you'd like to convert. Should be one of 'yolos_ti', 'yolos_s_200_pre',"
" 'yolos_s_300_pre', 'yolos_s_dWr', 'yolos_base'."
),
)
parser.add_argument(
"--checkpoint_path", default=None, type=str, help="Path to the original state dict (.pth file)."
)
parser.add_argument(
"--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory."
)
parser.add_argument(
"--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
)
args = parser.parse_args()
convert_yolos_checkpoint(args.yolos_name, args.checkpoint_path, args.pytorch_dump_folder_path, args.push_to_hub)
.\models\yolos\feature_extraction_yolos.py
"""YOLOS 的特征提取器类。"""
import warnings
from ...image_transforms import rgb_to_id as _rgb_to_id
from ...utils import logging
from .image_processing_yolos import YolosImageProcessor
logger = logging.get_logger(__name__)
def rgb_to_id(x):
warnings.warn(
"rgb_to_id has moved and will not be importable from this module from v5. "
"Please import from transformers.image_transforms instead.",
FutureWarning,
)
return _rgb_to_id(x)
class YolosFeatureExtractor(YolosImageProcessor):
def __init__(self, *args, **kwargs) -> None:
warnings.warn(
"The class YolosFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please"
" use YolosImageProcessor instead.",
FutureWarning,
)
super().__init__(*args, **kwargs)
.\models\yolos\image_processing_yolos.py
"""
Compute the size of the image while maintaining the aspect ratio based on the given size and optional maximum size.
"""
aspect_ratio = float(image_size[0]) / image_size[1]
if max_size is None or (size[0] <= max_size[0] and size[1] <= max_size[1]):
return size
new_height = int(round(size[0] / aspect_ratio))
if new_height <= max_size[0]:
return size[0], new_height
new_width = int(round(size[1] * aspect_ratio))
return new_width, size[1]
height, width = image_size
if max_size is not None:
min_original_size = float(min((height, width)))
max_original_size = float(max((height, width)))
if max_original_size / min_original_size * size > max_size:
size = int(round(max_size * min_original_size / max_original_size))
if width < height and width != size:
height = int(size * height / width)
width = size
elif height < width and height != size:
width = int(size * width / height)
height = size
width_mod = np.mod(width, 16)
height_mod = np.mod(height, 16)
width = width - width_mod
height = height - height_mod
return (height, width)
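A quick worked example (assuming the helper above is in scope): a 480x640 image resized with `size=800` and `max_size=1333` keeps its aspect ratio, and both edges are then rounded down to multiples of 16:
```
>>> get_size_with_aspect_ratio((480, 640), size=800, max_size=1333)
(800, 1056)
```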
def get_resize_output_image_size(
input_image: np.ndarray,
size: Union[int, Tuple[int, int], List[int]],
max_size: Optional[int] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> Tuple[int, int]:
"""
Computes the output image size given the input image size and the desired output size. If the desired output size
is a tuple or list, the output image size is returned as is. If the desired output size is an integer, the output
image size is computed by keeping the aspect ratio of the input image size.
Args:
input_image (`np.ndarray`):
The image to resize.
size (`int` or `Tuple[int, int]` or `List[int]`):
The desired output size.
max_size (`int`, *optional*):
The maximum allowed output size.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred from the input image.
"""
image_size = get_image_size(input_image, input_data_format)
if isinstance(size, (list, tuple)):
return size
return get_size_with_aspect_ratio(image_size, size, max_size)
def get_numpy_to_framework_fn(arr) -> Callable:
"""
Returns a function that converts a numpy array to the framework of the input array.
Args:
arr (`np.ndarray`): The array to convert.
"""
if isinstance(arr, np.ndarray):
return np.array
if is_tf_available() and is_tf_tensor(arr):
import tensorflow as tf
return tf.convert_to_tensor
if is_torch_available() and is_torch_tensor(arr):
import torch
return torch.tensor
if is_flax_available() and is_jax_tensor(arr):
import jax.numpy as jnp
return jnp.array
raise ValueError(f"Cannot convert arrays of type {type(arr)}")
def safe_squeeze(arr: np.ndarray, axis: Optional[int] = None) -> np.ndarray:
"""
Squeezes an array, but only if the axis specified has dim 1.
"""
if axis is None:
return arr.squeeze()
try:
return arr.squeeze(axis=axis)
except ValueError:
return arr
def normalize_annotation(annotation: Dict, image_size: Tuple[int, int]) -> Dict:
image_height, image_width = image_size
norm_annotation = {}
for key, value in annotation.items():
if key == "boxes":
boxes = value
boxes = corners_to_center_format(boxes)
boxes /= np.asarray([image_width, image_height, image_width, image_height], dtype=np.float32)
norm_annotation[key] = boxes
else:
norm_annotation[key] = value
return norm_annotation
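A sketch of the effect on a single box (assuming the module-level `normalize_annotation` above is in scope): absolute corner coordinates on a 480x640 image become relative `(center_x, center_y, width, height)` values:
```
>>> import numpy as np
>>> annotation = {"boxes": np.array([[100.0, 50.0, 300.0, 250.0]]), "class_labels": np.array([3])}
>>> normalize_annotation(annotation, image_size=(480, 640))["boxes"].round(4).tolist()
[[0.3125, 0.3125, 0.3125, 0.4167]]
```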
def max_across_indices(values: Iterable[Any]) -> List[Any]:
"""
Return the maximum value across all indices of an iterable of values.
"""
return [max(values_i) for values_i in zip(*values)]
def make_pixel_mask(
image: np.ndarray, output_size: Tuple[int, int], input_data_format: Optional[Union[str, ChannelDimension]] = None
) -> np.ndarray:
"""
Make a pixel mask for the image, where 1 indicates a valid pixel and 0 indicates padding.
Args:
image (`np.ndarray`):
The image for which to make the pixel mask.
output_size (`Tuple[int, int]`):
Output size of the mask.
"""
input_height, input_width = get_image_size(image, channel_dim=input_data_format)
mask = np.zeros(output_size, dtype=np.int64)
mask[:input_height, :input_width] = 1
return mask
def convert_coco_poly_to_mask(segmentations, height: int, width: int) -> np.ndarray:
"""
Convert a COCO polygon annotation to a mask.
Args:
segmentations (`List[List[float]]`):
List of polygons, each polygon represented by a list of x-y coordinates.
height (`int`):
Height of the mask.
width (`int`):
Width of the mask.
"""
try:
from pycocotools import mask as coco_mask
except ImportError:
raise ImportError("Pycocotools未安装在您的环境中。")
masks = []
for polygons in segmentations:
rles = coco_mask.frPyObjects(polygons, height, width)
mask = coco_mask.decode(rles)
if len(mask.shape) < 3:
mask = mask[..., None]
mask = np.asarray(mask, dtype=np.uint8)
mask = np.any(mask, axis=2)
masks.append(mask)
if masks:
masks = np.stack(masks, axis=0)
else:
masks = np.zeros((0, height, width), dtype=np.uint8)
return masks
def prepare_coco_detection_annotation(
image,
target,
return_segmentation_masks: bool = False,
input_data_format: Optional[Union[ChannelDimension, str]] = None,
):
"""
Convert the target in COCO format into the format expected by DETR.
"""
image_height, image_width = get_image_size(image, channel_dim=input_data_format)
image_id = target["image_id"]
image_id = np.asarray([image_id], dtype=np.int64)
annotations = target["annotations"]
annotations = [obj for obj in annotations if "iscrowd" not in obj or obj["iscrowd"] == 0]
classes = [obj["category_id"] for obj in annotations]
classes = np.asarray(classes, dtype=np.int64)
area = np.asarray([obj["area"] for obj in annotations], dtype=np.float32)
iscrowd = np.asarray([obj["iscrowd"] if "iscrowd" in obj else 0 for obj in annotations], dtype=np.int64)
boxes = [obj["bbox"] for obj in annotations]
boxes = np.asarray(boxes, dtype=np.float32).reshape(-1, 4)
boxes[:, 2:] += boxes[:, :2]
boxes[:, 0::2] = boxes[:, 0::2].clip(min=0, max=image_width)
boxes[:, 1::2] = boxes[:, 1::2].clip(min=0, max=image_height)
keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0])
new_target = {}
new_target["image_id"] = image_id
new_target["class_labels"] = classes[keep]
new_target["boxes"] = boxes[keep]
new_target["area"] = area[keep]
new_target["iscrowd"] = iscrowd[keep]
new_target["orig_size"] = np.asarray([int(image_height), int(image_width)], dtype=np.int64)
if annotations and "keypoints" in annotations[0]:
keypoints = [obj["keypoints"] for obj in annotations]
keypoints = np.asarray(keypoints, dtype=np.float32)
keypoints = keypoints[keep]
num_keypoints = keypoints.shape[0]
keypoints = keypoints.reshape((-1, 3)) if num_keypoints else keypoints
new_target["keypoints"] = keypoints
if return_segmentation_masks:
segmentation_masks = [obj["segmentation"] for obj in annotations]
masks = convert_coco_poly_to_mask(segmentation_masks, image_height, image_width)
new_target["masks"] = masks[keep]
return new_target
def masks_to_boxes(masks: np.ndarray) -> np.ndarray:
"""
Compute the bounding boxes around the provided panoptic segmentation masks.
Args:
masks: masks in format `[number_masks, height, width]` where N is the number of masks
Returns:
boxes: bounding boxes in format `[number_masks, 4]` in xyxy format
"""
if masks.size == 0:
return np.zeros((0, 4))
h, w = masks.shape[-2:]
y = np.arange(0, h, dtype=np.float32)
x = np.arange(0, w, dtype=np.float32)
y, x = np.meshgrid(y, x, indexing="ij")
x_mask = masks * np.expand_dims(x, axis=0)
x_max = x_mask.reshape(x_mask.shape[0], -1).max(-1)
x = np.ma.array(x_mask, mask=~(np.array(masks, dtype=bool)))
x_min = x.filled(fill_value=1e8)
x_min = x_min.reshape(x_min.shape[0], -1).min(-1)
y_mask = masks * np.expand_dims(y, axis=0)
y_max = y_mask.reshape(x_mask.shape[0], -1).max(-1)
y = np.ma.array(y_mask, mask=~(np.array(masks, dtype=bool)))
y_min = y.filled(fill_value=1e8)
y_min = y_min.reshape(y_min.shape[0], -1).min(-1)
return np.stack([x_min, y_min, x_max, y_max], 1)
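For intuition, a single rectangular mask yields the expected `(x_min, y_min, x_max, y_max)` box (a sketch assuming the function above is in scope):
```
>>> import numpy as np
>>> mask = np.zeros((1, 4, 6), dtype=np.uint8)  # one mask of height 4 and width 6
>>> mask[0, 1:3, 2:5] = 1                       # foreground covers rows 1-2 and columns 2-4
>>> masks_to_boxes(mask).tolist()
[[2.0, 1.0, 4.0, 2.0]]
```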
def prepare_coco_panoptic_annotation(
image: np.ndarray,
target: Dict,
masks_path: Union[str, pathlib.Path],
return_masks: bool = True,
input_data_format: Union[ChannelDimension, str] = None,
) -> Dict:
"""
Prepare a coco panoptic annotation for YOLOS.
"""
image_height, image_width = get_image_size(image, channel_dim=input_data_format)
annotation_path = pathlib.Path(masks_path) / target["file_name"]
new_target = {}
new_target["image_id"] = np.asarray([target["image_id"] if "image_id" in target else target["id"]], dtype=np.int64)
new_target["size"] = np.asarray([image_height, image_width], dtype=np.int64)
new_target["orig_size"] = np.asarray([image_height, image_width], dtype=np.int64)
if "segments_info" in target:
masks = np.asarray(PIL.Image.open(annotation_path), dtype=np.uint32)
masks = rgb_to_id(masks)
ids = np.array([segment_info["id"] for segment_info in target["segments_info"]])
masks = masks == ids[:, None, None]
masks = masks.astype(np.uint8)
if return_masks:
new_target["masks"] = masks
new_target["boxes"] = masks_to_boxes(masks)
new_target["class_labels"] = np.array(
[segment_info["category_id"] for segment_info in target["segments_info"]], dtype=np.int64
)
new_target["iscrowd"] = np.asarray(
[segment_info["iscrowd"] for segment_info in target["segments_info"]], dtype=np.int64
)
new_target["area"] = np.asarray(
[segment_info["area"] for segment_info in target["segments_info"]], dtype=np.float32
)
return new_target
def get_segmentation_image(
masks: np.ndarray, input_size: Tuple, target_size: Tuple, stuff_equiv_classes, deduplicate=False
):
h, w = input_size
final_h, final_w = target_size
m_id = scipy.special.softmax(masks.transpose(0, 1), -1)
if m_id.shape[-1] == 0:
m_id = np.zeros((h, w), dtype=np.int64)
else:
m_id = m_id.argmax(-1).reshape(h, w)
if deduplicate:
for equiv in stuff_equiv_classes.values():
for eq_id in equiv:
m_id[m_id == eq_id] = equiv[0]
seg_img = id_to_rgb(m_id)
seg_img = resize(seg_img, (final_w, final_h), resample=PILImageResampling.NEAREST)
return seg_img
def get_mask_area(seg_img: np.ndarray, target_size: Tuple[int, int], n_classes: int) -> np.ndarray:
final_h, final_w = target_size
np_seg_img = seg_img.astype(np.uint8)
np_seg_img = np_seg_img.reshape(final_h, final_w, 3)
m_id = rgb_to_id(np_seg_img)
area = [(m_id == i).sum() for i in range(n_classes)]
return area
def score_labels_from_class_probabilities(logits: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
probs = scipy.special.softmax(logits, axis=-1)
labels = probs.argmax(-1, keepdims=True)
scores = np.take_along_axis(probs, labels, axis=-1)
scores, labels = scores.squeeze(-1), labels.squeeze(-1)
return scores, labels
def resize_annotation(
annotation: Dict[str, Any],
orig_size: Tuple[int, int],
target_size: Tuple[int, int],
threshold: float = 0.5,
resample: PILImageResampling = PILImageResampling.NEAREST,
):
"""
Resizes an annotation to a target size.
Args:
annotation (`Dict[str, Any]`):
The annotation dictionary.
orig_size (`Tuple[int, int]`):
The original size of the input image.
target_size (`Tuple[int, int]`):
The target size of the image, as returned by the preprocessing `resize` step.
threshold (`float`, *optional*, defaults to 0.5):
The threshold used to binarize the segmentation masks.
resample (`PILImageResampling`, defaults to `PILImageResampling.NEAREST`):
The resampling filter to use when resizing the masks.
"""
ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(target_size, orig_size))
ratio_height, ratio_width = ratios
new_annotation = {}
new_annotation["size"] = target_size
for key, value in annotation.items():
if key == "boxes":
boxes = value
scaled_boxes = boxes * np.asarray([ratio_width, ratio_height, ratio_width, ratio_height], dtype=np.float32)
new_annotation["boxes"] = scaled_boxes
elif key == "area":
area = value
scaled_area = area * (ratio_width * ratio_height)
new_annotation["area"] = scaled_area
elif key == "masks":
masks = value[:, None]
masks = np.array([resize(mask, target_size, resample=resample) for mask in masks])
masks = masks.astype(np.float32)
masks = masks[:, 0] > threshold
new_annotation["masks"] = masks
elif key == "size":
new_annotation["size"] = target_size
else:
new_annotation[key] = value
return new_annotation
def binary_mask_to_rle(mask):
"""
Converts given binary mask of shape `(height, width)` to the run-length encoding (RLE) format.
Args:
mask (`torch.Tensor` or `numpy.array`):
A binary mask tensor of shape `(height, width)` where 0 denotes background and 1 denotes the target
segment_id or class_id.
Returns:
`List`: Run-length encoded list of the binary mask. Refer to COCO API for more information about the RLE
format.
"""
if is_torch_tensor(mask):
mask = mask.numpy()
pixels = mask.flatten()
pixels = np.concatenate([[0], pixels, [0]])
runs = np.where(pixels[1:] != pixels[:-1])[0] + 1
runs[1::2] -= runs[::2]
return list(runs)
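A tiny worked example (assuming the function above is in scope): the encoding is a flat list of start/length pairs over the flattened mask, with 1-based start positions:
```
>>> import numpy as np
>>> mask = np.array([[0, 1, 1], [0, 1, 0]])
>>> [int(v) for v in binary_mask_to_rle(mask)]
[2, 2, 5, 1]
```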
def convert_segmentation_to_rle(segmentation):
"""
Converts given segmentation map of shape `(height, width)` to the run-length encoding (RLE) format.
Args:
segmentation (`torch.Tensor` or `numpy.array`):
A segmentation map of shape `(height, width)` where each value denotes a segment or class id.
Returns:
`List[List]`: A list of lists, where each list is the run-length encoding of a segment / class id.
"""
segment_ids = torch.unique(segmentation)
run_length_encodings = []
for idx in segment_ids:
mask = torch.where(segmentation == idx, 1, 0)
rle = binary_mask_to_rle(mask)
run_length_encodings.append(rle)
return run_length_encodings
def remove_low_and_no_objects(masks, scores, labels, object_mask_threshold, num_labels):
"""
Binarize the given masks using `object_mask_threshold`, it returns the associated values of `masks`, `scores` and
`labels`.
Args:
masks (`torch.Tensor`):
A tensor of shape `(num_queries, height, width)`.
scores (`torch.Tensor`):
A tensor of shape `(num_queries)`.
labels (`torch.Tensor`):
A tensor of shape `(num_queries)`.
object_mask_threshold (`float`):
A number between 0 and 1 used to binarize the masks.
Raises:
`ValueError`: Raised when the first dimension doesn't match in all input tensors.
Returns:
`Tuple[`torch.Tensor`, `torch.Tensor`, `torch.Tensor`]`: The `masks`, `scores` and `labels` without the region
< `object_mask_threshold`.
"""
if not (masks.shape[0] == scores.shape[0] == labels.shape[0]):
raise ValueError("mask, scores and labels must have the same shape!")
to_keep = labels.ne(num_labels) & (scores > object_mask_threshold)
return masks[to_keep], scores[to_keep], labels[to_keep]
def check_segment_validity(mask_labels, mask_probs, k, mask_threshold=0.5, overlap_mask_area_threshold=0.8):
mask_k = mask_labels == k
mask_k_area = mask_k.sum()
original_area = (mask_probs[k] >= mask_threshold).sum()
mask_exists = mask_k_area > 0 and original_area > 0
if mask_exists:
area_ratio = mask_k_area / original_area
if not area_ratio.item() > overlap_mask_area_threshold:
mask_exists = False
return mask_exists, mask_k
def compute_segments(
mask_probs,
pred_scores,
pred_labels,
mask_threshold: float = 0.5,
overlap_mask_area_threshold: float = 0.8,
label_ids_to_fuse: Optional[Set[int]] = None,
target_size: Tuple[int, int] = None,
):
height = mask_probs.shape[1] if target_size is None else target_size[0]
width = mask_probs.shape[2] if target_size is None else target_size[1]
segmentation = torch.zeros((height, width), dtype=torch.int32, device=mask_probs.device)
segments: List[Dict] = []
if target_size is not None:
mask_probs = nn.functional.interpolate(
mask_probs.unsqueeze(0), size=target_size, mode="bilinear", align_corners=False
)[0]
current_segment_id = 0
mask_probs *= pred_scores.view(-1, 1, 1)
mask_labels = mask_probs.argmax(0)
stuff_memory_list: Dict[str, int] = {}
for k in range(pred_labels.shape[0]):
pred_class = pred_labels[k].item()
should_fuse = pred_class in label_ids_to_fuse
mask_exists, mask_k = check_segment_validity(
mask_labels, mask_probs, k, mask_threshold, overlap_mask_area_threshold
)
if mask_exists:
if pred_class in stuff_memory_list:
current_segment_id = stuff_memory_list[pred_class]
else:
current_segment_id += 1
segmentation[mask_k] = current_segment_id
segment_score = round(pred_scores[k].item(), 6)
segments.append(
{
"id": current_segment_id,
"label_id": pred_class,
"was_fused": should_fuse,
"score": segment_score,
}
)
if should_fuse:
stuff_memory_list[pred_class] = current_segment_id
return segmentation, segments
class YolosImageProcessor(BaseImageProcessor):
r"""
Constructs a YOLOS image processor.
Args:
format (`str`, *optional*, defaults to `"coco_detection"`):
Data format of the annotations. One of "coco_detection" or "coco_panoptic".
do_resize (`bool`, *optional*, defaults to `True`):
Controls whether to resize the image's (height, width) dimensions to the specified `size`. Can be
overridden by the `do_resize` parameter in the `preprocess` method.
size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 800, "longest_edge": 1333}`):
Size of the image's (height, width) dimensions after resizing. Can be overridden by the `size` parameter in
the `preprocess` method.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
Resampling filter to use if resizing the image.
do_rescale (`bool`, *optional*, defaults to `True`):
Controls whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the
`do_rescale` parameter in the `preprocess` method.
rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the
`preprocess` method.
do_normalize:
Controls whether to normalize the image. Can be overridden by the `do_normalize` parameter in the
`preprocess` method.
image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_MEAN`):
Mean values to use when normalizing the image. Can be a single value or a list of values, one for each
channel. Can be overridden by the `image_mean` parameter in the `preprocess` method.
image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_STD`):
Standard deviation values to use when normalizing the image. Can be a single value or a list of values, one
for each channel. Can be overridden by the `image_std` parameter in the `preprocess` method.
do_pad (`bool`, *optional*, defaults to `True`):
Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess`
method. If `True` will pad the images in the batch to the largest height and width in the batch.
Padding will be applied to the bottom and right of the image with zeros.
"""
model_input_names = ["pixel_values", "pixel_mask"]
def __init__(
self,
format: Union[str, AnnotationFormat] = AnnotationFormat.COCO_DETECTION,
do_resize: bool = True,
size: Dict[str, int] = None,
resample: PILImageResampling = PILImageResampling.BILINEAR,
do_rescale: bool = True,
rescale_factor: Union[int, float] = 1 / 255,
do_normalize: bool = True,
image_mean: Union[float, List[float]] = None,
image_std: Union[float, List[float]] = None,
do_convert_annotations: Optional[bool] = None,
do_pad: bool = True,
**kwargs,
) -> None:
if "pad_and_return_pixel_mask" in kwargs:
do_pad = kwargs.pop("pad_and_return_pixel_mask")
if "max_size" in kwargs:
logger.warning_once(
"The `max_size` parameter is deprecated and will be removed in v4.26. "
"Please specify in `size['longest_edge'] instead`.",
)
max_size = kwargs.pop("max_size")
else:
max_size = None if size is None else 1333
size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333}
size = get_size_dict(size, max_size=max_size, default_to_square=False)
if do_convert_annotations is None:
do_convert_annotations = do_normalize
super().__init__(**kwargs)
self.format = format
self.do_resize = do_resize
self.size = size
self.resample = resample
self.do_rescale = do_rescale
self.rescale_factor = rescale_factor
self.do_normalize = do_normalize
self.do_convert_annotations = do_convert_annotations
self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
self.do_pad = do_pad
self._valid_processor_keys = [
"images",
"annotations",
"return_segmentation_masks",
"masks_path",
"do_resize",
"size",
"resample",
"do_rescale",
"rescale_factor",
"do_normalize",
"image_mean",
"image_std",
"do_convert_annotations",
"do_pad",
"format",
"return_tensors",
"data_format",
"input_data_format",
]
@classmethod
def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs):
image_processor_dict = image_processor_dict.copy()
if "max_size" in kwargs:
image_processor_dict["max_size"] = kwargs.pop("max_size")
if "pad_and_return_pixel_mask" in kwargs:
image_processor_dict["pad_and_return_pixel_mask"] = kwargs.pop("pad_and_return_pixel_mask")
return super().from_dict(image_processor_dict, **kwargs)
def prepare_annotation(
self,
image: np.ndarray,
target: Dict,
format: Optional[AnnotationFormat] = None,
return_segmentation_masks: bool = None,
masks_path: Optional[Union[str, pathlib.Path]] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> Dict:
"""
Prepare an annotation for feeding into DETR model.
"""
format = format if format is not None else self.format
if format == AnnotationFormat.COCO_DETECTION:
return_segmentation_masks = False if return_segmentation_masks is None else return_segmentation_masks
target = prepare_coco_detection_annotation(
image, target, return_segmentation_masks, input_data_format=input_data_format
)
elif format == AnnotationFormat.COCO_PANOPTIC:
return_segmentation_masks = True if return_segmentation_masks is None else return_segmentation_masks
target = prepare_coco_panoptic_annotation(
image,
target,
masks_path=masks_path,
return_masks=return_segmentation_masks,
input_data_format=input_data_format,
)
else:
raise ValueError(f"Format {format} is not supported.")
return target
def prepare(self, image, target, return_segmentation_masks=None, masks_path=None):
logger.warning_once(
"The `prepare` method is deprecated and will be removed in a v4.33. "
"Please use `prepare_annotation` instead. Note: the `prepare_annotation` method "
"does not return the image anymore.",
)
target = self.prepare_annotation(image, target, return_segmentation_masks, masks_path, self.format)
return image, target
def convert_coco_poly_to_mask(self, *args, **kwargs):
logger.warning_once("The `convert_coco_poly_to_mask` method is deprecated and will be removed in v4.33. ")
return convert_coco_poly_to_mask(*args, **kwargs)
def prepare_coco_detection(self, *args, **kwargs):
logger.warning_once("The `prepare_coco_detection` method is deprecated and will be removed in v4.33. ")
return prepare_coco_detection_annotation(*args, **kwargs)
def prepare_coco_panoptic(self, *args, **kwargs):
logger.warning_once("The `prepare_coco_panoptic` method is deprecated and will be removed in v4.33. ")
return prepare_coco_panoptic_annotation(*args, **kwargs)
def resize(
self,
image: np.ndarray,
size: Dict[str, int],
resample: PILImageResampling = PILImageResampling.BILINEAR,
data_format: Optional[Union[str, ChannelDimension]] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
) -> np.ndarray:
"""
Resize the image to the given size. Size can be `min_size` (scalar) or `(height, width)` tuple. If size is an
int, smaller edge of the image will be matched to this number.
Args:
image (`np.ndarray`):
Image to resize.
size (`Dict[str, int]`):
Dictionary containing the size to resize to. Can contain the keys `shortest_edge` and `longest_edge` or
`height` and `width`.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
Resampling filter to use if resizing the image.
data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format for the output image. If unset, the channel dimension format of the input
image is used.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred.
"""
if "max_size" in kwargs:
logger.warning_once(
"The `max_size` parameter is deprecated and will be removed in v4.26. "
"Please specify in `size['longest_edge'] instead`.",
)
max_size = kwargs.pop("max_size")
else:
max_size = None
size = get_size_dict(size, max_size=max_size, default_to_square=False)
if "shortest_edge" in size and "longest_edge" in size:
size = get_resize_output_image_size(
image, size["shortest_edge"], size["longest_edge"], input_data_format=input_data_format
)
elif "height" in size and "width" in size:
size = (size["height"], size["width"])
else:
raise ValueError(
"Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got"
f" {size.keys()}."
)
image = resize(
image, size=size, resample=resample, data_format=data_format, input_data_format=input_data_format, **kwargs
)
return image
def resize_annotation(
self,
annotation,
orig_size,
size,
resample: PILImageResampling = PILImageResampling.NEAREST,
) -> Dict:
"""
Resize the annotation to match the resized image. If size is an int, smaller edge of the mask will be matched
to this number.
"""
return resize_annotation(annotation, orig_size=orig_size, target_size=size, resample=resample)
def rescale(
self,
image: np.ndarray,
rescale_factor: float,
data_format: Optional[Union[str, ChannelDimension]] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> np.ndarray:
"""
Rescale the image by the given factor. image = image * rescale_factor.
Args:
image (`np.ndarray`):
Image to rescale.
rescale_factor (`float`):
The value to use for rescaling.
data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format for the output image. If unset, the channel dimension format of the input
image is used. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
input_data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format for the input image. If unset, is inferred from the input image. Can be
one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
"""
return rescale(image, rescale_factor, data_format=data_format, input_data_format=input_data_format)
def normalize_annotation(self, annotation: Dict, image_size: Tuple[int, int]) -> Dict:
"""
Normalize the boxes in the annotation from `[top_left_x, top_left_y, bottom_right_x, bottom_right_y]` to
`[center_x, center_y, width, height]` format and from absolute to relative pixel values.
"""
return normalize_annotation(annotation, image_size=image_size)
def _update_annotation_for_padded_image(
self,
annotation: Dict,
input_image_size: Tuple[int, int],
output_image_size: Tuple[int, int],
padding,
update_bboxes,
) -> Dict:
"""
Update the annotation to reflect padding changes in the input image.
Args:
annotation (`Dict`):
The original annotation to update.
input_image_size (`Tuple[int, int]`):
Size of the original input image before padding.
output_image_size (`Tuple[int, int]`):
Size of the padded output image after padding.
padding:
Details of padding applied to the image.
update_bboxes:
Boolean flag indicating whether to update bounding boxes in the annotation.
Returns:
`Dict`: Updated annotation reflecting changes due to padding.
"""
new_annotation = {}
new_annotation["size"] = output_image_size
for key, value in annotation.items():
if key == "masks":
masks = value
masks = pad(
masks,
padding,
mode=PaddingMode.CONSTANT,
constant_values=0,
input_data_format=ChannelDimension.FIRST,
)
masks = safe_squeeze(masks, 1)
new_annotation["masks"] = masks
elif key == "boxes" and update_bboxes:
boxes = value
boxes *= np.asarray(
[
input_image_size[1] / output_image_size[1],
input_image_size[0] / output_image_size[0],
input_image_size[1] / output_image_size[1],
input_image_size[0] / output_image_size[0],
]
)
new_annotation["boxes"] = boxes
elif key == "size":
new_annotation["size"] = output_image_size
else:
new_annotation[key] = value
return new_annotation
def _pad_image(
self,
image: np.ndarray,
output_size: Tuple[int, int],
annotation: Optional[Dict[str, Any]] = None,
constant_values: Union[float, Iterable[float]] = 0,
data_format: Optional[ChannelDimension] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
update_bboxes: bool = True,
) -> np.ndarray:
"""
Pad an image with zeros to the given size.
"""
input_height, input_width = get_image_size(image, channel_dim=input_data_format)
output_height, output_width = output_size
pad_bottom = output_height - input_height
pad_right = output_width - input_width
padding = ((0, pad_bottom), (0, pad_right))
padded_image = pad(
image,
padding,
mode=PaddingMode.CONSTANT,
constant_values=constant_values,
data_format=data_format,
input_data_format=input_data_format,
)
if annotation is not None:
annotation = self._update_annotation_for_padded_image(
annotation, (input_height, input_width), (output_height, output_width), padding, update_bboxes
)
return padded_image, annotation
def pad(
self,
images: List[np.ndarray],
annotations: Optional[List[Dict[str, Any]]] = None,
constant_values: Union[float, Iterable[float]] = 0,
return_pixel_mask: bool = False,
return_tensors: Optional[Union[str, TensorType]] = None,
data_format: Optional[ChannelDimension] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
update_bboxes: bool = True,
) -> BatchFeature:
...
def preprocess(
self,
images: ImageInput,
annotations: Optional[Union[AnnotationType, List[AnnotationType]]] = None,
return_segmentation_masks: bool = None,
masks_path: Optional[Union[str, pathlib.Path]] = None,
do_resize: Optional[bool] = None,
size: Optional[Dict[str, int]] = None,
resample=None,
do_rescale: Optional[bool] = None,
rescale_factor: Optional[Union[int, float]] = None,
do_normalize: Optional[bool] = None,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
do_convert_annotations: Optional[bool] = None,
do_pad: Optional[bool] = None,
format: Optional[Union[str, AnnotationFormat]] = None,
return_tensors: Optional[Union[TensorType, str]] = None,
data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
) -> BatchFeature:
...
def post_process(self, outputs, target_sizes):
"""
Converts the raw output of [`YolosForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y,
bottom_right_x, bottom_right_y) format. Only supports PyTorch.
Args:
outputs ([`YolosObjectDetectionOutput`]):
Raw outputs of the model.
target_sizes (`torch.Tensor` of shape `(batch_size, 2)`):
Tensor containing the size (height, width) of each image of the batch. For evaluation, this must be the
original image size (before any data augmentation). For visualization, this should be the image size
after data augment, but before padding.
Returns:
`List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
in the batch as predicted by the model.
"""
logger.warning_once(
"`post_process` is deprecated and will be removed in v5 of Transformers, please use"
" `post_process_object_detection` instead, with `threshold=0.` for equivalent results.",
)
out_logits, out_bbox = outputs.logits, outputs.pred_boxes
if len(out_logits) != len(target_sizes):
raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits")
if target_sizes.shape[1] != 2:
raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch")
prob = nn.functional.softmax(out_logits, -1)
scores, labels = prob[..., :-1].max(-1)
boxes = center_to_corners_format(out_bbox)
img_h, img_w = target_sizes.unbind(1)
scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device)
boxes = boxes * scale_fct[:, None, :]
results = [{"scores": s, "labels": l, "boxes": b} for s, l, b in zip(scores, labels, boxes)]
return results
def post_process_object_detection(
self, outputs, threshold: float = 0.5, target_sizes: Union[TensorType, List[Tuple]] = None
):
"""
Converts the raw output of [`YolosForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y,
bottom_right_x, bottom_right_y) format. Only supports PyTorch.
Args:
outputs ([`YolosObjectDetectionOutput`]):
Raw outputs of the model.
threshold (`float`, *optional*):
Score threshold to keep object detection predictions.
target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*):
Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size
`(height, width)` of each image in the batch. If unset, predictions will not be resized.
Returns:
`List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
in the batch as predicted by the model.
"""
out_logits, out_bbox = outputs.logits, outputs.pred_boxes
if target_sizes is not None:
if len(out_logits) != len(target_sizes):
raise ValueError(
"Make sure that you pass in as many target sizes as the batch dimension of the logits"
)
prob = nn.functional.softmax(out_logits, -1)
scores, labels = prob[..., :-1].max(-1)
boxes = center_to_corners_format(out_bbox)
if target_sizes is not None:
if isinstance(target_sizes, list):
img_h = torch.Tensor([i[0] for i in target_sizes])
img_w = torch.Tensor([i[1] for i in target_sizes])
else:
img_h, img_w = target_sizes.unbind(1)
scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device)
boxes = boxes * scale_fct[:, None, :]
results = []
for s, l, b in zip(scores, labels, boxes):
score = s[s > threshold]
label = l[s > threshold]
box = b[s > threshold]
results.append({"scores": score, "labels": label, "boxes": box})
return results
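# A rough usage sketch (not part of the original file) of how post_process_object_detection is
# typically driven end to end, assuming the "hustvl/yolos-small" checkpoint; the image path and
# threshold are illustrative.
import torch
from PIL import Image
from transformers import AutoImageProcessor, YolosForObjectDetection

image = Image.open("example.jpg")  # hypothetical input image
image_processor = AutoImageProcessor.from_pretrained("hustvl/yolos-small")
model = YolosForObjectDetection.from_pretrained("hustvl/yolos-small")

inputs = image_processor(images=image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# target_sizes must be the original (height, width) of each image, as documented above.
target_sizes = torch.tensor([image.size[::-1]])
results = image_processor.post_process_object_detection(outputs, threshold=0.9, target_sizes=target_sizes)[0]
for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
    print(model.config.id2label[label.item()], round(score.item(), 3), box.tolist())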
.\models\yolos\modeling_yolos.py
""" PyTorch YOLOS 模型."""
import collections.abc
import math
from dataclasses import dataclass
from typing import Dict, List, Optional, Set, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import Tensor, nn
from ...activations import ACT2FN
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import (
ModelOutput,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
is_accelerate_available,
is_scipy_available,
is_vision_available,
logging,
replace_return_docstrings,
requires_backends,
)
from .configuration_yolos import YolosConfig
if is_scipy_available():
from scipy.optimize import linear_sum_assignment
if is_vision_available():
from transformers.image_transforms import center_to_corners_format
if is_accelerate_available():
from accelerate import PartialState
from accelerate.utils import reduce
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "YolosConfig"
_CHECKPOINT_FOR_DOC = "hustvl/yolos-small"
_EXPECTED_OUTPUT_SHAPE = [1, 3401, 384]
YOLOS_PRETRAINED_MODEL_ARCHIVE_LIST = [
"hustvl/yolos-small",
]
@dataclass
class YolosObjectDetectionOutput(ModelOutput):
"""
Output type of [`YolosForObjectDetection`].
Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided):
Total loss as a linear combination of a negative log-likelihood (cross-entropy) for class prediction and a
bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized
scale-invariant IoU loss.
loss_dict (`Dict`, *optional*):
A dictionary containing the individual losses. Useful for logging.
logits (`torch.FloatTensor` of shape `(batch_size, num_queries, num_classes + 1)`):
Classification logits (including no-object) for all queries.
pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
possible padding). You can use [`~YolosImageProcessor.post_process`] to retrieve the unnormalized bounding
boxes.
auxiliary_outputs (`list[Dict]`, *optional*):
Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`)
and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and
`pred_boxes`) for each decoder layer.
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the decoder of the model.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of
the model at the output of each layer plus the optional initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
the self-attention heads.
"""
loss: Optional[torch.FloatTensor] = None
loss_dict: Optional[Dict] = None
logits: torch.FloatTensor = None
pred_boxes: torch.FloatTensor = None
auxiliary_outputs: Optional[List[Dict]] = None
last_hidden_state: Optional[torch.FloatTensor] = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
class YolosEmbeddings(nn.Module):
"""
Construct the CLS token, detection tokens, position and patch embeddings.
"""
def __init__(self, config: YolosConfig) -> None:
super().__init__()
self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
self.detection_tokens = nn.Parameter(torch.zeros(1, config.num_detection_tokens, config.hidden_size))
self.patch_embeddings = YolosPatchEmbeddings(config)
num_patches = self.patch_embeddings.num_patches
self.position_embeddings = nn.Parameter(
torch.zeros(1, num_patches + config.num_detection_tokens + 1, config.hidden_size)
)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.interpolation = InterpolateInitialPositionEmbeddings(config)
self.config = config
def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
batch_size, num_channels, height, width = pixel_values.shape
embeddings = self.patch_embeddings(pixel_values)
batch_size, seq_len, _ = embeddings.size()
cls_tokens = self.cls_token.expand(batch_size, -1, -1)
detection_tokens = self.detection_tokens.expand(batch_size, -1, -1)
embeddings = torch.cat((cls_tokens, embeddings, detection_tokens), dim=1)
position_embeddings = self.interpolation(self.position_embeddings, (height, width))
embeddings = embeddings + position_embeddings
embeddings = self.dropout(embeddings)
return embeddings
class InterpolateInitialPositionEmbeddings(nn.Module):
def __init__(self, config) -> None:
super().__init__()
self.config = config
def forward(self, pos_embed, img_size=(800, 1344)) -> torch.Tensor:
cls_pos_embed = pos_embed[:, 0, :]
cls_pos_embed = cls_pos_embed[:, None]
det_pos_embed = pos_embed[:, -self.config.num_detection_tokens :, :]
patch_pos_embed = pos_embed[:, 1 : -self.config.num_detection_tokens, :]
patch_pos_embed = patch_pos_embed.transpose(1, 2)
batch_size, hidden_size, seq_len = patch_pos_embed.shape
patch_height, patch_width = (
self.config.image_size[0] // self.config.patch_size,
self.config.image_size[1] // self.config.patch_size,
)
patch_pos_embed = patch_pos_embed.view(batch_size, hidden_size, patch_height, patch_width)
height, width = img_size
new_patch_height, new_patch_width = height // self.config.patch_size, width // self.config.patch_size
patch_pos_embed = nn.functional.interpolate(
patch_pos_embed, size=(new_patch_height, new_patch_width), mode="bicubic", align_corners=False
)
patch_pos_embed = patch_pos_embed.flatten(2).transpose(1, 2)
scale_pos_embed = torch.cat((cls_pos_embed, patch_pos_embed, det_pos_embed), dim=1)
return scale_pos_embed
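# A toy shape walkthrough (not part of the original file) of the split / interpolate /
# re-concatenate flow above, assuming image_size=(32, 48), patch_size=16, hidden_size=8 and
# 2 detection tokens; all numbers are illustrative.
import torch
from torch import nn

hidden_size, num_det = 8, 2
patch_h, patch_w = 32 // 16, 48 // 16          # 2 x 3 patch grid at the pre-training resolution
pos_embed = torch.zeros(1, 1 + patch_h * patch_w + num_det, hidden_size)

cls_pos = pos_embed[:, :1, :]
det_pos = pos_embed[:, -num_det:, :]
patch_pos = pos_embed[:, 1:-num_det, :].transpose(1, 2).view(1, hidden_size, patch_h, patch_w)

new_h, new_w = 64 // 16, 80 // 16              # patch grid for a 64 x 80 input image
patch_pos = nn.functional.interpolate(patch_pos, size=(new_h, new_w), mode="bicubic", align_corners=False)
patch_pos = patch_pos.flatten(2).transpose(1, 2)

out = torch.cat((cls_pos, patch_pos, det_pos), dim=1)
assert out.shape == (1, 1 + new_h * new_w + num_det, hidden_size)  # (1, 23, 8)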
class InterpolateMidPositionEmbeddings(nn.Module):
"""
模块用于在Transformer模型中插值中间位置的位置嵌入。
Args:
config: 模型配置参数对象
Attributes:
config: 存储模型配置参数的对象
Methods:
forward(pos_embed, img_size=(800, 1344)): 前向传播方法,用于计算位置嵌入的插值结果。
"""
def __init__(self, config) -> None:
super().__init__()
self.config = config
def forward(self, pos_embed, img_size=(800, 1344)) -> torch.Tensor:
"""
执行前向传播计算插值后的位置嵌入。
Args:
pos_embed: 位置嵌入张量,形状为(batch_size, seq_length, hidden_size, seq_len)
img_size: 图像大小元组,默认为(800, 1344)
Returns:
scale_pos_embed: 插值后的位置嵌入张量,形状为(batch_size, seq_length, hidden_size)
"""
cls_pos_embed = pos_embed[:, :, 0, :]
cls_pos_embed = cls_pos_embed[:, None]
det_pos_embed = pos_embed[:, :, -self.config.num_detection_tokens :, :]
patch_pos_embed = pos_embed[:, :, 1 : -self.config.num_detection_tokens, :]
patch_pos_embed = patch_pos_embed.transpose(2, 3)
depth, batch_size, hidden_size, seq_len = patch_pos_embed.shape
patch_height, patch_width = (
self.config.image_size[0] // self.config.patch_size,
self.config.image_size[1] // self.config.patch_size,
)
patch_pos_embed = patch_pos_embed.view(depth * batch_size, hidden_size, patch_height, patch_width)
height, width = img_size
new_patch_height, new_patch_width = height // self.config.patch_size, width // self.config.patch_size
patch_pos_embed = nn.functional.interpolate(
patch_pos_embed, size=(new_patch_height, new_patch_width), mode="bicubic", align_corners=False
)
patch_pos_embed = (
patch_pos_embed.flatten(2)
.transpose(1, 2)
.contiguous()
.view(depth, batch_size, new_patch_height * new_patch_width, hidden_size)
)
scale_pos_embed = torch.cat((cls_pos_embed, patch_pos_embed, det_pos_embed), dim=2)
return scale_pos_embed
class YolosPatchEmbeddings(nn.Module):
"""
此类将输入的`pixel_values`(形状为(batch_size, num_channels, height, width))转换为Transformer模型消费的初始隐藏状态(补丁嵌入),
形状为(batch_size, seq_length, hidden_size)。
Args:
config: 模型配置参数对象
Attributes:
image_size: 图像大小元组
patch_size: 补丁大小元组
num_channels: 输入图像的通道数
num_patches: 图像中的补丁数量
Methods:
__init__(config): 初始化方法,设置类属性和卷积投影
"""
def __init__(self, config):
super().__init__()
image_size, patch_size = config.image_size, config.patch_size
num_channels, hidden_size = config.num_channels, config.hidden_size
image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
self.image_size = image_size
self.patch_size = patch_size
self.num_channels = num_channels
self.num_patches = num_patches
self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)
def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
batch_size, num_channels, height, width = pixel_values.shape
if num_channels != self.num_channels:
raise ValueError(
"Make sure that the channel dimension of the pixel values match with the one set in the configuration."
)
embeddings = self.projection(pixel_values).flatten(2).transpose(1, 2)
return embeddings
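# A small shape check (not part of the original file): the Conv2d projection above turns
# (batch, channels, H, W) into (batch, num_patches, hidden_size), since a stride equal to the
# kernel size maps every patch_size x patch_size tile to one token. Sizes are illustrative.
import torch
from torch import nn

batch, channels, height, width = 2, 3, 224, 224
patch_size, hidden_size = 16, 64
projection = nn.Conv2d(channels, hidden_size, kernel_size=patch_size, stride=patch_size)

pixel_values = torch.randn(batch, channels, height, width)
embeddings = projection(pixel_values).flatten(2).transpose(1, 2)
assert embeddings.shape == (2, (224 // 16) * (224 // 16), hidden_size)  # (2, 196, 64)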
class YolosSelfAttention(nn.Module):
def __init__(self, config: YolosConfig) -> None:
super().__init__()
if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
raise ValueError(
f"The hidden size {config.hidden_size,} is not a multiple of the number of attention "
f"heads {config.num_attention_heads}."
)
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
x = x.view(new_x_shape)
return x.permute(0, 2, 1, 3)
def forward(
self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False
) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
mixed_query_layer = self.query(hidden_states)
key_layer = self.transpose_for_scores(self.key(hidden_states))
value_layer = self.transpose_for_scores(self.value(hidden_states))
query_layer = self.transpose_for_scores(mixed_query_layer)
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
attention_scores = attention_scores / math.sqrt(self.attention_head_size)
attention_probs = nn.functional.softmax(attention_scores, dim=-1)
attention_probs = self.dropout(attention_probs)
if head_mask is not None:
attention_probs = attention_probs * head_mask
context_layer = torch.matmul(attention_probs, value_layer)
context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
context_layer = context_layer.view(new_context_layer_shape)
outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
return outputs
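# A toy reshaping sketch (not part of the original file) of what transpose_for_scores does:
# hidden_size is split into (num_heads, head_size) and the head axis is moved in front so the
# matmuls above run independently per head. Sizes are illustrative; q = k = v here for brevity.
import torch

batch, seq_len, num_heads, head_size = 2, 5, 4, 8
hidden = torch.randn(batch, seq_len, num_heads * head_size)

x = hidden.view(batch, seq_len, num_heads, head_size).permute(0, 2, 1, 3)  # (batch, heads, seq, head_size)
scores = torch.matmul(x, x.transpose(-1, -2)) / (head_size ** 0.5)         # (batch, heads, seq, seq)
probs = torch.softmax(scores, dim=-1)
context = torch.matmul(probs, x).permute(0, 2, 1, 3).reshape(batch, seq_len, num_heads * head_size)
assert context.shape == hidden.shape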
class YolosSelfOutput(nn.Module):
"""
The residual connection is defined in YolosLayer instead of here (as is the case with other models), due to the
layernorm applied before each block.
"""
def __init__(self, config: YolosConfig) -> None:
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
return hidden_states
class YolosAttention(nn.Module):
def __init__(self, config: YolosConfig) -> None:
super().__init__()
self.attention = YolosSelfAttention(config)
self.output = YolosSelfOutput(config)
self.pruned_heads = set()
def prune_heads(self, heads: Set[int]) -> None:
if len(heads) == 0:
return
heads, index = find_pruneable_heads_and_indices(
heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads
)
self.attention.query = prune_linear_layer(self.attention.query, index)
self.attention.key = prune_linear_layer(self.attention.key, index)
self.attention.value = prune_linear_layer(self.attention.value, index)
self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads)
self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads
self.pruned_heads = self.pruned_heads.union(heads)
def forward(
self,
hidden_states: torch.Tensor,
head_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
self_outputs = self.attention(hidden_states, head_mask, output_attentions)
attention_output = self.output(self_outputs[0], hidden_states)
outputs = (attention_output,) + self_outputs[1:]
return outputs
class YolosIntermediate(nn.Module):
def __init__(self, config: YolosConfig) -> None:
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states
class YolosOutput(nn.Module):
def __init__(self, config: YolosConfig) -> None:
super().__init__()
self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = hidden_states + input_tensor
return hidden_states
class YolosLayer(nn.Module):
"""这对应于timm实现中的Block类。"""
def __init__(self, config: YolosConfig) -> None:
super().__init__()
self.chunk_size_feed_forward = config.chunk_size_feed_forward
self.seq_len_dim = 1
self.attention = YolosAttention(config)
self.intermediate = YolosIntermediate(config)
self.output = YolosOutput(config)
self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
def forward(
self,
hidden_states: torch.Tensor,
head_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
self_attention_outputs = self.attention(
self.layernorm_before(hidden_states),
head_mask,
output_attentions=output_attentions,
)
attention_output = self_attention_outputs[0]
outputs = self_attention_outputs[1:]
hidden_states = attention_output + hidden_states
layer_output = self.layernorm_after(hidden_states)
layer_output = self.intermediate(layer_output)
layer_output = self.output(layer_output, hidden_states)
outputs = (layer_output,) + outputs
return outputs
class YolosEncoder(nn.Module):
    def __init__(self, config: YolosConfig) -> None:
super().__init__()
self.config = config
self.layer = nn.ModuleList([YolosLayer(config) for _ in range(config.num_hidden_layers)])
self.gradient_checkpointing = False
seq_length = (
1 + (config.image_size[0] * config.image_size[1] // config.patch_size**2) + config.num_detection_tokens
)
self.mid_position_embeddings = (
nn.Parameter(
torch.zeros(
config.num_hidden_layers - 1,
1,
seq_length,
config.hidden_size,
)
)
if config.use_mid_position_embeddings
else None
)
self.interpolation = InterpolateMidPositionEmbeddings(config) if config.use_mid_position_embeddings else None
def forward(
self,
hidden_states: torch.Tensor,
height,
width,
head_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
) -> Union[tuple, BaseModelOutput]:
all_hidden_states = () if output_hidden_states else None
all_self_attentions = () if output_attentions else None
if self.config.use_mid_position_embeddings:
interpolated_mid_position_embeddings = self.interpolation(self.mid_position_embeddings, (height, width))
for i, layer_module in enumerate(self.layer):
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
layer_head_mask = head_mask[i] if head_mask is not None else None
if self.gradient_checkpointing and self.training:
layer_outputs = self._gradient_checkpointing_func(
layer_module.__call__,
hidden_states,
layer_head_mask,
output_attentions,
)
else:
layer_outputs = layer_module(hidden_states, layer_head_mask, output_attentions)
hidden_states = layer_outputs[0]
if self.config.use_mid_position_embeddings:
if i < (self.config.num_hidden_layers - 1):
hidden_states = hidden_states + interpolated_mid_position_embeddings[i]
if output_attentions:
all_self_attentions = all_self_attentions + (layer_outputs[1],)
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
return BaseModelOutput(
last_hidden_state=hidden_states,
hidden_states=all_hidden_states,
attentions=all_self_attentions,
)
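# A quick sanity check (not part of the original file) of the seq_length formula used for the
# mid position embeddings in the encoder above: 1 CLS token + one token per patch + the
# detection tokens. The numbers below are illustrative, not a specific checkpoint's config.
image_height, image_width, patch_size, num_detection_tokens = 512, 864, 16, 100
seq_length = 1 + (image_height * image_width) // patch_size**2 + num_detection_tokens
print(seq_length)  # 1 + 1728 + 100 = 1829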
@add_start_docstrings(
"The bare YOLOS Model transformer outputting raw hidden-states without any specific head on top.",
YOLOS_START_DOCSTRING,
)
class YolosModel(YolosPreTrainedModel):
"""
The bare YOLOS transformer model, outputting raw hidden-states without any task-specific head on top.
See `YOLOS_START_DOCSTRING` for the shared documentation.
"""
def __init__(self, config: YolosConfig, add_pooling_layer: bool = True):
super().__init__(config)
self.config = config
self.embeddings = YolosEmbeddings(config)
self.encoder = YolosEncoder(config)
self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.pooler = YolosPooler(config) if add_pooling_layer else None
self.post_init()
def get_input_embeddings(self) -> YolosPatchEmbeddings:
return self.embeddings.patch_embeddings
def _prune_heads(self, heads_to_prune: Dict[int, List[int]]) -> None:
"""
Prunes heads of the model.
Args:
heads_to_prune (`dict` of {layer_num: list of heads to prune in this layer}):
See base class `PreTrainedModel`.
"""
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
@add_start_docstrings_to_model_forward(YOLOS_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPooling,
config_class=_CONFIG_FOR_DOC,
modality="vision",
expected_output=_EXPECTED_OUTPUT_SHAPE,
)
def forward(
self,
pixel_values: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPooling]:
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if pixel_values is None:
raise ValueError("You have to specify pixel_values")
head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
embedding_output = self.embeddings(pixel_values)
encoder_outputs = self.encoder(
embedding_output,
height=pixel_values.shape[-2],
width=pixel_values.shape[-1],
head_mask=head_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = encoder_outputs[0]
sequence_output = self.layernorm(sequence_output)
pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
if not return_dict:
head_outputs = (sequence_output, pooled_output) if pooled_output is not None else (sequence_output,)
return head_outputs + encoder_outputs[1:]
return BaseModelOutputWithPooling(
last_hidden_state=sequence_output,
pooler_output=pooled_output,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
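# A minimal usage sketch (not part of the original file) of the bare YolosModel above, assuming
# the "hustvl/yolos-small" checkpoint referenced in this file; the image path is illustrative.
import torch
from PIL import Image
from transformers import AutoImageProcessor, YolosModel

image = Image.open("example.jpg")  # hypothetical input image
image_processor = AutoImageProcessor.from_pretrained("hustvl/yolos-small")
model = YolosModel.from_pretrained("hustvl/yolos-small")

inputs = image_processor(images=image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# last_hidden_state covers the CLS token, every patch token and the detection tokens.
print(outputs.last_hidden_state.shape)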
class YolosPooler(nn.Module):
def __init__(self, config: YolosConfig):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.activation = nn.Tanh()
def forward(self, hidden_states):
first_token_tensor = hidden_states[:, 0]
pooled_output = self.dense(first_token_tensor)
pooled_output = self.activation(pooled_output)
return pooled_output
@add_start_docstrings(
"""
YOLOS Model (consisting of a ViT encoder) with object detection heads on top, for tasks such as COCO detection.
""",
YOLOS_START_DOCSTRING,
)
class YolosForObjectDetection(YolosPreTrainedModel):
def __init__(self, config: YolosConfig):
super().__init__(config)
self.vit = YolosModel(config, add_pooling_layer=False)
self.class_labels_classifier = YolosMLPPredictionHead(
input_dim=config.hidden_size, hidden_dim=config.hidden_size, output_dim=config.num_labels + 1, num_layers=3
)
self.bbox_predictor = YolosMLPPredictionHead(
input_dim=config.hidden_size, hidden_dim=config.hidden_size, output_dim=4, num_layers=3
)
self.post_init()
@torch.jit.unused
def _set_aux_loss(self, outputs_class, outputs_coord):
return [{"logits": a, "pred_boxes": b} for a, b in zip(outputs_class[:-1], outputs_coord[:-1])]
@add_start_docstrings_to_model_forward(YOLOS_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=YolosObjectDetectionOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
pixel_values: torch.FloatTensor,
labels: Optional[List[Dict]] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
pass
def dice_loss(inputs, targets, num_boxes):
"""
Compute the DICE loss, similar to generalized IOU for masks
Args:
inputs: A float tensor of arbitrary shape.
The predictions for each example.
targets: A float tensor with the same shape as inputs. Stores the binary
classification label for each element in inputs (0 for the negative class and 1 for the positive
class).
"""
inputs = inputs.sigmoid()
inputs = inputs.flatten(1)
numerator = 2 * (inputs * targets).sum(1)
denominator = inputs.sum(-1) + targets.sum(-1)
loss = 1 - (numerator + 1) / (denominator + 1)
return loss.sum() / num_boxes
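# A tiny numeric check (not part of the original file) of the dice loss above: confident,
# correct masks drive the loss towards 0, while completely wrong masks drive it towards 1
# (up to the +1 smoothing terms). The helper name is hypothetical.
import torch

def _dice_demo(logits, targets, num_boxes):
    inputs = logits.sigmoid().flatten(1)
    numerator = 2 * (inputs * targets).sum(1)
    denominator = inputs.sum(-1) + targets.sum(-1)
    return ((1 - (numerator + 1) / (denominator + 1)).sum() / num_boxes).item()

targets = torch.tensor([[1.0, 0.0, 1.0, 0.0]])
print(_dice_demo(torch.tensor([[10.0, -10.0, 10.0, -10.0]]), targets, 1))  # ~0.0
print(_dice_demo(torch.tensor([[-10.0, 10.0, -10.0, 10.0]]), targets, 1))  # ~0.8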
def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2):
"""
Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002.
Args:
inputs (`torch.FloatTensor` of arbitrary shape):
The predictions for each example.
targets (`torch.FloatTensor` with the same shape as `inputs`)
A tensor storing the binary classification label for each element in the `inputs` (0 for the negative class
and 1 for the positive class).
alpha (`float`, *optional*, defaults to `0.25`):
Optional weighting factor in the range (0,1) to balance positive vs. negative examples.
gamma (`int`, *optional*, defaults to `2`):
Exponent of the modulating factor (1 - p_t) to balance easy vs hard examples.
Returns:
Loss tensor
"""
prob = inputs.sigmoid()
ce_loss = nn.functional.binary_cross_entropy_with_logits(inputs, targets, reduction="none")
p_t = prob * targets + (1 - prob) * (1 - targets)
loss = ce_loss * ((1 - p_t) ** gamma)
if alpha >= 0:
alpha_t = alpha * targets + (1 - alpha) * (1 - targets)
loss = alpha_t * loss
return loss.mean(1).sum() / num_boxes
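# A small numeric illustration (not part of the original file) of the focal modulation above:
# the (1 - p_t) ** gamma factor shrinks the plain BCE loss of an easy, confidently correct
# prediction by several orders of magnitude. Values are illustrative.
import torch
from torch import nn

logits = torch.tensor([3.0])   # confident positive prediction
targets = torch.tensor([1.0])
prob = logits.sigmoid()
ce = nn.functional.binary_cross_entropy_with_logits(logits, targets, reduction="none")
p_t = prob * targets + (1 - prob) * (1 - targets)
focal = ce * ((1 - p_t) ** 2)  # gamma = 2
print(ce.item(), focal.item())  # ~0.0486 vs ~1.1e-4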
class YolosLoss(nn.Module):
"""
This class computes the losses for YolosForObjectDetection/YolosForSegmentation. The process happens in two steps: 1)
we compute hungarian assignment between ground truth boxes and the outputs of the model 2) we supervise each pair
of matched ground-truth / prediction (supervise class and box).
A note on the `num_classes` argument (copied from original repo in detr.py): "the naming of the `num_classes`
parameter of the criterion is somewhat misleading. It indeed corresponds to `max_obj_id` + 1, where `max_obj_id` is
the maximum id for a class in your dataset. For example, COCO has a `max_obj_id` of 90, so we pass `num_classes` to
be 91. As another example, for a dataset that has a single class with `id` 1, you should pass `num_classes` to be 2
(`max_obj_id` + 1). For more details on this, check the following discussion
https://github.com/facebookresearch/detr/issues/108#issuecomment-650269223"
Args:
matcher (`YolosHungarianMatcher`):
Module able to compute a matching between targets and proposals.
num_classes (`int`):
Number of object categories, omitting the special no-object category.
eos_coef (`float`):
Relative classification weight applied to the no-object category.
losses (`List[str]`):
List of all the losses to be applied. See `get_loss` for a list of all available losses.
"""
def __init__(self, matcher, num_classes, eos_coef, losses):
super().__init__()
self.matcher = matcher
self.num_classes = num_classes
self.eos_coef = eos_coef
self.losses = losses
empty_weight = torch.ones(self.num_classes + 1)
empty_weight[-1] = self.eos_coef
self.register_buffer("empty_weight", empty_weight)
def loss_labels(self, outputs, targets, indices, num_boxes):
"""
Classification loss (NLL). Targets dicts must contain the key "class_labels" containing a tensor of dim
[nb_target_boxes].
"""
if "logits" not in outputs:
raise KeyError("No logits were found in the outputs")
source_logits = outputs["logits"]
idx = self._get_source_permutation_idx(indices)
target_classes_o = torch.cat([t["class_labels"][J] for t, (_, J) in zip(targets, indices)])
target_classes = torch.full(
source_logits.shape[:2], self.num_classes, dtype=torch.int64, device=source_logits.device
)
target_classes[idx] = target_classes_o
loss_ce = nn.functional.cross_entropy(source_logits.transpose(1, 2), target_classes, self.empty_weight)
losses = {"loss_ce": loss_ce}
return losses
@torch.no_grad()
def loss_cardinality(self, outputs, targets, indices, num_boxes):
"""
Compute the cardinality error, i.e. the absolute error in the number of predicted non-empty boxes.
This is not really a loss, it is intended for logging purposes only. It doesn't propagate gradients.
"""
logits = outputs["logits"]
device = logits.device
target_lengths = torch.as_tensor([len(v["class_labels"]) for v in targets], device=device)
card_pred = (logits.argmax(-1) != logits.shape[-1] - 1).sum(1)
card_err = nn.functional.l1_loss(card_pred.float(), target_lengths.float())
losses = {"cardinality_error": card_err}
return losses
def loss_boxes(self, outputs, targets, indices, num_boxes):
"""
Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss.
Targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4]. The target boxes
are expected in format (center_x, center_y, w, h), normalized by the image size.
"""
if "pred_boxes" not in outputs:
raise KeyError("No predicted boxes found in outputs")
idx = self._get_source_permutation_idx(indices)
source_boxes = outputs["pred_boxes"][idx]
target_boxes = torch.cat([t["boxes"][i] for t, (_, i) in zip(targets, indices)], dim=0)
loss_bbox = nn.functional.l1_loss(source_boxes, target_boxes, reduction="none")
losses = {}
losses["loss_bbox"] = loss_bbox.sum() / num_boxes
loss_giou = 1 - torch.diag(
generalized_box_iou(center_to_corners_format(source_boxes), center_to_corners_format(target_boxes))
)
losses["loss_giou"] = loss_giou.sum() / num_boxes
return losses
def loss_masks(self, outputs, targets, indices, num_boxes):
"""
Compute the losses related to the masks: the focal loss and the dice loss.
Targets dicts must contain the key "masks" containing a tensor of dim [nb_target_boxes, h, w].
"""
if "pred_masks" not in outputs:
raise KeyError("No predicted masks found in outputs")
source_idx = self._get_source_permutation_idx(indices)
target_idx = self._get_target_permutation_idx(indices)
source_masks = outputs["pred_masks"]
source_masks = source_masks[source_idx]
masks = [t["masks"] for t in targets]
target_masks, valid = nested_tensor_from_tensor_list(masks).decompose()
target_masks = target_masks.to(source_masks)
target_masks = target_masks[target_idx]
source_masks = nn.functional.interpolate(
source_masks[:, None], size=target_masks.shape[-2:], mode="bilinear", align_corners=False
)
source_masks = source_masks[:, 0].flatten(1)
target_masks = target_masks.flatten(1)
target_masks = target_masks.view(source_masks.shape)
losses = {
"loss_mask": sigmoid_focal_loss(source_masks, target_masks, num_boxes),
"loss_dice": dice_loss(source_masks, target_masks, num_boxes),
}
return losses
def _get_source_permutation_idx(self, indices):
batch_idx = torch.cat([torch.full_like(source, i) for i, (source, _) in enumerate(indices)])
source_idx = torch.cat([source for (source, _) in indices])
return batch_idx, source_idx
def _get_target_permutation_idx(self, indices):
batch_idx = torch.cat([torch.full_like(target, i) for i, (_, target) in enumerate(indices)])
target_idx = torch.cat([target for (_, target) in indices])
return batch_idx, target_idx
def get_loss(self, loss, outputs, targets, indices, num_boxes):
loss_map = {
"labels": self.loss_labels,
"cardinality": self.loss_cardinality,
"boxes": self.loss_boxes,
"masks": self.loss_masks,
}
if loss not in loss_map:
raise ValueError(f"Loss {loss} not supported")
return loss_map[loss](outputs, targets, indices, num_boxes)
def forward(self, outputs, targets):
"""
This performs the loss computation.
Args:
outputs (`dict`, *optional*):
Dictionary of tensors, see the output specification of the model for the format.
targets (`List[dict]`, *optional*):
List of dicts, such that `len(targets) == batch_size`. The expected keys in each dict depends on the
losses applied, see each loss' doc.
"""
outputs_without_aux = {k: v for k, v in outputs.items() if k != "auxiliary_outputs"}
indices = self.matcher(outputs_without_aux, targets)
num_boxes = sum(len(t["class_labels"]) for t in targets)
num_boxes = torch.as_tensor([num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device)
world_size = 1
if is_accelerate_available():
if PartialState._shared_state != {}:
num_boxes = reduce(num_boxes)
world_size = PartialState().num_processes
num_boxes = torch.clamp(num_boxes / world_size, min=1).item()
losses = {}
for loss in self.losses:
losses.update(self.get_loss(loss, outputs, targets, indices, num_boxes))
if "auxiliary_outputs" in outputs:
for i, auxiliary_outputs in enumerate(outputs["auxiliary_outputs"]):
indices = self.matcher(auxiliary_outputs, targets)
for loss in self.losses:
if loss == "masks":
continue
l_dict = self.get_loss(loss, auxiliary_outputs, targets, indices, num_boxes)
l_dict = {k + f"_{i}": v for k, v in l_dict.items()}
losses.update(l_dict)
return losses
class YolosMLPPredictionHead(nn.Module):
"""
Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates,
height and width of a box with respect to an image.
Copied from https://github.com/facebookresearch/detr/blob/master/models/detr.py
"""
def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
super().__init__()
self.num_layers = num_layers
h = [hidden_dim] * (num_layers - 1)
self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))
def forward(self, x):
for i, layer in enumerate(self.layers):
x = nn.functional.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
return x
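# A toy instantiation (not part of the original file) showing the layer sizes produced by the
# MLP head above, with illustrative dimensions. In DETR-style detection heads, a sigmoid is
# applied to the 4 outputs afterwards to obtain normalized (center_x, center_y, width, height).
from torch import nn

input_dim, hidden_dim, output_dim, num_layers = 16, 16, 4, 3
h = [hidden_dim] * (num_layers - 1)
layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))
print([(layer.in_features, layer.out_features) for layer in layers])  # [(16, 16), (16, 16), (16, 4)]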
class YolosHungarianMatcher(nn.Module):
"""
This class computes an assignment between the targets and the predictions of the network.
For efficiency reasons, the targets don't include the no-object class. Because of this, in general there are
more predictions than targets. In this case, we do a 1-to-1 matching of the best predictions, while the others
are un-matched (and thus treated as non-objects).
Args:
    class_cost:
        The relative weight of the classification error in the matching cost.
    bbox_cost:
        The relative weight of the L1 error of the bounding box coordinates in the matching cost.
    giou_cost:
        The relative weight of the giou loss of the bounding box in the matching cost.
"""
def __init__(self, class_cost: float = 1, bbox_cost: float = 1, giou_cost: float = 1):
super().__init__()
requires_backends(self, ["scipy"])
self.class_cost = class_cost
self.bbox_cost = bbox_cost
self.giou_cost = giou_cost
if class_cost == 0 and bbox_cost == 0 and giou_cost == 0:
raise ValueError("All costs of the Matcher can't be 0")
@torch.no_grad()
def forward(self, outputs, targets):
"""
Args:
outputs (`dict`):
A dictionary that contains at least these entries:
* "logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits
* "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates.
targets (`List[dict]`):
A list of targets (len(targets) = batch_size), where each target is a dict containing:
* "class_labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of
ground-truth objects in the target) containing the class labels
* "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates.
Returns:
`List[Tuple]`: A list of size `batch_size`, containing tuples of (index_i, index_j) where:
- index_i is the indices of the selected predictions (in order)
- index_j is the indices of the corresponding selected targets (in order)
For each batch element, it holds: len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
"""
batch_size, num_queries = outputs["logits"].shape[:2]
out_prob = outputs["logits"].flatten(0, 1).softmax(-1)
out_bbox = outputs["pred_boxes"].flatten(0, 1)
target_ids = torch.cat([v["class_labels"] for v in targets])
target_bbox = torch.cat([v["boxes"] for v in targets])
class_cost = -out_prob[:, target_ids]
bbox_cost = torch.cdist(out_bbox, target_bbox, p=1)
giou_cost = -generalized_box_iou(center_to_corners_format(out_bbox), center_to_corners_format(target_bbox))
cost_matrix = self.bbox_cost * bbox_cost + self.class_cost * class_cost + self.giou_cost * giou_cost
cost_matrix = cost_matrix.view(batch_size, num_queries, -1).cpu()
sizes = [len(v["boxes"]) for v in targets]
indices = [linear_sum_assignment(c[i]) for i, c in enumerate(cost_matrix.split(sizes, -1))]
return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices]
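# A hand-sized example (not part of the original file) of the assignment step above: the matcher
# builds a (num_queries x num_targets) cost matrix and hands it to scipy's linear_sum_assignment.
import numpy as np
from scipy.optimize import linear_sum_assignment

# 3 queries, 2 ground-truth boxes; lower cost = better match.
cost = np.array([[0.9, 0.1],
                 [0.2, 0.8],
                 [0.5, 0.5]])
row_ind, col_ind = linear_sum_assignment(cost)
print(row_ind, col_ind)  # [0 1] [1 0] -> query 0 is matched to target 1, query 1 to target 0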
def _upcast(t: Tensor) -> Tensor:
if t.is_floating_point():
return t if t.dtype in (torch.float32, torch.float64) else t.float()
else:
return t if t.dtype in (torch.int32, torch.int64) else t.int()
def box_area(boxes: Tensor) -> Tensor:
"""
Computes the area of a set of bounding boxes, which are specified by their (x1, y1, x2, y2) coordinates.
Args:
    boxes (`torch.FloatTensor` of shape `(number_of_boxes, 4)`):
        Boxes for which the area will be computed. They are expected to be in (x1, y1, x2, y2) format with
        `0 <= x1 < x2` and `0 <= y1 < y2`.
Returns:
    `torch.FloatTensor`: a tensor containing the area for each box.
"""
boxes = _upcast(boxes)
return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
def box_iou(boxes1, boxes2):
area1 = box_area(boxes1)
area2 = box_area(boxes2)
left_top = torch.max(boxes1[:, None, :2], boxes2[:, :2])
right_bottom = torch.min(boxes1[:, None, 2:], boxes2[:, 2:])
width_height = (right_bottom - left_top).clamp(min=0)
inter = width_height[:, :, 0] * width_height[:, :, 1]
union = area1[:, None] + area2 - inter
iou = inter / union
return iou, union
def generalized_box_iou(boxes1, boxes2):
"""
Generalized IoU from https://giou.stanford.edu/. The boxes should be in [x0, y0, x1, y1] (top-left and
bottom-right corner) format.
Returns:
    `torch.FloatTensor`: a [N, M] pairwise matrix, where N = len(boxes1) and M = len(boxes2)
"""
if not (boxes1[:, 2:] >= boxes1[:, :2]).all():
    raise ValueError(f"boxes1 must be in [x0, y0, x1, y1] (corner) format, but got {boxes1}")
if not (boxes2[:, 2:] >= boxes2[:, :2]).all():
    raise ValueError(f"boxes2 must be in [x0, y0, x1, y1] (corner) format, but got {boxes2}")
iou, union = box_iou(boxes1, boxes2)
top_left = torch.min(boxes1[:, None, :2], boxes2[:, :2])
bottom_right = torch.max(boxes1[:, None, 2:], boxes2[:, 2:])
width_height = (bottom_right - top_left).clamp(min=0)
area = width_height[:, :, 0] * width_height[:, :, 1]
return iou - (area - union) / area
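# A quick numeric check (not part of the original file) of box_iou / generalized_box_iou defined
# above, using corner-format boxes: a disjoint pair gets IoU 0, but its GIoU is negative and still
# reflects how far apart the boxes are.
import torch

boxes1 = torch.tensor([[0.0, 0.0, 2.0, 2.0]])
boxes2 = torch.tensor([[1.0, 1.0, 3.0, 3.0],   # overlaps boxes1 in a 1x1 square
                       [3.0, 3.0, 4.0, 4.0]])  # disjoint from boxes1
iou, _ = box_iou(boxes1, boxes2)
giou = generalized_box_iou(boxes1, boxes2)
print(iou)   # tensor([[0.1429, 0.0000]]) -> 1 / (4 + 4 - 1) for the overlapping pair
print(giou)  # roughly [[-0.0794, -0.6875]]: IoU minus the empty share of the enclosing box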
def _max_by_axis(the_list):
maxes = the_list[0]
for sublist in the_list[1:]:
for index, item in enumerate(sublist):
maxes[index] = max(maxes[index], item)
return maxes
class NestedTensor(object):
def __init__(self, tensors, mask: Optional[Tensor]):
self.tensors = tensors
self.mask = mask
def to(self, device):
cast_tensor = self.tensors.to(device)
mask = self.mask
if mask is not None:
cast_mask = mask.to(device)
else:
cast_mask = None
return NestedTensor(cast_tensor, cast_mask)
def decompose(self):
return self.tensors, self.mask
def __repr__(self):
return str(self.tensors)
def nested_tensor_from_tensor_list(tensor_list: List[Tensor]):
if tensor_list[0].ndim == 3:
max_size = _max_by_axis([list(img.shape) for img in tensor_list])
batch_shape = [len(tensor_list)] + max_size
batch_size, num_channels, height, width = batch_shape
dtype = tensor_list[0].dtype
device = tensor_list[0].device
tensor = torch.zeros(batch_shape, dtype=dtype, device=device)
mask = torch.ones((batch_size, height, width), dtype=torch.bool, device=device)
for img, pad_img, m in zip(tensor_list, tensor, mask):
pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img)
m[: img.shape[1], : img.shape[2]] = False
else:
raise ValueError("Only 3-dimensional tensors are supported")
return NestedTensor(tensor, mask)
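# A short demo (not part of the original file) of nested_tensor_from_tensor_list defined above:
# two (C, H, W) images of different sizes are padded to a common shape, and the boolean mask
# marks padded pixels with True.
import torch

images = [torch.ones(3, 2, 2), torch.ones(3, 3, 4)]
tensors, mask = nested_tensor_from_tensor_list(images).decompose()
print(tensors.shape)  # torch.Size([2, 3, 3, 4]) -> padded to the per-axis maximum
print(mask[0])        # False where the first 2x2 image lives, True over the padding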