Transformers 源码解析（三十八）

`.\models\depth_anything\convert_depth_anything_to_hf.py`

    # coding=utf-8
    # 版权 2024 年 HuggingFace Inc. 团队所有。
    #
    # 根据 Apache 许可证 2.0 版本进行许可；
    # 除非符合许可证的要求，否则不得使用此文件。
    # 您可以在以下网址获取许可证的副本：
    #
    #     http://www.apache.org/licenses/LICENSE-2.0
    #
    # 除非适用法律要求或书面同意，本软件按"原样"分发，
    # 不作任何明示或暗示的担保或条件。
    # 请参阅许可证了解具体的管理权限和限制。
    """从原始仓库转换 Depth Anything 检查点。URL:
    https://github.com/LiheYoung/Depth-Anything"""

    
    import argparse  # 导入命令行参数解析模块
    from pathlib import Path  # 导入处理文件路径的模块
    
    import requests  # 导入处理 HTTP 请求的模块
    import torch  # 导入 PyTorch 深度学习库
    from huggingface_hub import hf_hub_download  # 导入 Hugging Face Hub 下载模块
    from PIL import Image  # 导入处理图像的模块
    
    from transformers import DepthAnythingConfig, DepthAnythingForDepthEstimation, Dinov2Config, DPTImageProcessor
    from transformers.utils import logging  # 导入日志记录模块
    
    logging.set_verbosity_info()  # Set the transformers logging verbosity to INFO
    logger = logging.get_logger(__name__)  # Logger named after this module
    
    
    def get_dpt_config(model_name):
        """Build the `DepthAnythingConfig` that matches a Depth Anything checkpoint name.

        The variant is chosen by substring match on `model_name`; "small", "base" and
        "large" are tried in that order and the first hit wins.

        Args:
            model_name: Checkpoint name, e.g. "depth-anything-small".

        Returns:
            A `DepthAnythingConfig` wrapping the matching DINOv2 backbone config.

        Raises:
            NotImplementedError: If `model_name` contains none of the known variant names.
        """
        # (variant substring, DINOv2 checkpoint, backbone out_indices, fusion size, neck sizes)
        variants = (
            ("small", "facebook/dinov2-small", [9, 10, 11, 12], 64, [48, 96, 192, 384]),
            ("base", "facebook/dinov2-base", [9, 10, 11, 12], 128, [96, 192, 384, 768]),
            ("large", "facebook/dinov2-large", [21, 22, 23, 24], 256, [256, 512, 1024, 1024]),
        )

        for variant, checkpoint, out_indices, fusion_hidden_size, neck_hidden_sizes in variants:
            if variant in model_name:
                break
        else:
            # No known variant name appears in `model_name`
            raise NotImplementedError("To do")

        backbone_config = Dinov2Config.from_pretrained(
            checkpoint, out_indices=out_indices, apply_layernorm=True, reshape_hidden_states=False
        )

        # The reassemble width and patch size are taken directly from the backbone config
        return DepthAnythingConfig(
            reassemble_hidden_size=backbone_config.hidden_size,
            patch_size=backbone_config.patch_size,
            backbone_config=backbone_config,
            fusion_hidden_size=fusion_hidden_size,
            neck_hidden_sizes=neck_hidden_sizes,
        )
    
    
    def create_rename_keys(config):
        """Return the `(original_key, hf_key)` pairs used to rename checkpoint weights.

        Covers the backbone embeddings, every transformer block except its fused qkv
        projection (handled separately by `read_in_q_k_v`), the final backbone layer norm,
        the neck (reassemble stage, fusion stage, scratch convolutions) and the head.

        Args:
            config: Object exposing `backbone_config.num_hidden_layers`.

        Returns:
            A list of 2-tuples of strings: key in the original state dict, key in the
            Hugging Face state dict.
        """
        rename_keys = []

        # fmt: off
        # stem
        rename_keys.append(("pretrained.cls_token", "backbone.embeddings.cls_token"))
        rename_keys.append(("pretrained.mask_token", "backbone.embeddings.mask_token"))
        rename_keys.append(("pretrained.pos_embed", "backbone.embeddings.position_embeddings"))
        rename_keys.append(("pretrained.patch_embed.proj.weight", "backbone.embeddings.patch_embeddings.projection.weight"))
        rename_keys.append(("pretrained.patch_embed.proj.bias", "backbone.embeddings.patch_embeddings.projection.bias"))

        # Transformer encoder: one group of keys per backbone block
        for i in range(config.backbone_config.num_hidden_layers):
            # layer-scale parameters (`gamma` in the original, `lambda1` in the HF model)
            rename_keys.append((f"pretrained.blocks.{i}.ls1.gamma", f"backbone.encoder.layer.{i}.layer_scale1.lambda1"))
            rename_keys.append((f"pretrained.blocks.{i}.ls2.gamma", f"backbone.encoder.layer.{i}.layer_scale2.lambda1"))
            # layer norms
            rename_keys.append((f"pretrained.blocks.{i}.norm1.weight", f"backbone.encoder.layer.{i}.norm1.weight"))
            rename_keys.append((f"pretrained.blocks.{i}.norm1.bias", f"backbone.encoder.layer.{i}.norm1.bias"))
            rename_keys.append((f"pretrained.blocks.{i}.norm2.weight", f"backbone.encoder.layer.{i}.norm2.weight"))
            rename_keys.append((f"pretrained.blocks.{i}.norm2.bias", f"backbone.encoder.layer.{i}.norm2.bias"))
            # MLP
            rename_keys.append((f"pretrained.blocks.{i}.mlp.fc1.weight", f"backbone.encoder.layer.{i}.mlp.fc1.weight"))
            rename_keys.append((f"pretrained.blocks.{i}.mlp.fc1.bias", f"backbone.encoder.layer.{i}.mlp.fc1.bias"))
            rename_keys.append((f"pretrained.blocks.{i}.mlp.fc2.weight", f"backbone.encoder.layer.{i}.mlp.fc2.weight"))
            rename_keys.append((f"pretrained.blocks.{i}.mlp.fc2.bias", f"backbone.encoder.layer.{i}.mlp.fc2.bias"))
            # attention output projection
            rename_keys.append((f"pretrained.blocks.{i}.attn.proj.weight", f"backbone.encoder.layer.{i}.attention.output.dense.weight"))
            rename_keys.append((f"pretrained.blocks.{i}.attn.proj.bias", f"backbone.encoder.layer.{i}.attention.output.dense.bias"))

        # final backbone layer norm
        rename_keys.append(("pretrained.norm.weight", "backbone.layernorm.weight"))
        rename_keys.append(("pretrained.norm.bias", "backbone.layernorm.bias"))

        # activation postprocessing (readout projections + resize blocks)
        # Depth Anything does not use CLS token => readout_projects not required

        for i in range(4):
            rename_keys.append((f"depth_head.projects.{i}.weight", f"neck.reassemble_stage.layers.{i}.projection.weight"))
            rename_keys.append((f"depth_head.projects.{i}.bias", f"neck.reassemble_stage.layers.{i}.projection.bias"))

            # index 2 is skipped: no resize weights are renamed for that stage
            if i != 2:
                rename_keys.append((f"depth_head.resize_layers.{i}.weight", f"neck.reassemble_stage.layers.{i}.resize.weight"))
                rename_keys.append((f"depth_head.resize_layers.{i}.bias", f"neck.reassemble_stage.layers.{i}.resize.bias"))

        # refinenet (tricky here): refinenet1..4 map to fusion layers 3..0, i.e. in reverse order
        mapping = {1: 3, 2: 2, 3: 1, 4: 0}

        for i in range(1, 5):
            j = mapping[i]
            rename_keys.append((f"depth_head.scratch.refinenet{i}.out_conv.weight", f"neck.fusion_stage.layers.{j}.projection.weight"))
            rename_keys.append((f"depth_head.scratch.refinenet{i}.out_conv.bias", f"neck.fusion_stage.layers.{j}.projection.bias"))
            rename_keys.append((f"depth_head.scratch.refinenet{i}.resConfUnit1.conv1.weight", f"neck.fusion_stage.layers.{j}.residual_layer1.convolution1.weight"))
            rename_keys.append((f"depth_head.scratch.refinenet{i}.resConfUnit1.conv1.bias", f"neck.fusion_stage.layers.{j}.residual_layer1.convolution1.bias"))
            rename_keys.append((f"depth_head.scratch.refinenet{i}.resConfUnit1.conv2.weight", f"neck.fusion_stage.layers.{j}.residual_layer1.convolution2.weight"))
            rename_keys.append((f"depth_head.scratch.refinenet{i}.resConfUnit1.conv2.bias", f"neck.fusion_stage.layers.{j}.residual_layer1.convolution2.bias"))
            rename_keys.append((f"depth_head.scratch.refinenet{i}.resConfUnit2.conv1.weight", f"neck.fusion_stage.layers.{j}.residual_layer2.convolution1.weight"))
            rename_keys.append((f"depth_head.scratch.refinenet{i}.resConfUnit2.conv1.bias", f"neck.fusion_stage.layers.{j}.residual_layer2.convolution1.bias"))
            rename_keys.append((f"depth_head.scratch.refinenet{i}.resConfUnit2.conv2.weight", f"neck.fusion_stage.layers.{j}.residual_layer2.convolution2.weight"))
            rename_keys.append((f"depth_head.scratch.refinenet{i}.resConfUnit2.conv2.bias", f"neck.fusion_stage.layers.{j}.residual_layer2.convolution2.bias"))

        # scratch convolutions (only weights are renamed; no bias keys are listed)
        for i in range(4):
            rename_keys.append((f"depth_head.scratch.layer{i+1}_rn.weight", f"neck.convs.{i}.weight"))

        # head
        rename_keys.append(("depth_head.scratch.output_conv1.weight", "head.conv1.weight"))
        rename_keys.append(("depth_head.scratch.output_conv1.bias", "head.conv1.bias"))
        rename_keys.append(("depth_head.scratch.output_conv2.0.weight", "head.conv2.weight"))
        rename_keys.append(("depth_head.scratch.output_conv2.0.bias", "head.conv2.bias"))
        rename_keys.append(("depth_head.scratch.output_conv2.2.weight", "head.conv3.weight"))
        rename_keys.append(("depth_head.scratch.output_conv2.2.bias", "head.conv3.bias"))

        return rename_keys
# Split each encoder layer's fused qkv projection into separate query, key and value entries
def read_in_q_k_v(state_dict, config):
    """Replace the fused qkv weights of every backbone block with separate q/k/v entries.

    For each block the original `attn.qkv` weight and bias are popped from `state_dict`
    and sliced along the first dimension into three chunks of `hidden_size` rows
    (query first, key second, value last). `state_dict` is modified in place.

    Args:
        state_dict: Mapping holding `pretrained.blocks.{i}.attn.qkv.weight` / `.bias` entries.
        config: Object exposing `backbone_config.hidden_size` and
            `backbone_config.num_hidden_layers`.
    """
    dim = config.backbone_config.hidden_size

    # (projection name, rows of the fused matrix that belong to it)
    row_slices = (
        ("query", slice(None, dim)),
        ("key", slice(dim, dim * 2)),
        ("value", slice(-dim, None)),
    )

    for layer_idx in range(config.backbone_config.num_hidden_layers):
        # In the original implementation this is a single matrix plus bias
        fused_weight = state_dict.pop(f"pretrained.blocks.{layer_idx}.attn.qkv.weight")
        fused_bias = state_dict.pop(f"pretrained.blocks.{layer_idx}.attn.qkv.bias")

        prefix = f"backbone.encoder.layer.{layer_idx}.attention.attention"
        for name, rows in row_slices:
            state_dict[f"{prefix}.{name}.weight"] = fused_weight[rows, :]
            state_dict[f"{prefix}.{name}.bias"] = fused_bias[rows]


# Move the value stored under one dictionary key to another key
def rename_key(dct, old, new):
    """Re-register the value of `dct[old]` under `new`, in place.

    Raises:
        KeyError: If `old` is not present in `dct`.
    """
    dct[new] = dct.pop(old)


# Fetch the COCO validation image (two cats) used to check the converted model
def prepare_img():
    """Download the reference image over HTTP and return it opened as a PIL image."""
    response = requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True)
    # `Image.open` reads from the raw response stream
    return Image.open(response.raw)


# Convert an original Depth Anything checkpoint to the Hugging Face format; the decorator
# below disables gradient tracking for the whole function.
@torch.no_grad()
def convert_dpt_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub, verify_logits):
    """
    Copy/paste/tweak model's weights to our DPT structure.

    Args:
        model_name: One of "depth-anything-small", "depth-anything-base" or "depth-anything-large".
        pytorch_dump_folder_path: Directory in which to save the converted model and image
            processor, or `None` to skip saving. Missing parent directories are created.
        push_to_hub: If truthy, push the model and processor to `LiheYoung/{model_name}-hf`.
        verify_logits: If truthy, compare the predicted depth for the reference image against
            hard-coded expected values.

    Raises:
        NotImplementedError: If `get_dpt_config` does not recognize `model_name`.
        KeyError: If `model_name` has no entry in the checkpoint filename table.
        AssertionError: If verification is requested and the outputs do not match.
    """

    # define DPT configuration
    config = get_dpt_config(model_name)

    # original checkpoint file for each supported model name
    model_name_to_filename = {
        "depth-anything-small": "depth_anything_vits14.pth",
        "depth-anything-base": "depth_anything_vitb14.pth",
        "depth-anything-large": "depth_anything_vitl14.pth",
    }

    # load original state_dict from the Depth Anything Space on the Hugging Face Hub
    filename = model_name_to_filename[model_name]
    filepath = hf_hub_download(
        repo_id="LiheYoung/Depth-Anything", filename=f"checkpoints/{filename}", repo_type="space"
    )
    # NOTE(review): `torch.load` unpickles the file, which can execute arbitrary code; this is
    # only acceptable as long as the checkpoint source above is trusted.
    state_dict = torch.load(filepath, map_location="cpu")

    # rename keys
    rename_keys = create_rename_keys(config)
    for src, dest in rename_keys:
        rename_key(state_dict, src, dest)

    # read in qkv matrices (the fused projections are not part of `rename_keys`)
    read_in_q_k_v(state_dict, config)

    # load HuggingFace model; `load_state_dict` is strict, so any leftover or missing key fails here
    model = DepthAnythingForDepthEstimation(config)
    model.load_state_dict(state_dict)
    model.eval()

    processor = DPTImageProcessor(
        do_resize=True,
        size={"height": 518, "width": 518},
        ensure_multiple_of=14,
        keep_aspect_ratio=True,
        do_rescale=True,
        do_normalize=True,
        image_mean=[0.485, 0.456, 0.406],
        image_std=[0.229, 0.224, 0.225],
    )

    # reference image, fetched by the shared helper instead of duplicating the download code
    image = prepare_img()

    pixel_values = processor(image, return_tensors="pt").pixel_values

    # forward pass; gradients are already disabled by the `@torch.no_grad()` decorator
    outputs = model(pixel_values)
    predicted_depth = outputs.predicted_depth

    print("Shape of predicted depth:", predicted_depth.shape)
    print("First values:", predicted_depth[0, :3, :3])

    # assert logits
    if verify_logits:
        expected_shape = torch.Size([1, 518, 686])
        if model_name == "depth-anything-small":
            expected_slice = torch.tensor(
                [[8.8204, 8.6468, 8.6195], [8.3313, 8.6027, 8.7526], [8.6526, 8.6866, 8.7453]],
            )
        elif model_name == "depth-anything-base":
            expected_slice = torch.tensor(
                [[26.3997, 26.3004, 26.3928], [26.2260, 26.2092, 26.3427], [26.0719, 26.0483, 26.1254]],
            )
        elif model_name == "depth-anything-large":
            expected_slice = torch.tensor(
                [[87.9968, 87.7493, 88.2704], [87.1927, 87.6611, 87.3640], [86.7789, 86.9469, 86.7991]]
            )
        else:
            raise ValueError("Not supported")

        assert predicted_depth.shape == expected_shape
        # top-left 3x3 patch must match the reference values within an absolute tolerance of 1e-6
        assert torch.allclose(predicted_depth[0, :3, :3], expected_slice, atol=1e-6)
        print("Looks ok!")

    if pytorch_dump_folder_path is not None:
        # `parents=True` so that a nested output path does not fail on a missing parent directory
        Path(pytorch_dump_folder_path).mkdir(parents=True, exist_ok=True)
        print(f"Saving model and processor to {pytorch_dump_folder_path}")
        model.save_pretrained(pytorch_dump_folder_path)
        processor.save_pretrained(pytorch_dump_folder_path)

    if push_to_hub:
        print("Pushing model and processor to hub...")
        model.push_to_hub(repo_id=f"LiheYoung/{model_name}-hf")
        processor.push_to_hub(repo_id=f"LiheYoung/{model_name}-hf")
if __name__ == "__main__":
    # Model names this script can convert. Listed explicitly because no `name_to_checkpoint`
    # table is defined in this file; the names match the ones handled by `convert_dpt_checkpoint`.
    supported_model_names = ["depth-anything-small", "depth-anything-base", "depth-anything-large"]

    parser = argparse.ArgumentParser()

    # Name of the checkpoint to convert; restricted to the supported names above
    parser.add_argument(
        "--model_name",
        default="depth-anything-small",
        type=str,
        choices=supported_model_names,
        help="Name of the model you'd like to convert.",
    )

    # Optional output directory for the converted model and image processor
    parser.add_argument(
        "--pytorch_dump_folder_path",
        default=None,
        type=str,
        help="Path to the output PyTorch model directory.",
    )

    # Opt-in flag: push the converted model to the Hub
    parser.add_argument(
        "--push_to_hub",
        action="store_true",
        help="Whether to push the model to the hub after conversion.",
    )

    # `store_false` means verification is ON by default and passing the flag turns it OFF;
    # the help text says so explicitly.
    parser.add_argument(
        "--verify_logits",
        action="store_false",
        required=False,
        help="Pass this flag to skip verifying the logits after conversion (verification runs by default).",
    )

    args = parser.parse_args()

    # Run the conversion with the parsed command-line options
    convert_dpt_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub, args.verify_logits)

`.\models\depth_anything\modeling_depth_anything.py`

# 指定文件编码为 UTF-8

# 版权声明和信息，指出版权归 TikTok 和 HuggingFace Inc. 团队所有，保留所有权利
# 根据 Apache 许可证 2.0 版本，除非符合许可证的要求，否则不得使用此文件
# 可以在以下网址获取许可证的副本：http://www.apache.org/licenses/LICENSE-2.0
# 如果适用法律要求或书面同意，本软件是基于“按原样”提供的，没有任何形式的明示或暗示的保证或条件
""" PyTorch Depth Anything model."""

# 引入必要的模块和类型提示
from typing import List, Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn

# 引入 HuggingFace 提供的实用函数和模块
from ...file_utils import (
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    replace_return_docstrings,
)
from ...modeling_outputs import DepthEstimatorOutput
from ...modeling_utils import PreTrainedModel
from ...utils import logging
from ..auto import AutoBackbone

# 引入 DepthAnythingConfig 配置类
from .configuration_depth_anything import DepthAnythingConfig

# Logger named after this module
logger = logging.get_logger(__name__)

# Name of the configuration class, passed to the docstring decorators
_CONFIG_FOR_DOC = "DepthAnythingConfig"

# Pretrained Depth Anything checkpoints listed for this model
DEPTH_ANYTHING_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "LiheYoung/depth-anything-small-hf",
    # See all Depth Anything models at https://huggingface.co/models?filter=depth_anything
]

# Shared class-level docstring: the model is a regular `torch.nn.Module`; also documents `config`
DEPTH_ANYTHING_START_DOCSTRING = r"""
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
    as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`DepthAnythingConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

# Shared docstring describing the arguments of the model's `forward` method
DEPTH_ANYTHING_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`DPTImageProcessor.__call__`]
            for details.

        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""

class DepthAnythingReassembleLayer(nn.Module):
    """
    One reassemble layer: a 1x1 channel projection followed by a spatial resize.

    Args:
        config: Configuration exposing `reassemble_hidden_size` (input channel count).
        channels: Number of output channels of the projection.
        factor: Resize factor. Values above 1 upsample with a transposed convolution, 1 keeps
            the resolution, values below 1 downsample with a strided 3x3 convolution.
    """

    def __init__(self, config, channels, factor):
        super().__init__()
        # 1x1 convolution mapping the backbone width to `channels`
        self.projection = nn.Conv2d(config.reassemble_hidden_size, channels, kernel_size=1)

        # up/down sampling depending on factor
        if factor == 1:
            # resolution unchanged
            self.resize = nn.Identity()
        elif factor > 1:
            # upsample: kernel size and stride both equal `factor`
            self.resize = nn.ConvTranspose2d(channels, channels, kernel_size=factor, stride=factor, padding=0)
        elif factor < 1:
            # downsample: 3x3 convolution whose stride is the integer part of 1 / factor
            stride = int(1 / factor)
            self.resize = nn.Conv2d(channels, channels, kernel_size=3, stride=stride, padding=1)

    # Copied from transformers.models.dpt.modeling_dpt.DPTReassembleLayer.forward
    def forward(self, hidden_state):
        # project the channels first, then resize spatially
        return self.resize(self.projection(hidden_state))
class DepthAnythingReassembleStage(nn.Module):
    """
    Turns the backbone's token sequences into image-like feature maps at several resolutions.

    For every hidden state:
    1. The first token is dropped and the remaining tokens are reshaped to a
       `(batch_size, num_channels, patch_height, patch_width)` map.
    2. The channel dimension is projected according to `config.neck_hidden_sizes`.
    3. The spatial dimensions are resized according to `config.reassemble_factors`.

    Args:
        config (`[DepthAnythingConfig]`):
            Model configuration class defining the model architecture.
    """

    def __init__(self, config):
        super().__init__()

        self.config = config

        # one reassemble layer per (channels, factor) pair of the configuration
        self.layers = nn.ModuleList(
            [
                DepthAnythingReassembleLayer(config, channels=channels, factor=factor)
                for channels, factor in zip(config.neck_hidden_sizes, config.reassemble_factors)
            ]
        )

    def forward(self, hidden_states: List[torch.Tensor], patch_height=None, patch_width=None) -> List[torch.Tensor]:
        """
        Args:
            hidden_states (`List[torch.FloatTensor]`, each of shape `(batch_size, sequence_length + 1, hidden_size)`):
                List of hidden states from the backbone.
            patch_height: Number of patches along the height, used to reshape the tokens.
            patch_width: Number of patches along the width, used to reshape the tokens.

        Returns:
            List with one reassembled feature map per input hidden state.
        """
        reassembled = []

        for index, tokens in enumerate(hidden_states):
            # drop the first token of the sequence
            patch_tokens = tokens[:, 1:]
            batch_size, _, num_channels = patch_tokens.shape

            # (batch, tokens, channels) -> (batch, height, width, channels) -> (batch, channels, height, width)
            feature_map = (
                patch_tokens.reshape(batch_size, patch_height, patch_width, num_channels)
                .permute(0, 3, 1, 2)
                .contiguous()
            )

            # layers are looked up by position, so the i-th hidden state uses the i-th layer
            reassembled.append(self.layers[index](feature_map))

        return reassembled


class DepthAnythingPreActResidualLayer(nn.Module):
    """
    Pre-activation residual unit (ResidualConvUnit): ReLU -> conv -> ReLU -> conv, plus a skip
    connection from the input.

    Args:
        config (`[DepthAnythingConfig]`):
            Model configuration class defining the model architecture.
    """

    def __init__(self, config):
        super().__init__()

        width = config.fusion_hidden_size
        # both convolutions are 3x3, stride 1, padding 1, so the spatial size is preserved
        conv_kwargs = {"kernel_size": 3, "stride": 1, "padding": 1, "bias": True}

        self.activation1 = nn.ReLU()
        self.convolution1 = nn.Conv2d(width, width, **conv_kwargs)

        self.activation2 = nn.ReLU()
        self.convolution2 = nn.Conv2d(width, width, **conv_kwargs)

    def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
        # first activation + convolution
        out = self.convolution1(self.activation1(hidden_state))
        # second activation + convolution
        out = self.convolution2(self.activation2(out))

        # skip connection: add the untouched input back
        return out + hidden_state
class DepthAnythingFeatureFusionLayer(nn.Module):
    """Feature fusion layer: merges an incoming feature map with an optional residual map,
    upsamples the result and projects it.

    Args:
        config (`[DepthAnythingConfig]`):
            Model configuration class defining the model architecture.
    """

    def __init__(self, config):
        super().__init__()

        # 1x1 projection applied after upsampling
        self.projection = nn.Conv2d(config.fusion_hidden_size, config.fusion_hidden_size, kernel_size=1, bias=True)

        # residual units: the first one processes the residual input, the second the merged map
        self.residual_layer1 = DepthAnythingPreActResidualLayer(config)
        self.residual_layer2 = DepthAnythingPreActResidualLayer(config)

    def forward(self, hidden_state, residual=None, size=None):
        if residual is not None:
            if hidden_state.shape != residual.shape:
                # bring the residual to the spatial size of `hidden_state` before adding
                target = (hidden_state.shape[2], hidden_state.shape[3])
                residual = nn.functional.interpolate(residual, size=target, mode="bilinear", align_corners=False)
            hidden_state = hidden_state + self.residual_layer1(residual)

        hidden_state = self.residual_layer2(hidden_state)

        # upsample: to the explicit `size` when given, otherwise by a factor of 2
        if size is None:
            upsampled = nn.functional.interpolate(
                hidden_state, scale_factor=2, mode="bilinear", align_corners=True
            )
        else:
            upsampled = nn.functional.interpolate(hidden_state, size=size, mode="bilinear", align_corners=True)

        return self.projection(upsampled)


class DepthAnythingFeatureFusionStage(nn.Module):
    """Chains the feature fusion layers, starting from the last feature map in the input list."""

    # Copied from transformers.models.dpt.modeling_dpt.DPTFeatureFusionStage.__init__ with DPT->DepthAnything
    def __init__(self, config):
        super().__init__()

        # one fusion layer per entry of `config.neck_hidden_sizes`
        self.layers = nn.ModuleList(
            [DepthAnythingFeatureFusionLayer(config) for _ in range(len(config.neck_hidden_sizes))]
        )

    def forward(self, hidden_states, size=None):
        # NOTE: the `size` argument is overwritten below before it is ever read.
        # reversing the hidden_states, we start from the last
        features = hidden_states[::-1]
        last_idx = len(features) - 2  # loop index of the final fusion step

        # the first layer only sees the last feature map, resized to the spatial size of the next one
        size = features[1].shape[2:]
        fused = self.layers[0](features[0], size=size)
        outputs = [fused]

        # every following layer merges the running result with the next feature map
        for idx, (feature, layer) in enumerate(zip(features[1:], self.layers[1:])):
            # resize to the next map's spatial size; the final step passes `None`
            size = features[idx + 2].shape[2:] if idx != last_idx else None
            fused = layer(fused, feature, size=size)
            outputs.append(fused)

        return outputs


# Copied from transformers.models.dpt.modeling_dpt.DPTPreTrainedModel with DPT->DepthAnything,dpt->depth_anything
class DepthAnythingPreTrainedModel(PreTrainedModel):
    """
    Abstract base class that handles weight initialization and provides the common interface for
    downloading and loading pretrained models.
    """

    # configuration class associated with this model family
    config_class = DepthAnythingConfig
    # prefix used for the base model
    base_model_prefix = "depth_anything"
    # name of the main model input
    main_input_name = "pixel_values"
    # gradient checkpointing is supported
    supports_gradient_checkpointing = True

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, nn.LayerNorm):
            # layer norm: zero bias, unit weight
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
            return

        if not isinstance(module, (nn.Linear, nn.Conv2d, nn.ConvTranspose2d)):
            # every other module type is left untouched
            return

        # linear and (transposed) convolution layers: normal init with the configured std
        module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
        if module.bias is not None:
            module.bias.data.zero_()
class DepthAnythingNeck(nn.Module):
    """
    DepthAnythingNeck. A neck sits between the backbone and the head: it takes a list of tensors
    and produces another list of tensors. For DepthAnything it chains:

    * DepthAnythingReassembleStage
    * one 3x3 convolution per feature map, mapping it to `config.fusion_hidden_size` channels
    * DepthAnythingFeatureFusionStage.

    Args:
        config: Model configuration (reads `neck_hidden_sizes` and `fusion_hidden_size`).
    """

    def __init__(self, config):
        super().__init__()
        self.config = config

        self.reassemble_stage = DepthAnythingReassembleStage(config)

        # bias-free 3x3 convolutions bringing every reassembled map to the fusion width
        self.convs = nn.ModuleList(
            [
                nn.Conv2d(channel, config.fusion_hidden_size, kernel_size=3, padding=1, bias=False)
                for channel in config.neck_hidden_sizes
            ]
        )

        # fusion
        self.fusion_stage = DepthAnythingFeatureFusionStage(config)

    def forward(self, hidden_states: List[torch.Tensor], patch_height=None, patch_width=None) -> List[torch.Tensor]:
        """
        Args:
            hidden_states (`List[torch.FloatTensor]`, each of shape `(batch_size, sequence_length, hidden_size)` or `(batch_size, hidden_size, height, width)`):
                List of hidden states from the backbone.
            patch_height: Forwarded to the reassemble stage.
            patch_width: Forwarded to the reassemble stage.

        Raises:
            ValueError: If `hidden_states` is not a tuple/list, or its length differs from
                `len(config.neck_hidden_sizes)`.
        """
        if not isinstance(hidden_states, (tuple, list)):
            raise ValueError("hidden_states should be a tuple or list of tensors")

        expected_count = len(self.config.neck_hidden_sizes)
        if len(hidden_states) != expected_count:
            raise ValueError("The number of hidden states should be equal to the number of neck hidden sizes.")

        # postprocess hidden states into image-like feature maps
        reassembled = self.reassemble_stage(hidden_states, patch_height, patch_width)

        # the i-th feature map goes through the i-th convolution
        features = []
        for index, feature in enumerate(reassembled):
            features.append(self.convs[index](feature))

        # fusion blocks
        return self.fusion_stage(features)


class DepthAnythingDepthEstimationHead(nn.Module):
    """
    Output head consisting of 3 convolutional layers. It progressively halves the feature dimension and upsamples
    the predictions to the input resolution after the first convolutional layer (details can be found in the DPT paper's
    supplementary material).
    """

    def __init__(self, config):
        super().__init__()

        # index of the neck output consumed by the head, and the backbone patch size
        self.head_in_index = config.head_in_index
        self.patch_size = config.patch_size

        features = config.fusion_hidden_size

        # 3x3 convolutions: features -> features // 2 -> head_hidden_size
        self.conv1 = nn.Conv2d(features, features // 2, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(features // 2, config.head_hidden_size, kernel_size=3, stride=1, padding=1)
        self.activation1 = nn.ReLU()

        # final 1x1 convolution producing a single depth channel
        self.conv3 = nn.Conv2d(config.head_hidden_size, 1, kernel_size=1, stride=1, padding=0)
        self.activation2 = nn.ReLU()

    def forward(self, hidden_states: List[torch.Tensor], patch_height, patch_width) -> torch.Tensor:
        """
        Args:
            hidden_states: List of feature maps; only the entry at `head_in_index` is used.
            patch_height: Number of patches along the height of the input image.
            patch_width: Number of patches along the width of the input image.

        Returns:
            Predicted depth of shape `(batch_size, patch_height * patch_size, patch_width * patch_size)`.
        """
        # select the feature map this head operates on
        hidden_states = hidden_states[self.head_in_index]

        predicted_depth = self.conv1(hidden_states)

        # upsample to (patch_height * patch_size, patch_width * patch_size)
        predicted_depth = nn.functional.interpolate(
            predicted_depth,
            (int(patch_height * self.patch_size), int(patch_width * self.patch_size)),
            mode="bilinear",
            align_corners=True,
        )

        predicted_depth = self.conv2(predicted_depth)
        predicted_depth = self.activation1(predicted_depth)
        predicted_depth = self.conv3(predicted_depth)
        # final ReLU keeps the predicted depth non-negative
        predicted_depth = self.activation2(predicted_depth)

        # drop the single channel dimension
        predicted_depth = predicted_depth.squeeze(dim=1)  # shape (batch_size, height, width)

        return predicted_depth
@add_start_docstrings(
    """
    Depth Anything Model with a depth estimation head on top (consisting of 3 convolutional layers) e.g. for KITTI, NYUv2.
    """,
    DEPTH_ANYTHING_START_DOCSTRING,
)
class DepthAnythingForDepthEstimation(DepthAnythingPreTrainedModel):
    def __init__(self, config):
        """Assemble the backbone, neck and depth estimation head described by `config`."""
        super().__init__(config)

        # Backbone instantiated from the nested backbone configuration
        self.backbone = AutoBackbone.from_config(config.backbone_config)

        # Neck module built from the model configuration
        self.neck = DepthAnythingNeck(config)

        # Depth estimation head built from the model configuration
        self.head = DepthAnythingDepthEstimationHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(DEPTH_ANYTHING_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=DepthEstimatorOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        pixel_values: torch.FloatTensor,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,

`.\models\depth_anything\__init__.py`

# Used to separate the static type-checking imports from the runtime lazy-loading path
from typing import TYPE_CHECKING
# Lazy module wrapper and the torch availability check
from ...file_utils import _LazyModule, is_torch_available
# Exception used to signal that an optional dependency is not installed
from ...utils import OptionalDependencyNotAvailable

# Maps each submodule name to the public names it provides
_import_structure = {
    "configuration_depth_anything": ["DEPTH_ANYTHING_PRETRAINED_CONFIG_ARCHIVE_MAP", "DepthAnythingConfig"]
}

# Register the modeling symbols only when torch is available; when it is not, they are silently left out
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # torch is available: expose the model classes through the lazy import structure
    _import_structure["modeling_depth_anything"] = [
        "DEPTH_ANYTHING_PRETRAINED_MODEL_ARCHIVE_LIST",
        "DepthAnythingForDepthEstimation",
        "DepthAnythingPreTrainedModel",
    ]

# Static type checkers take this branch and see real imports
if TYPE_CHECKING:
    # Configuration symbols are always importable
    from .configuration_depth_anything import DEPTH_ANYTHING_PRETRAINED_CONFIG_ARCHIVE_MAP, DepthAnythingConfig

    # Same torch availability guard as above, this time for the eager imports
    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # torch is available: import the model symbols
        from .modeling_depth_anything import (
            DEPTH_ANYTHING_PRETRAINED_MODEL_ARCHIVE_LIST,
            DepthAnythingForDepthEstimation,
            DepthAnythingPreTrainedModel,
        )

# At runtime the module is replaced by a lazy proxy
else:
    # Needed to access the module registry
    import sys

    # Swap this module's entry in `sys.modules` for a `_LazyModule` built from `_import_structure`
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)

`.\models\deta\configuration_deta.py`

# 设置文件编码为 UTF-8
# 版权声明和许可证声明，告知代码使用者版权和许可条件
# 仅在遵守 Apache 许可证 Version 2.0 的情况下可使用本文件
# 可以从指定的网址获取完整的许可证文本
# 根据适用法律或书面同意，本软件以"原样"提供，不带任何明示或暗示的担保或条件
# 详细许可证信息请参见指定的网址

""" DETA model configuration"""

# Required modules and classes
from ...configuration_utils import PretrainedConfig
from ...utils import logging
from ..auto import CONFIG_MAPPING

# Module-level logger
logger = logging.get_logger(__name__)

# Maps each pretrained DETA checkpoint name to the URL of its configuration file
DETA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "ut/deta": "https://huggingface.co/ut/deta/resolve/main/config.json",
}

# Configuration class for DETA models; extends PretrainedConfig
class DetaConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`DetaModel`]. It is used to instantiate a DETA
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the DETA
    [SenseTime/deformable-detr](https://huggingface.co/SenseTime/deformable-detr) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Every constructor argument is stored on the instance under the same name, except `is_encoder_decoder` and the
    extra `**kwargs`, which are forwarded to [`PretrainedConfig`].

    Examples:

    ```
    >>> from transformers import DetaConfig, DetaModel

    >>> # Initializing a DETA SenseTime/deformable-detr style configuration
    >>> configuration = DetaConfig()

    >>> # Initializing a model (with random weights) from the SenseTime/deformable-detr style configuration
    >>> model = DetaModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    """

    # Model type identifier
    model_type = "deta"
    # Maps generic attribute names onto this configuration's own parameter names
    attribute_map = {
        "hidden_size": "d_model",
        "num_attention_heads": "encoder_attention_heads",
    }

    def __init__(
        self,
        backbone_config=None,  # configuration of the backbone network
        backbone=None,  # backbone to use when no `backbone_config` is given
        use_pretrained_backbone=False,  # whether to use pretrained backbone weights
        use_timm_backbone=False,  # whether to take the backbone from the timm library
        backbone_kwargs=None,  # extra keyword arguments for the backbone
        num_queries=900,  # number of queries
        max_position_embeddings=2048,  # maximum number of position embeddings
        encoder_layers=6,  # number of encoder layers
        encoder_ffn_dim=2048,  # width of the encoder feed-forward layers
        encoder_attention_heads=8,  # number of attention heads in the encoder
        decoder_layers=6,  # number of decoder layers
        decoder_ffn_dim=1024,  # width of the decoder feed-forward layers
        decoder_attention_heads=8,  # number of attention heads in the decoder
        encoder_layerdrop=0.0,  # layer drop rate of the encoder
        is_encoder_decoder=True,  # whether the model is an encoder-decoder
        activation_function="relu",  # activation function name
        d_model=256,  # model (hidden) dimension
        dropout=0.1,  # global dropout rate
        attention_dropout=0.0,  # dropout rate of the attention mechanism
        activation_dropout=0.0,  # dropout rate applied after activations
        init_std=0.02,  # standard deviation used for initialization
        init_xavier_std=1.0,  # scale used for Xavier initialization
        return_intermediate=True,  # whether to return intermediate results
        auxiliary_loss=False,  # whether to use auxiliary losses
        position_embedding_type="sine",  # type of position embedding
        num_feature_levels=5,  # number of feature levels
        encoder_n_points=4,  # number of sampling points in the encoder
        decoder_n_points=4,  # number of sampling points in the decoder
        two_stage=True,  # whether to use the two-stage variant
        two_stage_num_proposals=300,  # number of first-stage proposals
        with_box_refine=True,  # whether to refine boxes
        assign_first_stage=True,  # whether to run the first-stage assignment
        assign_second_stage=True,  # whether to run the second-stage assignment
        class_cost=1,  # weight of the classification cost
        bbox_cost=5,  # weight of the bounding-box cost
        giou_cost=2,  # weight of the GIoU cost
        mask_loss_coefficient=1,  # coefficient of the mask loss
        dice_loss_coefficient=1,  # coefficient of the Dice loss
        bbox_loss_coefficient=5,  # coefficient of the bounding-box loss
        giou_loss_coefficient=2,  # coefficient of the GIoU loss
        eos_coefficient=0.1,  # coefficient of the EOS term
        focal_alpha=0.25,  # alpha parameter of the focal loss
        disable_custom_kernels=True,  # whether to disable the custom kernels
        **kwargs,  # remaining keyword arguments, forwarded to PretrainedConfig
    ):
        if backbone_config is None and backbone is None:
            # NOTE(review): default backbone reconstructed from this module's `CONFIG_MAPPING` and `logger`
            # imports; confirm the `out_features` list against the reference implementation.
            logger.info("`backbone_config` is `None`. Initializing the config with the default `ResNet` backbone.")
            backbone_config = CONFIG_MAPPING["resnet"](out_features=["stage2", "stage3", "stage4"])
        elif isinstance(backbone_config, dict):
            # NOTE(review): assumes a dict-form backbone config carries a "model_type" key; confirm.
            # Work on a copy so the caller's dict is not modified.
            backbone_dict = dict(backbone_config)
            backbone_model_type = backbone_dict.pop("model_type")
            backbone_config = CONFIG_MAPPING[backbone_model_type].from_dict(backbone_dict)

        # Backbone settings
        self.backbone_config = backbone_config
        self.backbone = backbone
        self.use_pretrained_backbone = use_pretrained_backbone
        self.use_timm_backbone = use_timm_backbone
        self.backbone_kwargs = backbone_kwargs

        # Transformer architecture
        self.num_queries = num_queries
        self.max_position_embeddings = max_position_embeddings
        self.d_model = d_model
        self.encoder_ffn_dim = encoder_ffn_dim
        self.encoder_layers = encoder_layers
        self.encoder_attention_heads = encoder_attention_heads
        self.decoder_ffn_dim = decoder_ffn_dim
        self.decoder_layers = decoder_layers
        self.decoder_attention_heads = decoder_attention_heads
        self.dropout = dropout
        self.attention_dropout = attention_dropout
        self.activation_dropout = activation_dropout
        self.activation_function = activation_function
        self.init_std = init_std
        self.init_xavier_std = init_xavier_std
        self.encoder_layerdrop = encoder_layerdrop
        self.return_intermediate = return_intermediate
        self.auxiliary_loss = auxiliary_loss
        self.position_embedding_type = position_embedding_type

        # Deformable attention and two-stage settings
        self.num_feature_levels = num_feature_levels
        self.encoder_n_points = encoder_n_points
        self.decoder_n_points = decoder_n_points
        self.two_stage = two_stage
        self.two_stage_num_proposals = two_stage_num_proposals
        self.with_box_refine = with_box_refine
        self.assign_first_stage = assign_first_stage
        self.assign_second_stage = assign_second_stage

        # Cost weights
        self.class_cost = class_cost
        self.bbox_cost = bbox_cost
        self.giou_cost = giou_cost

        # Loss coefficients
        self.mask_loss_coefficient = mask_loss_coefficient
        self.dice_loss_coefficient = dice_loss_coefficient
        self.bbox_loss_coefficient = bbox_loss_coefficient
        self.giou_loss_coefficient = giou_loss_coefficient
        self.eos_coefficient = eos_coefficient
        self.focal_alpha = focal_alpha
        self.disable_custom_kernels = disable_custom_kernels

        super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs)

    # Accessor returning the number of encoder attention heads
    @property
    def num_attention_heads(self) -> int:
        return self.encoder_attention_heads

    # Accessor returning the model's hidden dimension
    @property
    def hidden_size(self) -> int:
        return self.d_model

`.\models\deta\convert_deta_resnet_to_pytorch.py`

# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convert DETA checkpoints from the original repository.

URL: https://github.com/jozhang97/DETA/tree/master"""

import argparse
import json
from pathlib import Path

import requests
import torch
from huggingface_hub import cached_download, hf_hub_download, hf_hub_url
from PIL import Image

from transformers import DetaConfig, DetaForObjectDetection, DetaImageProcessor
from transformers.utils import logging

# Set the logging verbosity to INFO
logging.set_verbosity_info()
# Logger for this module
logger = logging.get_logger(__name__)

# Build the DETA configuration used by this conversion script
def get_deta_config():
    """Create the `DetaConfig` for the converted checkpoints and attach the COCO label maps.

    The label map is read from `coco-detection-id2label.json` in the `huggingface/label-files`
    dataset repository.

    Returns:
        A `DetaConfig` with `num_labels`, `id2label` and `label2id` set.
    """
    config = DetaConfig(
        num_queries=900,
        encoder_ffn_dim=2048,
        decoder_ffn_dim=2048,
        num_feature_levels=5,
        assign_first_stage=True,
        with_box_refine=True,
        two_stage=True,
    )

    # The label count is set before the explicit label maps are attached
    config.num_labels = 91
    repo_id = "huggingface/label-files"
    filename = "coco-detection-id2label.json"
    # `hf_hub_download` replaces the deprecated `cached_download(hf_hub_url(...))` pair
    label_file = hf_hub_download(repo_id=repo_id, filename=filename, repo_type="dataset")
    # The context manager closes the file handle, which the previous one-liner leaked
    with open(label_file, "r", encoding="utf-8") as f:
        id2label = json.load(f)
    # JSON object keys are strings; convert them to integer class ids
    id2label = {int(k): v for k, v in id2label.items()}
    config.id2label = id2label
    config.label2id = {v: k for k, v in id2label.items()}

    return config

# Build the list of keys to rename (original name on the left, our name on the right)
def create_rename_keys(config):
    """Return `(old_name, new_name)` pairs mapping original checkpoint keys to the converted model's keys.

    Args:
        config: object exposing `encoder_layers` and `decoder_layers`.

    Returns:
        A list of 2-tuples: the backbone stem entries first, then every encoder layer, then every
        decoder layer.
    """
    params = ("weight", "bias")
    stem_src = "backbone.0.body"
    stem_dst = "model.backbone.model.embedder.embedder"

    # stem: first convolution followed by its batch norm (including the running statistics)
    pairs = [(f"{stem_src}.conv1.weight", f"{stem_dst}.convolution.weight")]
    pairs += [
        (f"{stem_src}.bn1.{stat}", f"{stem_dst}.normalization.{stat}")
        for stat in ("weight", "bias", "running_mean", "running_var")
    ]
    # NOTE(review): the original has a bare `# stages` marker at this point with no entries after it,
    # so no backbone stage keys are produced by this function.

    # (original submodule, converted submodule) per encoder layer, in emission order
    encoder_modules = (
        ("self_attn.sampling_offsets", "self_attn.sampling_offsets"),
        ("self_attn.attention_weights", "self_attn.attention_weights"),
        ("self_attn.value_proj", "self_attn.value_proj"),
        ("self_attn.output_proj", "self_attn.output_proj"),
        ("norm1", "self_attn_layer_norm"),
        ("linear1", "fc1"),
        ("linear2", "fc2"),
        ("norm2", "final_layer_norm"),
    )
    # (original submodule, converted submodule) per decoder layer, in emission order
    decoder_modules = (
        ("cross_attn.sampling_offsets", "encoder_attn.sampling_offsets"),
        ("cross_attn.attention_weights", "encoder_attn.attention_weights"),
        ("cross_attn.value_proj", "encoder_attn.value_proj"),
        ("cross_attn.output_proj", "encoder_attn.output_proj"),
        ("norm1", "encoder_attn_layer_norm"),
        ("self_attn.out_proj", "self_attn.out_proj"),
        ("norm2", "self_attn_layer_norm"),
        ("linear1", "fc1"),
        ("linear2", "fc2"),
        ("norm3", "final_layer_norm"),
    )

    # transformer encoder
    for layer in range(config.encoder_layers):
        for src, dst in encoder_modules:
            for param in params:
                pairs.append(
                    (
                        f"transformer.encoder.layers.{layer}.{src}.{param}",
                        f"model.encoder.layers.{layer}.{dst}.{param}",
                    )
                )

    # transformer decoder
    for layer in range(config.decoder_layers):
        for src, dst in decoder_modules:
            for param in params:
                pairs.append(
                    (
                        f"transformer.decoder.layers.{layer}.{src}.{param}",
                        f"model.decoder.layers.{layer}.{dst}.{param}",
                    )
                )

    return pairs
# Rename one dictionary key: the entry under the old key is removed and its value stored under the new key
def rename_key(dct, old, new):
    """Move the value stored under `old` to the key `new`, modifying `dct` in place.

    Raises:
        KeyError: if `old` is not present in `dct`.
    """
    dct[new] = dct.pop(old)

# Split each decoder layer's fused self-attention input projection into query/key/value entries
def read_in_decoder_q_k_v(state_dict, config):
    """Replace the fused `in_proj` weight and bias of every decoder layer with q/k/v projections.

    For each of the `config.decoder_layers` layers, the `in_proj_weight` / `in_proj_bias` entries are
    removed from `state_dict` and sliced along the first axis into three chunks of `config.d_model`
    rows: the first chunk becomes `q_proj`, the second `k_proj`, the last `v_proj`.

    Args:
        state_dict: mapping of parameter names to tensors; modified in place.
        config: object exposing `d_model` and `decoder_layers`.
    """
    width = config.d_model
    # Row range of each projection inside the fused matrix, in the order they are written
    row_ranges = (
        ("q_proj", slice(None, width)),
        ("k_proj", slice(width, 2 * width)),
        ("v_proj", slice(-width, None)),
    )
    for layer in range(config.decoder_layers):
        fused = f"transformer.decoder.layers.{layer}.self_attn"
        split = f"model.decoder.layers.{layer}.self_attn"
        weight = state_dict.pop(f"{fused}.in_proj_weight")
        bias = state_dict.pop(f"{fused}.in_proj_bias")
        for name, rows in row_ranges:
            state_dict[f"{split}.{name}.weight"] = weight[rows, :]
            state_dict[f"{split}.{name}.bias"] = bias[rows]

# Fetch the sample image from its fixed URL and return it
def prepare_img():
    """Download the image at the hard-coded COCO URL and return it opened with PIL."""
    url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    # Stream the response so PIL can read directly from the raw body
    response = requests.get(url, stream=True)
    return Image.open(response.raw)

# Convert the original weights into the DETA model structure; gradients are disabled for the whole function
@torch.no_grad()
def convert_deta_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub):
    """
    Copy/paste/tweak the model's weights to our DETA structure.

    Args:
        model_name: `"deta-resnet-50"` or `"deta-resnet-50-24-epochs"`; selects the checkpoint file and the
            reference values the outputs are compared against.
        pytorch_dump_folder_path: folder in which to save the model and processor; nothing is saved when falsy.
        push_to_hub: when truthy, push the model and processor to `jozhang97/{model_name}`.

    Raises:
        ValueError: if `model_name` is not one of the two supported names.
        AssertionError: if the first logits or boxes differ from the reference values by more than 1e-4.
    """

    # Load the configuration
    config = get_deta_config()

    # Pick the original checkpoint file for the requested model
    if model_name == "deta-resnet-50":
        filename = "adet_checkpoint0011.pth"
    elif model_name == "deta-resnet-50-24-epochs":
        filename = "adet_2x_checkpoint0023.pth"
    else:
        raise ValueError(f"Model name {model_name} not supported")
    # Download the checkpoint file from the Hub
    checkpoint_path = hf_hub_download(repo_id="nielsr/deta-checkpoints", filename=filename)
    # Load the state dict on CPU; the weights live under the "model" entry of the checkpoint
    state_dict = torch.load(checkpoint_path, map_location="cpu")["model"]

    # Rename keys
    rename_keys = create_rename_keys(config)
    for src, dest in rename_keys:
        rename_key(state_dict, src, dest)
    # Split the fused decoder q/k/v projections into separate entries
    read_in_decoder_q_k_v(state_dict, config)

    # Fix some prefixes; iterate over a copy because the dict is modified inside the loop
    for key in state_dict.copy().keys():
        # Class and box heads: "transformer.decoder" -> "model.decoder"
        if "transformer.decoder.class_embed" in key or "transformer.decoder.bbox_embed" in key:
            val = state_dict.pop(key)
            # Replace the key prefix with "model.decoder"
            state_dict[key.replace("transformer.decoder", "model.decoder")] = val
        # Input projections get a "model." prefix
        if "input_proj" in key:
            val = state_dict.pop(key)
            # Prepend "model." to the key
            state_dict["model." + key] = val
        # Remaining transformer-level parameters: "transformer" -> "model"
        if "level_embed" in key or "pos_trans" in key or "pix_trans" in key or "enc_output" in key:
            val = state_dict.pop(key)
            # Replace "transformer" with "model" in the key
            state_dict[key.replace("transformer", "model")] = val
    # Create the object detection model and load the converted state dict
    model = DetaForObjectDetection(config)
    model.load_state_dict(state_dict)
    model.eval()

    # Use CUDA when available, otherwise the CPU
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)

    # Image processor using the COCO detection format
    processor = DetaImageProcessor(format="coco_detection")

    # Prepare the image, encode it and run the model
    img = prepare_img()
    encoding = processor(images=img, return_tensors="pt")
    pixel_values = encoding["pixel_values"]
    outputs = model(pixel_values.to(device))

    # Reference logits and boxes for the selected model
    if model_name == "deta-resnet-50":
        expected_logits = torch.tensor(
            [[-7.3978, -2.5406, -4.1668], [-8.2684, -3.9933, -3.8096], [-7.0515, -3.7973, -5.8516]]
        )
        expected_boxes = torch.tensor([[0.5043, 0.4973, 0.9998], [0.2542, 0.5489, 0.4748], [0.5490, 0.2765, 0.0570]])
    elif model_name == "deta-resnet-50-24-epochs":
        expected_logits = torch.tensor(
            [[-7.1688, -2.4857, -4.8669], [-7.8630, -3.8154, -4.2674], [-7.2730, -4.1865, -5.5323]]
        )
        expected_boxes = torch.tensor([[0.5021, 0.4971, 0.9994], [0.2546, 0.5486, 0.4731], [0.1686, 0.1986, 0.2142]])

    # Compare the top-left 3x3 slice of the logits and boxes with the references (absolute tolerance 1e-4)
    assert torch.allclose(outputs.logits[0, :3, :3], expected_logits.to(device), atol=1e-4)
    assert torch.allclose(outputs.pred_boxes[0, :3, :3], expected_boxes.to(device), atol=1e-4)
    print("Everything ok!")

    # Save the model and processor when an output folder was given
    if pytorch_dump_folder_path:
        logger.info(f"Saving PyTorch model and processor to {pytorch_dump_folder_path}...")
        Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
        model.save_pretrained(pytorch_dump_folder_path)
        processor.save_pretrained(pytorch_dump_folder_path)

    # Push the model and processor to the Hub when requested
    if push_to_hub:
        print("Pushing model and processor to hub...")
        model.push_to_hub(f"jozhang97/{model_name}")
        processor.push_to_hub(f"jozhang97/{model_name}")
# Command-line entry point: runs only when the script is executed directly
if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    # One (flag, keyword arguments) entry per command-line option, registered in this order
    option_specs = (
        # Name of the checkpoint to convert
        (
            "--model_name",
            {
                "type": str,
                "default": "deta-resnet-50",
                "choices": ["deta-resnet-50", "deta-resnet-50-24-epochs"],
                "help": "Name of the model you'd like to convert.",
            },
        ),
        # Output folder for the converted PyTorch model
        (
            "--pytorch_dump_folder_path",
            {
                "default": None,
                "type": str,
                "help": "Path to the folder to output PyTorch model.",
            },
        ),
        # Boolean switch: push the converted model to the hub
        (
            "--push_to_hub",
            {
                "action": "store_true",
                "help": "Whether or not to push the converted model to the 🤗 hub.",
            },
        ),
    )
    for flag, spec in option_specs:
        parser.add_argument(flag, **spec)

    # Parse the command line and run the conversion with the parsed values
    args = parser.parse_args()
    convert_deta_checkpoint(
        model_name=args.model_name,
        pytorch_dump_folder_path=args.pytorch_dump_folder_path,
        push_to_hub=args.push_to_hub,
    )

`.\models\deta\convert_deta_swin_to_pytorch.py`

# Build the DETA configuration for a Swin-backbone checkpoint
def get_deta_config(model_name):
    """Create the `DetaConfig` (Swin backbone) for `model_name` and attach its label maps.

    Args:
        model_name: checkpoint name; when it contains `"o365"` the 366-label Objects365 map is used,
            otherwise the 91-label COCO detection map.

    Returns:
        A `DetaConfig` with `num_labels`, `id2label` and `label2id` set.
    """
    # Swin Transformer configuration used as the backbone
    backbone_config = SwinConfig(
        embed_dim=192,
        depths=(2, 2, 18, 2),
        num_heads=(6, 12, 24, 48),
        window_size=12,
        out_features=["stage2", "stage3", "stage4"],
    )

    # Overall DETA configuration
    config = DetaConfig(
        backbone_config=backbone_config,
        num_queries=900,
        encoder_ffn_dim=2048,
        decoder_ffn_dim=2048,
        num_feature_levels=5,
        assign_first_stage=True,
        with_box_refine=True,
        two_stage=True,
    )

    # Select the label file matching the checkpoint
    repo_id = "huggingface/label-files"
    if "o365" in model_name:
        num_labels = 366
        filename = "object365-id2label.json"
    else:
        num_labels = 91
        filename = "coco-detection-id2label.json"

    # Read the label map; the context manager closes the file handle, which the previous one-liner leaked
    label_file = cached_download(hf_hub_url(repo_id, filename, repo_type="dataset"))
    with open(label_file, "r", encoding="utf-8") as f:
        id2label = json.load(f)
    # JSON object keys are strings; convert them to integer class ids
    id2label = {int(k): v for k, v in id2label.items()}
    config.num_labels = num_labels
    config.id2label = id2label
    config.label2id = {v: k for k, v in id2label.items()}

    return config


# Build the list of (original key, converted key) pairs to rename
def create_rename_keys(config):
    """Return `(old_name, new_name)` pairs mapping original checkpoint keys to the converted model's keys.

    Args:
        config: object exposing `backbone_config.depths`, `encoder_layers` and `decoder_layers`.

    Returns:
        A list of 2-tuples in this order: patch-embedding stem, backbone blocks stage by stage (each of
        the first three stages followed by its downsample entries), the three output norms, the
        transformer encoder layers, then the transformer decoder layers.
    """
    params = ("weight", "bias")
    src_root = "backbone.0.body"
    dst_root = "model.backbone.model"

    # stem: patch-embedding projection and its norm
    pairs = [
        (f"{src_root}.patch_embed.{src}.{param}", f"{dst_root}.embeddings.{dst}.{param}")
        for src, dst in (("proj", "patch_embeddings.projection"), ("norm", "norm"))
        for param in params
    ]

    # (original suffix, converted suffix) for one backbone block, in emission order
    block_suffixes = (
        ("norm1.weight", "layernorm_before.weight"),
        ("norm1.bias", "layernorm_before.bias"),
        ("attn.relative_position_bias_table", "attention.self.relative_position_bias_table"),
        ("attn.relative_position_index", "attention.self.relative_position_index"),
        ("attn.proj.weight", "attention.output.dense.weight"),
        ("attn.proj.bias", "attention.output.dense.bias"),
        ("norm2.weight", "layernorm_after.weight"),
        ("norm2.bias", "layernorm_after.bias"),
        ("mlp.fc1.weight", "intermediate.dense.weight"),
        ("mlp.fc1.bias", "intermediate.dense.bias"),
        ("mlp.fc2.weight", "output.dense.weight"),
        ("mlp.fc2.bias", "output.dense.bias"),
    )
    downsample_suffixes = ("reduction.weight", "norm.weight", "norm.bias")

    # stages
    for stage, depth in enumerate(config.backbone_config.depths):
        for block in range(depth):
            src = f"{src_root}.layers.{stage}.blocks.{block}"
            dst = f"{dst_root}.encoder.layers.{stage}.blocks.{block}"
            pairs.extend((f"{src}.{old}", f"{dst}.{new}") for old, new in block_suffixes)

        # Only the stages with index below 3 get downsample entries
        if stage < 3:
            src = f"{src_root}.layers.{stage}.downsample"
            dst = f"{dst_root}.encoder.layers.{stage}.downsample"
            pairs.extend((f"{src}.{suffix}", f"{dst}.{suffix}") for suffix in downsample_suffixes)

    # Output norms: norm1/norm2/norm3 map to the hidden-state norms of stage2/stage3/stage4
    for norm_idx, stage_name in enumerate(("stage2", "stage3", "stage4"), start=1):
        for param in params:
            pairs.append(
                (f"{src_root}.norm{norm_idx}.{param}", f"{dst_root}.hidden_states_norms.{stage_name}.{param}")
            )

    # (original submodule, converted submodule) per encoder layer, in emission order
    encoder_modules = (
        ("self_attn.sampling_offsets", "self_attn.sampling_offsets"),
        ("self_attn.attention_weights", "self_attn.attention_weights"),
        ("self_attn.value_proj", "self_attn.value_proj"),
        ("self_attn.output_proj", "self_attn.output_proj"),
        ("norm1", "self_attn_layer_norm"),
        ("linear1", "fc1"),
        ("linear2", "fc2"),
        ("norm2", "final_layer_norm"),
    )
    # (original submodule, converted submodule) per decoder layer, in emission order
    decoder_modules = (
        ("cross_attn.sampling_offsets", "encoder_attn.sampling_offsets"),
        ("cross_attn.attention_weights", "encoder_attn.attention_weights"),
        ("cross_attn.value_proj", "encoder_attn.value_proj"),
        ("cross_attn.output_proj", "encoder_attn.output_proj"),
        ("norm1", "encoder_attn_layer_norm"),
        ("self_attn.out_proj", "self_attn.out_proj"),
        ("norm2", "self_attn_layer_norm"),
        ("linear1", "fc1"),
        ("linear2", "fc2"),
        ("norm3", "final_layer_norm"),
    )

    # transformer encoder
    for layer in range(config.encoder_layers):
        for src, dst in encoder_modules:
            for param in params:
                pairs.append(
                    (
                        f"transformer.encoder.layers.{layer}.{src}.{param}",
                        f"model.encoder.layers.{layer}.{dst}.{param}",
                    )
                )

    # transformer decoder
    for layer in range(config.decoder_layers):
        for src, dst in decoder_modules:
            for param in params:
                pairs.append(
                    (
                        f"transformer.decoder.layers.{layer}.{src}.{param}",
                        f"model.decoder.layers.{layer}.{dst}.{param}",
                    )
                )

    return pairs
# 重命名字典中的键，将旧键移除并用新键替换
def rename_key(dct, old, new):
    """
    Move the value stored under `old` to the key `new`, modifying `dct` in place.

    Args:
        dct: The dictionary to modify.
        old: The key to remove. A `KeyError` is raised if it is not present.
        new: The key under which the removed value is stored.
    """
    # The pop on the right-hand side runs before the assignment, so the old entry
    # is removed first and the value is then (re)inserted under the new key.
    dct[new] = dct.pop(old)


# 将每个编码器层的矩阵拆分为查询（query）、键（key）和值（value）
def read_in_swin_q_k_v(state_dict, backbone_config):
    """
    Split every fused backbone `attn.qkv` weight/bias into separate query, key and value entries.

    For each stage `i` and block `j`, the fused `backbone.0.body.layers.{i}.blocks.{j}.attn.qkv.*` entries are
    removed from `state_dict` and replaced by `...attention.self.{query,key,value}.{weight,bias}` entries.

    Args:
        state_dict: Mapping of parameter names to tensors; modified in place.
        backbone_config: Object exposing `embed_dim` and `depths` (number of blocks per stage).
    """
    for stage_idx, num_blocks in enumerate(backbone_config.depths):
        # The feature dimension doubles with every stage.
        dim = int(backbone_config.embed_dim * 2**stage_idx)
        # Row ranges of the fused projection that belong to query, key and value respectively.
        row_ranges = (
            ("query", slice(None, dim)),
            ("key", slice(dim, dim * 2)),
            ("value", slice(-dim, None)),
        )
        for block_idx in range(num_blocks):
            source = f"backbone.0.body.layers.{stage_idx}.blocks.{block_idx}.attn.qkv"
            destination = f"model.backbone.model.encoder.layers.{stage_idx}.blocks.{block_idx}.attention.self"
            # In the original implementation this is a single matrix plus a single bias vector.
            fused_weight = state_dict.pop(f"{source}.weight")
            fused_bias = state_dict.pop(f"{source}.bias")
            for projection, rows in row_ranges:
                state_dict[f"{destination}.{projection}.weight"] = fused_weight[rows, :]
                state_dict[f"{destination}.{projection}.bias"] = fused_bias[rows]


# 读取解码器的查询（query）、键（key）和值（value）信息
def read_in_decoder_q_k_v(state_dict, config):
    """
    Split every fused decoder self-attention input projection into separate q/k/v projection entries.

    For each decoder layer `i`, `transformer.decoder.layers.{i}.self_attn.in_proj_weight` and `in_proj_bias` are
    removed from `state_dict` and replaced by `model.decoder.layers.{i}.self_attn.{q,k,v}_proj.{weight,bias}`.

    Args:
        state_dict: Mapping of parameter names to tensors; modified in place.
        config: Object exposing `d_model` (rows per projection) and `decoder_layers` (number of layers).
    """
    width = config.d_model
    # Row ranges of the fused projection that belong to the query, key and value projections.
    row_ranges = (
        ("q_proj", slice(None, width)),
        ("k_proj", slice(width, width * 2)),
        ("v_proj", slice(-width, None)),
    )
    for layer_idx in range(config.decoder_layers):
        source = f"transformer.decoder.layers.{layer_idx}.self_attn"
        destination = f"model.decoder.layers.{layer_idx}.self_attn"
        fused_weight = state_dict.pop(f"{source}.in_proj_weight")
        fused_bias = state_dict.pop(f"{source}.in_proj_bias")
        for projection, rows in row_ranges:
            state_dict[f"{destination}.{projection}.weight"] = fused_weight[rows, :]
            state_dict[f"{destination}.{projection}.bias"] = fused_bias[rows]
# 我们将在一张可爱猫咪的图片上验证我们的结果
def prepare_img():
    """
    Download the COCO validation image used to verify the conversion and return it as a PIL image.

    The image is fetched over HTTP on every call; the raw response stream is handed directly to `Image.open`.
    """
    response = requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True)
    return Image.open(response.raw)


@torch.no_grad()
def convert_deta_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub):
    """
    Copy/paste/tweak the original checkpoint's weights into the HuggingFace DETA structure.

    The converted model is run on a test image and the first logits and boxes are compared against reference
    values that are specific to each supported checkpoint.

    Args:
        model_name (`str`):
            Name of the checkpoint to convert: `"deta-swin-large"` or `"deta-swin-large-o365"`.
        pytorch_dump_folder_path (`str` or `None`):
            Folder in which to save the converted model and image processor. Nothing is saved when falsy.
        push_to_hub (`bool`):
            Whether to push the converted model and image processor to the hub.

    Raises:
        ValueError: If `model_name` is not one of the supported names.
        AssertionError: If the converted model's outputs do not match the reference values.
    """

    # Load the model configuration.
    config = get_deta_config(model_name)

    # Download the original checkpoint that corresponds to the requested model.
    if model_name == "deta-swin-large":
        checkpoint_path = hf_hub_download(repo_id="nielsr/deta-checkpoints", filename="adet_swin_ft.pth")
    elif model_name == "deta-swin-large-o365":
        checkpoint_path = hf_hub_download(repo_id="jozhang97/deta-swin-l-o365", filename="deta_swin_pt_o365.pth")
    else:
        raise ValueError(f"Model name {model_name} not supported")

    # Load the original state dict on CPU.
    state_dict = torch.load(checkpoint_path, map_location="cpu")["model"]

    # Print the name and shape of every original parameter.
    for name, param in state_dict.items():
        print(name, param.shape)

    # Rename keys and split the fused q/k/v projections.
    rename_keys = create_rename_keys(config)
    for src, dest in rename_keys:
        rename_key(state_dict, src, dest)
    read_in_swin_q_k_v(state_dict, config.backbone_config)
    read_in_decoder_q_k_v(state_dict, config)

    # Fix some prefixes. Iterate over a copy because the dict is mutated inside the loop.
    for key in state_dict.copy().keys():
        if "transformer.decoder.class_embed" in key or "transformer.decoder.bbox_embed" in key:
            val = state_dict.pop(key)
            state_dict[key.replace("transformer.decoder", "model.decoder")] = val
        if "input_proj" in key:
            val = state_dict.pop(key)
            state_dict["model." + key] = val
        if "level_embed" in key or "pos_trans" in key or "pix_trans" in key or "enc_output" in key:
            val = state_dict.pop(key)
            state_dict[key.replace("transformer", "model")] = val

    # Finally, create the HuggingFace model and load the converted state dict.
    model = DetaForObjectDetection(config)
    model.load_state_dict(state_dict)
    model.eval()

    # Run on GPU when one is available.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)

    # Load the image processor.
    processor = DetaImageProcessor(format="coco_detection")

    # Run the converted model on the test image.
    img = prepare_img()
    encoding = processor(images=img, return_tensors="pt")
    pixel_values = encoding["pixel_values"]
    outputs = model(pixel_values.to(device))

    # Verify the logits and boxes against the reference values of the selected checkpoint.
    print("Logits:", outputs.logits[0, :3, :3])
    print("Boxes:", outputs.pred_boxes[0, :3, :3])
    if model_name == "deta-swin-large":
        expected_logits = torch.tensor(
            [[-7.6308, -2.8485, -5.3737], [-7.2037, -4.5505, -4.8027], [-7.2943, -4.2611, -4.6617]]
        )
        expected_boxes = torch.tensor([[0.4987, 0.4969, 0.9999], [0.2549, 0.5498, 0.4805], [0.5498, 0.2757, 0.0569]])
    elif model_name == "deta-swin-large-o365":
        # Without this branch the o365 values would overwrite the ones above for every model.
        expected_logits = torch.tensor(
            [[-8.0122, -3.5720, -4.9717], [-8.1547, -3.6886, -4.6389], [-7.6610, -3.6194, -5.0134]]
        )
        expected_boxes = torch.tensor([[0.2523, 0.5549, 0.4881], [0.7715, 0.4149, 0.4601], [0.5503, 0.2753, 0.0575]])
    assert torch.allclose(outputs.logits[0, :3, :3], expected_logits.to(device), atol=1e-4)
    assert torch.allclose(outputs.pred_boxes[0, :3, :3], expected_boxes.to(device), atol=1e-4)
    print("Everything ok!")

    if pytorch_dump_folder_path:
        # Save the model and image processor.
        logger.info(f"Saving PyTorch model and processor to {pytorch_dump_folder_path}...")
        Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
        model.save_pretrained(pytorch_dump_folder_path)
        processor.save_pretrained(pytorch_dump_folder_path)

    if push_to_hub:
        # Push the model and image processor to the hub.
        print("Pushing model and processor to hub...")
        model.push_to_hub(f"jozhang97/{model_name}")
        processor.push_to_hub(f"jozhang97/{model_name}")
# 如果当前脚本被直接执行而非被导入为模块，则执行以下代码
if __name__ == "__main__":
    # Build the command-line interface of the conversion script.
    parser = argparse.ArgumentParser()

    # --model_name: which checkpoint to convert; restricted to the two supported names
    # and defaulting to "deta-swin-large".
    parser.add_argument(
        "--model_name",
        type=str,
        default="deta-swin-large",
        choices=["deta-swin-large", "deta-swin-large-o365"],
        help="Name of the model you'd like to convert.",
    )

    # --pytorch_dump_folder_path: optional output folder for the converted model
    # (defaults to None).
    parser.add_argument(
        "--pytorch_dump_folder_path",
        default=None,
        type=str,
        help="Path to the folder to output PyTorch model.",
    )

    # --push_to_hub: boolean flag, True only when passed on the command line.
    parser.add_argument(
        "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
    )

    # Parse the command-line arguments.
    args = parser.parse_args()

    # Run the conversion with the parsed model name, output folder and push flag.
    convert_deta_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub)

`.\models\deta\image_processing_deta.py`

# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Image processor class for Deformable DETR."""

import pathlib
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union

import numpy as np

from ...feature_extraction_utils import BatchFeature
from ...image_processing_utils import BaseImageProcessor, get_size_dict
from ...image_transforms import (
    PaddingMode,
    center_to_corners_format,
    corners_to_center_format,
    pad,
    rescale,
    resize,
    rgb_to_id,
    to_channel_dimension_format,
)
from ...image_utils import (
    IMAGENET_DEFAULT_MEAN,
    IMAGENET_DEFAULT_STD,
    AnnotationFormat,
    AnnotationType,
    ChannelDimension,
    ImageInput,
    PILImageResampling,
    get_image_size,
    infer_channel_dimension_format,
    is_batched,
    is_scaled_image,
    to_numpy_array,
    valid_images,
    validate_annotations,
    validate_preprocess_arguments,
)
from ...utils import (
    is_flax_available,
    is_jax_tensor,
    is_tf_available,
    is_tf_tensor,
    is_torch_available,
    is_torch_tensor,
    is_torchvision_available,
    is_vision_available,
    logging,
)
from ...utils.generic import TensorType


if is_torch_available():
    import torch


if is_torchvision_available():
    from torchvision.ops.boxes import batched_nms

if is_vision_available():
    import PIL


logger = logging.get_logger(__name__)  # pylint: disable=invalid-name

SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)


# Copied from transformers.models.detr.image_processing_detr.get_size_with_aspect_ratio
def get_size_with_aspect_ratio(image_size, size, max_size=None) -> Tuple[int, int]:
    """
    Computes the output image size given the input image size and the desired output size.

    The shorter edge is resized to `size` and the longer edge is scaled by the same factor (truncated to an
    integer). When `max_size` is given and the scaled longer edge would exceed it, `size` is first reduced so that
    the longer edge fits within `max_size`.

    Args:
        image_size (`Tuple[int, int]`):
            The input image size, as `(height, width)`.
        size (`int`):
            The desired output size of the shorter edge.
        max_size (`int`, *optional*):
            The maximum allowed output size of the longer edge.

    Returns:
        `Tuple[int, int]`: The output size, as `(height, width)`.
    """
    height, width = image_size
    if max_size is not None:
        min_original_size = float(min((height, width)))
        max_original_size = float(max((height, width)))
        # Shrink the target so that the longer edge does not exceed `max_size` after scaling.
        if max_original_size / min_original_size * size > max_size:
            size = int(round(max_size * min_original_size / max_original_size))

    # The shorter edge already has the requested size: nothing to do.
    if (height <= width and height == size) or (width <= height and width == size):
        return height, width

    if width < height:
        # Width is the shorter edge.
        ow = size
        oh = int(size * height / width)
    else:
        # Height is the shorter edge (or the image is square).
        oh = size
        ow = int(size * width / height)

    return (oh, ow)
# Copied from transformers.models.detr.image_processing_detr.get_resize_output_image_size
def get_resize_output_image_size(
    input_image: np.ndarray,
    size: Union[int, Tuple[int, int], List[int]],
    max_size: Optional[int] = None,
    input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> Tuple[int, int]:
    """
    Computes the size an image should be resized to.

    A tuple or list `size` is treated as the final output size and returned unchanged. An integer `size` is
    combined with the input image's own size so that the aspect ratio is kept.

    Args:
        input_image (`np.ndarray`):
            The image to resize.
        size (`int` or `Tuple[int, int]` or `List[int]`):
            The desired output size.
        max_size (`int`, *optional*):
            The maximum allowed output size.
        input_data_format (`ChannelDimension` or `str`, *optional*):
            The channel dimension format of the input image. If not provided, it will be inferred from the input image.
    """
    # The input size is read first, before `size` is inspected.
    input_height_width = get_image_size(input_image, input_data_format)
    if not isinstance(size, (list, tuple)):
        return get_size_with_aspect_ratio(input_height_width, size, max_size)
    return size


# Copied from transformers.models.detr.image_processing_detr.get_numpy_to_framework_fn
def get_numpy_to_framework_fn(arr) -> Callable:
    """
    Returns the array constructor of the framework that `arr` belongs to.

    NumPy is checked first, then TensorFlow, PyTorch and JAX; each framework is only imported once its
    availability check and tensor check have both passed.

    Args:
        arr (`np.ndarray`): The array to convert.

    Raises:
        ValueError: If `arr` is not recognised as an array of any of these frameworks.
    """
    if isinstance(arr, np.ndarray):
        converter = np.array
    elif is_tf_available() and is_tf_tensor(arr):
        import tensorflow as tf

        converter = tf.convert_to_tensor
    elif is_torch_available() and is_torch_tensor(arr):
        import torch

        converter = torch.tensor
    elif is_flax_available() and is_jax_tensor(arr):
        import jax.numpy as jnp

        converter = jnp.array
    else:
        raise ValueError(f"Cannot convert arrays of type {type(arr)}")
    return converter


# Copied from transformers.models.detr.image_processing_detr.safe_squeeze
def safe_squeeze(arr: np.ndarray, axis: Optional[int] = None) -> np.ndarray:
    """
    Squeezes an array, tolerating an `axis` that cannot be squeezed.

    Without `axis`, a plain squeeze is applied. With `axis`, the array is returned unchanged when squeezing
    that axis raises a `ValueError`.
    """
    if axis is not None:
        try:
            squeezed = arr.squeeze(axis=axis)
        except ValueError:
            # The requested axis cannot be squeezed: hand back the input untouched.
            squeezed = arr
        return squeezed
    return arr.squeeze()


# Copied from transformers.models.detr.image_processing_detr.normalize_annotation
def normalize_annotation(annotation: Dict, image_size: Tuple[int, int]) -> Dict:
    """
    Returns a copy of `annotation` whose `"boxes"` entry is converted and scaled by the image size.

    The `"boxes"` value is passed through `corners_to_center_format` and then divided by
    `[image_width, image_height, image_width, image_height]`. Every other entry is copied over as is.

    Args:
        annotation (`Dict`): The annotation to normalize.
        image_size (`Tuple[int, int]`): The image size, as `(height, width)`.
    """
    image_height, image_width = image_size
    normalized = {}
    for name, entry in annotation.items():
        if name != "boxes":
            # Only boxes depend on the image size.
            normalized[name] = entry
            continue
        center_boxes = corners_to_center_format(entry)
        # In-place division by the image dimensions.
        center_boxes /= np.asarray([image_width, image_height, image_width, image_height], dtype=np.float32)
        normalized[name] = center_boxes
    return normalized
# Copied from transformers.models.detr.image_processing_detr.max_across_indices
def max_across_indices(values: Iterable[Any]) -> List[Any]:
    """
    Return, for each position, the largest element found at that position across all items of `values`.
    """
    # zip(*values) groups the elements by position; max is then applied to each group.
    return list(map(max, zip(*values)))


# Copied from transformers.models.detr.image_processing_detr.get_max_height_width
def get_max_height_width(
    images: List[np.ndarray], input_data_format: Optional[Union[str, ChannelDimension]] = None
) -> List[int]:
    """
    Get the maximum height and width across all images in a batch.

    When `input_data_format` is not given, it is inferred from the first image.

    Raises:
        ValueError: If the channel dimension format is neither channels-first nor channels-last.
    """
    if input_data_format is None:
        input_data_format = infer_channel_dimension_format(images[0])

    # Reject unknown formats before looking at any image shape.
    if input_data_format not in (ChannelDimension.FIRST, ChannelDimension.LAST):
        raise ValueError(f"Invalid channel dimension format: {input_data_format}")

    per_axis_max = max_across_indices([img.shape for img in images])
    if input_data_format == ChannelDimension.FIRST:
        # Shapes are (channels, height, width).
        _, max_height, max_width = per_axis_max
    else:
        # Shapes are (height, width, channels).
        max_height, max_width, _ = per_axis_max
    return (max_height, max_width)


# Copied from transformers.models.detr.image_processing_detr.make_pixel_mask
def make_pixel_mask(
    image: np.ndarray, output_size: Tuple[int, int], input_data_format: Optional[Union[str, ChannelDimension]] = None
) -> np.ndarray:
    """
    Make a pixel mask for the image, where 1 indicates a valid pixel and 0 indicates padding.

    Args:
        image (`np.ndarray`):
            Image to make the pixel mask for.
        output_size (`Tuple[int, int]`):
            Output size of the mask.
        input_data_format (`ChannelDimension` or `str`, *optional*):
            Channel dimension format passed on to `get_image_size`.

    Returns:
        `np.ndarray`: An `int64` array of shape `output_size`.
    """
    valid_height, valid_width = get_image_size(image, channel_dim=input_data_format)
    # Start fully padded, then mark the top-left region covered by the image as valid.
    pixel_mask = np.zeros(output_size, dtype=np.int64)
    pixel_mask[:valid_height, :valid_width] = 1
    return pixel_mask


# Copied from transformers.models.detr.image_processing_detr.convert_coco_poly_to_mask
def convert_coco_poly_to_mask(segmentations, height: int, width: int) -> np.ndarray:
    """
    Convert a COCO polygon annotation to a mask.

    Args:
        segmentations (`List[List[float]]`):
            List of polygons, each polygon represented by a list of x-y coordinates.
        height (`int`):
            Height of the mask.
        width (`int`):
            Width of the mask.

    Returns:
        `np.ndarray`: One mask per entry of `segmentations`, stacked along the first axis. When `segmentations`
        is empty, an empty `uint8` array of shape `(0, height, width)` is returned.

    Raises:
        ImportError: If `pycocotools` is not installed.
    """
    try:
        from pycocotools import mask as coco_mask
    except ImportError:
        raise ImportError("Pycocotools is not installed in your environment.")

    per_object_masks = []
    for polygons in segmentations:
        decoded = coco_mask.decode(coco_mask.frPyObjects(polygons, height, width))
        if len(decoded.shape) < 3:
            # Add a trailing axis so the reduction below always runs over axis 2.
            decoded = decoded[..., None]
        # Merge along the last axis: a pixel is set if it is set in any slice.
        per_object_masks.append(np.any(np.asarray(decoded, dtype=np.uint8), axis=2))

    if not per_object_masks:
        return np.zeros((0, height, width), dtype=np.uint8)
    return np.stack(per_object_masks, axis=0)
# 从transformers.models.detr.image_processing_detr.prepare_coco_detection_annotation中复制，用于准备COCO格式的检测注释，转换为DETA所需的格式
def prepare_coco_detection_annotation(
    image,
    target,
    return_segmentation_masks: bool = False,
    input_data_format: Optional[Union[ChannelDimension, str]] = None,
):
    """
    Convert the target in COCO format into the format expected by DETA.

    Args:
        image: The image the annotations belong to; only its height and width are used.
        target: Dictionary with an `"image_id"` entry and an `"annotations"` list of COCO objects.
        return_segmentation_masks (`bool`, *optional*, defaults to `False`):
            Whether to also convert each object's `"segmentation"` into a mask.
        input_data_format (`ChannelDimension` or `str`, *optional*):
            Channel dimension format passed on to `get_image_size`.

    Returns:
        A dictionary with `image_id`, `class_labels`, `boxes`, `area`, `iscrowd` and `orig_size`, plus `keypoints`
        and `masks` when applicable.
    """
    image_height, image_width = get_image_size(image, channel_dim=input_data_format)

    image_id = np.asarray([target["image_id"]], dtype=np.int64)

    # Keep only the objects that are not crowd regions; a missing "iscrowd" field counts as not crowd.
    objects = [obj for obj in target["annotations"] if "iscrowd" not in obj or obj["iscrowd"] == 0]

    classes = np.asarray([obj["category_id"] for obj in objects], dtype=np.int64)

    # For conversion to the COCO API.
    area = np.asarray([obj["area"] for obj in objects], dtype=np.float32)
    iscrowd = np.asarray([obj["iscrowd"] if "iscrowd" in obj else 0 for obj in objects], dtype=np.int64)

    boxes = np.asarray([obj["bbox"] for obj in objects], dtype=np.float32).reshape(-1, 4)
    # Add the first two columns onto the last two (presumably turning COCO's x, y, width, height into corners),
    # then clip the coordinates to the image bounds.
    boxes[:, 2:] += boxes[:, :2]
    boxes[:, 0::2] = boxes[:, 0::2].clip(min=0, max=image_width)
    boxes[:, 1::2] = boxes[:, 1::2].clip(min=0, max=image_height)

    # A box is valid only if it still has a strictly positive width and height after clipping.
    keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0])

    new_target = {
        "image_id": image_id,
        "class_labels": classes[keep],
        "boxes": boxes[keep],
        "area": area[keep],
        "iscrowd": iscrowd[keep],
        "orig_size": np.asarray([int(image_height), int(image_width)], dtype=np.int64),
    }

    # Keypoints are handled only when the first remaining object carries them.
    if objects and "keypoints" in objects[0]:
        keypoints = np.asarray([obj["keypoints"] for obj in objects], dtype=np.float32)
        # Drop the keypoints of the objects whose box was discarded.
        keypoints = keypoints[keep]
        if keypoints.shape[0]:
            keypoints = keypoints.reshape((-1, 3))
        new_target["keypoints"] = keypoints

    if return_segmentation_masks:
        polygons = [obj["segmentation"] for obj in objects]
        masks = convert_coco_poly_to_mask(polygons, image_height, image_width)
        new_target["masks"] = masks[keep]

    return new_target


# 从transformers.models.detr.image_processing_detr.masks_to_boxes中复制，用于计算提供的全景分割掩码周围的边界框
def masks_to_boxes(masks: np.ndarray) -> np.ndarray:
    """
    Compute the bounding boxes around the provided panoptic segmentation masks.

    Args:
        masks: masks in format `[number_masks, height, width]` where N is the number of masks

    Returns:
        boxes: bounding boxes in format `[number_masks, 4]` in xyxy format
    """
    if masks.size == 0:
        return np.zeros((0, 4))

    height, width = masks.shape[-2:]
    # Coordinate grids of shape (height, width); "ij" indexing keeps rows first,
    # see https://github.com/pytorch/pytorch/issues/50276
    rows, cols = np.meshgrid(
        np.arange(0, height, dtype=np.float32), np.arange(0, width, dtype=np.float32), indexing="ij"
    )
    outside = ~(np.array(masks, dtype=bool))

    def _min_max(grid):
        # Weight every mask by the coordinate grid, so pixels outside the mask become 0.
        weighted = masks * np.expand_dims(grid, axis=0)
        highest = weighted.reshape(weighted.shape[0], -1).max(-1)
        # For the minimum, pixels outside the mask are replaced by a large value so they never win.
        filled = np.ma.array(weighted, mask=outside).filled(fill_value=1e8)
        lowest = filled.reshape(filled.shape[0], -1).min(-1)
        return lowest, highest

    x_min, x_max = _min_max(cols)
    y_min, y_max = _min_max(rows)

    return np.stack([x_min, y_min, x_max, y_max], 1)
# Copied from transformers.models.detr.image_processing_detr.prepare_coco_panoptic_annotation with DETR->DETA
def prepare_coco_panoptic_annotation(
    image: np.ndarray,
    target: Dict,
    masks_path: Union[str, pathlib.Path],
    return_masks: bool = True,
    input_data_format: Union[ChannelDimension, str] = None,
) -> Dict:
    """
    Prepare a coco panoptic annotation for DETA.

    Args:
        image (`np.ndarray`): The image the annotation belongs to; only its height and width are used.
        target (`Dict`): The panoptic annotation. `"file_name"` is always read; `"segments_info"` is optional.
        masks_path (`str` or `pathlib.Path`): Folder that contains the file named by `target["file_name"]`.
        return_masks (`bool`, *optional*, defaults to `True`): Whether to include the per-segment masks.
        input_data_format (`ChannelDimension` or `str`, *optional*):
            Channel dimension format passed on to `get_image_size`.

    Returns:
        `Dict`: Always contains `image_id`, `size` and `orig_size`. When `target` has `"segments_info"`, it also
        contains `boxes`, `class_labels`, `iscrowd`, `area` and, if requested, `masks`.
    """
    image_height, image_width = get_image_size(image, channel_dim=input_data_format)
    annotation_path = pathlib.Path(masks_path) / target["file_name"]

    # Prefer "image_id" and fall back to "id" when it is absent.
    raw_image_id = target["image_id"] if "image_id" in target else target["id"]
    new_target = {
        "image_id": np.asarray([raw_image_id], dtype=np.int64),
        "size": np.asarray([image_height, image_width], dtype=np.int64),
        "orig_size": np.asarray([image_height, image_width], dtype=np.int64),
    }

    if "segments_info" not in target:
        return new_target

    # Read the annotation image and convert its colours to segment ids.
    id_map = rgb_to_id(np.asarray(PIL.Image.open(annotation_path), dtype=np.uint32))

    # One binary mask per segment, obtained by comparing the id map against every segment id.
    segment_ids = np.array([segment_info["id"] for segment_info in target["segments_info"]])
    masks = (id_map == segment_ids[:, None, None]).astype(np.uint8)
    if return_masks:
        new_target["masks"] = masks
    new_target["boxes"] = masks_to_boxes(masks)
    new_target["class_labels"] = np.array(
        [segment_info["category_id"] for segment_info in target["segments_info"]], dtype=np.int64
    )
    new_target["iscrowd"] = np.asarray(
        [segment_info["iscrowd"] for segment_info in target["segments_info"]], dtype=np.int64
    )
    new_target["area"] = np.asarray(
        [segment_info["area"] for segment_info in target["segments_info"]], dtype=np.float32
    )

    return new_target


# Copied from transformers.models.detr.image_processing_detr.resize_annotation
def resize_annotation(
    annotation: Dict[str, Any],
    orig_size: Tuple[int, int],
    target_size: Tuple[int, int],
    threshold: float = 0.5,
    resample: PILImageResampling = PILImageResampling.NEAREST,
):
    """
    Resizes an annotation to a target size.

    Args:
        annotation (`Dict[str, Any]`):
            The annotation dictionary.
        orig_size (`Tuple[int, int]`):
            The original size of the input image.
        target_size (`Tuple[int, int]`):
            The target size of the image, as returned by the preprocessing `resize` step.
        threshold (`float`, *optional*, defaults to 0.5):
            The threshold used to binarize the segmentation masks.
        resample (`PILImageResampling`, defaults to `PILImageResampling.NEAREST`):
            The resampling filter to use when resizing the masks.

    Returns:
        A new dictionary whose `"size"` is `target_size`, with `"boxes"`, `"area"` and `"masks"` rescaled and
        every other entry copied over unchanged.
    """
    # Per-axis scale factors, in the same (height, width) order as the two sizes.
    ratio_height, ratio_width = [float(new) / float(old) for new, old in zip(target_size, orig_size)]

    resized = {"size": target_size}
    for key, value in annotation.items():
        if key == "boxes":
            # Box coordinates alternate between the two axes, so the factors alternate as well.
            box_scale = np.asarray([ratio_width, ratio_height, ratio_width, ratio_height], dtype=np.float32)
            resized["boxes"] = value * box_scale
        elif key == "area":
            resized["area"] = value * (ratio_width * ratio_height)
        elif key == "masks":
            # Each mask gets an extra leading axis for `resize`; it is dropped again before thresholding.
            resized_masks = np.array([resize(mask, target_size, resample=resample) for mask in value[:, None]])
            resized_masks = resized_masks.astype(np.float32)
            resized["masks"] = resized_masks[:, 0] > threshold
        elif key == "size":
            resized["size"] = target_size
        else:
            resized[key] = value

    return resized
class DetaImageProcessor(BaseImageProcessor):
    r"""
    Constructs a DETA image processor.

    Args:
        format (`str`, *optional*, defaults to `"coco_detection"`):
            Data format of the annotations. One of "coco_detection" or "coco_panoptic".
        do_resize (`bool`, *optional*, defaults to `True`):
            Controls whether to resize the image's (height, width) dimensions to the specified `size`. Can be
            overridden by the `do_resize` parameter in the `preprocess` method.
        size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 800, "longest_edge": 1333}`):
            Size of the image's (height, width) dimensions after resizing. Can be overridden by the `size` parameter in
            the `preprocess` method.
        resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
            Resampling filter to use if resizing the image.
        do_rescale (`bool`, *optional*, defaults to `True`):
            Controls whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the
            `do_rescale` parameter in the `preprocess` method.
        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
            Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the
            `preprocess` method.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Controls whether to normalize the image. Can be overridden by the `do_normalize` parameter in the
            `preprocess` method.
        image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_MEAN`):
            Mean values to use when normalizing the image. Can be a single value or a list of values, one for each
            channel. Can be overridden by the `image_mean` parameter in the `preprocess` method.
        image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_STD`):
            Standard deviation values to use when normalizing the image. Can be a single value or a list of values, one
            for each channel. Can be overridden by the `image_std` parameter in the `preprocess` method.
        do_convert_annotations (`bool`, *optional*, defaults to `True`):
            Controls whether to convert the annotations to the format expected by the DETR model. Converts the
            bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
            Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
        do_pad (`bool`, *optional*, defaults to `True`):
            Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess`
            method. If `True` will pad the images in the batch to the largest height and width in the batch.
            Padding will be applied to the bottom and right of the image with zeros.
    """
    # 定义模型输入的名称列表
    model_input_names = ["pixel_values", "pixel_mask"]

    # 初始化函数，设置数据处理的参数和选项
    def __init__(
        self,
        format: Union[str, AnnotationFormat] = AnnotationFormat.COCO_DETECTION,
        do_resize: bool = True,
        size: Dict[str, int] = None,
        resample: PILImageResampling = PILImageResampling.BILINEAR,
        do_rescale: bool = True,
        rescale_factor: Union[int, float] = 1 / 255,
        do_normalize: bool = True,
        image_mean: Union[float, List[float]] = None,
        image_std: Union[float, List[float]] = None,
        do_convert_annotations: bool = True,
        do_pad: bool = True,
        **kwargs,
    ) -> None:
        """
        Record the default preprocessing options on the instance.

        Each argument is stored under an attribute of the same name. `size`, `image_mean` and `image_std`
        fall back to `{"shortest_edge": 800, "longest_edge": 1333}`, `IMAGENET_DEFAULT_MEAN` and
        `IMAGENET_DEFAULT_STD` respectively when they are `None`. Remaining keyword arguments are forwarded
        to the parent constructor.
        """
        # Legacy keyword: when supplied it overrides `do_pad` and is removed from `kwargs`
        # so that it is not forwarded to the parent constructor.
        do_pad = kwargs.pop("pad_and_return_pixel_mask", do_pad)

        if size is None:
            size = {"shortest_edge": 800, "longest_edge": 1333}
        size = get_size_dict(size, default_to_square=False)

        # An explicit `None` makes annotation conversion follow the normalization flag.
        if do_convert_annotations is None:
            do_convert_annotations = do_normalize

        super().__init__(**kwargs)

        if image_mean is None:
            image_mean = IMAGENET_DEFAULT_MEAN
        if image_std is None:
            image_std = IMAGENET_DEFAULT_STD

        # Attribute assignment order is kept as in the original implementation.
        self.format = format
        self.do_resize = do_resize
        self.size = size
        self.resample = resample
        self.do_rescale = do_rescale
        self.rescale_factor = rescale_factor
        self.do_normalize = do_normalize
        self.do_convert_annotations = do_convert_annotations
        self.image_mean = image_mean
        self.image_std = image_std
        self.do_pad = do_pad

    # 从 DETR 模型中复制的函数，用于准备注释信息，根据参数设置返回特定格式的注释
    def prepare_annotation(
        self,
        image: np.ndarray,
        target: Dict,
        format: Optional[AnnotationFormat] = None,
        return_segmentation_masks: bool = None,
        masks_path: Optional[Union[str, pathlib.Path]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ) -> Dict:
        """
        Prepare an annotation for feeding into DETA model.

        `format` falls back to `self.format` when `None`. For `COCO_DETECTION` the annotation is built by
        `prepare_coco_detection_annotation` (segmentation masks off unless requested); for `COCO_PANOPTIC` by
        `prepare_coco_panoptic_annotation` (segmentation masks on unless disabled).

        Raises:
            ValueError: if `format` is neither of the two supported annotation formats.
        """
        if format is None:
            format = self.format

        if format == AnnotationFormat.COCO_DETECTION:
            # Detection annotations do not carry masks unless the caller asks for them.
            if return_segmentation_masks is None:
                return_segmentation_masks = False
            return prepare_coco_detection_annotation(
                image, target, return_segmentation_masks, input_data_format=input_data_format
            )

        if format == AnnotationFormat.COCO_PANOPTIC:
            # Panoptic annotations return masks unless the caller opts out.
            if return_segmentation_masks is None:
                return_segmentation_masks = True
            return prepare_coco_panoptic_annotation(
                image,
                target,
                masks_path=masks_path,
                return_masks=return_segmentation_masks,
                input_data_format=input_data_format,
            )

        raise ValueError(f"Format {format} is not supported.")

    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.prepare
    def prepare(self, image, target, return_segmentation_masks=None, masks_path=None):
        """
        Deprecated wrapper around `prepare_annotation`.

        Args:
            image: Image the annotation belongs to; returned unchanged.
            target: Raw annotation to convert.
            return_segmentation_masks: Forwarded to `prepare_annotation`.
            masks_path: Forwarded to `prepare_annotation`.

        Returns:
            A `(image, target)` tuple where `target` is the result of `prepare_annotation`.
        """
        logger.warning_once(
            "The `prepare` method is deprecated and will be removed in a v4.33. "
            "Please use `prepare_annotation` instead. Note: the `prepare_annotation` method "
            "does not return the image anymore.",
        )
        # Bind by keyword: `prepare_annotation` takes `format` as its third positional parameter, so
        # passing these values positionally would shift each of them into the wrong parameter.
        target = self.prepare_annotation(
            image,
            target,
            format=self.format,
            return_segmentation_masks=return_segmentation_masks,
            masks_path=masks_path,
        )
        return image, target

    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.convert_coco_poly_to_mask
    def convert_coco_poly_to_mask(self, *args, **kwargs):
        """Deprecated method; logs a one-time warning and delegates to the module-level `convert_coco_poly_to_mask`."""
        deprecation_notice = "The `convert_coco_poly_to_mask` method is deprecated and will be removed in v4.33. "
        logger.warning_once(deprecation_notice)
        result = convert_coco_poly_to_mask(*args, **kwargs)
        return result

    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.prepare_coco_detection
    def prepare_coco_detection(self, *args, **kwargs):
        """Deprecated method; logs a one-time warning and delegates to `prepare_coco_detection_annotation`."""
        deprecation_notice = "The `prepare_coco_detection` method is deprecated and will be removed in v4.33. "
        logger.warning_once(deprecation_notice)
        result = prepare_coco_detection_annotation(*args, **kwargs)
        return result

    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.prepare_coco_panoptic
    def prepare_coco_panoptic(self, *args, **kwargs):
        """Deprecated method; logs a one-time warning and delegates to `prepare_coco_panoptic_annotation`."""
        deprecation_notice = "The `prepare_coco_panoptic` method is deprecated and will be removed in v4.33. "
        logger.warning_once(deprecation_notice)
        result = prepare_coco_panoptic_annotation(*args, **kwargs)
        return result

    def resize(
        self,
        image: np.ndarray,
        size: Dict[str, int],
        resample: PILImageResampling = PILImageResampling.BILINEAR,
        data_format: Optional[ChannelDimension] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,
    ) -> np.ndarray:
        """
        Resize `image` according to `size`.

        Args:
            image (`np.ndarray`):
                Image to resize.
            size (`Dict[str, int]`):
                Requested output size. Must contain either `shortest_edge` and `longest_edge` (the output size
                is then computed by `get_resize_output_image_size`) or `height` and `width` (used as is).
            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
                Resampling filter to use if resizing the image.
            data_format (`ChannelDimension`, *optional*):
                Channel dimension format for the output image, forwarded to the module-level `resize`.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                Channel dimension format of the input image, forwarded to the helpers.

        Raises:
            ValueError: if `size` contains neither of the two supported key pairs.
        """
        size = get_size_dict(size, default_to_square=False)

        uses_edge_limits = "shortest_edge" in size and "longest_edge" in size
        uses_explicit_dims = "height" in size and "width" in size

        if not uses_edge_limits and not uses_explicit_dims:
            raise ValueError(
                "Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got"
                f" {size.keys()}."
            )

        if uses_edge_limits:
            # The edge-limit form takes precedence when both key pairs are present.
            output_size = get_resize_output_image_size(
                image, size["shortest_edge"], size["longest_edge"], input_data_format=input_data_format
            )
        else:
            output_size = (size["height"], size["width"])

        return resize(
            image,
            size=output_size,
            resample=resample,
            data_format=data_format,
            input_data_format=input_data_format,
        )

    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.resize_annotation
    def resize_annotation(
        self,
        annotation,
        orig_size,
        size,
        resample: PILImageResampling = PILImageResampling.NEAREST,
    ) -> Dict:
        """
        Resize an annotation so that it matches a resized image.

        Thin wrapper: `size` is passed as `target_size` to the module-level `resize_annotation`, together with
        `orig_size` and `resample`.
        """
        resized_annotation = resize_annotation(
            annotation,
            orig_size=orig_size,
            target_size=size,
            resample=resample,
        )
        return resized_annotation

    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.rescale
    def rescale(
        self,
        image: np.ndarray,
        rescale_factor: float,
        data_format: Optional[Union[str, ChannelDimension]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ) -> np.ndarray:
        """
        Rescale the image by the given factor (documented as `image = image * rescale_factor`).

        Thin wrapper around the module-level `rescale`.

        Args:
            image (`np.ndarray`):
                Image to rescale.
            rescale_factor (`float`):
                The value to use for rescaling.
            data_format (`str` or `ChannelDimension`, *optional*):
                Channel dimension format for the output image, e.g. `"channels_first"` /
                `ChannelDimension.FIRST` or `"channels_last"` / `ChannelDimension.LAST`. Forwarded unchanged.
            input_data_format (`str` or `ChannelDimension`, *optional*):
                Channel dimension format of the input image, same accepted values. Forwarded unchanged.
        """
        rescaled_image = rescale(
            image,
            rescale_factor,
            data_format=data_format,
            input_data_format=input_data_format,
        )
        return rescaled_image

    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.normalize_annotation
    def normalize_annotation(self, annotation: Dict, image_size: Tuple[int, int]) -> Dict:
        """
        Normalize the boxes of an annotation by delegating to the module-level `normalize_annotation`.

        Documented contract: boxes go from `[top_left_x, top_left_y, bottom_right_x, bottom_right_y]` to
        `[center_x, center_y, width, height]` format and from absolute to relative pixel values.
        """
        normalized_annotation = normalize_annotation(annotation, image_size=image_size)
        return normalized_annotation

    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._update_annotation_for_padded_image
    def _update_annotation_for_padded_image(
        self,
        annotation: Dict,
        input_image_size: Tuple[int, int],
        output_image_size: Tuple[int, int],
        padding,  # `padding` parameter for handling padding information
        update_bboxes,  # `update_bboxes` parameter to control updating bounding boxes
    ) -> Dict:
        """
        Update the annotation for a padded image.
        """
        new_annotation = {}
        new_annotation["size"] = output_image_size  # 初始化新的注释字典，设置图像大小为输出图像大小

        for key, value in annotation.items():  # 遍历注释字典中的每个键值对
            if key == "masks":
                masks = value
                masks = pad(
                    masks,
                    padding,
                    mode=PaddingMode.CONSTANT,  # 使用常量填充模式
                    constant_values=0,  # 填充值为0
                    input_data_format=ChannelDimension.FIRST,  # 输入数据格式为通道维度在前
                )
                masks = safe_squeeze(masks, 1)  # 压缩维度为1的维度
                new_annotation["masks"] = masks  # 更新注释字典中的masks项为填充后的masks
            elif key == "boxes" and update_bboxes:
                boxes = value
                boxes *= np.asarray(
                    [
                        input_image_size[1] / output_image_size[1],  # 调整框的水平位置
                        input_image_size[0] / output_image_size[0],  # 调整框的垂直位置
                        input_image_size[1] / output_image_size[1],  # 调整框的水平大小
                        input_image_size[0] / output_image_size[0],  # 调整框的垂直大小
                    ]
                )
                new_annotation["boxes"] = boxes  # 更新注释字典中的boxes项为调整后的boxes
            elif key == "size":
                new_annotation["size"] = output_image_size  # 更新注释字典中的size项为输出图像大小
            else:
                new_annotation[key] = value  # 其他键直接复制到新的注释字典中
        return new_annotation  # 返回更新后的注释字典

    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._pad_image
    def _pad_image(
        self,
        image: np.ndarray,
        output_size: Tuple[int, int],
        annotation: Optional[Dict[str, Any]] = None,
        constant_values: Union[float, Iterable[float]] = 0,
        data_format: Optional[ChannelDimension] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        update_bboxes: bool = True,
    ) -> np.ndarray:
        """
        Pad an image on the bottom and right up to `output_size`, filling with `constant_values`.

        Returns a `(padded_image, annotation)` tuple (note: the return annotation on the signature only
        names the image). `annotation` is passed through `_update_annotation_for_padded_image` when it is
        not `None`, and returned as `None` otherwise.
        """
        in_height, in_width = get_image_size(image, channel_dim=input_data_format)
        out_height, out_width = output_size

        # ((top, bottom), (left, right)): nothing is added on the top/left side.
        padding = ((0, out_height - in_height), (0, out_width - in_width))

        padded_image = pad(
            image,
            padding,
            mode=PaddingMode.CONSTANT,
            constant_values=constant_values,
            data_format=data_format,
            input_data_format=input_data_format,
        )

        if annotation is None:
            return padded_image, annotation

        updated_annotation = self._update_annotation_for_padded_image(
            annotation, (in_height, in_width), (out_height, out_width), padding, update_bboxes
        )
        return padded_image, updated_annotation

    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.pad
    # 定义一个方法用于填充图像数据，支持多种参数选项
    def pad(
        self,
        images: List[np.ndarray],  # 输入参数：图像列表，每个元素是一个 NumPy 数组
        annotations: Optional[Union[AnnotationType, List[AnnotationType]]] = None,  # 可选参数：注解数据，可以是单个注解或注解列表
        constant_values: Union[float, Iterable[float]] = 0,  # 填充常数值，可以是单个浮点数或可迭代对象
        return_pixel_mask: bool = True,  # 是否返回像素掩码，默认为 True
        return_tensors: Optional[Union[str, TensorType]] = None,  # 可选参数：返回的数据类型，可以是字符串或张量类型
        data_format: Optional[ChannelDimension] = None,  # 数据格式，通道维度的定义
        input_data_format: Optional[Union[str, ChannelDimension]] = None,  # 输入数据的格式，可以是字符串或通道维度对象
        update_bboxes: bool = True,  # 是否更新边界框信息，默认为 True
    ):
    
    # 定义一个方法用于预处理图像和注解数据
    def preprocess(
        self,
        images: ImageInput,  # 输入参数：图像数据，可以是单张图像或图像列表
        annotations: Optional[Union[List[Dict], List[List[Dict]]]] = None,  # 可选参数：注解数据，可以是字典列表或嵌套的字典列表
        return_segmentation_masks: bool = None,  # 是否返回分割掩码，根据情况自动设定
        masks_path: Optional[Union[str, pathlib.Path]] = None,  # 可选参数：分割掩码的路径，可以是字符串或路径对象
        do_resize: Optional[bool] = None,  # 是否调整图像大小，根据情况自动设定
        size: Optional[Dict[str, int]] = None,  # 图像大小的目标尺寸，字典形式
        resample=None,  # PIL 图像重采样方法的选项
        do_rescale: Optional[bool] = None,  # 是否进行图像重新缩放，根据情况自动设定
        rescale_factor: Optional[Union[int, float]] = None,  # 图像缩放因子，可以是整数或浮点数
        do_normalize: Optional[bool] = None,  # 是否对图像进行标准化，根据情况自动设定
        image_mean: Optional[Union[float, List[float]]] = None,  # 图像的均值，可以是单个数值或列表
        image_std: Optional[Union[float, List[float]]] = None,  # 图像的标准差，可以是单个数值或列表
        do_convert_annotations: Optional[bool] = None,  # 是否转换注解数据的格式，根据情况自动设定
        do_pad: Optional[bool] = None,  # 是否进行图像填充，根据情况自动设定
        format: Optional[Union[str, AnnotationFormat]] = None,  # 注解数据的格式，可以是字符串或注解格式对象
        return_tensors: Optional[Union[TensorType, str]] = None,  # 返回的数据类型，可以是张量类型或字符串
        data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,  # 数据格式，通道维度的定义
        input_data_format: Optional[Union[str, ChannelDimension]] = None,  # 输入数据的格式，可以是字符串或通道维度对象
        **kwargs,  # 其他未明确列出的参数
    ):
    
    # 定义一个方法用于目标检测后处理
    def post_process_object_detection(
        self,
        outputs,  # 输入参数：模型输出的数据
        threshold: float = 0.5,  # 置信度阈值，默认为 0.5
        target_sizes: Union[TensorType, List[Tuple]] = None,  # 目标大小，可以是张量类型或元组列表
        nms_threshold: float = 0.7,  # 非最大抑制的阈值，默认为 0.7

`.\models\deta\modeling_deta.py`

# 设置编码格式为 UTF-8
# 版权声明，指明 SenseTime 和 The HuggingFace Inc. 团队的所有权，保留所有权利
#
# 根据 Apache 许可证 2.0 版本使用本文件，除非符合许可证中的条款，否则不得使用本文件
# 您可以在以下网址获取许可证的副本:
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# 除非适用法律要求或书面同意，否则按“原样”分发软件
# 软件没有任何形式的担保或条件，明示或暗示
# 有关特定语言的权限，请参阅许可证

""" PyTorch DETA model. """

# 导入所需的库和模块
import copy
import math
import os
import warnings
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Union

import torch
import torch.nn.functional as F
from torch import Tensor, nn
from torch.autograd import Function
from torch.autograd.function import once_differentiable

# 导入自定义模块和函数
from ...activations import ACT2FN
from ...file_utils import (
    ModelOutput,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    is_scipy_available,
    is_torch_cuda_available,
    is_vision_available,
    replace_return_docstrings,
)
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask
from ...modeling_outputs import BaseModelOutput
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import meshgrid
from ...utils import is_accelerate_available, is_ninja_available, is_torchvision_available, logging, requires_backends
from ...utils.backbone_utils import load_backbone
from .configuration_deta import DetaConfig

# 获取日志记录器
logger = logging.get_logger(__name__)

# 多尺度可变形注意力模块的全局变量
MultiScaleDeformableAttention = None


# Copied from models.deformable_detr.load_cuda_kernels
def load_cuda_kernels():
    """
    Build and load the multi-scale deformable attention extension.

    The sources are taken from the `kernels/deta` directory located three levels above this file. The
    module returned by `torch.utils.cpp_extension.load` is bound to the module-level
    `MultiScaleDeformableAttention` name, which `MultiScaleDeformableAttentionFunction` calls into.
    """
    from torch.utils.cpp_extension import load

    global MultiScaleDeformableAttention

    root = Path(__file__).resolve().parent.parent.parent / "kernels" / "deta"
    src_files = [
        root / filename
        for filename in [
            "vision.cpp",
            os.path.join("cpu", "ms_deform_attn_cpu.cpp"),
            os.path.join("cuda", "ms_deform_attn_cuda.cu"),
        ]
    ]

    # Keep the loaded module: without this assignment the `global` declaration above has no effect
    # and `MultiScaleDeformableAttention` stays `None`.
    MultiScaleDeformableAttention = load(
        "MultiScaleDeformableAttention",
        src_files,
        with_cuda=True,
        extra_include_paths=[str(root)],
        extra_cflags=["-DWITH_CUDA=1"],
        extra_cuda_cflags=[
            "-DCUDA_HAS_FP16=1",
            "-D__CUDA_NO_HALF_OPERATORS__",
            "-D__CUDA_NO_HALF_CONVERSIONS__",
            "-D__CUDA_NO_HALF2_OPERATORS__",
        ],
    )


# Copied from transformers.models.deformable_detr.modeling_deformable_detr.MultiScaleDeformableAttentionFunction
class MultiScaleDeformableAttentionFunction(Function):
    """Autograd function that dispatches to the loaded `MultiScaleDeformableAttention` extension module."""

    @staticmethod
    def forward(
        context,
        value,
        value_spatial_shapes,
        value_level_start_index,
        sampling_locations,
        attention_weights,
        im2col_step,
    ):
        """Run `ms_deform_attn_forward` and stash the tensor inputs plus `im2col_step` for the backward pass."""
        tensor_inputs = (
            value,
            value_spatial_shapes,
            value_level_start_index,
            sampling_locations,
            attention_weights,
        )
        context.im2col_step = im2col_step
        output = MultiScaleDeformableAttention.ms_deform_attn_forward(*tensor_inputs, context.im2col_step)
        context.save_for_backward(*tensor_inputs)
        return output

    @staticmethod
    @once_differentiable
    def backward(context, grad_output):
        """Run `ms_deform_attn_backward`; one gradient slot is returned per `forward` input."""
        saved_inputs = tuple(context.saved_tensors)
        gradients = MultiScaleDeformableAttention.ms_deform_attn_backward(
            *saved_inputs,
            grad_output,
            context.im2col_step,
        )
        grad_value, grad_sampling_loc, grad_attn_weight = gradients

        # No gradient for the spatial shapes, the level start index, or `im2col_step`.
        return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None
# `PartialState` and `reduce` are imported only when accelerate is available.
if is_accelerate_available():
    from accelerate import PartialState
    from accelerate.utils import reduce

# `center_to_corners_format` is imported only when the vision dependencies are available.
if is_vision_available():
    from transformers.image_transforms import center_to_corners_format

# `batched_nms` is imported only when torchvision is available.
if is_torchvision_available():
    from torchvision.ops.boxes import batched_nms

# `linear_sum_assignment` is imported only when SciPy is available.
if is_scipy_available():
    from scipy.optimize import linear_sum_assignment

# Module-level logger (this re-binds `logger`, which is already assigned earlier in this file).
logger = logging.get_logger(__name__)

# Config class name used in generated documentation.
_CONFIG_FOR_DOC = "DetaConfig"

# Checkpoint name used in generated documentation.
_CHECKPOINT_FOR_DOC = "jozhang97/deta-swin-large-o365"

# Known pretrained DETA checkpoints.
DETA_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "jozhang97/deta-swin-large-o365",
    # See all DETA models at https://huggingface.co/models?filter=deta
]

@dataclass
# Copied from DeformableDetrDecoderOutput and adapted for the DETA decoder.
class DetaDecoderOutput(ModelOutput):
    """
    Base class for outputs of the DetaDecoder. It is described as BaseModelOutputWithCrossAttentions plus two
    extra attributes:
    - a stacked tensor of intermediate decoder hidden states (i.e. the output of each decoder layer)
    - a stacked tensor of intermediate reference points.
    """

    # NOTE(review): the shapes quoted below come from the existing field notes; this class does not
    # enforce them — confirm against the decoder implementation.

    # Hidden states at the output of the last layer; documented as (batch_size, sequence_length, hidden_size).
    last_hidden_state: torch.FloatTensor = None

    # Stacked intermediate hidden states, one per decoder layer; documented as
    # (batch_size, config.decoder_layers, num_queries, hidden_size).
    intermediate_hidden_states: torch.FloatTensor = None

    # Stacked intermediate reference points, one per decoder layer; documented as
    # (batch_size, config.decoder_layers, sequence_length, hidden_size).
    intermediate_reference_points: torch.FloatTensor = None

    # Optional tuple of hidden states (embedding output plus one entry per layer).
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None

    # Optional tuple of self-attention weights, one entry per decoder layer.
    attentions: Optional[Tuple[torch.FloatTensor]] = None

    # Optional tuple of cross-attention weights, one entry per decoder layer.
    cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
# Container for the outputs of the Deformable DETR style encoder-decoder model.
@dataclass
class DetaModelOutput(ModelOutput):
    """
    Base class for outputs of the Deformable DETR encoder-decoder model.

    """

    # Initial reference points.
    init_reference_points: torch.FloatTensor = None
    # Last hidden state.
    last_hidden_state: torch.FloatTensor = None
    # Stacked intermediate hidden states.
    intermediate_hidden_states: torch.FloatTensor = None
    # Stacked intermediate reference points.
    intermediate_reference_points: torch.FloatTensor = None
    # Decoder hidden states (optional tuple of tensors).
    decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    # Decoder attention weights (optional tuple of tensors).
    decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
    # Cross-attention weights (optional tuple of tensors).
    cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
    # Last hidden state of the encoder (optional).
    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
    # Encoder hidden states (optional tuple of tensors).
    encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    # Encoder attention weights (optional tuple of tensors).
    encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
    # Class outputs produced on the encoder side (optional tensor).
    enc_outputs_class: Optional[torch.FloatTensor] = None
    # Box coordinate logits produced on the encoder side (optional tensor).
    enc_outputs_coord_logits: Optional[torch.FloatTensor] = None
    # Output proposals (optional tensor).
    output_proposals: Optional[torch.FloatTensor] = None


# Container for the outputs of DetaForObjectDetection.
@dataclass
class DetaObjectDetectionOutput(ModelOutput):
    """
    Output type of DetaForObjectDetection.

    """

    # Loss (optional float tensor).
    loss: Optional[torch.FloatTensor] = None
    # Dictionary of loss terms (optional).
    loss_dict: Optional[Dict] = None
    # Logits tensor.
    logits: torch.FloatTensor = None
    # Predicted boxes tensor.
    pred_boxes: torch.FloatTensor = None
    # Auxiliary outputs (optional list of dicts).
    auxiliary_outputs: Optional[List[Dict]] = None
    # Initial reference points (optional).
    init_reference_points: Optional[torch.FloatTensor] = None
    # Last hidden state (optional).
    last_hidden_state: Optional[torch.FloatTensor] = None
    # Stacked intermediate hidden states (optional).
    intermediate_hidden_states: Optional[torch.FloatTensor] = None
    # Stacked intermediate reference points (optional).
    intermediate_reference_points: Optional[torch.FloatTensor] = None
    # Decoder hidden states (optional tuple of tensors).
    decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    # Decoder attention weights (optional tuple of tensors).
    decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
    # Cross-attention weights (optional tuple of tensors).
    cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
    # Last hidden state of the encoder (optional).
    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
    # Encoder hidden states (optional tuple of tensors).
    encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    # Encoder attention weights (optional tuple of tensors).
    encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
    # Class outputs produced on the encoder side (optional; left untyped here).
    enc_outputs_class: Optional = None
    # Box coordinate logits produced on the encoder side (optional; left untyped here).
    enc_outputs_coord_logits: Optional = None
    # Output proposals (optional float tensor).
    output_proposals: Optional[torch.FloatTensor] = None


# 创建一个复制指定模块的列表函数。
def _get_clones(module, N):
    return nn.ModuleList([copy.deepcopy(module) for i in range(N)])


# Inverse of the sigmoid function (logit), with clamping.
def inverse_sigmoid(x, eps=1e-5):
    """
    Compute `log(x / (1 - x))` element-wise.

    `x` is first clamped to `[0, 1]`; numerator and denominator are then each clamped to at least `eps`
    so that the logarithm and the division stay finite at the boundaries.
    """
    bounded = torch.clamp(x, min=0, max=1)
    numerator = torch.clamp(bounded, min=eps)
    denominator = torch.clamp(1 - bounded, min=eps)
    return torch.log(numerator / denominator)


# DetaFrozenBatchNorm2d: copied from DetrFrozenBatchNorm2d with the names adapted to DETA.
class DetaFrozenBatchNorm2d(nn.Module):
    """
    BatchNorm2d where the batch statistics and the affine parameters are fixed.

    Copy-paste from torchvision.misc.ops with added eps before rsqrt, without which any other models than
    torchvision.models.resnet[18,34,50,101] produce nans.
    """

    def __init__(self, n):
        super().__init__()
        # Everything is a buffer (not a parameter). Registration order is kept:
        # weight, bias, running_mean, running_var.
        initial_values = (
            ("weight", torch.ones),
            ("bias", torch.zeros),
            ("running_mean", torch.zeros),
            ("running_var", torch.ones),
        )
        for buffer_name, make_tensor in initial_values:
            self.register_buffer(buffer_name, make_tensor(n))

    def _load_from_state_dict(
        self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
    ):
        # This module has no `num_batches_tracked` buffer; drop the entry so it is not seen by the parent loader.
        state_dict.pop(prefix + "num_batches_tracked", None)

        super()._load_from_state_dict(
            state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
        )

    def forward(self, x):
        # Reshape the per-channel buffers to (1, C, 1, 1) so they broadcast against `x`.
        per_channel = (1, -1, 1, 1)
        weight = self.weight.reshape(per_channel)
        bias = self.bias.reshape(per_channel)
        running_var = self.running_var.reshape(per_channel)
        running_mean = self.running_mean.reshape(per_channel)

        epsilon = 1e-5
        # Fold normalization and affine transform into a single multiply-add.
        scale = weight * (running_var + epsilon).rsqrt()
        shift = bias - running_mean * scale
        return x * scale + shift
# Copied from transformers.models.detr.modeling_detr.replace_batch_norm with Detr->Deta
def replace_batch_norm(model):
    r"""
    Recursively replace all `torch.nn.BatchNorm2d` with `DetaFrozenBatchNorm2d`.

    The replacement happens in place on `model`; nothing is returned.

    Args:
        model (torch.nn.Module):
            input model
    """
    copied_tensors = ("weight", "bias", "running_mean", "running_var")

    for child_name, child in model.named_children():
        if isinstance(child, nn.BatchNorm2d):
            frozen = DetaFrozenBatchNorm2d(child.num_features)

            # Values are copied unless the original weight lives on the "meta" device.
            if child.weight.device != torch.device("meta"):
                for tensor_name in copied_tensors:
                    getattr(frozen, tensor_name).data.copy_(getattr(child, tensor_name))

            model._modules[child_name] = frozen

        # Descend into the original child whenever it has sub-modules of its own.
        has_submodules = next(child.children(), None) is not None
        if has_submodules:
            replace_batch_norm(child)


class DetaBackboneWithPositionalEncodings(nn.Module):
    """
    Backbone model with position embeddings.

    nn.BatchNorm2d layers are replaced by DetaFrozenBatchNorm2d as defined above.
    """

    def __init__(self, config):
        super().__init__()

        # Load the backbone described by the config, then swap its BatchNorm2d layers for frozen ones.
        backbone = load_backbone(config)
        with torch.no_grad():
            replace_batch_norm(backbone)
        self.model = backbone

        # Channel sizes exposed by the backbone; read by whoever builds projections on top of it.
        self.intermediate_channel_sizes = self.model.channels

        # TODO fix this
        if config.backbone_config.model_type == "resnet":
            trainable_stages = ("stages.1", "stages.2", "stages.3")
            for param_name, param in self.model.named_parameters():
                # Freeze every parameter whose name mentions none of the listed stages.
                if not any(stage in param_name for stage in trainable_stages):
                    param.requires_grad_(False)

        self.position_embedding = build_position_encoding(config)

    def forward(self, pixel_values: torch.Tensor, pixel_mask: torch.Tensor):
        """
        Outputs feature maps of latter stages C_3 through C_5 in ResNet if `config.num_feature_levels > 1`, otherwise
        outputs feature maps of C_5.

        Returns:
            A pair `(out, pos)`: `out` is a list of `(feature_map, mask)` tuples and `pos` is the list of matching
            position embeddings, one entry per backbone feature map.
        """
        # First, send the pixel values through the backbone to obtain the list of feature maps.
        feature_maps = self.model(pixel_values).feature_maps

        features_with_masks = []
        position_embeddings_list = []
        for feature_map in feature_maps:
            # Downsample the pixel mask to the spatial size of this feature map.
            spatial_size = feature_map.shape[-2:]
            resized_mask = nn.functional.interpolate(pixel_mask[None].float(), size=spatial_size)
            mask = resized_mask.to(torch.bool)[0]

            # Position embeddings are cast to the dtype of the feature map they accompany.
            embeddings = self.position_embedding(feature_map, mask).to(feature_map.dtype)

            features_with_masks.append((feature_map, mask))
            position_embeddings_list.append(embeddings)

        return features_with_masks, position_embeddings_list


# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrSinePositionEmbedding with DeformableDetr->Deta
class DetaSinePositionEmbedding(nn.Module):
    """
    This is a more standard version of the position embedding, very similar to the one used by the Attention is all
    you need paper, generalized to work on images.

    Args:
        embedding_dim (`int`, defaults to 64):
            Number of sine/cosine channels produced per spatial axis; the output has `2 * embedding_dim` channels.
        temperature (`int`, defaults to 10000):
            Base of the geometric progression of frequencies.
        normalize (`bool`, defaults to `False`):
            Whether to normalize the cumulative coordinates before scaling them by `scale`.
        scale (`float`, *optional*):
            Scale applied to the normalized coordinates. Defaults to `2 * math.pi`. May only be passed together
            with `normalize=True`.

    Raises:
        ValueError: If `scale` is passed while `normalize` is `False`.
    """

    def __init__(self, embedding_dim=64, temperature=10000, normalize=False, scale=None):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.temperature = temperature
        self.normalize = normalize
        # A scale only makes sense for normalized coordinates.
        if scale is not None and normalize is False:
            raise ValueError("normalize should be True if scale is passed")
        if scale is None:
            scale = 2 * math.pi
        self.scale = scale

    def forward(self, pixel_values, pixel_mask):
        """
        Build sine/cosine position embeddings from `pixel_mask`.

        `pixel_values` is only used for its device. The result has the y-axis channels first, followed by the
        x-axis channels, with the channel dimension moved to position 1.

        Raises:
            ValueError: If `pixel_mask` is `None`.
        """
        if pixel_mask is None:
            raise ValueError("No pixel mask provided")
        # Cumulative sums of the mask along dims 1 and 2 give each position its coordinate along that axis.
        y_embed = pixel_mask.cumsum(1, dtype=torch.float32)
        x_embed = pixel_mask.cumsum(2, dtype=torch.float32)
        if self.normalize:
            eps = 1e-6
            # Divide by the last cumulative value along each axis, then scale.
            y_embed = (y_embed - 0.5) / (y_embed[:, -1:, :] + eps) * self.scale
            x_embed = (x_embed - 0.5) / (x_embed[:, :, -1:] + eps) * self.scale

        # Consecutive channel pairs share one frequency: temperature ** (2 * (i // 2) / embedding_dim).
        dim_t = torch.arange(self.embedding_dim, dtype=torch.int64, device=pixel_values.device).float()
        dim_t = self.temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / self.embedding_dim)

        pos_x = x_embed[:, :, :, None] / dim_t
        pos_y = y_embed[:, :, :, None] / dim_t
        # Even channels get the sine, odd channels the cosine; stack + flatten interleaves them again.
        pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3)
        pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3)
        # Concatenate y and x parts along the last dim, then move that dim to position 1.
        pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
        return pos
# Copied from transformers.models.detr.modeling_detr.DetrLearnedPositionEmbedding
class DetaLearnedPositionEmbedding(nn.Module):
    """
    This module learns positional embeddings up to a fixed maximum size.
    """

    def __init__(self, embedding_dim=256):
        super().__init__()
        # One learned table per axis, each holding 50 entries of size `embedding_dim`.
        self.row_embeddings = nn.Embedding(50, embedding_dim)
        self.column_embeddings = nn.Embedding(50, embedding_dim)

    def forward(self, pixel_values, pixel_mask=None):
        """
        Return learned position embeddings matching the batch size and last two dims of `pixel_values`.

        `pixel_mask` is accepted for interface compatibility and is not used.
        """
        batch_size = pixel_values.shape[0]
        height, width = pixel_values.shape[-2:]
        device = pixel_values.device

        # Look up one embedding per column index and per row index.
        column_part = self.column_embeddings(torch.arange(width, device=device))
        row_part = self.row_embeddings(torch.arange(height, device=device))

        # Tile each axis embedding across the other axis so both become (height, width, embedding_dim).
        column_grid = column_part.unsqueeze(0).repeat(height, 1, 1)
        row_grid = row_part.unsqueeze(1).repeat(1, width, 1)

        # Column channels come first, row channels second.
        grid = torch.cat([column_grid, row_grid], dim=-1)
        # (height, width, channels) -> (1, channels, height, width) -> repeated over the batch.
        return grid.permute(2, 0, 1).unsqueeze(0).repeat(batch_size, 1, 1, 1)


# Copied from transformers.models.detr.modeling_detr.build_position_encoding with Detr->Deta
def build_position_encoding(config):
    """
    Create the position embedding module selected by `config.position_embedding_type`.

    Raises:
        ValueError: If the type is neither `"sine"` nor `"learned"`.
    """
    # Each spatial axis gets half of the model dimension.
    half_dim = config.d_model // 2
    embedding_type = config.position_embedding_type

    if embedding_type == "sine":
        # TODO find a better way of exposing other arguments
        return DetaSinePositionEmbedding(half_dim, normalize=True)
    if embedding_type == "learned":
        return DetaLearnedPositionEmbedding(half_dim)

    raise ValueError(f"Not supported {config.position_embedding_type}")


# Copied from transformers.models.deformable_detr.modeling_deformable_detr.multi_scale_deformable_attention
def multi_scale_deformable_attention(
    value: Tensor, value_spatial_shapes: Tensor, sampling_locations: Tensor, attention_weights: Tensor
) -> Tensor:
    """
    Multi-scale deformable attention implemented with `grid_sample`.

    `value` is unpacked as `(batch_size, sequence, num_heads, hidden_dim)` and `sampling_locations` as
    `(batch_size, num_queries, num_heads, num_levels, num_points, 2)`. The sequence axis of `value` is split into
    one chunk of `height * width` entries per row of `value_spatial_shapes`.

    Returns:
        Tensor of shape `(batch_size, num_queries, num_heads * hidden_dim)`.
    """
    batch_size, _, num_heads, hidden_dim = value.shape
    _, num_queries, num_heads, num_levels, num_points, _ = sampling_locations.shape

    # Split the flattened sequence back into one chunk per feature level.
    level_lengths = [height.item() * width.item() for height, width in value_spatial_shapes]
    per_level_values = value.split(level_lengths, dim=1)

    # grid_sample expects coordinates in [-1, 1]; 2 * loc - 1 maps [0, 1] onto that range.
    grids = 2 * sampling_locations - 1

    sampled_per_level = []
    for level_id, (height, width) in enumerate(value_spatial_shapes):
        # (batch, height*width, heads, dim) -> (batch*heads, dim, height, width)
        level_value = per_level_values[level_id].flatten(2).transpose(1, 2)
        level_value = level_value.reshape(batch_size * num_heads, hidden_dim, height, width)

        # (batch, queries, heads, points, 2) -> (batch*heads, queries, points, 2)
        level_grid = grids[:, :, :, level_id].transpose(1, 2).flatten(0, 1)

        # Bilinear sampling -> (batch*heads, dim, queries, points)
        level_samples = nn.functional.grid_sample(
            level_value, level_grid, mode="bilinear", padding_mode="zeros", align_corners=False
        )
        sampled_per_level.append(level_samples)

    # (batch, queries, heads, levels, points) -> (batch*heads, 1, queries, levels*points)
    flat_weights = attention_weights.transpose(1, 2).reshape(
        batch_size * num_heads, 1, num_queries, num_levels * num_points
    )

    # Merge levels and points into one axis, weight the samples, and sum them up.
    stacked_samples = torch.stack(sampled_per_level, dim=-2).flatten(-2)
    weighted_sum = (stacked_samples * flat_weights).sum(-1)
    output = weighted_sum.view(batch_size, num_heads * hidden_dim, num_queries)

    return output.transpose(1, 2).contiguous()
# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrMultiscaleDeformableAttention with DeformableDetr->Deta
class DetaMultiscaleDeformableAttention(nn.Module):
    """
    Multiscale deformable attention as proposed in Deformable DETR.
    """

    # NOTE(review): no `forward` method is defined for this class in this excerpt — confirm against the full file.

    def __init__(self, config: DetaConfig, num_heads: int, n_points: int):
        super().__init__()

        # Try to load the custom CUDA kernel when CUDA and Ninja are available and it is not loaded yet.
        # A failure is logged as a warning and construction continues.
        kernel_missing = MultiScaleDeformableAttention is None
        if is_torch_cuda_available() and is_ninja_available() and kernel_missing:
            try:
                load_cuda_kernels()
            except Exception as e:
                logger.warning(f"Could not load the custom kernel for multi-scale deformable attention: {e}")

        if config.d_model % num_heads != 0:
            raise ValueError(
                f"embed_dim (d_model) must be divisible by num_heads, but got {config.d_model} and {num_heads}"
            )

        # Warn (but do not fail) when the per-head dimension is not a power of two.
        head_dim = config.d_model // num_heads
        is_power_of_two = head_dim != 0 and (head_dim & (head_dim - 1)) == 0
        if not is_power_of_two:
            warnings.warn(
                "You'd better set embed_dim (d_model) in DetaMultiscaleDeformableAttention to make the"
                " dimension of each attention head a power of 2 which is more efficient in the authors' CUDA"
                " implementation."
            )

        self.im2col_step = 64

        self.d_model = config.d_model
        self.n_levels = config.num_feature_levels
        self.n_heads = num_heads
        self.n_points = n_points

        # One (x, y) offset and one attention weight per head, level and sampling point.
        total_points = num_heads * self.n_levels * n_points
        self.sampling_offsets = nn.Linear(config.d_model, total_points * 2)
        self.attention_weights = nn.Linear(config.d_model, total_points)
        self.value_proj = nn.Linear(config.d_model, config.d_model)
        self.output_proj = nn.Linear(config.d_model, config.d_model)

        self.disable_custom_kernels = config.disable_custom_kernels

        self._reset_parameters()

    def _reset_parameters(self):
        """Re-initialize the sampling-offset, attention-weight and projection layers."""
        nn.init.constant_(self.sampling_offsets.weight.data, 0.0)

        # One direction per head, evenly spaced around the circle.
        float_dtype = torch.get_default_dtype()
        angle_step = 2.0 * math.pi / self.n_heads
        thetas = torch.arange(self.n_heads, dtype=torch.int64).to(float_dtype) * angle_step
        directions = torch.stack([thetas.cos(), thetas.sin()], -1)

        # Scale each direction so that its largest absolute component equals 1, then tile over levels and points.
        directions = directions / directions.abs().max(-1, keepdim=True)[0]
        grid_init = directions.view(self.n_heads, 1, 1, 2).repeat(1, self.n_levels, self.n_points, 1)

        # The i-th sampling point starts (i + 1) steps away along its head's direction.
        for point_idx in range(self.n_points):
            grid_init[:, :, point_idx, :] *= point_idx + 1

        # The flattened grid becomes the bias of the offset predictor.
        with torch.no_grad():
            self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1))

        nn.init.constant_(self.attention_weights.weight.data, 0.0)
        nn.init.constant_(self.attention_weights.bias.data, 0.0)

        # Xavier-uniform weights and zero biases for both projections.
        for projection in (self.value_proj, self.output_proj):
            nn.init.xavier_uniform_(projection.weight.data)
            nn.init.constant_(projection.bias.data, 0.0)
# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrMultiheadAttention with DeformableDetr->Deta,Deformable DETR->DETA
class DetaMultiheadAttention(nn.Module):
    """
    Multi-headed attention from 'Attention Is All You Need' paper.

    Here, we add position embeddings to the queries and keys (as explained in the Deformable DETR paper).
    """

    def __init__(
        self,
        embed_dim: int,
        num_heads: int,
        dropout: float = 0.0,
        bias: bool = True,
    ):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads
        if self.head_dim * num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim必须能被num_heads整除 (得到 `embed_dim`: {self.embed_dim} 和 `num_heads`: {num_heads})."
            )
        # Standard 1/sqrt(head_dim) scaling applied to the queries.
        self.scaling = self.head_dim**-0.5

        # Linear projections for keys, values, queries and the attention output.
        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)

    def _shape(self, tensor: torch.Tensor, seq_len: int, batch_size: int):
        # (batch, seq, embed) -> (batch, heads, seq, head_dim), made contiguous so it can be viewed later.
        return tensor.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()

    def with_pos_embed(self, tensor: torch.Tensor, position_embeddings: Optional[Tensor]):
        # Add the position embeddings when they are provided, otherwise return the tensor unchanged.
        return tensor if position_embeddings is None else tensor + position_embeddings

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_embeddings: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ):
        """
        Self-attention over `hidden_states` of shape `(batch_size, sequence_length, embed_dim)`.

        Args:
            hidden_states: Input sequence; queries, keys and values are all projected from it.
            attention_mask: Optional additive mask. A 2-D `(target_len, source_len)` mask is expanded over batch
                and heads; otherwise it must already have shape `(batch_size, 1, target_len, source_len)`.
            position_embeddings: Optional embeddings added to the input of the query and key projections only.
            output_attentions: Whether to also return the attention probabilities.

        Returns:
            `(attn_output, attn_weights)` where `attn_output` has the shape of `hidden_states` and `attn_weights`
            is `(batch_size, num_heads, target_len, source_len)` or `None` when `output_attentions` is false.

        Raises:
            ValueError: If an intermediate tensor or the attention mask has an unexpected shape.
        """
        batch_size, target_len, embed_dim = hidden_states.size()

        # Values are always projected from the un-augmented hidden states; position embeddings only
        # influence queries and keys.
        hidden_states_original = hidden_states
        if position_embeddings is not None:
            hidden_states = self.with_pos_embed(hidden_states, position_embeddings)

        query_states = self.q_proj(hidden_states) * self.scaling
        key_states = self._shape(self.k_proj(hidden_states), -1, batch_size)
        value_states = self._shape(self.v_proj(hidden_states_original), -1, batch_size)

        # Fold the head axis into the batch axis so a single bmm handles all heads.
        proj_shape = (batch_size * self.num_heads, -1, self.head_dim)
        query_states = self._shape(query_states, target_len, batch_size).view(*proj_shape)
        key_states = key_states.view(*proj_shape)
        value_states = value_states.view(*proj_shape)

        source_len = key_states.size(1)
        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))

        if attn_weights.size() != (batch_size * self.num_heads, target_len, source_len):
            raise ValueError(
                f"Attention weights should be of size {(batch_size * self.num_heads, target_len, source_len)}, but is"
                f" {attn_weights.size()}"
            )

        if attention_mask is not None:
            if attention_mask.dim() == 2:
                # [target_len, source_len] -> [batch_size, 1, target_len, source_len]
                attention_mask = attention_mask.expand(batch_size, 1, *attention_mask.size())
            if attention_mask.size() != (batch_size, 1, target_len, source_len):
                raise ValueError(
                    f"Attention mask should be of size {(batch_size, 1, target_len, source_len)}, but is"
                    f" {attention_mask.size()}"
                )
            attn_weights = attn_weights.view(batch_size, self.num_heads, target_len, source_len) + attention_mask
            attn_weights = attn_weights.view(batch_size * self.num_heads, target_len, source_len)

        attn_weights = nn.functional.softmax(attn_weights, dim=-1)

        # Expose the probabilities per head only when requested.
        attn_weights_reshaped = None
        if output_attentions:
            attn_weights_reshaped = attn_weights.view(batch_size, self.num_heads, target_len, source_len)

        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
        attn_output = torch.bmm(attn_probs, value_states)

        if attn_output.size() != (batch_size * self.num_heads, target_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(batch_size, self.num_heads, target_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        # (batch*heads, target, head_dim) -> (batch, target, embed_dim)
        attn_output = attn_output.view(batch_size, self.num_heads, target_len, self.head_dim)
        attn_output = attn_output.transpose(1, 2).reshape(batch_size, target_len, embed_dim)
        attn_output = self.out_proj(attn_output)

        return attn_output, attn_weights_reshaped


class DetaEncoderLayer(nn.Module):
    """
    Encoder layer: multi-scale deformable self-attention followed by a two-layer feed-forward network, each wrapped
    in dropout, a residual connection and a layer norm applied after the residual sum.
    """

    def __init__(self, config: DetaConfig):
        super().__init__()
        self.embed_dim = config.d_model
        self.self_attn = DetaMultiscaleDeformableAttention(
            config,
            num_heads=config.encoder_attention_heads,
            n_points=config.encoder_n_points,
        )
        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
        # Dropout probabilities are stored as floats and applied with `nn.functional.dropout` in `forward`.
        self.dropout = config.dropout
        self.activation_fn = ACT2FN[config.activation_function]
        self.activation_dropout = config.activation_dropout
        self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
        self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
        self.final_layer_norm = nn.LayerNorm(self.embed_dim)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        position_embeddings: torch.Tensor = None,
        reference_points=None,
        spatial_shapes=None,
        level_start_index=None,
        output_attentions: bool = False,
    ):
        """
        Args:
            hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Input to the layer.
            attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
                Attention mask.
            position_embeddings (`torch.FloatTensor`, *optional*):
                Position embeddings, to be added to `hidden_states`.
            reference_points (`torch.FloatTensor`, *optional*):
                Reference points.
            spatial_shapes (`torch.LongTensor`, *optional*):
                Spatial shapes of the backbone feature maps.
            level_start_index (`torch.LongTensor`, *optional*):
                Level start index.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.

        Returns:
            A tuple `(hidden_states,)`, extended with the attention weights when `output_attentions` is true.
        """
        residual = hidden_states

        # Apply Multi-scale Deformable Attention Module on the multi-scale feature maps.
        hidden_states, attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            encoder_hidden_states=hidden_states,
            encoder_attention_mask=attention_mask,
            position_embeddings=position_embeddings,
            reference_points=reference_points,
            spatial_shapes=spatial_shapes,
            level_start_index=level_start_index,
            output_attentions=output_attentions,
        )

        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states
        hidden_states = self.self_attn_layer_norm(hidden_states)

        # Feed-forward sub-block with its own residual connection.
        residual = hidden_states
        hidden_states = self.activation_fn(self.fc1(hidden_states))
        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)

        hidden_states = self.fc2(hidden_states)
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)

        hidden_states = residual + hidden_states
        hidden_states = self.final_layer_norm(hidden_states)

        if self.training:
            # During training, if any inf/NaN shows up, clamp to just inside the dtype's finite range.
            if torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any():
                clamp_value = torch.finfo(hidden_states.dtype).max - 1000
                hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (attn_weights,)

        return outputs
class DetaDecoderLayer(nn.Module):
    """
    Decoder layer: multi-head self-attention over the queries, multi-scale deformable cross-attention over the
    encoder output, and a two-layer feed-forward network. Each sub-block is followed by dropout, a residual
    connection and a layer norm.
    """

    def __init__(self, config: DetaConfig):
        super().__init__()
        self.embed_dim = config.d_model

        # self-attention
        self.self_attn = DetaMultiheadAttention(
            embed_dim=self.embed_dim,
            num_heads=config.decoder_attention_heads,
            dropout=config.attention_dropout,
        )
        # Dropout probabilities are plain floats; they are applied with `nn.functional.dropout` in `forward`.
        self.dropout = config.dropout
        self.activation_fn = ACT2FN[config.activation_function]
        self.activation_dropout = config.activation_dropout

        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)

        # cross-attention
        self.encoder_attn = DetaMultiscaleDeformableAttention(
            config,
            num_heads=config.decoder_attention_heads,
            n_points=config.decoder_n_points,
        )
        self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)

        # feedforward neural networks
        self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
        self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)
        self.final_layer_norm = nn.LayerNorm(self.embed_dim)

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: Optional[torch.Tensor] = None,
        reference_points=None,
        spatial_shapes=None,
        level_start_index=None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
    ):
        """
        Args:
            hidden_states (`torch.Tensor`):
                Input to the layer.
            position_embeddings (`torch.Tensor`, *optional*):
                Position embeddings, forwarded to both attention modules.
            reference_points (*optional*):
                Reference points, forwarded to the cross-attention module.
            spatial_shapes (*optional*):
                Spatial shapes of the feature maps, forwarded to the cross-attention module.
            level_start_index (*optional*):
                Level start index, forwarded to the cross-attention module.
            encoder_hidden_states (`torch.Tensor`, *optional*):
                Encoder output attended to by the cross-attention module.
            encoder_attention_mask (`torch.Tensor`, *optional*):
                Mask passed to the cross-attention module as `attention_mask`.
            output_attentions (`bool`, *optional*):
                Whether to return the self- and cross-attention weights.

        Returns:
            A tuple `(hidden_states,)`, extended with `(self_attn_weights, cross_attn_weights)` when
            `output_attentions` is true. The tuple form matches `DetaEncoderLayer.forward`.
        """
        residual = hidden_states

        # Self-attention. Keyword names follow `DetaMultiheadAttention.forward`.
        self_attn_outputs = self.self_attn(
            hidden_states=hidden_states,
            position_embeddings=position_embeddings,
            output_attentions=output_attentions,
        )
        hidden_states = nn.functional.dropout(self_attn_outputs[0], p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states
        hidden_states = self.self_attn_layer_norm(hidden_states)

        # Cross-attention. Keyword names mirror the deformable-attention call in `DetaEncoderLayer.forward`.
        residual = hidden_states
        cross_attn_outputs = self.encoder_attn(
            hidden_states=hidden_states,
            attention_mask=encoder_attention_mask,
            encoder_hidden_states=encoder_hidden_states,
            position_embeddings=position_embeddings,
            reference_points=reference_points,
            spatial_shapes=spatial_shapes,
            level_start_index=level_start_index,
            output_attentions=output_attentions,
        )
        hidden_states = nn.functional.dropout(cross_attn_outputs[0], p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states
        hidden_states = self.encoder_attn_layer_norm(hidden_states)

        # Feed-forward network; the inner activation uses `activation_dropout`.
        residual = hidden_states
        hidden_states = self.activation_fn(self.fc1(hidden_states))
        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
        hidden_states = self.fc2(hidden_states)
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states
        hidden_states = self.final_layer_norm(hidden_states)

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (self_attn_outputs[1], cross_attn_outputs[1])

        return outputs



# Copied from transformers.models.detr.modeling_detr.DetrClassificationHead
class DetaClassificationHead(nn.Module):
    """Head for sentence-level classification tasks."""

    def __init__(self, input_dim: int, inner_dim: int, num_classes: int, pooler_dropout: float):
        super().__init__()
        # input_dim -> inner_dim projection, a shared dropout module, and the inner_dim -> num_classes projection.
        self.dense = nn.Linear(input_dim, inner_dim)
        self.dropout = nn.Dropout(p=pooler_dropout)
        self.out_proj = nn.Linear(inner_dim, num_classes)

    def forward(self, hidden_states: torch.Tensor):
        """Apply dropout -> dense -> tanh -> dropout -> output projection and return the result."""
        projected = self.dense(self.dropout(hidden_states))
        activated = torch.tanh(projected)
        return self.out_proj(self.dropout(activated))


class DetaPreTrainedModel(PreTrainedModel):
    config_class = DetaConfig
    base_model_prefix = "model"
    main_input_name = "pixel_values"
    _no_split_modules = [r"DetaBackboneWithPositionalEncodings", r"DetaEncoderLayer", r"DetaDecoderLayer"]
    supports_gradient_checkpointing = True

    def _init_weights(self, module):
        """Initialize the weights of `module` according to its type."""
        std = self.config.init_std
        normal_init_types = (nn.Linear, nn.Conv2d, nn.BatchNorm2d)

        if isinstance(module, DetaLearnedPositionEmbedding):
            # Row table first, then column table, both drawn from a uniform distribution.
            for table in (module.row_embeddings, module.column_embeddings):
                nn.init.uniform_(table.weight)
        elif isinstance(module, DetaMultiscaleDeformableAttention):
            module._reset_parameters()
        elif isinstance(module, normal_init_types + (nn.Embedding,)):
            # Slightly different from the TF version which uses truncated_normal for initialization
            # cf https://github.com/pytorch/pytorch/pull/5617
            module.weight.data.normal_(mean=0.0, std=std)
            if isinstance(module, normal_init_types):
                # Linear / conv / batch-norm: zero the bias when there is one.
                if module.bias is not None:
                    module.bias.data.zero_()
            else:
                # Embedding: zero the row of the padding index when one is defined.
                padding_idx = module.padding_idx
                if padding_idx is not None:
                    module.weight.data[padding_idx].zero_()

        # These attribute-based rules apply on top of the type-based ones.
        if hasattr(module, "reference_points") and not self.config.two_stage:
            reference_points = module.reference_points
            nn.init.xavier_uniform_(reference_points.weight.data, gain=1.0)
            nn.init.constant_(reference_points.bias.data, 0.0)
        if hasattr(module, "level_embed"):
            nn.init.normal_(module.level_embed)
# DETA_START_DOCSTRING is a raw docstring describing this model's inheritance and general behaviour.
# It states that the model inherits from PreTrainedModel and points to that superclass's documentation for the
# generic methods (downloading or saving, resizing the input embeddings, pruning heads, etc.).

DETA_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`DetaConfig`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

# DETA_INPUTS_DOCSTRING is a raw docstring describing the model's input arguments.
DETA_INPUTS_DOCSTRING = r"""
    # 定义函数参数，接受一个四维的浮点型张量作为输入，表示像素值，维度为(batch_size, num_channels, height, width)
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Padding will be ignored by default should you provide it.
            
            像素数值。默认情况下将忽略填充部分的像素值。

            Pixel values can be obtained using [`AutoImageProcessor`]. See [`AutoImageProcessor.__call__`] for details.
            
            可以使用 [`AutoImageProcessor`] 获得像素值。详细信息请参见 [`AutoImageProcessor.__call__`]。

        pixel_mask (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
            Mask to avoid performing attention on padding pixel values. Mask values selected in `[0, 1]`:
            
            遮罩，用于避免在填充像素值上执行注意力操作。遮罩的取值范围为 `[0, 1]`：

            - 1 for pixels that are real (i.e. **not masked**),
            - 0 for pixels that are padding (i.e. **masked**).
            
            - 值为 1 表示真实像素（即**未遮罩**），
            - 值为 0 表示填充像素（即**已遮罩**）。

            [What are attention masks?](../glossary#attention-mask)
            
            [注意力遮罩是什么？](../glossary#attention-mask)

        decoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*):
            Not used by default. Can be used to mask object queries.
            
            默认情况下不使用。可以用于屏蔽对象查询。

        encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*):
            Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`)
            
            元组包含 (`last_hidden_state`, *可选*: `hidden_states`, *可选*: `attentions`)

            `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of
            hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
            
            `last_hidden_state` 的形状为 `(batch_size, sequence_length, hidden_size)`，*可选*，是编码器最后一层的输出隐藏状态序列。用于解码器的交叉注意力。

        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you
            can choose to directly pass a flattened representation of an image.
            
            可选项，可以直接传递一个展平的图像表示，而不是传递扁平化的特征图（骨干网络输出 + 投影层的输出）。

        decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
            Optionally, instead of initializing the queries with a tensor of zeros, you can choose to directly pass an
            embedded representation.
            
            可选项，可以直接传递嵌入表示，而不是用零张量初始化查询。

        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
            
            是否返回所有注意力层的注意力张量。有关更多详细信息，请参阅返回张量中的 `attentions`。

        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
            
            是否返回所有层的隐藏状态。有关更多详细信息，请参阅返回张量中的 `hidden_states`。

        return_dict (`bool`, *optional*):
            Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
            
            是否返回 [`~file_utils.ModelOutput`] 而不是普通元组。
"""


class DetaEncoder(DetaPreTrainedModel):
    """
    Transformer encoder consisting of *config.encoder_layers* deformable attention layers. Each layer is a
    [`DetaEncoderLayer`].

    The encoder updates the flattened multi-scale feature maps through multiple deformable attention layers.

    Args:
        config: DetaConfig
    """

    def __init__(self, config: DetaConfig):
        """Build `config.encoder_layers` encoder layers and run the post-init hook."""
        super().__init__(config)

        self.dropout = config.dropout
        # One DetaEncoderLayer per configured encoder layer.
        encoder_layers = [DetaEncoderLayer(config) for _ in range(config.encoder_layers)]
        self.layers = nn.ModuleList(encoder_layers)
        self.gradient_checkpointing = False

        # Initialize weights and apply final processing
        self.post_init()

    @staticmethod
    def get_reference_points(spatial_shapes, valid_ratios, device):
        """
        Get reference points for each feature map. Used in decoder.

        Args:
            spatial_shapes (`torch.LongTensor` of shape `(num_feature_levels, 2)`):
                Spatial shapes of each feature map.
            valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`):
                Valid ratios of each feature map.
            device (`torch.device`):
                Device on which to create the tensors.
        Returns:
            `torch.FloatTensor` of shape `(batch_size, num_queries, num_feature_levels, 2)`
        """
        per_level_points = []
        for level, (height, width) in enumerate(spatial_shapes):
            # Coordinates 0.5, 1.5, ... along each axis of this level.
            row_centres = torch.linspace(0.5, height - 0.5, height, dtype=torch.float32, device=device)
            column_centres = torch.linspace(0.5, width - 0.5, width, dtype=torch.float32, device=device)
            ref_y, ref_x = meshgrid(row_centres, column_centres, indexing="ij")

            # Flatten the grid and divide by the valid extent of this level (index 1 -> y, index 0 -> x).
            valid_height = valid_ratios[:, None, level, 1] * height
            valid_width = valid_ratios[:, None, level, 0] * width
            ref_y = ref_y.reshape(-1)[None] / valid_height
            ref_x = ref_x.reshape(-1)[None] / valid_width

            per_level_points.append(torch.stack((ref_x, ref_y), -1))

        # Concatenate all levels along the point axis, then scale by the valid ratios of every level.
        reference_points = torch.cat(per_level_points, 1)
        return reference_points[:, :, None] * valid_ratios[:, None]

    def forward(
        self,
        inputs_embeds=None,
        attention_mask=None,
        position_embeddings=None,
        spatial_shapes=None,
        level_start_index=None,
        valid_ratios=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
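    # NOTE(review): in this excerpt the `forward` signature above is cut off, and the `class DetaDecoder` header and
    # the opening of its docstring appear to be missing — restore them from the upstream file.
    # `position_embeddings`, `reference_points`, `spatial_shapes` and `valid_ratios` are added to the forward pass.
    # Also returns a stack of intermediate outputs and reference points from all decoding layers.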

    Args:
        config: DetaConfig
    """

    def __init__(self, config: DetaConfig):
        """Build `config.decoder_layers` decoder layers and run the post-init hook."""
        super().__init__(config)

        self.dropout = config.dropout
        # One DetaDecoderLayer per configured decoder layer.
        decoder_layers = [DetaDecoderLayer(config) for _ in range(config.decoder_layers)]
        self.layers = nn.ModuleList(decoder_layers)
        self.gradient_checkpointing = False

        # hack implementation for iterative bounding box refinement and two-stage Deformable DETR
        # Both heads start out unset.
        self.bbox_embed = None
        self.class_embed = None

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        inputs_embeds=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        position_embeddings=None,
        reference_points=None,
        spatial_shapes=None,
        level_start_index=None,
        valid_ratios=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
"""
The bare DETA Model (consisting of a backbone and encoder-decoder Transformer) outputting raw hidden-states without
any specific head on top.
"""
# 继承自预训练模型基类 DetaPreTrainedModel 的 DetaModel 类
@add_start_docstrings(
    """
    The bare DETA Model (consisting of a backbone and encoder-decoder Transformer) outputting raw hidden-states without
    any specific head on top.
    """,
    DETA_START_DOCSTRING,
)
class DetaModel(DetaPreTrainedModel):
    # 初始化函数，接收一个 DetaConfig 对象作为配置参数
    def __init__(self, config: DetaConfig):
        """Assemble the backbone, input projections, encoder, decoder and stage-specific layers.

        Args:
            config: Model configuration. This constructor reads `two_stage`, `num_feature_levels`, `d_model`,
                `num_queries`, `assign_first_stage` and `two_stage_num_proposals` from it.
        """
        super().__init__(config)

        # The two-stage variant depends on the torchvision backend.
        if config.two_stage:
            requires_backends(self, ["torchvision"])

        # Backbone with positional encodings, and the channel sizes of the feature maps it exposes.
        self.backbone = DetaBackboneWithPositionalEncodings(config)
        intermediate_channel_sizes = self.backbone.intermediate_channel_sizes

        hidden_dim = config.d_model

        # Input projections: every feature level is mapped to `d_model` channels.
        if config.num_feature_levels > 1:
            projections = []
            # One 1x1 convolution + GroupNorm per backbone feature map.
            for in_channels in intermediate_channel_sizes:
                projections.append(
                    nn.Sequential(
                        nn.Conv2d(in_channels, hidden_dim, kernel_size=1),
                        nn.GroupNorm(32, hidden_dim),
                    )
                )
            # Levels beyond what the backbone provides use a 3x3, stride-2, padding-1 convolution. The first
            # extra level takes the channel count of the last backbone map (the value `in_channels` still
            # holds after the loop above); each following one takes `d_model` channels.
            extra_levels = config.num_feature_levels - len(intermediate_channel_sizes)
            for _ in range(extra_levels):
                projections.append(
                    nn.Sequential(
                        nn.Conv2d(in_channels, hidden_dim, kernel_size=3, stride=2, padding=1),
                        nn.GroupNorm(32, hidden_dim),
                    )
                )
                in_channels = hidden_dim
            self.input_proj = nn.ModuleList(projections)
        else:
            # Single feature level: project only the last backbone feature map.
            single_projection = nn.Sequential(
                nn.Conv2d(intermediate_channel_sizes[-1], hidden_dim, kernel_size=1),
                nn.GroupNorm(32, hidden_dim),
            )
            self.input_proj = nn.ModuleList([single_projection])

        # Query position embeddings exist only in the non-two-stage setup.
        if not config.two_stage:
            self.query_position_embeddings = nn.Embedding(config.num_queries, hidden_dim * 2)

        self.encoder = DetaEncoder(config)
        self.decoder = DetaDecoder(config)

        # One embedding vector per feature level. torch.Tensor(...) allocates without initializing.
        self.level_embed = nn.Parameter(torch.Tensor(config.num_feature_levels, config.d_model))

        if config.two_stage:
            # Extra projection/normalization layers used only by the two-stage variant.
            self.enc_output = nn.Linear(hidden_dim, hidden_dim)
            self.enc_output_norm = nn.LayerNorm(hidden_dim)
            self.pos_trans = nn.Linear(hidden_dim * 2, hidden_dim * 2)
            self.pos_trans_norm = nn.LayerNorm(hidden_dim * 2)
            self.pix_trans = nn.Linear(hidden_dim, hidden_dim)
            self.pix_trans_norm = nn.LayerNorm(hidden_dim)
        else:
            # Linear layer mapping `d_model` features to 2 values.
            self.reference_points = nn.Linear(hidden_dim, 2)

        self.assign_first_stage = config.assign_first_stage
        self.two_stage_num_proposals = config.two_stage_num_proposals

        # Initialize weights and apply final processing.
        self.post_init()
    # 从DeformableDetrModel类中复制的方法，返回编码器(encoder)模型
    def get_encoder(self):
        """Return the encoder module stored in `self.encoder`."""
        return self.encoder

    # 从DeformableDetrModel类中复制的方法，返回解码器(decoder)模型
    def get_decoder(self):
        """Return the decoder module stored in `self.decoder`."""
        return self.decoder

    # 冻结骨干(backbone)模型的参数，使其不需要梯度更新
    def freeze_backbone(self):
        """Turn off gradient tracking for every parameter of `self.backbone.model`."""
        for _, parameter in self.backbone.model.named_parameters():
            parameter.requires_grad_(False)

    # 解冻骨干(backbone)模型的参数，使其需要梯度更新
    def unfreeze_backbone(self):
        """Turn gradient tracking back on for every parameter of `self.backbone.model`."""
        for _, parameter in self.backbone.model.named_parameters():
            parameter.requires_grad_(True)

    # 从DeformableDetrModel类中复制的方法，计算特征图的有效比例
    def get_valid_ratio(self, mask, dtype=torch.float32):
        """Get the valid ratio of all feature maps.

        Args:
            mask: 3-dimensional tensor; its last two dimensions are treated as (height, width).
            dtype: dtype the summed counts are cast to before dividing.

        Returns:
            Tensor whose last dimension holds `(width_ratio, height_ratio)` for each leading entry of `mask`.
        """
        _, num_rows, num_cols = mask.shape

        # Sum the first column over rows, and the first row over columns.
        rows_valid = mask[:, :, 0].sum(1)
        cols_valid = mask[:, 0, :].sum(1)

        height_ratio = rows_valid.to(dtype) / num_rows
        width_ratio = cols_valid.to(dtype) / num_cols
        return torch.stack((width_ratio, height_ratio), -1)

    # 从DeformableDetrModel类中复制的方法，获取提议(proposals)的位置嵌入
    def get_proposal_pos_embed(self, proposals):
        """Get the position embedding of the proposals.

        Args:
            proposals: 3-dimensional tensor of proposal logits (a sigmoid is applied here).
                NOTE(review): presumably (batch_size, num_queries, 4) - confirm against the caller.

        Returns:
            Tensor with the last two dimensions of the sin/cos encoding flattened into one, i.e.
            `proposals.shape[2] * (d_model // 2)` features per proposal when `d_model // 2` is even.
        """
        half_dim = self.config.d_model // 2
        temperature = 10000
        scale = 2 * math.pi

        # Consecutive index pairs (0, 1), (2, 3), ... share the same exponent.
        freq_index = torch.arange(half_dim, dtype=torch.int64, device=proposals.device).float()
        freq_exponent = 2 * torch.div(freq_index, 2, rounding_mode="floor") / half_dim
        dim_t = temperature**freq_exponent

        # Squash the proposals to (0, 1) and scale them to (0, 2*pi).
        scaled = proposals.sigmoid() * scale
        pos = scaled[:, :, :, None] / dim_t

        # Even slots get sin, odd slots get cos; interleave them and flatten everything after dim 1.
        sin_part = pos[:, :, :, 0::2].sin()
        cos_part = pos[:, :, :, 1::2].cos()
        return torch.stack((sin_part, cos_part), dim=4).flatten(2)

    # 将模型的输入和输出文档字符串添加到forward方法
    @add_start_docstrings_to_model_forward(DETA_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=DetaModelOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        pixel_values: torch.FloatTensor,
        pixel_mask: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.FloatTensor] = None,
        encoder_outputs: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
@add_start_docstrings(
    """
    DETA Model (consisting of a backbone and encoder-decoder Transformer) with object detection heads on top, for tasks
    such as COCO detection.
    """,
    DETA_START_DOCSTRING,
)
class DetaForObjectDetection(DetaPreTrainedModel):
    # When using clones, all layers > 0 will be clones, but layer 0 *is* required.
    # The pattern matches keys of the form `bbox_embed.<digits>`.
    _tied_weights_keys = [r"bbox_embed\.\d+"]
    # We can't initialize the model on meta device as some weights are modified during the initialization
    _no_split_modules = None

    # Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrForObjectDetection.__init__ with DeformableDetr->Deta
    def __init__(self, config: DetaConfig):
        """Build the base `DetaModel` and the classification / box-regression heads.

        Args:
            config: Model configuration. This constructor reads `d_model`, `num_labels`, `decoder_layers`,
                `two_stage` and `with_box_refine` from it.
        """
        super().__init__(config)

        # Deformable DETR encoder-decoder model
        self.model = DetaModel(config)

        # Detection heads on top
        self.class_embed = nn.Linear(config.d_model, config.num_labels)
        self.bbox_embed = DetaMLPPredictionHead(
            input_dim=config.d_model, hidden_dim=config.d_model, output_dim=4, num_layers=3
        )

        # Set every classification bias to -log((1 - p) / p) with p = 0.01
        prior_prob = 0.01
        bias_value = -math.log((1 - prior_prob) / prior_prob)
        self.class_embed.bias.data = torch.ones(config.num_labels) * bias_value

        # Zero the weight and bias of the last layer of the box head
        nn.init.constant_(self.bbox_embed.layers[-1].weight.data, 0)
        nn.init.constant_(self.bbox_embed.layers[-1].bias.data, 0)

        # Then set the bias entries from index 2 onward to -2.0
        # (the same assignment is repeated in both branches below)
        nn.init.constant_(self.bbox_embed.layers[-1].bias.data[2:], -2.0)

        # The two-stage variant uses one more prediction head than there are decoder layers
        num_pred = (config.decoder_layers + 1) if config.two_stage else config.decoder_layers
        if config.with_box_refine:
            # With box refinement: build `num_pred` heads through `_get_clones`
            # (presumably independent copies - confirm against `_get_clones`)
            self.class_embed = _get_clones(self.class_embed, num_pred)
            self.bbox_embed = _get_clones(self.bbox_embed, num_pred)
            nn.init.constant_(self.bbox_embed[0].layers[-1].bias.data[2:], -2.0)
            # Hand the box heads to the decoder for iterative refinement
            self.model.decoder.bbox_embed = self.bbox_embed
        else:
            # Without box refinement: the ModuleLists repeat the very same head instance `num_pred` times,
            # and the decoder gets no box head
            nn.init.constant_(self.bbox_embed.layers[-1].bias.data[2:], -2.0)
            self.class_embed = nn.ModuleList([self.class_embed for _ in range(num_pred)])
            self.bbox_embed = nn.ModuleList([self.bbox_embed for _ in range(num_pred)])
            self.model.decoder.bbox_embed = None

        if config.two_stage:
            # Two-stage: the decoder also receives the class heads, and the bias entries from index 2
            # onward of every box head are reset to 0.0
            self.model.decoder.class_embed = self.class_embed
            for box_embed in self.bbox_embed:
                nn.init.constant_(box_embed.layers[-1].bias.data[2:], 0.0)

        # Initialize weights and perform final processing
        self.post_init()

    @torch.jit.unused
    def _set_aux_loss(self, outputs_class, outputs_coord):
        # 为了使 torchscript 正常运行，因为 torchscript 不支持值非同质的字典，
        # 比如一个字典包含张量和列表。
        # 创建辅助损失列表，每个元素是一个字典包含 "logits" 和 "pred_boxes"
        aux_loss = [
            {"logits": logits, "pred_boxes": pred_boxes}
            for logits, pred_boxes in zip(outputs_class.transpose(0, 1)[:-1], outputs_coord.transpose(0, 1)[:-1])
        ]
        return aux_loss

    @add_start_docstrings_to_model_forward(DETA_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=DetaObjectDetectionOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        pixel_values: torch.FloatTensor,
        pixel_mask: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.FloatTensor] = None,
        encoder_outputs: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[List[dict]] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
# Copied from transformers.models.detr.modeling_detr.dice_loss
def dice_loss(inputs, targets, num_boxes):
    """
    Compute the DICE loss, similar to generalized IOU for masks

    Args:
        inputs: A float tensor of arbitrary shape.
                The predictions for each example.
        targets: A float tensor with the same shape as inputs. Stores the binary
                 classification label for each element in inputs (0 for the negative class and 1 for the positive
                 class).
        num_boxes: Number of boxes (or examples) to compute the average loss over.
    """
    # Logits -> probabilities, then collapse everything after the first dimension.
    probabilities = inputs.sigmoid().flatten(1)

    overlap = (probabilities * targets).sum(1)
    numerator = 2 * overlap
    denominator = probabilities.sum(-1) + targets.sum(-1)

    # The +1 in numerator and denominator keeps the ratio defined when both sums are zero.
    per_example_loss = 1 - (numerator + 1) / (denominator + 1)
    return per_example_loss.sum() / num_boxes


# Copied from transformers.models.detr.modeling_detr.sigmoid_focal_loss
def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2):
    """
    Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002.

    Args:
        inputs (`torch.FloatTensor` of arbitrary shape):
            The predictions for each example.
        targets (`torch.FloatTensor` with the same shape as `inputs`)
            A tensor storing the binary classification label for each element in the `inputs` (0 for the negative class
            and 1 for the positive class).
        num_boxes: Number of boxes (or examples) to compute the average loss over.
        alpha (`float`, *optional*, defaults to `0.25`):
            Optional weighting factor in the range (0,1) to balance positive vs. negative examples.
        gamma (`int`, *optional*, defaults to `2`):
            Exponent of the modulating factor (1 - p_t) to balance easy vs hard examples.

    Returns:
        Loss tensor
    """
    probabilities = inputs.sigmoid()
    # Element-wise binary cross entropy, computed from the raw logits.
    bce = nn.functional.binary_cross_entropy_with_logits(inputs, targets, reduction="none")

    # p_t is the probability assigned to the labelled class of each element.
    p_t = probabilities * targets + (1 - probabilities) * (1 - targets)
    modulating_factor = (1 - p_t) ** gamma
    focal = bce * modulating_factor

    # A negative alpha switches the class weighting off.
    if alpha >= 0:
        class_weight = alpha * targets + (1 - alpha) * (1 - targets)
        focal = class_weight * focal

    # Mean over dimension 1, summed over what remains, divided by `num_boxes`.
    return focal.mean(1).sum() / num_boxes


class DetaLoss(nn.Module):
    """
    This class computes the losses for `DetaForObjectDetection`. The process happens in two steps: 1) we compute
    hungarian assignment between ground truth boxes and the outputs of the model 2) we supervise each pair of matched
    ground-truth / prediction (supervised class and box).

    Args:
        matcher (`DetaHungarianMatcher`):
            Module able to compute a matching between targets and proposals.
        num_classes (`int`):
            Number of object categories, omitting the special no-object category.
        focal_alpha (`float`):
            Alpha parameter in focal loss.
        losses (`List[str]`):
            List of all the losses to be applied. See `get_loss` for a list of all available losses.
        num_queries (`int`):
            Passed to `DetaStage2Assigner` when `assign_second_stage` is set; not used otherwise.
        assign_first_stage (`bool`, *optional*, defaults to `False`):
            If set, `enc_outputs` are matched with a `DetaStage1Assigner` instead of `matcher`.
        assign_second_stage (`bool`, *optional*, defaults to `False`):
            If set, the main outputs are matched with a `DetaStage2Assigner` instead of `matcher`, and that
            matching is reused for every auxiliary output.
    """

    def __init__(
        self,
        matcher,
        num_classes,
        focal_alpha,
        losses,
        num_queries,
        assign_first_stage=False,
        assign_second_stage=False,
    ):
        super().__init__()
        self.matcher = matcher
        self.num_classes = num_classes
        self.focal_alpha = focal_alpha
        self.losses = losses  # names of the losses to apply, see `get_loss`
        self.assign_first_stage = assign_first_stage
        self.assign_second_stage = assign_second_stage

        # The stage assigners are only created when the matching flag is set.
        if self.assign_first_stage:
            self.stg1_assigner = DetaStage1Assigner()
        if self.assign_second_stage:
            self.stg2_assigner = DetaStage2Assigner(num_queries)

    # Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrLoss.loss_labels
    def loss_labels(self, outputs, targets, indices, num_boxes):
        """
        Classification loss (Binary focal loss) targets dicts must contain the key "class_labels" containing a tensor
        of dim [nb_target_boxes]
        """
        if "logits" not in outputs:
            raise KeyError("No logits were found in the outputs")
        source_logits = outputs["logits"]

        idx = self._get_source_permutation_idx(indices)
        # Labels of the matched targets, in matching order.
        target_classes_o = torch.cat([t["class_labels"][J] for t, (_, J) in zip(targets, indices)])
        # Every prediction starts out labelled `num_classes`; matched ones are then overwritten.
        target_classes = torch.full(
            source_logits.shape[:2], self.num_classes, dtype=torch.int64, device=source_logits.device
        )
        target_classes[idx] = target_classes_o

        # One-hot encode with one extra trailing slot, which is dropped again right after the scatter.
        target_classes_onehot = torch.zeros(
            [source_logits.shape[0], source_logits.shape[1], source_logits.shape[2] + 1],
            dtype=source_logits.dtype,
            layout=source_logits.layout,
            device=source_logits.device,
        )
        target_classes_onehot.scatter_(2, target_classes.unsqueeze(-1), 1)

        target_classes_onehot = target_classes_onehot[:, :, :-1]
        # The focal loss averages over dimension 1; multiplying by its size undoes that averaging.
        loss_ce = (
            sigmoid_focal_loss(source_logits, target_classes_onehot, num_boxes, alpha=self.focal_alpha, gamma=2)
            * source_logits.shape[1]
        )
        losses = {"loss_ce": loss_ce}

        return losses

    @torch.no_grad()
    # Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrLoss.loss_cardinality
    def loss_cardinality(self, outputs, targets, indices, num_boxes):
        """
        Compute the cardinality error, i.e. the absolute error in the number of predicted non-empty boxes.

        Runs under `torch.no_grad()`, so it does not propagate gradients. `indices` and `num_boxes` are accepted
        for a uniform loss signature but are not used.
        """
        logits = outputs["logits"]
        device = logits.device
        target_lengths = torch.as_tensor([len(v["class_labels"]) for v in targets], device=device)
        # Count the number of predictions that are NOT "no-object" (which is the last class)
        card_pred = (logits.argmax(-1) != logits.shape[-1] - 1).sum(1)
        card_err = nn.functional.l1_loss(card_pred.float(), target_lengths.float())
        losses = {"cardinality_error": card_err}
        return losses

    # Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrLoss.loss_boxes
    def loss_boxes(self, outputs, targets, indices, num_boxes):
        """
        Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss.

        Targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4]. The target boxes
        are expected in format (center_x, center_y, w, h), normalized by the image size.
        """
        if "pred_boxes" not in outputs:
            raise KeyError("No predicted boxes found in outputs")
        idx = self._get_source_permutation_idx(indices)
        source_boxes = outputs["pred_boxes"][idx]
        target_boxes = torch.cat([t["boxes"][i] for t, (_, i) in zip(targets, indices)], dim=0)

        loss_bbox = nn.functional.l1_loss(source_boxes, target_boxes, reduction="none")

        losses = {}
        losses["loss_bbox"] = loss_bbox.sum() / num_boxes

        # Only the diagonal of the pairwise GIoU matrix (matched source/target pairs) is used.
        loss_giou = 1 - torch.diag(
            generalized_box_iou(center_to_corners_format(source_boxes), center_to_corners_format(target_boxes))
        )
        losses["loss_giou"] = loss_giou.sum() / num_boxes
        return losses

    # Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrLoss._get_source_permutation_idx
    def _get_source_permutation_idx(self, indices):
        # permute predictions following indices
        batch_idx = torch.cat([torch.full_like(source, i) for i, (source, _) in enumerate(indices)])
        source_idx = torch.cat([source for (source, _) in indices])
        return batch_idx, source_idx

    # Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrLoss._get_target_permutation_idx
    def _get_target_permutation_idx(self, indices):
        # permute targets following indices
        batch_idx = torch.cat([torch.full_like(target, i) for i, (_, target) in enumerate(indices)])
        target_idx = torch.cat([target for (_, target) in indices])
        return batch_idx, target_idx

    # Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrLoss.get_loss
    def get_loss(self, loss, outputs, targets, indices, num_boxes):
        """
        Dispatch to the loss named `loss` (one of "labels", "cardinality", "boxes").

        Raises:
            ValueError: If `loss` is not one of the supported names.
        """
        loss_map = {
            "labels": self.loss_labels,
            "cardinality": self.loss_cardinality,
            "boxes": self.loss_boxes,
        }
        if loss not in loss_map:
            raise ValueError(f"Loss {loss} not supported")
        return loss_map[loss](outputs, targets, indices, num_boxes)

    def forward(self, outputs, targets):
        """
        This performs the loss computation.

        Args:
             outputs (`dict`, *optional*):
                Dictionary of tensors, see the output specification of the model for the format.
             targets (`List[dict]`, *optional*):
                List of dicts, such that `len(targets) == batch_size`. The expected keys in each dict depends on the
                losses applied, see each loss' doc.

        Returns:
            `dict`: Every computed loss keyed by name. Losses of auxiliary outputs carry an `_{i}` suffix and losses
            of the encoder outputs an `_enc` suffix.
        """
        # The matching is computed on the main outputs only.
        outputs_without_aux = {k: v for k, v in outputs.items() if k not in ("auxiliary_outputs", "enc_outputs")}

        # Retrieve the matching between the outputs of the last layer and the targets
        if self.assign_second_stage:
            indices = self.stg2_assigner(outputs_without_aux, targets)
        else:
            indices = self.matcher(outputs_without_aux, targets)

        # Compute the average number of target boxes across all nodes, for normalization purposes
        num_boxes = sum(len(t["class_labels"]) for t in targets)
        num_boxes = torch.as_tensor([num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device)

        # When accelerate has an initialized shared state, reduce the count across processes and
        # divide by the number of processes.
        world_size = 1
        if is_accelerate_available():
            if PartialState._shared_state != {}:
                num_boxes = reduce(num_boxes)
                world_size = PartialState().num_processes
        # Clamp to at least 1 so that the later divisions never divide by zero.
        num_boxes = torch.clamp(num_boxes / world_size, min=1).item()

        # Compute all the requested losses
        losses = {}
        for loss in self.losses:
            losses.update(self.get_loss(loss, outputs, targets, indices, num_boxes))

        # In case of auxiliary losses, we repeat this process with the output of each intermediate layer.
        # With `assign_second_stage` the matching computed above is reused as-is.
        if "auxiliary_outputs" in outputs:
            for i, auxiliary_outputs in enumerate(outputs["auxiliary_outputs"]):
                if not self.assign_second_stage:
                    indices = self.matcher(auxiliary_outputs, targets)
                for loss in self.losses:
                    l_dict = self.get_loss(loss, auxiliary_outputs, targets, indices, num_boxes)
                    l_dict = {k + f"_{i}": v for k, v in l_dict.items()}
                    losses.update(l_dict)

        # Encoder outputs are supervised against a copy of the targets in which every class label is 0.
        if "enc_outputs" in outputs:
            enc_outputs = outputs["enc_outputs"]
            bin_targets = copy.deepcopy(targets)
            for bt in bin_targets:
                bt["class_labels"] = torch.zeros_like(bt["class_labels"])
            if self.assign_first_stage:
                indices = self.stg1_assigner(enc_outputs, bin_targets)
            else:
                indices = self.matcher(enc_outputs, bin_targets)
            for loss in self.losses:
                l_dict = self.get_loss(loss, enc_outputs, bin_targets, indices, num_boxes)
                l_dict = {k + "_enc": v for k, v in l_dict.items()}
                losses.update(l_dict)

        return losses
# Copied from transformers.models.detr.modeling_detr.DetrMLPPredictionHead
class DetaMLPPredictionHead(nn.Module):
    """
    Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates,
    height and width of a bounding box w.r.t. an image.

    Copied from https://github.com/facebookresearch/detr/blob/master/models/detr.py

    """

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
        super().__init__()
        self.num_layers = num_layers
        # Layer widths chain as input_dim -> hidden_dim -> ... -> hidden_dim -> output_dim.
        hidden_widths = [hidden_dim] * (num_layers - 1)
        in_widths = [input_dim, *hidden_widths]
        out_widths = [*hidden_widths, output_dim]
        self.layers = nn.ModuleList(
            [nn.Linear(fan_in, fan_out) for fan_in, fan_out in zip(in_widths, out_widths)]
        )

    def forward(self, x):
        # ReLU follows every layer whose index is below `num_layers - 1`; the final layer stays linear.
        last_index = self.num_layers - 1
        for index, layer in enumerate(self.layers):
            x = layer(x)
            if index < last_index:
                x = nn.functional.relu(x)
        return x


# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrHungarianMatcher with DeformableDetr->Deta
class DetaHungarianMatcher(nn.Module):
    """
    This class computes an assignment between the targets and the predictions of the network.

    For efficiency reasons, the targets don't include the no_object. Because of this, in general, there are more
    predictions than targets. In this case, we do a 1-to-1 matching of the best predictions, while the others are
    un-matched (and thus treated as non-objects).

    Args:
        class_cost:
            The relative weight of the classification error in the matching cost.
        bbox_cost:
            The relative weight of the L1 error of the bounding box coordinates in the matching cost.
        giou_cost:
            The relative weight of the giou loss of the bounding box in the matching cost.
    """

    def __init__(self, class_cost: float = 1, bbox_cost: float = 1, giou_cost: float = 1):
        super().__init__()
        # The assignment step in `forward` relies on the scipy backend.
        requires_backends(self, ["scipy"])

        self.class_cost = class_cost
        self.bbox_cost = bbox_cost
        self.giou_cost = giou_cost
        # A matcher whose three weights are all zero is rejected.
        if all(weight == 0 for weight in (class_cost, bbox_cost, giou_cost)):
            raise ValueError("All costs of the Matcher can't be 0")

    @torch.no_grad()
    def forward(self, outputs, targets):
        """
        Args:
            outputs (`dict`):
                A dictionary that contains at least these entries:
                * "logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits
                * "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates.
            targets (`List[dict]`):
                A list of targets (len(targets) = batch_size), where each target is a dict containing:
                * "class_labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of
                  ground-truth objects in the target) containing the class labels
                * "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates.

        Returns:
            `List[Tuple]`: A list of size `batch_size`, containing tuples of (index_i, index_j) where:
            - index_i is the indices of the selected predictions (in order)
            - index_j is the indices of the corresponding selected targets (in order)
            For each batch element, it holds: len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
        """
        logits = outputs["logits"]
        batch_size, num_queries = logits.shape[:2]

        # We flatten to compute the cost matrices in a batch
        out_prob = logits.flatten(0, 1).sigmoid()  # [batch_size * num_queries, num_classes]
        out_bbox = outputs["pred_boxes"].flatten(0, 1)  # [batch_size * num_queries, 4]

        # Also concat the target labels and boxes
        target_ids = torch.cat([target["class_labels"] for target in targets])
        target_bbox = torch.cat([target["boxes"] for target in targets])

        # Classification cost: focal-style positive term minus negative term, read at the target classes.
        alpha = 0.25
        gamma = 2.0
        negative_term = (1 - alpha) * (out_prob**gamma) * (-(1 - out_prob + 1e-8).log())
        positive_term = alpha * ((1 - out_prob) ** gamma) * (-(out_prob + 1e-8).log())
        classification = positive_term[:, target_ids] - negative_term[:, target_ids]

        # L1 cost between boxes
        l1_distance = torch.cdist(out_bbox, target_bbox, p=1)

        # GIoU cost between boxes (negated, so that a higher GIoU means a lower cost)
        negative_giou = -generalized_box_iou(
            center_to_corners_format(out_bbox), center_to_corners_format(target_bbox)
        )

        # Final cost matrix, reshaped per batch element and moved to the CPU for the assignment solver
        cost_matrix = self.bbox_cost * l1_distance + self.class_cost * classification + self.giou_cost * negative_giou
        cost_matrix = cost_matrix.view(batch_size, num_queries, -1).cpu()

        # Split the target axis per batch element and solve each element against its own targets only.
        sizes = [len(target["boxes"]) for target in targets]
        matches = []
        for sample, cost_block in enumerate(cost_matrix.split(sizes, -1)):
            rows, cols = linear_sum_assignment(cost_block[sample])
            matches.append((torch.as_tensor(rows, dtype=torch.int64), torch.as_tensor(cols, dtype=torch.int64)))
        return matches
# Copied from transformers.models.detr.modeling_detr._upcast
def _upcast(t: Tensor) -> Tensor:
    """
    Protects from numerical overflows in multiplications by upcasting to the equivalent higher type.
    
    Args:
        t (`torch.Tensor`): The input tensor to be upcasted.
    
    Returns:
        `torch.Tensor`: The upcasted tensor.
    """
    if t.is_floating_point():
        return t if t.dtype in (torch.float32, torch.float64) else t.float()
    else:
        return t if t.dtype in (torch.int32, torch.int64) else t.int()


# Copied from transformers.models.detr.modeling_detr.box_area
def box_area(boxes: Tensor) -> Tensor:
    """
    Computes the area of a set of bounding boxes, which are specified by its (x1, y1, x2, y2) coordinates.

    Args:
        boxes (`torch.FloatTensor` of shape `(number_of_boxes, 4)`):
            Boxes for which the area will be computed. They are expected to be in (x1, y1, x2, y2) format with `0 <= x1
            < x2` and `0 <= y1 < y2`.

    Returns:
        `torch.FloatTensor`: a tensor containing the area for each box.
    """
    # Upcast first so that the width * height product is computed in a wider type.
    boxes = _upcast(boxes)
    widths = boxes[:, 2] - boxes[:, 0]
    heights = boxes[:, 3] - boxes[:, 1]
    return widths * heights


# Copied from transformers.models.detr.modeling_detr.box_iou
def box_iou(boxes1, boxes2):
    """
    Computes the IoU (Intersection over Union) between two sets of bounding boxes.

    Args:
        boxes1 (`torch.Tensor`): Bounding boxes in format (x1, y1, x2, y2).
        boxes2 (`torch.Tensor`): Bounding boxes in format (x1, y1, x2, y2).

    Returns:
        A tuple `(iou, union)` of pairwise tensors, one row per box in `boxes1` and one column per box in
        `boxes2`.
    """
    area1 = box_area(boxes1)
    area2 = box_area(boxes2)

    # Corners of the intersection rectangle for every (boxes1, boxes2) pair.
    inter_top_left = torch.max(boxes1[:, None, :2], boxes2[:, :2])  # [N,M,2]
    inter_bottom_right = torch.min(boxes1[:, None, 2:], boxes2[:, 2:])  # [N,M,2]

    # Pairs that do not overlap give a negative extent, which is clamped to zero.
    inter_size = (inter_bottom_right - inter_top_left).clamp(min=0)  # [N,M,2]
    intersection = inter_size[:, :, 0] * inter_size[:, :, 1]  # [N,M]

    union = area1[:, None] + area2 - intersection

    return intersection / union, union


# Copied from transformers.models.detr.modeling_detr.generalized_box_iou
def generalized_box_iou(boxes1, boxes2):
    """
    Computes the Generalized IoU (GIoU) between two sets of bounding boxes.

    Args:
        boxes1 (`torch.Tensor`): Bounding boxes in format (x1, y1, x2, y2).
        boxes2 (`torch.Tensor`): Bounding boxes in format (x1, y1, x2, y2).

    Returns:
        `torch.Tensor`: A [N, M] pairwise matrix containing GIoU values.

    Raises:
        ValueError: If any box has its second corner before its first corner on either axis.
    """
    # Degenerate boxes give inf / nan results, so do an early check.
    boxes1_ordered = (boxes1[:, 2:] >= boxes1[:, :2]).all()
    if not boxes1_ordered:
        raise ValueError(f"boxes1 must be in [x0, y0, x1, y1] (corner) format, but got {boxes1}")
    boxes2_ordered = (boxes2[:, 2:] >= boxes2[:, :2]).all()
    if not boxes2_ordered:
        raise ValueError(f"boxes2 must be in [x0, y0, x1, y1] (corner) format, but got {boxes2}")

    iou, union = box_iou(boxes1, boxes2)

    # Smallest rectangle enclosing both boxes of every pair.
    enclosing_top_left = torch.min(boxes1[:, None, :2], boxes2[:, :2])
    enclosing_bottom_right = torch.max(boxes1[:, None, 2:], boxes2[:, 2:])

    enclosing_size = (enclosing_bottom_right - enclosing_top_left).clamp(min=0)  # [N,M,2]
    enclosing_area = enclosing_size[:, :, 0] * enclosing_size[:, :, 1]

    # GIoU = IoU minus the fraction of the enclosing rectangle not covered by the union.
    return iou - (enclosing_area - union) / enclosing_area


# from https://github.com/facebookresearch/detectron2/blob/cbbc1ce26473cb2a5cc8f58e8ada9ae14cb41052/detectron2/layers/wrappers.py#L100
def nonzero_tuple(x):
    """
    Return the indices of the non-zero entries of `x` as a tuple with one index tensor per dimension.

    Behaves like `x.nonzero(as_tuple=True)`; the scripted branch builds the same tuple by hand as a workaround
    for https://github.com/pytorch/pytorch/issues/38718.
    """
    if torch.jit.is_scripting():
        # A 0-dim tensor is first promoted to 1-D so that `unbind(1)` has a column axis to split on.
        source = x.unsqueeze(0) if x.dim() == 0 else x
        index_matrix = source.nonzero()
        return index_matrix.unbind(1)
    else:
        # Eager mode: the built-in tuple form is available directly.
        return x.nonzero(as_tuple=True)
# from https://github.com/facebookresearch/detectron2/blob/9921a2caa585d4fa66c4b534b6fab6e74d89b582/detectron2/modeling/matcher.py#L9
class DetaMatcher(object):
    """
    This class assigns to each predicted "element" (e.g., a box) a ground-truth element. Each predicted element will
    have exactly zero or one matches; each ground-truth element may be matched to zero or more predicted elements.

    The matching is determined by the MxN match_quality_matrix, that characterizes how well each (ground-truth,
    prediction)-pair match each other. For example, if the elements are boxes, this matrix may contain box
    intersection-over-union overlap values.

    The matcher returns (a) a vector of length N containing the index of the ground-truth element m in [0, M) that
    matches to prediction n in [0, N). (b) a vector of length N containing the labels for each prediction.
    """

    def __init__(self, thresholds: List[float], labels: List[int], allow_low_quality_matches: bool = False):
        """
        Args:
            thresholds (`list[float]`):
                A list of thresholds used to stratify predictions into levels.
            labels (`list[int]`):
                A list of values to label predictions belonging at each level. A label can be one of {-1, 0, 1}
                signifying {ignore, negative class, positive class}, respectively.
            allow_low_quality_matches (`bool`, *optional*, defaults to `False`):
                If `True`, produce additional matches for predictions with maximum match quality lower than
                high_threshold. See `set_low_quality_matches_` for more details.

            For example,
                thresholds = [0.3, 0.5] labels = [0, -1, 1] All predictions with iou < 0.3 will be marked with 0 and
                thus will be considered as false positives while training. All predictions with 0.3 <= iou < 0.5 will
                be marked with -1 and thus will be ignored. All predictions with 0.5 <= iou will be marked with 1 and
                thus will be considered as true positives.

        Raises:
            ValueError: If the first threshold is negative, the thresholds are not sorted in ascending order, a
                label is outside {-1, 0, 1}, or `len(labels) != len(thresholds) + 1`.
        """
        # Work on a copy: the sentinels inserted below must not leak into the caller's list.
        thresholds = thresholds[:]
        if thresholds[0] < 0:
            raise ValueError("Thresholds should be positive")
        # Pad with -inf / +inf so that consecutive pairs describe every half-open interval [low, high).
        thresholds.insert(0, -float("inf"))
        thresholds.append(float("inf"))
        if not all(low <= high for (low, high) in zip(thresholds[:-1], thresholds[1:])):
            raise ValueError("Thresholds should be sorted.")
        if not all(l in [-1, 0, 1] for l in labels):
            raise ValueError("All labels should be either -1, 0 or 1")
        # `thresholds` now includes the two sentinels, hence one label per interval.
        if len(labels) != len(thresholds) - 1:
            raise ValueError("Number of labels should be equal to number of thresholds - 1")
        self.thresholds = thresholds
        self.labels = labels
        self.allow_low_quality_matches = allow_low_quality_matches

    def __call__(self, match_quality_matrix):
        """
        Args:
            match_quality_matrix (Tensor[float]): an MxN tensor, containing the
                pairwise quality between M ground-truth elements and N predicted elements. All elements must be >= 0
                (due to the use of `torch.nonzero` for selecting indices in `set_low_quality_matches_`).

        Returns:
            matches (Tensor[int64]): a vector of length N, where matches[i] is a matched
                ground-truth index in [0, M)
            match_labels (Tensor[int8]): a vector of length N, where pred_labels[i] indicates
                whether a prediction is a true or false positive or ignored
        """
        assert match_quality_matrix.dim() == 2

        if match_quality_matrix.numel() == 0:
            default_matches = match_quality_matrix.new_full((match_quality_matrix.size(1),), 0, dtype=torch.int64)
            # When no ground-truth boxes exist, IoU is defined as 0, so every prediction gets self.labels[0]
            # (usually the background label 0). To ignore them instead, one can use labels=[-1, 0, -1, 1]
            # together with appropriate thresholds.
            default_match_labels = match_quality_matrix.new_full(
                (match_quality_matrix.size(1),), self.labels[0], dtype=torch.int8
            )
            return default_matches, default_match_labels

        assert torch.all(match_quality_matrix >= 0)

        # For each prediction, pick the ground-truth element with the highest match quality.
        matched_vals, matches = match_quality_matrix.max(dim=0)

        # Start from "positive" everywhere, then overwrite per threshold interval.
        match_labels = matches.new_full(matches.size(), 1, dtype=torch.int8)

        for l, low, high in zip(self.labels, self.thresholds[:-1], self.thresholds[1:]):
            low_high = (matched_vals >= low) & (matched_vals < high)
            match_labels[low_high] = l

        if self.allow_low_quality_matches:
            self.set_low_quality_matches_(match_labels, match_quality_matrix)

        return matches, match_labels

    def set_low_quality_matches_(self, match_labels, match_quality_matrix):
        """
        Mark as positive, in place, every prediction that is the best available match for some ground-truth
        element, even when that match quality is low.

        For each ground-truth row, the predictions whose quality equals the row maximum (ties included) get
        label 1 in `match_labels`.

        Args:
            match_labels (Tensor[int8]): length-N label vector produced in `__call__`; modified in place.
            match_quality_matrix (Tensor[float]): the MxN quality matrix passed to `__call__`.
        """
        # For each ground-truth element, the highest quality over all predictions.
        highest_quality_foreach_gt, _ = match_quality_matrix.max(dim=1)
        # Find the highest-quality match available, even if it is low, including ties.
        # Note that the match qualities must be positive because `torch.nonzero` is used.
        _, pred_inds_with_highest_quality = nonzero_tuple(match_quality_matrix == highest_quality_foreach_gt[:, None])
        # If an anchor was labeled positive only because of a low-quality match with gt_A, but it has a larger
        # overlap with gt_B, its matched index will still be gt_B. This follows the Detectron implementation
        # and was shown to have no significant impact.
        match_labels[pred_inds_with_highest_quality] = 1
# 导入 torch 库
import torch

# 定义函数 subsample_labels，用于对标签进行子采样
def subsample_labels(labels: torch.Tensor, num_samples: int, positive_fraction: float, bg_label: int):
    """
    Randomly pick up to `num_samples` entries from `labels`, favouring positives up to a fixed share.

    At most `int(num_samples * positive_fraction)` positives are drawn; the remaining slots are then filled
    with negatives. Fewer than `num_samples` indices come back when there are not enough candidates.

    Args:
        labels (Tensor): (N, ) label vector with values:
            * -1: ignore
            * bg_label: background ("negative") class
            * otherwise: one or more foreground ("positive") classes
        num_samples (int): Upper bound on the total number of sampled indices.
        positive_fraction (float): Share of `num_samples` reserved for positives. The number of positives is
            `min(num_positives, int(positive_fraction * num_samples))` and the number of negatives is
            `min(num_negatives, num_samples - num_positives_sampled)`.
        bg_label (int): Label value of the background ("negative") class.

    Returns:
        pos_idx, neg_idx (Tensor):
            1D index vectors into `labels`. Their combined length is `num_samples` or fewer.
    """
    # Candidate pools: foreground is anything that is neither ignored (-1) nor background.
    is_foreground = (labels != -1) & (labels != bg_label)
    is_background = labels == bg_label
    fg_candidates = torch.nonzero(is_foreground).squeeze(1)
    bg_candidates = torch.nonzero(is_background).squeeze(1)

    # Positives get first claim on the budget; negatives take whatever is left.
    fg_quota = min(fg_candidates.numel(), int(num_samples * positive_fraction))
    bg_quota = min(bg_candidates.numel(), num_samples - fg_quota)

    # Shuffle each pool (positives first, then negatives) and keep the leading entries.
    fg_order = torch.randperm(fg_candidates.numel(), device=fg_candidates.device)
    bg_order = torch.randperm(bg_candidates.numel(), device=bg_candidates.device)

    return fg_candidates[fg_order[:fg_quota]], bg_candidates[bg_order[:bg_quota]]


# 定义函数sample_topk_per_gt，用于对每个真实框(gt)进行top-k匹配采样
def sample_topk_per_gt(pr_inds, gt_inds, iou, k):
    """
    For every ground-truth index that appears in `gt_inds`, keep its highest-IoU predictions.

    Each distinct ground-truth index g that occurs c times in `gt_inds` contributes the first c entries of the
    top-`k` predictions of row `iou[g]` (so at most `k` predictions per ground-truth element).

    Args:
        pr_inds (Tensor): prediction indices; returned unchanged when `gt_inds` is empty, otherwise unused.
        gt_inds (Tensor): ground-truth index matched to each entry of `pr_inds`.
        iou (Tensor): quality matrix indexed as `iou[gt, prediction]`.
        k (int): number of top predictions considered per ground-truth element.

    Returns:
        (Tensor, Tensor): the selected prediction indices and their ground-truth indices, aligned element-wise.
    """
    if len(gt_inds) == 0:
        return pr_inds, gt_inds

    # Top-k predictions for each distinct ground-truth index.
    gt_inds2, counts = gt_inds.unique(return_counts=True)
    _, pr_inds2 = iou[gt_inds2].topk(k, dim=1)
    gt_inds2 = gt_inds2[:, None].repeat(1, k)

    # Keep, per ground-truth element, only as many top entries as it had matches in `gt_inds`.
    pr_inds3 = torch.cat([pr[:c] for c, pr in zip(counts, pr_inds2)])
    gt_inds3 = torch.cat([gt[:c] for c, gt in zip(counts, gt_inds2)])

    return pr_inds3, gt_inds3


class DetaStage2Assigner(nn.Module):
    """
    Second-stage assigner: matches the proposals in `outputs["init_reference"]` to ground-truth boxes by IoU,
    subsamples them and returns the positive (proposal index, ground-truth index) pairs per image.
    """

    def __init__(self, num_queries, max_k=4):
        super().__init__()
        self.positive_fraction = 0.25
        # Background label: a number larger than 91 so that it can be filtered out later.
        self.bg_label = 400
        self.batch_size_per_image = num_queries
        # A proposal is positive when its best IoU is >= 0.6; low-quality matches are allowed as well.
        self.proposal_matcher = DetaMatcher(thresholds=[0.6], labels=[0, 1], allow_low_quality_matches=True)
        self.k = max_k

    def _sample_proposals(self, matched_idxs: torch.Tensor, matched_labels: torch.Tensor, gt_classes: torch.Tensor):
        """
        Based on the matching between N proposals and M ground-truth elements, sample the proposals and set their
        classification labels.

        Args:
            matched_idxs (Tensor): a vector of length N, each entry is the best-matched ground-truth index in
                [0, M) for the corresponding proposal.
            matched_labels (Tensor): a vector of length N with the matcher's label for each proposal.
            gt_classes (Tensor): a vector of length M with the class of each ground-truth element.

        Returns:
            Tensor: indices of the sampled proposals, each in [0, N).
            Tensor: classification label of each sampled proposal, aligned with the indices; either a ground-truth
                class or `self.bg_label` for background.
        """
        has_gt = gt_classes.numel() > 0
        if has_gt:
            # Class of the ground-truth element each proposal was matched to.
            gt_classes = gt_classes[matched_idxs]
            # Proposals the matcher labeled 0 become background.
            gt_classes[matched_labels == 0] = self.bg_label
            # Proposals the matcher labeled -1 stay ignored.
            gt_classes[matched_labels == -1] = -1
        else:
            # Without any ground truth every proposal is background.
            gt_classes = torch.zeros_like(matched_idxs) + self.bg_label

        sampled_fg_idxs, sampled_bg_idxs = subsample_labels(
            gt_classes, self.batch_size_per_image, self.positive_fraction, self.bg_label
        )

        sampled_idxs = torch.cat([sampled_fg_idxs, sampled_bg_idxs], dim=0)
        return sampled_idxs, gt_classes[sampled_idxs]

    def forward(self, outputs, targets, return_cost_matrix=False):
        """
        Compute the positive proposal / ground-truth assignments for every image in the batch.

        Args:
            outputs: mapping that provides `outputs["init_reference"][b]`, the proposal boxes of image b
                (converted with `center_to_corners_format` before the IoU computation).
            targets: one dict per image with the keys `"boxes"` and `"class_labels"`.
            return_cost_matrix (bool): when `True`, also return the per-image IoU matrices.

        Returns:
            A list with one `(pos_pr_inds, pos_gt_inds)` tuple per image, or `(indices, ious)` when
            `return_cost_matrix` is set.
        """
        # COCO categories are from 1 to 90. They set num_classes=91 and apply sigmoid.

        bs = len(targets)
        indices = []
        ious = []
        for b in range(bs):
            # IoU between ground-truth boxes (rows) and proposals (columns), both converted to corner format.
            iou, _ = box_iou(
                center_to_corners_format(targets[b]["boxes"]),
                center_to_corners_format(outputs["init_reference"][b].detach()),
            )
            matched_idxs, matched_labels = self.proposal_matcher(
                iou
            )  # proposal_id -> highest_iou_gt_id, proposal_id -> [1 if iou > 0.6, 0 ow]
            (
                sampled_idxs,
                sampled_gt_classes,
            ) = self._sample_proposals(  # list of sampled proposal_ids, sampled_id -> [0, num_classes)+[bg_label]
                matched_idxs, matched_labels, targets[b]["class_labels"]
            )
            # Keep only the sampled proposals that are not background.
            pos_pr_inds = sampled_idxs[sampled_gt_classes != self.bg_label]
            pos_gt_inds = matched_idxs[pos_pr_inds]
            pos_pr_inds, pos_gt_inds = self.postprocess_indices(pos_pr_inds, pos_gt_inds, iou)
            indices.append((pos_pr_inds, pos_gt_inds))
            ious.append(iou)
        if return_cost_matrix:
            return indices, ious
        return indices

    def postprocess_indices(self, pr_inds, gt_inds, iou):
        """Restrict the positives to the top-`self.k` proposals per ground-truth element."""
        return sample_topk_per_gt(pr_inds, gt_inds, iou, self.k)
# 从 https://github.com/facebookresearch/detectron2/blob/cbbc1ce26473cb2a5cc8f58e8ada9ae14cb41052/detectron2/modeling/proposal_generator/rpn.py#L181 修改而来

class DetaStage1Assigner(nn.Module):
    """
    First-stage assigner: matches the anchors in `outputs["anchors"]` to ground-truth boxes by IoU, subsamples
    the resulting labels and returns the positive (anchor index, ground-truth index) pairs per image.
    """

    def __init__(self, t_low=0.3, t_high=0.7, max_k=4):
        super().__init__()
        # Sampling budget: share of positives and number of labels kept per image.
        self.positive_fraction = 0.5
        self.batch_size_per_image = 256
        # Top-k cap per ground-truth element and the two IoU thresholds.
        self.k = max_k
        self.t_low = t_low
        self.t_high = t_high
        # IoU < t_low -> 0 (negative), t_low <= IoU < t_high -> -1 (ignored), IoU >= t_high -> 1 (positive).
        self.anchor_matcher = DetaMatcher(
            thresholds=[t_low, t_high], labels=[0, -1, 1], allow_low_quality_matches=True
        )

    def _subsample_labels(self, label):
        """
        Randomly sample a subset of positive and negative examples, and overwrite the label vector to the ignore
        value (-1) for all elements that are not included in the sample.

        Args:
            label (Tensor): a vector of -1, 0, 1. Will be modified in-place and returned.
        """
        chosen_pos, chosen_neg = subsample_labels(label, self.batch_size_per_image, self.positive_fraction, 0)
        # Reset everything to "ignore", then write the sampled positives and negatives back.
        return label.fill_(-1).scatter_(0, chosen_pos, 1).scatter_(0, chosen_neg, 0)

    def forward(self, outputs, targets):
        """
        Compute the positive anchor / ground-truth assignments for every image in the batch.

        Args:
            outputs: mapping that provides `outputs["anchors"][b]`, the anchors of image b.
            targets: one dict per image with the key `"boxes"`.

        Returns:
            A list with one `(pos_pr_inds, pos_gt_inds)` tuple of index tensors per image.
        """
        assignments = []
        for image_idx, target in enumerate(targets):
            anchors = outputs["anchors"][image_idx]
            gt_boxes = target["boxes"]

            if len(gt_boxes) == 0:
                # No ground truth for this image: emit a pair of empty index tensors.
                no_anchor_inds = torch.tensor([], dtype=torch.long, device=anchors.device)
                no_gt_inds = torch.tensor([], dtype=torch.long, device=anchors.device)
                assignments.append((no_anchor_inds, no_gt_inds))
                continue

            # IoU between ground-truth boxes (rows) and anchors (columns), both converted to corner format.
            iou, _ = box_iou(
                center_to_corners_format(gt_boxes),
                center_to_corners_format(anchors),
            )
            # proposal_id -> highest_iou_gt_id, proposal_id -> [1 if iou > 0.7, 0 if iou < 0.3, -1 ow]
            matched_idxs, matched_labels = self.anchor_matcher(iou)
            matched_labels = self._subsample_labels(matched_labels)

            # Anchors that are still labeled positive after subsampling, with their ground-truth indices.
            anchor_ids = torch.arange(len(anchors), device=matched_labels.device)
            pos_pr_inds = anchor_ids[matched_labels == 1]
            pos_gt_inds = matched_idxs[pos_pr_inds]
            pos_pr_inds, pos_gt_inds = self.postprocess_indices(pos_pr_inds, pos_gt_inds, iou)
            assignments.append((pos_pr_inds.to(anchors.device), pos_gt_inds.to(anchors.device)))
        return assignments

    def postprocess_indices(self, pr_inds, gt_inds, iou):
        """Restrict the positives to the top-`self.k` anchors per ground-truth element."""
        return sample_topk_per_gt(pr_inds, gt_inds, iou, self.k)

`.\models\deta\__init__.py`

# 引入类型检查标记，如果支持类型检查，表示当前环境可能用于类型检查
from typing import TYPE_CHECKING

# 引入自定义的异常和模块加载延迟工具
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available

# Maps each submodule name to the public names it provides; handed to `_LazyModule` at the bottom of the file.
_import_structure = {
    "configuration_deta": ["DETA_PRETRAINED_CONFIG_ARCHIVE_MAP", "DetaConfig"],
}

# Register the image processor only when `is_vision_available()` reports the vision dependency.
try:
    if not is_vision_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    # Dependency missing: leave the image processor out of the import structure.
    pass
else:
    _import_structure["image_processing_deta"] = ["DetaImageProcessor"]

# Register the modeling classes only when `is_torch_available()` reports that torch is present.
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    # torch missing: leave the modeling classes out of the import structure.
    pass
else:
    _import_structure["modeling_deta"] = [
        "DETA_PRETRAINED_MODEL_ARCHIVE_LIST",
        "DetaForObjectDetection",
        "DetaModel",
        "DetaPreTrainedModel",
    ]

# Under static type checking, import the real names so that type checkers can resolve them.
if TYPE_CHECKING:
    from .configuration_deta import DETA_PRETRAINED_CONFIG_ARCHIVE_MAP, DetaConfig

    # Same vision-availability gate as above, applied to the actual import.
    try:
        if not is_vision_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .image_processing_deta import DetaImageProcessor

    # Same torch-availability gate as above, applied to the actual imports.
    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .modeling_deta import (
            DETA_PRETRAINED_MODEL_ARCHIVE_LIST,
            DetaForObjectDetection,
            DetaModel,
            DetaPreTrainedModel,
        )

# At runtime, replace this module in `sys.modules` with a `_LazyModule` built from `_import_structure`
# (presumably deferring the submodule imports until first attribute access; confirm in `_LazyModule`).
else:
    import sys

    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)

`.\models\detr\configuration_detr.py`

# coding=utf-8
# Copyright 2021 Facebook AI Research and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" DETR model configuration"""

# 从 collections 模块中导入 OrderedDict 类
from collections import OrderedDict
# 导入 Mapping 类型
from typing import Mapping

# 从 packaging 模块中导入 version 函数
from packaging import version

# 从 configuration_utils.py 文件中导入 PretrainedConfig 类
from ...configuration_utils import PretrainedConfig
# 从 onnx.py 文件中导入 OnnxConfig 类
from ...onnx import OnnxConfig
# 从 utils.py 文件中导入 logging 函数
from ...utils import logging
# 从 auto.py 文件中导入 CONFIG_MAPPING 变量
from ..auto import CONFIG_MAPPING

# Module-level logger named after this module.
logger = logging.get_logger(__name__)

# Maps each pretrained DETR checkpoint name to the URL of its `config.json`.
DETR_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "facebook/detr-resnet-50": "https://huggingface.co/facebook/detr-resnet-50/resolve/main/config.json",
    # See all DETR models at https://huggingface.co/models?filter=detr
}

# DetrConfig 类，继承自 PretrainedConfig 类，用于存储 DETR 模型的配置信息
class DetrConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`DetrModel`]. It is used to instantiate a DETR
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the DETR
    [facebook/detr-resnet-50](https://huggingface.co/facebook/detr-resnet-50) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Examples:

    ```
    >>> from transformers import DetrConfig, DetrModel

    >>> # Initializing a DETR facebook/detr-resnet-50 style configuration
    >>> configuration = DetrConfig()

    >>> # Initializing a model (with random weights) from the facebook/detr-resnet-50 style configuration
    >>> model = DetrModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    """

    model_type = "detr"
    # Keys to ignore at inference time.
    keys_to_ignore_at_inference = ["past_key_values"]
    # Aliases: generic attribute name -> DETR-specific attribute name.
    attribute_map = {
        "hidden_size": "d_model",
        "num_attention_heads": "encoder_attention_heads",
    }

    def __init__(
        self,
        use_timm_backbone=True,  # whether to use a timm backbone
        backbone_config=None,  # configuration of the backbone model (config object or dict)
        num_channels=3,  # number of input image channels (3 for RGB)
        num_queries=100,  # number of object queries
        encoder_layers=6,  # number of encoder layers
        encoder_ffn_dim=2048,  # dimension of the encoder feed-forward layer
        encoder_attention_heads=8,  # number of attention heads in the encoder
        decoder_layers=6,  # number of decoder layers
        decoder_ffn_dim=2048,  # dimension of the decoder feed-forward layer
        decoder_attention_heads=8,  # number of attention heads in the decoder
        encoder_layerdrop=0.0,  # layer-drop rate of the encoder
        decoder_layerdrop=0.0,  # layer-drop rate of the decoder
        is_encoder_decoder=True,  # whether the model is an encoder-decoder
        activation_function="relu",  # activation function
        d_model=256,  # model dimension
        dropout=0.1,  # global dropout rate
        attention_dropout=0.0,  # dropout rate of the attention weights
        activation_dropout=0.0,  # dropout rate applied after activations
        init_std=0.02,  # standard deviation for weight initialization
        init_xavier_std=1.0,  # scale used for Xavier initialization
        auxiliary_loss=False,  # whether to use auxiliary losses
        position_embedding_type="sine",  # type of position embedding
        backbone="resnet50",  # name of the backbone
        use_pretrained_backbone=True,  # whether to use pretrained backbone weights
        backbone_kwargs=None,  # extra keyword arguments for the backbone
        dilation=False,  # whether to use dilated convolutions
        class_cost=1,  # weight of the classification cost in matching
        bbox_cost=5,  # weight of the bounding-box cost in matching
        giou_cost=2,  # weight of the GIoU cost in matching
        mask_loss_coefficient=1,  # coefficient of the mask loss
        dice_loss_coefficient=1,  # coefficient of the Dice loss
        bbox_loss_coefficient=5,  # coefficient of the bounding-box loss
        giou_loss_coefficient=2,  # coefficient of the GIoU loss
        eos_coefficient=0.1,  # loss weight of the end-of-sequence ("no object") class
        **kwargs,  # remaining keyword arguments, forwarded to `PretrainedConfig.__init__`
    ):
        # NOTE(review): the body was a bare `pass` in this excerpt, which discarded every argument and left
        # the `num_attention_heads` / `hidden_size` properties below raising AttributeError. The body here
        # stores the arguments and defers to `PretrainedConfig`; confirm any additional argument validation
        # against the upstream implementation.
        if not use_timm_backbone:
            if backbone_config is None:
                logger.info("`backbone_config` is `None`. Initializing the config with the default `ResNet` backbone.")
                backbone_config = CONFIG_MAPPING["resnet"](out_features=["stage4"])
            elif isinstance(backbone_config, dict):
                # A serialized backbone config: rebuild the config object from its `model_type`.
                backbone_model_type = backbone_config.get("model_type")
                config_class = CONFIG_MAPPING[backbone_model_type]
                backbone_config = config_class.from_dict(backbone_config)

        self.use_timm_backbone = use_timm_backbone
        self.backbone_config = backbone_config
        self.num_channels = num_channels
        self.num_queries = num_queries
        self.d_model = d_model
        self.encoder_ffn_dim = encoder_ffn_dim
        self.encoder_layers = encoder_layers
        self.encoder_attention_heads = encoder_attention_heads
        self.decoder_ffn_dim = decoder_ffn_dim
        self.decoder_layers = decoder_layers
        self.decoder_attention_heads = decoder_attention_heads
        self.dropout = dropout
        self.attention_dropout = attention_dropout
        self.activation_dropout = activation_dropout
        self.activation_function = activation_function
        self.init_std = init_std
        self.init_xavier_std = init_xavier_std
        self.encoder_layerdrop = encoder_layerdrop
        self.decoder_layerdrop = decoder_layerdrop
        self.auxiliary_loss = auxiliary_loss
        self.position_embedding_type = position_embedding_type
        self.backbone = backbone
        self.use_pretrained_backbone = use_pretrained_backbone
        self.backbone_kwargs = backbone_kwargs
        self.dilation = dilation
        # Hungarian matcher costs
        self.class_cost = class_cost
        self.bbox_cost = bbox_cost
        self.giou_cost = giou_cost
        # Loss coefficients
        self.mask_loss_coefficient = mask_loss_coefficient
        self.dice_loss_coefficient = dice_loss_coefficient
        self.bbox_loss_coefficient = bbox_loss_coefficient
        self.giou_loss_coefficient = giou_loss_coefficient
        self.eos_coefficient = eos_coefficient
        super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs)

    @property
    def num_attention_heads(self) -> int:
        """Number of attention heads in the encoder (`encoder_attention_heads`)."""
        return self.encoder_attention_heads

    @property
    def hidden_size(self) -> int:
        """Model dimension (`d_model`)."""
        return self.d_model

    @classmethod
    def from_backbone_config(cls, backbone_config: PretrainedConfig, **kwargs):
        """Instantiate a [`DetrConfig`] (or a derived class) from a pre-trained backbone model configuration.

        Args:
            backbone_config ([`PretrainedConfig`]):
                The backbone configuration.

        Returns:
            [`DetrConfig`]: An instance of a configuration object
        """
        return cls(backbone_config=backbone_config, **kwargs)
# 定义一个名为 DetrOnnxConfig 的类，它继承自 OnnxConfig 类
class DetrOnnxConfig(OnnxConfig):
    """ONNX export configuration for DETR: input axes, validation tolerance and default opset."""

    # Declared as version 1.11 via `packaging.version`.
    torch_onnx_minimum_version = version.parse("1.11")

    @property
    def inputs(self) -> Mapping[str, Mapping[int, str]]:
        """Ordered mapping from each model input name to its `{axis index: axis name}` description."""
        axes_by_input = OrderedDict()
        axes_by_input["pixel_values"] = {0: "batch", 1: "num_channels", 2: "height", 3: "width"}
        axes_by_input["pixel_mask"] = {0: "batch"}
        return axes_by_input

    @property
    def atol_for_validation(self) -> float:
        """Absolute tolerance used for validation."""
        tolerance = 1e-5
        return tolerance

    @property
    def default_onnx_opset(self) -> int:
        """Default ONNX opset version."""
        opset = 12
        return opset

Transformers-源码解析-三十八-

Transformers 源码解析（三十八）

.\models\depth_anything\convert_depth_anything_to_hf.py

.\models\depth_anything\modeling_depth_anything.py

.\models\depth_anything\__init__.py

.\models\deta\configuration_deta.py

.\models\deta\convert_deta_resnet_to_pytorch.py

.\models\deta\convert_deta_swin_to_pytorch.py

.\models\deta\image_processing_deta.py

.\models\deta\modeling_deta.py

.\models\deta\__init__.py

.\models\detr\configuration_detr.py

`.\models\depth_anything\convert_depth_anything_to_hf.py`

`.\models\depth_anything\modeling_depth_anything.py`

`.\models\depth_anything\__init__.py`

`.\models\deta\configuration_deta.py`

`.\models\deta\convert_deta_resnet_to_pytorch.py`

`.\models\deta\convert_deta_swin_to_pytorch.py`

`.\models\deta\image_processing_deta.py`

`.\models\deta\modeling_deta.py`

`.\models\deta\__init__.py`

`.\models\detr\configuration_detr.py`