Transformers Source Code Analysis (39)
.\models\detr\convert_detr_original_pytorch_checkpoint_to_pytorch.py
"""Convert DETR checkpoints with timm backbone."""
import argparse
import json
from collections import OrderedDict
from pathlib import Path
import requests
import torch
from huggingface_hub import hf_hub_download
from PIL import Image
from transformers import DetrConfig, DetrForObjectDetection, DetrForSegmentation, DetrImageProcessor
from transformers.utils import logging
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
rename_keys = []
for i in range(6):
rename_keys.append(
(f"transformer.encoder.layers.{i}.self_attn.out_proj.weight", f"encoder.layers.{i}.self_attn.out_proj.weight")
)
rename_keys.append(
(f"transformer.encoder.layers.{i}.self_attn.out_proj.bias", f"encoder.layers.{i}.self_attn.out_proj.bias")
)
rename_keys.append((f"transformer.encoder.layers.{i}.linear1.weight", f"encoder.layers.{i}.fc1.weight"))
rename_keys.append((f"transformer.encoder.layers.{i}.linear1.bias", f"encoder.layers.{i}.fc1.bias"))
rename_keys.append((f"transformer.encoder.layers.{i}.linear2.weight", f"encoder.layers.{i}.fc2.weight"))
rename_keys.append((f"transformer.encoder.layers.{i}.linear2.bias", f"encoder.layers.{i}.fc2.bias"))
rename_keys.append(
(f"transformer.encoder.layers.{i}.norm1.weight", f"encoder.layers.{i}.self_attn_layer_norm.weight")
)
rename_keys.append(
(f"transformer.encoder.layers.{i}.norm1.bias", f"encoder.layers.{i}.self_attn_layer_norm.bias")
)
rename_keys.append(
(f"transformer.encoder.layers.{i}.norm2.weight", f"encoder.layers.{i}.final_layer_norm.weight")
)
rename_keys.append(
(f"transformer.encoder.layers.{i}.norm2.bias", f"encoder.layers.{i}.final_layer_norm.bias")
)
rename_keys.append(
(f"transformer.decoder.layers.{i}.self_attn.out_proj.weight", f"decoder.layers.{i}.self_attn.out_proj.weight")
)
rename_keys.append(
(f"transformer.decoder.layers.{i}.self_attn.out_proj.bias", f"decoder.layers.{i}.self_attn.out_proj.bias")
)
rename_keys.append(
(
f"transformer.decoder.layers.{i}.multihead_attn.out_proj.weight",
f"decoder.layers.{i}.encoder_attn.out_proj.weight",
)
)
rename_keys.append(
(
f"transformer.decoder.layers.{i}.multihead_attn.out_proj.bias",
f"decoder.layers.{i}.encoder_attn.out_proj.bias",
)
)
rename_keys.append(
(f"transformer.decoder.layers.{i}.linear1.weight", f"decoder.layers.{i}.fc1.weight")
)
rename_keys.append(
(f"transformer.decoder.layers.{i}.linear1.bias", f"decoder.layers.{i}.fc1.bias")
)
rename_keys.append(
(f"transformer.decoder.layers.{i}.linear2.weight", f"decoder.layers.{i}.fc2.weight")
)
rename_keys.append(
(f"transformer.decoder.layers.{i}.linear2.bias", f"decoder.layers.{i}.fc2.bias")
)
rename_keys.append(
(f"transformer.decoder.layers.{i}.norm1.weight", f"decoder.layers.{i}.self_attn_layer_norm.weight")
)
rename_keys.append(
(f"transformer.decoder.layers.{i}.norm1.bias", f"decoder.layers.{i}.self_attn_layer_norm.bias")
)
rename_keys.append(
(f"transformer.decoder.layers.{i}.norm2.weight", f"decoder.layers.{i}.encoder_attn_layer_norm.weight")
)
rename_keys.append(
(f"transformer.decoder.layers.{i}.norm2.bias", f"decoder.layers.{i}.encoder_attn_layer_norm.bias")
)
rename_keys.append(
(f"transformer.decoder.layers.{i}.norm3.weight", f"decoder.layers.{i}.final_layer_norm.weight")
)
rename_keys.append(
(f"transformer.decoder.layers.{i}.norm3.bias", f"decoder.layers.{i}.final_layer_norm.bias")
)
rename_keys.extend(
[
("input_proj.weight", "input_projection.weight"),
("input_proj.bias", "input_projection.bias"),
("query_embed.weight", "query_position_embeddings.weight"),
("transformer.decoder.norm.weight", "decoder.layernorm.weight"),
("transformer.decoder.norm.bias", "decoder.layernorm.bias"),
("class_embed.weight", "class_labels_classifier.weight"),
("class_embed.bias", "class_labels_classifier.bias"),
("bbox_embed.layers.0.weight", "bbox_predictor.layers.0.weight"),
("bbox_embed.layers.0.bias", "bbox_predictor.layers.0.bias"),
("bbox_embed.layers.1.weight", "bbox_predictor.layers.1.weight"),
("bbox_embed.layers.1.bias", "bbox_predictor.layers.1.bias"),
("bbox_embed.layers.2.weight", "bbox_predictor.layers.2.weight"),
("bbox_embed.layers.2.bias", "bbox_predictor.layers.2.bias"),
]
)
def rename_key(state_dict, old, new):
val = state_dict.pop(old)
state_dict[new] = val
def rename_backbone_keys(state_dict):
new_state_dict = OrderedDict()
for key, value in state_dict.items():
if "backbone.0.body" in key:
new_key = key.replace("backbone.0.body", "backbone.conv_encoder.model")
new_state_dict[new_key] = value
else:
new_state_dict[key] = value
return new_state_dict
def read_in_q_k_v(state_dict, is_panoptic=False):
prefix = ""
if is_panoptic:
prefix = "detr."
for i in range(6):
in_proj_weight = state_dict.pop(f"{prefix}transformer.encoder.layers.{i}.self_attn.in_proj_weight")
in_proj_bias = state_dict.pop(f"{prefix}transformer.encoder.layers.{i}.self_attn.in_proj_bias")
state_dict[f"encoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:256, :]
state_dict[f"encoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:256]
state_dict[f"encoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[256:512, :]
state_dict[f"encoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[256:512]
state_dict[f"encoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-256:, :]
state_dict[f"encoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-256:]
for i in range(6):
in_proj_weight = state_dict.pop(f"{prefix}transformer.decoder.layers.{i}.self_attn.in_proj_weight")
in_proj_bias = state_dict.pop(f"{prefix}transformer.decoder.layers.{i}.self_attn.in_proj_bias")
state_dict[f"decoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:256, :]
state_dict[f"decoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:256]
state_dict[f"decoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[256:512, :]
state_dict[f"decoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[256:512]
state_dict[f"decoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-256:, :]
state_dict[f"decoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-256:]
in_proj_weight_cross_attn = state_dict.pop(
f"{prefix}transformer.decoder.layers.{i}.multihead_attn.in_proj_weight"
)
in_proj_bias_cross_attn = state_dict.pop(f"{prefix}transformer.decoder.layers.{i}.multihead_attn.in_proj_bias")
state_dict[f"decoder.layers.{i}.encoder_attn.q_proj.weight"] = in_proj_weight_cross_attn[:256, :]
state_dict[f"decoder.layers.{i}.encoder_attn.q_proj.bias"] = in_proj_bias_cross_attn[:256]
state_dict[f"decoder.layers.{i}.encoder_attn.k_proj.weight"] = in_proj_weight_cross_attn[256:512, :]
state_dict[f"decoder.layers.{i}.encoder_attn.k_proj.bias"] = in_proj_bias_cross_attn[256:512]
state_dict[f"decoder.layers.{i}.encoder_attn.v_proj.weight"] = in_proj_weight_cross_attn[-256:, :]
state_dict[f"decoder.layers.{i}.encoder_attn.v_proj.bias"] = in_proj_bias_cross_attn[-256:]
def prepare_img():
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
im = Image.open(requests.get(url, stream=True).raw)
return im
@torch.no_grad()
def convert_detr_checkpoint(model_name, pytorch_dump_folder_path):
"""
Copy/paste/tweak model's weights to our DETR structure.
"""
config = DetrConfig()
if "resnet101" in model_name:
config.backbone = "resnet101"
if "dc5" in model_name:
config.dilation = True
is_panoptic = "panoptic" in model_name
if is_panoptic:
config.num_labels = 250
else:
config.num_labels = 91
repo_id = "huggingface/label-files"
filename = "coco-detection-id2label.json"
id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r"))
id2label = {int(k): v for k, v in id2label.items()}
config.id2label = id2label
config.label2id = {v: k for k, v in id2label.items()}
format = "coco_panoptic" if is_panoptic else "coco_detection"
image_processor = DetrImageProcessor(format=format)
img = prepare_img()
encoding = image_processor(images=img, return_tensors="pt")
pixel_values = encoding["pixel_values"]
logger.info(f"Converting model {model_name}...")
detr = torch.hub.load("facebookresearch/detr", model_name, pretrained=True).eval()
state_dict = detr.state_dict()
for src, dest in rename_keys:
if is_panoptic:
src = "detr." + src
rename_key(state_dict, src, dest)
state_dict = rename_backbone_keys(state_dict)
read_in_q_k_v(state_dict, is_panoptic=is_panoptic)
prefix = "detr.model." if is_panoptic else "model."
for key in state_dict.copy().keys():
if is_panoptic:
if (
key.startswith("detr")
and not key.startswith("class_labels_classifier")
and not key.startswith("bbox_predictor")
):
val = state_dict.pop(key)
state_dict["detr.model" + key[4:]] = val
elif "class_labels_classifier" in key or "bbox_predictor" in key:
val = state_dict.pop(key)
state_dict["detr." + key] = val
elif key.startswith("bbox_attention") or key.startswith("mask_head"):
continue
else:
val = state_dict.pop(key)
state_dict[prefix + key] = val
else:
if not key.startswith("class_labels_classifier") and not key.startswith("bbox_predictor"):
val = state_dict.pop(key)
state_dict[prefix + key] = val
model = DetrForSegmentation(config) if is_panoptic else DetrForObjectDetection(config)
model.load_state_dict(state_dict)
model.eval()
original_outputs = detr(pixel_values)
outputs = model(pixel_values)
assert torch.allclose(outputs.logits, original_outputs["pred_logits"], atol=1e-4)
assert torch.allclose(outputs.pred_boxes, original_outputs["pred_boxes"], atol=1e-4)
if is_panoptic:
assert torch.allclose(outputs.pred_masks, original_outputs["pred_masks"], atol=1e-4)
logger.info(f"Saving PyTorch model and image processor to {pytorch_dump_folder_path}...")
Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
model.save_pretrained(pytorch_dump_folder_path)
image_processor.save_pretrained(pytorch_dump_folder_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--model_name", default="detr_resnet50", type=str, help="Name of the DETR model you'd like to convert."
)
parser.add_argument(
"--pytorch_dump_folder_path", default=None, type=str, help="Path to the folder to output PyTorch model."
)
args = parser.parse_args()
convert_detr_checkpoint(args.model_name, args.pytorch_dump_folder_path)
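Once the script has run, the dump folder can be reloaded like any other checkpoint. A hypothetical usage sketch, where the folder path is whatever was passed to `--pytorch_dump_folder_path`:

```python
# Hypothetical reload of the converted checkpoint; the path is an assumption.
from transformers import DetrForObjectDetection, DetrImageProcessor

dump_dir = "./detr_resnet50_converted"  # value passed to --pytorch_dump_folder_path
model = DetrForObjectDetection.from_pretrained(dump_dir)
processor = DetrImageProcessor.from_pretrained(dump_dir)
```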
.\models\detr\convert_detr_to_pytorch.py
"""Convert DETR checkpoints with native (Transformers) backbone."""
import argparse
import json
from pathlib import Path
import requests
import torch
from huggingface_hub import hf_hub_download
from PIL import Image
from transformers import DetrConfig, DetrForObjectDetection, DetrForSegmentation, DetrImageProcessor, ResNetConfig
from transformers.utils import logging
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
def get_detr_config(model_name):
if "resnet-50" in model_name:
backbone_config = ResNetConfig.from_pretrained("microsoft/resnet-50")
elif "resnet-101" in model_name:
backbone_config = ResNetConfig.from_pretrained("microsoft/resnet-101")
else:
raise ValueError("Model name should include either resnet50 or resnet101")
config = DetrConfig(use_timm_backbone=False, backbone_config=backbone_config)
is_panoptic = "panoptic" in model_name
if is_panoptic:
config.num_labels = 250
else:
config.num_labels = 91
repo_id = "huggingface/label-files"
filename = "coco-detection-id2label.json"
id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r"))
id2label = {int(k): v for k, v in id2label.items()}
config.id2label = id2label
config.label2id = {v: k for k, v in id2label.items()}
return config, is_panoptic
def create_rename_keys(config):
rename_keys = []
rename_keys.append(("backbone.0.body.conv1.weight", "backbone.conv_encoder.model.embedder.embedder.convolution.weight"))
rename_keys.append(("backbone.0.body.bn1.weight", "backbone.conv_encoder.model.embedder.embedder.normalization.weight"))
rename_keys.append(("backbone.0.body.bn1.bias", "backbone.conv_encoder.model.embedder.embedder.normalization.bias"))
rename_keys.append(("backbone.0.body.bn1.running_mean", "backbone.conv_encoder.model.embedder.embedder.normalization.running_mean"))
rename_keys.append(("backbone.0.body.bn1.running_var", "backbone.conv_encoder.model.embedder.embedder.normalization.running_var"))
rename_keys.extend(
[
("input_proj.weight", "input_projection.weight"),
("input_proj.bias", "input_projection.bias"),
("query_embed.weight", "query_position_embeddings.weight"),
("transformer.decoder.norm.weight", "decoder.layernorm.weight"),
("transformer.decoder.norm.bias", "decoder.layernorm.bias"),
("class_embed.weight", "class_labels_classifier.weight"),
("class_embed.bias", "class_labels_classifier.bias"),
("bbox_embed.layers.0.weight", "bbox_predictor.layers.0.weight"),
("bbox_embed.layers.0.bias", "bbox_predictor.layers.0.bias"),
("bbox_embed.layers.1.weight", "bbox_predictor.layers.1.weight"),
("bbox_embed.layers.1.bias", "bbox_predictor.layers.1.bias"),
("bbox_embed.layers.2.weight", "bbox_predictor.layers.2.weight"),
("bbox_embed.layers.2.bias", "bbox_predictor.layers.2.bias"),
]
)
return rename_keys
def rename_key(state_dict, old, new):
val = state_dict.pop(old)
state_dict[new] = val
def read_in_q_k_v(state_dict, is_panoptic=False):
prefix = ""
if is_panoptic:
prefix = "detr."
for i in range(6):
in_proj_weight = state_dict.pop(f"{prefix}transformer.encoder.layers.{i}.self_attn.in_proj_weight")
in_proj_bias = state_dict.pop(f"{prefix}transformer.encoder.layers.{i}.self_attn.in_proj_bias")
state_dict[f"encoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:256, :]
state_dict[f"encoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:256]
state_dict[f"encoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[256:512, :]
state_dict[f"encoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[256:512]
state_dict[f"encoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-256:, :]
state_dict[f"encoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-256:]
for i in range(6):
in_proj_weight = state_dict.pop(f"{prefix}transformer.decoder.layers.{i}.self_attn.in_proj_weight")
in_proj_bias = state_dict.pop(f"{prefix}transformer.decoder.layers.{i}.self_attn.in_proj_bias")
state_dict[f"decoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:256, :]
state_dict[f"decoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:256]
state_dict[f"decoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[256:512, :]
state_dict[f"decoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[256:512]
state_dict[f"decoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-256:, :]
state_dict[f"decoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-256:]
in_proj_weight_cross_attn = state_dict.pop(
f"{prefix}transformer.decoder.layers.{i}.multihead_attn.in_proj_weight"
)
in_proj_bias_cross_attn = state_dict.pop(
f"{prefix}transformer.decoder.layers.{i}.multihead_attn.in_proj_bias"
)
state_dict[f"decoder.layers.{i}.encoder_attn.q_proj.weight"] = in_proj_weight_cross_attn[:256, :]
state_dict[f"decoder.layers.{i}.encoder_attn.q_proj.bias"] = in_proj_bias_cross_attn[:256]
state_dict[f"decoder.layers.{i}.encoder_attn.k_proj.weight"] = in_proj_weight_cross_attn[256:512, :]
state_dict[f"decoder.layers.{i}.encoder_attn.k_proj.bias"] = in_proj_bias_cross_attn[256:512]
state_dict[f"decoder.layers.{i}.encoder_attn.v_proj.weight"] = in_proj_weight_cross_attn[-256:, :]
state_dict[f"decoder.layers.{i}.encoder_attn.v_proj.bias"] = in_proj_bias_cross_attn[-256:]
def prepare_img():
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
im = Image.open(requests.get(url, stream=True).raw)
return im
@torch.no_grad()
def convert_detr_checkpoint(model_name, pytorch_dump_folder_path=None, push_to_hub=False):
"""
Copy/paste/tweak model's weights to our DETR structure.
"""
config, is_panoptic = get_detr_config(model_name)
model_name_to_original_name = {
"detr-resnet-50": "detr_resnet50",
"detr-resnet-101": "detr_resnet101",
}
logger.info(f"Converting model {model_name}...")
detr = torch.hub.load("facebookresearch/detr", model_name_to_original_name[model_name], pretrained=True).eval()
state_dict = detr.state_dict()
for src, dest in create_rename_keys(config):
if is_panoptic:
src = "detr." + src
rename_key(state_dict, src, dest)
read_in_q_k_v(state_dict, is_panoptic=is_panoptic)
prefix = "detr.model." if is_panoptic else "model."
for key in state_dict.copy().keys():
if is_panoptic:
if (
key.startswith("detr")
and not key.startswith("class_labels_classifier")
and not key.startswith("bbox_predictor")
):
val = state_dict.pop(key)
state_dict["detr.model" + key[4:]] = val
elif "class_labels_classifier" in key or "bbox_predictor" in key:
val = state_dict.pop(key)
state_dict["detr." + key] = val
elif key.startswith("bbox_attention") or key.startswith("mask_head"):
continue
else:
val = state_dict.pop(key)
state_dict[prefix + key] = val
else:
if not key.startswith("class_labels_classifier") and not key.startswith("bbox_predictor"):
val = state_dict.pop(key)
state_dict[prefix + key] = val
model = DetrForSegmentation(config) if is_panoptic else DetrForObjectDetection(config)
model.load_state_dict(state_dict)
model.eval()
format = "coco_panoptic" if is_panoptic else "coco_detection"
processor = DetrImageProcessor(format=format)
encoding = processor(images=prepare_img(), return_tensors="pt")
pixel_values = encoding["pixel_values"]
original_outputs = detr(pixel_values)
outputs = model(pixel_values)
assert torch.allclose(outputs.logits, original_outputs["pred_logits"], atol=1e-3)
assert torch.allclose(outputs.pred_boxes, original_outputs["pred_boxes"], atol=1e-3)
if is_panoptic:
assert torch.allclose(outputs.pred_masks, original_outputs["pred_masks"], atol=1e-4)
print("Looks ok!")
if pytorch_dump_folder_path is not None:
logger.info(f"Saving PyTorch model and image processor to {pytorch_dump_folder_path}...")
Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
model.save_pretrained(pytorch_dump_folder_path)
processor.save_pretrained(pytorch_dump_folder_path)
if push_to_hub:
logger.info("Uploading PyTorch model and image processor to the hub...")
model.push_to_hub(f"nielsr/{model_name}")
processor.push_to_hub(f"nielsr/{model_name}")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--model_name",
default="detr-resnet-50",
type=str,
choices=["detr-resnet-50", "detr-resnet-101"],
help="Name of the DETR model you'd like to convert."
)
parser.add_argument(
"--pytorch_dump_folder_path",
default=None,
type=str,
help="Path to the folder to output PyTorch model."
)
parser.add_argument(
"--push_to_hub",
action="store_true",
help="Whether to push the model to the hub or not."
)
args = parser.parse_args()
convert_detr_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub)
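For an end-to-end sanity check of a converted checkpoint, the usual object-detection inference flow applies. The sketch below uses the publicly available `facebook/detr-resnet-50` checkpoint as a stand-in for whatever repo the converted model was pushed to:

```python
# Sketch of running inference with a (converted) DETR checkpoint.
import requests
import torch
from PIL import Image
from transformers import DetrForObjectDetection, DetrImageProcessor

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

processor = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50")
model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50")

inputs = processor(images=image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# Rescale boxes back to the original image size and keep confident predictions
target_sizes = torch.tensor([image.size[::-1]])  # (height, width)
results = processor.post_process_object_detection(outputs, threshold=0.9, target_sizes=target_sizes)[0]
for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
    print(model.config.id2label[label.item()], round(score.item(), 3), box.tolist())
```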
.\models\detr\feature_extraction_detr.py
"""DETR 的特征提取器类。"""
import warnings
from ...image_transforms import rgb_to_id as _rgb_to_id
from ...utils import logging
from .image_processing_detr import DetrImageProcessor
logger = logging.get_logger(__name__)
def rgb_to_id(x):
warnings.warn(
"rgb_to_id has moved and will not be importable from this module from v5. "
"Please import from transformers.image_transforms instead.",
FutureWarning,
)
return _rgb_to_id(x)
class DetrFeatureExtractor(DetrImageProcessor):
def __init__(self, *args, **kwargs) -> None:
warnings.warn(
"The class DetrFeatureExtractor is deprecated and will be removed in version 5 of Transformers."
" Please use DetrImageProcessor instead.",
FutureWarning,
)
super().__init__(*args, **kwargs)
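Since `DetrFeatureExtractor` is only a deprecation shim around `DetrImageProcessor`, both constructions below yield an equivalent processor; the second avoids the `FutureWarning`:

```python
from transformers import DetrFeatureExtractor, DetrImageProcessor

feature_extractor = DetrFeatureExtractor(format="coco_detection")   # deprecated, emits a FutureWarning
image_processor = DetrImageProcessor(format="coco_detection")       # preferred
```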
.\models\detr\image_processing_detr.py
"""Image processor class for DETR."""
import io
import pathlib
from collections import defaultdict
from typing import Any, Callable, Dict, Iterable, List, Optional, Set, Tuple, Union
import numpy as np
from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
from ...image_transforms import (
PaddingMode,
center_to_corners_format,
corners_to_center_format,
id_to_rgb,
pad,
rescale,
resize,
rgb_to_id,
to_channel_dimension_format,
)
from ...image_utils import (
IMAGENET_DEFAULT_MEAN,
IMAGENET_DEFAULT_STD,
AnnotationFormat,
AnnotationType,
ChannelDimension,
ImageInput,
PILImageResampling,
get_image_size,
infer_channel_dimension_format,
is_scaled_image,
make_list_of_images,
to_numpy_array,
valid_images,
validate_annotations,
validate_kwargs,
validate_preprocess_arguments,
)
from ...utils import (
TensorType,
is_flax_available,
is_jax_tensor,
is_scipy_available,
is_tf_available,
is_tf_tensor,
is_torch_available,
is_torch_tensor,
is_vision_available,
logging,
)
if is_torch_available():
import torch
from torch import nn
if is_vision_available():
import PIL
if is_scipy_available():
import scipy.special
import scipy.stats
logger = logging.get_logger(__name__)
SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)
def get_size_with_aspect_ratio(image_size, size, max_size=None) -> Tuple[int, int]:
"""
Computes the output image size given the input image size and the desired output size.
Args:
image_size (`Tuple[int, int]`):
The input image size.
size (`int`):
The desired output size.
max_size (`int`, *optional*):
The maximum allowed output size.
"""
height, width = image_size
if max_size is not None:
min_original_size = float(min((height, width)))
max_original_size = float(max((height, width)))
if max_original_size / min_original_size * size > max_size:
size = int(round(max_size * min_original_size / max_original_size))
if (height <= width and height == size) or (width <= height and width == size):
return height, width
if width < height:
ow = size
oh = int(size * height / width)
else:
oh = size
ow = int(size * width / height)
return (oh, ow)
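A small worked example of the rule above (not part of the module): the shorter edge is matched to `size` unless that would push the longer edge past `max_size`, in which case `size` is scaled down first.

```python
from transformers.models.detr.image_processing_detr import get_size_with_aspect_ratio

print(get_size_with_aspect_ratio((480, 640), size=800, max_size=1333))  # (800, 1066)
print(get_size_with_aspect_ratio((480, 640), size=800, max_size=1000))  # (750, 1000): longer edge capped
```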
def get_resize_output_image_size(
input_image: np.ndarray,
size: Union[int, Tuple[int, int], List[int]],
max_size: Optional[int] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> Tuple[int, int]:
"""
Args:
input_image (`np.ndarray`):
The image to resize.
size (`int` or `Tuple[int, int]` or `List[int]`):
The desired output size.
max_size (`int`, *optional*):
The maximum allowed output size.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred from the input image.
"""
image_size = get_image_size(input_image, input_data_format)
if isinstance(size, (list, tuple)):
return size
return get_size_with_aspect_ratio(image_size, size, max_size)
def get_numpy_to_framework_fn(arr) -> Callable:
"""
Args:
arr (`np.ndarray`): The array to convert.
"""
if isinstance(arr, np.ndarray):
return np.array
if is_tf_available() and is_tf_tensor(arr):
import tensorflow as tf
return tf.convert_to_tensor
if is_torch_available() and is_torch_tensor(arr):
import torch
return torch.tensor
if is_flax_available() and is_jax_tensor(arr):
import jax.numpy as jnp
return jnp.array
raise ValueError(f"Cannot convert arrays of type {type(arr)}")
def safe_squeeze(arr: np.ndarray, axis: Optional[int] = None) -> np.ndarray:
"""
Args:
arr (`np.ndarray`): The array to squeeze.
axis (`int`, *optional*): The axis to squeeze.
"""
if axis is None:
return arr.squeeze()
try:
return arr.squeeze(axis=axis)
except ValueError:
return arr
def normalize_annotation(annotation: Dict, image_size: Tuple[int, int]) -> Dict:
"""
Args:
annotation (`Dict`): The annotation to normalize.
image_size (`Tuple[int, int]`): The height and width of the image.
"""
image_height, image_width = image_size
norm_annotation = {}
for key, value in annotation.items():
if key == "boxes":
boxes = value
boxes = corners_to_center_format(boxes)
boxes /= np.asarray([image_width, image_height, image_width, image_height], dtype=np.float32)
norm_annotation[key] = boxes
else:
norm_annotation[key] = value
return norm_annotation
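A worked example of the normalization above, assuming the module-level function is imported directly: a corner-format box `[100, 50, 300, 250]` on a 400x600 (height x width) image becomes `[200, 150, 200, 200]` in center format and, after dividing by `[600, 400, 600, 400]`, roughly `[0.333, 0.375, 0.333, 0.5]`.

```python
import numpy as np
from transformers.models.detr.image_processing_detr import normalize_annotation

annotation = {"boxes": np.array([[100.0, 50.0, 300.0, 250.0]], dtype=np.float32)}
print(normalize_annotation(annotation, image_size=(400, 600))["boxes"])  # ~[[0.3333 0.375 0.3333 0.5]]
```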
def max_across_indices(values: Iterable[Any]) -> List[Any]:
"""
Args:
values (`Iterable[Any]`): The iterable of values to compare.
"""
return [max(values_i) for values_i in zip(*values)]
def get_max_height_width(
images: List[np.ndarray], input_data_format: Optional[Union[str, ChannelDimension]] = None
) -> List[int]:
"""
Get the maximum height and width across all images in a batch.
"""
if input_data_format is None:
input_data_format = infer_channel_dimension_format(images[0])
if input_data_format == ChannelDimension.FIRST:
_, max_height, max_width = max_across_indices([img.shape for img in images])
elif input_data_format == ChannelDimension.LAST:
max_height, max_width, _ = max_across_indices([img.shape for img in images])
else:
raise ValueError(f"Invalid channel dimension format: {input_data_format}")
return (max_height, max_width)
def make_pixel_mask(
image: np.ndarray, output_size: Tuple[int, int], input_data_format: Optional[Union[str, ChannelDimension]] = None
) -> np.ndarray:
"""
Make a pixel mask for the image, where 1 indicates a valid pixel and 0 indicates padding.
Args:
image (`np.ndarray`):
Image to make the pixel mask for.
output_size (`Tuple[int, int]`):
Output size of the mask.
"""
input_height, input_width = get_image_size(image, channel_dim=input_data_format)
mask = np.zeros(output_size, dtype=np.int64)
mask[:input_height, :input_width] = 1
return mask
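Together, `get_max_height_width` and `make_pixel_mask` implement the batching convention used later by `pad`: every image is padded to the largest height/width in the batch and the mask marks which pixels are real. A small sketch:

```python
import numpy as np
from transformers.models.detr.image_processing_detr import get_max_height_width, make_pixel_mask

images = [np.zeros((3, 480, 640)), np.zeros((3, 600, 400))]   # channels-first images
max_size = get_max_height_width(images)                        # (600, 640)
mask = make_pixel_mask(images[0], output_size=max_size)        # 1 inside the 480x640 region, 0 elsewhere
print(max_size, mask.shape, int(mask.sum()))                   # (600, 640) (600, 640) 307200
```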
def convert_coco_poly_to_mask(segmentations, height: int, width: int) -> np.ndarray:
"""
Convert a COCO polygon annotation to a mask.
Args:
segmentations (`List[List[float]]`):
List of polygons, each polygon represented by a list of x-y coordinates.
height (`int`):
Height of the mask.
width (`int`):
Width of the mask.
"""
try:
from pycocotools import mask as coco_mask
except ImportError:
raise ImportError("Pycocotools is not installed in your environment.")
masks = []
for polygons in segmentations:
rles = coco_mask.frPyObjects(polygons, height, width)
mask = coco_mask.decode(rles)
if len(mask.shape) < 3:
mask = mask[..., None]
mask = np.asarray(mask, dtype=np.uint8)
mask = np.any(mask, axis=2)
masks.append(mask)
if masks:
masks = np.stack(masks, axis=0)
else:
masks = np.zeros((0, height, width), dtype=np.uint8)
return masks
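A hedged example of the polygon rasterization above (it requires `pycocotools` to be installed): a single square polygon produces one binary mask.

```python
from transformers.models.detr.image_processing_detr import convert_coco_poly_to_mask

# One object with one polygon: a square with corners (2, 2) .. (6, 6) in x-y coordinates
segmentations = [[[2.0, 2.0, 6.0, 2.0, 6.0, 6.0, 2.0, 6.0]]]
masks = convert_coco_poly_to_mask(segmentations, height=10, width=10)
print(masks.shape)      # (1, 10, 10)
print(masks[0].sum())   # number of pixels inside the square
```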
def prepare_coco_detection_annotation(
image,
target,
return_segmentation_masks: bool = False,
input_data_format: Optional[Union[ChannelDimension, str]] = None,
):
"""
Convert the target in COCO format into the format expected by DETR.
"""
image_height, image_width = get_image_size(image, channel_dim=input_data_format)
image_id = target["image_id"]
image_id = np.asarray([image_id], dtype=np.int64)
annotations = target["annotations"]
annotations = [obj for obj in annotations if "iscrowd" not in obj or obj["iscrowd"] == 0]
classes = [obj["category_id"] for obj in annotations]
classes = np.asarray(classes, dtype=np.int64)
area = np.asarray([obj["area"] for obj in annotations], dtype=np.float32)
iscrowd = np.asarray([obj["iscrowd"] if "iscrowd" in obj else 0 for obj in annotations], dtype=np.int64)
boxes = [obj["bbox"] for obj in annotations]
boxes = np.asarray(boxes, dtype=np.float32).reshape(-1, 4)
boxes[:, 2:] += boxes[:, :2]
boxes[:, 0::2] = boxes[:, 0::2].clip(min=0, max=image_width)
boxes[:, 1::2] = boxes[:, 1::2].clip(min=0, max=image_height)
keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0])
new_target = {}
new_target["image_id"] = image_id
new_target["class_labels"] = classes[keep]
new_target["boxes"] = boxes[keep]
new_target["area"] = area[keep]
new_target["iscrowd"] = iscrowd[keep]
new_target["orig_size"] = np.asarray([int(image_height), int(image_width)], dtype=np.int64)
if annotations and "keypoints" in annotations[0]:
keypoints = [obj["keypoints"] for obj in annotations]
keypoints = np.asarray(keypoints, dtype=np.float32)
keypoints = keypoints[keep]
num_keypoints = keypoints.shape[0]
keypoints = keypoints.reshape((-1, 3)) if num_keypoints else keypoints
new_target["keypoints"] = keypoints
if return_segmentation_masks:
segmentation_masks = [obj["segmentation"] for obj in annotations]
masks = convert_coco_poly_to_mask(segmentation_masks, image_height, image_width)
new_target["masks"] = masks[keep]
return new_target
def masks_to_boxes(masks: np.ndarray) -> np.ndarray:
"""
Compute the bounding boxes around the provided panoptic segmentation masks.
Args:
masks: masks in format `[number_masks, height, width]`, where N is the number of masks
Returns:
boxes: bounding boxes in `[number_masks, 4]` format, in xyxy coordinates
"""
if masks.size == 0:
return np.zeros((0, 4))
h, w = masks.shape[-2:]
y = np.arange(0, h, dtype=np.float32)
x = np.arange(0, w, dtype=np.float32)
y, x = np.meshgrid(y, x, indexing="ij")
x_mask = masks * np.expand_dims(x, axis=0)
x_max = x_mask.reshape(x_mask.shape[0], -1).max(-1)
x = np.ma.array(x_mask, mask=~(np.array(masks, dtype=bool)))
x_min = x.filled(fill_value=1e8)
x_min = x_min.reshape(x_min.shape[0], -1).min(-1)
y_mask = masks * np.expand_dims(y, axis=0)
y_max = y_mask.reshape(x_mask.shape[0], -1).max(-1)
y = np.ma.array(y_mask, mask=~(np.array(masks, dtype=bool)))
y_min = y.filled(fill_value=1e8)
y_min = y_min.reshape(y_min.shape[0], -1).min(-1)
return np.stack([x_min, y_min, x_max, y_max], 1)
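A small check of `masks_to_boxes`: a single 10x10 mask whose foreground spans rows 2..4 and columns 3..6 yields the box `[3, 2, 6, 4]` in `(x_min, y_min, x_max, y_max)` format.

```python
import numpy as np
from transformers.models.detr.image_processing_detr import masks_to_boxes

mask = np.zeros((1, 10, 10), dtype=np.uint8)
mask[0, 2:5, 3:7] = 1
print(masks_to_boxes(mask))  # [[3. 2. 6. 4.]]
```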
def prepare_coco_panoptic_annotation(
image: np.ndarray,
target: Dict,
masks_path: Union[str, pathlib.Path],
return_masks: bool = True,
input_data_format: Union[ChannelDimension, str] = None,
) -> Dict:
"""
Prepare a COCO panoptic annotation for DETR.
Args:
image: the input image
target: the dictionary containing the target information
masks_path: the path to the directory containing the segmentation masks
return_masks: whether to return the masks
input_data_format: the channel dimension format of the input data
Returns:
new_target: the prepared COCO panoptic annotation dictionary
"""
image_height, image_width = get_image_size(image, channel_dim=input_data_format)
annotation_path = pathlib.Path(masks_path) / target["file_name"]
new_target = {}
new_target["image_id"] = np.asarray([target["image_id"] if "image_id" in target else target["id"]], dtype=np.int64)
new_target["size"] = np.asarray([image_height, image_width], dtype=np.int64)
new_target["orig_size"] = np.asarray([image_height, image_width], dtype=np.int64)
if "segments_info" in target:
masks = np.asarray(PIL.Image.open(annotation_path), dtype=np.uint32)
masks = rgb_to_id(masks)
ids = np.array([segment_info["id"] for segment_info in target["segments_info"]])
masks = masks == ids[:, None, None]
masks = masks.astype(np.uint8)
if return_masks:
new_target["masks"] = masks
new_target["boxes"] = masks_to_boxes(masks)
new_target["class_labels"] = np.array(
[segment_info["category_id"] for segment_info in target["segments_info"]], dtype=np.int64
)
new_target["iscrowd"] = np.asarray(
[segment_info["iscrowd"] for segment_info in target["segments_info"]], dtype=np.int64
)
new_target["area"] = np.asarray(
[segment_info["area"] for segment_info in target["segments_info"]], dtype=np.float32
)
return new_target
def get_segmentation_image(
masks: np.ndarray, input_size: Tuple, target_size: Tuple, stuff_equiv_classes, deduplicate=False
):
"""
Get the segmentation image.
Args:
masks: the array of segmentation masks
input_size: the input size
target_size: the target size
stuff_equiv_classes: the equivalence classes for "stuff" categories
deduplicate: whether to deduplicate segment ids
Returns:
segmentation_image: the segmentation image
"""
h, w = input_size
final_h, final_w = target_size
m_id = scipy.special.softmax(masks.transpose(0, 1), -1)
if m_id.shape[-1] == 0:
m_id = np.zeros((h, w), dtype=np.int64)
else:
m_id = m_id.argmax(-1).reshape(h, w)
if deduplicate:
for equiv in stuff_equiv_classes.values():
for eq_id in equiv:
m_id[m_id == eq_id] = equiv[0]
seg_img = id_to_rgb(m_id)
seg_img = resize(seg_img, (final_w, final_h), resample=PILImageResampling.NEAREST)
return seg_img
def post_process_panoptic_sample(
out_logits: np.ndarray,
masks: np.ndarray,
boxes: np.ndarray,
processed_size: Tuple[int, int],
target_size: Tuple[int, int],
is_thing_map: Dict,
threshold=0.85,
) -> Dict:
"""
Converts the output of [`DetrForSegmentation`] into panoptic segmentation predictions for a single sample.
Args:
out_logits (`torch.Tensor`):
The logits for this sample.
masks (`torch.Tensor`):
The predicted segmentation masks for this sample.
boxes (`torch.Tensor`):
The predicted bounding boxes for this sample. The boxes are in the normalized format `(center_x, center_y,
width, height)` and values between `[0, 1]`, relative to the size the image (disregarding padding).
processed_size (`Tuple[int, int]`):
The processed size of the image `(height, width)`, as returned by the preprocessing step i.e. the size
after data augmentation but before batching.
target_size (`Tuple[int, int]`):
The target size of the image, `(height, width)` corresponding to the requested final size of the
prediction.
is_thing_map (`Dict`):
A dictionary mapping class indices to a boolean value indicating whether the class is a thing or not.
threshold (`float`, *optional*, defaults to 0.85):
The threshold used to binarize the segmentation masks.
"""
scores, labels = score_labels_from_class_probabilities(out_logits)
keep = (labels != out_logits.shape[-1] - 1) & (scores > threshold)
cur_scores = scores[keep]
cur_classes = labels[keep]
cur_boxes = center_to_corners_format(boxes[keep])
if len(cur_boxes) != len(cur_classes):
raise ValueError("Not as many boxes as there are classes")
cur_masks = masks[keep]
cur_masks = resize(cur_masks[:, None], processed_size, resample=PILImageResampling.BILINEAR)
cur_masks = safe_squeeze(cur_masks, 1)
b, h, w = cur_masks.shape
cur_masks = cur_masks.reshape(b, -1)
stuff_equiv_classes = defaultdict(list)
for k, label in enumerate(cur_classes):
if not is_thing_map[label]:
stuff_equiv_classes[label].append(k)
seg_img = get_segmentation_image(cur_masks, processed_size, target_size, stuff_equiv_classes, deduplicate=True)
area = get_mask_area(cur_masks, processed_size, n_classes=len(cur_scores))
if cur_classes.size() > 0:
filtered_small = np.array([a <= 4 for a in area], dtype=bool)
while filtered_small.any():
cur_masks = cur_masks[~filtered_small]
cur_scores = cur_scores[~filtered_small]
cur_classes = cur_classes[~filtered_small]
seg_img = get_segmentation_image(cur_masks, (h, w), target_size, stuff_equiv_classes, deduplicate=True)
area = get_mask_area(seg_img, target_size, n_classes=len(cur_scores))
filtered_small = np.array([a <= 4 for a in area], dtype=bool)
else:
cur_classes = np.ones((1, 1), dtype=np.int64)
segments_info = [
{"id": i, "isthing": is_thing_map[cat], "category_id": int(cat), "area": a}
for i, (cat, a) in enumerate(zip(cur_classes, area))
]
del cur_classes
with io.BytesIO() as out:
PIL.Image.fromarray(seg_img).save(out, format="PNG")
predictions = {"png_string": out.getvalue(), "segments_info": segments_info}
return predictions
def resize_annotation(
annotation: Dict[str, Any],
orig_size: Tuple[int, int],
target_size: Tuple[int, int],
threshold: float = 0.5,
resample: PILImageResampling = PILImageResampling.NEAREST,
):
"""
Resizes an annotation to a target size.
Args:
annotation (`Dict[str, Any]`):
The annotation dictionary.
orig_size (`Tuple[int, int]`):
The original size of the input image.
target_size (`Tuple[int, int]`):
The target size of the image, as returned by the preprocessing `resize` step.
threshold (`float`, *optional*, defaults to 0.5):
The threshold used to binarize the segmentation masks.
resample (`PILImageResampling`, defaults to `PILImageResampling.NEAREST`):
The resampling filter to use when resizing the masks.
"""
ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(target_size, orig_size))
ratio_height, ratio_width = ratios
new_annotation = {}
new_annotation["size"] = target_size
for key, value in annotation.items():
if key == "boxes":
boxes = value
scaled_boxes = boxes * np.asarray([ratio_width, ratio_height, ratio_width, ratio_height], dtype=np.float32)
new_annotation["boxes"] = scaled_boxes
elif key == "area":
area = value
scaled_area = area * (ratio_width * ratio_height)
new_annotation["area"] = scaled_area
elif key == "masks":
masks = value[:, None]
masks = np.array([resize(mask, target_size, resample=resample) for mask in masks])
masks = masks.astype(np.float32)
masks = masks[:, 0] > threshold
new_annotation["masks"] = masks
elif key == "size":
new_annotation["size"] = target_size
else:
new_annotation[key] = value
return new_annotation
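A worked example of `resize_annotation` when the image is upscaled by a factor of 2 in both dimensions: boxes scale with the per-axis ratios and the area scales with their product.

```python
import numpy as np
from transformers.models.detr.image_processing_detr import resize_annotation

annotation = {"boxes": np.array([[10.0, 20.0, 30.0, 40.0]]), "area": np.array([600.0]), "size": (100, 200)}
resized = resize_annotation(annotation, orig_size=(100, 200), target_size=(200, 400))
print(resized["boxes"])  # [[20. 40. 60. 80.]]
print(resized["area"])   # [2400.]
print(resized["size"])   # (200, 400)
```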
def binary_mask_to_rle(mask):
"""
Converts given binary mask of shape `(height, width)` to the run-length encoding (RLE) format.
Args:
mask (`torch.Tensor` or `numpy.array`):
A binary mask tensor of shape `(height, width)` where 0 denotes background and 1 denotes the target
segment_id or class_id.
Returns:
`List`: Run-length encoded list of the binary mask. Refer to COCO API for more information about the RLE
format.
"""
if is_torch_tensor(mask):
mask = mask.numpy()
pixels = mask.flatten()
pixels = np.concatenate([[0], pixels, [0]])
runs = np.where(pixels[1:] != pixels[:-1])[0] + 1
runs[1::2] -= runs[::2]
return list(runs)
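A worked example of the RLE above: runs of ones are stored as alternating (start, length) pairs over the flattened mask, with 1-based start offsets.

```python
import numpy as np
from transformers.models.detr.image_processing_detr import binary_mask_to_rle

mask = np.array([[0, 1, 1], [0, 1, 0]])
print([int(r) for r in binary_mask_to_rle(mask)])  # [2, 2, 5, 1]: two 1s starting at pixel 2, one 1 at pixel 5
```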
def convert_segmentation_to_rle(segmentation):
"""
Converts given segmentation map of shape `(height, width)` to the run-length encoding (RLE) format.
"""
# Get the unique segment ids present in the segmentation map, i.e. the different classes or segments
segment_ids = torch.unique(segmentation)
# Collect the run-length encoding of each class or segment
run_length_encodings = []
# Iterate over every unique segment id
for idx in segment_ids:
# Build a binary mask where pixels belonging to the current segment id are 1 and all others are 0
mask = torch.where(segmentation == idx, 1, 0)
# Convert the binary mask to its run-length encoding (RLE)
rle = binary_mask_to_rle(mask)
# Append the RLE of the current class or segment to the result list
run_length_encodings.append(rle)
# Return the run-length encodings of all classes or segments
return run_length_encodings
# Remove low-scoring and no-object predictions, keeping only the qualifying `masks`, `scores` and `labels`
def remove_low_and_no_objects(masks, scores, labels, object_mask_threshold, num_labels):
"""
Binarize the given masks using `object_mask_threshold`, it returns the associated values of `masks`, `scores` and
`labels`.
Args:
masks (`torch.Tensor`):
A tensor of shape `(num_queries, height, width)`.
scores (`torch.Tensor`):
A tensor of shape `(num_queries)`.
labels (`torch.Tensor`):
A tensor of shape `(num_queries)`.
object_mask_threshold (`float`):
A number between 0 and 1 used to binarize the masks.
Raises:
`ValueError`: Raised when the first dimension doesn't match in all input tensors.
Returns:
`Tuple[`torch.Tensor`, `torch.Tensor`, `torch.Tensor`]`: The `masks`, `scores` and `labels` without the region
< `object_mask_threshold`.
"""
# Check that the first dimension matches across all input tensors; raise a ValueError otherwise
if not (masks.shape[0] == scores.shape[0] == labels.shape[0]):
raise ValueError("mask, scores and labels must have the same shape!")
# Build a boolean index selecting the predictions to keep
to_keep = labels.ne(num_labels) & (scores > object_mask_threshold)
# Return the filtered `masks`, `scores` and `labels`
return masks[to_keep], scores[to_keep], labels[to_keep]
# Check the validity of a segment mask; return whether a valid mask exists together with the mask for class k
def check_segment_validity(mask_labels, mask_probs, k, mask_threshold=0.5, overlap_mask_area_threshold=0.8):
# Get the mask associated with class k
mask_k = mask_labels == k
# Compute the area of the mask for class k
mask_k_area = mask_k.sum()
# Compute the original predicted area for class k
original_area = (mask_probs[k] >= mask_threshold).sum()
# A mask exists only if both areas are larger than 0
mask_exists = mask_k_area > 0 and original_area > 0
# If the mask exists, additionally check that the area ratio exceeds the given threshold
if mask_exists:
area_ratio = mask_k_area / original_area
if not area_ratio.item() > overlap_mask_area_threshold:
mask_exists = False
# Return whether the mask exists along with the mask for class k
return mask_exists, mask_k
# Compute the segments of a segmentation map and return the final segmentation together with per-segment metadata
def compute_segments(
mask_probs,
pred_scores,
pred_labels,
mask_threshold: float = 0.5,
overlap_mask_area_threshold: float = 0.8,
label_ids_to_fuse: Optional[Set[int]] = None,
target_size: Tuple[int, int] = None,
):
# Derive the output height and width from `target_size` if provided, otherwise from the mask shape
height = mask_probs.shape[1] if target_size is None else target_size[0]
width = mask_probs.shape[2] if target_size is None else target_size[1]
# Allocate an empty integer tensor to hold the final segmentation map
segmentation = torch.zeros((height, width), dtype=torch.int32, device=mask_probs.device)
# List of dictionaries holding the metadata of each segment
segments: List[Dict] = []
# If a `target_size` is given, bilinearly interpolate the mask probabilities to that size
if target_size is not None:
mask_probs = nn.functional.interpolate(
mask_probs.unsqueeze(0), size=target_size, mode="bilinear", align_corners=False
)[0]
# Id of the current segment
current_segment_id = 0
# Weight each mask by its predicted score
mask_probs *= pred_scores.view(-1, 1, 1)
# Predicted class label for each pixel location
mask_labels = mask_probs.argmax(0) # [height, width]
# Keep track of the segment id assigned to each "stuff" class
stuff_memory_list: Dict[str, int] = {}
# Loop over every predicted query
for k in range(pred_labels.shape[0]):
# Predicted class of the current query
pred_class = pred_labels[k].item()
# Whether this class should be fused with other instances of the same class
should_fuse = pred_class in label_ids_to_fuse
# Check whether the mask of the current query exists and is large enough to form a segment
mask_exists, mask_k = check_segment_validity(
mask_labels, mask_probs, k, mask_threshold, overlap_mask_area_threshold
)
# If a valid segmentation mask exists
if mask_exists:
# Reuse the existing segment id if this "stuff" class was already seen
if pred_class in stuff_memory_list:
current_segment_id = stuff_memory_list[pred_class]
else:
# Otherwise start a new segment id and assign it to the current class
current_segment_id += 1
# Add the current object segment to the final segmentation map
segmentation[mask_k] = current_segment_id
# Predicted score of the current query, rounded to 6 decimals
segment_score = round(pred_scores[k].item(), 6)
# Append the metadata of the current segment to the segments list
segments.append(
{
"id": current_segment_id,
"label_id": pred_class,
"was_fused": should_fuse,
"score": segment_score,
}
)
# If fusing, remember the segment id assigned to this class
if should_fuse:
stuff_memory_list[pred_class] = current_segment_id
# Return the final segmentation map and the list of segments
return segmentation, segments
class DetrImageProcessor(BaseImageProcessor):
r"""
Constructs a Detr image processor.
Args:
format (`str`, *optional*, defaults to `"coco_detection"`):
Data format of the annotations. One of "coco_detection" or "coco_panoptic".
do_resize (`bool`, *optional*, defaults to `True`):
Controls whether to resize the image's `(height, width)` dimensions to the specified `size`. Can be
overridden by the `do_resize` parameter in the `preprocess` method.
size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 800, "longest_edge": 1333}`):
Size of the image's `(height, width)` dimensions after resizing. Can be overridden by the `size` parameter
in the `preprocess` method.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
Resampling filter to use if resizing the image.
do_rescale (`bool`, *optional*, defaults to `True`):
Controls whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the
`do_rescale` parameter in the `preprocess` method.
rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the
`preprocess` method.
do_normalize (`bool`, *optional*, defaults to True):
Controls whether to normalize the image. Can be overridden by the `do_normalize` parameter in the
`preprocess` method.
image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_MEAN`):
Mean values to use when normalizing the image. Can be a single value or a list of values, one for each
channel. Can be overridden by the `image_mean` parameter in the `preprocess` method.
image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_STD`):
Standard deviation values to use when normalizing the image. Can be a single value or a list of values, one
for each channel. Can be overridden by the `image_std` parameter in the `preprocess` method.
do_convert_annotations (`bool`, *optional*, defaults to `True`):
Controls whether to convert the annotations to the format expected by the DETR model. Converts the
bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
do_pad (`bool`, *optional*, defaults to `True`):
Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess`
method. If `True` will pad the images in the batch to the largest height and width in the batch.
Padding will be applied to the bottom and right of the image with zeros.
"""
model_input_names = ["pixel_values", "pixel_mask"]
def __init__(
self,
format: Union[str, AnnotationFormat] = AnnotationFormat.COCO_DETECTION,
do_resize: bool = True,
size: Dict[str, int] = None,
resample: PILImageResampling = PILImageResampling.BILINEAR,
do_rescale: bool = True,
rescale_factor: Union[int, float] = 1 / 255,
do_normalize: bool = True,
image_mean: Union[float, List[float]] = None,
image_std: Union[float, List[float]] = None,
do_convert_annotations: Optional[bool] = None,
do_pad: bool = True,
**kwargs,
) -> None:
# If "pad_and_return_pixel_mask" is passed in `kwargs`, use it to set `do_pad` and remove it from the kwargs
if "pad_and_return_pixel_mask" in kwargs:
do_pad = kwargs.pop("pad_and_return_pixel_mask")
# If "max_size" is passed in `kwargs`, warn once and pop it; `size['longest_edge']` should be used instead
if "max_size" in kwargs:
logger.warning_once(
"The `max_size` parameter is deprecated and will be removed in v4.26. "
"Please specify in `size['longest_edge']` instead.",
)
max_size = kwargs.pop("max_size")
else:
max_size = None if size is None else 1333
# If `size` is `None`, fall back to the default size dict with "shortest_edge" and "longest_edge"
size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333}
# Resolve the size dict from the given `size` and `max_size`, without defaulting to a square
size = get_size_dict(size, max_size=max_size, default_to_square=False)
# Backwards compatibility: if `do_convert_annotations` is `None`, default it to the value of `do_normalize`
if do_convert_annotations is None:
do_convert_annotations = do_normalize
# Call the parent initializer with the remaining `kwargs`
super().__init__(**kwargs)
# Set the various attributes that control the image processing pipeline
self.format = format
self.do_resize = do_resize
self.size = size
self.resample = resample
self.do_rescale = do_rescale
self.rescale_factor = rescale_factor
self.do_normalize = do_normalize
self.do_convert_annotations = do_convert_annotations
self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
self.do_pad = do_pad
# List of valid keys used to validate the arguments passed to the processor
self._valid_processor_keys = [
"images",
"annotations",
"return_segmentation_masks",
"masks_path",
"do_resize",
"size",
"resample",
"do_rescale",
"rescale_factor",
"do_normalize",
"do_convert_annotations",
"image_mean",
"image_std",
"do_pad",
"format",
"return_tensors",
"data_format",
"input_data_format",
]
@classmethod
def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs):
"""
Overrides the base class `from_dict` so that parameters can still be updated when the image processor is
created from a dict with kwargs, e.g. `DetrImageProcessor.from_pretrained(checkpoint, size=600, max_size=800)`.
"""
# Copy the input dict so the original is not modified
image_processor_dict = image_processor_dict.copy()
# If `kwargs` contains "max_size", move it into `image_processor_dict` and remove it from `kwargs`
if "max_size" in kwargs:
image_processor_dict["max_size"] = kwargs.pop("max_size")
# If `kwargs` contains "pad_and_return_pixel_mask", move it into `image_processor_dict` and remove it from `kwargs`
if "pad_and_return_pixel_mask" in kwargs:
image_processor_dict["pad_and_return_pixel_mask"] = kwargs.pop("pad_and_return_pixel_mask")
# Delegate to the parent `from_dict` with the updated dict and remaining kwargs
return super().from_dict(image_processor_dict, **kwargs)
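A short sketch of instantiating the processor with the defaults documented in the class docstring:

```python
from transformers import DetrImageProcessor

processor = DetrImageProcessor()   # shortest_edge=800, longest_edge=1333, ImageNet mean/std
print(processor.size)              # {'shortest_edge': 800, 'longest_edge': 1333}
print(processor.do_resize, processor.do_rescale, processor.do_normalize, processor.do_pad)
```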
def prepare_annotation(
self,
image: np.ndarray,
target: Dict,
format: Optional[AnnotationFormat] = None,
return_segmentation_masks: bool = None,
masks_path: Optional[Union[str, pathlib.Path]] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> Dict:
"""
Prepare an annotation for feeding into the DETR model.
"""
# If no format is given, fall back to `self.format`
format = format if format is not None else self.format
# Dispatch to the preparation function matching the annotation format
if format == AnnotationFormat.COCO_DETECTION:
# For COCO detection, `return_segmentation_masks` defaults to False
return_segmentation_masks = False if return_segmentation_masks is None else return_segmentation_masks
# Prepare the annotation in COCO detection format
target = prepare_coco_detection_annotation(
image, target, return_segmentation_masks, input_data_format=input_data_format
)
elif format == AnnotationFormat.COCO_PANOPTIC:
# For COCO panoptic, `return_segmentation_masks` defaults to True
return_segmentation_masks = True if return_segmentation_masks is None else return_segmentation_masks
# Prepare the annotation in COCO panoptic format
target = prepare_coco_panoptic_annotation(
image,
target,
masks_path=masks_path,
return_masks=return_segmentation_masks,
input_data_format=input_data_format,
)
else:
# Raise if the requested annotation format is not supported
raise ValueError(f"Format {format} is not supported.")
# Return the prepared annotation
return target
def prepare(self, image, target, return_segmentation_masks=None, masks_path=None):
"""
Prepare an image and a target annotation by delegating to `prepare_annotation`.
"""
# Warn that `prepare` is deprecated in favor of `prepare_annotation`
logger.warning_once(
"The `prepare` method is deprecated and will be removed in a v4.33. "
"Please use `prepare_annotation` instead. Note: the `prepare_annotation` method "
"does not return the image anymore.",
)
# Prepare the target annotation
target = self.prepare_annotation(image, target, return_segmentation_masks, masks_path, self.format)
# Return the (unchanged) image together with the prepared annotation
return image, target
def convert_coco_poly_to_mask(self, *args, **kwargs):
"""
Deprecated wrapper around the module-level `convert_coco_poly_to_mask`; it will be removed in v4.33.
"""
logger.warning_once("The `convert_coco_poly_to_mask` method is deprecated and will be removed in v4.33. ")
# Delegate to the module-level `convert_coco_poly_to_mask` function
return convert_coco_poly_to_mask(*args, **kwargs)
# Deprecated: will be removed in v4.33
def prepare_coco_detection(self, *args, **kwargs):
logger.warning_once("The `prepare_coco_detection` method is deprecated and will be removed in v4.33. ")
# Delegate to `prepare_coco_detection_annotation`
return prepare_coco_detection_annotation(*args, **kwargs)
# Deprecated: will be removed in v4.33
def prepare_coco_panoptic(self, *args, **kwargs):
logger.warning_once("The `prepare_coco_panoptic` method is deprecated and will be removed in v4.33. ")
# Delegate to `prepare_coco_panoptic_annotation`
return prepare_coco_panoptic_annotation(*args, **kwargs)
# Resize an image according to the given size and resampling parameters
def resize(
self,
image: np.ndarray,
size: Dict[str, int],
resample: PILImageResampling = PILImageResampling.BILINEAR,
data_format: Optional[ChannelDimension] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
) -> np.ndarray:
"""
Resize the image to the given size. Size can be `min_size` (scalar) or `(height, width)` tuple. If size is an
int, smaller edge of the image will be matched to this number.
Args:
image (`np.ndarray`):
Image to resize.
size (`Dict[str, int]`):
Dictionary containing the size to resize to. Can contain the keys `shortest_edge` and `longest_edge` or
`height` and `width`.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
Resampling filter to use if resizing the image.
data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format for the output image. If unset, the channel dimension format of the input
image is used.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred.
"""
# If `max_size` is passed, warn once and pop it from the kwargs
if "max_size" in kwargs:
logger.warning_once(
"The `max_size` parameter is deprecated and will be removed in v4.26. "
"Please specify in `size['longest_edge']` instead.",
)
max_size = kwargs.pop("max_size")
else:
max_size = None
# Resolve the size dict, taking `max_size` into account and not defaulting to a square
size = get_size_dict(size, max_size=max_size, default_to_square=False)
# Compute the output size from the provided size specification
if "shortest_edge" in size and "longest_edge" in size:
size = get_resize_output_image_size(
image, size["shortest_edge"], size["longest_edge"], input_data_format=input_data_format
)
elif "height" in size and "width" in size:
size = (size["height"], size["width"])
else:
# Raise if the size dict does not contain a valid combination of keys
raise ValueError(
"Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got"
f" {size.keys()}."
)
# Resize the image with the resolved size and return it
image = resize(
image, size=size, resample=resample, data_format=data_format, input_data_format=input_data_format, **kwargs
)
return image
# Delegate to the module-level `resize_annotation` to resize an annotation to match the resized image
def resize_annotation(
self,
annotation,
orig_size,
size,
resample: PILImageResampling = PILImageResampling.NEAREST,
) -> Dict:
"""
Resize the annotation to match the resized image. If size is an int, smaller edge of the mask will be matched
to this number.
"""
return resize_annotation(annotation, orig_size=orig_size, target_size=size, resample=resample)
# TODO (Amy) - update to use `rescale_factor` instead of `scale`
# Rescale the image by the given `rescale_factor`: image = image * rescale_factor
def rescale(
self,
image: np.ndarray,
rescale_factor: float,
data_format: Optional[Union[str, ChannelDimension]] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> np.ndarray:
"""
Rescale the image by the given factor. image = image * rescale_factor.
Args:
image (`np.ndarray`):
Image to rescale.
rescale_factor (`float`):
The value to use for rescaling.
data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format for the output image. If unset, the channel dimension format of the input
image is used. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
input_data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format for the input image. If unset, is inferred from the input image. Can be
one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
"""
return rescale(image, rescale_factor, data_format=data_format, input_data_format=input_data_format)
# Normalize the boxes in an annotation from `[top_left_x, top_left_y, bottom_right_x, bottom_right_y]` to
# `[center_x, center_y, width, height]` format and from absolute to relative pixel values, given the image size
def normalize_annotation(self, annotation: Dict, image_size: Tuple[int, int]) -> Dict:
"""
Normalize the boxes in the annotation from `[top_left_x, top_left_y, bottom_right_x, bottom_right_y]` to
`[center_x, center_y, width, height]` format and from absolute to relative pixel values.
"""
return normalize_annotation(annotation, image_size=image_size)
# Update an annotation to match a padded image, based on the input/output image sizes, the padding and whether the boxes should be updated
def _update_annotation_for_padded_image(
self,
annotation: Dict,
input_image_size: Tuple[int, int],
output_image_size: Tuple[int, int],
padding,
update_bboxes,
) -> Dict:
"""
Update the annotation for a padded image.
"""
# New dict holding the updated annotation
new_annotation = {}
# Record the size of the padded output image
new_annotation["size"] = output_image_size
# Iterate over every key/value pair in the original annotation
for key, value in annotation.items():
# Masks are padded to the output size
if key == "masks":
# Get the masks
masks = value
# Pad the masks with zeros using constant padding
masks = pad(
masks,
padding,
mode=PaddingMode.CONSTANT,
constant_values=0,
input_data_format=ChannelDimension.FIRST,
)
# Safely squeeze away axis 1 of the padded masks
masks = safe_squeeze(masks, 1)
# Store the padded masks
new_annotation["masks"] = masks
# Boxes are rescaled if requested
elif key == "boxes" and update_bboxes:
# Get the boxes
boxes = value
# Rescale the normalized box coordinates by the ratio of input to output image size
boxes *= np.asarray(
[
input_image_size[1] / output_image_size[1],
input_image_size[0] / output_image_size[0],
input_image_size[1] / output_image_size[1],
input_image_size[0] / output_image_size[0],
]
)
# Store the rescaled boxes
new_annotation["boxes"] = boxes
# The size key holds the (already recorded) output image size
elif key == "size":
new_annotation["size"] = output_image_size
else:
# Copy any other key/value pair unchanged
new_annotation[key] = value
# Return the updated annotation
return new_annotation
def _pad_image(
self,
image: np.ndarray,
output_size: Tuple[int, int],
annotation: Optional[Dict[str, Any]] = None,
constant_values: Union[float, Iterable[float]] = 0,
data_format: Optional[ChannelDimension] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
update_bboxes: bool = True,
) -> np.ndarray:
"""
Pad an image with zeros to the given size.
"""
# Height and width of the input image
input_height, input_width = get_image_size(image, channel_dim=input_data_format)
# Height and width of the padded output image
output_height, output_width = output_size
# Number of pixels to pad at the bottom and on the right
pad_bottom = output_height - input_height
pad_right = output_width - input_width
# Padding configuration: ((top, bottom), (left, right))
padding = ((0, pad_bottom), (0, pad_right))
# Pad the input image with the given constant value
padded_image = pad(
image,
padding,
mode=PaddingMode.CONSTANT,
constant_values=constant_values,
data_format=data_format,
input_data_format=input_data_format,
)
# If an annotation is provided, update it to match the padded image
if annotation is not None:
annotation = self._update_annotation_for_padded_image(
annotation, (input_height, input_width), (output_height, output_width), padding, update_bboxes
)
# Return the padded image and, if provided, the updated annotation
return padded_image, annotation
# Pad a batch of images (and optionally their annotations) to the largest height and width in the batch
def pad(
self,
images: List[np.ndarray],  # list of images, as NumPy arrays, to pad
annotations: Optional[Union[AnnotationType, List[AnnotationType]]] = None,  # optional annotation or list of annotations
constant_values: Union[float, Iterable[float]] = 0,  # constant value(s) used for padding, defaults to 0
return_pixel_mask: bool = True,  # whether to return a pixel mask, defaults to True
return_tensors: Optional[Union[str, TensorType]] = None,  # type of tensors to return, if any
data_format: Optional[ChannelDimension] = None,  # channel dimension format of the output images
input_data_format: Optional[Union[str, ChannelDimension]] = None,  # channel dimension format of the input images
update_bboxes: bool = True,  # whether to update the bounding boxes to match the padded images, defaults to True
# Preprocess a batch of images and, optionally, their annotations
def preprocess(
self,
images: ImageInput,  # the input image(s) to preprocess
annotations: Optional[Union[AnnotationType, List[AnnotationType]]] = None,  # optional annotation or list of annotations
return_segmentation_masks: bool = None,  # whether to return segmentation masks
masks_path: Optional[Union[str, pathlib.Path]] = None,  # path to the directory containing the panoptic masks
do_resize: Optional[bool] = None,  # whether to resize the images
size: Optional[Dict[str, int]] = None,  # size dictionary used for resizing
resample=None,  # PIL resampling filter used when resizing
do_rescale: Optional[bool] = None,  # whether to rescale pixel values
rescale_factor: Optional[Union[int, float]] = None,  # factor used for rescaling
do_normalize: Optional[bool] = None,  # whether to normalize the images
do_convert_annotations: Optional[bool] = None,  # whether to convert the annotations to the DETR format
image_mean: Optional[Union[float, List[float]]] = None,  # mean used for normalization, single value or per channel
image_std: Optional[Union[float, List[float]]] = None,  # standard deviation used for normalization, single value or per channel
do_pad: Optional[bool] = None,  # whether to pad the images
format: Optional[Union[str, AnnotationFormat]] = None,  # annotation format, as a string or AnnotationFormat
return_tensors: Optional[Union[TensorType, str]] = None,  # type of tensors to return
data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,  # channel dimension format of the output, channels-first by default
input_data_format: Optional[Union[str, ChannelDimension]] = None,  # channel dimension format of the input images
**kwargs,  # remaining keyword arguments
# Post-processing methods - TODO: add support for other frameworks
# Inspired by https://github.com/facebookresearch/detr/blob/master/models/detr.py#L258
def post_process(self, outputs, target_sizes):
"""
Converts the raw output of [`DetrForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y,
bottom_right_x, bottom_right_y) format. Only supports PyTorch.
Args:
outputs ([`DetrObjectDetectionOutput`]):
Raw outputs of the model.
target_sizes (`torch.Tensor` of shape `(batch_size, 2)`):
Tensor containing the size (height, width) of each image of the batch. For evaluation, this must be the
original image size (before any data augmentation). For visualization, this should be the image size
after data augment, but before padding.
Returns:
`List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
in the batch as predicted by the model.
"""
# Warn that `post_process` is deprecated and will be removed
logger.warning_once(
"`post_process` is deprecated and will be removed in v5 of Transformers, please use"
" `post_process_object_detection` instead, with `threshold=0.` for equivalent results.",
)
# Grab the classification logits and the predicted boxes
out_logits, out_bbox = outputs.logits, outputs.pred_boxes
# The number of target sizes must match the batch dimension of the logits
if len(out_logits) != len(target_sizes):
raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits")
# Each target size must be a (height, width) pair
if target_sizes.shape[1] != 2:
raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch")
# Softmax over the classes, then keep the best non-"no object" class and its score
prob = nn.functional.softmax(out_logits, -1)
scores, labels = prob[..., :-1].max(-1)
# Convert boxes from (center_x, center_y, w, h) to (x0, y0, x1, y1)
boxes = center_to_corners_format(out_bbox)
# Rescale relative [0, 1] coordinates to absolute pixel coordinates
img_h, img_w = target_sizes.unbind(1)
scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device)
boxes = boxes * scale_fct[:, None, :]
# One dict of scores, labels and boxes per image
results = [{"scores": s, "labels": l, "boxes": b} for s, l, b in zip(scores, labels, boxes)]
return results
# Converts the output of the segmentation model into image segmentation predictions. Only supports PyTorch.
def post_process_segmentation(self, outputs, target_sizes, threshold=0.9, mask_threshold=0.5):
"""
Converts the output of [`DetrForSegmentation`] into image segmentation predictions. Only supports PyTorch.
Args:
outputs ([`DetrSegmentationOutput`]):
Raw outputs of the model.
target_sizes (`torch.Tensor` of shape `(batch_size, 2)` or `List[Tuple]` of length `batch_size`):
Torch Tensor (or list) corresponding to the requested final size (h, w) of each prediction.
threshold (`float`, *optional*, defaults to 0.9):
Threshold to use to filter out queries.
mask_threshold (`float`, *optional*, defaults to 0.5):
Threshold to use when turning the predicted masks into binary values.
Returns:
`List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels, and masks for an image
in the batch as predicted by the model.
"""
# Warn that this method is deprecated in favour of `post_process_semantic_segmentation`
logger.warning_once(
"`post_process_segmentation` is deprecated and will be removed in v5 of Transformers, please use"
" `post_process_semantic_segmentation`.",
)
# Class logits and raw predicted masks
out_logits, raw_masks = outputs.logits, outputs.pred_masks
# The last class index is the "no object" (empty) label
empty_label = out_logits.shape[-1] - 1
# Collected per-image predictions
preds = []
# Make sure the target size is a plain tuple
def to_tuple(tup):
if isinstance(tup, tuple):
return tup
return tuple(tup.cpu().tolist())
# Iterate over the logits, masks and target size of each image
for cur_logits, cur_masks, size in zip(out_logits, raw_masks, target_sizes):
# Softmax over classes, keep the best score and label per query
cur_scores, cur_labels = cur_logits.softmax(-1).max(-1)
# Drop empty queries and detections below the score threshold
keep = cur_labels.ne(empty_label) & (cur_scores > threshold)
cur_scores = cur_scores[keep]
cur_labels = cur_labels[keep]
cur_masks = cur_masks[keep]
# Resize the masks to the target size with bilinear interpolation
cur_masks = nn.functional.interpolate(cur_masks[:, None], to_tuple(size), mode="bilinear").squeeze(1)
# Binarize the masks with `mask_threshold`
cur_masks = (cur_masks.sigmoid() > mask_threshold) * 1
# Store the scores, labels and masks of this image
predictions = {"scores": cur_scores, "labels": cur_labels, "masks": cur_masks}
preds.append(predictions)
# Return the predictions of all images
return preds
# Adapted from https://github.com/facebookresearch/detr/blob/master/models/segmentation.py#L218
# Converts the model output into actual instance segmentation predictions. Only supports PyTorch.
def post_process_instance(self, results, outputs, orig_target_sizes, max_target_sizes, threshold=0.5):
"""
Converts the output of [`DetrForSegmentation`] into actual instance segmentation predictions. Only supports
PyTorch.
Args:
results (`List[Dict]`):
Results list obtained by [`~DetrImageProcessor.post_process`], to which "masks" results will be added.
outputs ([`DetrSegmentationOutput`]):
Raw outputs of the model.
orig_target_sizes (`torch.Tensor` of shape `(batch_size, 2)`):
Tensor containing the size (h, w) of each image of the batch. For evaluation, this must be the original
image size (before any data augmentation).
max_target_sizes (`torch.Tensor` of shape `(batch_size, 2)`):
Tensor containing the maximum size (h, w) of each image of the batch. For evaluation, this must be the
original image size (before any data augmentation).
threshold (`float`, *optional*, defaults to 0.5):
Threshold to use when turning the predicted masks into binary values.
Returns:
`List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels, boxes and masks for an
image in the batch as predicted by the model.
"""
# Warn that this method is deprecated in favour of `post_process_instance_segmentation`
logger.warning_once(
"`post_process_instance` is deprecated and will be removed in v5 of Transformers, please use"
" `post_process_instance_segmentation`.",
)
# There must be one max_target_size per orig_target_size
if len(orig_target_sizes) != len(max_target_sizes):
raise ValueError("Make sure to pass in as many orig_target_sizes as max_target_sizes")
# Largest height and width over the batch
max_h, max_w = max_target_sizes.max(0)[0].tolist()
# Squeeze the predicted masks and resize them to (max_h, max_w)
outputs_masks = outputs.pred_masks.squeeze(2)
outputs_masks = nn.functional.interpolate(
outputs_masks, size=(max_h, max_w), mode="bilinear", align_corners=False
)
# Binarize the masks with the threshold and move them to CPU
outputs_masks = (outputs_masks.sigmoid() > threshold).cpu()
# Crop each mask back to its image size and resize it to the original image size
for i, (cur_mask, t, tt) in enumerate(zip(outputs_masks, max_target_sizes, orig_target_sizes)):
img_h, img_w = t[0], t[1]
results[i]["masks"] = cur_mask[:, :img_h, :img_w].unsqueeze(1)
results[i]["masks"] = nn.functional.interpolate(
results[i]["masks"].float(), size=tuple(tt.tolist()), mode="nearest"
).byte()
# Return the updated results
return results
# Inspired by https://github.com/facebookresearch/detr/blob/master/models/segmentation.py#L241
# Inspired by https://github.com/facebookresearch/detr/blob/master/models/detr.py#L258
def post_process_object_detection(
self, outputs, threshold: float = 0.5, target_sizes: Union[TensorType, List[Tuple]] = None
):
"""
Converts the raw output of [`DetrForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y,
bottom_right_x, bottom_right_y) format. Only supports PyTorch.
Args:
outputs ([`DetrObjectDetectionOutput`]):
Raw outputs of the model.
threshold (`float`, *optional*):
Score threshold to keep object detection predictions.
target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*):
Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size
`(height, width)` of each image in the batch. If unset, predictions will not be resized.
Returns:
`List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
in the batch as predicted by the model.
"""
# Extract logits and bounding boxes from model outputs
out_logits, out_bbox = outputs.logits, outputs.pred_boxes
# Check if target_sizes is provided and validate its dimension
if target_sizes is not None:
if len(out_logits) != len(target_sizes):
raise ValueError(
"Make sure that you pass in as many target sizes as the batch dimension of the logits"
)
# Compute softmax probabilities and extract scores and labels
prob = nn.functional.softmax(out_logits, -1)
scores, labels = prob[..., :-1].max(-1)
# Convert bounding boxes from center format to [x0, y0, x1, y1]
boxes = center_to_corners_format(out_bbox)
# If target_sizes is provided, convert relative coordinates to absolute coordinates
if target_sizes is not None:
if isinstance(target_sizes, list):
img_h = torch.Tensor([i[0] for i in target_sizes])
img_w = torch.Tensor([i[1] for i in target_sizes])
else:
img_h, img_w = target_sizes.unbind(1)
# Compute scaling factors and apply to bounding boxes
scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device)
boxes = boxes * scale_fct[:, None, :]
# Filter predictions based on score threshold and construct results dictionary
results = []
for s, l, b in zip(scores, labels, boxes):
score = s[s > threshold]
label = l[s > threshold]
box = b[s > threshold]
results.append({"scores": score, "labels": label, "boxes": box})
return results
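# Example (not part of the original file): a typical inference pipeline ending in
# `post_process_object_detection`. The checkpoint and image URL are only illustrative.
import requests
import torch
from PIL import Image
from transformers import DetrForObjectDetection, DetrImageProcessor
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
processor = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50")
model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50")
inputs = processor(images=image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)
# target_sizes holds the original (height, width) of each image
target_sizes = torch.tensor([image.size[::-1]])
results = processor.post_process_object_detection(outputs, threshold=0.9, target_sizes=target_sizes)[0]
for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
    print(model.config.id2label[label.item()], round(score.item(), 3), box.tolist())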
# Converts the model output into semantic segmentation maps. Only supports PyTorch.
def post_process_semantic_segmentation(self, outputs, target_sizes: List[Tuple[int, int]] = None):
"""
Converts the output of [`DetrForSegmentation`] into semantic segmentation maps. Only supports PyTorch.
Args:
outputs ([`DetrForSegmentation`]):
Raw outputs of the model.
target_sizes (`List[Tuple[int, int]]`, *optional*):
A list of tuples (`Tuple[int, int]`) containing the target size (height, width) of each image in the
batch. If unset, predictions will not be resized.
Returns:
`List[torch.Tensor]`:
A list of length `batch_size`, where each item is a semantic segmentation map of shape (height, width)
corresponding to the target_sizes entry (if `target_sizes` is specified). Each entry of each
`torch.Tensor` correspond to a semantic class id.
"""
# Class query logits of shape [batch_size, num_queries, num_classes + 1]
class_queries_logits = outputs.logits
# Mask query logits of shape [batch_size, num_queries, height, width]
masks_queries_logits = outputs.pred_masks
# Softmax over classes and drop the last (null) class
masks_classes = class_queries_logits.softmax(dim=-1)[..., :-1]
# Turn the mask logits into probabilities, shape [batch_size, num_queries, height, width]
masks_probs = masks_queries_logits.sigmoid()
# Combine them into semantic segmentation logits of shape (batch_size, num_classes, height, width)
segmentation = torch.einsum("bqc, bqhw -> bchw", masks_classes, masks_probs)
batch_size = class_queries_logits.shape[0]
# Optionally resize the logits before taking the per-pixel argmax
if target_sizes is not None:
# One target size is required per image in the batch
if batch_size != len(target_sizes):
raise ValueError(
"Make sure that you pass in as many target sizes as the batch dimension of the logits"
)
semantic_segmentation = []
for idx in range(batch_size):
# Bilinearly resize the logits of this image to the requested size
resized_logits = nn.functional.interpolate(
segmentation[idx].unsqueeze(dim=0), size=target_sizes[idx], mode="bilinear", align_corners=False
)
# Per-pixel semantic class id
semantic_map = resized_logits[0].argmax(dim=0)
semantic_segmentation.append(semantic_map)
else:
# Per-pixel semantic class ids, split into a list over the batch
semantic_segmentation = segmentation.argmax(dim=1)
semantic_segmentation = [semantic_segmentation[i] for i in range(semantic_segmentation.shape[0])]
return semantic_segmentation
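# Example (not part of the original file): a shape sketch of the einsum above, which mixes per-query
# class probabilities with per-query mask probabilities into one map per class. Sizes are illustrative.
import torch
batch_size, num_queries, num_classes, height, width = 2, 100, 91, 32, 32
masks_classes = torch.rand(batch_size, num_queries, num_classes)  # softmax over classes, null class dropped
masks_probs = torch.rand(batch_size, num_queries, height, width)  # sigmoid of the mask logits
segmentation = torch.einsum("bqc, bqhw -> bchw", masks_classes, masks_probs)
print(segmentation.shape)  # torch.Size([2, 91, 32, 32])
print(segmentation.argmax(dim=1).shape)  # per-pixel class ids: torch.Size([2, 32, 32])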
# Inspired by https://github.com/facebookresearch/detr/blob/master/models/segmentation.py#L218
def post_process_instance_segmentation(
self,
outputs,
threshold: float = 0.5,
mask_threshold: float = 0.5,
overlap_mask_area_threshold: float = 0.8,
target_sizes: Optional[List[Tuple[int, int]]] = None,
return_coco_annotation: Optional[bool] = False,
# Adapted from https://github.com/facebookresearch/detr/blob/master/models/segmentation.py#L241
# Post-processing for panoptic segmentation
def post_process_panoptic_segmentation(
self,
outputs,
threshold: float = 0.5,
mask_threshold: float = 0.5,
overlap_mask_area_threshold: float = 0.8,
label_ids_to_fuse: Optional[Set[int]] = None,
target_sizes: Optional[List[Tuple[int, int]]] = None,
.\models\detr\modeling_detr.py
""" PyTorch DETR model."""
import math
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple, Union
import torch
from torch import Tensor, nn
from ...activations import ACT2FN
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithCrossAttentions, Seq2SeqModelOutput
from ...modeling_utils import PreTrainedModel
from ...utils import (
ModelOutput,
add_start_docstrings,
add_start_docstrings_to_model_forward,
is_accelerate_available,
is_scipy_available,
is_timm_available,
is_vision_available,
logging,
replace_return_docstrings,
requires_backends,
)
from ...utils.backbone_utils import load_backbone
from .configuration_detr import DetrConfig
if is_accelerate_available():
from accelerate import PartialState
from accelerate.utils import reduce
if is_scipy_available():
from scipy.optimize import linear_sum_assignment
if is_timm_available():
from timm import create_model
if is_vision_available():
from transformers.image_transforms import center_to_corners_format
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "DetrConfig"
_CHECKPOINT_FOR_DOC = "facebook/detr-resnet-50"
DETR_PRETRAINED_MODEL_ARCHIVE_LIST = [
"facebook/detr-resnet-50",
]
@dataclass
class DetrDecoderOutput(BaseModelOutputWithCrossAttentions):
"""
Base class for outputs of the DETR decoder. This class adds one attribute to BaseModelOutputWithCrossAttentions,
namely an optional stack of intermediate decoder activations, i.e. the output of each decoder layer, each of them
gone through a layernorm. This is useful when training the model with auxiliary decoding losses.
"""
intermediate_hidden_states: Optional[torch.FloatTensor] = None
@dataclass
class DetrModelOutput(Seq2SeqModelOutput):
"""
Base class for outputs of the DETR encoder-decoder model. This class adds one attribute to
`Seq2SeqModelOutput`, namely an optional stack of intermediate decoder activations, i.e. the output of each
decoder layer, each of them gone through a layernorm. This is useful when training the model with auxiliary decoding losses.
"""
intermediate_hidden_states: Optional[torch.FloatTensor] = None
@dataclass
class DetrObjectDetectionOutput(ModelOutput):
"""
Output type of [`DetrForObjectDetection`].
"""
loss: Optional[torch.FloatTensor] = None
loss_dict: Optional[Dict] = None
logits: torch.FloatTensor = None
pred_boxes: torch.FloatTensor = None
auxiliary_outputs: Optional[List[Dict]] = None
last_hidden_state: Optional[torch.FloatTensor] = None
decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
encoder_last_hidden_state: Optional[torch.FloatTensor] = None
encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
@dataclass
class DetrSegmentationOutput(ModelOutput):
"""
Output type of [`DetrForSegmentation`].
"""
loss: Optional[torch.FloatTensor] = None
loss_dict: Optional[Dict] = None
logits: torch.FloatTensor = None
pred_boxes: torch.FloatTensor = None
pred_masks: torch.FloatTensor = None
auxiliary_outputs: Optional[List[Dict]] = None
last_hidden_state: Optional[torch.FloatTensor] = None
decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
encoder_last_hidden_state: Optional[torch.FloatTensor] = None
encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
class DetrFrozenBatchNorm2d(nn.Module):
"""
BatchNorm2d where the batch statistics and the affine parameters are fixed.
Copy-paste from torchvision.misc.ops with an added eps before rsqrt, without which any model other than
torchvision.models.resnet[18,34,50,101] produces NaNs.
"""
def __init__(self, n):
super().__init__()
self.register_buffer("weight", torch.ones(n))
self.register_buffer("bias", torch.zeros(n))
self.register_buffer("running_mean", torch.zeros(n))
self.register_buffer("running_var", torch.ones(n))
def _load_from_state_dict(
self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
):
num_batches_tracked_key = prefix + "num_batches_tracked"
if num_batches_tracked_key in state_dict:
del state_dict[num_batches_tracked_key]
super()._load_from_state_dict(
state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
)
def forward(self, x):
weight = self.weight.reshape(1, -1, 1, 1)
bias = self.bias.reshape(1, -1, 1, 1)
running_var = self.running_var.reshape(1, -1, 1, 1)
running_mean = self.running_mean.reshape(1, -1, 1, 1)
epsilon = 1e-5
scale = weight * (running_var + epsilon).rsqrt()
bias = bias - running_mean * scale
return x * scale + bias
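# Example (not part of the original file): a quick sanity check that, with identical statistics and
# affine parameters, the frozen module above matches an eval-mode nn.BatchNorm2d (both use eps = 1e-5).
import torch
from torch import nn
bn = nn.BatchNorm2d(8).eval()
bn.running_mean.uniform_(-1, 1)
bn.running_var.uniform_(0.5, 2.0)
frozen = DetrFrozenBatchNorm2d(8)
frozen.weight.copy_(bn.weight.data)
frozen.bias.copy_(bn.bias.data)
frozen.running_mean.copy_(bn.running_mean)
frozen.running_var.copy_(bn.running_var)
x = torch.randn(2, 8, 4, 4)
print(torch.allclose(frozen(x), bn(x), atol=1e-6))  # True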
def replace_batch_norm(model):
r"""
Recursively replace all `torch.nn.BatchNorm2d` with `DetrFrozenBatchNorm2d`.
Args:
model (torch.nn.Module):
input model
"""
for name, module in model.named_children():
if isinstance(module, nn.BatchNorm2d):
new_module = DetrFrozenBatchNorm2d(module.num_features)
if not module.weight.device == torch.device("meta"):
new_module.weight.data.copy_(module.weight)
new_module.bias.data.copy_(module.bias)
new_module.running_mean.data.copy_(module.running_mean)
new_module.running_var.data.copy_(module.running_var)
model._modules[name] = new_module
if len(list(module.children())) > 0:
replace_batch_norm(module)
class DetrConvEncoder(nn.Module):
"""
Convolutional backbone, using either the AutoBackbone API or one from the timm library.
nn.BatchNorm2d layers are replaced by DetrFrozenBatchNorm2d as defined above.
"""
def __init__(self, config):
super().__init__()
self.config = config
if config.use_timm_backbone:
requires_backends(self, ["timm"])
kwargs = {}
if config.dilation:
kwargs["output_stride"] = 16
backbone = create_model(
config.backbone,
pretrained=config.use_pretrained_backbone,
features_only=True,
out_indices=(1, 2, 3, 4),
in_chans=config.num_channels,
**kwargs,
)
else:
backbone = load_backbone(config)
with torch.no_grad():
replace_batch_norm(backbone)
self.model = backbone
self.intermediate_channel_sizes = (
self.model.feature_info.channels() if config.use_timm_backbone else self.model.channels
)
backbone_model_type = config.backbone if config.use_timm_backbone else config.backbone_config.model_type
if "resnet" in backbone_model_type:
for name, parameter in self.model.named_parameters():
if config.use_timm_backbone:
if "layer2" not in name and "layer3" not in name and "layer4" not in name:
parameter.requires_grad_(False)
else:
if "stage.1" not in name and "stage.2" not in name and "stage.3" not in name:
parameter.requires_grad_(False)
def forward(self, pixel_values: torch.Tensor, pixel_mask: torch.Tensor):
features = self.model(pixel_values) if self.config.use_timm_backbone else self.model(pixel_values).feature_maps
out = []
for feature_map in features:
mask = nn.functional.interpolate(pixel_mask[None].float(), size=feature_map.shape[-2:]).to(torch.bool)[0]
out.append((feature_map, mask))
return out
class DetrConvModel(nn.Module):
"""
This module adds 2D position embeddings to all intermediate feature maps of the convolutional encoder.
"""
def __init__(self, conv_encoder, position_embedding):
super().__init__()
self.conv_encoder = conv_encoder
self.position_embedding = position_embedding
def forward(self, pixel_values, pixel_mask):
out = self.conv_encoder(pixel_values, pixel_mask)
pos = []
for feature_map, mask in out:
pos.append(self.position_embedding(feature_map, mask).to(feature_map.dtype))
return out, pos
class DetrSinePositionEmbedding(nn.Module):
"""
This is a more standard version of the position embedding, very similar to the one used by the Attention is all you
need paper, generalized to work on images.
"""
def __init__(self, embedding_dim=64, temperature=10000, normalize=False, scale=None):
super().__init__()
self.embedding_dim = embedding_dim
self.temperature = temperature
self.normalize = normalize
if scale is not None and normalize is False:
raise ValueError("normalize should be True if scale is passed")
if scale is None:
scale = 2 * math.pi
self.scale = scale
def forward(self, pixel_values, pixel_mask):
if pixel_mask is None:
raise ValueError("No pixel mask provided")
y_embed = pixel_mask.cumsum(1, dtype=torch.float32)
x_embed = pixel_mask.cumsum(2, dtype=torch.float32)
if self.normalize:
y_embed = y_embed / (y_embed[:, -1:, :] + 1e-6) * self.scale
x_embed = x_embed / (x_embed[:, :, -1:] + 1e-6) * self.scale
dim_t = torch.arange(self.embedding_dim, dtype=torch.int64, device=pixel_values.device).float()
dim_t = self.temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / self.embedding_dim)
pos_x = x_embed[:, :, :, None] / dim_t
pos_y = y_embed[:, :, :, None] / dim_t
pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3)
pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3)
pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
return pos
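# Example (not part of the original file): a shape sketch. With d_model = 256, build_position_encoding
# below creates this module with embedding_dim = 128, and the output concatenates 128 y-channels and
# 128 x-channels into a (batch_size, 256, height, width) map. Sizes are illustrative.
import torch
position_embedding = DetrSinePositionEmbedding(embedding_dim=128, normalize=True)
pixel_values = torch.randn(2, 3, 32, 40)
pixel_mask = torch.ones(2, 32, 40, dtype=torch.long)
print(position_embedding(pixel_values, pixel_mask).shape)  # torch.Size([2, 256, 32, 40])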
class DetrLearnedPositionEmbedding(nn.Module):
"""
This module learns positional embeddings up to a fixed maximum size.
"""
def __init__(self, embedding_dim=256):
super().__init__()
self.row_embeddings = nn.Embedding(50, embedding_dim)
self.column_embeddings = nn.Embedding(50, embedding_dim)
def forward(self, pixel_values, pixel_mask=None):
height, width = pixel_values.shape[-2:]
width_values = torch.arange(width, device=pixel_values.device)
height_values = torch.arange(height, device=pixel_values.device)
x_emb = self.column_embeddings(width_values)
y_emb = self.row_embeddings(height_values)
pos = torch.cat([x_emb.unsqueeze(0).repeat(height, 1, 1), y_emb.unsqueeze(1).repeat(1, width, 1)], dim=-1)
pos = pos.permute(2, 0, 1)
pos = pos.unsqueeze(0)
pos = pos.repeat(pixel_values.shape[0], 1, 1, 1)
return pos
def build_position_encoding(config):
n_steps = config.d_model // 2
if config.position_embedding_type == "sine":
position_embedding = DetrSinePositionEmbedding(n_steps, normalize=True)
elif config.position_embedding_type == "learned":
position_embedding = DetrLearnedPositionEmbedding(n_steps)
else:
raise ValueError(f"Not supported {config.position_embedding_type}")
return position_embedding
class DetrAttention(nn.Module):
"""
Multi-headed attention from 'Attention Is All You Need' paper.
Here, we add position embeddings to the queries and keys (as explained in the DETR paper).
"""
def __init__(
self,
embed_dim: int,
num_heads: int,
dropout: float = 0.0,
bias: bool = True,
):
super().__init__()
self.embed_dim = embed_dim
self.num_heads = num_heads
self.dropout = dropout
self.head_dim = embed_dim // num_heads
if self.head_dim * num_heads != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
f" {num_heads})."
)
self.scaling = self.head_dim**-0.5
self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
def _shape(self, tensor: torch.Tensor, seq_len: int, batch_size: int):
return tensor.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
def with_pos_embed(self, tensor: torch.Tensor, object_queries: Optional[Tensor], **kwargs):
position_embeddings = kwargs.pop("position_embeddings", None)
if kwargs:
raise ValueError(f"Unexpected arguments {kwargs.keys()}")
if position_embeddings is not None and object_queries is not None:
raise ValueError(
"Cannot specify both position_embeddings and object_queries. Please use just object_queries"
)
if position_embeddings is not None:
logger.warning_once(
"position_embeddings has been deprecated and will be removed in v4.34. Please use object_queries instead"
)
object_queries = position_embeddings
return tensor if object_queries is None else tensor + object_queries
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
object_queries: Optional[torch.Tensor] = None,
key_value_states: Optional[torch.Tensor] = None,
spatial_position_embeddings: Optional[torch.Tensor] = None,
output_attentions: bool = False,
**kwargs,
):
class DetrEncoderLayer(nn.Module):
def __init__(self, config: DetrConfig):
super().__init__()
self.embed_dim = config.d_model
self.self_attn = DetrAttention(
embed_dim=self.embed_dim,
num_heads=config.encoder_attention_heads,
dropout=config.attention_dropout,
)
self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
self.dropout = config.dropout
self.activation_fn = ACT2FN[config.activation_function]
self.activation_dropout = config.activation_dropout
self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
self.final_layer_norm = nn.LayerNorm(self.embed_dim)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: torch.Tensor,
object_queries: torch.Tensor = None,
output_attentions: bool = False,
**kwargs,
):
"""
Args:
hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
attention_mask (`torch.FloatTensor`): attention mask of size
`(batch, 1, target_len, source_len)` where padding elements are indicated by very large negative
values.
object_queries (`torch.FloatTensor`, *optional*):
Object queries (also called content embeddings), to be added to the hidden states.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
"""
position_embeddings = kwargs.pop("position_embeddings", None)
if kwargs:
raise ValueError(f"Unexpected arguments {kwargs.keys()}")
if position_embeddings is not None and object_queries is not None:
raise ValueError(
"Cannot specify both position_embeddings and object_queries. Please use just object_queries"
)
if position_embeddings is not None:
logger.warning_once(
"position_embeddings has been deprecated and will be removed in v4.34. Please use object_queries instead"
)
object_queries = position_embeddings
residual = hidden_states
hidden_states, attn_weights = self.self_attn(
hidden_states=hidden_states,
attention_mask=attention_mask,
object_queries=object_queries,
output_attentions=output_attentions,
)
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
hidden_states = residual + hidden_states
hidden_states = self.self_attn_layer_norm(hidden_states)
residual = hidden_states
hidden_states = self.activation_fn(self.fc1(hidden_states))
hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
hidden_states = self.fc2(hidden_states)
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
hidden_states = residual + hidden_states
hidden_states = self.final_layer_norm(hidden_states)
if self.training:
if torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any():
clamp_value = torch.finfo(hidden_states.dtype).max - 1000
hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
outputs = (hidden_states,)
if output_attentions:
outputs += (attn_weights,)
return outputs
class DetrDecoderLayer(nn.Module):
def __init__(self, config: DetrConfig):
super().__init__()
self.embed_dim = config.d_model
self.self_attn = DetrAttention(
embed_dim=self.embed_dim,
num_heads=config.decoder_attention_heads,
dropout=config.attention_dropout,
)
self.dropout = config.dropout
self.activation_fn = ACT2FN[config.activation_function]
self.activation_dropout = config.activation_dropout
self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
self.encoder_attn = DetrAttention(
self.embed_dim,
config.decoder_attention_heads,
dropout=config.attention_dropout,
)
self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)
self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)
self.final_layer_norm = nn.LayerNorm(self.embed_dim)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
object_queries: Optional[torch.Tensor] = None,
query_position_embeddings: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = False,
**kwargs,
class DetrClassificationHead(nn.Module):
"""用于句子级分类任务的头部模块。"""
def __init__(self, input_dim: int, inner_dim: int, num_classes: int, pooler_dropout: float):
super().__init__()
self.dense = nn.Linear(input_dim, inner_dim)
self.dropout = nn.Dropout(p=pooler_dropout)
self.out_proj = nn.Linear(inner_dim, num_classes)
def forward(self, hidden_states: torch.Tensor):
hidden_states = self.dropout(hidden_states)
hidden_states = self.dense(hidden_states)
hidden_states = torch.tanh(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.out_proj(hidden_states)
return hidden_states
class DetrPreTrainedModel(PreTrainedModel):
config_class = DetrConfig
base_model_prefix = "model"
main_input_name = "pixel_values"
_no_split_modules = [r"DetrConvEncoder", r"DetrEncoderLayer", r"DetrDecoderLayer"]
def _init_weights(self, module):
std = self.config.init_std
xavier_std = self.config.init_xavier_std
if isinstance(module, DetrMHAttentionMap):
nn.init.zeros_(module.k_linear.bias)
nn.init.zeros_(module.q_linear.bias)
nn.init.xavier_uniform_(module.k_linear.weight, gain=xavier_std)
nn.init.xavier_uniform_(module.q_linear.weight, gain=xavier_std)
elif isinstance(module, DetrLearnedPositionEmbedding):
nn.init.uniform_(module.row_embeddings.weight)
nn.init.uniform_(module.column_embeddings.weight)
if isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)):
module.weight.data.normal_(mean=0.0, std=std)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=std)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
DETR_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`DetrConfig`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
DETR_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Padding will be ignored by default should you provide it.
pixel_mask (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
Mask to avoid performing attention on padding pixel values. Mask values selected in `[0, 1]`:
- 1 for pixels that are real (i.e. **not masked**),
- 0 for pixels that are padding (i.e. **masked**).
[What are attention masks?](../glossary#attention-mask)
decoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*):
Not used by default. Can be used to mask object queries.
encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*):
Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`)
`last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of
hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you
can choose to directly pass a flattened representation of an image.
decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
Optionally, instead of initializing the queries with a tensor of zeros, you can choose to directly pass an
embedded representation.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
class DetrEncoder(DetrPreTrainedModel):
"""
Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
[`DetrEncoderLayer`].
The encoder updates the flattened feature map through multiple self-attention layers.
Small tweak for DETR:
- object_queries are added to the forward pass.
Args:
config: DetrConfig
"""
def __init__(self, config: DetrConfig):
super().__init__(config)
self.dropout = config.dropout  # dropout rate from the config
self.layerdrop = config.encoder_layerdrop  # LayerDrop probability for the encoder layers
self.layers = nn.ModuleList([DetrEncoderLayer(config) for _ in range(config.encoder_layers)])  # stack of encoder layers
# in the original DETR, no layernorm is used at the end of the encoder, as "normalize_before" is set to False by default
# Initialize weights and apply final processing
self.post_init()
def forward(
self,
inputs_embeds=None,
attention_mask=None,
object_queries=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
**kwargs,
class DetrDecoder(DetrPreTrainedModel):
"""
Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`DetrDecoderLayer`].
The decoder updates the query embeddings through multiple self-attention and cross-attention layers.
Some small tweaks for DETR:
- object_queries and query_position_embeddings are added to the forward pass.
- if self.config.auxiliary_loss is set to True, also returns a stack of activations from all decoding layers.
Args:
config: DetrConfig
"""
def __init__(self, config: DetrConfig):
super().__init__(config)
self.dropout = config.dropout  # dropout rate from the config
self.layerdrop = config.decoder_layerdrop  # LayerDrop probability for the decoder layers
self.layers = nn.ModuleList([DetrDecoderLayer(config) for _ in range(config.decoder_layers)])  # stack of decoder layers
# in DETR, the decoder uses layernorm after the last decoder layer output
self.layernorm = nn.LayerNorm(config.d_model)
self.gradient_checkpointing = False  # gradient checkpointing is disabled by default
# Initialize weights and apply final processing
self.post_init()
def forward(
self,
inputs_embeds=None,
attention_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
object_queries=None,
query_position_embeddings=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
**kwargs,
@add_start_docstrings(
"""
The bare DETR Model (consisting of a backbone and encoder-decoder Transformer) outputting raw hidden-states without
any specific head on top.
""",
DETR_START_DOCSTRING,
)
class DetrModel(DetrPreTrainedModel):
"""
DETR模型,包括骨干和编码器-解码器Transformer,输出没有特定头部的原始隐藏状态。
"""
# 初始化函数,接受一个DetrConfig类型的配置对象作为参数
def __init__(self, config: DetrConfig):
# 调用父类的初始化方法,传入配置对象
super().__init__(config)
# 创建backbone和位置编码
backbone = DetrConvEncoder(config)
object_queries = build_position_encoding(config)
# 使用创建的backbone和位置编码创建DetrConvModel对象,并赋给self.backbone属性
self.backbone = DetrConvModel(backbone, object_queries)
# 创建投影层,使用nn.Conv2d进行初始化
self.input_projection = nn.Conv2d(backbone.intermediate_channel_sizes[-1], config.d_model, kernel_size=1)
# 创建查询位置嵌入层,使用nn.Embedding进行初始化
self.query_position_embeddings = nn.Embedding(config.num_queries, config.d_model)
# 创建编码器和解码器对象
self.encoder = DetrEncoder(config)
self.decoder = DetrDecoder(config)
# 初始化权重并进行最终处理
self.post_init()
# 返回编码器对象
def get_encoder(self):
return self.encoder
# 返回解码器对象
def get_decoder(self):
return self.decoder
# 冻结backbone的参数,使其不可训练
def freeze_backbone(self):
# 遍历backbone的模型参数,并设置requires_grad为False
for name, param in self.backbone.conv_encoder.model.named_parameters():
param.requires_grad_(False)
# 解冻backbone的参数,使其可训练
def unfreeze_backbone(self):
# 遍历backbone的模型参数,并设置requires_grad为True
for name, param in self.backbone.conv_encoder.model.named_parameters():
param.requires_grad_(True)
# 前向传播函数,根据DETR的输入文档字符串进行注释
@add_start_docstrings_to_model_forward(DETR_INPUTS_DOCSTRING)
# 替换返回的文档字符串类型为DetrModelOutput,并使用_CONFIG_FOR_DOC作为配置类
@replace_return_docstrings(output_type=DetrModelOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
pixel_values: torch.FloatTensor,
pixel_mask: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.FloatTensor] = None,
encoder_outputs: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
"""
DETR Model (consisting of a backbone and encoder-decoder Transformer) with object detection heads on top, for tasks
such as COCO detection.
"""
# 导入所需模块和函数
@add_start_docstrings(
"""
DETR Model (consisting of a backbone and encoder-decoder Transformer) with a segmentation head on top, for tasks
such as COCO panoptic.
""",
DETR_START_DOCSTRING,
)
# DETR model with a segmentation head on top, e.g. for COCO panoptic segmentation
class DetrForSegmentation(DetrPreTrainedModel):
def __init__(self, config: DetrConfig):
super().__init__(config)
# The object detection model the segmentation head is built on
self.detr = DetrForObjectDetection(config)
# Segmentation head
# Hidden size and number of attention heads from the config
hidden_size, number_of_heads = config.d_model, config.encoder_attention_heads
# Channel sizes of the intermediate backbone feature maps
intermediate_channel_sizes = self.detr.model.backbone.conv_encoder.intermediate_channel_sizes
# Small convolutional mask head
self.mask_head = DetrMaskHeadSmallConv(
hidden_size + number_of_heads, intermediate_channel_sizes[::-1][-3:], hidden_size
)
# Multi-head attention map used to attend from each query to the feature map
self.bbox_attention = DetrMHAttentionMap(
hidden_size, hidden_size, number_of_heads, dropout=0.0, std=config.init_xavier_std
)
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(DETR_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=DetrSegmentationOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
pixel_values: torch.FloatTensor,
pixel_mask: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.FloatTensor] = None,
encoder_outputs: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[List[dict]] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
# `_expand` inserts a new dimension after the batch dimension, repeats the tensor `length` times along it, and flattens the two leading dimensions.
def _expand(tensor, length: int):
return tensor.unsqueeze(1).repeat(1, int(length), 1, 1, 1).flatten(0, 1)
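# Example (not part of the original file): a shape sketch of `_expand`, which repeats the projected
# feature map once per object query so it can be concatenated with the per-query attention maps.
import torch
feature_map = torch.randn(2, 256, 25, 34)  # (batch_size, d_model, height/32, width/32)
print(_expand(feature_map, 100).shape)  # torch.Size([200, 256, 25, 34])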
# Taken from https://github.com/facebookresearch/detr/blob/master/models/segmentation.py
# DetrMaskHeadSmallConv is a simple convolutional mask head with group norm; upsampling follows an FPN-style approach.
class DetrMaskHeadSmallConv(nn.Module):
"""
Simple convolutional head, using group norm. Upsampling is done using a FPN approach
"""
def __init__(self, dim, fpn_dims, context_dim):
super().__init__()
# GroupNorm is configured with 8 groups, so `dim` must be divisible by 8
if dim % 8 != 0:
raise ValueError(
"The hidden_size + number of attention heads must be divisible by 8 as the number of groups in"
" GroupNorm is set to 8"
)
# Intermediate channel sizes: dim, context_dim // 2, context_dim // 4, context_dim // 8, context_dim // 16, context_dim // 64
inter_dims = [dim, context_dim // 2, context_dim // 4, context_dim // 8, context_dim // 16, context_dim // 64]
# Convolution and group-normalization layers
self.lay1 = nn.Conv2d(dim, dim, 3, padding=1)
self.gn1 = nn.GroupNorm(8, dim)
self.lay2 = nn.Conv2d(dim, inter_dims[1], 3, padding=1)
self.gn2 = nn.GroupNorm(min(8, inter_dims[1]), inter_dims[1])
self.lay3 = nn.Conv2d(inter_dims[1], inter_dims[2], 3, padding=1)
self.gn3 = nn.GroupNorm(min(8, inter_dims[2]), inter_dims[2])
self.lay4 = nn.Conv2d(inter_dims[2], inter_dims[3], 3, padding=1)
self.gn4 = nn.GroupNorm(min(8, inter_dims[3]), inter_dims[3])
self.lay5 = nn.Conv2d(inter_dims[3], inter_dims[4], 3, padding=1)
self.gn5 = nn.GroupNorm(min(8, inter_dims[4]), inter_dims[4])
self.out_lay = nn.Conv2d(inter_dims[4], 1, 3, padding=1)
# Keep `dim` as an attribute
self.dim = dim
# Adapter layers projecting the FPN feature maps to the matching number of channels
self.adapter1 = nn.Conv2d(fpn_dims[0], inter_dims[1], 1)
self.adapter2 = nn.Conv2d(fpn_dims[1], inter_dims[2], 1)
self.adapter3 = nn.Conv2d(fpn_dims[2], inter_dims[3], 1)
# Initialize all convolutions with Kaiming-uniform weights and zero biases
for m in self.modules():
if isinstance(m, nn.Conv2d):
nn.init.kaiming_uniform_(m.weight, a=1)
nn.init.constant_(m.bias, 0)
# Forward pass: x is the projected feature map, bbox_mask the per-query attention maps, fpns the FPN feature maps
def forward(self, x: Tensor, bbox_mask: Tensor, fpns: List[Tensor]):
# Concatenate x (projected feature map of shape (batch_size, d_model, height/32, width/32)) with
# bbox_mask (attention maps of shape (batch_size, n_queries, n_heads, height/32, width/32)), expanding x once per query
x = torch.cat([_expand(x, bbox_mask.shape[1]), bbox_mask.flatten(0, 1)], 1)
# First conv + group-norm + ReLU block
x = self.lay1(x)
x = self.gn1(x)
x = nn.functional.relu(x)
# Second conv + group-norm + ReLU block
x = self.lay2(x)
x = self.gn2(x)
x = nn.functional.relu(x)
# Adapt the first FPN feature map, expand it to the batch size of x if needed,
# and add it to the upsampled x
cur_fpn = self.adapter1(fpns[0])
if cur_fpn.size(0) != x.size(0):
cur_fpn = _expand(cur_fpn, x.size(0) // cur_fpn.size(0))
x = cur_fpn + nn.functional.interpolate(x, size=cur_fpn.shape[-2:], mode="nearest")
# Third conv + group-norm + ReLU block
x = self.lay3(x)
x = self.gn3(x)
x = nn.functional.relu(x)
# Same FPN fusion with the second feature map
cur_fpn = self.adapter2(fpns[1])
if cur_fpn.size(0) != x.size(0):
cur_fpn = _expand(cur_fpn, x.size(0) // cur_fpn.size(0))
x = cur_fpn + nn.functional.interpolate(x, size=cur_fpn.shape[-2:], mode="nearest")
# Fourth conv + group-norm + ReLU block
x = self.lay4(x)
x = self.gn4(x)
x = nn.functional.relu(x)
# Same FPN fusion with the third feature map
cur_fpn = self.adapter3(fpns[2])
if cur_fpn.size(0) != x.size(0):
cur_fpn = _expand(cur_fpn, x.size(0) // cur_fpn.size(0))
x = cur_fpn + nn.functional.interpolate(x, size=cur_fpn.shape[-2:], mode="nearest")
# Fifth conv + group-norm + ReLU block, then the 1-channel output convolution
x = self.lay5(x)
x = self.gn5(x)
x = nn.functional.relu(x)
x = self.out_lay(x)
# Return the final mask logits
return x
class DetrMHAttentionMap(nn.Module):
"""This is a 2D attention module, which only returns the attention softmax (no multiplication by value)"""
def __init__(self, query_dim, hidden_dim, num_heads, dropout=0.0, bias=True, std=None):
super().__init__()
self.num_heads = num_heads
self.hidden_dim = hidden_dim
self.dropout = nn.Dropout(dropout)
# Linear transformation for queries
self.q_linear = nn.Linear(query_dim, hidden_dim, bias=bias)
# Linear transformation for keys
self.k_linear = nn.Linear(query_dim, hidden_dim, bias=bias)
# Normalization factor for scaling dot products in attention calculation
self.normalize_fact = float(hidden_dim / self.num_heads) ** -0.5
def forward(self, q, k, mask: Optional[Tensor] = None):
# Linear transformation of queries
q = self.q_linear(q)
# Convolutional transformation of keys
k = nn.functional.conv2d(k, self.k_linear.weight.unsqueeze(-1).unsqueeze(-1), self.k_linear.bias)
# Reshape queries and keys for multi-head attention computation
queries_per_head = q.view(q.shape[0], q.shape[1], self.num_heads, self.hidden_dim // self.num_heads)
keys_per_head = k.view(k.shape[0], self.num_heads, self.hidden_dim // self.num_heads, k.shape[-2], k.shape[-1])
# Compute scaled dot-product attention scores
weights = torch.einsum("bqnc,bnchw->bqnhw", queries_per_head * self.normalize_fact, keys_per_head)
# Apply mask to attention weights if provided
if mask is not None:
weights.masked_fill_(mask.unsqueeze(1).unsqueeze(1), torch.finfo(weights.dtype).min)
# Apply softmax to obtain attention distributions
weights = nn.functional.softmax(weights.flatten(2), dim=-1).view(weights.size())
# Apply dropout
weights = self.dropout(weights)
return weights
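# Example (not part of the original file): a shape sketch of the attention-map module above. It takes the
# decoder output for each query and the projected feature map, and returns one heat-map per query and head.
import torch
attention_map = DetrMHAttentionMap(query_dim=256, hidden_dim=256, num_heads=8)
q = torch.randn(2, 100, 256)  # decoder hidden states: (batch_size, num_queries, d_model)
k = torch.randn(2, 256, 25, 34)  # projected feature map: (batch_size, d_model, height/32, width/32)
print(attention_map(q, k).shape)  # torch.Size([2, 100, 8, 25, 34])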
def dice_loss(inputs, targets, num_boxes):
"""
Compute the DICE loss, similar to generalized IOU for masks
Args:
inputs: A float tensor of arbitrary shape.
The predictions for each example.
targets: A float tensor with the same shape as inputs. Stores the binary
classification label for each element in inputs (0 for the negative class and 1 for the positive
class).
"""
# Apply sigmoid function to inputs
inputs = inputs.sigmoid()
# Flatten the inputs
inputs = inputs.flatten(1)
# Compute numerator of DICE coefficient
numerator = 2 * (inputs * targets).sum(1)
# Compute denominator of DICE coefficient
denominator = inputs.sum(-1) + targets.sum(-1)
# Compute DICE loss
loss = 1 - (numerator + 1) / (denominator + 1)
return loss.sum() / num_boxes
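# Example (not part of the original file): a tiny numeric check of the DICE loss. Confident logits on
# the correct pixels drive the loss towards 0; fully wrong logits give roughly 1 - 1/5 = 0.8 here.
import torch
targets = torch.tensor([[1.0, 0.0, 1.0, 0.0]])
good_logits = torch.tensor([[10.0, -10.0, 10.0, -10.0]])
print(dice_loss(good_logits, targets, num_boxes=1))  # ~0.0
print(dice_loss(-good_logits, targets, num_boxes=1))  # ~0.8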
def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2):
"""
Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002.
Args:
inputs (`torch.FloatTensor` of arbitrary shape):
The predictions for each example.
targets (`torch.FloatTensor` with the same shape as `inputs`)
A tensor storing the binary classification label for each element in the `inputs` (0 for the negative class
and 1 for the positive class).
alpha (`float`, *optional*, defaults to `0.25`):
Optional weighting factor in the range (0,1) to balance positive vs. negative examples.
gamma (`int`, *optional*, defaults to `2`):
Exponent of the modulating factor (1 - p_t) to balance easy vs hard examples.
Returns:
Loss tensor
"""
# Sigmoid of the logits, giving probabilities in (0, 1)
prob = inputs.sigmoid()
# Per-element binary cross-entropy, kept unreduced
ce_loss = nn.functional.binary_cross_entropy_with_logits(inputs, targets, reduction="none")
# Modulating factor p_t: the probability assigned to the true class
p_t = prob * targets + (1 - prob) * (1 - targets)
# Down-weight easy examples with the focal term (1 - p_t) ** gamma
loss = ce_loss * ((1 - p_t) ** gamma)
# Optionally re-balance positives and negatives with alpha
if alpha >= 0:
alpha_t = alpha * targets + (1 - alpha) * (1 - targets)
loss = alpha_t * loss
# Average over elements per prediction, sum over predictions, then normalize by num_boxes
return loss.mean(1).sum() / num_boxes
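# Example (not part of the original file): with gamma = 0 and alpha disabled (alpha < 0), the focal
# loss reduces to plain binary cross-entropy averaged per prediction and divided by num_boxes.
import torch
from torch import nn
inputs = torch.randn(3, 5)
targets = torch.randint(0, 2, (3, 5)).float()
focal = sigmoid_focal_loss(inputs, targets, num_boxes=3, alpha=-1, gamma=0)
bce = nn.functional.binary_cross_entropy_with_logits(inputs, targets, reduction="none").mean(1).sum() / 3
print(torch.allclose(focal, bce))  # True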
"""
This class computes the losses for DetrForObjectDetection/DetrForSegmentation. The process happens in two steps: 1)
we compute hungarian assignment between ground truth boxes and the outputs of the model 2) we supervise each pair
of matched ground-truth / prediction (supervise class and box).
A note on the `num_classes` argument (copied from original repo in detr.py): "the naming of the `num_classes`
parameter of the criterion is somewhat misleading. It indeed corresponds to `max_obj_id` + 1, where `max_obj_id` is
the maximum id for a class in your dataset. For example, COCO has a `max_obj_id` of 90, so we pass `num_classes` to
be 91. As another example, for a dataset that has a single class with `id` 1, you should pass `num_classes` to be 2
(`max_obj_id` + 1). For more details on this, check the following discussion
https://github.com/facebookresearch/detr/issues/108#issuecomment-650269223"
Args:
matcher (`DetrHungarianMatcher`):
Module able to compute a matching between targets and proposals.
num_classes (`int`):
Number of object categories, omitting the special no-object category.
eos_coef (`float`):
Relative classification weight applied to the no-object category.
losses (`List[str]`):
List of all the losses to be applied. See `get_loss` for a list of all available losses.
"""
def __init__(self, matcher, num_classes, eos_coef, losses):
super().__init__()
self.matcher = matcher  # module computing the matching between targets and predictions
self.num_classes = num_classes  # number of object classes, without the special no-object class
self.eos_coef = eos_coef  # relative classification weight of the no-object class
self.losses = losses  # list of losses to apply
# Class weights for the cross-entropy; the last entry is the no-object weight
empty_weight = torch.ones(self.num_classes + 1)
empty_weight[-1] = self.eos_coef
self.register_buffer("empty_weight", empty_weight)
# removed logging parameter, which was part of the original implementation
def loss_labels(self, outputs, targets, indices, num_boxes):
"""
Classification loss (NLL) targets dicts must contain the key "class_labels" containing a tensor of dim
[nb_target_boxes]
"""
if "logits" not in outputs:
raise KeyError("No logits were found in the outputs")
source_logits = outputs["logits"]  # classification logits of the model
idx = self._get_source_permutation_idx(indices)  # (batch, query) indices of the matched predictions
target_classes_o = torch.cat([t["class_labels"][J] for t, (_, J) in zip(targets, indices)])  # matched target classes
target_classes = torch.full(
source_logits.shape[:2], self.num_classes, dtype=torch.int64, device=source_logits.device
)
target_classes[idx] = target_classes_o  # unmatched queries keep the no-object class
# Cross-entropy over classes, with the no-object class down-weighted by `empty_weight`
loss_ce = nn.functional.cross_entropy(source_logits.transpose(1, 2), target_classes, self.empty_weight)
losses = {"loss_ce": loss_ce}
return losses
@torch.no_grad()
def loss_cardinality(self, outputs, targets, indices, num_boxes):
"""
Compute the cardinality error, i.e. the absolute error in the number of predicted non-empty boxes.
This is not really a loss, it is intended for logging purposes only. It doesn't propagate gradients.
"""
# Classification logits of the model
logits = outputs["logits"]
device = logits.device
# Number of target boxes per image
target_lengths = torch.as_tensor([len(v["class_labels"]) for v in targets], device=device)
# Count predictions that are not the no-object class (the last class index)
card_pred = (logits.argmax(-1) != logits.shape[-1] - 1).sum(1)
# Absolute error between predicted and true number of boxes
card_err = nn.functional.l1_loss(card_pred.float(), target_lengths.float())
losses = {"cardinality_error": card_err}
return losses
def loss_boxes(self, outputs, targets, indices, num_boxes):
"""
Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss.
Targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4]. The target boxes
are expected in format (center_x, center_y, w, h), normalized by the image size.
"""
# Predicted boxes must be present in the outputs
if "pred_boxes" not in outputs:
raise KeyError("No predicted boxes found in outputs")
# (batch, query) indices of the matched predictions
idx = self._get_source_permutation_idx(indices)
# Matched predicted boxes and their target boxes
source_boxes = outputs["pred_boxes"][idx]
target_boxes = torch.cat([t["boxes"][i] for t, (_, i) in zip(targets, indices)], dim=0)
# L1 regression loss on the box coordinates
loss_bbox = nn.functional.l1_loss(source_boxes, target_boxes, reduction="none")
losses = {}
losses["loss_bbox"] = loss_bbox.sum() / num_boxes
# Generalized IoU loss between matched boxes, converted to corner format
loss_giou = 1 - torch.diag(
generalized_box_iou(center_to_corners_format(source_boxes), center_to_corners_format(target_boxes))
)
losses["loss_giou"] = loss_giou.sum() / num_boxes
return losses
def loss_masks(self, outputs, targets, indices, num_boxes):
"""
Compute the losses related to the masks: the focal loss and the dice loss.
Targets dicts must contain the key "masks" containing a tensor of dim [nb_target_boxes, h, w].
"""
# Predicted masks must be present in the outputs
if "pred_masks" not in outputs:
raise KeyError("No predicted masks found in outputs")
# Indices of the matched predictions and of the matched targets
source_idx = self._get_source_permutation_idx(indices)
target_idx = self._get_target_permutation_idx(indices)
# Select the predicted masks that were matched
source_masks = outputs["pred_masks"]
source_masks = source_masks[source_idx]
# Collect the target masks and pad them into a single tensor (with a validity mask)
masks = [t["masks"] for t in targets]
target_masks, valid = nested_tensor_from_tensor_list(masks).decompose()
target_masks = target_masks.to(source_masks)
target_masks = target_masks[target_idx]
# Upsample the predicted masks to the target mask size
source_masks = nn.functional.interpolate(
source_masks[:, None], size=target_masks.shape[-2:], mode="bilinear", align_corners=False
)
# Flatten predictions and targets to (num_masks, num_pixels)
source_masks = source_masks[:, 0].flatten(1)
target_masks = target_masks.flatten(1)
target_masks = target_masks.view(source_masks.shape)
# Focal loss and DICE loss on the masks
losses = {
"loss_mask": sigmoid_focal_loss(source_masks, target_masks, num_boxes),
"loss_dice": dice_loss(source_masks, target_masks, num_boxes),
}
return losses
def _get_source_permutation_idx(self, indices):
# Permute the predictions following `indices`; returns (batch indices, source/query indices)
batch_idx = torch.cat([torch.full_like(source, i) for i, (source, _) in enumerate(indices)])
source_idx = torch.cat([source for (source, _) in indices])
return batch_idx, source_idx
def _get_target_permutation_idx(self, indices):
# Permute the targets following `indices`; returns (batch indices, target indices)
batch_idx = torch.cat([torch.full_like(target, i) for i, (_, target) in enumerate(indices)])
target_idx = torch.cat([target for (_, target) in indices])
return batch_idx, target_idx
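# Example (not part of the original file): how the matcher's per-image (prediction, target) index pairs
# are flattened into (batch_idx, source_idx) tensors that can index (batch_size, num_queries, ...) tensors.
import torch
indices = [
    (torch.tensor([7, 42]), torch.tensor([1, 0])),  # image 0: queries 7 and 42 matched to targets 1 and 0
    (torch.tensor([3]), torch.tensor([0])),  # image 1: query 3 matched to target 0
]
batch_idx = torch.cat([torch.full_like(source, i) for i, (source, _) in enumerate(indices)])
source_idx = torch.cat([source for (source, _) in indices])
print(batch_idx.tolist(), source_idx.tolist())  # [0, 0, 1] [7, 42, 3]
pred_boxes = torch.randn(2, 100, 4)
print(pred_boxes[batch_idx, source_idx].shape)  # the 3 matched predictions: torch.Size([3, 4])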
def get_loss(self, loss, outputs, targets, indices, num_boxes):
# Map from loss name to the method computing it
loss_map = {
"labels": self.loss_labels,
"cardinality": self.loss_cardinality,
"boxes": self.loss_boxes,
"masks": self.loss_masks,
}
# The requested loss must be one of the supported ones
if loss not in loss_map:
raise ValueError(f"Loss {loss} not supported")
# Dispatch to the corresponding loss method
return loss_map[loss](outputs, targets, indices, num_boxes)
def forward(self, outputs, targets):
"""
This performs the loss computation.
Args:
outputs (`dict`, *optional*):
Dictionary of tensors, see the output specification of the model for the format.
targets (`List[dict]`, *optional*):
List of dicts, such that `len(targets) == batch_size`. The expected keys in each dict depends on the
losses applied, see each loss' doc.
"""
# Exclude auxiliary outputs from the outputs dictionary
outputs_without_aux = {k: v for k, v in outputs.items() if k != "auxiliary_outputs"}
# Retrieve indices that match outputs of the last layer with targets
indices = self.matcher(outputs_without_aux, targets)
# Compute the total number of target boxes across all samples for normalization
num_boxes = sum(len(t["class_labels"]) for t in targets)
# Convert num_boxes to a tensor of float type, and move it to the same device as outputs
num_boxes = torch.as_tensor([num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device)
world_size = 1
# Check if acceleration is available and adjust num_boxes and world_size accordingly
if is_accelerate_available():
# If PartialState._shared_state is not empty, reduce num_boxes
if PartialState._shared_state != {}:
num_boxes = reduce(num_boxes)
# Get the number of processes from PartialState
world_size = PartialState().num_processes
# Normalize num_boxes considering the number of processes
num_boxes = torch.clamp(num_boxes / world_size, min=1).item()
# Compute losses for each specified loss function
losses = {}
for loss in self.losses:
losses.update(self.get_loss(loss, outputs, targets, indices, num_boxes))
# If there are auxiliary outputs, compute losses for each auxiliary output separately
if "auxiliary_outputs" in outputs:
for i, auxiliary_outputs in enumerate(outputs["auxiliary_outputs"]):
indices = self.matcher(auxiliary_outputs, targets)
for loss in self.losses:
if loss == "masks":
# Skip computation of masks loss for auxiliary outputs due to cost
continue
# Append index to keys in losses dictionary for each auxiliary output
l_dict = self.get_loss(loss, auxiliary_outputs, targets, indices, num_boxes)
l_dict = {k + f"_{i}": v for k, v in l_dict.items()}
losses.update(l_dict)
# Return computed losses
return losses
# taken from https://github.com/facebookresearch/detr/blob/master/models/detr.py
class DetrMLPPredictionHead(nn.Module):
"""
Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates,
height and width of a bounding box w.r.t. an image.
Copied from https://github.com/facebookresearch/detr/blob/master/models/detr.py
"""
def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
super().__init__()
self.num_layers = num_layers
# Define a list of linear layers with ReLU activation for the MLP
h = [hidden_dim] * (num_layers - 1)
self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))
def forward(self, x):
# Feed forward through each linear layer with ReLU activation, except the last layer
for i, layer in enumerate(self.layers):
x = nn.functional.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
return x
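# Minimal usage sketch of the head above. The dimensions are illustrative (DETR typically uses a
# 3-layer MLP from the 256-dim decoder output to 4 box coordinates), not read from any config here.
import torch
head = DetrMLPPredictionHead(input_dim=256, hidden_dim=256, output_dim=4, num_layers=3)
decoder_output = torch.randn(2, 100, 256)     # (batch, num_queries, hidden)
pred_boxes = head(decoder_output).sigmoid()   # normalized (cx, cy, w, h) in [0, 1]
print(pred_boxes.shape)                       # torch.Size([2, 100, 4])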
# taken from https://github.com/facebookresearch/detr/blob/master/models/matcher.py
class DetrHungarianMatcher(nn.Module):
"""
This class computes an assignment between the targets and the predictions of the network.
For efficiency reasons, the targets don't include the no_object. Because of this, in general, there are more
predictions than targets. In this case, we do a 1-to-1 matching of the best predictions, while the others are
un-matched (and thus treated as non-objects).
Args:
class_cost:
The relative weight of the classification error in the matching cost.
bbox_cost:
The relative weight of the L1 error of the bounding box coordinates in the matching cost.
giou_cost:
The relative weight of the giou loss of the bounding box in the matching cost.
"""
def __init__(self, class_cost: float = 1, bbox_cost: float = 1, giou_cost: float = 1):
super().__init__()
# Ensure that the "scipy" library is available when initializing this module
requires_backends(self, ["scipy"])
self.class_cost = class_cost
self.bbox_cost = bbox_cost
self.giou_cost = giou_cost
# Check if all costs are non-zero; raise an error if they are all zero
if class_cost == 0 and bbox_cost == 0 and giou_cost == 0:
raise ValueError("All costs of the Matcher can't be 0")
@torch.no_grad()
def forward(self, outputs, targets):
"""
Args:
outputs (`dict`):
A dictionary that contains at least these entries:
* "logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits
* "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates.
targets (`List[dict]`):
A list of targets (len(targets) = batch_size), where each target is a dict containing:
* "class_labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of
ground-truth objects in the target) containing the class labels
* "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates.
Returns:
`List[Tuple]`: A list of size `batch_size`, containing tuples of (index_i, index_j) where:
- index_i is the indices of the selected predictions (in order)
- index_j is the indices of the corresponding selected targets (in order)
For each batch element, it holds: len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
"""
# Extract batch size and number of queries from the outputs
batch_size, num_queries = outputs["logits"].shape[:2]
# Flatten logits and apply softmax to get probabilities over classes
out_prob = outputs["logits"].flatten(0, 1).softmax(-1) # [batch_size * num_queries, num_classes]
# Flatten predicted boxes
out_bbox = outputs["pred_boxes"].flatten(0, 1) # [batch_size * num_queries, 4]
# Concatenate target class labels into a single tensor
target_ids = torch.cat([v["class_labels"] for v in targets])
# Concatenate target boxes into a single tensor
target_bbox = torch.cat([v["boxes"] for v in targets])
# Compute classification cost matrix based on negative log likelihood approximation
class_cost = -out_prob[:, target_ids]
# Compute L1 cost matrix between predicted and target boxes
bbox_cost = torch.cdist(out_bbox, target_bbox, p=1)
# Compute generalized IoU cost matrix between predicted and target boxes
giou_cost = -generalized_box_iou(center_to_corners_format(out_bbox), center_to_corners_format(target_bbox))
# Combine different costs into a final cost matrix using predefined weights
cost_matrix = self.bbox_cost * bbox_cost + self.class_cost * class_cost + self.giou_cost * giou_cost
# Reshape cost matrix to batch size x num_queries x (sum of all target boxes)
cost_matrix = cost_matrix.view(batch_size, num_queries, -1).cpu()
# Split cost matrix based on number of target boxes in each sample and perform linear sum assignment
sizes = [len(v["boxes"]) for v in targets]
indices = [linear_sum_assignment(c[i]) for i, c in enumerate(cost_matrix.split(sizes, -1))]
# Return indices as a list of tuples containing selected predictions and corresponding targets
return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices]
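# Minimal sketch of the assignment step on a toy cost matrix (random costs, illustrative sizes):
# a batch of 2 images with 4 queries each, where image 0 has 2 targets and image 1 has 1 target.
import torch
from scipy.optimize import linear_sum_assignment
cost_matrix = torch.rand(2, 4, 3)   # (batch_size, num_queries, total number of target boxes)
sizes = [2, 1]
indices = [linear_sum_assignment(c[i]) for i, c in enumerate(cost_matrix.split(sizes, -1))]
# indices[0]: 2 query indices matched to targets 0 and 1 of image 0
# indices[1]: 1 query index matched to target 0 of image 1
print([(i.tolist(), j.tolist()) for i, j in indices])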
# below: bounding box utilities taken from https://github.com/facebookresearch/detr/blob/master/util/box_ops.py
def _upcast(t: Tensor) -> Tensor:
"""
Protects from numerical overflows in multiplications by upcasting to the equivalent higher type.
Args:
t (`Tensor`): The input tensor to be upcasted.
Returns:
`Tensor`: The upcasted tensor.
"""
if t.is_floating_point():
return t if t.dtype in (torch.float32, torch.float64) else t.float()
else:
return t if t.dtype in (torch.int32, torch.int64) else t.int()
def box_area(boxes: Tensor) -> Tensor:
"""
Computes the area of a set of bounding boxes, specified by (x1, y1, x2, y2) coordinates.
Args:
boxes (`torch.FloatTensor` of shape `(number_of_boxes, 4)`):
Boxes for which the area will be computed. They are expected in (x1, y1, x2, y2) format with `0 <= x1 < x2` and `0 <= y1 < y2`.
Returns:
`torch.FloatTensor`: A tensor containing the area for each box.
"""
boxes = _upcast(boxes)
return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
# modified from torchvision to also return the union
def box_iou(boxes1, boxes2):
"""
Computes the Intersection over Union (IoU) between two sets of bounding boxes.
Args:
boxes1 (`Tensor`): Bounding boxes in format (x1, y1, x2, y2).
boxes2 (`Tensor`): Bounding boxes in format (x1, y1, x2, y2).
Returns:
`Tensor`: IoU scores for each pair of boxes.
`Tensor`: Union area for each pair of boxes.
"""
area1 = box_area(boxes1)
area2 = box_area(boxes2)
left_top = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2]
right_bottom = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2]
width_height = (right_bottom - left_top).clamp(min=0) # [N,M,2]
inter = width_height[:, :, 0] * width_height[:, :, 1] # [N,M]
union = area1[:, None] + area2 - inter
iou = inter / union
return iou, union
def generalized_box_iou(boxes1, boxes2):
"""
Computes the Generalized Intersection over Union (IoU) between two sets of bounding boxes.
Args:
boxes1 (`Tensor`): Bounding boxes in format [x0, y0, x1, y1].
boxes2 (`Tensor`): Bounding boxes in format [x0, y0, x1, y1].
Returns:
`Tensor`: Generalized IoU scores for each pair of boxes.
"""
# degenerate boxes gives inf / nan results
# so do an early check
if not (boxes1[:, 2:] >= boxes1[:, :2]).all():
raise ValueError(f"boxes1 must be in [x0, y0, x1, y1] (corner) format, but got {boxes1}")
if not (boxes2[:, 2:] >= boxes2[:, :2]).all():
raise ValueError(f"boxes2 must be in [x0, y0, x1, y1] (corner) format, but got {boxes2}")
iou, union = box_iou(boxes1, boxes2)
top_left = torch.min(boxes1[:, None, :2], boxes2[:, :2])
bottom_right = torch.max(boxes1[:, None, 2:], boxes2[:, 2:])
width_height = (bottom_right - top_left).clamp(min=0) # [N,M,2]
area = width_height[:, :, 0] * width_height[:, :, 1]
return iou - (area - union) / area
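# Worked example using box_iou / generalized_box_iou defined above (boxes in corner format):
# intersection = 1, union = 4 + 4 - 1 = 7, so IoU = 1/7 ≈ 0.143; the enclosing box [0, 0, 3, 3]
# has area 9, so GIoU = 1/7 - (9 - 7)/9 ≈ -0.079.
import torch
boxes1 = torch.tensor([[0.0, 0.0, 2.0, 2.0]])
boxes2 = torch.tensor([[1.0, 1.0, 3.0, 3.0]])
iou, union = box_iou(boxes1, boxes2)
giou = generalized_box_iou(boxes1, boxes2)
print(iou.item(), giou.item())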
# below: taken from https://github.com/facebookresearch/detr/blob/master/util/misc.py#L306
def _max_by_axis(the_list):
"""
Finds the maximum value along each axis of a list of lists.
Args:
the_list (`List[List[int]]`): A list of lists of integers.
Returns:
`List[int]`: A list containing the maximum value along each axis.
"""
maxes = the_list[0]
for sublist in the_list[1:]:
for index, item in enumerate(sublist):
maxes[index] = max(maxes[index], item)
return maxes
class NestedTensor(object):
# Stores a batch of tensors together with the corresponding padding mask
def __init__(self, tensors, mask: Optional[Tensor]):
self.tensors = tensors
self.mask = mask
# Move both the tensors and the mask (if any) to the given device
def to(self, device):
cast_tensor = self.tensors.to(device)
mask = self.mask
if mask is not None:
cast_mask = mask.to(device)
else:
cast_mask = None
return NestedTensor(cast_tensor, cast_mask)
# Return the underlying tensors and the mask
def decompose(self):
return self.tensors, self.mask
# The string representation delegates to the underlying tensors
def __repr__(self):
return str(self.tensors)
# Build a NestedTensor (padded batch + padding mask) from a list of image tensors
def nested_tensor_from_tensor_list(tensor_list: List[Tensor]):
if tensor_list[0].ndim == 3:
# Maximum size along each axis over all images in the list
max_size = _max_by_axis([list(img.shape) for img in tensor_list])
# Batch shape: (batch_size, num_channels, max_height, max_width)
batch_shape = [len(tensor_list)] + max_size
batch_size, num_channels, height, width = batch_shape
dtype = tensor_list[0].dtype
device = tensor_list[0].device
# Zero-initialized padded tensor and an all-True padding mask
tensor = torch.zeros(batch_shape, dtype=dtype, device=device)
mask = torch.ones((batch_size, height, width), dtype=torch.bool, device=device)
# Copy each image into the padded tensor and mark its real pixels as False in the mask
for img, pad_img, m in zip(tensor_list, tensor, mask):
pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img)
m[: img.shape[1], : img.shape[2]] = False
else:
raise ValueError("Only 3-dimensional tensors are supported")
return NestedTensor(tensor, mask)
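# Minimal sketch using the helpers above (illustrative sizes): two images of different resolutions
# are padded to a common (2, 3, 32, 48) batch, and the mask marks padded pixels with True.
import torch
images = [torch.randn(3, 32, 48), torch.randn(3, 24, 40)]
nested = nested_tensor_from_tensor_list(images)
tensors, mask = nested.decompose()
print(tensors.shape)            # torch.Size([2, 3, 32, 48])
print(mask.shape)               # torch.Size([2, 32, 48])
print(mask[1, :24, :40].any())  # tensor(False) -> real pixels
print(mask[1, 24:, :].all())    # tensor(True)  -> padding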
.\models\detr\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available
_import_structure = {"configuration_detr": ["DETR_PRETRAINED_CONFIG_ARCHIVE_MAP", "DetrConfig", "DetrOnnxConfig"]}
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["feature_extraction_detr"] = ["DetrFeatureExtractor"]
_import_structure["image_processing_detr"] = ["DetrImageProcessor"]
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_detr"] = [
"DETR_PRETRAINED_MODEL_ARCHIVE_LIST",
"DetrForObjectDetection",
"DetrForSegmentation",
"DetrModel",
"DetrPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_detr import DETR_PRETRAINED_CONFIG_ARCHIVE_MAP, DetrConfig, DetrOnnxConfig
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .feature_extraction_detr import DetrFeatureExtractor
from .image_processing_detr import DetrImageProcessor
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_detr import (
DETR_PRETRAINED_MODEL_ARCHIVE_LIST,
DetrForObjectDetection,
DetrForSegmentation,
DetrModel,
DetrPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\dialogpt\convert_dialogpt_original_pytorch_checkpoint_to_pytorch.py
import argparse
import os
import torch
from transformers.utils import WEIGHTS_NAME
DIALOGPT_MODELS = ["small", "medium", "large"]
OLD_KEY = "lm_head.decoder.weight"
NEW_KEY = "lm_head.weight"
def convert_dialogpt_checkpoint(checkpoint_path: str, pytorch_dump_folder_path: str):
d = torch.load(checkpoint_path)
d[NEW_KEY] = d.pop(OLD_KEY)
os.makedirs(pytorch_dump_folder_path, exist_ok=True)
torch.save(d, os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME))
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--dialogpt_path", default=".", type=str)
args = parser.parse_args()
for MODEL in DIALOGPT_MODELS:
checkpoint_path = os.path.join(args.dialogpt_path, f"{MODEL}_ft.pkl")
pytorch_dump_folder_path = f"./DialoGPT-{MODEL}"
convert_dialogpt_checkpoint(
checkpoint_path,
pytorch_dump_folder_path,
)
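# Toy illustration (not part of the script) of the single key rename the conversion performs: the
# fine-tuned DialoGPT checkpoints store the LM head under "lm_head.decoder.weight", while the
# Transformers GPT-2 implementation expects "lm_head.weight".
import torch
state_dict = {"lm_head.decoder.weight": torch.zeros(2, 3), "transformer.wte.weight": torch.zeros(2, 3)}
state_dict["lm_head.weight"] = state_dict.pop("lm_head.decoder.weight")
print(sorted(state_dict))  # ['lm_head.weight', 'transformer.wte.weight']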
.\models\dialogpt\__init__.py
.\models\dinat\configuration_dinat.py
"""
Dilated Neighborhood Attention Transformer model configuration
"""
from ...configuration_utils import PretrainedConfig
from ...utils import logging
from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices
logger = logging.get_logger(__name__)
DINAT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"shi-labs/dinat-mini-in1k-224": "https://huggingface.co/shi-labs/dinat-mini-in1k-224/resolve/main/config.json",
}
class DinatConfig(BackboneConfigMixin, PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`DinatModel`]. It is used to instantiate a Dinat
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the Dinat
[shi-labs/dinat-mini-in1k-224](https://huggingface.co/shi-labs/dinat-mini-in1k-224) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Example:
```
>>> from transformers import DinatConfig, DinatModel
>>> # Initializing a Dinat shi-labs/dinat-mini-in1k-224 style configuration
>>> configuration = DinatConfig()
>>> # Initializing a model (with random weights) from the shi-labs/dinat-mini-in1k-224 style configuration
>>> model = DinatModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
model_type = "dinat"
attribute_map = {
"num_attention_heads": "num_heads",
"num_hidden_layers": "num_layers",
}
super().__init__(**kwargs)
self.patch_size = patch_size
self.num_channels = num_channels
self.embed_dim = embed_dim
self.depths = depths
self.num_layers = len(depths)
self.num_heads = num_heads
self.kernel_size = kernel_size
self.dilations = dilations
self.mlp_ratio = mlp_ratio
self.qkv_bias = qkv_bias
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.drop_path_rate = drop_path_rate
self.hidden_act = hidden_act
self.layer_norm_eps = layer_norm_eps
self.initializer_range = initializer_range
self.hidden_size = int(embed_dim * 2 ** (len(depths) - 1))
self.layer_scale_init_value = layer_scale_init_value
self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(depths) + 1)]
self._out_features, self._out_indices = get_aligned_output_features_output_indices(
out_features=out_features, out_indices=out_indices, stage_names=self.stage_names
)
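# Illustrative arithmetic for the derived attributes above (the values are examples, not necessarily
# the actual defaults): with embed_dim=64 and depths=[3, 4, 6, 5] there are 4 stages, so
# hidden_size = 64 * 2 ** (4 - 1) = 512 and the stage names run from "stem" to "stage4".
embed_dim, depths = 64, [3, 4, 6, 5]
hidden_size = int(embed_dim * 2 ** (len(depths) - 1))
stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(depths) + 1)]
print(hidden_size, stage_names)  # 512 ['stem', 'stage1', 'stage2', 'stage3', 'stage4']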
.\models\dinat\modeling_dinat.py
""" PyTorch Dilated Neighborhood Attention Transformer model."""
import math
from dataclasses import dataclass
from typing import Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN
from ...modeling_outputs import BackboneOutput
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import (
ModelOutput,
OptionalDependencyNotAvailable,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
is_natten_available,
logging,
replace_return_docstrings,
requires_backends,
)
from ...utils.backbone_utils import BackboneMixin
from .configuration_dinat import DinatConfig
if is_natten_available():
from natten.functional import natten2dav, natten2dqkrpb
else:
def natten2dqkrpb(*args, **kwargs):
raise OptionalDependencyNotAvailable()
def natten2dav(*args, **kwargs):
raise OptionalDependencyNotAvailable()
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "DinatConfig"
_CHECKPOINT_FOR_DOC = "shi-labs/dinat-mini-in1k-224"
_EXPECTED_OUTPUT_SHAPE = [1, 7, 7, 512]
_IMAGE_CLASS_CHECKPOINT = "shi-labs/dinat-mini-in1k-224"
_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat"
DINAT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"shi-labs/dinat-mini-in1k-224",
]
@dataclass
class DinatEncoderOutput(ModelOutput):
"""
Dinat encoder's outputs, with potential hidden states and attentions.
"""
last_hidden_state: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
reshaped_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
@dataclass
class DinatModelOutput(ModelOutput):
"""
Dinat model's outputs that also contains a pooling of the last hidden states.
Args:
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*, returned when `add_pooling_layer=True` is passed):
Average pooling of the last layer hidden-state.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
shape `(batch_size, hidden_size, height, width)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
include the spatial dimensions.
"""
last_hidden_state: torch.FloatTensor = None
pooler_output: Optional[torch.FloatTensor] = None
hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
reshaped_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
@dataclass
class DinatImageClassifierOutput(ModelOutput):
"""
Dinat outputs for image classification.
"""
Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
分类(如果 `config.num_labels==1` 则为回归)的损失值。
logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
SoftMax 之前的分类(或回归,如果 `config.num_labels==1`)分数。
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
模型每一层的隐藏状态,包括初始嵌入输出。
包含形状为 `(batch_size, sequence_length, hidden_size)` 的 `torch.FloatTensor` 元组。
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
注意力权重经过注意力 SoftMax 后的值,用于计算自注意力头中的加权平均值。
包含形状为 `(batch_size, num_heads, sequence_length, sequence_length)` 的 `torch.FloatTensor` 元组。
reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
模型每一层的隐藏状态,包括初始嵌入输出,且包含空间维度。
包含形状为 `(batch_size, hidden_size, height, width)` 的 `torch.FloatTensor` 元组。
"""
loss: Optional[torch.FloatTensor] = None
logits: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
reshaped_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
# Copied from transformers.models.nat.modeling_nat.NatEmbeddings with Nat->Dinat
class DinatEmbeddings(nn.Module):
"""
Construct the patch and position embeddings.
"""
def __init__(self, config):
super().__init__()
# Patch embeddings followed by LayerNorm and dropout
self.patch_embeddings = DinatPatchEmbeddings(config)
self.norm = nn.LayerNorm(config.embed_dim)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, pixel_values: Optional[torch.FloatTensor]) -> Tuple[torch.Tensor]:
# Patchify, normalize, then apply dropout
embeddings = self.patch_embeddings(pixel_values)
embeddings = self.norm(embeddings)
embeddings = self.dropout(embeddings)
return embeddings
# Copied from transformers.models.nat.modeling_nat.NatPatchEmbeddings with Nat->Dinat
class DinatPatchEmbeddings(nn.Module):
"""
This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
`hidden_states` (patch embeddings) of shape `(batch_size, height, width, hidden_size)` to be consumed by a
Transformer.
"""
def __init__(self, config):
super().__init__()
patch_size = config.patch_size
num_channels, hidden_size = config.num_channels, config.embed_dim
self.num_channels = num_channels
if patch_size == 4:
pass
else:
# TODO: Support arbitrary patch sizes.
raise ValueError("Dinat only supports patch size of 4 at the moment.")
# The projection is implemented with two strided convolutions
self.projection = nn.Sequential(
nn.Conv2d(self.num_channels, hidden_size // 2, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)),
nn.Conv2d(hidden_size // 2, hidden_size, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)),
)
def forward(self, pixel_values: Optional[torch.FloatTensor]) -> torch.Tensor:
_, num_channels, height, width = pixel_values.shape
# Check that the channel dimension of the pixel values matches the configuration
if num_channels != self.num_channels:
raise ValueError(
"Make sure that the channel dimension of the pixel values match with the one set in the configuration."
)
# Project to patch embeddings, then move channels to the last dimension
embeddings = self.projection(pixel_values)
embeddings = embeddings.permute(0, 2, 3, 1)
return embeddings
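# Minimal shape sketch of the projection above with illustrative sizes (3-channel 224x224 input,
# embed_dim=64): two stride-2 convolutions reduce the resolution by 4 in each direction and the
# result is permuted to channels-last.
import torch
from torch import nn
projection = nn.Sequential(
nn.Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)),
nn.Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)),
)
pixel_values = torch.randn(1, 3, 224, 224)
embeddings = projection(pixel_values).permute(0, 2, 3, 1)
print(embeddings.shape)  # torch.Size([1, 56, 56, 64])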
# Copied from transformers.models.nat.modeling_nat.NatDownsampler with Nat->Dinat
class DinatDownsampler(nn.Module):
"""
Convolutional downsampling layer.
Args:
dim (`int`):
Number of input channels.
norm_layer (`nn.Module`, *optional*, defaults to `nn.LayerNorm`):
Normalization layer class.
"""
def __init__(self, dim: int, norm_layer: nn.Module = nn.LayerNorm) -> None:
super().__init__()
self.dim = dim
# A strided convolution halves the spatial resolution and doubles the channels
self.reduction = nn.Conv2d(dim, 2 * dim, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
self.norm = norm_layer(2 * dim)
# Takes a [batch_size, height, width, channels] tensor and returns the downsampled tensor in the same layout
def forward(self, input_feature: torch.Tensor) -> torch.Tensor:
# Permute to channels-first for the convolution, back to channels-last, then normalize
input_feature = self.reduction(input_feature.permute(0, 3, 1, 2)).permute(0, 2, 3, 1)
input_feature = self.norm(input_feature)
return input_feature
# Copied from transformers.models.beit.modeling_beit.drop_path
def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
"""
Drop paths (Stochastic Depth) per sample (when applied in the main path of residual blocks).
Comment by Ross Wightman: this is the same as the DropConnect impl I created for EfficientNet, etc. networks,
however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
See discussion: https://github.com/tensorflow/tpu/issues/494
I've opted to change the layer and argument names to 'drop path' rather than mixing DropConnect as a layer name
and using 'survival rate' as the argument.
"""
if drop_prob == 0.0 or not training:
return input
keep_prob = 1 - drop_prob
shape = (input.shape[0],) + (1,) * (input.ndim - 1)  # works with tensors of any dimension, not just 2D ConvNets
random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
random_tensor.floor_()  # binarize
output = input.div(keep_prob) * random_tensor
return output
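# Minimal sketch of stochastic depth using the function above (drop_prob=0.5 is illustrative):
# entire samples are either zeroed out or rescaled by 1 / keep_prob, so the expectation is preserved.
import torch
torch.manual_seed(0)
x = torch.ones(8, 4, 4, 16)
out = drop_path(x, drop_prob=0.5, training=True)
kept = out.flatten(1).sum(dim=1) > 0
print(kept.sum().item(), "of 8 samples kept; kept samples hold the value 1 / 0.5 = 2.0")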
# Copied from transformers.models.beit.modeling_beit.BeitDropPath with Beit->Dinat
class DinatDropPath(nn.Module):
"""Drop paths (Stochastic Depth) per sample (when applied in the main path of residual blocks)."""
def __init__(self, drop_prob: Optional[float] = None) -> None:
super().__init__()
self.drop_prob = drop_prob
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
return drop_path(hidden_states, self.drop_prob, self.training)
def extra_repr(self) -> str:
return "p={}".format(self.drop_prob)
class NeighborhoodAttention(nn.Module):
def __init__(self, config, dim, num_heads, kernel_size, dilation):
super().__init__()
if dim % num_heads != 0:
raise ValueError(
f"The hidden size ({dim}) is not a multiple of the number of attention heads ({num_heads})"
)
self.num_attention_heads = num_heads
self.attention_head_size = int(dim / num_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.kernel_size = kernel_size
self.dilation = dilation
# rpb is a learnable relative positional bias; the same concept is used in Swin.
self.rpb = nn.Parameter(torch.zeros(num_heads, (2 * self.kernel_size - 1), (2 * self.kernel_size - 1)))
self.query = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)
self.key = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)
self.value = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
# Copied from transformers.models.nat.modeling_nat.NeighborhoodAttention.transpose_for_scores with Nat->Dinat
# Reshape the input so the head dimension is split out for multi-head attention
def transpose_for_scores(self, x):
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
x = x.view(new_x_shape)
return x.permute(0, 3, 1, 2, 4)
def forward(
self,
hidden_states: torch.Tensor,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor]:
# Project to queries, keys and values, then split into heads
query_layer = self.transpose_for_scores(self.query(hidden_states))
key_layer = self.transpose_for_scores(self.key(hidden_states))
value_layer = self.transpose_for_scores(self.value(hidden_states))
# Scale the queries before computing attention scores
query_layer = query_layer / math.sqrt(self.attention_head_size)
# Compute attention scores, including the relative positional bias
attention_scores = natten2dqkrpb(query_layer, key_layer, self.rpb, self.kernel_size, self.dilation)
# Normalize the attention scores to probabilities
attention_probs = nn.functional.softmax(attention_scores, dim=-1)
# Dropout on the attention probabilities, as is standard in Transformers
attention_probs = self.dropout(attention_probs)
# Weighted sum of the values using the neighborhood attention kernel
context_layer = natten2dav(attention_probs, value_layer, self.kernel_size, self.dilation)
# Merge the heads back into the hidden dimension
context_layer = context_layer.permute(0, 2, 3, 1, 4).contiguous()
new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
context_layer = context_layer.view(new_context_layer_shape)
# Return the context tensor and, if requested, the attention probabilities
outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
return outputs
# Copied from transformers.models.nat.modeling_nat.NeighborhoodAttentionOutput
class NeighborhoodAttentionOutput(nn.Module):
def __init__(self, config, dim):
super().__init__()
# 定义一个全连接层,输入和输出维度都是 dim
self.dense = nn.Linear(dim, dim)
# 定义一个 Dropout 层,使用配置中的概率来丢弃注意力机制的概率
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
# 将输入的 hidden_states 经过全连接层 dense
hidden_states = self.dense(hidden_states)
# 对经过全连接层后的 hidden_states 进行 Dropout 处理
hidden_states = self.dropout(hidden_states)
return hidden_states
class NeighborhoodAttentionModule(nn.Module):
def __init__(self, config, dim, num_heads, kernel_size, dilation):
super().__init__()
# 创建一个邻域注意力模块,使用给定的参数
self.self = NeighborhoodAttention(config, dim, num_heads, kernel_size, dilation)
# 创建一个输出层,将邻域注意力模块的输出映射到指定维度上
self.output = NeighborhoodAttentionOutput(config, dim)
# 初始化一个空的集合,用于存储被剪枝的注意力头索引
self.pruned_heads = set()
# Copied from transformers.models.nat.modeling_nat.NeighborhoodAttentionModule.prune_heads
def prune_heads(self, heads):
# 如果 heads 列表为空,则直接返回
if len(heads) == 0:
return
# 查找可剪枝的注意力头和对应的索引
heads, index = find_pruneable_heads_and_indices(
heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
)
# 剪枝线性层
self.self.query = prune_linear_layer(self.self.query, index)
self.self.key = prune_linear_layer(self.self.key, index)
self.self.value = prune_linear_layer(self.self.value, index)
self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
# 更新超参数并存储被剪枝的头
self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
self.pruned_heads = self.pruned_heads.union(heads)
# Copied from transformers.models.nat.modeling_nat.NeighborhoodAttentionModule.forward
def forward(
self,
hidden_states: torch.Tensor,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor]:
# 执行邻域注意力模块的前向传播
self_outputs = self.self(hidden_states, output_attentions)
# 将邻域注意力模块的输出传递给输出层,同时传入原始的 hidden_states
attention_output = self.output(self_outputs[0], hidden_states)
# 如果需要输出注意力权重,则将它们加入到输出中
outputs = (attention_output,) + self_outputs[1:]
return outputs
# Copied from transformers.models.nat.modeling_nat.NatIntermediate with Nat->Dinat
class DinatIntermediate(nn.Module):
def __init__(self, config, dim):
super().__init__()
# 定义一个线性层,将输入维度 dim 映射到 config.mlp_ratio * dim 的输出维度
self.dense = nn.Linear(dim, int(config.mlp_ratio * dim))
# 根据配置中的激活函数类型选择对应的激活函数
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
# 将输入的 hidden_states 经过线性层 dense
hidden_states = self.dense(hidden_states)
# 将线性层的输出经过选择的激活函数 intermediate_act_fn 处理
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states
# 从transformers.models.nat.modeling_nat.NatOutput复制并将Nat->Dinat
class DinatOutput(nn.Module):
def __init__(self, config, dim):
super().__init__()
# 使用线性层将输入维度映射到指定维度,mlp_ratio为配置参数
self.dense = nn.Linear(int(config.mlp_ratio * dim), dim)
# 使用指定的dropout概率创建一个dropout层
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
# 将输入的hidden_states通过线性层映射
hidden_states = self.dense(hidden_states)
# 对映射后的结果进行dropout处理
hidden_states = self.dropout(hidden_states)
return hidden_states
class DinatLayer(nn.Module):
def __init__(self, config, dim, num_heads, dilation, drop_path_rate=0.0):
super().__init__()
# 设置用于分块前馈的块大小
self.chunk_size_feed_forward = config.chunk_size_feed_forward
# 设置卷积核大小
self.kernel_size = config.kernel_size
# 设置扩张率
self.dilation = dilation
# 计算窗口大小,是卷积核大小和扩张率的乘积
self.window_size = self.kernel_size * self.dilation
# 在LayerNorm之前应用LayerNorm进行归一化,eps是配置参数
self.layernorm_before = nn.LayerNorm(dim, eps=config.layer_norm_eps)
# 使用NeighborhoodAttentionModule创建注意力层,config为配置参数
self.attention = NeighborhoodAttentionModule(
config, dim, num_heads, kernel_size=self.kernel_size, dilation=self.dilation
)
# 如果drop_path_rate大于0,创建DropPath层,否则创建Identity层
self.drop_path = DinatDropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
# 在LayerNorm之后应用LayerNorm进行归一化,eps是配置参数
self.layernorm_after = nn.LayerNorm(dim, eps=config.layer_norm_eps)
# 创建DinatIntermediate层,处理中间状态
self.intermediate = DinatIntermediate(config, dim)
# 创建DinatOutput层,产生最终输出
self.output = DinatOutput(config, dim)
# 如果配置中的layer_scale_init_value大于0,则创建可训练参数,否则为None
self.layer_scale_parameters = (
nn.Parameter(config.layer_scale_init_value * torch.ones((2, dim)), requires_grad=True)
if config.layer_scale_init_value > 0
else None
)
def maybe_pad(self, hidden_states, height, width):
# 获取当前窗口大小
window_size = self.window_size
# 默认填充值为0
pad_values = (0, 0, 0, 0, 0, 0)
# 如果输入的高度或宽度小于窗口大小,则进行填充
if height < window_size or width < window_size:
pad_l = pad_t = 0
pad_r = max(0, window_size - width)
pad_b = max(0, window_size - height)
pad_values = (0, 0, pad_l, pad_r, pad_t, pad_b)
# 对隐藏状态进行填充
hidden_states = nn.functional.pad(hidden_states, pad_values)
return hidden_states, pad_values
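# Minimal sketch of the padding logic in maybe_pad above (channels-last input; kernel_size=7 and
# dilation=1 are illustrative): a 5x5 feature map is padded on the right/bottom up to the 7x7 window.
# nn.functional.pad pads the trailing dimensions first: (channels, width, height) for this layout.
import torch
from torch import nn
window_size = 7 * 1
hidden_states = torch.randn(1, 5, 5, 32)
pad_r = max(0, window_size - hidden_states.shape[2])
pad_b = max(0, window_size - hidden_states.shape[1])
padded = nn.functional.pad(hidden_states, (0, 0, 0, pad_r, 0, pad_b))
print(padded.shape)  # torch.Size([1, 7, 7, 32])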
def forward(
self,
hidden_states: torch.Tensor,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor, torch.Tensor]:
# 获取隐藏状态的批量大小、高度、宽度和通道数
batch_size, height, width, channels = hidden_states.size()
# 保存隐藏状态的快捷方式
shortcut = hidden_states
# 对隐藏状态进行 layer normalization
hidden_states = self.layernorm_before(hidden_states)
# 如果隐藏状态小于卷积核大小乘以膨胀率,则进行填充
hidden_states, pad_values = self.maybe_pad(hidden_states, height, width)
# 获取填充后的高度和宽度
_, height_pad, width_pad, _ = hidden_states.shape
# 执行注意力机制,获取注意力输出
attention_outputs = self.attention(hidden_states, output_attentions=output_attentions)
# 从注意力输出中提取主要的注意力输出
attention_output = attention_outputs[0]
# 检查是否进行了填充
was_padded = pad_values[3] > 0 or pad_values[5] > 0
if was_padded:
# 如果有填充,则裁剪注意力输出以匹配原始尺寸
attention_output = attention_output[:, :height, :width, :].contiguous()
# 如果存在层缩放参数,则应用第一个参数到注意力输出
if self.layer_scale_parameters is not None:
attention_output = self.layer_scale_parameters[0] * attention_output
# 将注意力输出与快捷方式相加,应用 drop path 操作
hidden_states = shortcut + self.drop_path(attention_output)
# 对层输出进行 layer normalization
layer_output = self.layernorm_after(hidden_states)
# 经过中间层和输出层的处理
layer_output = self.output(self.intermediate(layer_output))
# 如果存在层缩放参数,则应用第二个参数到层输出
if self.layer_scale_parameters is not None:
layer_output = self.layer_scale_parameters[1] * layer_output
# 将层输出与隐藏状态相加,再应用 drop path 操作
layer_output = hidden_states + self.drop_path(layer_output)
# 构造层输出元组,可能包含注意力权重
layer_outputs = (layer_output, attention_outputs[1]) if output_attentions else (layer_output,)
return layer_outputs
# 定义了一个名为 DinatStage 的自定义神经网络模块,继承自 nn.Module
class DinatStage(nn.Module):
# 初始化函数,接收多个参数用于配置模块
def __init__(self, config, dim, depth, num_heads, dilations, drop_path_rate, downsample):
super().__init__()
self.config = config # 存储配置参数
self.dim = dim # 存储维度参数
# 使用 nn.ModuleList 存储 DinatLayer 层的列表
self.layers = nn.ModuleList(
[
DinatLayer(
config=config,
dim=dim,
num_heads=num_heads,
dilation=dilations[i],
drop_path_rate=drop_path_rate[i],
)
for i in range(depth)
]
)
# 如果 downsample 参数不为 None,则创建 downsample 层
if downsample is not None:
self.downsample = downsample(dim=dim, norm_layer=nn.LayerNorm)
else:
self.downsample = None
self.pointing = False # 初始化 pointing 属性为 False
# 重写 forward 方法,执行前向传播计算
# 从 transformers.models.nat.modeling_nat.NatStage.forward 复制而来
def forward(
self,
hidden_states: torch.Tensor,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor]:
_, height, width, _ = hidden_states.size()
# 遍历 self.layers 列表中的每个 DinatLayer 层,依次计算输出
for i, layer_module in enumerate(self.layers):
layer_outputs = layer_module(hidden_states, output_attentions)
hidden_states = layer_outputs[0] # 更新 hidden_states
hidden_states_before_downsampling = hidden_states
# 如果存在 downsample 层,则对计算前的 hidden_states 进行下采样
if self.downsample is not None:
hidden_states = self.downsample(hidden_states_before_downsampling)
# 返回计算后的 hidden_states 和计算前的 hidden_states_before_downsampling
stage_outputs = (hidden_states, hidden_states_before_downsampling)
# 如果需要输出注意力矩阵,则将其加入 stage_outputs 中
if output_attentions:
stage_outputs += layer_outputs[1:]
return stage_outputs
# 定义了一个名为 DinatEncoder 的自定义神经网络模块,继承自 nn.Module
class DinatEncoder(nn.Module):
# 初始化函数,接收配置参数 config
def __init__(self, config):
super().__init__()
self.num_levels = len(config.depths) # 计算深度级别数量
self.config = config # 存储配置参数
# 根据配置参数创建多层 DinatStage 模块,并存储在 nn.ModuleList 中
dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths))]
self.levels = nn.ModuleList(
[
DinatStage(
config=config,
dim=int(config.embed_dim * 2**i_layer),
depth=config.depths[i_layer],
num_heads=config.num_heads[i_layer],
dilations=config.dilations[i_layer],
drop_path_rate=dpr[sum(config.depths[:i_layer]) : sum(config.depths[: i_layer + 1])],
downsample=DinatDownsampler if (i_layer < self.num_levels - 1) else None,
)
for i_layer in range(self.num_levels)
]
)
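# Illustrative drop-path schedule matching the dpr computation above (depths=[3, 4, 6, 5] and
# drop_path_rate=0.2 are example values): one linspace over all blocks is sliced per stage, so
# later blocks receive higher stochastic-depth rates.
import torch
depths, drop_path_rate = [3, 4, 6, 5], 0.2
dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]
per_stage = [dpr[sum(depths[:i]) : sum(depths[: i + 1])] for i in range(len(depths))]
print([len(rates) for rates in per_stage])                      # [3, 4, 6, 5]
print(round(per_stage[0][0], 3), round(per_stage[-1][-1], 3))   # 0.0 0.2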
# 重写 forward 方法,执行前向传播计算
# 从 transformers.models.nat.modeling_nat.NatEncoder.forward 复制而来,Nat->Dinat
def forward(
self,
hidden_states: torch.Tensor,
output_attentions: Optional[bool] = False,
output_hidden_states: Optional[bool] = False,
output_hidden_states_before_downsampling: Optional[bool] = False,
return_dict: Optional[bool] = True,
) -> Union[Tuple, DinatEncoderOutput]:
# 如果没有要输出的隐藏状态,则置空
all_hidden_states = () if output_hidden_states else None
# 如果没有要输出的重塑后的隐藏状态,则置空
all_reshaped_hidden_states = () if output_hidden_states else None
# 如果没有要输出的注意力权重,则置空
all_self_attentions = () if output_attentions else None
if output_hidden_states:
# 重新排列隐藏状态的维度顺序:从 b h w c 到 b c h w
reshaped_hidden_state = hidden_states.permute(0, 3, 1, 2)
# 将当前隐藏状态添加到所有隐藏状态的元组中
all_hidden_states += (hidden_states,)
# 将重塑后的隐藏状态添加到所有重塑后的隐藏状态的元组中
all_reshaped_hidden_states += (reshaped_hidden_state,)
for i, layer_module in enumerate(self.levels):
# 对每一层模块进行前向传播
layer_outputs = layer_module(hidden_states, output_attentions)
# 更新当前隐藏状态为当前层的输出的第一个元素
hidden_states = layer_outputs[0]
# 如果需要输出隐藏状态且需要输出下采样前的隐藏状态
hidden_states_before_downsampling = layer_outputs[1]
if output_hidden_states and output_hidden_states_before_downsampling:
# 重新排列下采样前的隐藏状态的维度顺序:从 b h w c 到 b c h w
reshaped_hidden_state = hidden_states_before_downsampling.permute(0, 3, 1, 2)
# 将下采样前的隐藏状态添加到所有隐藏状态的元组中
all_hidden_states += (hidden_states_before_downsampling,)
# 将重塑后的隐藏状态添加到所有重塑后的隐藏状态的元组中
all_reshaped_hidden_states += (reshaped_hidden_state,)
elif output_hidden_states and not output_hidden_states_before_downsampling:
# 重新排列当前隐藏状态的维度顺序:从 b h w c 到 b c h w
reshaped_hidden_state = hidden_states.permute(0, 3, 1, 2)
# 将当前隐藏状态添加到所有隐藏状态的元组中
all_hidden_states += (hidden_states,)
# 将重塑后的隐藏状态添加到所有重塑后的隐藏状态的元组中
all_reshaped_hidden_states += (reshaped_hidden_state,)
if output_attentions:
# 将当前层的注意力权重添加到所有注意力权重的元组中
all_self_attentions += layer_outputs[2:]
if not return_dict:
# 如果不返回字典,则返回非空值的元组
return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
# 返回 DinatEncoderOutput 对象,包含最终的隐藏状态、所有隐藏状态、所有注意力权重和所有重塑后的隐藏状态
return DinatEncoderOutput(
last_hidden_state=hidden_states,
hidden_states=all_hidden_states,
attentions=all_self_attentions,
reshaped_hidden_states=all_reshaped_hidden_states,
)
# DinatPreTrainedModel 类的子类,用于处理权重初始化以及下载和加载预训练模型的简单接口
class DinatPreTrainedModel(PreTrainedModel):
# 模型的配置类,指定为 DinatConfig
config_class = DinatConfig
# 基础模型的前缀名称为 "dinat"
base_model_prefix = "dinat"
# 主要输入的名称为 "pixel_values"
main_input_name = "pixel_values"
def _init_weights(self, module):
"""初始化模型的权重"""
# 如果是 nn.Linear 或 nn.Conv2d 模块
if isinstance(module, (nn.Linear, nn.Conv2d)):
# 使用正态分布初始化权重数据,标准差为 self.config.initializer_range
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
# 如果存在偏置项,则将其初始化为零
if module.bias is not None:
module.bias.data.zero_()
# 如果是 nn.LayerNorm 模块
elif isinstance(module, nn.LayerNorm):
# 将偏置项初始化为零
module.bias.data.zero_()
# 将权重初始化为全1
module.weight.data.fill_(1.0)
# DINAT_START_DOCSTRING holds the shared docstring prepended to the Dinat model classes
DINAT_START_DOCSTRING = r"""
This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html) sub-class. Use
it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
Parameters:
config ([`DinatConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
# DINAT_INPUTS_DOCSTRING documents the forward arguments shared by the Dinat models
DINAT_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`ViTImageProcessor.__call__`]
for details.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
# The add_start_docstrings decorator prepends the shared docstring to DinatModel
@add_start_docstrings(
"The bare Dinat Model transformer outputting raw hidden-states without any specific head on top.",
DINAT_START_DOCSTRING,
)
# DinatModel is the bare backbone model, inheriting from DinatPreTrainedModel
# Copied from transformers.models.nat.modeling_nat.NatModel with Nat->Dinat, NAT->DINAT
class DinatModel(DinatPreTrainedModel):
def __init__(self, config, add_pooling_layer=True):
super().__init__(config)
requires_backends(self, ["natten"]) # 要求后端支持 "natten" 模块
self.config = config # 保存配置信息
self.num_levels = len(config.depths) # 确定金字塔层数
self.num_features = int(config.embed_dim * 2 ** (self.num_levels - 1)) # 计算特征数量
self.embeddings = DinatEmbeddings(config) # 初始化嵌入层
self.encoder = DinatEncoder(config) # 初始化编码器
self.layernorm = nn.LayerNorm(self.num_features, eps=config.layer_norm_eps) # 初始化层归一化层
self.pooler = nn.AdaptiveAvgPool1d(1) if add_pooling_layer else None # 根据参数决定是否添加池化层
# 初始化权重并进行最终处理
self.post_init()
def get_input_embeddings(self):
return self.embeddings.patch_embeddings # 返回输入嵌入
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads) # 剪枝模型的注意力头部
@add_start_docstrings_to_model_forward(DINAT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=DinatModelOutput,
config_class=_CONFIG_FOR_DOC,
modality="vision",
expected_output=_EXPECTED_OUTPUT_SHAPE,
)
def forward(
self,
pixel_values: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, DinatModelOutput]:
# 设置是否输出注意力权重,默认从模型配置中获取
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
# 设置是否输出隐藏状态,默认从模型配置中获取
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
# 设置是否返回字典格式的输出,默认从模型配置中获取
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# 如果未提供像素值,则抛出数值错误异常
if pixel_values is None:
raise ValueError("You have to specify pixel_values")
# 将像素值传入嵌入层进行处理
embedding_output = self.embeddings(pixel_values)
# 使用编码器处理嵌入输出
encoder_outputs = self.encoder(
embedding_output,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# 获取编码器的序列输出,并进行 LayerNormalization
sequence_output = encoder_outputs[0]
sequence_output = self.layernorm(sequence_output)
# 初始化池化输出为 None
pooled_output = None
# 如果模型有池化层,则对序列输出进行池化操作
if self.pooler is not None:
pooled_output = self.pooler(sequence_output.flatten(1, 2).transpose(1, 2))
pooled_output = torch.flatten(pooled_output, 1)
# 如果不要求以字典格式返回结果,则返回元组形式的输出
if not return_dict:
output = (sequence_output, pooled_output) + encoder_outputs[1:]
return output
# 否则,以自定义的输出对象形式返回结果,包括最后的隐藏状态、池化输出以及各层的隐藏状态和注意力权重
return DinatModelOutput(
last_hidden_state=sequence_output,
pooler_output=pooled_output,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
reshaped_hidden_states=encoder_outputs.reshaped_hidden_states,
)
@add_start_docstrings(
"""
Dinat Model transformer with an image classification head on top (a linear layer on top of the final hidden state
of the [CLS] token) e.g. for ImageNet.
""",
DINAT_START_DOCSTRING,
)
class DinatForImageClassification(DinatPreTrainedModel):
def __init__(self, config):
super().__init__(config)
# 检查后端库是否已经加载
requires_backends(self, ["natten"])
# 设置分类任务的类别数目
self.num_labels = config.num_labels
# 初始化 DinatModel 模型
self.dinat = DinatModel(config)
# 分类器头部
self.classifier = (
nn.Linear(self.dinat.num_features, config.num_labels) if config.num_labels > 0 else nn.Identity()
)
# 初始化权重并进行最终处理
self.post_init()
@add_start_docstrings_to_model_forward(DINAT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=DinatImageClassifierOutput,
config_class=_CONFIG_FOR_DOC,
expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
)
def forward(
self,
pixel_values: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, DinatImageClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
# 确保返回字典存在,如果未提供则使用配置中的默认设置
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# 调用自注意力网络模型(DINAT),传入像素值和其他选项参数
outputs = self.dinat(
pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# 提取汇聚后的输出特征
pooled_output = outputs[1]
# 将汇聚后的特征输入分类器,生成预测 logits
logits = self.classifier(pooled_output)
# 初始化损失值为 None
loss = None
# 如果提供了标签
if labels is not None:
# 如果问题类型未定义,则根据标签类型设置问题类型
if self.config.problem_type is None:
if self.num_labels == 1:
self.config.problem_type = "regression"
elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
self.config.problem_type = "single_label_classification"
else:
self.config.problem_type = "multi_label_classification"
# 根据问题类型计算损失
if self.config.problem_type == "regression":
loss_fct = MSELoss()
if self.num_labels == 1:
loss = loss_fct(logits.squeeze(), labels.squeeze())
else:
loss = loss_fct(logits, labels)
elif self.config.problem_type == "single_label_classification":
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(logits, labels)
# 如果不要求返回字典,则组装输出元组
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
# 返回 DINAT 图像分类器输出对象,包括损失、logits、隐藏状态、注意力等
return DinatImageClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
reshaped_hidden_states=outputs.reshaped_hidden_states,
)
# Documented (via add_start_docstrings) as the Dinat backbone, to be used with frameworks like DETR and MaskFormer.
# Inherits from both DinatPreTrainedModel and BackboneMixin.
class DinatBackbone(DinatPreTrainedModel, BackboneMixin):
def __init__(self, config):
super().__init__(config)
# Initialize the backbone bookkeeping from the config
super()._init_backbone(config)
# Make sure the required backend library is available
requires_backends(self, ["natten"])
# Embeddings and encoder
self.embeddings = DinatEmbeddings(config)
self.encoder = DinatEncoder(config)
# Per-stage feature dimensions, derived from the embedding dimension and the number of stages
self.num_features = [config.embed_dim] + [int(config.embed_dim * 2**i) for i in range(len(config.depths))]
# LayerNorm applied to the hidden states of the requested output stages
hidden_states_norms = {}
for stage, num_channels in zip(self._out_features, self.channels):
hidden_states_norms[stage] = nn.LayerNorm(num_channels)
self.hidden_states_norms = nn.ModuleDict(hidden_states_norms)
# Weight initialization and final processing
self.post_init()
# Return the input (patch) embeddings
def get_input_embeddings(self):
return self.embeddings.patch_embeddings
# The decorators (add_start_docstrings_to_model_forward / replace_return_docstrings) document the forward
# inputs and set the return type to BackboneOutput with _CONFIG_FOR_DOC as the config class
def forward(
self,
pixel_values: torch.Tensor,
output_hidden_states: Optional[bool] = None,
output_attentions: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> BackboneOutput:
# Fall back to the config defaults when the flags are not given explicitly
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
# Turn the pixel values into patch embeddings
embedding_output = self.embeddings(pixel_values)
# Run the encoder; the hidden states before downsampling are needed to build the feature maps
outputs = self.encoder(
embedding_output,
output_attentions=output_attentions,
output_hidden_states=True,
output_hidden_states_before_downsampling=True,
return_dict=True,
)
# Channels-first hidden states of every stage
hidden_states = outputs.reshaped_hidden_states
# Collect the feature maps of the requested output stages
feature_maps = ()
for stage, hidden_state in zip(self.stage_names, hidden_states):
if stage in self.out_features:
batch_size, num_channels, height, width = hidden_state.shape
# Apply the per-stage LayerNorm in channels-last layout, then go back to channels-first
hidden_state = hidden_state.permute(0, 2, 3, 1).contiguous()
hidden_state = hidden_state.view(batch_size, height * width, num_channels)
hidden_state = self.hidden_states_norms[stage](hidden_state)
hidden_state = hidden_state.view(batch_size, height, width, num_channels)
hidden_state = hidden_state.permute(0, 3, 1, 2).contiguous()
feature_maps += (hidden_state,)
# Tuple output when return_dict is disabled
if not return_dict:
output = (feature_maps,)
if output_hidden_states:
output += (outputs.hidden_states,)
return output
# Otherwise return a BackboneOutput with feature maps, hidden states and attentions
return BackboneOutput(
feature_maps=feature_maps,
hidden_states=outputs.hidden_states if output_hidden_states else None,
attentions=outputs.attentions,
)