Transformers Source Code Walkthrough (28)
.\models\conditional_detr\feature_extraction_conditional_detr.py
"""
Feature extractor class for Conditional DETR.
"""
import warnings
from ...image_transforms import rgb_to_id as _rgb_to_id
from ...utils import logging
from .image_processing_conditional_detr import ConditionalDetrImageProcessor
logger = logging.get_logger(__name__)
def rgb_to_id(x):
warnings.warn(
"rgb_to_id has moved and will not be importable from this module from v5. "
"Please import from transformers.image_transforms instead.",
FutureWarning,
)
return _rgb_to_id(x)
class ConditionalDetrFeatureExtractor(ConditionalDetrImageProcessor):
def __init__(self, *args, **kwargs) -> None:
warnings.warn(
"The class ConditionalDetrFeatureExtractor is deprecated and will be removed in version 5 of Transformers."
" Please use ConditionalDetrImageProcessor instead.",
FutureWarning,
)
super().__init__(*args, **kwargs)
.\models\conditional_detr\image_processing_conditional_detr.py
"""Conditional DETR 的图像处理器类。"""
import io
import pathlib
from collections import defaultdict
from typing import Any, Callable, Dict, Iterable, List, Optional, Set, Tuple, Union
import numpy as np
from ...feature_extraction_utils import BatchFeature
from ...image_processing_utils import BaseImageProcessor, get_size_dict
from ...image_transforms import (
PaddingMode,
center_to_corners_format,
corners_to_center_format,
id_to_rgb,
pad,
rescale,
resize,
rgb_to_id,
to_channel_dimension_format,
)
from ...image_utils import (
IMAGENET_DEFAULT_MEAN,
IMAGENET_DEFAULT_STD,
AnnotationFormat,
AnnotationType,
ChannelDimension,
ImageInput,
PILImageResampling,
get_image_size,
infer_channel_dimension_format,
is_scaled_image,
make_list_of_images,
to_numpy_array,
valid_images,
validate_annotations,
validate_kwargs,
validate_preprocess_arguments,
)
from ...utils import (
TensorType,
is_flax_available,
is_jax_tensor,
is_scipy_available,
is_tf_available,
is_tf_tensor,
is_torch_available,
is_torch_tensor,
is_vision_available,
logging,
)
if is_torch_available():
import torch
from torch import nn
if is_vision_available():
import PIL
if is_scipy_available():
import scipy.special
import scipy.stats
logger = logging.get_logger(__name__)
SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)
def get_size_with_aspect_ratio(image_size, size, max_size=None) -> Tuple[int, int]:
"""
根据输入图像大小和所需输出大小计算输出图像的尺寸。
Args:
image_size (`Tuple[int, int]`):
输入图像的尺寸.
size (`int`):
所需的输出尺寸.
max_size (`int`, *optional*):
允许的最大输出尺寸.
Returns:
Tuple[int, int]: 输出图像的高度和宽度.
"""
height, width = image_size
if max_size is not None:
min_original_size = float(min((height, width)))
max_original_size = float(max((height, width)))
if max_original_size / min_original_size * size > max_size:
size = int(round(max_size * min_original_size / max_original_size))
if (height <= width and height == size) or (width <= height and width == size):
return height, width
if width < height:
ow = size
oh = int(size * height / width)
else:
oh = size
ow = int(size * width / height)
return (oh, ow)
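To make the arithmetic above concrete, here is a small illustrative check (hand-worked values, not part of the original file):
# A 480x640 (height, width) image, shortest edge resized to 800, longest edge capped at 1333:
# 640/480 * 800 ≈ 1067 <= 1333, so the cap does not kick in.
print(get_size_with_aspect_ratio((480, 640), size=800, max_size=1333))  # (800, 1066)
# A very wide 500x2000 image: 2000/500 * 800 = 3200 > 1333, so size is first shrunk
# to round(1333 * 500 / 2000) = 333, giving a longest edge of 1332.
print(get_size_with_aspect_ratio((500, 2000), size=800, max_size=1333))  # (333, 1332)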
def get_resize_output_image_size(
input_image: np.ndarray,
size: Union[int, Tuple[int, int], List[int]],
max_size: Optional[int] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> Tuple[int, int]:
"""
Computes the output image size given the input image size and the desired output size. If the desired output size
is a tuple or list, the output image size is returned as is. If the desired output size is an integer, the output
image size is computed by keeping the aspect ratio of the input image size.
Args:
input_image (`np.ndarray`):
The image to resize.
size (`int` or `Tuple[int, int]` or `List[int]`):
The desired output size.
max_size (`int`, *optional*):
The maximum allowed output size.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred from the input image.
"""
image_size = get_image_size(input_image, input_data_format)
if isinstance(size, (list, tuple)):
return size
return get_size_with_aspect_ratio(image_size, size, max_size)
def get_numpy_to_framework_fn(arr) -> Callable:
"""
Returns a function that converts a numpy array to the framework of the input array.
Args:
arr (`np.ndarray`): The array to convert.
"""
if isinstance(arr, np.ndarray):
return np.array
if is_tf_available() and is_tf_tensor(arr):
import tensorflow as tf
return tf.convert_to_tensor
if is_torch_available() and is_torch_tensor(arr):
import torch
return torch.tensor
if is_flax_available() and is_jax_tensor(arr):
import jax.numpy as jnp
return jnp.array
raise ValueError(f"Cannot convert arrays of type {type(arr)}")
def safe_squeeze(arr: np.ndarray, axis: Optional[int] = None) -> np.ndarray:
"""
Squeezes an array, but only if the axis specified has dim 1.
"""
if axis is None:
return arr.squeeze()
try:
return arr.squeeze(axis=axis)
except ValueError:
return arr
def normalize_annotation(annotation: Dict, image_size: Tuple[int, int]) -> Dict:
image_height, image_width = image_size
norm_annotation = {}
for key, value in annotation.items():
if key == "boxes":
boxes = value
boxes = corners_to_center_format(boxes)
boxes /= np.asarray([image_width, image_height, image_width, image_height], dtype=np.float32)
norm_annotation[key] = boxes
else:
norm_annotation[key] = value
return norm_annotation
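A minimal sketch of what this normalization does to a single box (toy values, assuming the helper above is in scope):
import numpy as np
# One box in corners format (x0, y0, x1, y1) on a 100x200 (height, width) image
annotation = {"boxes": np.array([[20.0, 10.0, 60.0, 50.0]]), "class_labels": np.array([3])}
normalized = normalize_annotation(annotation, image_size=(100, 200))
# Corners -> (center_x, center_y, width, height) = (40, 30, 40, 40), then divided by (W, H, W, H)
print(normalized["boxes"])  # [[0.2 0.3 0.2 0.4]]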
def max_across_indices(values: Iterable[Any]) -> List[Any]:
"""
Return the maximum value across all indices of an iterable of values.
"""
return [max(values_i) for values_i in zip(*values)]
def get_max_height_width(
images: List[np.ndarray], input_data_format: Optional[Union[str, ChannelDimension]] = None
) -> List[int]:
"""
Get the maximum height and width across all images in a batch.
"""
if input_data_format is None:
input_data_format = infer_channel_dimension_format(images[0])
if input_data_format == ChannelDimension.FIRST:
_, max_height, max_width = max_across_indices([img.shape for img in images])
elif input_data_format == ChannelDimension.LAST:
max_height, max_width, _ = max_across_indices([img.shape for img in images])
else:
raise ValueError(f"Invalid channel dimension format: {input_data_format}")
return (max_height, max_width)
def make_pixel_mask(
image: np.ndarray, output_size: Tuple[int, int], input_data_format: Optional[Union[str, ChannelDimension]] = None
) -> np.ndarray:
"""
Make a pixel mask for the image, where 1 indicates a valid pixel and 0 indicates padding.
Args:
image (`np.ndarray`):
Image to make the pixel mask for.
output_size (`Tuple[int, int]`):
Output size of the mask.
"""
input_height, input_width = get_image_size(image, channel_dim=input_data_format)
mask = np.zeros(output_size, dtype=np.int64)
mask[:input_height, :input_width] = 1
return mask
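For example (toy shapes, not from the original post), a 4x5 image placed on a 6x7 padded canvas yields a mask whose ones mark the real pixels:
import numpy as np
image = np.zeros((3, 4, 5))  # channels-first, height 4, width 5
mask = make_pixel_mask(image, output_size=(6, 7))
print(mask.shape)  # (6, 7)
print(mask.sum())  # 20 -- only the top-left 4x5 region is marked valid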
def convert_coco_poly_to_mask(segmentations, height: int, width: int) -> np.ndarray:
"""
Convert a COCO polygon annotation to a mask.
Args:
segmentations (`List[List[float]]`):
List of polygons, each polygon represented by a list of x-y coordinates.
height (`int`):
Height of the mask.
width (`int`):
Width of the mask.
"""
try:
from pycocotools import mask as coco_mask
except ImportError:
raise ImportError("Pycocotools is not installed in your environment.")
masks = []
for polygons in segmentations:
rles = coco_mask.frPyObjects(polygons, height, width)
mask = coco_mask.decode(rles)
if len(mask.shape) < 3:
mask = mask[..., None]
mask = np.asarray(mask, dtype=np.uint8)
mask = np.any(mask, axis=2)
masks.append(mask)
if masks:
masks = np.stack(masks, axis=0)
else:
masks = np.zeros((0, height, width), dtype=np.uint8)
return masks
def prepare_coco_detection_annotation(
image,
target,
return_segmentation_masks: bool = False,
input_data_format: Optional[Union[ChannelDimension, str]] = None,
):
"""
Convert a COCO-format target into the format expected by ConditionalDetr.
"""
image_height, image_width = get_image_size(image, channel_dim=input_data_format)
image_id = target["image_id"]
image_id = np.asarray([image_id], dtype=np.int64)
annotations = target["annotations"]
annotations = [obj for obj in annotations if "iscrowd" not in obj or obj["iscrowd"] == 0]
classes = [obj["category_id"] for obj in annotations]
classes = np.asarray(classes, dtype=np.int64)
area = np.asarray([obj["area"] for obj in annotations], dtype=np.float32)
iscrowd = np.asarray([obj["iscrowd"] if "iscrowd" in obj else 0 for obj in annotations], dtype=np.int64)
boxes = [obj["bbox"] for obj in annotations]
boxes = np.asarray(boxes, dtype=np.float32).reshape(-1, 4)
boxes[:, 2:] += boxes[:, :2]
boxes[:, 0::2] = boxes[:, 0::2].clip(min=0, max=image_width)
boxes[:, 1::2] = boxes[:, 1::2].clip(min=0, max=image_height)
keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0])
new_target = {}
new_target["image_id"] = image_id
new_target["class_labels"] = classes[keep]
new_target["boxes"] = boxes[keep]
new_target["area"] = area[keep]
new_target["iscrowd"] = iscrowd[keep]
new_target["orig_size"] = np.asarray([int(image_height), int(image_width)], dtype=np.int64)
if annotations and "keypoints" in annotations[0]:
keypoints = [obj["keypoints"] for obj in annotations]
keypoints = np.asarray(keypoints, dtype=np.float32)
keypoints = keypoints[keep]
num_keypoints = keypoints.shape[0]
keypoints = keypoints.reshape((-1, 3)) if num_keypoints else keypoints
new_target["keypoints"] = keypoints
if return_segmentation_masks:
segmentation_masks = [obj["segmentation"] for obj in annotations]
masks = convert_coco_poly_to_mask(segmentation_masks, image_height, image_width)
new_target["masks"] = masks[keep]
return new_target
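A hypothetical single-object COCO target, to show the XYWH-to-XYXY conversion and filtering above (toy values, not from the original post):
import numpy as np
image = np.zeros((3, 100, 150))  # dummy 100x150 image, channels-first
target = {
    "image_id": 1,
    "annotations": [
        {"category_id": 17, "bbox": [10.0, 20.0, 30.0, 40.0], "area": 1200.0, "iscrowd": 0},
    ],
}
new_target = prepare_coco_detection_annotation(image, target)
print(new_target["boxes"])         # [[10. 20. 40. 60.]] -- (x, y, w, h) -> (x0, y0, x1, y1), clipped to the image
print(new_target["class_labels"])  # [17]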
def masks_to_boxes(masks: np.ndarray) -> np.ndarray:
"""
Compute the bounding boxes around the provided panoptic segmentation masks.
Args:
masks: masks in format `[number_masks, height, width]` where number_masks is the number of masks
Returns:
boxes: bounding boxes in `[number_masks, 4]`, xyxy format
"""
if masks.size == 0:
return np.zeros((0, 4))
h, w = masks.shape[-2:]
y = np.arange(0, h, dtype=np.float32)
x = np.arange(0, w, dtype=np.float32)
y, x = np.meshgrid(y, x, indexing="ij")
x_mask = masks * np.expand_dims(x, axis=0)
x_max = x_mask.reshape(x_mask.shape[0], -1).max(-1)
x = np.ma.array(x_mask, mask=~(np.array(masks, dtype=bool)))
x_min = x.filled(fill_value=1e8)
x_min = x_min.reshape(x_min.shape[0], -1).min(-1)
y_mask = masks * np.expand_dims(y, axis=0)
y_max = y_mask.reshape(x_mask.shape[0], -1).max(-1)
y = np.ma.array(y_mask, mask=~(np.array(masks, dtype=bool)))
y_min = y.filled(fill_value=1e8)
y_min = y_min.reshape(y_min.shape[0], -1).min(-1)
return np.stack([x_min, y_min, x_max, y_max], 1)
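A small sanity check of the masked-array trick above (toy mask, values worked out by hand):
import numpy as np
mask = np.zeros((1, 5, 5), dtype=np.uint8)
mask[0, 1:4, 2:5] = 1  # a blob covering rows 1..3 and columns 2..4
print(masks_to_boxes(mask))  # [[2. 1. 4. 3.]] -- (x_min, y_min, x_max, y_max)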
def prepare_coco_panoptic_annotation(
image: np.ndarray,
target: Dict,
masks_path: Union[str, pathlib.Path],
return_masks: bool = True,
input_data_format: Union[ChannelDimension, str] = None,
) -> Dict:
"""
Prepare a coco panoptic annotation for ConditionalDetr.
"""
image_height, image_width = get_image_size(image, channel_dim=input_data_format)
annotation_path = pathlib.Path(masks_path) / target["file_name"]
new_target = {}
new_target["image_id"] = np.asarray([target["image_id"] if "image_id" in target else target["id"]], dtype=np.int64)
new_target["size"] = np.asarray([image_height, image_width], dtype=np.int64)
new_target["orig_size"] = np.asarray([image_height, image_width], dtype=np.int64)
if "segments_info" in target:
masks = np.asarray(PIL.Image.open(annotation_path), dtype=np.uint32)
masks = rgb_to_id(masks)
ids = np.array([segment_info["id"] for segment_info in target["segments_info"]])
masks = masks == ids[:, None, None]
masks = masks.astype(np.uint8)
if return_masks:
new_target["masks"] = masks
new_target["boxes"] = masks_to_boxes(masks)
new_target["class_labels"] = np.array(
[segment_info["category_id"] for segment_info in target["segments_info"]], dtype=np.int64
)
new_target["iscrowd"] = np.asarray(
[segment_info["iscrowd"] for segment_info in target["segments_info"]], dtype=np.int64
)
new_target["area"] = np.asarray(
[segment_info["area"] for segment_info in target["segments_info"]], dtype=np.float32
)
return new_target
def get_segmentation_image(
masks: np.ndarray, input_size: Tuple, target_size: Tuple, stuff_equiv_classes, deduplicate=False
):
h, w = input_size
final_h, final_w = target_size
m_id = scipy.special.softmax(masks.transpose(0, 1), -1)
if m_id.shape[-1] == 0:
m_id = np.zeros((h, w), dtype=np.int64)
else:
m_id = m_id.argmax(-1).reshape(h, w)
if deduplicate:
for equiv in stuff_equiv_classes.values():
for eq_id in equiv:
m_id[m_id == eq_id] = equiv[0]
seg_img = id_to_rgb(m_id)
seg_img = resize(seg_img, (final_w, final_h), resample=PILImageResampling.NEAREST)
return seg_img
def get_mask_area(seg_img: np.ndarray, target_size: Tuple[int, int], n_classes: int) -> np.ndarray:
final_h, final_w = target_size
np_seg_img = seg_img.astype(np.uint8)
np_seg_img = np_seg_img.reshape(final_h, final_w, 3)
m_id = rgb_to_id(np_seg_img)
area = [(m_id == i).sum() for i in range(n_classes)]
return area
def score_labels_from_class_probabilities(logits: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
probs = scipy.special.softmax(logits, axis=-1)
labels = probs.argmax(-1, keepdims=True)
scores = np.take_along_axis(probs, labels, axis=-1)
scores, labels = scores.squeeze(-1), labels.squeeze(-1)
return scores, labels
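Concretely (toy logits, scipy required as guarded above), the helper picks the argmax class and its softmax probability per query:
import numpy as np
logits = np.array([[2.0, 0.0, 0.0]])
scores, labels = score_labels_from_class_probabilities(logits)
print(labels)  # [0]
print(scores)  # [0.78698...] == exp(2) / (exp(2) + exp(0) + exp(0))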
def post_process_panoptic_sample(
out_logits: np.ndarray,
masks: np.ndarray,
boxes: np.ndarray,
processed_size: Tuple[int, int],
target_size: Tuple[int, int],
is_thing_map: Dict,
threshold=0.85,
) -> Dict:
"""
Converts the output of [`ConditionalDetrForSegmentation`] into panoptic segmentation predictions for a single sample.
Args:
out_logits (`torch.Tensor`):
The logits for this sample.
masks (`torch.Tensor`):
The predicted segmentation masks for this sample.
boxes (`torch.Tensor`):
The predicted bounding boxes for this sample. The boxes are in the normalized format `(center_x, center_y,
width, height)` with values in `[0, 1]`, relative to the size of the image (disregarding padding).
processed_size (`Tuple[int, int]`):
The processed size of the image `(height, width)`, as returned by the preprocessing step i.e. the size
after data augmentation but before batching.
target_size (`Tuple[int, int]`):
The target size of the image, `(height, width)` corresponding to the requested final size of the
prediction.
is_thing_map (`Dict`):
A dictionary mapping class indices to a boolean value indicating whether the class is a thing or not.
threshold (`float`, *optional*, defaults to 0.85):
The threshold used to binarize the segmentation masks.
"""
scores, labels = score_labels_from_class_probabilities(out_logits)
keep = (labels != out_logits.shape[-1] - 1) & (scores > threshold)
cur_scores = scores[keep]
cur_classes = labels[keep]
cur_boxes = center_to_corners_format(boxes[keep])
if len(cur_boxes) != len(cur_classes):
raise ValueError("Not as many boxes as there are classes")
cur_masks = masks[keep]
cur_masks = resize(cur_masks[:, None], processed_size, resample=PILImageResampling.BILINEAR)
cur_masks = safe_squeeze(cur_masks, 1)
b, h, w = cur_masks.shape
cur_masks = cur_masks.reshape(b, -1)
stuff_equiv_classes = defaultdict(list)
for k, label in enumerate(cur_classes):
if not is_thing_map[label]:
stuff_equiv_classes[label].append(k)
seg_img = get_segmentation_image(cur_masks, processed_size, target_size, stuff_equiv_classes, deduplicate=True)
area = get_mask_area(cur_masks, processed_size, n_classes=len(cur_scores))
if cur_classes.size > 0:  # `size` is a numpy attribute, not a method
filtered_small = np.array([a <= 4 for a in area], dtype=bool)
while filtered_small.any():
cur_masks = cur_masks[~filtered_small]
cur_scores = cur_scores[~filtered_small]
cur_classes = cur_classes[~filtered_small]
seg_img = get_segmentation_image(cur_masks, (h, w), target_size, stuff_equiv_classes, deduplicate=True)
area = get_mask_area(seg_img, target_size, n_classes=len(cur_scores))
filtered_small = np.array([a <= 4 for a in area], dtype=bool)
else:
cur_classes = np.ones((1, 1), dtype=np.int64)
segments_info = [
{"id": i, "isthing": is_thing_map[cat], "category_id": int(cat), "area": a}
for i, (cat, a) in enumerate(zip(cur_classes, area))
]
del cur_classes
with io.BytesIO() as out:
PIL.Image.fromarray(seg_img).save(out, format="PNG")
predictions = {"png_string": out.getvalue(), "segments_info": segments_info}
return predictions
def resize_annotation(
annotation: Dict[str, Any],
orig_size: Tuple[int, int],
target_size: Tuple[int, int],
threshold: float = 0.5,
resample: PILImageResampling = PILImageResampling.NEAREST,
):
"""
Resizes an annotation to a target size.
Args:
annotation (`Dict[str, Any]`):
The annotation dictionary.
orig_size (`Tuple[int, int]`):
The original size of the input image.
target_size (`Tuple[int, int]`):
The target size of the image, as returned by the preprocessing `resize` step.
threshold (`float`, *optional*, defaults to 0.5):
The threshold used to binarize the segmentation masks.
resample (`PILImageResampling`, defaults to `PILImageResampling.NEAREST`):
The resampling filter to use when resizing the masks.
"""
ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(target_size, orig_size))
ratio_height, ratio_width = ratios
new_annotation = {}
new_annotation["size"] = target_size
for key, value in annotation.items():
if key == "boxes":
boxes = value
scaled_boxes = boxes * np.asarray([ratio_width, ratio_height, ratio_width, ratio_height], dtype=np.float32)
new_annotation["boxes"] = scaled_boxes
elif key == "area":
area = value
scaled_area = area * (ratio_width * ratio_height)
new_annotation["area"] = scaled_area
elif key == "masks":
masks = value[:, None]
masks = np.array([resize(mask, target_size, resample=resample) for mask in masks])
masks = masks.astype(np.float32)
masks = masks[:, 0] > threshold
new_annotation["masks"] = masks
elif key == "size":
new_annotation["size"] = target_size
else:
new_annotation[key] = value
return new_annotation
def binary_mask_to_rle(mask):
"""
Converts given binary mask of shape `(height, width)` to the run-length encoding (RLE) format.
Args:
mask (`torch.Tensor` or `numpy.array`):
A binary mask tensor of shape `(height, width)` where 0 denotes background and 1 denotes the target
segment_id or class_id.
Returns:
`List`: Run-length encoded list of the binary mask. Refer to COCO API for more information about the RLE
format.
"""
if is_torch_tensor(mask):
mask = mask.numpy()
pixels = mask.flatten()
pixels = np.concatenate([[0], pixels, [0]])
runs = np.where(pixels[1:] != pixels[:-1])[0] + 1
runs[1::2] -= runs[::2]
return list(runs)
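To see the encoding format (a minimal example, values worked out by hand): each pair is a 1-indexed start offset into the flattened mask followed by the run length:
import numpy as np
mask = np.array([[0, 1, 1, 0],
                 [0, 1, 1, 0]])
print(binary_mask_to_rle(mask))  # [2, 2, 6, 2]: a run of two ones at position 2, another at position 6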
def convert_segmentation_to_rle(segmentation):
"""
# 获取分割图中唯一的分割标识符(segmentation id)
segment_ids = torch.unique(segmentation)
# 初始化用于存储所有分割标识符的运行长度编码的列表
run_length_encodings = []
# 遍历每个分割标识符
for idx in segment_ids:
# 创建一个掩码,其中分割图中与当前标识符相同的位置为1,否则为0
mask = torch.where(segmentation == idx, 1, 0)
# 将二进制掩码转换为运行长度编码(RLE)
rle = binary_mask_to_rle(mask)
# 将当前标识符的运行长度编码添加到结果列表中
run_length_encodings.append(rle)
# 返回所有分割标识符的运行长度编码列表
return run_length_encodings
# Copied from transformers.models.detr.image_processing_detr.compute_segments
def compute_segments(
mask_probs,
pred_scores,
pred_labels,
mask_threshold: float = 0.5,
overlap_mask_area_threshold: float = 0.8,
label_ids_to_fuse: Optional[Set[int]] = None,
target_size: Tuple[int, int] = None,
):
# Compute the output height and width from the target size, falling back to the mask shape
height = mask_probs.shape[1] if target_size is None else target_size[0]
width = mask_probs.shape[2] if target_size is None else target_size[1]
# Allocate an empty segmentation map of the target image size; each pixel will hold a segment id
segmentation = torch.zeros((height, width), dtype=torch.int32, device=mask_probs.device)
# List holding the metadata of every segment that gets created
segments: List[Dict] = []
# If a target size is given, interpolate the masks to that size
if target_size is not None:
mask_probs = nn.functional.interpolate(
mask_probs.unsqueeze(0), size=target_size, mode="bilinear", align_corners=False
)[0]
# Id of the segment currently being written
current_segment_id = 0
# Weight each mask by its predicted score
mask_probs *= pred_scores.view(-1, 1, 1)
# For each pixel, find the query whose weighted mask probability is highest
mask_labels = mask_probs.argmax(0)  # [height, width]
# Track the segment id assigned to each "stuff" class, so later instances of it can be fused
stuff_memory_list: Dict[str, int] = {}
# Iterate over the predictions
for k in range(pred_labels.shape[0]):
# Integer class id of the current prediction
pred_class = pred_labels[k].item()
# Whether this class should be fused with earlier segments of the same class
should_fuse = pred_class in label_ids_to_fuse
# Check that the mask at this index exists and is large enough to be a segment
mask_exists, mask_k = check_segment_validity(
mask_labels, mask_probs, k, mask_threshold, overlap_mask_area_threshold
)
# Only keep valid masks
if mask_exists:
# Reuse the stored segment id if this "stuff" class was already seen
if pred_class in stuff_memory_list:
current_segment_id = stuff_memory_list[pred_class]
else:
# Otherwise start a new segment
current_segment_id += 1
# Write the current segment into the final segmentation map
segmentation[mask_k] = current_segment_id
# Round the segment's prediction score to six decimals
segment_score = round(pred_scores[k].item(), 6)
# Record the segment's metadata
segments.append(
{
"id": current_segment_id,
"label_id": pred_class,
"was_fused": should_fuse,
"score": segment_score,
}
)
# If fusing, remember the segment id assigned to this class
if should_fuse:
stuff_memory_list[pred_class] = current_segment_id
# Return the final segmentation map together with the segment metadata
return segmentation, segments
class ConditionalDetrImageProcessor(BaseImageProcessor):
r"""
Constructs a Conditional Detr image processor.
Args:
format (`str`, *optional*, defaults to `"coco_detection"`):
Data format of the annotations. One of "coco_detection" or "coco_panoptic".
do_resize (`bool`, *optional*, defaults to `True`):
Controls whether to resize the image's (height, width) dimensions to the specified `size`. Can be
overridden by the `do_resize` parameter in the `preprocess` method.
size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 800, "longest_edge": 1333}`):
Size of the image's (height, width) dimensions after resizing. Can be overridden by the `size` parameter in
the `preprocess` method.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
Resampling filter to use if resizing the image.
do_rescale (`bool`, *optional*, defaults to `True`):
Controls whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the
`do_rescale` parameter in the `preprocess` method.
rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the
`preprocess` method.
do_normalize (`bool`, *optional*, defaults to `True`):
Controls whether to normalize the image. Can be overridden by the `do_normalize` parameter in the
`preprocess` method.
image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_MEAN`):
Mean values to use when normalizing the image. Can be a single value or a list of values, one for each
channel. Can be overridden by the `image_mean` parameter in the `preprocess` method.
image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_STD`):
Standard deviation values to use when normalizing the image. Can be a single value or a list of values, one
for each channel. Can be overridden by the `image_std` parameter in the `preprocess` method.
do_convert_annotations (`bool`, *optional*, defaults to `True`):
Controls whether to convert the annotations to the format expected by the DETR model. Converts the
bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
do_pad (`bool`, *optional*, defaults to `True`):
Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess`
method. If `True` will pad the images in the batch to the largest height and width in the batch.
Padding will be applied to the bottom and right of the image with zeros.
"""
model_input_names = ["pixel_values", "pixel_mask"]
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.__init__
# Constructor of the image processor
def __init__(
self,
format: Union[str, AnnotationFormat] = AnnotationFormat.COCO_DETECTION,
do_resize: bool = True,
size: Dict[str, int] = None,
resample: PILImageResampling = PILImageResampling.BILINEAR,
do_rescale: bool = True,
rescale_factor: Union[int, float] = 1 / 255,
do_normalize: bool = True,
image_mean: Union[float, List[float]] = None,
image_std: Union[float, List[float]] = None,
do_convert_annotations: Optional[bool] = None,
do_pad: bool = True,
**kwargs,
) -> None:
# If "pad_and_return_pixel_mask" is passed, use it as do_pad and drop it from kwargs
if "pad_and_return_pixel_mask" in kwargs:
do_pad = kwargs.pop("pad_and_return_pixel_mask")
# If "max_size" is passed, warn that it is deprecated and pop it; `size['longest_edge']` should be used instead
if "max_size" in kwargs:
logger.warning_once(
"The `max_size` parameter is deprecated and will be removed in v4.26. "
"Please specify in `size['longest_edge'] instead`.",
)
max_size = kwargs.pop("max_size")
else:
max_size = None if size is None else 1333
# Fall back to the default size dict when size is None
size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333}
# Resolve the final size dict from size and max_size
size = get_size_dict(size, max_size=max_size, default_to_square=False)
# Forward the remaining kwargs to the parent constructor
super().__init__(**kwargs)
# Store the configuration attributes
self.format = format
self.do_resize = do_resize
self.size = size
self.resample = resample
self.do_rescale = do_rescale
self.rescale_factor = rescale_factor
self.do_normalize = do_normalize
self.do_convert_annotations = do_convert_annotations
self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
self.do_pad = do_pad
# Keys that are valid to pass to `preprocess`, used for argument validation
self._valid_processor_keys = [
"images",
"annotations",
"return_segmentation_masks",
"masks_path",
"do_resize",
"size",
"resample",
"do_rescale",
"rescale_factor",
"do_normalize",
"do_convert_annotations",
"image_mean",
"image_std",
"do_pad",
"format",
"return_tensors",
"data_format",
"input_data_format",
]
@classmethod
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.from_dict with Detr->ConditionalDetr
# Overrides the base `from_dict` so that parameters are updated when the image processor is created via
# `from_dict`, e.g. `ConditionalDetrImageProcessor.from_pretrained(checkpoint, size=600, max_size=800)`
def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs):
# Copy the incoming dict so the original is not mutated
image_processor_dict = image_processor_dict.copy()
# Move a "max_size" kwarg into the dict
if "max_size" in kwargs:
image_processor_dict["max_size"] = kwargs.pop("max_size")
# Move a "pad_and_return_pixel_mask" kwarg into the dict
if "pad_and_return_pixel_mask" in kwargs:
image_processor_dict["pad_and_return_pixel_mask"] = kwargs.pop("pad_and_return_pixel_mask")
# Delegate to the parent `from_dict` with the updated dict and any remaining kwargs
return super().from_dict(image_processor_dict, **kwargs)
# Copied from the DETR source: prepares an image annotation for feeding into the ConditionalDetr model
def prepare_annotation(
self,
image: np.ndarray,
target: Dict,
format: Optional[AnnotationFormat] = None,
return_segmentation_masks: bool = None,
masks_path: Optional[Union[str, pathlib.Path]] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> Dict:
"""
Prepare an annotation for feeding into ConditionalDetr model.
"""
# Use the processor's stored format when none is given
format = format if format is not None else self.format
if format == AnnotationFormat.COCO_DETECTION:
# Segmentation masks default to False for detection annotations
return_segmentation_masks = False if return_segmentation_masks is None else return_segmentation_masks
# Prepare a COCO detection-format annotation
target = prepare_coco_detection_annotation(
image, target, return_segmentation_masks, input_data_format=input_data_format
)
elif format == AnnotationFormat.COCO_PANOPTIC:
# Segmentation masks default to True for panoptic annotations
return_segmentation_masks = True if return_segmentation_masks is None else return_segmentation_masks
# Prepare a COCO panoptic-format annotation
target = prepare_coco_panoptic_annotation(
image,
target,
masks_path=masks_path,
return_masks=return_segmentation_masks,
input_data_format=input_data_format,
)
else:
# Any other format is unsupported
raise ValueError(f"Format {format} is not supported.")
# Return the processed target dict
return target
# Copied from the DETR source: prepares inputs by delegating annotation handling to `prepare_annotation`
def prepare(self, image, target, return_segmentation_masks=None, masks_path=None):
# Warn once that `prepare` is deprecated and will be removed in v4.33
logger.warning_once(
"The `prepare` method is deprecated and will be removed in a v4.33. "
"Please use `prepare_annotation` instead. Note: the `prepare_annotation` method "
"does not return the image anymore.",
)
# Process the target via `prepare_annotation` and return both the image and the target
target = self.prepare_annotation(image, target, return_segmentation_masks, masks_path, self.format)
return image, target
# Copied from the DETR source; deprecated wrapper around the module-level `convert_coco_poly_to_mask`
def convert_coco_poly_to_mask(self, *args, **kwargs):
logger.warning_once("The `convert_coco_poly_to_mask` method is deprecated and will be removed in v4.33. ")
# Delegate to the module-level function of the same name
return convert_coco_poly_to_mask(*args, **kwargs)
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.prepare_coco_detection with DETR->ConditionalDetr
def prepare_coco_detection(self, *args, **kwargs):
logger.warning_once("The `prepare_coco_detection` method is deprecated and will be removed in v4.33. ")
# Delegate to `prepare_coco_detection_annotation`
return prepare_coco_detection_annotation(*args, **kwargs)
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.prepare_coco_panoptic
def prepare_coco_panoptic(self, *args, **kwargs):
logger.warning_once("The `prepare_coco_panoptic` method is deprecated and will be removed in v4.33. ")
# Delegate to `prepare_coco_panoptic_annotation`
return prepare_coco_panoptic_annotation(*args, **kwargs)
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.resize
def resize(
self,
image: np.ndarray,
size: Dict[str, int],
resample: PILImageResampling = PILImageResampling.BILINEAR,
data_format: Optional[ChannelDimension] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
) -> np.ndarray:
"""
Resize the image to the given size. Size can be `min_size` (scalar) or `(height, width)` tuple. If size is an
int, smaller edge of the image will be matched to this number.
Args:
image (`np.ndarray`):
Image to resize.
size (`Dict[str, int]`):
Dictionary containing the size to resize to. Can contain the keys `shortest_edge` and `longest_edge` or
`height` and `width`.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
Resampling filter to use if resizing the image.
data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format for the output image. If unset, the channel dimension format of the input
image is used.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred.
"""
# Warn if the deprecated `max_size` kwarg is passed
if "max_size" in kwargs:
logger.warning_once(
"The `max_size` parameter is deprecated and will be removed in v4.26. "
"Please specify in `size['longest_edge'] instead`.",
)
# Pop `max_size` from kwargs
max_size = kwargs.pop("max_size")
else:
max_size = None
# Resolve the size dict from `size` and `max_size`
size = get_size_dict(size, max_size=max_size, default_to_square=False)
if "shortest_edge" in size and "longest_edge" in size:
# Compute the output size that keeps the aspect ratio within the shortest/longest edge bounds
size = get_resize_output_image_size(
image, size["shortest_edge"], size["longest_edge"], input_data_format=input_data_format
)
elif "height" in size and "width" in size:
# An explicit (height, width) pair is used as-is
size = (size["height"], size["width"])
else:
# Neither pair of keys is present: raise
raise ValueError(
"Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got"
f" {size.keys()}."
)
# Resize the image to the resolved target size
image = resize(
image, size=size, resample=resample, data_format=data_format, input_data_format=input_data_format, **kwargs
)
# Return the resized image
return image
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.resize_annotation
def resize_annotation(
self,
annotation,
orig_size,
size,
resample: PILImageResampling = PILImageResampling.NEAREST,
) -> Dict:
"""
Resize the annotation to match the resized image. If size is an int, smaller edge of the mask will be matched
to this number.
"""
# Delegate to the module-level `resize_annotation` helper
return resize_annotation(annotation, orig_size=orig_size, target_size=size, resample=resample)
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.rescale
# Rescale the image by a given factor, i.e. image = image * rescale_factor
def rescale(
self,
image: np.ndarray,
rescale_factor: float,
data_format: Optional[Union[str, ChannelDimension]] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> np.ndarray:
"""
Rescale the image by the given factor. image = image * rescale_factor.
Args:
image (`np.ndarray`):
Image to rescale.
rescale_factor (`float`):
The value to use for rescaling.
data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format for the output image. If unset, the channel dimension format of the input
image is used. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
input_data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format for the input image. If unset, is inferred from the input image. Can be
one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
"""
# Delegate to the module-level `rescale` function
return rescale(image, rescale_factor, data_format=data_format, input_data_format=input_data_format)
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.normalize_annotation
def normalize_annotation(self, annotation: Dict, image_size: Tuple[int, int]) -> Dict:
"""
Normalize the boxes in the annotation from `[top_left_x, top_left_y, bottom_right_x, bottom_right_y]` to
`[center_x, center_y, width, height]` format and from absolute to relative pixel values.
"""
# Delegate to the module-level `normalize_annotation` function
return normalize_annotation(annotation, image_size=image_size)
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._update_annotation_for_padded_image
def _update_annotation_for_padded_image(
self,
annotation: Dict,
input_image_size: Tuple[int, int],
output_image_size: Tuple[int, int],
padding,
update_bboxes,
) -> Dict:
"""
Update the annotation to account for padding in the image.
Args:
annotation (`Dict`):
Dictionary containing annotations.
input_image_size (`Tuple[int, int]`):
Size of the input image (height, width) before padding.
output_image_size (`Tuple[int, int]`):
Size of the output image (height, width) after padding.
padding:
The padding applied to the image.
update_bboxes (`bool`):
Whether to rescale the bounding boxes to the padded image.
"""
# Start from a fresh annotation dict
new_annotation = {}
# Record the padded image size
new_annotation["size"] = output_image_size
for key, value in annotation.items():
if key == "masks":
# Pad the masks to the padded image size
masks = value
masks = pad(
masks,
padding,
mode=PaddingMode.CONSTANT,
constant_values=0,
input_data_format=ChannelDimension.FIRST,
)
# Squeeze away the singleton dimension introduced by padding
masks = safe_squeeze(masks, 1)
new_annotation["masks"] = masks
elif key == "boxes" and update_bboxes:
# Rescale the normalized boxes by the input/output size ratio so they stay aligned after padding
boxes = value
boxes *= np.asarray(
[
input_image_size[1] / output_image_size[1],
input_image_size[0] / output_image_size[0],
input_image_size[1] / output_image_size[1],
input_image_size[0] / output_image_size[0],
]
)
new_annotation["boxes"] = boxes
elif key == "size":
# Overwrite the stored size with the padded image size
new_annotation["size"] = output_image_size
else:
# Copy everything else unchanged
new_annotation[key] = value
# Return the updated annotation
return new_annotation
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._pad_image
def _pad_image(
self,
image: np.ndarray,
output_size: Tuple[int, int],
annotation: Optional[Dict[str, Any]] = None,
constant_values: Union[float, Iterable[float]] = 0,
data_format: Optional[ChannelDimension] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
update_bboxes: bool = True,
) -> np.ndarray:
"""
Pad an image with zeros to the given size.
"""
# Input image height and width
input_height, input_width = get_image_size(image, channel_dim=input_data_format)
# Target height and width
output_height, output_width = output_size
# Number of pixels to pad at the bottom and on the right
pad_bottom = output_height - input_height
pad_right = output_width - input_width
# Padding is applied only to the bottom and right, as documented for `do_pad`
padding = ((0, pad_bottom), (0, pad_right))
padded_image = pad(
image,
padding,
mode=PaddingMode.CONSTANT,
constant_values=constant_values,
data_format=data_format,
input_data_format=input_data_format,
)
# If an annotation was given, update it to match the padded image
if annotation is not None:
annotation = self._update_annotation_for_padded_image(
annotation, (input_height, input_width), (output_height, output_width), padding, update_bboxes
)
# Return the padded image and the (possibly updated) annotation
return padded_image, annotation
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.pad
# Pads a batch of images (and optionally their annotations) to the largest height and width in the batch
def pad(
self,
images: List[np.ndarray],
annotations: Optional[Union[AnnotationType, List[AnnotationType]]] = None,
constant_values: Union[float, Iterable[float]] = 0,
return_pixel_mask: bool = True,
return_tensors: Optional[Union[str, TensorType]] = None,
data_format: Optional[ChannelDimension] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
update_bboxes: bool = True,
) -> BatchFeature:
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.preprocess
def preprocess(
self,
images: ImageInput,
annotations: Optional[Union[AnnotationType, List[AnnotationType]]] = None,
return_segmentation_masks: bool = None,
masks_path: Optional[Union[str, pathlib.Path]] = None,
do_resize: Optional[bool] = None,
size: Optional[Dict[str, int]] = None,
resample=None,  # PILImageResampling
do_rescale: Optional[bool] = None,
rescale_factor: Optional[Union[int, float]] = None,
do_normalize: Optional[bool] = None,
do_convert_annotations: Optional[bool] = None,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
do_pad: Optional[bool] = None,
format: Optional[Union[str, AnnotationFormat]] = None,
return_tensors: Optional[Union[TensorType, str]] = None,
data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
) -> BatchFeature:
# Post-processing methods - TODO: add support for other frameworks
# Converts the model output to Pascal VOC format (xmin, ymin, xmax, ymax); deprecated in favor of `post_process_object_detection`
def post_process(self, outputs, target_sizes):
"""
Converts the output of [`ConditionalDetrForObjectDetection`] into the format expected by the Pascal VOC format (xmin, ymin, xmax, ymax).
Only supports PyTorch.
Args:
outputs ([`ConditionalDetrObjectDetectionOutput`]):
Raw outputs of the model.
target_sizes (`torch.Tensor` of shape `(batch_size, 2)`):
Tensor containing the size (h, w) of each image of the batch. For evaluation, this must be the original
image size (before any data augmentation). For visualization, this should be the image size after data
augment, but before padding.
Returns:
`List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
in the batch as predicted by the model.
"""
# Warn that this method will be removed in Transformers v5; use `post_process_object_detection` instead
logger.warning_once(
"`post_process` is deprecated and will be removed in v5 of Transformers, please use"
" `post_process_object_detection` instead, with `threshold=0.` for equivalent results.",
)
# Extract the class logits and predicted boxes from the model output
out_logits, out_bbox = outputs.logits, outputs.pred_boxes
# There must be exactly one target size per batch element
if len(out_logits) != len(target_sizes):
raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits")
# Each target size must be a (height, width) pair
if target_sizes.shape[1] != 2:
raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch")
# Turn the logits into per-class probabilities with a sigmoid
prob = out_logits.sigmoid()
# Take the 300 highest-scoring predictions over all queries and classes
topk_values, topk_indexes = torch.topk(prob.view(out_logits.shape[0], -1), 300, dim=1)
scores = topk_values
# Recover the query index of each top-k prediction
topk_boxes = torch.div(topk_indexes, out_logits.shape[2], rounding_mode="floor")
# Recover the class label of each top-k prediction
labels = topk_indexes % out_logits.shape[2]
# Convert boxes from center format to corner format
boxes = center_to_corners_format(out_bbox)
# Gather the boxes belonging to the top-k predictions
boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4))
# Convert relative [0, 1] coordinates to absolute [0, height] / [0, width] coordinates
img_h, img_w = target_sizes.unbind(1)
scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1)
boxes = boxes * scale_fct[:, None, :]
# Collect one dict of scores, labels and boxes per image in the batch
results = [{"scores": s, "labels": l, "boxes": b} for s, l, b in zip(scores, labels, boxes)]
# Return the list of results
return results
# Copied from transformers.models.deformable_detr.image_processing_deformable_detr.DeformableDetrImageProcessor.post_process_object_detection with DeformableDetr->ConditionalDetr
def post_process_object_detection(
self, outputs, threshold: float = 0.5, target_sizes: Union[TensorType, List[Tuple]] = None, top_k: int = 100
):
"""
Converts the raw output of [`ConditionalDetrForObjectDetection`] into final bounding boxes in (top_left_x,
top_left_y, bottom_right_x, bottom_right_y) format. Only supports PyTorch.
Args:
outputs ([`DetrObjectDetectionOutput`]):
Raw outputs of the model.
threshold (`float`, *optional*):
Score threshold to keep object detection predictions.
target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*):
Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size
(height, width) of each image in the batch. If left to None, predictions will not be resized.
top_k (`int`, *optional*, defaults to 100):
Keep only top k bounding boxes before filtering by thresholding.
Returns:
`List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
in the batch as predicted by the model.
"""
# Extract logits and bounding boxes from the model's outputs
out_logits, out_bbox = outputs.logits, outputs.pred_boxes
# Verify if target_sizes are provided and match the batch dimension of logits
if target_sizes is not None:
if len(out_logits) != len(target_sizes):
raise ValueError(
"Make sure that you pass in as many target sizes as the batch dimension of the logits"
)
# Apply sigmoid activation to logits to obtain probabilities
prob = out_logits.sigmoid()
# Flatten the probabilities over all queries and classes: (batch_size, num_queries * num_classes)
prob = prob.view(out_logits.shape[0], -1)
# Determine the number of top-k predictions to consider
k_value = min(top_k, prob.size(1))
# Extract top-k values and their indices along the second dimension
topk_values, topk_indexes = torch.topk(prob, k_value, dim=1)
# Scores correspond to the top-k values
scores = topk_values
# Convert top-k indexes to top-k boxes in relative [0, 1] coordinates
topk_boxes = torch.div(topk_indexes, out_logits.shape[2], rounding_mode="floor")
# Extract labels from top-k indexes
labels = topk_indexes % out_logits.shape[2]
# Convert predicted boxes from center-offset format to (x1, y1, x2, y2) format
boxes = center_to_corners_format(out_bbox)
# Gather top-k boxes based on top-k indexes
boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4))
# Convert relative [0, 1] coordinates to absolute [0, height] coordinates if target_sizes are provided
if target_sizes is not None:
if isinstance(target_sizes, list):
# If target_sizes is a list, extract heights and widths
img_h = torch.Tensor([i[0] for i in target_sizes])
img_w = torch.Tensor([i[1] for i in target_sizes])
else:
# If target_sizes is a tensor, unbind heights and widths
img_h, img_w = target_sizes.unbind(1)
# Stack widths and heights and scale boxes accordingly
scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device)
boxes = boxes * scale_fct[:, None, :]
# Filter predictions based on the score threshold and construct result dictionaries
results = []
for s, l, b in zip(scores, labels, boxes):
score = s[s > threshold]
label = l[s > threshold]
box = b[s > threshold]
results.append({"scores": score, "labels": label, "boxes": box})
return results
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.post_process_semantic_segmentation with Detr->ConditionalDetr
def post_process_semantic_segmentation(self, outputs, target_sizes: List[Tuple[int, int]] = None):
"""
Converts the output of [`ConditionalDetrForSegmentation`] into semantic segmentation maps. Only supports PyTorch.
Args:
outputs ([`ConditionalDetrForSegmentation`]):
Raw outputs of the model.
target_sizes (`List[Tuple[int, int]]`, *optional*):
A list of tuples (`Tuple[int, int]`) containing the target size (height, width) of each image in the
batch. If unset, predictions will not be resized.
Returns:
`List[torch.Tensor]`:
A list of length `batch_size`, where each item is a semantic segmentation map of shape (height, width)
corresponding to the target_sizes entry (if `target_sizes` is specified). Each entry of each
`torch.Tensor` correspond to a semantic class id.
"""
# Extract class logits from model outputs [batch_size, num_queries, num_classes+1]
class_queries_logits = outputs.logits
# Extract mask logits from model outputs [batch_size, num_queries, height, width]
masks_queries_logits = outputs.pred_masks
# Remove the null class from class logits using softmax, leaving out the last dimension (background class)
masks_classes = class_queries_logits.softmax(dim=-1)[..., :-1]
# Apply sigmoid to mask logits to get probabilities [batch_size, num_queries, height, width]
masks_probs = masks_queries_logits.sigmoid()
# Compute semantic segmentation logits by combining class and mask probabilities [batch_size, num_classes, height, width]
segmentation = torch.einsum("bqc, bqhw -> bchw", masks_classes, masks_probs)
batch_size = class_queries_logits.shape[0]
# Resize logits and compute semantic segmentation maps if target_sizes are provided
if target_sizes is not None:
# Ensure that the number of target sizes matches the batch size
if batch_size != len(target_sizes):
raise ValueError(
"Make sure that you pass in as many target sizes as the batch dimension of the logits"
)
semantic_segmentation = []
# Iterate over each image in the batch
for idx in range(batch_size):
# Resize logits to match target size using bilinear interpolation
resized_logits = nn.functional.interpolate(
segmentation[idx].unsqueeze(dim=0), size=target_sizes[idx], mode="bilinear", align_corners=False
)
# Extract semantic segmentation map by taking the argmax along the channel dimension
semantic_map = resized_logits[0].argmax(dim=0)
semantic_segmentation.append(semantic_map)
else:
# If target_sizes are not provided, compute semantic segmentation by taking argmax along the channel dimension
semantic_segmentation = segmentation.argmax(dim=1)
semantic_segmentation = [semantic_segmentation[i] for i in range(semantic_segmentation.shape[0])]
return semantic_segmentation
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.post_process_instance_segmentation with Detr->ConditionalDetr
def post_process_instance_segmentation(
self,
outputs,
threshold: float = 0.5,
mask_threshold: float = 0.5,
overlap_mask_area_threshold: float = 0.8,
target_sizes: Optional[List[Tuple[int, int]]] = None,
return_coco_annotation: Optional[bool] = False,
) -> List[Dict]:
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.post_process_panoptic_segmentation with Detr->ConditionalDetr
def post_process_panoptic_segmentation(
self,
outputs,
threshold: float = 0.5,
mask_threshold: float = 0.5,
overlap_mask_area_threshold: float = 0.8,
label_ids_to_fuse: Optional[Set[int]] = None,
target_sizes: Optional[List[Tuple[int, int]]] = None,
) -> List[Dict]:
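Before moving on to the modeling file, here is a minimal end-to-end usage sketch of this processor (the checkpoint name comes from the constants documented below; the image URL is the usual COCO sample, both assumptions of this illustration):
import requests
import torch
from PIL import Image
from transformers import AutoModelForObjectDetection, ConditionalDetrImageProcessor

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

processor = ConditionalDetrImageProcessor.from_pretrained("microsoft/conditional-detr-resnet-50")
model = AutoModelForObjectDetection.from_pretrained("microsoft/conditional-detr-resnet-50")

inputs = processor(images=image, return_tensors="pt")  # pixel_values + pixel_mask
with torch.no_grad():
    outputs = model(**inputs)

# Map the raw predictions back to absolute (x0, y0, x1, y1) boxes on the original image
target_sizes = torch.tensor([image.size[::-1]])  # PIL size is (width, height)
results = processor.post_process_object_detection(outputs, threshold=0.5, target_sizes=target_sizes)[0]
print(results["scores"].shape, results["labels"], results["boxes"])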
.\models\conditional_detr\modeling_conditional_detr.py
""" PyTorch Conditional DETR model."""
import math
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple, Union
import torch
from torch import Tensor, nn
from ...activations import ACT2FN
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithCrossAttentions, Seq2SeqModelOutput
from ...modeling_utils import PreTrainedModel
from ...utils import (
ModelOutput,
add_start_docstrings,
add_start_docstrings_to_model_forward,
is_accelerate_available,
is_scipy_available,
is_timm_available,
is_vision_available,
logging,
replace_return_docstrings,
requires_backends,
)
from ...utils.backbone_utils import load_backbone
from .configuration_conditional_detr import ConditionalDetrConfig
if is_accelerate_available():
from accelerate import PartialState
from accelerate.utils import reduce
if is_scipy_available():
from scipy.optimize import linear_sum_assignment
if is_timm_available():
from timm import create_model
if is_vision_available():
from ...image_transforms import center_to_corners_format
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "ConditionalDetrConfig"
_CHECKPOINT_FOR_DOC = "microsoft/conditional-detr-resnet-50"
CONDITIONAL_DETR_PRETRAINED_MODEL_ARCHIVE_LIST = [
"microsoft/conditional-detr-resnet-50",
]
@dataclass
class ConditionalDetrDecoderOutput(BaseModelOutputWithCrossAttentions):
"""
Base class for outputs of the Conditional DETR decoder. This class adds one attribute to
BaseModelOutputWithCrossAttentions, namely an optional stack of intermediate decoder activations, i.e. the output
of each decoder layer, each of them gone through a layernorm. This is useful when training the model with auxiliary
decoding losses.
"""
pass
Args:
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
最后一个模型层的隐藏状态序列输出。
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
包含模型每一层隐藏状态的元组,形状为 `(batch_size, sequence_length, hidden_size)`。
当 `output_hidden_states=True` 或 `config.output_hidden_states=True` 时返回。
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
注意力权重的元组,每个元素对应每一层的注意力权重。
形状为 `(batch_size, num_heads, sequence_length, sequence_length)`。
在自注意力头中用于计算加权平均后返回,当 `output_attentions=True` 或 `config.output_attentions=True` 时返回。
cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`):
交叉注意力层的注意力权重元组,每个元素对应解码器交叉注意力层的注意力权重。
形状为 `(batch_size, num_heads, sequence_length, sequence_length)`。
在交叉注意力头中用于计算加权平均后返回,当同时设置了 `output_attentions=True` 和 `config.add_cross_attention=True` 或者 `config.output_attentions=True` 时返回。
intermediate_hidden_states (`torch.FloatTensor` of shape `(config.decoder_layers, batch_size, num_queries, hidden_size)`, *optional*, returned when `config.auxiliary_loss=True`):
中间解码器激活状态,即每个解码器层的输出,经过层归一化后的结果。
形状为 `(config.decoder_layers, batch_size, num_queries, hidden_size)`。
当设置了 `config.auxiliary_loss=True` 时返回。
@dataclass
class ConditionalDetrModelOutput(Seq2SeqModelOutput):
"""
ConditionalDetr 模型的输出基类。添加了一个额外的属性 intermediate_hidden_states,
可选地包含中间解码器激活的堆栈,即每个解码器层的输出,经过 layernorm 处理。
在训练模型时使用辅助解码损失时非常有用。
"""
intermediate_hidden_states: Optional[torch.FloatTensor] = None
reference_points: Optional[Tuple[torch.FloatTensor]] = None
@dataclass
class ConditionalDetrObjectDetectionOutput(ModelOutput):
"""
Output type of [`ConditionalDetrForObjectDetection`].
"""
loss: Optional[torch.FloatTensor] = None
loss_dict: Optional[Dict] = None
logits: torch.FloatTensor = None
pred_boxes: torch.FloatTensor = None
auxiliary_outputs: Optional[List[Dict]] = None
last_hidden_state: Optional[torch.FloatTensor] = None
decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
encoder_last_hidden_state: Optional[torch.FloatTensor] = None
encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
@dataclass
class ConditionalDetrSegmentationOutput(ModelOutput):
"""
Output type of [`ConditionalDetrForSegmentation`].
"""
loss: Optional[torch.FloatTensor] = None
loss_dict: Optional[Dict] = None
logits: torch.FloatTensor = None
pred_boxes: torch.FloatTensor = None
pred_masks: torch.FloatTensor = None
auxiliary_outputs: Optional[List[Dict]] = None
last_hidden_state: Optional[torch.FloatTensor] = None
decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
encoder_last_hidden_state: Optional[torch.FloatTensor] = None
encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
class ConditionalDetrFrozenBatchNorm2d(nn.Module):
"""
ConditionalDetr 的冻结批量归一化层。
BatchNorm2d 的批次统计信息和仿射参数被固定的版本。
从 torchvision.misc.ops 中复制粘贴而来,添加了在求平方根前的 eps,
否则除 torchvision.models.resnet[18,34,50,101] 之外的其他模型会产生 NaN 值。
"""
def __init__(self, n):
super().__init__()
self.register_buffer("weight", torch.ones(n))
self.register_buffer("bias", torch.zeros(n))
self.register_buffer("running_mean", torch.zeros(n))
self.register_buffer("running_var", torch.ones(n))
def _load_from_state_dict(
self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
):
num_batches_tracked_key = prefix + "num_batches_tracked"
if num_batches_tracked_key in state_dict:
del state_dict[num_batches_tracked_key]
super()._load_from_state_dict(
state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
)
def forward(self, x):
weight = self.weight.reshape(1, -1, 1, 1)
bias = self.bias.reshape(1, -1, 1, 1)
running_var = self.running_var.reshape(1, -1, 1, 1)
running_mean = self.running_mean.reshape(1, -1, 1, 1)
epsilon = 1e-5
scale = weight * (running_var + epsilon).rsqrt()
bias = bias - running_mean * scale
return x * scale + bias
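A quick way to convince yourself the reformulation above matches an eval-mode `nn.BatchNorm2d` (an illustrative check, not part of the original file):
import torch
from torch import nn

bn = nn.BatchNorm2d(8).eval()
frozen = ConditionalDetrFrozenBatchNorm2d(8)
frozen.load_state_dict(bn.state_dict())  # `num_batches_tracked` is stripped by the load hook above

x = torch.randn(2, 8, 4, 4)
print(torch.allclose(bn(x), frozen(x), atol=1e-5))  # True: x * scale + (bias - mean * scale)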
def replace_batch_norm(model):
r"""
递归地将模型中所有的 `torch.nn.BatchNorm2d` 替换为 `ConditionalDetrFrozenBatchNorm2d`。
Args:
model (torch.nn.Module):
输入的模型
"""
for name, module in model.named_children():
if isinstance(module, nn.BatchNorm2d):
new_module = ConditionalDetrFrozenBatchNorm2d(module.num_features)
if not module.weight.device == torch.device("meta"):
new_module.weight.data.copy_(module.weight)
new_module.bias.data.copy_(module.bias)
new_module.running_mean.data.copy_(module.running_mean)
new_module.running_var.data.copy_(module.running_var)
model._modules[name] = new_module
if len(list(module.children())) > 0:
replace_batch_norm(module)
class ConditionalDetrConvEncoder(nn.Module):
"""
使用 AutoBackbone API 或 timm 库中的模型作为卷积主干网络。
所有的 nn.BatchNorm2d 层都被上述定义的 DetrFrozenBatchNorm2d 替换。
"""
def __init__(self, config):
super().__init__()
self.config = config
if config.use_timm_backbone:
requires_backends(self, ["timm"])
kwargs = {}
if config.dilation:
kwargs["output_stride"] = 16
backbone = create_model(
config.backbone,
pretrained=config.use_pretrained_backbone,
features_only=True,
out_indices=(1, 2, 3, 4),
in_chans=config.num_channels,
**kwargs,
)
else:
backbone = load_backbone(config)
with torch.no_grad():
replace_batch_norm(backbone)
self.model = backbone
self.intermediate_channel_sizes = (
self.model.feature_info.channels() if config.use_timm_backbone else self.model.channels
)
backbone_model_type = config.backbone if config.use_timm_backbone else config.backbone_config.model_type
if "resnet" in backbone_model_type:
for name, parameter in self.model.named_parameters():
if config.use_timm_backbone:
if "layer2" not in name and "layer3" not in name and "layer4" not in name:
parameter.requires_grad_(False)
else:
if "stage.1" not in name and "stage.2" not in name and "stage.3" not in name:
parameter.requires_grad_(False)
def forward(self, pixel_values: torch.Tensor, pixel_mask: torch.Tensor):
features = self.model(pixel_values) if self.config.use_timm_backbone else self.model(pixel_values).feature_maps
out = []
for feature_map in features:
mask = nn.functional.interpolate(pixel_mask[None].float(), size=feature_map.shape[-2:]).to(torch.bool)[0]
out.append((feature_map, mask))
return out
class ConditionalDetrConvModel(nn.Module):
"""
This module adds 2D position embeddings to all intermediate feature maps of the convolutional encoder.
"""
def __init__(self, conv_encoder, position_embedding):
super().__init__()
self.conv_encoder = conv_encoder
self.position_embedding = position_embedding
def forward(self, pixel_values, pixel_mask):
out = self.conv_encoder(pixel_values, pixel_mask)
pos = []
for feature_map, mask in out:
pos.append(self.position_embedding(feature_map, mask).to(feature_map.dtype))
return out, pos
class ConditionalDetrSinePositionEmbedding(nn.Module):
"""
This is a more standard version of the position embedding, very similar to the one used by the Attention is all you
need paper, generalized to work on images.
"""
def __init__(self, embedding_dim=64, temperature=10000, normalize=False, scale=None):
super().__init__()
self.embedding_dim = embedding_dim
self.temperature = temperature
self.normalize = normalize
if scale is not None and normalize is False:
raise ValueError("normalize should be True if scale is passed")
if scale is None:
scale = 2 * math.pi
self.scale = scale
def forward(self, pixel_values, pixel_mask):
if pixel_mask is None:
raise ValueError("No pixel mask provided")
y_embed = pixel_mask.cumsum(1, dtype=torch.float32)
x_embed = pixel_mask.cumsum(2, dtype=torch.float32)
if self.normalize:
y_embed = y_embed / (y_embed[:, -1:, :] + 1e-6) * self.scale
x_embed = x_embed / (x_embed[:, :, -1:] + 1e-6) * self.scale
dim_t = torch.arange(self.embedding_dim, dtype=torch.int64, device=pixel_values.device).float()
dim_t = self.temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / self.embedding_dim)
pos_x = x_embed[:, :, :, None] / dim_t
pos_y = y_embed[:, :, :, None] / dim_t
pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3)
pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3)
pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
return pos
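# Shape sketch (assumed toy sizes): for a 32x32 mask and embedding_dim=64 per axis,
# the y- and x-halves are concatenated into a 128-channel position map.
sine_embedding = ConditionalDetrSinePositionEmbedding(embedding_dim=64, normalize=True)
feature_map = torch.randn(2, 256, 32, 32)
mask = torch.ones(2, 32, 32, dtype=torch.long)
print(sine_embedding(feature_map, mask).shape)  # torch.Size([2, 128, 32, 32])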
class ConditionalDetrLearnedPositionEmbedding(nn.Module):
"""
This module learns positional embeddings up to a fixed maximum size.
"""
def __init__(self, embedding_dim=256):
super().__init__()
self.row_embeddings = nn.Embedding(50, embedding_dim)
self.column_embeddings = nn.Embedding(50, embedding_dim)
def forward(self, pixel_values, pixel_mask=None):
height, width = pixel_values.shape[-2:]
width_values = torch.arange(width, device=pixel_values.device)
height_values = torch.arange(height, device=pixel_values.device)
x_emb = self.column_embeddings(width_values)
y_emb = self.row_embeddings(height_values)
pos = torch.cat([x_emb.unsqueeze(0).repeat(height, 1, 1), y_emb.unsqueeze(1).repeat(1, width, 1)], dim=-1)
pos = pos.permute(2, 0, 1)
pos = pos.unsqueeze(0)
pos = pos.repeat(pixel_values.shape[0], 1, 1, 1)
return pos
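# Shape sketch (assumed toy sizes): learned row/column embeddings cover feature maps up to 50x50;
# the x and y embeddings are concatenated channel-wise.
learned_embedding = ConditionalDetrLearnedPositionEmbedding(embedding_dim=128)
feature_map = torch.randn(2, 256, 20, 30)
print(learned_embedding(feature_map).shape)  # torch.Size([2, 256, 20, 30])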
def build_position_encoding(config):
n_steps = config.d_model // 2
if config.position_embedding_type == "sine":
position_embedding = ConditionalDetrSinePositionEmbedding(n_steps, normalize=True)
elif config.position_embedding_type == "learned":
position_embedding = ConditionalDetrLearnedPositionEmbedding(n_steps)
else:
raise ValueError(f"Not supported {config.position_embedding_type}")
return position_embedding
def gen_sine_position_embeddings(pos_tensor, d_model):
scale = 2 * math.pi
dim = d_model // 2
dim_t = torch.arange(dim, dtype=torch.float32, device=pos_tensor.device)
dim_t = 10000 ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / dim)
x_embed = pos_tensor[:, :, 0] * scale
y_embed = pos_tensor[:, :, 1] * scale
pos_x = x_embed[:, :, None] / dim_t
pos_y = y_embed[:, :, None] / dim_t
pos_x = torch.stack((pos_x[:, :, 0::2].sin(), pos_x[:, :, 1::2].cos()), dim=3).flatten(2)
pos_y = torch.stack((pos_y[:, :, 0::2].sin(), pos_y[:, :, 1::2].cos()), dim=3).flatten(2)
pos = torch.cat((pos_y, pos_x), dim=2)
return pos
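# Sketch (assumed shapes): embedding 300 normalized (x, y) reference points into
# d_model-dimensional sine/cosine features, one embedding per query.
reference_points = torch.rand(2, 300, 2)  # (batch_size, num_queries, 2)
query_sine_embed = gen_sine_position_embeddings(reference_points, d_model=256)
print(query_sine_embed.shape)  # torch.Size([2, 300, 256])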
class DetrAttention(nn.Module):
"""
Multi-headed attention from the 'Attention Is All You Need' paper.
Here, we add position embeddings to the queries and keys, as explained in the DETR paper.
"""
def __init__(
self,
embed_dim: int,
num_heads: int,
dropout: float = 0.0,
bias: bool = True,
):
super().__init__()
self.embed_dim = embed_dim
self.num_heads = num_heads
self.dropout = dropout
self.head_dim = embed_dim // num_heads
if self.head_dim * num_heads != self.embed_dim:
raise ValueError(
f"embed_dim必须能被num_heads整除 (得到 `embed_dim`: {self.embed_dim} 和 `num_heads`: {num_heads})."
)
self.scaling = self.head_dim**-0.5
self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
def _shape(self, tensor: torch.Tensor, seq_len: int, batch_size: int):
return tensor.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
def with_pos_embed(self, tensor: torch.Tensor, object_queries: Optional[Tensor], **kwargs):
position_embeddings = kwargs.pop("position_embeddings", None)
if kwargs:
raise ValueError(f"Unexpected arguments {kwargs.keys()}")
if position_embeddings is not None and object_queries is not None:
raise ValueError(
"Cannot specify both position_embeddings and object_queries. Please use just object_queries"
)
if position_embeddings is not None:
logger.warning_once(
"position_embeddings has been deprecated and will be removed in v4.34. Please use object_queries instead"
)
object_queries = position_embeddings
return tensor if object_queries is None else tensor + object_queries
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
object_queries: Optional[torch.Tensor] = None,
key_value_states: Optional[torch.Tensor] = None,
spatial_position_embeddings: Optional[torch.Tensor] = None,
output_attentions: bool = False,
**kwargs,
):
class ConditionalDetrAttention(nn.Module):
"""
Cross-Attention used in Conditional DETR 'Conditional DETR for Fast Training Convergence' paper.
The key q_proj, k_proj, v_proj are defined outside the attention. This attention allows the dim of q, k to be
different to v.
"""
def __init__(
self,
embed_dim: int,
out_dim: int,
num_heads: int,
dropout: float = 0.0,
bias: bool = True,
):
super().__init__()
self.embed_dim = embed_dim
self.out_dim = out_dim
self.num_heads = num_heads
self.dropout = dropout
self.head_dim = embed_dim // num_heads
if self.head_dim * num_heads != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
f" {num_heads})."
)
self.v_head_dim = out_dim // num_heads
if self.v_head_dim * num_heads != self.out_dim:
raise ValueError(
f"out_dim must be divisible by num_heads (got `out_dim`: {self.out_dim} and `num_heads`: {num_heads})."
)
self.scaling = self.head_dim**-0.5
self.out_proj = nn.Linear(out_dim, out_dim, bias=bias)
def _qk_shape(self, tensor: torch.Tensor, seq_len: int, batch_size: int):
return tensor.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
def _v_shape(self, tensor: torch.Tensor, seq_len: int, batch_size: int):
return tensor.view(batch_size, seq_len, self.num_heads, self.v_head_dim).transpose(1, 2).contiguous()
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
key_states: Optional[torch.Tensor] = None,
value_states: Optional[torch.Tensor] = None,
output_attentions: bool = False,
):
class ConditionalDetrEncoderLayer(nn.Module):
def __init__(self, config: ConditionalDetrConfig):
super().__init__()
self.embed_dim = config.d_model
self.self_attn = DetrAttention(
embed_dim=self.embed_dim,
num_heads=config.encoder_attention_heads,
dropout=config.attention_dropout,
)
self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
self.dropout = config.dropout
self.activation_fn = ACT2FN[config.activation_function]
self.activation_dropout = config.activation_dropout
self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
self.final_layer_norm = nn.LayerNorm(self.embed_dim)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: torch.Tensor,
object_queries: torch.Tensor = None,
output_attentions: bool = False,
**kwargs,
):
"""
Args:
hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
attention_mask (`torch.FloatTensor`): attention mask of shape
`(batch, 1, target_len, source_len)` where padding elements are indicated by very large negative values.
object_queries (`torch.FloatTensor`, *optional*):
Object queries (also called content embeddings), to be added to the hidden states.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more detail.
"""
position_embeddings = kwargs.pop("position_embeddings", None)
if kwargs:
raise ValueError(f"Unexpected arguments {kwargs.keys()}")
if position_embeddings is not None and object_queries is not None:
raise ValueError(
"Cannot specify both position_embeddings and object_queries. Please use just object_queries"
)
if position_embeddings is not None:
logger.warning_once(
"position_embeddings has been deprecated and will be removed in v4.34. Please use object_queries instead"
)
object_queries = position_embeddings
residual = hidden_states
hidden_states, attn_weights = self.self_attn(
hidden_states=hidden_states,
attention_mask=attention_mask,
object_queries=object_queries,
output_attentions=output_attentions,
)
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
hidden_states = residual + hidden_states
hidden_states = self.self_attn_layer_norm(hidden_states)
residual = hidden_states
hidden_states = self.activation_fn(self.fc1(hidden_states))
hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
hidden_states = self.fc2(hidden_states)
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
hidden_states = residual + hidden_states
hidden_states = self.final_layer_norm(hidden_states)
if self.training:
if torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any():
clamp_value = torch.finfo(hidden_states.dtype).max - 1000
hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
outputs = (hidden_states,)
if output_attentions:
outputs += (attn_weights,)
return outputs
class ConditionalDetrDecoderLayer(nn.Module):
def __init__(self, config: ConditionalDetrConfig):
super().__init__()
self.embed_dim = config.d_model
d_model = config.d_model
self.sa_qcontent_proj = nn.Linear(d_model, d_model)
self.sa_qpos_proj = nn.Linear(d_model, d_model)
self.sa_kcontent_proj = nn.Linear(d_model, d_model)
self.sa_kpos_proj = nn.Linear(d_model, d_model)
self.sa_v_proj = nn.Linear(d_model, d_model)
self.self_attn = ConditionalDetrAttention(
embed_dim=self.embed_dim,
out_dim=self.embed_dim,
num_heads=config.decoder_attention_heads,
dropout=config.attention_dropout,
)
self.dropout = config.dropout
self.activation_fn = ACT2FN[config.activation_function]
self.activation_dropout = config.activation_dropout
self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
self.ca_qcontent_proj = nn.Linear(d_model, d_model)
self.ca_qpos_proj = nn.Linear(d_model, d_model)
self.ca_kcontent_proj = nn.Linear(d_model, d_model)
self.ca_kpos_proj = nn.Linear(d_model, d_model)
self.ca_v_proj = nn.Linear(d_model, d_model)
self.ca_qpos_sine_proj = nn.Linear(d_model, d_model)
self.encoder_attn = ConditionalDetrAttention(
self.embed_dim * 2, self.embed_dim, config.decoder_attention_heads, dropout=config.attention_dropout
)
self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)
self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)
self.final_layer_norm = nn.LayerNorm(self.embed_dim)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
object_queries: Optional[torch.Tensor] = None,
query_position_embeddings: Optional[torch.Tensor] = None,
query_sine_embed: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = False,
is_first: Optional[bool] = False,
**kwargs,
):
class ConditionalDetrClassificationHead(nn.Module):
def __init__(self, input_dim: int, inner_dim: int, num_classes: int, pooler_dropout: float):
super().__init__()
self.dense = nn.Linear(input_dim, inner_dim)
self.dropout = nn.Dropout(p=pooler_dropout)
self.out_proj = nn.Linear(inner_dim, num_classes)
def forward(self, hidden_states: torch.Tensor):
hidden_states = self.dropout(hidden_states)
hidden_states = self.dense(hidden_states)
hidden_states = torch.tanh(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.out_proj(hidden_states)
return hidden_states
class MLP(nn.Module):
"""
Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates,
height and width of a bounding box w.r.t. an image.
Copied from https://github.com/facebookresearch/detr/blob/master/models/detr.py
"""
def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
super().__init__()
self.num_layers = num_layers
h = [hidden_dim] * (num_layers - 1)
self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))
def forward(self, x):
for i, layer in enumerate(self.layers):
x = nn.functional.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
return x
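# Usage sketch (assumed sizes): a 3-layer MLP mapping 256-dim decoder states to 4 box values.
bbox_mlp = MLP(input_dim=256, hidden_dim=256, output_dim=4, num_layers=3)
decoder_states = torch.randn(2, 300, 256)
print(bbox_mlp(decoder_states).sigmoid().shape)  # torch.Size([2, 300, 4]), normalized (cx, cy, w, h)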
class ConditionalDetrPreTrainedModel(PreTrainedModel):
config_class = ConditionalDetrConfig
base_model_prefix = "model"
main_input_name = "pixel_values"
_no_split_modules = [r"ConditionalDetrConvEncoder", r"ConditionalDetrEncoderLayer", r"ConditionalDetrDecoderLayer"]
def _init_weights(self, module):
std = self.config.init_std
xavier_std = self.config.init_xavier_std
if isinstance(module, ConditionalDetrMHAttentionMap):
nn.init.zeros_(module.k_linear.bias)
nn.init.zeros_(module.q_linear.bias)
nn.init.xavier_uniform_(module.k_linear.weight, gain=xavier_std)
nn.init.xavier_uniform_(module.q_linear.weight, gain=xavier_std)
elif isinstance(module, ConditionalDetrLearnedPositionEmbedding):
nn.init.uniform_(module.row_embeddings.weight)
nn.init.uniform_(module.column_embeddings.weight)
if isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)):
module.weight.data.normal_(mean=0.0, std=std)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=std)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
CONDITIONAL_DETR_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`ConditionalDetrConfig`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
class ConditionalDetrEncoder(ConditionalDetrPreTrainedModel):
"""
Transformer编码器,包含config.encoder_layers个自注意力层。每一层是一个ConditionalDetrEncoderLayer。
编码器通过多个自注意力层更新扁平化特征图。
对于ConditionalDETR的小调整:
- 对象查询(object_queries)在前向传播中添加。
"""
Args:
config: ConditionalDetrConfig
"""
def __init__(self, config: ConditionalDetrConfig):
super().__init__(config)
self.dropout = config.dropout
self.layerdrop = config.encoder_layerdrop
self.layers = nn.ModuleList([ConditionalDetrEncoderLayer(config) for _ in range(config.encoder_layers)])
# in the original ConditionalDETR, no layernorm is used at the end of the encoder, as "normalize_before" is False by default
# Initialize weights and apply final processing
self.post_init()
def forward(
self,
inputs_embeds=None,
attention_mask=None,
object_queries=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
**kwargs,
):
class ConditionalDetrDecoder(ConditionalDetrPreTrainedModel):
"""
Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`ConditionalDetrDecoderLayer`].
The decoder updates the query embeddings through multiple self-attention and cross-attention layers.
Some small tweaks for Conditional DETR:
- object_queries and query_position_embeddings are added to the forward pass.
- if self.config.auxiliary_loss is set to True, also returns a stack of activations from all decoding layers.
Args:
config: ConditionalDetrConfig
"""
def __init__(self, config: ConditionalDetrConfig):
super().__init__(config)
self.dropout = config.dropout
self.layerdrop = config.decoder_layerdrop
self.layers = nn.ModuleList([ConditionalDetrDecoderLayer(config) for _ in range(config.decoder_layers)])
# in Conditional DETR, a layernorm is applied to the output of the last decoder layer
self.layernorm = nn.LayerNorm(config.d_model)
d_model = config.d_model
self.gradient_checkpointing = False
# query_scale is the FFN applied on f to generate transformation T
self.query_scale = MLP(d_model, d_model, d_model, 2)
self.ref_point_head = MLP(d_model, d_model, 2, 2)
# ca_qpos_proj is set to None for every decoder layer except the first
for layer_id in range(config.decoder_layers - 1):
self.layers[layer_id + 1].ca_qpos_proj = None
# Initialize weights and apply final processing
self.post_init()
def forward(
self,
inputs_embeds=None,
attention_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
object_queries=None,
query_position_embeddings=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
**kwargs,
):
@add_start_docstrings(
"""
The bare Conditional DETR Model (consisting of a backbone and encoder-decoder Transformer) outputting raw
hidden-states without any specific head on top.
""",
CONDITIONAL_DETR_START_DOCSTRING,
)
class ConditionalDetrModel(ConditionalDetrPreTrainedModel):
def __init__(self, config: ConditionalDetrConfig):
super().__init__(config)
# Create backbone + positional encoding
backbone = ConditionalDetrConvEncoder(config)
object_queries = build_position_encoding(config)
self.backbone = ConditionalDetrConvModel(backbone, object_queries)
# Create projection layer
self.input_projection = nn.Conv2d(backbone.intermediate_channel_sizes[-1], config.d_model, kernel_size=1)
self.query_position_embeddings = nn.Embedding(config.num_queries, config.d_model)
self.encoder = ConditionalDetrEncoder(config)
self.decoder = ConditionalDetrDecoder(config)
# Initialize weights and apply final processing
self.post_init()
def get_encoder(self):
return self.encoder
def get_decoder(self):
return self.decoder
def freeze_backbone(self):
# disable gradient updates for all backbone parameters
for name, param in self.backbone.conv_encoder.model.named_parameters():
param.requires_grad_(False)
def unfreeze_backbone(self):
# re-enable gradient updates for all backbone parameters
for name, param in self.backbone.conv_encoder.model.named_parameters():
param.requires_grad_(True)
@add_start_docstrings_to_model_forward(CONDITIONAL_DETR_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=ConditionalDetrModelOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
pixel_values: torch.FloatTensor,
pixel_mask: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.LongTensor] = None,
encoder_outputs: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
@add_start_docstrings(
"""
CONDITIONAL_DETR Model (consisting of a backbone and encoder-decoder Transformer) with object detection heads on
top, for tasks such as COCO detection.
""",
CONDITIONAL_DETR_START_DOCSTRING,
)
class ConditionalDetrForObjectDetection(ConditionalDetrPreTrainedModel):
def __init__(self, config: ConditionalDetrConfig):
super().__init__(config)
# CONDITIONAL DETR encoder-decoder model
self.model = ConditionalDetrModel(config)
# Object detection heads
self.class_labels_classifier = nn.Linear(
config.d_model, config.num_labels
)  # no "+ 1" for a no-object class here: Conditional DETR scores classes with sigmoid focal loss
self.bbox_predictor = ConditionalDetrMLPPredictionHead(
input_dim=config.d_model, hidden_dim=config.d_model, output_dim=4, num_layers=3
)
# Initialize weights and apply final processing
self.post_init()
# taken from https://github.com/Atten4Vis/conditionalDETR/blob/master/models/conditional_detr.py
@torch.jit.unused
def _set_aux_loss(self, outputs_class, outputs_coord):
# this is a workaround to make torchscript happy, as torchscript
# doesn't support dictionary with non-homogeneous values, such
# as a dict having both a Tensor and a list.
return [{"logits": a, "pred_boxes": b} for a, b in zip(outputs_class[:-1], outputs_coord[:-1])]
@add_start_docstrings_to_model_forward(CONDITIONAL_DETR_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=ConditionalDetrObjectDetectionOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
pixel_values: torch.FloatTensor,
pixel_mask: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.LongTensor] = None,
encoder_outputs: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[List[dict]] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
"""
Perform forward pass of the Conditional DETR model for object detection.
Args:
pixel_values (torch.FloatTensor): Tensor of pixel values of shape (batch_size, sequence_length, channels).
pixel_mask (Optional[torch.LongTensor]): Optional tensor of pixel masks with shape (batch_size, sequence_length).
decoder_attention_mask (Optional[torch.LongTensor]): Optional tensor indicating which positions should be
attended to by the decoder with shape (batch_size, sequence_length).
encoder_outputs (Optional[torch.FloatTensor]): Optional tensor with encoder outputs of shape
(batch_size, sequence_length, hidden_size).
inputs_embeds (Optional[torch.FloatTensor]): Optional tensor of embeddings to be used as inputs to the decoder
instead of pixel_values.
decoder_inputs_embeds (Optional[torch.FloatTensor]): Optional tensor of embeddings to be used as inputs to the
decoder.
labels (Optional[List[dict]]): Optional list of dictionaries containing labels for object detection.
output_attentions (Optional[bool]): Whether to output attentions weights.
output_hidden_states (Optional[bool]): Whether to output hidden states.
return_dict (Optional[bool]): Whether to return a dictionary as output.
Returns:
ConditionalDetrObjectDetectionOutput: Output object containing the logits and predicted boxes.
"""
"""
CONDITIONAL_DETR Model (consisting of a backbone and encoder-decoder Transformer) with a segmentation head on top,
for tasks such as COCO panoptic.
"""
@add_start_docstrings(
"""
CONDITIONAL_DETR Model (consisting of a backbone and encoder-decoder Transformer) with a segmentation head on top,
for tasks such as COCO panoptic.
""",
CONDITIONAL_DETR_START_DOCSTRING,
)
class ConditionalDetrForSegmentation(ConditionalDetrPreTrainedModel):
def __init__(self, config: ConditionalDetrConfig):
super().__init__(config)
# object detection model
self.conditional_detr = ConditionalDetrForObjectDetection(config)
# segmentation head
hidden_size, number_of_heads = config.d_model, config.encoder_attention_heads
intermediate_channel_sizes = self.conditional_detr.model.backbone.conv_encoder.intermediate_channel_sizes
self.mask_head = ConditionalDetrMaskHeadSmallConv(
hidden_size + number_of_heads, intermediate_channel_sizes[::-1][-3:], hidden_size
)
self.bbox_attention = ConditionalDetrMHAttentionMap(
hidden_size, hidden_size, number_of_heads, dropout=0.0, std=config.init_xavier_std
)
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(CONDITIONAL_DETR_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=ConditionalDetrSegmentationOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
pixel_values: torch.FloatTensor,
pixel_mask: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.FloatTensor] = None,
encoder_outputs: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[List[dict]] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
def _expand(tensor, length: int):
# insert a dim at position 1, repeat it `length` times, then fold it into the batch dim
return tensor.unsqueeze(1).repeat(1, int(length), 1, 1, 1).flatten(0, 1)
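# Sketch (assumed shapes): tiling a feature map once per query before the mask head.
features = torch.randn(2, 256, 16, 16)
print(_expand(features, 300).shape)  # torch.Size([600, 256, 16, 16])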
# Copied from transformers.models.detr.modeling_detr.DetrMaskHeadSmallConv with Detr->ConditionalDetr
class ConditionalDetrMaskHeadSmallConv(nn.Module):
"""
Simple convolutional head, using group norm. Upsampling is done using an FPN approach.
"""
def __init__(self, dim, fpn_dims, context_dim):
super().__init__()
if dim % 8 != 0:
raise ValueError(
"The hidden_size + number of attention heads must be divisible by 8 as the number of groups in GroupNorm is set to 8"
)
inter_dims = [dim, context_dim // 2, context_dim // 4, context_dim // 8, context_dim // 16, context_dim // 64]
# convolution + group-norm blocks of decreasing width
self.lay1 = nn.Conv2d(dim, dim, 3, padding=1)
self.gn1 = nn.GroupNorm(8, dim)
self.lay2 = nn.Conv2d(dim, inter_dims[1], 3, padding=1)
self.gn2 = nn.GroupNorm(min(8, inter_dims[1]), inter_dims[1])
self.lay3 = nn.Conv2d(inter_dims[1], inter_dims[2], 3, padding=1)
self.gn3 = nn.GroupNorm(min(8, inter_dims[2]), inter_dims[2])
self.lay4 = nn.Conv2d(inter_dims[2], inter_dims[3], 3, padding=1)
self.gn4 = nn.GroupNorm(min(8, inter_dims[3]), inter_dims[3])
self.lay5 = nn.Conv2d(inter_dims[3], inter_dims[4], 3, padding=1)
self.gn5 = nn.GroupNorm(min(8, inter_dims[4]), inter_dims[4])
self.out_lay = nn.Conv2d(inter_dims[4], 1, 3, padding=1)
self.dim = dim
# adapter layers project the FPN feature maps to the intermediate dims
self.adapter1 = nn.Conv2d(fpn_dims[0], inter_dims[1], 1)
self.adapter2 = nn.Conv2d(fpn_dims[1], inter_dims[2], 1)
self.adapter3 = nn.Conv2d(fpn_dims[2], inter_dims[3], 1)
# Kaiming initialization for conv weights, zeros for biases
for m in self.modules():
if isinstance(m, nn.Conv2d):
nn.init.kaiming_uniform_(m.weight, a=1)
nn.init.constant_(m.bias, 0)
def forward(self, x: Tensor, bbox_mask: Tensor, fpns: List[Tensor]):
# concatenate x, the projected feature map of shape (batch_size, d_model, height/32, width/32),
# with bbox_mask, the attention maps of shape (batch_size, n_queries, n_heads, height/32, width/32)
x = torch.cat([_expand(x, bbox_mask.shape[1]), bbox_mask.flatten(0, 1)], 1)
x = self.lay1(x)
x = self.gn1(x)
x = nn.functional.relu(x)
x = self.lay2(x)
x = self.gn2(x)
x = nn.functional.relu(x)
# adapt the first FPN branch, expand it if its batch size doesn't match x, then fuse and upsample
cur_fpn = self.adapter1(fpns[0])
if cur_fpn.size(0) != x.size(0):
cur_fpn = _expand(cur_fpn, x.size(0) // cur_fpn.size(0))
x = cur_fpn + nn.functional.interpolate(x, size=cur_fpn.shape[-2:], mode="nearest")
x = self.lay3(x)
x = self.gn3(x)
x = nn.functional.relu(x)
# same for the second FPN branch
cur_fpn = self.adapter2(fpns[1])
if cur_fpn.size(0) != x.size(0):
cur_fpn = _expand(cur_fpn, x.size(0) // cur_fpn.size(0))
x = cur_fpn + nn.functional.interpolate(x, size=cur_fpn.shape[-2:], mode="nearest")
x = self.lay4(x)
x = self.gn4(x)
x = nn.functional.relu(x)
# and for the third
cur_fpn = self.adapter3(fpns[2])
if cur_fpn.size(0) != x.size(0):
cur_fpn = _expand(cur_fpn, x.size(0) // cur_fpn.size(0))
x = cur_fpn + nn.functional.interpolate(x, size=cur_fpn.shape[-2:], mode="nearest")
x = self.lay5(x)
x = self.gn5(x)
x = nn.functional.relu(x)
x = self.out_lay(x)
return x
# Copied from transformers.models.detr.modeling_detr.DetrMHAttentionMap with Detr->ConditionalDetr
class ConditionalDetrMHAttentionMap(nn.Module):
"""This is a 2D attention module, which only returns the attention softmax (no multiplication by value)"""
def __init__(self, query_dim, hidden_dim, num_heads, dropout=0.0, bias=True, std=None):
super().__init__()
self.num_heads = num_heads
self.hidden_dim = hidden_dim
self.dropout = nn.Dropout(dropout)
self.q_linear = nn.Linear(query_dim, hidden_dim, bias=bias)
self.k_linear = nn.Linear(query_dim, hidden_dim, bias=bias)
# normalization factor that scales each head's attention scores
self.normalize_fact = float(hidden_dim / self.num_heads) ** -0.5
def forward(self, q, k, mask: Optional[Tensor] = None):
q = self.q_linear(q)
# apply the key projection as a 1x1 convolution over the feature map
k = nn.functional.conv2d(k, self.k_linear.weight.unsqueeze(-1).unsqueeze(-1), self.k_linear.bias)
# split queries and keys into per-head slices
queries_per_head = q.view(q.shape[0], q.shape[1], self.num_heads, self.hidden_dim // self.num_heads)
keys_per_head = k.view(k.shape[0], self.num_heads, self.hidden_dim // self.num_heads, k.shape[-2], k.shape[-1])
# scaled attention scores between every query and every spatial position
weights = torch.einsum("bqnc,bnchw->bqnhw", queries_per_head * self.normalize_fact, keys_per_head)
if mask is not None:
weights.masked_fill_(mask.unsqueeze(1).unsqueeze(1), torch.finfo(weights.dtype).min)
# softmax over the flattened spatial dimensions, then dropout
weights = nn.functional.softmax(weights.flatten(2), dim=-1).view(weights.size())
weights = self.dropout(weights)
return weights
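# Shape sketch (assumed sizes): attention maps of 300 queries over a 16x16 feature map.
attention_map = ConditionalDetrMHAttentionMap(query_dim=256, hidden_dim=256, num_heads=8)
q = torch.randn(2, 300, 256)     # decoder hidden states
k = torch.randn(2, 256, 16, 16)  # projected encoder feature map
print(attention_map(q, k).shape)  # torch.Size([2, 300, 8, 16, 16])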
# Copied from transformers.models.detr.modeling_detr.dice_loss
def dice_loss(inputs, targets, num_boxes):
"""
Compute the DICE loss, similar to generalized IOU for masks
Args:
inputs: A float tensor of arbitrary shape.
The predictions for each example.
targets: A float tensor with the same shape as inputs. Stores the binary
classification label for each element in inputs (0 for the negative class and 1 for the positive
class).
"""
inputs = inputs.sigmoid()
inputs = inputs.flatten(1)
numerator = 2 * (inputs * targets).sum(1)
denominator = inputs.sum(-1) + targets.sum(-1)
loss = 1 - (numerator + 1) / (denominator + 1)
return loss.sum() / num_boxes
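# Worked sketch: a near-perfect mask prediction yields a dice loss close to 0.
mask_logits = torch.tensor([[10.0, 10.0, -10.0, -10.0]])  # ~[1, 1, 0, 0] after sigmoid
mask_targets = torch.tensor([[1.0, 1.0, 0.0, 0.0]])
print(dice_loss(mask_logits, mask_targets, num_boxes=1))  # tensor close to 0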
# Copied from transformers.models.detr.modeling_detr.sigmoid_focal_loss
def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2):
"""
Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002.
"""
prob = inputs.sigmoid()
ce_loss = nn.functional.binary_cross_entropy_with_logits(inputs, targets, reduction="none")
# the modulating factor (1 - p_t) ** gamma down-weights well-classified (easy) examples
p_t = prob * targets + (1 - prob) * (1 - targets)
loss = ce_loss * ((1 - p_t) ** gamma)
# alpha_t balances the contribution of positive and negative examples
if alpha >= 0:
alpha_t = alpha * targets + (1 - alpha) * (1 - targets)
loss = alpha_t * loss
# average over elements per sample, sum over the batch, normalize by num_boxes
return loss.mean(1).sum() / num_boxes
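# Worked sketch: the modulating factor makes a confident correct prediction contribute
# orders of magnitude less than an uncertain one.
target = torch.tensor([[1.0]])
print(sigmoid_focal_loss(torch.tensor([[8.0]]), target, num_boxes=1))  # ~1e-11 (easy example)
print(sigmoid_focal_loss(torch.tensor([[0.0]]), target, num_boxes=1))  # ~0.043 (hard example)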
class ConditionalDetrLoss(nn.Module):
"""
This class computes the losses for ConditionalDetrForObjectDetection/ConditionalDetrForSegmentation. The process
happens in two steps: 1) we compute hungarian assignment between ground truth boxes and the outputs of the model 2)
we supervise each pair of matched ground-truth / prediction (supervise class and box).
Args:
matcher (`ConditionalDetrHungarianMatcher`):
Module able to compute a matching between targets and proposals.
num_classes (`int`):
Number of object categories, omitting the special no-object category.
focal_alpha (`float`):
Alpha parameter in focal loss.
losses (`List[str]`):
List of all the losses to be applied. See `get_loss` for a list of all available losses.
"""
# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrLoss.__init__
def __init__(self, matcher, num_classes, focal_alpha, losses):
super().__init__()
self.matcher = matcher
self.num_classes = num_classes
self.focal_alpha = focal_alpha
self.losses = losses
# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrLoss.loss_labels
def loss_labels(self, outputs, targets, indices, num_boxes):
"""
Classification loss (Binary focal loss) targets dicts must contain the key "class_labels" containing a tensor
of dim [nb_target_boxes]
"""
if "logits" not in outputs:
raise KeyError("No logits were found in the outputs")
source_logits = outputs["logits"]
idx = self._get_source_permutation_idx(indices)
target_classes_o = torch.cat([t["class_labels"][J] for t, (_, J) in zip(targets, indices)])
# default every query to the no-object class (index num_classes), then fill in the matched targets
target_classes = torch.full(
source_logits.shape[:2], self.num_classes, dtype=torch.int64, device=source_logits.device
)
target_classes[idx] = target_classes_o
# one-hot encode the targets with an extra column for the no-object class ...
target_classes_onehot = torch.zeros(
[source_logits.shape[0], source_logits.shape[1], source_logits.shape[2] + 1],
dtype=source_logits.dtype,
layout=source_logits.layout,
device=source_logits.device,
)
target_classes_onehot.scatter_(2, target_classes.unsqueeze(-1), 1)
# ... then drop that column so the shape matches source_logits
target_classes_onehot = target_classes_onehot[:, :, :-1]
# focal loss, scaled by the number of queries
loss_ce = (
sigmoid_focal_loss(source_logits, target_classes_onehot, num_boxes, alpha=self.focal_alpha, gamma=2)
* source_logits.shape[1]
)
losses = {"loss_ce": loss_ce}
return losses
# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrLoss.loss_cardinality
@torch.no_grad()
def loss_cardinality(self, outputs, targets, indices, num_boxes):
"""
Compute the cardinality error, i.e. the absolute error in the number of predicted non-empty boxes.
This is not really a loss, it is intended for logging purposes only. It doesn't propagate gradients.
"""
logits = outputs["logits"]
device = logits.device
target_lengths = torch.as_tensor([len(v["class_labels"]) for v in targets], device=device)
# count predictions whose argmax is not the "no-object" (last) class
card_pred = (logits.argmax(-1) != logits.shape[-1] - 1).sum(1)
card_err = nn.functional.l1_loss(card_pred.float(), target_lengths.float())
losses = {"cardinality_error": card_err}
return losses
# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrLoss.loss_boxes
def loss_boxes(self, outputs, targets, indices, num_boxes):
"""
Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss.
Targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4]. The target boxes
are expected in format (center_x, center_y, w, h), normalized by the image size.
"""
if "pred_boxes" not in outputs:
raise KeyError("No predicted boxes found in outputs")
idx = self._get_source_permutation_idx(indices)
source_boxes = outputs["pred_boxes"][idx]
target_boxes = torch.cat([t["boxes"][i] for t, (_, i) in zip(targets, indices)], dim=0)
# L1 regression loss between matched predicted and target boxes, averaged per box
loss_bbox = nn.functional.l1_loss(source_boxes, target_boxes, reduction="none")
losses = {}
losses["loss_bbox"] = loss_bbox.sum() / num_boxes
# GIoU loss; the diagonal pairs the i-th prediction with the i-th target
loss_giou = 1 - torch.diag(
generalized_box_iou(center_to_corners_format(source_boxes), center_to_corners_format(target_boxes))
)
losses["loss_giou"] = loss_giou.sum() / num_boxes
return losses
# Copied from transformers.models.detr.modeling_detr.DetrLoss.loss_masks
def loss_masks(self, outputs, targets, indices, num_boxes):
"""
Compute the losses related to the masks: the focal loss and the dice loss.
Targets dicts must contain the key "masks" containing a tensor of dim [nb_target_boxes, h, w].
"""
if "pred_masks" not in outputs:
raise KeyError("No predicted masks found in outputs")
source_idx = self._get_source_permutation_idx(indices)
target_idx = self._get_target_permutation_idx(indices)
source_masks = outputs["pred_masks"]
source_masks = source_masks[source_idx]
masks = [t["masks"] for t in targets]
# pad the target masks into a batch and keep track of the valid region
target_masks, valid = nested_tensor_from_tensor_list(masks).decompose()
target_masks = target_masks.to(source_masks)
target_masks = target_masks[target_idx]
# upsample predictions to the target size
source_masks = nn.functional.interpolate(
source_masks[:, None], size=target_masks.shape[-2:], mode="bilinear", align_corners=False
)
source_masks = source_masks[:, 0].flatten(1)
target_masks = target_masks.flatten(1)
target_masks = target_masks.view(source_masks.shape)
losses = {
"loss_mask": sigmoid_focal_loss(source_masks, target_masks, num_boxes),
"loss_dice": dice_loss(source_masks, target_masks, num_boxes),
}
return losses
# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrLoss._get_source_permutation_idx
def _get_source_permutation_idx(self, indices):
# permute predictions following indices
batch_idx = torch.cat([torch.full_like(source, i) for i, (source, _) in enumerate(indices)])
source_idx = torch.cat([source for (source, _) in indices])
return batch_idx, source_idx
# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrLoss._get_target_permutation_idx
def _get_target_permutation_idx(self, indices):
# permute targets following indices
batch_idx = torch.cat([torch.full_like(target, i) for i, (_, target) in enumerate(indices)])
target_idx = torch.cat([target for (_, target) in indices])
return batch_idx, target_idx
# Copied from transformers.models.detr.modeling_detr.DetrLoss.get_loss
def get_loss(self, loss, outputs, targets, indices, num_boxes):
# dispatch to the loss function registered under the given name
loss_map = {
"labels": self.loss_labels,
"cardinality": self.loss_cardinality,
"boxes": self.loss_boxes,
"masks": self.loss_masks,
}
if loss not in loss_map:
raise ValueError(f"Loss {loss} not supported")
return loss_map[loss](outputs, targets, indices, num_boxes)
# Copied from transformers.models.detr.modeling_detr.DetrLoss.forward
def forward(self, outputs, targets):
"""
This method computes the losses for the model during training.
Args:
outputs (`dict`, *optional*):
Dictionary containing tensors representing model predictions.
targets (`List[dict]`, *optional*):
List of dictionaries where each dictionary corresponds to target data for one sample in the batch.
Returns:
losses (`dict`):
Dictionary containing computed losses.
"""
# Exclude auxiliary outputs from outputs dictionary
outputs_without_aux = {k: v for k, v in outputs.items() if k != "auxiliary_outputs"}
# Match model outputs with target data
indices = self.matcher(outputs_without_aux, targets)
# Calculate the total number of target boxes across all samples
num_boxes = sum(len(t["class_labels"]) for t in targets)
num_boxes = torch.as_tensor([num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device)
# Determine the world size for distributed training
world_size = 1
if is_accelerate_available():
if PartialState._shared_state != {}:
num_boxes = reduce(num_boxes)
world_size = PartialState().num_processes
# Normalize num_boxes and ensure it is at least 1
num_boxes = torch.clamp(num_boxes / world_size, min=1).item()
# Compute losses for each specified loss type
losses = {}
for loss in self.losses:
losses.update(self.get_loss(loss, outputs, targets, indices, num_boxes))
# If there are auxiliary outputs, compute losses for each auxiliary output separately
if "auxiliary_outputs" in outputs:
for i, auxiliary_outputs in enumerate(outputs["auxiliary_outputs"]):
indices = self.matcher(auxiliary_outputs, targets)
for loss in self.losses:
if loss == "masks":
# Skip computing intermediate masks losses due to computational cost
continue
# Append index suffix to loss keys to distinguish between auxiliary losses
l_dict = self.get_loss(loss, auxiliary_outputs, targets, indices, num_boxes)
l_dict = {k + f"_{i}": v for k, v in l_dict.items()}
losses.update(l_dict)
return losses
# Copied from transformers.models.detr.modeling_detr.DetrMLPPredictionHead with Detr->ConditionalDetr
class ConditionalDetrMLPPredictionHead(nn.Module):
"""
Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates,
height and width of a bounding box w.r.t. an image.
Copied from https://github.com/facebookresearch/detr/blob/master/models/detr.py
"""
def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
super().__init__()
self.num_layers = num_layers
h = [hidden_dim] * (num_layers - 1)
# stack of linear layers that make up the MLP
self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))
def forward(self, x):
for i, layer in enumerate(self.layers):
# apply ReLU after every layer except the last
x = nn.functional.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
return x
# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrHungarianMatcher with DeformableDetr->ConditionalDetr
class ConditionalDetrHungarianMatcher(nn.Module):
"""
This class computes an assignment between the targets and the predictions of the network.
For efficiency reasons, the targets don't include the no_object. Because of this, in general, there are more
predictions than targets. In this case, we do a 1-to-1 matching of the best predictions, while the others are
un-matched (and thus treated as non-objects).
Args:
class_cost:
The relative weight of the classification error in the matching cost.
bbox_cost:
The relative weight of the L1 error of the bounding box coordinates in the matching cost.
giou_cost:
The relative weight of the giou loss of the bounding box in the matching cost.
"""
def __init__(self, class_cost: float = 1, bbox_cost: float = 1, giou_cost: float = 1):
super().__init__()
requires_backends(self, ["scipy"])
self.class_cost = class_cost
self.bbox_cost = bbox_cost
self.giou_cost = giou_cost
if class_cost == 0 and bbox_cost == 0 and giou_cost == 0:
raise ValueError("All costs of the Matcher can't be 0")
@torch.no_grad()
def forward(self, outputs, targets):
"""
Args:
outputs (`dict`):
A dictionary that contains at least these entries:
* "logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits
* "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates.
targets (`List[dict]`):
A list of targets (len(targets) = batch_size), where each target is a dict containing:
* "class_labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of
ground-truth objects in the target) containing the class labels
* "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates.
Returns:
`List[Tuple]`: A list of size `batch_size`, containing tuples of (index_i, index_j) where:
- index_i is the indices of the selected predictions (in order)
- index_j is the indices of the corresponding selected targets (in order)
For each batch element, it holds: len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
"""
batch_size, num_queries = outputs["logits"].shape[:2]
# We flatten to compute the cost matrices in a batch
out_prob = outputs["logits"].flatten(0, 1).sigmoid()  # [batch_size * num_queries, num_classes]
out_bbox = outputs["pred_boxes"].flatten(0, 1)  # [batch_size * num_queries, 4]
# Also concat the target labels and boxes
target_ids = torch.cat([v["class_labels"] for v in targets])
target_bbox = torch.cat([v["boxes"] for v in targets])
# Compute the classification cost (focal-loss style), keeping only the columns of the target classes
alpha = 0.25
gamma = 2.0
neg_cost_class = (1 - alpha) * (out_prob**gamma) * (-(1 - out_prob + 1e-8).log())
pos_cost_class = alpha * ((1 - out_prob) ** gamma) * (-(out_prob + 1e-8).log())
class_cost = pos_cost_class[:, target_ids] - neg_cost_class[:, target_ids]
# Compute the L1 cost between boxes
bbox_cost = torch.cdist(out_bbox, target_bbox, p=1)
# Compute the giou cost between boxes
giou_cost = -generalized_box_iou(center_to_corners_format(out_bbox), center_to_corners_format(target_bbox))
# Final cost matrix, combining the three weighted costs
cost_matrix = self.bbox_cost * bbox_cost + self.class_cost * class_cost + self.giou_cost * giou_cost
cost_matrix = cost_matrix.view(batch_size, num_queries, -1).cpu()
# Perform the Hungarian assignment per image, splitting the cost matrix by target sizes
sizes = [len(v["boxes"]) for v in targets]
indices = [linear_sum_assignment(c[i]) for i, c in enumerate(cost_matrix.split(sizes, -1))]
return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices]
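# Usage sketch (hypothetical shapes; 2.0/5.0/2.0 mirror the ConditionalDetrConfig defaults):
# matching 10 queries against 3 ground-truth boxes.
matcher = ConditionalDetrHungarianMatcher(class_cost=2.0, bbox_cost=5.0, giou_cost=2.0)
outputs = {"logits": torch.randn(1, 10, 91), "pred_boxes": torch.rand(1, 10, 4)}
targets = [{"class_labels": torch.tensor([3, 17, 42]), "boxes": torch.rand(3, 4)}]
(pred_idx, target_idx), = matcher(outputs, targets)
print(pred_idx, target_idx)  # e.g. tensor([1, 4, 9]) tensor([2, 0, 1])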
# Copied from transformers.models.detr.modeling_detr._upcast
def _upcast(t: Tensor) -> Tensor:
# Protects from numerical overflows in multiplications by upcasting to the equivalent higher type
if t.is_floating_point():
return t if t.dtype in (torch.float32, torch.float64) else t.float()
else:
return t if t.dtype in (torch.int32, torch.int64) else t.int()
# Copied from transformers.models.detr.modeling_detr.box_area
def box_area(boxes: Tensor) -> Tensor:
"""
Computes the area of a set of bounding boxes, which are specified by their (x1, y1, x2, y2) coordinates.
Args:
boxes (`torch.FloatTensor` of shape `(number_of_boxes, 4)`):
Boxes for which the area will be computed. They are expected to be in (x1, y1, x2, y2) format with
`0 <= x1 < x2` and `0 <= y1 < y2`.
Returns:
`torch.FloatTensor`: a tensor containing the area for each box.
"""
boxes = _upcast(boxes)
return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
# Copied from transformers.models.detr.modeling_detr.box_iou
# Compute pairwise IoU between two sets of boxes
def box_iou(boxes1, boxes2):
area1 = box_area(boxes1)
area2 = box_area(boxes2)
left_top = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2]
right_bottom = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2]
width_height = (right_bottom - left_top).clamp(min=0) # [N,M,2]
inter = width_height[:, :, 0] * width_height[:, :, 1] # [N,M]
union = area1[:, None] + area2 - inter
iou = inter / union
return iou, union
# Copied from transformers.models.detr.modeling_detr.generalized_box_iou
def generalized_box_iou(boxes1, boxes2):
"""
Generalized IoU from https://giou.stanford.edu/. The boxes should be in [x0, y0, x1, y1] (corners) format.
Returns:
`torch.FloatTensor`: a [N, M] pairwise matrix, where N = len(boxes1) and M = len(boxes2)
"""
# degenerate boxes give inf / nan results, so do an early check
if not (boxes1[:, 2:] >= boxes1[:, :2]).all():
raise ValueError(f"boxes1 must be in [x0, y0, x1, y1] (corner) format, but got {boxes1}")
if not (boxes2[:, 2:] >= boxes2[:, :2]).all():
raise ValueError(f"boxes2 must be in [x0, y0, x1, y1] (corner) format, but got {boxes2}")
iou, union = box_iou(boxes1, boxes2)
top_left = torch.min(boxes1[:, None, :2], boxes2[:, :2])
bottom_right = torch.max(boxes1[:, None, 2:], boxes2[:, 2:])
width_height = (bottom_right - top_left).clamp(min=0) # [N,M,2]
area = width_height[:, :, 0] * width_height[:, :, 1]
return iou - (area - union) / area
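# Worked sketch: two unit squares overlapping by half a side. The enclosing box equals the
# union here, so GIoU coincides with plain IoU.
boxes_a = torch.tensor([[0.0, 0.0, 1.0, 1.0]])
boxes_b = torch.tensor([[0.5, 0.0, 1.5, 1.0]])
iou, union = box_iou(boxes_a, boxes_b)
print(iou, generalized_box_iou(boxes_a, boxes_b))  # tensor([[0.3333]]) twice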
# Copied from transformers.models.detr.modeling_detr._max_by_axis
# Return, for each index, the maximum value across the sublists
def _max_by_axis(the_list):
# type: (List[List[int]]) -> List[int]
maxes = the_list[0]
for sublist in the_list[1:]:
for index, item in enumerate(sublist):
maxes[index] = max(maxes[index], item)
return maxes
# Nested tensor: holds a batched tensor together with an optional padding mask
class NestedTensor(object):
def __init__(self, tensors, mask: Optional[Tensor]):
self.tensors = tensors
self.mask = mask
def to(self, device):
# move both the tensors and the (optional) mask to the given device
cast_tensor = self.tensors.to(device)
mask = self.mask
if mask is not None:
cast_mask = mask.to(device)
else:
cast_mask = None
return NestedTensor(cast_tensor, cast_mask)
def decompose(self):
return self.tensors, self.mask
def __repr__(self):
return str(self.tensors)
# Copied from transformers.models.detr.modeling_detr.nested_tensor_from_tensor_list
def nested_tensor_from_tensor_list(tensor_list: List[Tensor]):
if tensor_list[0].ndim == 3:
# pad every image to the per-axis maximum size in the batch
max_size = _max_by_axis([list(img.shape) for img in tensor_list])
batch_shape = [len(tensor_list)] + max_size
batch_size, num_channels, height, width = batch_shape
dtype = tensor_list[0].dtype
device = tensor_list[0].device
tensor = torch.zeros(batch_shape, dtype=dtype, device=device)
mask = torch.ones((batch_size, height, width), dtype=torch.bool, device=device)
# copy each image into its zero-padded slot and mark its valid pixels as False in the mask
for img, pad_img, m in zip(tensor_list, tensor, mask):
pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img)
m[: img.shape[1], : img.shape[2]] = False
else:
raise ValueError("Only 3-dimensional tensors are supported")
return NestedTensor(tensor, mask)
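# Usage sketch: batching two images of different sizes into one padded tensor plus mask.
images = [torch.randn(3, 480, 600), torch.randn(3, 512, 576)]
padded, pad_mask = nested_tensor_from_tensor_list(images).decompose()
print(padded.shape)    # torch.Size([2, 3, 512, 600])
print(pad_mask.shape)  # torch.Size([2, 512, 600]); True marks padded pixels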
.\models\conditional_detr\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available
_import_structure = {
"configuration_conditional_detr": [
"CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP",
"ConditionalDetrConfig",
"ConditionalDetrOnnxConfig",
]
}
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["feature_extraction_conditional_detr"] = ["ConditionalDetrFeatureExtractor"]
_import_structure["image_processing_conditional_detr"] = ["ConditionalDetrImageProcessor"]
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_conditional_detr"] = [
"CONDITIONAL_DETR_PRETRAINED_MODEL_ARCHIVE_LIST",
"ConditionalDetrForObjectDetection",
"ConditionalDetrForSegmentation",
"ConditionalDetrModel",
"ConditionalDetrPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_conditional_detr import (
CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP,
ConditionalDetrConfig,
ConditionalDetrOnnxConfig,
)
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .feature_extraction_conditional_detr import ConditionalDetrFeatureExtractor
from .image_processing_conditional_detr import ConditionalDetrImageProcessor
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_conditional_detr import (
CONDITIONAL_DETR_PRETRAINED_MODEL_ARCHIVE_LIST,
ConditionalDetrForObjectDetection,
ConditionalDetrForSegmentation,
ConditionalDetrModel,
ConditionalDetrPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\convbert\configuration_convbert.py
from collections import OrderedDict
from typing import Mapping
from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfig
from ...utils import logging
logger = logging.get_logger(__name__)
CONVBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"YituTech/conv-bert-base": "https://huggingface.co/YituTech/conv-bert-base/resolve/main/config.json",
"YituTech/conv-bert-medium-small": (
"https://huggingface.co/YituTech/conv-bert-medium-small/resolve/main/config.json"
),
"YituTech/conv-bert-small": "https://huggingface.co/YituTech/conv-bert-small/resolve/main/config.json",
}
class ConvBertConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`ConvBertModel`]. It is used to instantiate a
ConvBERT model according to the specified arguments, defining the model architecture. Instantiating a configuration
with the defaults will yield a configuration similar to that of the ConvBERT
[YituTech/conv-bert-base](https://huggingface.co/YituTech/conv-bert-base) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
"""
def __init__(
self,
vocab_size=30522,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=2,
initializer_range=0.02,
layer_norm_eps=1e-12,
head_ratio=2,
num_groups=1,
conv_kernel_size=9,
classifier_dropout=None
):
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.hidden_act = hidden_act
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.type_vocab_size = type_vocab_size
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.head_ratio = head_ratio
self.num_groups = num_groups
self.conv_kernel_size = conv_kernel_size
self.classifier_dropout = classifier_dropout
# Usage example (from the class docstring):
from transformers import ConvBertConfig, ConvBertModel
# Initializing a ConvBERT convbert-base style configuration
configuration = ConvBertConfig()
# Initializing a model from the configuration (random weights)
model = ConvBertModel(configuration)
# Accessing the model configuration
configuration = model.config
class ConvBertOnnxConfig(OnnxConfig):
@property
def inputs(self) -> Mapping[str, Mapping[int, str]]:
if self.task == "multiple-choice":
dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"}
else:
dynamic_axis = {0: "batch", 1: "sequence"}
return OrderedDict(
[
("input_ids", dynamic_axis),
("attention_mask", dynamic_axis),
("token_type_ids", dynamic_axis),
]
)
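# Usage sketch (assuming the real transformers classes are importable): inspecting the
# dynamic axes the ONNX config declares for export.
onnx_config = ConvBertOnnxConfig(ConvBertConfig())
print(onnx_config.inputs)
# OrderedDict([('input_ids', {0: 'batch', 1: 'sequence'}), ('attention_mask', ...), ('token_type_ids', ...)])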
.\models\convbert\convert_convbert_original_tf1_checkpoint_to_pytorch_and_tf2.py
"""Convert ConvBERT checkpoint."""
import argparse
from transformers import ConvBertConfig, ConvBertModel, TFConvBertModel, load_tf_weights_in_convbert
from transformers.utils import logging
logging.set_verbosity_info()
def convert_orig_tf1_checkpoint_to_pytorch(tf_checkpoint_path, convbert_config_file, pytorch_dump_path):
conf = ConvBertConfig.from_json_file(convbert_config_file)
model = ConvBertModel(conf)
model = load_tf_weights_in_convbert(model, conf, tf_checkpoint_path)
model.save_pretrained(pytorch_dump_path)
tf_model = TFConvBertModel.from_pretrained(pytorch_dump_path, from_pt=True)
tf_model.save_pretrained(pytorch_dump_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path."
)
parser.add_argument(
"--convbert_config_file",
default=None,
type=str,
required=True,
help=(
"The config json file corresponding to the pre-trained ConvBERT model. \n"
"This specifies the model architecture."
),
)
parser.add_argument(
"--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
)
args = parser.parse_args()
convert_orig_tf1_checkpoint_to_pytorch(args.tf_checkpoint_path, args.convbert_config_file, args.pytorch_dump_path)
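For reference, an invocation might look like the following (the paths are placeholders, not from the original script). Afterwards both frameworks can load the dump, because the script saves PyTorch weights and then re-exports them through TFConvBertModel:

# python convert_convbert_original_tf1_checkpoint_to_pytorch_and_tf2.py \
#     --tf_checkpoint_path ./tf_ckpt/model.ckpt \
#     --convbert_config_file ./tf_ckpt/config.json \
#     --pytorch_dump_path ./convbert-base-converted
from transformers import ConvBertModel, TFConvBertModel

pt_model = ConvBertModel.from_pretrained("./convbert-base-converted")
tf_model = TFConvBertModel.from_pretrained("./convbert-base-converted")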
.\models\convbert\modeling_convbert.py
""" PyTorch ConvBERT 模型。"""
import math
import os
from operator import attrgetter
from typing import Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN, get_activation
from ...modeling_outputs import (
BaseModelOutputWithCrossAttentions,
MaskedLMOutput,
MultipleChoiceModelOutput,
QuestionAnsweringModelOutput,
SequenceClassifierOutput,
TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel, SequenceSummary
from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
from .configuration_convbert import ConvBertConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "YituTech/conv-bert-base"
_CONFIG_FOR_DOC = "ConvBertConfig"
CONVBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"YituTech/conv-bert-base",
"YituTech/conv-bert-medium-small",
"YituTech/conv-bert-small",
]
def load_tf_weights_in_convbert(model, config, tf_checkpoint_path):
"""从TensorFlow检查点加载权重到PyTorch模型中。"""
try:
import tensorflow as tf
except ImportError:
logger.error(
"在PyTorch中加载TensorFlow模型需要安装TensorFlow。请参阅 "
"https://www.tensorflow.org/install/ 获取安装说明。"
)
raise
tf_path = os.path.abspath(tf_checkpoint_path)
logger.info(f"从 {tf_path} 转换TensorFlow检查点")
init_vars = tf.train.list_variables(tf_path)
tf_data = {}
for name, shape in init_vars:
logger.info(f"加载TF权重 {name},形状为 {shape}")
array = tf.train.load_variable(tf_path, name)
tf_data[name] = array
param_mapping = {
"embeddings.word_embeddings.weight": "electra/embeddings/word_embeddings",
"embeddings.position_embeddings.weight": "electra/embeddings/position_embeddings",
"embeddings.token_type_embeddings.weight": "electra/embeddings/token_type_embeddings",
"embeddings.LayerNorm.weight": "electra/embeddings/LayerNorm/gamma",
"embeddings.LayerNorm.bias": "electra/embeddings/LayerNorm/beta",
"embeddings_project.weight": "electra/embeddings_project/kernel",
"embeddings_project.bias": "electra/embeddings_project/bias",
}
# (In the full source, per-layer entries for the attention, convolution and FFN weights are
# added to param_mapping in a loop over config.num_hidden_layers; that construction is
# omitted in this excerpt.)
if config.num_groups > 1:
group_dense_name = "g_dense"
else:
group_dense_name = "dense"
for param in model.named_parameters():
param_name = param[0]
retriever = attrgetter(param_name)
result = retriever(model)
tf_name = param_mapping[param_name]
value = torch.from_numpy(tf_data[tf_name])
logger.info(f"TF: {tf_name}, PT: {param_name} ")
if tf_name.endswith("/kernel"):
if not tf_name.endswith("/intermediate/g_dense/kernel"):
if not tf_name.endswith("/output/g_dense/kernel"):
value = value.T
elif tf_name.endswith("/depthwise_kernel"):
value = value.permute(1, 2, 0)
elif tf_name.endswith("/pointwise_kernel"):
value = value.permute(2, 1, 0)
elif tf_name.endswith("/conv_attn_key/bias"):
value = value.unsqueeze(-1)
result.data = value
return model
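The transposes above follow from the different weight layouts of the two frameworks; a small sketch of the dense-kernel case:

import numpy as np
import torch

# TF stores dense kernels as [in_features, out_features]; torch.nn.Linear.weight is
# [out_features, in_features], hence the plain .T for /kernel entries. Depthwise and
# pointwise conv kernels need the explicit permutations shown above instead.
tf_kernel = np.random.randn(768, 3072).astype(np.float32)
pt_weight = torch.from_numpy(tf_kernel).T
print(pt_weight.shape)  # torch.Size([3072, 768]) == nn.Linear(768, 3072).weight.shape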
class ConvBertEmbeddings(nn.Module):
"""Construct the embeddings from word, position and token_type embeddings."""
def __init__(self, config):
super().__init__()
self.word_embeddings = nn.Embedding(config.vocab_size, config.embedding_size, padding_idx=config.pad_token_id)
self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.embedding_size)
self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.embedding_size)
self.LayerNorm = nn.LayerNorm(config.embedding_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
self.register_buffer(
"token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
) -> torch.LongTensor:
if input_ids is not None:
input_shape = input_ids.size()
else:
input_shape = inputs_embeds.size()[:-1]
seq_length = input_shape[1]
if position_ids is None:
position_ids = self.position_ids[:, :seq_length]
if token_type_ids is None:
if hasattr(self, "token_type_ids"):
buffered_token_type_ids = self.token_type_ids[:, :seq_length]
buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
token_type_ids = buffered_token_type_ids_expanded
else:
token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
if inputs_embeds is None:
inputs_embeds = self.word_embeddings(input_ids)
position_embeddings = self.position_embeddings(position_ids)
token_type_embeddings = self.token_type_embeddings(token_type_ids)
embeddings = inputs_embeds + position_embeddings + token_type_embeddings
embeddings = self.LayerNorm(embeddings)
embeddings = self.dropout(embeddings)
return embeddings
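A minimal shape check for this module (standalone sketch; the direct import path is an internal detail):

import torch
from transformers import ConvBertConfig
from transformers.models.convbert.modeling_convbert import ConvBertEmbeddings

config = ConvBertConfig()
embeddings = ConvBertEmbeddings(config)
input_ids = torch.randint(0, config.vocab_size, (2, 8))
out = embeddings(input_ids=input_ids)
print(out.shape)  # torch.Size([2, 8, 768]) -- (batch, seq_len, embedding_size)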
class ConvBertPreTrainedModel(PreTrainedModel):
"""
# 用于处理权重初始化和简单接口以下载和加载预训练模型的抽象类。
# 指定配置类为ConvBertConfig
config_class = ConvBertConfig
# 加载 TensorFlow 权重的函数为load_tf_weights_in_convbert
load_tf_weights = load_tf_weights_in_convbert
# 基础模型前缀为"convbert"
base_model_prefix = "convbert"
# 支持梯度检查点
supports_gradient_checkpointing = True
def _init_weights(self, module):
"""初始化模型的权重"""
# 如果是线性层(nn.Linear)
if isinstance(module, nn.Linear):
# 使用正态分布初始化权重,均值为0,标准差为配置文件中的initializer_range
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
# 如果存在偏置,则将偏置初始化为0
if module.bias is not None:
module.bias.data.zero_()
# 如果是嵌入层(nn.Embedding)
elif isinstance(module, nn.Embedding):
# 使用正态分布初始化权重,均值为0,标准差为配置文件中的initializer_range
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
# 如果存在填充索引(padding_idx),则将对应位置的权重初始化为0
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
# 如果是层归一化层(nn.LayerNorm)
elif isinstance(module, nn.LayerNorm):
# 将偏置初始化为0
module.bias.data.zero_()
# 将权重初始化为1
module.weight.data.fill_(1.0)
class SeparableConv1D(nn.Module):
"""This class implements separable convolution, i.e. a depthwise and a pointwise layer"""
def __init__(self, config, input_filters, output_filters, kernel_size, **kwargs):
super().__init__()
# Depthwise convolution: groups=input_filters convolves each input channel separately
self.depthwise = nn.Conv1d(
input_filters,
input_filters,
kernel_size=kernel_size,
groups=input_filters,
padding=kernel_size // 2,
bias=False,
)
# Pointwise (1x1) convolution mixes channels and maps to output_filters
self.pointwise = nn.Conv1d(input_filters, output_filters, kernel_size=1, bias=False)
# Shared bias added after the pointwise step
self.bias = nn.Parameter(torch.zeros(output_filters, 1))
self.depthwise.weight.data.normal_(mean=0.0, std=config.initializer_range)
self.pointwise.weight.data.normal_(mean=0.0, std=config.initializer_range)
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
x = self.depthwise(hidden_states)
x = self.pointwise(x)
x += self.bias
return x
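The point of the depthwise/pointwise split is the parameter count; for the base model's dimensions:

# Full Conv1d over C=768 channels with kernel k=9 vs. depthwise + pointwise:
C, k = 768, 9
full = C * C * k           # 5,308,416 weights
separable = C * k + C * C  # 596,736 weights, roughly 8.9x fewer
print(full, separable)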
class ConvBertSelfAttention(nn.Module):
def __init__(self, config):
super().__init__()
# The hidden size must be a multiple of the number of attention heads
if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
raise ValueError(
f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
f"heads ({config.num_attention_heads})"
)
# head_ratio shrinks the number of self-attention heads; the saved capacity goes to the conv branch
new_num_attention_heads = config.num_attention_heads // config.head_ratio
if new_num_attention_heads < 1:
self.head_ratio = config.num_attention_heads
self.num_attention_heads = 1
else:
self.num_attention_heads = new_num_attention_heads
self.head_ratio = config.head_ratio
self.conv_kernel_size = config.conv_kernel_size
if config.hidden_size % self.num_attention_heads != 0:
raise ValueError("hidden_size should be divisible by num_attention_heads")
# Each head is half-width: the other half of the hidden size is handled by the convolution branch
self.attention_head_size = (config.hidden_size // self.num_attention_heads) // 2
self.all_head_size = self.num_attention_heads * self.attention_head_size
# Query/key/value projections for the self-attention branch
self.query = nn.Linear(config.hidden_size, self.all_head_size)
self.key = nn.Linear(config.hidden_size, self.all_head_size)
self.value = nn.Linear(config.hidden_size, self.all_head_size)
# Separable convolution producing the keys for the span-based dynamic convolution
self.key_conv_attn_layer = SeparableConv1D(
config, config.hidden_size, self.all_head_size, self.conv_kernel_size
)
# Linear layer predicting one convolution kernel per head and position
self.conv_kernel_layer = nn.Linear(self.all_head_size, self.num_attention_heads * self.conv_kernel_size)
self.conv_out_layer = nn.Linear(config.hidden_size, self.all_head_size)
# Unfold extracts sliding windows of conv_kernel_size tokens for the dynamic convolution
self.unfold = nn.Unfold(
kernel_size=[self.conv_kernel_size, 1], padding=[int((self.conv_kernel_size - 1) / 2), 0]
)
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
def transpose_for_scores(self, x):
# Reshape (batch, seq_len, all_head_size) to (batch, heads, seq_len, head_size)
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
x = x.view(*new_x_shape)
return x.permute(0, 2, 1, 3)
# Forward pass: takes hidden states plus optional attention/head masks and encoder hidden states
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor, Optional[torch.FloatTensor]]:
# (The forward body, which mixes the self-attention branch with the span-based dynamic
# convolution branch, is omitted in this excerpt.)
class ConvBertSelfOutput(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
# Project, apply dropout, then add the residual and normalize
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
class ConvBertAttention(nn.Module):
def __init__(self, config):
super().__init__()
self.self = ConvBertSelfAttention(config)
self.output = ConvBertSelfOutput(config)
# Keeps track of heads that have already been pruned
self.pruned_heads = set()
def prune_heads(self, heads):
if len(heads) == 0:
return
heads, index = find_pruneable_heads_and_indices(
heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
)
# Prune the query/key/value projections and the output projection
self.self.query = prune_linear_layer(self.self.query, index)
self.self.key = prune_linear_layer(self.self.key, index)
self.self.value = prune_linear_layer(self.self.value, index)
self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
# Update the hyperparameters and remember the pruned heads
self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
self.pruned_heads = self.pruned_heads.union(heads)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor, Optional[torch.FloatTensor]]:
self_outputs = self.self(
hidden_states,
attention_mask,
head_mask,
encoder_hidden_states,
output_attentions,
)
# Apply the output projection with a residual connection to the input
attention_output = self.output(self_outputs[0], hidden_states)
outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
return outputs
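With the base defaults (hidden size 768, 12 configured heads, head_ratio 2), the self-attention branch therefore runs 6 heads of size 64, so all_head_size is 384, half the hidden size. A shape sketch of transpose_for_scores:

import torch

batch, seq_len, heads, head_size = 2, 8, 6, 64
x = torch.randn(batch, seq_len, heads * head_size)  # (batch, seq_len, all_head_size)
x = x.view(batch, seq_len, heads, head_size).permute(0, 2, 1, 3)
print(x.shape)  # torch.Size([2, 6, 8, 64]) -- (batch, heads, seq_len, head_size)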
class GroupedLinearLayer(nn.Module):
def __init__(self, input_size, output_size, num_groups):
super().__init__()
self.input_size = input_size
self.output_size = output_size
self.num_groups = num_groups
# Per-group input/output widths
self.group_in_dim = self.input_size // self.num_groups
self.group_out_dim = self.output_size // self.num_groups
# One weight block per group, shape (num_groups, group_in_dim, group_out_dim)
self.weight = nn.Parameter(torch.empty(self.num_groups, self.group_in_dim, self.group_out_dim))
self.bias = nn.Parameter(torch.empty(output_size))
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
batch_size = list(hidden_states.size())[0]
# Flatten (batch, seq_len) and split the features into groups: (batch*seq_len, num_groups, group_in_dim)
x = torch.reshape(hidden_states, [-1, self.num_groups, self.group_in_dim])
# Move groups first so each group is multiplied by its own weight block
x = x.permute(1, 0, 2)
x = torch.matmul(x, self.weight)
x = x.permute(1, 0, 2)
# Restore (batch, seq_len, output_size) and add the bias
x = torch.reshape(x, [batch_size, -1, self.output_size])
x = x + self.bias
return x
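GroupedLinearLayer is effectively a block-diagonal linear layer: each group mixes only its own slice of the features, cutting the weight count by num_groups. A small usage sketch (the weights are created with torch.empty, so they are initialized here before use):

import torch
from transformers.models.convbert.modeling_convbert import GroupedLinearLayer

layer = GroupedLinearLayer(input_size=8, output_size=8, num_groups=2)
torch.nn.init.normal_(layer.weight)
torch.nn.init.zeros_(layer.bias)
x = torch.randn(3, 5, 8)  # (batch, seq_len, input_size)
print(layer(x).shape)  # torch.Size([3, 5, 8]); 2 * (4*4) = 32 weights instead of 64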
class ConvBertIntermediate(nn.Module):
def __init__(self, config):
super().__init__()
# Use a plain linear layer for a single group, a grouped (block-diagonal) one otherwise
if config.num_groups == 1:
self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
else:
self.dense = GroupedLinearLayer(
input_size=config.hidden_size, output_size=config.intermediate_size, num_groups=config.num_groups
)
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states
class ConvBertOutput(nn.Module):
def __init__(self, config):
super().__init__()
# Same grouped/plain choice as the intermediate layer, mapping back to hidden_size
if config.num_groups == 1:
self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
else:
self.dense = GroupedLinearLayer(
input_size=config.intermediate_size, output_size=config.hidden_size, num_groups=config.num_groups
)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
# Project, apply dropout, then add the residual and normalize
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
class ConvBertLayer(nn.Module):
def __init__(self, config):
super().__init__()
# Chunk size for the feed-forward pass along the sequence dimension
self.chunk_size_feed_forward = config.chunk_size_feed_forward
self.seq_len_dim = 1
self.attention = ConvBertAttention(config)
self.is_decoder = config.is_decoder
self.add_cross_attention = config.add_cross_attention
if self.add_cross_attention:
# Cross-attention only makes sense for a decoder
if not self.is_decoder:
raise TypeError(f"{self} should be used as a decoder model if cross attention is added")
self.crossattention = ConvBertAttention(config)
self.intermediate = ConvBertIntermediate(config)
self.output = ConvBertOutput(config)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor, Optional[torch.FloatTensor]]:
self_attention_outputs = self.attention(
hidden_states,
attention_mask,
head_mask,
output_attentions=output_attentions,
)
attention_output = self_attention_outputs[0]
outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights
if self.is_decoder and encoder_hidden_states is not None:
if not hasattr(self, "crossattention"):
raise AttributeError(
f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers"
" by setting `config.add_cross_attention=True`"
)
cross_attention_outputs = self.crossattention(
attention_output,
encoder_attention_mask,
head_mask,
encoder_hidden_states,
output_attentions,
)
attention_output = cross_attention_outputs[0]
outputs = outputs + cross_attention_outputs[1:]  # add cross attentions if we output attention weights
# Run the feed-forward network in chunks along the sequence dimension to cap peak memory
layer_output = apply_chunking_to_forward(
self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
)
outputs = (layer_output,) + outputs
return outputs
def feed_forward_chunk(self, attention_output):
intermediate_output = self.intermediate(attention_output)
layer_output = self.output(intermediate_output, attention_output)
return layer_output
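apply_chunking_to_forward splits the sequence dimension into chunks of chunk_size_feed_forward tokens and runs the FFN on each, which lowers peak memory without changing the result; a quick equivalence check:

import torch
from transformers.pytorch_utils import apply_chunking_to_forward

ffn = torch.nn.Linear(16, 16)
hidden = torch.randn(2, 128, 16)
chunked = apply_chunking_to_forward(ffn, 32, 1, hidden)  # 32 tokens at a time along dim 1
print(torch.allclose(ffn(hidden), chunked, atol=1e-6))  # True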
class ConvBertEncoder(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
# One ConvBertLayer per hidden layer
self.layer = nn.ModuleList([ConvBertLayer(config) for _ in range(config.num_hidden_layers)])
self.gradient_checkpointing = False
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = False,
output_hidden_states: Optional[bool] = False,
return_dict: Optional[bool] = True,
) -> Union[Tuple, BaseModelOutputWithCrossAttentions]:
# Accumulators for the optional outputs
all_hidden_states = () if output_hidden_states else None
all_self_attentions = () if output_attentions else None
all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
for i, layer_module in enumerate(self.layer):
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
layer_head_mask = head_mask[i] if head_mask is not None else None
if self.gradient_checkpointing and self.training:
# Trade compute for memory: recompute activations during the backward pass
layer_outputs = self._gradient_checkpointing_func(
layer_module.__call__,
hidden_states,
attention_mask,
layer_head_mask,
encoder_hidden_states,
encoder_attention_mask,
output_attentions,
)
else:
layer_outputs = layer_module(
hidden_states,
attention_mask,
layer_head_mask,
encoder_hidden_states,
encoder_attention_mask,
output_attentions,
)
hidden_states = layer_outputs[0]
if output_attentions:
all_self_attentions = all_self_attentions + (layer_outputs[1],)
if self.config.add_cross_attention:
all_cross_attentions = all_cross_attentions + (layer_outputs[2],)
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
return tuple(
v
for v in [hidden_states, all_hidden_states, all_self_attentions, all_cross_attentions]
if v is not None
)
return BaseModelOutputWithCrossAttentions(
last_hidden_state=hidden_states,
hidden_states=all_hidden_states,
attentions=all_self_attentions,
cross_attentions=all_cross_attentions,
)
class ConvBertPredictionHeadTransform(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
# Activation chosen from the config (string name or callable)
if isinstance(config.hidden_act, str):
self.transform_act_fn = ACT2FN[config.hidden_act]
else:
self.transform_act_fn = config.hidden_act
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
# Dense projection, activation, then LayerNorm
hidden_states = self.dense(hidden_states)
hidden_states = self.transform_act_fn(hidden_states)
hidden_states = self.LayerNorm(hidden_states)
return hidden_states
# CONVBERT_INPUTS_DOCSTRING documents the input arguments shared by all ConvBERT model heads.
CONVBERT_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `({0})`):
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
[What are attention masks?](../glossary#attention-mask)
token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
[What are token type IDs?](../glossary#token-type-ids)
position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
[What are position IDs?](../glossary#position-ids)
head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
output_attentions (`bool`, *optional*):
output_hidden_states (`bool`, *optional*):
return_dict (`bool`, *optional*):
"""
# ConvBertModel inherits from ConvBertPreTrainedModel and outputs raw hidden states without a task head.
@add_start_docstrings(
"The bare ConvBERT Model transformer outputting raw hidden-states without any specific head on top.",
CONVBERT_START_DOCSTRING,
)
class ConvBertModel(ConvBertPreTrainedModel):
def __init__(self, config):
super().__init__(config)
# Token/position/segment embeddings
self.embeddings = ConvBertEmbeddings(config)
# Project the embeddings up to hidden_size when the two dimensions differ
if config.embedding_size != config.hidden_size:
self.embeddings_project = nn.Linear(config.embedding_size, config.hidden_size)
self.encoder = ConvBertEncoder(config)
self.config = config
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.embeddings.word_embeddings
def set_input_embeddings(self, value):
self.embeddings.word_embeddings = value
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}. See
the base class PreTrainedModel.
"""
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
@add_start_docstrings_to_model_forward(CONVBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithCrossAttentions,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithCrossAttentions]:
# Fall back to the config defaults when the flags are not given
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
elif input_ids is not None:
self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
input_shape = input_ids.size()
elif inputs_embeds is not None:
input_shape = inputs_embeds.size()[:-1]
else:
raise ValueError("You have to specify either input_ids or inputs_embeds")
batch_size, seq_length = input_shape
device = input_ids.device if input_ids is not None else inputs_embeds.device
# Default to attending to every position
if attention_mask is None:
attention_mask = torch.ones(input_shape, device=device)
if token_type_ids is None:
# Reuse the registered all-zeros buffer when available, otherwise build one
if hasattr(self.embeddings, "token_type_ids"):
buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length]
buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length)
token_type_ids = buffered_token_type_ids_expanded
else:
token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
# Broadcast the 2D mask to the 4D additive mask used inside attention
extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape)
head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
hidden_states = self.embeddings(
input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds
)
# Map embedding_size up to hidden_size if the model was built with a projection
if hasattr(self, "embeddings_project"):
hidden_states = self.embeddings_project(hidden_states)
hidden_states = self.encoder(
hidden_states,
attention_mask=extended_attention_mask,
head_mask=head_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
return hidden_states
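End-to-end usage of the bare model (a minimal sketch using the public checkpoint):

import torch
from transformers import AutoTokenizer, ConvBertModel

tokenizer = AutoTokenizer.from_pretrained("YituTech/conv-bert-base")
model = ConvBertModel.from_pretrained("YituTech/conv-bert-base")
inputs = tokenizer("ConvBERT replaces some attention heads with convolutions.", return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)
print(outputs.last_hidden_state.shape)  # (1, seq_len, 768)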
# Prediction head used on top of the generator's hidden states
class ConvBertGeneratorPredictions(nn.Module):
"""Prediction module for the generator, made up of two dense layers."""
def __init__(self, config):
super().__init__()
self.activation = get_activation("gelu")
self.LayerNorm = nn.LayerNorm(config.embedding_size, eps=config.layer_norm_eps)
self.dense = nn.Linear(config.hidden_size, config.embedding_size)
def forward(self, generator_hidden_states: torch.FloatTensor) -> torch.FloatTensor:
# Dense projection down to embedding_size, GELU, then LayerNorm
hidden_states = self.dense(generator_hidden_states)
hidden_states = self.activation(hidden_states)
hidden_states = self.LayerNorm(hidden_states)
return hidden_states
@add_start_docstrings("""ConvBERT Model with a `language modeling` head on top.""", CONVBERT_START_DOCSTRING)
class ConvBertForMaskedLM(ConvBertPreTrainedModel):
_tied_weights_keys = ["generator.lm_head.weight"]
def __init__(self, config):
super().__init__(config)
self.convbert = ConvBertModel(config)
self.generator_predictions = ConvBertGeneratorPredictions(config)
# LM head projecting embedding_size back onto the vocabulary
self.generator_lm_head = nn.Linear(config.embedding_size, config.vocab_size)
# Initialize weights and apply final processing
self.post_init()
def get_output_embeddings(self):
return self.generator_lm_head
def set_output_embeddings(self, word_embeddings):
self.generator_lm_head = word_embeddings
@add_start_docstrings_to_model_forward(CONVBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=MaskedLMOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, MaskedLMOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
generator_hidden_states = self.convbert(
input_ids,
attention_mask,
token_type_ids,
position_ids,
head_mask,
inputs_embeds,
output_attentions,
output_hidden_states,
return_dict,
)
generator_sequence_output = generator_hidden_states[0]
# Two-step head: transform the sequence output, then project onto the vocabulary
prediction_scores = self.generator_predictions(generator_sequence_output)
prediction_scores = self.generator_lm_head(prediction_scores)
loss = None
if labels is not None:
# Cross-entropy over the vocabulary; label -100 marks ignored (unmasked/padding) positions
loss_fct = nn.CrossEntropyLoss()  # -100 index = padding token
loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
if not return_dict:
output = (prediction_scores,) + generator_hidden_states[1:]
return ((loss,) + output) if loss is not None else output
return MaskedLMOutput(
loss=loss,
logits=prediction_scores,
hidden_states=generator_hidden_states.hidden_states,
attentions=generator_hidden_states.attentions,
)
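A masked-LM usage sketch (the predicted token depends on the checkpoint, so the decoded output is only illustrative):

import torch
from transformers import AutoTokenizer, ConvBertForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("YituTech/conv-bert-base")
model = ConvBertForMaskedLM.from_pretrained("YituTech/conv-bert-base")
inputs = tokenizer("The capital of France is [MASK].", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits
mask_pos = (inputs.input_ids == tokenizer.mask_token_id).nonzero(as_tuple=True)[1]
print(tokenizer.decode(logits[0, mask_pos].argmax(-1)))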
class ConvBertClassificationHead(nn.Module):
"""Head for sentence-level classification tasks."""
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
# Fall back to the hidden dropout probability when no classifier dropout is configured
classifier_dropout = (
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
self.dropout = nn.Dropout(classifier_dropout)
self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
self.config = config
def forward(self, hidden_states: torch.Tensor, **kwargs) -> torch.Tensor:
x = hidden_states[:, 0, :]  # take <s> token (equiv. to [CLS])
x = self.dropout(x)
x = self.dense(x)
# Apply the configured hidden activation
x = ACT2FN[self.config.hidden_act](x)
x = self.dropout(x)
x = self.out_proj(x)
return x
@add_start_docstrings(
"""
ConvBERT Model transformer with a sequence classification/regression head on top (a linear layer on top of the
pooled output) e.g. for GLUE tasks.
""",
CONVBERT_START_DOCSTRING,
)
class ConvBertForSequenceClassification(ConvBertPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.config = config
self.convbert = ConvBertModel(config)
self.classifier = ConvBertClassificationHead(config)
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(CONVBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=SequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, SequenceClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.convbert(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
logits = self.classifier(sequence_output)
loss = None
if labels is not None:
# Infer the problem type from num_labels and the label dtype if it is not set
if self.config.problem_type is None:
if self.num_labels == 1:
self.config.problem_type = "regression"
elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
self.config.problem_type = "single_label_classification"
else:
self.config.problem_type = "multi_label_classification"
if self.config.problem_type == "regression":
loss_fct = MSELoss()
if self.num_labels == 1:
loss = loss_fct(logits.squeeze(), labels.squeeze())
else:
loss = loss_fct(logits, labels)
elif self.config.problem_type == "single_label_classification":
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(logits, labels)
if not return_dict:
output = (logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
return SequenceClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
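A self-contained sketch of the label-driven loss selection (random weights, so the loss value itself is meaningless): integer labels with num_labels=3 trigger the single-label cross-entropy path.

import torch
from transformers import ConvBertConfig, ConvBertForSequenceClassification

model = ConvBertForSequenceClassification(ConvBertConfig(num_labels=3))
input_ids = torch.randint(0, 30522, (2, 16))
labels = torch.tensor([0, 2])
out = model(input_ids=input_ids, labels=labels)
print(out.loss, out.logits.shape)  # scalar cross-entropy loss, torch.Size([2, 3])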
@add_start_docstrings(
"""
ConvBERT Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
softmax) e.g. for RocStories/SWAG tasks.
""",
CONVBERT_START_DOCSTRING,
)
class ConvBertForMultipleChoice(ConvBertPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.convbert = ConvBertModel(config)
# Pools the per-choice sequence output into a single vector
self.sequence_summary = SequenceSummary(config)
self.classifier = nn.Linear(config.hidden_size, 1)
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(
CONVBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")
)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=MultipleChoiceModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, MultipleChoiceModelOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
`input_ids` above)
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
# Flatten (batch, num_choices, ...) into (batch * num_choices, ...) so ConvBERT sees one sequence per row
input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
inputs_embeds = (
inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
if inputs_embeds is not None
else None
)
outputs = self.convbert(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
# Pool each sequence, score it, then regroup the scores by choice
pooled_output = self.sequence_summary(sequence_output)
logits = self.classifier(pooled_output)
reshaped_logits = logits.view(-1, num_choices)
loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
loss = loss_fct(reshaped_logits, labels)
if not return_dict:
output = (reshaped_logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
return MultipleChoiceModelOutput(
loss=loss,
logits=reshaped_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
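The flatten-then-regroup pattern is easiest to see from the shapes (random weights; a sketch only):

import torch
from transformers import ConvBertConfig, ConvBertForMultipleChoice

model = ConvBertForMultipleChoice(ConvBertConfig())
input_ids = torch.randint(0, 30522, (2, 4, 16))  # (batch, num_choices, seq_len)
labels = torch.tensor([1, 3])  # index of the correct choice per example
out = model(input_ids=input_ids, labels=labels)
print(out.logits.shape)  # torch.Size([2, 4]) -- one score per choice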
@add_start_docstrings(
"""
ConvBERT Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
Named-Entity-Recognition (NER) tasks.
""",
CONVBERT_START_DOCSTRING,
)
class ConvBertForTokenClassification(ConvBertPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.convbert = ConvBertModel(config)
classifier_dropout = (
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
self.dropout = nn.Dropout(classifier_dropout)
# Per-token classifier on top of the hidden states
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(CONVBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TokenClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, TokenClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.convbert(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
# Dropout, then classify every token position
sequence_output = self.dropout(sequence_output)
logits = self.classifier(sequence_output)
loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
# Flatten to (batch * seq_len, num_labels) vs (batch * seq_len,) for the loss
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
if not return_dict:
output = (logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
return TokenClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
@add_start_docstrings(
"""
ConvBERT Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
CONVBERT_START_DOCSTRING,
)
class ConvBertForQuestionAnswering(ConvBertPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.convbert = ConvBertModel(config)
# Linear layer producing the span-start and span-end logits
self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(CONVBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=QuestionAnsweringModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
start_positions: Optional[torch.LongTensor] = None,
end_positions: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, QuestionAnsweringModelOutput]:
r"""
start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.convbert(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
logits = self.qa_outputs(sequence_output)
# Split the last dimension into start and end logits
start_logits, end_logits = logits.split(1, dim=-1)
start_logits = start_logits.squeeze(-1).contiguous()
end_logits = end_logits.squeeze(-1).contiguous()
total_loss = None
if start_positions is not None and end_positions is not None:
# If the positions carry an extra dimension (e.g. from multi-GPU gather), squeeze it
if len(start_positions.size()) > 1:
start_positions = start_positions.squeeze(-1)
if len(end_positions.size()) > 1:
end_positions = end_positions.squeeze(-1)
# Clamp positions outside the sequence and ignore them in the loss
ignored_index = start_logits.size(1)
start_positions = start_positions.clamp(0, ignored_index)
end_positions = end_positions.clamp(0, ignored_index)
loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
start_loss = loss_fct(start_logits, start_positions)
end_loss = loss_fct(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2
if not return_dict:
output = (start_logits, end_logits) + outputs[1:]
return ((total_loss,) + output) if total_loss is not None else output
return QuestionAnsweringModelOutput(
loss=total_loss,
start_logits=start_logits,
end_logits=end_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
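A span-extraction sketch; note that loading the base checkpoint leaves qa_outputs randomly initialized, so the extracted span is meaningless until the model is fine-tuned on a QA dataset:

import torch
from transformers import AutoTokenizer, ConvBertForQuestionAnswering

tokenizer = AutoTokenizer.from_pretrained("YituTech/conv-bert-base")
model = ConvBertForQuestionAnswering.from_pretrained("YituTech/conv-bert-base")
inputs = tokenizer("Who wrote Hamlet?", "Hamlet was written by Shakespeare.", return_tensors="pt")
with torch.no_grad():
    out = model(**inputs)
start = out.start_logits.argmax(-1).item()
end = out.end_logits.argmax(-1).item()
print(tokenizer.decode(inputs.input_ids[0, start : end + 1]))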