Transformers Source Code Walkthrough (28)
.\models\conditional_detr\feature_extraction_conditional_detr.py
"""
Feature extractor class for Conditional DETR.
"""
import warnings
from ...image_transforms import rgb_to_id as _rgb_to_id
from ...utils import logging
from .image_processing_conditional_detr import ConditionalDetrImageProcessor
logger = logging.get_logger(__name__)
def rgb_to_id(x):
warnings.warn(
"rgb_to_id has moved and will not be importable from this module from v5. "
"Please import from transformers.image_transforms instead.",
FutureWarning,
)
return _rgb_to_id(x)
class ConditionalDetrFeatureExtractor(ConditionalDetrImageProcessor):
def __init__(self, *args, **kwargs) -> None:
warnings.warn(
"The class ConditionalDetrFeatureExtractor is deprecated and will be removed in version 5 of Transformers."
" Please use ConditionalDetrImageProcessor instead.",
FutureWarning,
)
super().__init__(*args, **kwargs)
.\models\conditional_detr\image_processing_conditional_detr.py
"""Conditional DETR 的图像处理器类。"""
import io
import pathlib
from collections import defaultdict
from typing import Any, Callable, Dict, Iterable, List, Optional, Set, Tuple, Union
import numpy as np
from ...feature_extraction_utils import BatchFeature
from ...image_processing_utils import BaseImageProcessor, get_size_dict
from ...image_transforms import (
PaddingMode,
center_to_corners_format,
corners_to_center_format,
id_to_rgb,
pad,
rescale,
resize,
rgb_to_id,
to_channel_dimension_format,
)
from ...image_utils import (
IMAGENET_DEFAULT_MEAN,
IMAGENET_DEFAULT_STD,
AnnotationFormat,
AnnotationType,
ChannelDimension,
ImageInput,
PILImageResampling,
get_image_size,
infer_channel_dimension_format,
is_scaled_image,
make_list_of_images,
to_numpy_array,
valid_images,
validate_annotations,
validate_kwargs,
validate_preprocess_arguments,
)
from ...utils import (
TensorType,
is_flax_available,
is_jax_tensor,
is_scipy_available,
is_tf_available,
is_tf_tensor,
is_torch_available,
is_torch_tensor,
is_vision_available,
logging,
)
if is_torch_available():
import torch
from torch import nn
if is_vision_available():
import PIL
if is_scipy_available():
import scipy.special
import scipy.stats
logger = logging.get_logger(__name__)
SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)
def get_size_with_aspect_ratio(image_size, size, max_size=None) -> Tuple[int, int]:
"""
根据输入图像大小和所需输出大小计算输出图像的尺寸。
Args:
image_size (`Tuple[int, int]`):
输入图像的尺寸.
size (`int`):
所需的输出尺寸.
max_size (`int`, *optional*):
允许的最大输出尺寸.
Returns:
Tuple[int, int]: 输出图像的高度和宽度.
"""
height, width = image_size
if max_size is not None:
min_original_size = float(min((height, width)))
max_original_size = float(max((height, width)))
if max_original_size / min_original_size * size > max_size:
size = int(round(max_size * min_original_size / max_original_size))
if (height <= width and height == size) or (width <= height and width == size):
return height, width
if width < height:
ow = size
oh = int(size * height / width)
else:
oh = size
ow = int(size * width / height)
return (oh, ow)
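To make the arithmetic above concrete, here is a small illustrative check (hand-worked values, not part of the original file):
# A 480x640 (height, width) image, shortest edge resized to 800, longest edge capped at 1333:
# 640/480 * 800 ≈ 1067 <= 1333, so the cap does not kick in.
print(get_size_with_aspect_ratio((480, 640), size=800, max_size=1333))  # (800, 1066)
# A very wide 500x2000 image: 2000/500 * 800 = 3200 > 1333, so size is first shrunk
# to round(1333 * 500 / 2000) = 333, giving a longest edge of 1332.
print(get_size_with_aspect_ratio((500, 2000), size=800, max_size=1333))  # (333, 1332)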
def get_resize_output_image_size(
input_image: np.ndarray,
size: Union[int, Tuple[int, int], List[int]],
max_size: Optional[int] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> Tuple[int, int]:
"""
Computes the output image size given the input image size and the desired output size. If the desired output size
is a tuple or list, the output image size is returned as is. If the desired output size is an integer, the output
image size is computed by keeping the aspect ratio of the input image size.
Args:
input_image (`np.ndarray`):
The image to resize.
size (`int` or `Tuple[int, int]` or `List[int]`):
The desired output size.
max_size (`int`, *optional*):
The maximum allowed output size.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred from the input image.
"""
image_size = get_image_size(input_image, input_data_format)
if isinstance(size, (list, tuple)):
return size
return get_size_with_aspect_ratio(image_size, size, max_size)
def get_numpy_to_framework_fn(arr) -> Callable:
"""
Returns a function that converts a numpy array to the framework of the input array.
Args:
arr (`np.ndarray`): The array to convert.
"""
if isinstance(arr, np.ndarray):
return np.array
if is_tf_available() and is_tf_tensor(arr):
import tensorflow as tf
return tf.convert_to_tensor
if is_torch_available() and is_torch_tensor(arr):
import torch
return torch.tensor
if is_flax_available() and is_jax_tensor(arr):
import jax.numpy as jnp
return jnp.array
raise ValueError(f"Cannot convert arrays of type {type(arr)}")
def safe_squeeze(arr: np.ndarray, axis: Optional[int] = None) -> np.ndarray:
"""
Squeezes an array, but only if the axis specified has dim 1.
"""
if axis is None:
return arr.squeeze()
try:
return arr.squeeze(axis=axis)
except ValueError:
return arr
def normalize_annotation(annotation: Dict, image_size: Tuple[int, int]) -> Dict:
image_height, image_width = image_size
norm_annotation = {}
for key, value in annotation.items():
if key == "boxes":
boxes = value
boxes = corners_to_center_format(boxes)
boxes /= np.asarray([image_width, image_height, image_width, image_height], dtype=np.float32)
norm_annotation[key] = boxes
else:
norm_annotation[key] = value
return norm_annotation
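A minimal sketch of what this normalization does to a single box (toy values, assuming the helper above is in scope):
import numpy as np
# One box in corners format (x0, y0, x1, y1) on a 100x200 (height, width) image
annotation = {"boxes": np.array([[20.0, 10.0, 60.0, 50.0]]), "class_labels": np.array([3])}
normalized = normalize_annotation(annotation, image_size=(100, 200))
# Corners -> (center_x, center_y, width, height) = (40, 30, 40, 40), then divided by (W, H, W, H)
print(normalized["boxes"])  # [[0.2 0.3 0.2 0.4]]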
def max_across_indices(values: Iterable[Any]) -> List[Any]:
"""
Return the maximum value across all indices of an iterable of values.
"""
return [max(values_i) for values_i in zip(*values)]
def get_max_height_width(
images: List[np.ndarray], input_data_format: Optional[Union[str, ChannelDimension]] = None
) -> List[int]:
"""
Get the maximum height and width across all images in a batch.
"""
if input_data_format is None:
input_data_format = infer_channel_dimension_format(images[0])
if input_data_format == ChannelDimension.FIRST:
_, max_height, max_width = max_across_indices([img.shape for img in images])
elif input_data_format == ChannelDimension.LAST:
max_height, max_width, _ = max_across_indices([img.shape for img in images])
else:
raise ValueError(f"Invalid channel dimension format: {input_data_format}")
return (max_height, max_width)
def make_pixel_mask(
image: np.ndarray, output_size: Tuple[int, int], input_data_format: Optional[Union[str, ChannelDimension]] = None
) -> np.ndarray:
"""
Make a pixel mask for the image, where 1 indicates a valid pixel and 0 indicates padding.
Args:
image (`np.ndarray`):
Image to make the pixel mask for.
output_size (`Tuple[int, int]`):
Output size of the mask.
"""
input_height, input_width = get_image_size(image, channel_dim=input_data_format)
mask = np.zeros(output_size, dtype=np.int64)
mask[:input_height, :input_width] = 1
return mask
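For example (toy shapes, not from the original post), a 4x5 image placed on a 6x7 padded canvas yields a mask whose ones mark the real pixels:
import numpy as np
image = np.zeros((3, 4, 5))  # channels-first, height 4, width 5
mask = make_pixel_mask(image, output_size=(6, 7))
print(mask.shape)  # (6, 7)
print(mask.sum())  # 20 -- only the top-left 4x5 region is marked valid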
def convert_coco_poly_to_mask(segmentations, height: int, width: int) -> np.ndarray:
"""
Convert a COCO polygon annotation to a mask.
Args:
segmentations (`List[List[float]]`):
List of polygons, each polygon represented by a list of x-y coordinates.
height (`int`):
Height of the mask.
width (`int`):
Width of the mask.
"""
try:
from pycocotools import mask as coco_mask
except ImportError:
raise ImportError("Pycocotools is not installed in your environment.")
masks = []
for polygons in segmentations:
rles = coco_mask.frPyObjects(polygons, height, width)
mask = coco_mask.decode(rles)
if len(mask.shape) < 3:
mask = mask[..., None]
mask = np.asarray(mask, dtype=np.uint8)
mask = np.any(mask, axis=2)
masks.append(mask)
if masks:
masks = np.stack(masks, axis=0)
else:
masks = np.zeros((0, height, width), dtype=np.uint8)
return masks
def prepare_coco_detection_annotation(
image,
target,
return_segmentation_masks: bool = False,
input_data_format: Optional[Union[ChannelDimension, str]] = None,
):
"""
Convert a COCO-format target into the format expected by ConditionalDetr.
"""
image_height, image_width = get_image_size(image, channel_dim=input_data_format)
image_id = target["image_id"]
image_id = np.asarray([image_id], dtype=np.int64)
annotations = target["annotations"]
annotations = [obj for obj in annotations if "iscrowd" not in obj or obj["iscrowd"] == 0]
classes = [obj["category_id"] for obj in annotations]
classes = np.asarray(classes, dtype=np.int64)
area = np.asarray([obj["area"] for obj in annotations], dtype=np.float32)
iscrowd = np.asarray([obj["iscrowd"] if "iscrowd" in obj else 0 for obj in annotations], dtype=np.int64)
boxes = [obj["bbox"] for obj in annotations]
boxes = np.asarray(boxes, dtype=np.float32).reshape(-1, 4)
boxes[:, 2:] += boxes[:, :2]
boxes[:, 0::2] = boxes[:, 0::2].clip(min=0, max=image_width)
boxes[:, 1::2] = boxes[:, 1::2].clip(min=0, max=image_height)
keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0])
new_target = {}
new_target["image_id"] = image_id
new_target["class_labels"] = classes[keep]
new_target["boxes"] = boxes[keep]
new_target["area"] = area[keep]
new_target["iscrowd"] = iscrowd[keep]
new_target["orig_size"] = np.asarray([int(image_height), int(image_width)], dtype=np.int64)
if annotations and "keypoints" in annotations[0]:
keypoints = [obj["keypoints"] for obj in annotations]
keypoints = np.asarray(keypoints, dtype=np.float32)
keypoints = keypoints[keep]
num_keypoints = keypoints.shape[0]
keypoints = keypoints.reshape((-1, 3)) if num_keypoints else keypoints
new_target["keypoints"] = keypoints
if return_segmentation_masks:
segmentation_masks = [obj["segmentation"] for obj in annotations]
masks = convert_coco_poly_to_mask(segmentation_masks, image_height, image_width)
new_target["masks"] = masks[keep]
return new_target
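A hypothetical single-object COCO target, to show the XYWH-to-XYXY conversion and filtering above (toy values, not from the original post):
import numpy as np
image = np.zeros((3, 100, 150))  # dummy 100x150 image, channels-first
target = {
    "image_id": 1,
    "annotations": [
        {"category_id": 17, "bbox": [10.0, 20.0, 30.0, 40.0], "area": 1200.0, "iscrowd": 0},
    ],
}
new_target = prepare_coco_detection_annotation(image, target)
print(new_target["boxes"])         # [[10. 20. 40. 60.]] -- (x, y, w, h) -> (x0, y0, x1, y1), clipped to the image
print(new_target["class_labels"])  # [17]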
def masks_to_boxes(masks: np.ndarray) -> np.ndarray:
"""
Compute the bounding boxes around the provided panoptic segmentation masks.
Args:
masks: masks in format `[number_masks, height, width]` where number_masks is the number of masks
Returns:
boxes: bounding boxes in `[number_masks, 4]`, xyxy format
"""
if masks.size == 0:
return np.zeros((0, 4))
h, w = masks.shape[-2:]
y = np.arange(0, h, dtype=np.float32)
x = np.arange(0, w, dtype=np.float32)
y, x = np.meshgrid(y, x, indexing="ij")
x_mask = masks * np.expand_dims(x, axis=0)
x_max = x_mask.reshape(x_mask.shape[0], -1).max(-1)
x = np.ma.array(x_mask, mask=~(np.array(masks, dtype=bool)))
x_min = x.filled(fill_value=1e8)
x_min = x_min.reshape(x_min.shape[0], -1).min(-1)
y_mask = masks * np.expand_dims(y, axis=0)
y_max = y_mask.reshape(x_mask.shape[0], -1).max(-1)
y = np.ma.array(y_mask, mask=~(np.array(masks, dtype=bool)))
y_min = y.filled(fill_value=1e8)
y_min = y_min.reshape(y_min.shape[0], -1).min(-1)
return np.stack([x_min, y_min, x_max, y_max], 1)
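A small sanity check of the masked-array trick above (toy mask, values worked out by hand):
import numpy as np
mask = np.zeros((1, 5, 5), dtype=np.uint8)
mask[0, 1:4, 2:5] = 1  # a blob covering rows 1..3 and columns 2..4
print(masks_to_boxes(mask))  # [[2. 1. 4. 3.]] -- (x_min, y_min, x_max, y_max)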
def prepare_coco_panoptic_annotation(
image: np.ndarray,
target: Dict,
masks_path: Union[str, pathlib.Path],
return_masks: bool = True,
input_data_format: Union[ChannelDimension, str] = None,
) -> Dict:
"""
Prepare a coco panoptic annotation for ConditionalDetr.
"""
image_height, image_width = get_image_size(image, channel_dim=input_data_format)
annotation_path = pathlib.Path(masks_path) / target["file_name"]
new_target = {}
new_target["image_id"] = np.asarray([target["image_id"] if "image_id" in target else target["id"]], dtype=np.int64)
new_target["size"] = np.asarray([image_height, image_width], dtype=np.int64)
new_target["orig_size"] = np.asarray([image_height, image_width], dtype=np.int64)
if "segments_info" in target:
masks = np.asarray(PIL.Image.open(annotation_path), dtype=np.uint32)
masks = rgb_to_id(masks)
ids = np.array([segment_info["id"] for segment_info in target["segments_info"]])
masks = masks == ids[:, None, None]
masks = masks.astype(np.uint8)
if return_masks:
new_target["masks"] = masks
new_target["boxes"] = masks_to_boxes(masks)
new_target["class_labels"] = np.array(
[segment_info["category_id"] for segment_info in target["segments_info"]], dtype=np.int64
)
new_target["iscrowd"] = np.asarray(
[segment_info["iscrowd"] for segment_info in target["segments_info"]], dtype=np.int64
)
new_target["area"] = np.asarray(
[segment_info["area"] for segment_info in target["segments_info"]], dtype=np.float32
)
return new_target
def get_segmentation_image(
masks: np.ndarray, input_size: Tuple, target_size: Tuple, stuff_equiv_classes, deduplicate=False
):
h, w = input_size
final_h, final_w = target_size
m_id = scipy.special.softmax(masks.transpose(0, 1), -1)
if m_id.shape[-1] == 0:
m_id = np.zeros((h, w), dtype=np.int64)
else:
m_id = m_id.argmax(-1).reshape(h, w)
if deduplicate:
for equiv in stuff_equiv_classes.values():
for eq_id in equiv:
m_id[m_id == eq_id] = equiv[0]
seg_img = id_to_rgb(m_id)
seg_img = resize(seg_img, (final_w, final_h), resample=PILImageResampling.NEAREST)
return seg_img
def get_mask_area(seg_img: np.ndarray, target_size: Tuple[int, int], n_classes: int) -> np.ndarray:
final_h, final_w = target_size
np_seg_img = seg_img.astype(np.uint8)
np_seg_img = np_seg_img.reshape(final_h, final_w, 3)
m_id = rgb_to_id(np_seg_img)
area = [(m_id == i).sum() for i in range(n_classes)]
return area
def score_labels_from_class_probabilities(logits: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
probs = scipy.special.softmax(logits, axis=-1)
labels = probs.argmax(-1, keepdims=True)
scores = np.take_along_axis(probs, labels, axis=-1)
scores, labels = scores.squeeze(-1), labels.squeeze(-1)
return scores, labels
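Concretely (toy logits, scipy required as guarded above), the helper picks the argmax class and its softmax probability per query:
import numpy as np
logits = np.array([[2.0, 0.0, 0.0]])
scores, labels = score_labels_from_class_probabilities(logits)
print(labels)  # [0]
print(scores)  # [0.78698...] == exp(2) / (exp(2) + exp(0) + exp(0))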
def post_process_panoptic_sample(
out_logits: np.ndarray,
masks: np.ndarray,
boxes: np.ndarray,
processed_size: Tuple[int, int],
target_size: Tuple[int, int],
is_thing_map: Dict,
threshold=0.85,
) -> Dict:
"""
Converts the output of [`ConditionalDetrForSegmentation`] into panoptic segmentation predictions for a single sample.
Args:
out_logits (`torch.Tensor`):
The logits for this sample.
masks (`torch.Tensor`):
The predicted segmentation masks for this sample.
boxes (`torch.Tensor`):
The predicted bounding boxes for this sample. The boxes are in the normalized format `(center_x, center_y,
width, height)` with values in `[0, 1]`, relative to the size of the image (disregarding padding).
processed_size (`Tuple[int, int]`):
The processed size of the image `(height, width)`, as returned by the preprocessing step i.e. the size
after data augmentation but before batching.
target_size (`Tuple[int, int]`):
The target size of the image, `(height, width)` corresponding to the requested final size of the
prediction.
is_thing_map (`Dict`):
A dictionary mapping class indices to a boolean value indicating whether the class is a thing or not.
threshold (`float`, *optional*, defaults to 0.85):
The threshold used to binarize the segmentation masks.
"""
scores, labels = score_labels_from_class_probabilities(out_logits)
keep = (labels != out_logits.shape[-1] - 1) & (scores > threshold)
cur_scores = scores[keep]
cur_classes = labels[keep]
cur_boxes = center_to_corners_format(boxes[keep])
if len(cur_boxes) != len(cur_classes):
raise ValueError("Not as many boxes as there are classes")
cur_masks = masks[keep]
cur_masks = resize(cur_masks[:, None], processed_size, resample=PILImageResampling.BILINEAR)
cur_masks = safe_squeeze(cur_masks, 1)
b, h, w = cur_masks.shape
cur_masks = cur_masks.reshape(b, -1)
stuff_equiv_classes = defaultdict(list)
for k, label in enumerate(cur_classes):
if not is_thing_map[label]:
stuff_equiv_classes[label].append(k)
seg_img = get_segmentation_image(cur_masks, processed_size, target_size, stuff_equiv_classes, deduplicate=True)
area = get_mask_area(cur_masks, processed_size, n_classes=len(cur_scores))
if cur_classes.size > 0:  # `size` is a numpy attribute, not a method
filtered_small = np.array([a <= 4 for a in area], dtype=bool)
while filtered_small.any():
cur_masks = cur_masks[~filtered_small]
cur_scores = cur_scores[~filtered_small]
cur_classes = cur_classes[~filtered_small]
seg_img = get_segmentation_image(cur_masks, (h, w), target_size, stuff_equiv_classes, deduplicate=True)
area = get_mask_area(seg_img, target_size, n_classes=len(cur_scores))
filtered_small = np.array([a <= 4 for a in area], dtype=bool)
else:
cur_classes = np.ones((1, 1), dtype=np.int64)
segments_info = [
{"id": i, "isthing": is_thing_map[cat], "category_id": int(cat), "area": a}
for i, (cat, a) in enumerate(zip(cur_classes, area))
]
del cur_classes
with io.BytesIO() as out:
PIL.Image.fromarray(seg_img).save(out, format="PNG")
predictions = {"png_string": out.getvalue(), "segments_info": segments_info}
return predictions
def resize_annotation(
annotation: Dict[str, Any],
orig_size: Tuple[int, int],
target_size: Tuple[int, int],
threshold: float = 0.5,
resample: PILImageResampling = PILImageResampling.NEAREST,
):
"""
Resizes an annotation to a target size.
Args:
annotation (`Dict[str, Any]`):
The annotation dictionary.
orig_size (`Tuple[int, int]`):
The original size of the input image.
target_size (`Tuple[int, int]`):
The target size of the image, as returned by the preprocessing `resize` step.
threshold (`float`, *optional*, defaults to 0.5):
The threshold used to binarize the segmentation masks.
resample (`PILImageResampling`, defaults to `PILImageResampling.NEAREST`):
The resampling filter to use when resizing the masks.
"""
ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(target_size, orig_size))
ratio_height, ratio_width = ratios
new_annotation = {}
new_annotation["size"] = target_size
for key, value in annotation.items():
if key == "boxes":
boxes = value
scaled_boxes = boxes * np.asarray([ratio_width, ratio_height, ratio_width, ratio_height], dtype=np.float32)
new_annotation["boxes"] = scaled_boxes
elif key == "area":
area = value
scaled_area = area * (ratio_width * ratio_height)
new_annotation["area"] = scaled_area
elif key == "masks":
masks = value[:, None]
masks = np.array([resize(mask, target_size, resample=resample) for mask in masks])
masks = masks.astype(np.float32)
masks = masks[:, 0] > threshold
new_annotation["masks"] = masks
elif key == "size":
new_annotation["size"] = target_size
else:
new_annotation[key] = value
return new_annotation
def binary_mask_to_rle(mask):
"""
Converts given binary mask of shape `(height, width)` to the run-length encoding (RLE) format.
Args:
mask (`torch.Tensor` or `numpy.array`):
A binary mask tensor of shape `(height, width)` where 0 denotes background and 1 denotes the target
segment_id or class_id.
Returns:
`List`: Run-length encoded list of the binary mask. Refer to COCO API for more information about the RLE
format.
"""
if is_torch_tensor(mask):
mask = mask.numpy()
pixels = mask.flatten()
pixels = np.concatenate([[0], pixels, [0]])
runs = np.where(pixels[1:] != pixels[:-1])[0] + 1
runs[1::2] -= runs[::2]
return list(runs)
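To see the encoding format (a minimal example, values worked out by hand): each pair is a 1-indexed start offset into the flattened mask followed by the run length:
import numpy as np
mask = np.array([[0, 1, 1, 0],
                 [0, 1, 1, 0]])
print(binary_mask_to_rle(mask))  # [2, 2, 6, 2]: a run of two ones at position 2, another at position 6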
def convert_segmentation_to_rle(segmentation):
"""
# 获取分割图中唯一的分割标识符(segmentation id)
segment_ids = torch.unique(segmentation)
# 初始化用于存储所有分割标识符的运行长度编码的列表
run_length_encodings = []
# 遍历每个分割标识符
for idx in segment_ids:
# 创建一个掩码,其中分割图中与当前标识符相同的位置为1,否则为0
mask = torch.where(segmentation == idx, 1, 0)
# 将二进制掩码转换为运行长度编码(RLE)
rle = binary_mask_to_rle(mask)
# 将当前标识符的运行长度编码添加到结果列表中
run_length_encodings.append(rle)
# 返回所有分割标识符的运行长度编码列表
return run_length_encodings
# Copied from transformers.models.detr.image_processing_detr.compute_segments
def compute_segments(
mask_probs,
pred_scores,
pred_labels,
mask_threshold: float = 0.5,
overlap_mask_area_threshold: float = 0.8,
label_ids_to_fuse: Optional[Set[int]] = None,
target_size: Tuple[int, int] = None,
):
# Compute the output height and width from the target size, falling back to the mask shape
height = mask_probs.shape[1] if target_size is None else target_size[0]
width = mask_probs.shape[2] if target_size is None else target_size[1]
# Allocate an empty segmentation map of the target image size; each pixel will hold a segment id
segmentation = torch.zeros((height, width), dtype=torch.int32, device=mask_probs.device)
# List holding the metadata of every segment that gets created
segments: List[Dict] = []
# If a target size is given, interpolate the masks to that size
if target_size is not None:
mask_probs = nn.functional.interpolate(
mask_probs.unsqueeze(0), size=target_size, mode="bilinear", align_corners=False
)[0]
# Id of the segment currently being written
current_segment_id = 0
# Weight each mask by its predicted score
mask_probs *= pred_scores.view(-1, 1, 1)
# For each pixel, find the query whose weighted mask probability is highest
mask_labels = mask_probs.argmax(0)  # [height, width]
# Track the segment id assigned to each "stuff" class, so later instances of it can be fused
stuff_memory_list: Dict[str, int] = {}
# Iterate over the predictions
for k in range(pred_labels.shape[0]):
# Integer class id of the current prediction
pred_class = pred_labels[k].item()
# Whether this class should be fused with earlier segments of the same class
should_fuse = pred_class in label_ids_to_fuse
# Check that the mask at this index exists and is large enough to be a segment
mask_exists, mask_k = check_segment_validity(
mask_labels, mask_probs, k, mask_threshold, overlap_mask_area_threshold
)
# Only keep valid masks
if mask_exists:
# Reuse the stored segment id if this "stuff" class was already seen
if pred_class in stuff_memory_list:
current_segment_id = stuff_memory_list[pred_class]
else:
# Otherwise start a new segment
current_segment_id += 1
# Write the current segment into the final segmentation map
segmentation[mask_k] = current_segment_id
# Round the segment's prediction score to six decimals
segment_score = round(pred_scores[k].item(), 6)
# Record the segment's metadata
segments.append(
{
"id": current_segment_id,
"label_id": pred_class,
"was_fused": should_fuse,
"score": segment_score,
}
)
# If fusing, remember the segment id assigned to this class
if should_fuse:
stuff_memory_list[pred_class] = current_segment_id
# Return the final segmentation map together with the segment metadata
return segmentation, segments
class ConditionalDetrImageProcessor(BaseImageProcessor):
r"""
Constructs a Conditional Detr image processor.
Args:
format (`str`, *optional*, defaults to `"coco_detection"`):
Data format of the annotations. One of "coco_detection" or "coco_panoptic".
do_resize (`bool`, *optional*, defaults to `True`):
Controls whether to resize the image's (height, width) dimensions to the specified `size`. Can be
overridden by the `do_resize` parameter in the `preprocess` method.
size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 800, "longest_edge": 1333}`):
Size of the image's (height, width) dimensions after resizing. Can be overridden by the `size` parameter in
the `preprocess` method.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
Resampling filter to use if resizing the image.
do_rescale (`bool`, *optional*, defaults to `True`):
Controls whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the
`do_rescale` parameter in the `preprocess` method.
rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the
`preprocess` method.
do_normalize (`bool`, *optional*, defaults to `True`):
Controls whether to normalize the image. Can be overridden by the `do_normalize` parameter in the
`preprocess` method.
image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_MEAN`):
Mean values to use when normalizing the image. Can be a single value or a list of values, one for each
channel. Can be overridden by the `image_mean` parameter in the `preprocess` method.
image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_STD`):
Standard deviation values to use when normalizing the image. Can be a single value or a list of values, one
for each channel. Can be overridden by the `image_std` parameter in the `preprocess` method.
do_convert_annotations (`bool`, *optional*, defaults to `True`):
Controls whether to convert the annotations to the format expected by the DETR model. Converts the
bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
do_pad (`bool`, *optional*, defaults to `True`):
Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess`
method. If `True` will pad the images in the batch to the largest height and width in the batch.
Padding will be applied to the bottom and right of the image with zeros.
"""
model_input_names = ["pixel_values", "pixel_mask"]
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.__init__
# Constructor of the image processor
def __init__(
self,
format: Union[str, AnnotationFormat] = AnnotationFormat.COCO_DETECTION,
do_resize: bool = True,
size: Dict[str, int] = None,
resample: PILImageResampling = PILImageResampling.BILINEAR,
do_rescale: bool = True,
rescale_factor: Union[int, float] = 1 / 255,
do_normalize: bool = True,
image_mean: Union[float, List[float]] = None,
image_std: Union[float, List[float]] = None,
do_convert_annotations: Optional[bool] = None,
do_pad: bool = True,
**kwargs,
) -> None:
# If "pad_and_return_pixel_mask" is passed, use it as do_pad and drop it from kwargs
if "pad_and_return_pixel_mask" in kwargs:
do_pad = kwargs.pop("pad_and_return_pixel_mask")
# If "max_size" is passed, warn that it is deprecated and pop it; `size['longest_edge']` should be used instead
if "max_size" in kwargs:
logger.warning_once(
"The `max_size` parameter is deprecated and will be removed in v4.26. "
"Please specify in `size['longest_edge'] instead`.",
)
max_size = kwargs.pop("max_size")
else:
max_size = None if size is None else 1333
# Fall back to the default size dict when size is None
size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333}
# Resolve the final size dict from size and max_size
size = get_size_dict(size, max_size=max_size, default_to_square=False)
# Forward the remaining kwargs to the parent constructor
super().__init__(**kwargs)
# Store the configuration attributes
self.format = format
self.do_resize = do_resize
self.size = size
self.resample = resample
self.do_rescale = do_rescale
self.rescale_factor = rescale_factor
self.do_normalize = do_normalize
self.do_convert_annotations = do_convert_annotations
self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
self.do_pad = do_pad
# Keys that are valid to pass to `preprocess`, used for argument validation
self._valid_processor_keys = [
"images",
"annotations",
"return_segmentation_masks",
"masks_path",
"do_resize",
"size",
"resample",
"do_rescale",
"rescale_factor",
"do_normalize",
"do_convert_annotations",
"image_mean",
"image_std",
"do_pad",
"format",
"return_tensors",
"data_format",
"input_data_format",
]
@classmethod
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.from_dict with Detr->ConditionalDetr
# Overrides the base `from_dict` so that parameters are updated when the image processor is created via
# `from_dict`, e.g. `ConditionalDetrImageProcessor.from_pretrained(checkpoint, size=600, max_size=800)`
def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs):
# Copy the incoming dict so the original is not mutated
image_processor_dict = image_processor_dict.copy()
# Move a "max_size" kwarg into the dict
if "max_size" in kwargs:
image_processor_dict["max_size"] = kwargs.pop("max_size")
# Move a "pad_and_return_pixel_mask" kwarg into the dict
if "pad_and_return_pixel_mask" in kwargs:
image_processor_dict["pad_and_return_pixel_mask"] = kwargs.pop("pad_and_return_pixel_mask")
# Delegate to the parent `from_dict` with the updated dict and any remaining kwargs
return super().from_dict(image_processor_dict, **kwargs)
# Copied from the DETR source: prepares an image annotation for feeding into the ConditionalDetr model
def prepare_annotation(
self,
image: np.ndarray,
target: Dict,
format: Optional[AnnotationFormat] = None,
return_segmentation_masks: bool = None,
masks_path: Optional[Union[str, pathlib.Path]] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> Dict:
"""
Prepare an annotation for feeding into ConditionalDetr model.
"""
# Use the processor's stored format when none is given
format = format if format is not None else self.format
if format == AnnotationFormat.COCO_DETECTION:
# Segmentation masks default to False for detection annotations
return_segmentation_masks = False if return_segmentation_masks is None else return_segmentation_masks
# Prepare a COCO detection-format annotation
target = prepare_coco_detection_annotation(
image, target, return_segmentation_masks, input_data_format=input_data_format
)
elif format == AnnotationFormat.COCO_PANOPTIC:
# Segmentation masks default to True for panoptic annotations
return_segmentation_masks = True if return_segmentation_masks is None else return_segmentation_masks
# Prepare a COCO panoptic-format annotation
target = prepare_coco_panoptic_annotation(
image,
target,
masks_path=masks_path,
return_masks=return_segmentation_masks,
input_data_format=input_data_format,
)
else:
# Any other format is unsupported
raise ValueError(f"Format {format} is not supported.")
# Return the processed target dict
return target
# Copied from the DETR source: prepares inputs by delegating annotation handling to `prepare_annotation`
def prepare(self, image, target, return_segmentation_masks=None, masks_path=None):
# Warn once that `prepare` is deprecated and will be removed in v4.33
logger.warning_once(
"The `prepare` method is deprecated and will be removed in a v4.33. "
"Please use `prepare_annotation` instead. Note: the `prepare_annotation` method "
"does not return the image anymore.",
)
# Process the target via `prepare_annotation` and return both the image and the target
target = self.prepare_annotation(image, target, return_segmentation_masks, masks_path, self.format)
return image, target
# Copied from the DETR source; deprecated wrapper around the module-level `convert_coco_poly_to_mask`
def convert_coco_poly_to_mask(self, *args, **kwargs):
logger.warning_once("The `convert_coco_poly_to_mask` method is deprecated and will be removed in v4.33. ")
# Delegate to the module-level function of the same name
return convert_coco_poly_to_mask(*args, **kwargs)
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.prepare_coco_detection with DETR->ConditionalDetr
def prepare_coco_detection(self, *args, **kwargs):
logger.warning_once("The `prepare_coco_detection` method is deprecated and will be removed in v4.33. ")
# Delegate to `prepare_coco_detection_annotation`
return prepare_coco_detection_annotation(*args, **kwargs)
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.prepare_coco_panoptic
def prepare_coco_panoptic(self, *args, **kwargs):
logger.warning_once("The `prepare_coco_panoptic` method is deprecated and will be removed in v4.33. ")
# Delegate to `prepare_coco_panoptic_annotation`
return prepare_coco_panoptic_annotation(*args, **kwargs)
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.resize
def resize(
self,
image: np.ndarray,
size: Dict[str, int],
resample: PILImageResampling = PILImageResampling.BILINEAR,
data_format: Optional[ChannelDimension] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
) -> np.ndarray:
"""
Resize the image to the given size. Size can be `min_size` (scalar) or `(height, width)` tuple. If size is an
int, smaller edge of the image will be matched to this number.
Args:
image (`np.ndarray`):
Image to resize.
size (`Dict[str, int]`):
Dictionary containing the size to resize to. Can contain the keys `shortest_edge` and `longest_edge` or
`height` and `width`.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
Resampling filter to use if resizing the image.
data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format for the output image. If unset, the channel dimension format of the input
image is used.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred.
"""
# Warn if the deprecated `max_size` kwarg is passed
if "max_size" in kwargs:
logger.warning_once(
"The `max_size` parameter is deprecated and will be removed in v4.26. "
"Please specify in `size['longest_edge'] instead`.",
)
# Pop `max_size` from kwargs
max_size = kwargs.pop("max_size")
else:
max_size = None
# Resolve the size dict from `size` and `max_size`
size = get_size_dict(size, max_size=max_size, default_to_square=False)
if "shortest_edge" in size and "longest_edge" in size:
# Compute the output size that keeps the aspect ratio within the shortest/longest edge bounds
size = get_resize_output_image_size(
image, size["shortest_edge"], size["longest_edge"], input_data_format=input_data_format
)
elif "height" in size and "width" in size:
# An explicit (height, width) pair is used as-is
size = (size["height"], size["width"])
else:
# Neither pair of keys is present: raise
raise ValueError(
"Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got"
f" {size.keys()}."
)
# Resize the image to the resolved target size
image = resize(
image, size=size, resample=resample, data_format=data_format, input_data_format=input_data_format, **kwargs
)
# Return the resized image
return image
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.resize_annotation
def resize_annotation(
self,
annotation,
orig_size,
size,
resample: PILImageResampling = PILImageResampling.NEAREST,
) -> Dict:
"""
Resize the annotation to match the resized image. If size is an int, smaller edge of the mask will be matched
to this number.
"""
# Delegate to the module-level `resize_annotation` helper
return resize_annotation(annotation, orig_size=orig_size, target_size=size, resample=resample)
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.rescale
# Rescale the image by a given factor, i.e. image = image * rescale_factor
def rescale(
self,
image: np.ndarray,
rescale_factor: float,
data_format: Optional[Union[str, ChannelDimension]] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> np.ndarray:
"""
Rescale the image by the given factor. image = image * rescale_factor.
Args:
image (`np.ndarray`):
Image to rescale.
rescale_factor (`float`):
The value to use for rescaling.
data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format for the output image. If unset, the channel dimension format of the input
image is used. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
input_data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format for the input image. If unset, is inferred from the input image. Can be
one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
"""
# Delegate to the module-level `rescale` function
return rescale(image, rescale_factor, data_format=data_format, input_data_format=input_data_format)
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.normalize_annotation
def normalize_annotation(self, annotation: Dict, image_size: Tuple[int, int]) -> Dict:
"""
Normalize the boxes in the annotation from `[top_left_x, top_left_y, bottom_right_x, bottom_right_y]` to
`[center_x, center_y, width, height]` format and from absolute to relative pixel values.
"""
# Delegate to the module-level `normalize_annotation` function
return normalize_annotation(annotation, image_size=image_size)
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._update_annotation_for_padded_image
def _update_annotation_for_padded_image(
self,
annotation: Dict,
input_image_size: Tuple[int, int],
output_image_size: Tuple[int, int],
padding,
update_bboxes,
) -> Dict:
"""
Update the annotation to account for padding in the image.
Args:
annotation (`Dict`):
Dictionary containing annotations.
input_image_size (`Tuple[int, int]`):
Size of the input image (height, width) before padding.
output_image_size (`Tuple[int, int]`):
Size of the output image (height, width) after padding.
padding:
The padding applied to the image.
update_bboxes (`bool`):
Whether to rescale the bounding boxes to the padded image.
"""
# Start from a fresh annotation dict
new_annotation = {}
# Record the padded image size
new_annotation["size"] = output_image_size
for key, value in annotation.items():
if key == "masks":
# Pad the masks to the padded image size
masks = value
masks = pad(
masks,
padding,
mode=PaddingMode.CONSTANT,
constant_values=0,
input_data_format=ChannelDimension.FIRST,
)
# Squeeze away the singleton dimension introduced by padding
masks = safe_squeeze(masks, 1)
new_annotation["masks"] = masks
elif key == "boxes" and update_bboxes:
# Rescale the normalized boxes by the input/output size ratio so they stay aligned after padding
boxes = value
boxes *= np.asarray(
[
input_image_size[1] / output_image_size[1],
input_image_size[0] / output_image_size[0],
input_image_size[1] / output_image_size[1],
input_image_size[0] / output_image_size[0],
]
)
new_annotation["boxes"] = boxes
elif key == "size":
# Overwrite the stored size with the padded image size
new_annotation["size"] = output_image_size
else:
# Copy everything else unchanged
new_annotation[key] = value
# Return the updated annotation
return new_annotation
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._pad_image
def _pad_image(
self,
image: np.ndarray,
output_size: Tuple[int, int],
annotation: Optional[Dict[str, Any]] = None,
constant_values: Union[float, Iterable[float]] = 0,
data_format: Optional[ChannelDimension] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
update_bboxes: bool = True,
) -> np.ndarray:
"""
Pad an image with zeros to the given size.
"""
# Input image height and width
input_height, input_width = get_image_size(image, channel_dim=input_data_format)
# Target height and width
output_height, output_width = output_size
# Number of pixels to pad at the bottom and on the right
pad_bottom = output_height - input_height
pad_right = output_width - input_width
# Padding is applied only to the bottom and right, as documented for `do_pad`
padding = ((0, pad_bottom), (0, pad_right))
padded_image = pad(
image,
padding,
mode=PaddingMode.CONSTANT,
constant_values=constant_values,
data_format=data_format,
input_data_format=input_data_format,
)
# If an annotation was given, update it to match the padded image
if annotation is not None:
annotation = self._update_annotation_for_padded_image(
annotation, (input_height, input_width), (output_height, output_width), padding, update_bboxes
)
# Return the padded image and the (possibly updated) annotation
return padded_image, annotation
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.pad
# Pads a batch of images (and optionally their annotations) to the largest height and width in the batch
def pad(
self,
images: List[np.ndarray],
annotations: Optional[Union[AnnotationType, List[AnnotationType]]] = None,
constant_values: Union[float, Iterable[float]] = 0,
return_pixel_mask: bool = True,
return_tensors: Optional[Union[str, TensorType]] = None,
data_format: Optional[ChannelDimension] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
update_bboxes: bool = True,
) -> BatchFeature:
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.preprocess
def preprocess(
self,
images: ImageInput,
annotations: Optional[Union[AnnotationType, List[AnnotationType]]] = None,
return_segmentation_masks: bool = None,
masks_path: Optional[Union[str, pathlib.Path]] = None,
do_resize: Optional[bool] = None,
size: Optional[Dict[str, int]] = None,
resample=None,  # PILImageResampling
do_rescale: Optional[bool] = None,
rescale_factor: Optional[Union[int, float]] = None,
do_normalize: Optional[bool] = None,
do_convert_annotations: Optional[bool] = None,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
do_pad: Optional[bool] = None,
format: Optional[Union[str, AnnotationFormat]] = None,
return_tensors: Optional[Union[TensorType, str]] = None,
data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
) -> BatchFeature:
# Post-processing methods - TODO: add support for other frameworks
# Converts the model output to Pascal VOC format (xmin, ymin, xmax, ymax); deprecated in favor of `post_process_object_detection`
def post_process(self, outputs, target_sizes):
"""
Converts the output of [`ConditionalDetrForObjectDetection`] into the format expected by the Pascal VOC format (xmin, ymin, xmax, ymax).
Only supports PyTorch.
Args:
outputs ([`ConditionalDetrObjectDetectionOutput`]):
Raw outputs of the model.
target_sizes (`torch.Tensor` of shape `(batch_size, 2)`):
Tensor containing the size (h, w) of each image of the batch. For evaluation, this must be the original
image size (before any data augmentation). For visualization, this should be the image size after data
augment, but before padding.
Returns:
`List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
in the batch as predicted by the model.
"""
# Warn that this method will be removed in Transformers v5; use `post_process_object_detection` instead
logger.warning_once(
"`post_process` is deprecated and will be removed in v5 of Transformers, please use"
" `post_process_object_detection` instead, with `threshold=0.` for equivalent results.",
)
# Extract the class logits and predicted boxes from the model output
out_logits, out_bbox = outputs.logits, outputs.pred_boxes
# There must be exactly one target size per batch element
if len(out_logits) != len(target_sizes):
raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits")
# Each target size must be a (height, width) pair
if target_sizes.shape[1] != 2:
raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch")
# Turn the logits into per-class probabilities with a sigmoid
prob = out_logits.sigmoid()
# Take the 300 highest-scoring predictions over all queries and classes
topk_values, topk_indexes = torch.topk(prob.view(out_logits.shape[0], -1), 300, dim=1)
scores = topk_values
# Recover the query index of each top-k prediction
topk_boxes = torch.div(topk_indexes, out_logits.shape[2], rounding_mode="floor")
# Recover the class label of each top-k prediction
labels = topk_indexes % out_logits.shape[2]
# Convert boxes from center format to corner format
boxes = center_to_corners_format(out_bbox)
# Gather the boxes belonging to the top-k predictions
boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4))
# Convert relative [0, 1] coordinates to absolute [0, height] / [0, width] coordinates
img_h, img_w = target_sizes.unbind(1)
scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1)
boxes = boxes * scale_fct[:, None, :]
# Collect one dict of scores, labels and boxes per image in the batch
results = [{"scores": s, "labels": l, "boxes": b} for s, l, b in zip(scores, labels, boxes)]
# Return the list of results
return results
# Copied from transformers.models.deformable_detr.image_processing_deformable_detr.DeformableDetrImageProcessor.post_process_object_detection with DeformableDetr->ConditionalDetr
def post_process_object_detection(
self, outputs, threshold: float = 0.5, target_sizes: Union[TensorType, List[Tuple]] = None, top_k: int = 100
):
"""
Converts the raw output of [`ConditionalDetrForObjectDetection`] into final bounding boxes in (top_left_x,
top_left_y, bottom_right_x, bottom_right_y) format. Only supports PyTorch.
Args:
outputs ([`DetrObjectDetectionOutput`]):
Raw outputs of the model.
threshold (`float`, *optional*):
Score threshold to keep object detection predictions.
target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*):
Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size
(height, width) of each image in the batch. If left to None, predictions will not be resized.
top_k (`int`, *optional*, defaults to 100):
Keep only top k bounding boxes before filtering by thresholding.
Returns:
`List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
in the batch as predicted by the model.
"""
# Extract logits and bounding boxes from the model's outputs
out_logits, out_bbox = outputs.logits, outputs.pred_boxes
# Verify if target_sizes are provided and match the batch dimension of logits
if target_sizes is not None:
if len(out_logits) != len(target_sizes):
raise ValueError(
"Make sure that you pass in as many target sizes as the batch dimension of the logits"
)
# Apply sigmoid activation to logits to obtain probabilities
prob = out_logits.sigmoid()
# Flatten the probabilities over all queries and classes: (batch_size, num_queries * num_classes)
prob = prob.view(out_logits.shape[0], -1)
# Determine the number of top-k predictions to consider
k_value = min(top_k, prob.size(1))
# Extract top-k values and their indices along the second dimension
topk_values, topk_indexes = torch.topk(prob, k_value, dim=1)
# Scores correspond to the top-k values
scores = topk_values
# Convert top-k indexes to top-k boxes in relative [0, 1] coordinates
topk_boxes = torch.div(topk_indexes, out_logits.shape[2], rounding_mode="floor")
# Extract labels from top-k indexes
labels = topk_indexes % out_logits.shape[2]
# Convert predicted boxes from center-offset format to (x1, y1, x2, y2) format
boxes = center_to_corners_format(out_bbox)
# Gather top-k boxes based on top-k indexes
boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4))
# Convert relative [0, 1] coordinates to absolute [0, height] coordinates if target_sizes are provided
if target_sizes is not None:
if isinstance(target_sizes, list):
# If target_sizes is a list, extract heights and widths
img_h = torch.Tensor([i[0] for i in target_sizes])
img_w = torch.Tensor([i[1] for i in target_sizes])
else:
# If target_sizes is a tensor, unbind heights and widths
img_h, img_w = target_sizes.unbind(1)
# Stack widths and heights and scale boxes accordingly
scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device)
boxes = boxes * scale_fct[:, None, :]
# Filter predictions based on the score threshold and construct result dictionaries
results = []
for s, l, b in zip(scores, labels, boxes):
score = s[s > threshold]
label = l[s > threshold]
box = b[s > threshold]
results.append({"scores": score, "labels": label, "boxes": box})
return results
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.post_process_semantic_segmentation with Detr->ConditionalDetr
def post_process_semantic_segmentation(self, outputs, target_sizes: List[Tuple[int, int]] = None):
"""
Converts the output of [`ConditionalDetrForSegmentation`] into semantic segmentation maps. Only supports PyTorch.
Args:
outputs ([`ConditionalDetrForSegmentation`]):
Raw outputs of the model.
target_sizes (`List[Tuple[int, int]]`, *optional*):
A list of tuples (`Tuple[int, int]`) containing the target size (height, width) of each image in the
batch. If unset, predictions will not be resized.
Returns:
`List[torch.Tensor]`:
A list of length `batch_size`, where each item is a semantic segmentation map of shape (height, width)
corresponding to the target_sizes entry (if `target_sizes` is specified). Each entry of each
`torch.Tensor` correspond to a semantic class id.
"""
# Extract class logits from model outputs [batch_size, num_queries, num_classes+1]
class_queries_logits = outputs.logits
# Extract mask logits from model outputs [batch_size, num_queries, height, width]
masks_queries_logits = outputs.pred_masks
# Remove the null class from class logits using softmax, leaving out the last dimension (background class)
masks_classes = class_queries_logits.softmax(dim=-1)[..., :-1]
# Apply sigmoid to mask logits to get probabilities [batch_size, num_queries, height, width]
masks_probs = masks_queries_logits.sigmoid()
# Compute semantic segmentation logits by combining class and mask probabilities [batch_size, num_classes, height, width]
segmentation = torch.einsum("bqc, bqhw -> bchw", masks_classes, masks_probs)
batch_size = class_queries_logits.shape[0]
# Resize logits and compute semantic segmentation maps if target_sizes are provided
if target_sizes is not None:
# Ensure that the number of target sizes matches the batch size
if batch_size != len(target_sizes):
raise ValueError(
"Make sure that you pass in as many target sizes as the batch dimension of the logits"
)
semantic_segmentation = []
# Iterate over each image in the batch
for idx in range(batch_size):
# Resize logits to match target size using bilinear interpolation
resized_logits = nn.functional.interpolate(
segmentation[idx].unsqueeze(dim=0), size=target_sizes[idx], mode="bilinear", align_corners=False
)
# Extract semantic segmentation map by taking the argmax along the channel dimension
semantic_map = resized_logits[0].argmax(dim=0)
semantic_segmentation.append(semantic_map)
else:
# If target_sizes are not provided, compute semantic segmentation by taking argmax along the channel dimension
semantic_segmentation = segmentation.argmax(dim=1)
semantic_segmentation = [semantic_segmentation[i] for i in range(semantic_segmentation.shape[0])]
return semantic_segmentation
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.post_process_instance_segmentation with Detr->ConditionalDetr
def post_process_instance_segmentation(
self,
outputs,
threshold: float = 0.5,
mask_threshold: float = 0.5,
overlap_mask_area_threshold: float = 0.8,
target_sizes: Optional[List[Tuple[int, int]]] = None,
return_coco_annotation: Optional[bool] = False,
) -> List[Dict]:
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.post_process_panoptic_segmentation with Detr->ConditionalDetr
def post_process_panoptic_segmentation(
self,
outputs,
threshold: float = 0.5,
mask_threshold: float = 0.5,
overlap_mask_area_threshold: float = 0.8,
label_ids_to_fuse: Optional[Set[int]] = None,
target_sizes: Optional[List[Tuple[int, int]]] = None,
) -> List[Dict]:
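Before moving on to the modeling file, here is a minimal end-to-end usage sketch of this processor (the checkpoint name comes from the constants documented below; the image URL is the usual COCO sample, both assumptions of this illustration):
import requests
import torch
from PIL import Image
from transformers import AutoModelForObjectDetection, ConditionalDetrImageProcessor

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

processor = ConditionalDetrImageProcessor.from_pretrained("microsoft/conditional-detr-resnet-50")
model = AutoModelForObjectDetection.from_pretrained("microsoft/conditional-detr-resnet-50")

inputs = processor(images=image, return_tensors="pt")  # pixel_values + pixel_mask
with torch.no_grad():
    outputs = model(**inputs)

# Map the raw predictions back to absolute (x0, y0, x1, y1) boxes on the original image
target_sizes = torch.tensor([image.size[::-1]])  # PIL size is (width, height)
results = processor.post_process_object_detection(outputs, threshold=0.5, target_sizes=target_sizes)[0]
print(results["scores"].shape, results["labels"], results["boxes"])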
.\models\conditional_detr\modeling_conditional_detr.py
""" PyTorch Conditional DETR model."""
import math
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple, Union
import torch
from torch import Tensor, nn
from ...activations import ACT2FN
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithCrossAttentions, Seq2SeqModelOutput
from ...modeling_utils import PreTrainedModel
from ...utils import (
ModelOutput,
add_start_docstrings,
add_start_docstrings_to_model_forward,
is_accelerate_available,
is_scipy_available,
is_timm_available,
is_vision_available,
logging,
replace_return_docstrings,
requires_backends,
)
from ...utils.backbone_utils import load_backbone
from .configuration_conditional_detr import ConditionalDetrConfig
if is_accelerate_available():
from accelerate import PartialState
from accelerate.utils import reduce
if is_scipy_available():
from scipy.optimize import linear_sum_assignment
if is_timm_available():
from timm import create_model
if is_vision_available():
from ...image_transforms import center_to_corners_format
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "ConditionalDetrConfig"
_CHECKPOINT_FOR_DOC = "microsoft/conditional-detr-resnet-50"
CONDITIONAL_DETR_PRETRAINED_MODEL_ARCHIVE_LIST = [
"microsoft/conditional-detr-resnet-50",
]
@dataclass
class ConditionalDetrDecoderOutput(BaseModelOutputWithCrossAttentions):
"""
Base class for outputs of the Conditional DETR decoder. This class adds one attribute to
BaseModelOutputWithCrossAttentions, namely an optional stack of intermediate decoder activations, i.e. the output
of each decoder layer, each of them gone through a layernorm. This is useful when training the model with auxiliary
decoding losses.
"""
pass
Args:
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
最后一个模型层的隐藏状态序列输出。
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
包含模型每一层隐藏状态的元组,形状为 `(batch_size, sequence_length, hidden_size)`。
当 `output_hidden_states=True` 或 `config.output_hidden_states=True` 时返回。
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
注意力权重的元组,每个元素对应每一层的注意力权重。
形状为 `(batch_size, num_heads, sequence_length, sequence_length)`。
在自注意力头中用于计算加权平均后返回,当 `output_attentions=True` 或 `config.output_attentions=True` 时返回。
cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`):
交叉注意力层的注意力权重元组,每个元素对应解码器交叉注意力层的注意力权重。
形状为 `(batch_size, num_heads, sequence_length, sequence_length)`。
在交叉注意力头中用于计算加权平均后返回,当同时设置了 `output_attentions=True` 和 `config.add_cross_attention=True` 或者 `config.output_attentions=True` 时返回。
intermediate_hidden_states (`torch.FloatTensor` of shape `(config.decoder_layers, batch_size, num_queries, hidden_size)`, *optional*, returned when `config.auxiliary_loss=True`):
中间解码器激活状态,即每个解码器层的输出,经过层归一化后的结果。
形状为 `(config.decoder_layers, batch_size, num_queries, hidden_size)`。
当设置了 `config.auxiliary_loss=True` 时返回。
@dataclass
class ConditionalDetrModelOutput(Seq2SeqModelOutput):
"""
ConditionalDetr 模型的输出基类。添加了一个额外的属性 intermediate_hidden_states,
可选地包含中间解码器激活的堆栈,即每个解码器层的输出,经过 layernorm 处理。
在训练模型时使用辅助解码损失时非常有用。
"""
intermediate_hidden_states: Optional[torch.FloatTensor] = None
reference_points: Optional[Tuple[torch.FloatTensor]] = None
@dataclass
class ConditionalDetrObjectDetectionOutput(ModelOutput):
"""
Output type of [`ConditionalDetrForObjectDetection`].
"""
loss: Optional[torch.FloatTensor] = None
loss_dict: Optional[Dict] = None
logits: torch.FloatTensor = None
pred_boxes: torch.FloatTensor = None
auxiliary_outputs: Optional[List[Dict]] = None
last_hidden_state: Optional[torch.FloatTensor] = None
decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
encoder_last_hidden_state: Optional[torch.FloatTensor] = None
encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
@dataclass
class ConditionalDetrSegmentationOutput(ModelOutput):
"""
Output type of [`ConditionalDetrForSegmentation`].
"""
loss: Optional[torch.FloatTensor] = None
loss_dict: Optional[Dict] = None
logits: torch.FloatTensor = None
pred_boxes: torch.FloatTensor = None
pred_masks: torch.FloatTensor = None
auxiliary_outputs: Optional[List[Dict]] = None
last_hidden_state: Optional[torch.FloatTensor] = None
decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
encoder_last_hidden_state: Optional[torch.FloatTensor] = None
encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
class ConditionalDetrFrozenBatchNorm2d(nn.Module):
"""
ConditionalDetr 的冻结批量归一化层。
BatchNorm2d 的批次统计信息和仿射参数被固定的版本。
从 torchvision.misc.ops 中复制粘贴而来,添加了在求平方根前的 eps,
否则除 torchvision.models.resnet[18,34,50,101] 之外的其他模型会产生 NaN 值。
"""
def __init__(self, n):
super().__init__()
self.register_buffer("weight", torch.ones(n))
self.register_buffer("bias", torch.zeros(n))
self.register_buffer("running_mean", torch.zeros(n))
self.register_buffer("running_var", torch.ones(n))
def _load_from_state_dict(
self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
):
num_batches_tracked_key = prefix + "num_batches_tracked"
if num_batches_tracked_key in state_dict:
del state_dict[num_batches_tracked_key]
super()._load_from_state_dict(
state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
)
def forward(self, x):
weight = self.weight.reshape(1, -1, 1, 1)
bias = self.bias.reshape(1, -1, 1, 1)
running_var = self.running_var.reshape(1, -1, 1, 1)
running_mean = self.running_mean.reshape(1, -1, 1, 1)
epsilon = 1e-5
scale = weight * (running_var + epsilon).rsqrt()
bias = bias - running_mean * scale
return x * scale + bias
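A quick way to convince yourself the reformulation above matches an eval-mode `nn.BatchNorm2d` (an illustrative check, not part of the original file):
import torch
from torch import nn

bn = nn.BatchNorm2d(8).eval()
frozen = ConditionalDetrFrozenBatchNorm2d(8)
frozen.load_state_dict(bn.state_dict())  # `num_batches_tracked` is stripped by the load hook above

x = torch.randn(2, 8, 4, 4)
print(torch.allclose(bn(x), frozen(x), atol=1e-5))  # True: x * scale + (bias - mean * scale)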
def replace_batch_norm(model):
r"""
递归地将模型中所有的 `torch.nn.BatchNorm2d` 替换为 `ConditionalDetrFrozenBatchNorm2d`。
Args:
model (torch.nn.Module):
输入的模型
"""
for name, module in model.named_children():
if isinstance(module, nn.BatchNorm2d):
new_module = ConditionalDetrFrozenBatchNorm2d(module.num_features)
if not module.weight.device == torch.device("meta"):
new_module.weight.data.copy_(module.weight)
new_module.bias.data.copy_(module.bias)
new_module.running_mean.data.copy_(module.running_mean)
new_module.running_var.data.copy_(module.running_var)
model._modules[name] = new_module
if len(list(module.children())) > 0:
replace_batch_norm(module)
class ConditionalDetrConvEncoder(nn.Module):
"""
使用 AutoBackbone API 或 timm 库中的模型作为卷积主干网络。
所有的 nn.BatchNorm2d 层都被上述定义的 DetrFrozenBatchNorm2d 替换。
"""
def __init__(self, config):
super().__init__()
self.config = config
if config.use_timm_backbone:
requires_backends(self, ["timm"])
kwargs = {}
if config.dilation:
kwargs["output_stride"] = 16
backbone = create_model(
config.backbone,
pretrained=config.use_pretrained_backbone,
features_only=True,
out_indices=(1, 2, 3, 4),
in_chans=config.num_channels,
**kwargs,
)
else:
backbone = load_backbone(config)
with torch.no_grad():
replace_batch_norm(backbone)
self.model = backbone
self.intermediate_channel_sizes = (
self.model.feature_info.channels() if config.use_timm_backbone else self.model.channels
)
backbone_model_type = config.backbone if config.use_timm_backbone else config.backbone_config.model_type
if "resnet" in backbone_model_type:
for name, parameter in self.model.named_parameters():
if config.use_timm_backbone:
if "layer2" not in name and "layer3" not in name and "layer4" not in name:
parameter.requires_grad_(False)
else:
if "stage.1" not in name and "stage.2" not in name and "stage.3" not in name:
parameter.requires_grad_(False)
def forward(self, pixel_values: torch.Tensor, pixel_mask: torch.Tensor):
features = self.model(pixel_values) if self.config.use_timm_backbone else self.model(pixel_values).feature_maps
out = []
for feature_map in features:
mask = nn.functional.interpolate(pixel_mask[None].float(), size=feature_map.shape[-2:]).to(torch.bool)[0]
out.append((feature_map, mask))
return out
class ConditionalDetrConvModel(nn.Module):
"""
This module adds 2D position embeddings to all intermediate feature maps of the convolutional encoder.
"""
def __init__(self, conv_encoder, position_embedding):
super().__init__()
self.conv_encoder = conv_encoder
self.position_embedding = position_embedding
def forward(self, pixel_values, pixel_mask):
out = self.conv_encoder(pixel_values, pixel_mask)
pos = []
for feature_map, mask in out:
pos.append(self.position_embedding(feature_map, mask).to(feature_map.dtype))
return out, pos
class ConditionalDetrSinePositionEmbedding(nn.Module):
"""
This is a more standard version of the position embedding, very similar to the one used by the Attention is all you
need paper, generalized to work on images.
"""
def __init__(self, embedding_dim=64, temperature=10000, normalize=False, scale=None):
super().__init__()
self.embedding_dim = embedding_dim
self.temperature = temperature
self.normalize = normalize
if scale is not None and normalize is False:
raise ValueError("normalize should be True if scale is passed")
if scale is None:
scale = 2 * math.pi
self.scale = scale
def forward(self, pixel_values, pixel_mask):
if pixel_mask is None:
raise ValueError("No pixel mask provided")
y_embed = pixel_mask.cumsum(1, dtype=torch.float32)
x_embed = pixel_mask.cumsum(2, dtype=torch.float32)
if self.normalize:
y_embed = y_embed / (y_embed[:, -1:, :] + 1e-6) * self.scale
x_embed = x_embed / (x_embed[:, :, -1:] + 1e-6) * self.scale
dim_t = torch.arange(self.embedding_dim, dtype=torch.int64, device=pixel_values.device).float()
dim_t = self.temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / self.embedding_dim)
pos_x = x_embed[:, :, :, None] / dim_t
pos_y = y_embed[:, :, :, None] / dim_t
pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3)
pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3)
pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
return pos
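# Shape sketch (assumed toy sizes): for a 32x32 mask and embedding_dim=64 per axis,
# the y- and x-halves are concatenated into a 128-channel position map.
sine_embedding = ConditionalDetrSinePositionEmbedding(embedding_dim=64, normalize=True)
feature_map = torch.randn(2, 256, 32, 32)
mask = torch.ones(2, 32, 32, dtype=torch.long)
print(sine_embedding(feature_map, mask).shape)  # torch.Size([2, 128, 32, 32])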
class ConditionalDetrLearnedPositionEmbedding(nn.Module):
"""
This module learns positional embeddings up to a fixed maximum size.
"""
def __init__(self, embedding_dim=256):
super().__init__()
self.row_embeddings = nn.Embedding(50, embedding_dim)
self.column_embeddings = nn.Embedding(50, embedding_dim)
def forward(self, pixel_values, pixel_mask=None):
height, width = pixel_values.shape[-2:]
width_values = torch.arange(width, device=pixel_values.device)
height_values = torch.arange(height, device=pixel_values.device)
x_emb = self.column_embeddings(width_values)
y_emb = self.row_embeddings(height_values)
pos = torch.cat([x_emb.unsqueeze(0).repeat(height, 1, 1), y_emb.unsqueeze(1).repeat(1, width, 1)], dim=-1)
pos = pos.permute(2, 0, 1)
pos = pos.unsqueeze(0)
pos = pos.repeat(pixel_values.shape[0], 1, 1, 1)
return pos
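# Shape sketch (assumed toy sizes): learned row/column embeddings cover feature maps up to 50x50;
# the x and y embeddings are concatenated channel-wise.
learned_embedding = ConditionalDetrLearnedPositionEmbedding(embedding_dim=128)
feature_map = torch.randn(2, 256, 20, 30)
print(learned_embedding(feature_map).shape)  # torch.Size([2, 256, 20, 30])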
def build_position_encoding(config):
n_steps = config.d_model // 2
if config.position_embedding_type == "sine":
position_embedding = ConditionalDetrSinePositionEmbedding(n_steps, normalize=True)
elif config.position_embedding_type == "learned":
position_embedding = ConditionalDetrLearnedPositionEmbedding(n_steps)
else:
raise ValueError(f"Not supported {config.position_embedding_type}")
return position_embedding
def gen_sine_position_embeddings(pos_tensor, d_model):
scale = 2 * math.pi
dim = d_model // 2
dim_t = torch.arange(dim, dtype=torch.float32, device=pos_tensor.device)
dim_t = 10000 ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / dim)
x_embed = pos_tensor[:, :, 0] * scale
y_embed = pos_tensor[:, :, 1] * scale
pos_x = x_embed[:, :, None] / dim_t
pos_y = y_embed[:, :, None] / dim_t
pos_x = torch.stack((pos_x[:, :, 0::2].sin(), pos_x[:, :, 1::2].cos()), dim=3).flatten(2)
pos_y = torch.stack((pos_y[:, :, 0::2].sin(), pos_y[:, :, 1::2].cos()), dim=3).flatten(2)
pos = torch.cat((pos_y, pos_x), dim=2)
return pos
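# Sketch (assumed shapes): embedding 300 normalized (x, y) reference points into
# d_model-dimensional sine/cosine features, one embedding per query.
reference_points = torch.rand(2, 300, 2)  # (batch_size, num_queries, 2)
query_sine_embed = gen_sine_position_embeddings(reference_points, d_model=256)
print(query_sine_embed.shape)  # torch.Size([2, 300, 256])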
class DetrAttention(nn.Module):
"""
Multi-headed attention from the 'Attention Is All You Need' paper.
Here, we add position embeddings to the queries and keys, as explained in the DETR paper.
"""
def __init__(
self,
embed_dim: int,
num_heads: int,
dropout: float = 0.0,
bias: bool = True,
):
super().__init__()
self.embed_dim = embed_dim
self.num_heads = num_heads
self.dropout = dropout
self.head_dim = embed_dim // num_heads
if self.head_dim * num_heads != self.embed_dim:
raise ValueError(
f"embed_dim必须能被num_heads整除 (得到 `embed_dim`: {self.embed_dim} 和 `num_heads`: {num_heads})."
)
self.scaling = self.head_dim**-0.5
self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
def _shape(self, tensor: torch.Tensor, seq_len: int, batch_size: int):
return tensor.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
def with_pos_embed(self, tensor: torch.Tensor, object_queries: Optional[Tensor], **kwargs):
position_embeddings = kwargs.pop("position_embeddings", None)
if kwargs:
raise ValueError(f"Unexpected arguments {kwargs.keys()}")
if position_embeddings is not None and object_queries is not None:
raise ValueError(
"Cannot specify both position_embeddings and object_queries. Please use just object_queries"
)
if position_embeddings is not None:
logger.warning_once(
"position_embeddings has been deprecated and will be removed in v4.34. Please use object_queries instead"
)
object_queries = position_embeddings
return tensor if object_queries is None else tensor + object_queries
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
object_queries: Optional[torch.Tensor] = None,
key_value_states: Optional[torch.Tensor] = None,
spatial_position_embeddings: Optional[torch.Tensor] = None,
output_attentions: bool = False,
**kwargs,
):
class ConditionalDetrAttention(nn.Module):
"""
Cross-Attention used in Conditional DETR 'Conditional DETR for Fast Training Convergence' paper.
The key q_proj, k_proj, v_proj are defined outside the attention. This attention allows the dim of q, k to be
different to v.
"""
def __init__(
self,
embed_dim: int,
out_dim: int,
num_heads: int,
dropout: float = 0.0,
bias: bool = True,
):
super().__init__()
self.embed_dim = embed_dim
self.out_dim = out_dim
self.num_heads = num_heads
self.dropout = dropout
self.head_dim = embed_dim // num_heads
if self.head_dim * num_heads != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
f" {num_heads})."
)
self.v_head_dim = out_dim // num_heads
if self.v_head_dim * num_heads != self.out_dim:
raise ValueError(
f"out_dim must be divisible by num_heads (got `out_dim`: {self.out_dim} and `num_heads`: {num_heads})."
)
self.scaling = self.head_dim**-0.5
self.out_proj = nn.Linear(out_dim, out_dim, bias=bias)
def _qk_shape(self, tensor: torch.Tensor, seq_len: int, batch_size: int):
return tensor.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
def _v_shape(self, tensor: torch.Tensor, seq_len: int, batch_size: int):
return tensor.view(batch_size, seq_len, self.num_heads, self.v_head_dim).transpose(1, 2).contiguous()
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
key_states: Optional[torch.Tensor] = None,
value_states: Optional[torch.Tensor] = None,
output_attentions: bool = False,
):
class ConditionalDetrEncoderLayer(nn.Module):
def __init__(self, config: ConditionalDetrConfig):
super().__init__()
self.embed_dim = config.d_model
self.self_attn = DetrAttention(
embed_dim=self.embed_dim,
num_heads=config.encoder_attention_heads,
dropout=config.attention_dropout,
)
self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
self.dropout = config.dropout
self.activation_fn = ACT2FN[config.activation_function]
self.activation_dropout = config.activation_dropout
self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
self.final_layer_norm = nn.LayerNorm(self.embed_dim)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: torch.Tensor,
object_queries: torch.Tensor = None,
output_attentions: bool = False,
**kwargs,
):
"""
Args:
hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
attention_mask (`torch.FloatTensor`): attention mask of shape
`(batch, 1, target_len, source_len)` where padding elements are indicated by very large negative values.
object_queries (`torch.FloatTensor`, *optional*):
Object queries (also called content embeddings), to be added to the hidden states.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more detail.
"""
position_embeddings = kwargs.pop("position_embeddings", None)
if kwargs:
raise ValueError(f"Unexpected arguments {kwargs.keys()}")
if position_embeddings is not None and object_queries is not None:
raise ValueError(
"Cannot specify both position_embeddings and object_queries. Please use just object_queries"
)
if position_embeddings is not None:
logger.warning_once(
"position_embeddings has been deprecated and will be removed in v4.34. Please use object_queries instead"
)
object_queries = position_embeddings
residual = hidden_states
hidden_states, attn_weights = self.self_attn(
hidden_states=hidden_states,
attention_mask=attention_mask,
object_queries=object_queries,
output_attentions=output_attentions,
)
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
hidden_states = residual + hidden_states
hidden_states = self.self_attn_layer_norm(hidden_states)
residual = hidden_states
hidden_states = self.activation_fn(self.fc1(hidden_states))
hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
hidden_states = self.fc2(hidden_states)
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
hidden_states = residual + hidden_states
hidden_states = self.final_layer_norm(hidden_states)
if self.training:
if torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any():
clamp_value = torch.finfo(hidden_states.dtype).max - 1000
hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
outputs = (hidden_states,)
if output_attentions:
outputs += (attn_weights,)
return outputs
class ConditionalDetrDecoderLayer(nn.Module):
def __init__(self, config: ConditionalDetrConfig):
super().__init__()
self.embed_dim = config.d_model
d_model = config.d_model
self.sa_qcontent_proj = nn.Linear(d_model, d_model)
self.sa_qpos_proj = nn.Linear(d_model, d_model)
self.sa_kcontent_proj = nn.Linear(d_model, d_model)
self.sa_kpos_proj = nn.Linear(d_model, d_model)
self.sa_v_proj = nn.Linear(d_model, d_model)
self.self_attn = ConditionalDetrAttention(
embed_dim=self.embed_dim,
out_dim=self.embed_dim,
num_heads=config.decoder_attention_heads,
dropout=config.attention_dropout,
)
self.dropout = config.dropout
self.activation_fn = ACT2FN[config.activation_function]
self.activation_dropout = config.activation_dropout
self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
self.ca_qcontent_proj = nn.Linear(d_model, d_model)
self.ca_qpos_proj = nn.Linear(d_model, d_model)
self.ca_kcontent_proj = nn.Linear(d_model, d_model)
self.ca_kpos_proj = nn.Linear(d_model, d_model)
self.ca_v_proj = nn.Linear(d_model, d_model)
self.ca_qpos_sine_proj = nn.Linear(d_model, d_model)
self.encoder_attn = ConditionalDetrAttention(
self.embed_dim * 2, self.embed_dim, config.decoder_attention_heads, dropout=config.attention_dropout
)
self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)
self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)
self.final_layer_norm = nn.LayerNorm(self.embed_dim)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
object_queries: Optional[torch.Tensor] = None,
query_position_embeddings: Optional[torch.Tensor] = None,
query_sine_embed: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = False,
is_first: Optional[bool] = False,
**kwargs,
):
class ConditionalDetrClassificationHead(nn.Module):
def __init__(self, input_dim: int, inner_dim: int, num_classes: int, pooler_dropout: float):
super().__init__()
self.dense = nn.Linear(input_dim, inner_dim)
self.dropout = nn.Dropout(p=pooler_dropout)
self.out_proj = nn.Linear(inner_dim, num_classes)
def forward(self, hidden_states: torch.Tensor):
hidden_states = self.dropout(hidden_states)
hidden_states = self.dense(hidden_states)
hidden_states = torch.tanh(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.out_proj(hidden_states)
return hidden_states
class MLP(nn.Module):
"""
Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates,
height and width of a bounding box w.r.t. an image.
Copied from https://github.com/facebookresearch/detr/blob/master/models/detr.py
"""
def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
super().__init__()
self.num_layers = num_layers
h = [hidden_dim] * (num_layers - 1)
self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))
def forward(self, x):
for i, layer in enumerate(self.layers):
x = nn.functional.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
return x
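# Usage sketch (assumed sizes): a 3-layer MLP mapping 256-dim decoder states to 4 box values.
bbox_mlp = MLP(input_dim=256, hidden_dim=256, output_dim=4, num_layers=3)
decoder_states = torch.randn(2, 300, 256)
print(bbox_mlp(decoder_states).sigmoid().shape)  # torch.Size([2, 300, 4]), normalized (cx, cy, w, h)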
class ConditionalDetrPreTrainedModel(PreTrainedModel):
config_class = ConditionalDetrConfig
base_model_prefix = "model"
main_input_name = "pixel_values"
_no_split_modules = [r"ConditionalDetrConvEncoder", r"ConditionalDetrEncoderLayer", r"ConditionalDetrDecoderLayer"]
def _init_weights(self, module):
std = self.config.init_std
xavier_std = self.config.init_xavier_std
if isinstance(module, ConditionalDetrMHAttentionMap):
nn.init.zeros_(module.k_linear.bias)
nn.init.zeros_(module.q_linear.bias)
nn.init.xavier_uniform_(module.k_linear.weight, gain=xavier_std)
nn.init.xavier_uniform_(module.q_linear.weight, gain=xavier_std)
elif isinstance(module, ConditionalDetrLearnedPositionEmbedding):
nn.init.uniform_(module.row_embeddings.weight)
nn.init.uniform_(module.column_embeddings.weight)
if isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)):
module.weight.data.normal_(mean=0.0, std=std)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=std)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
CONDITIONAL_DETR_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`ConditionalDetrConfig`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
class ConditionalDetrEncoder(ConditionalDetrPreTrainedModel):
"""
Transformer编码器,包含config.encoder_layers个自注意力层。每一层是一个ConditionalDetrEncoderLayer。
编码器通过多个自注意力层更新扁平化特征图。
对于ConditionalDETR的小调整:
- 对象查询(object_queries)在前向传播中添加。
"""
Args:
config: ConditionalDetrConfig
"""
def __init__(self, config: ConditionalDetrConfig):
super().__init__(config)
self.dropout = config.dropout
self.layerdrop = config.encoder_layerdrop
self.layers = nn.ModuleList([ConditionalDetrEncoderLayer(config) for _ in range(config.encoder_layers)])
# in the original ConditionalDETR, no layernorm is used at the end of the encoder, as "normalize_before" is False by default
# Initialize weights and apply final processing
self.post_init()
def forward(
self,
inputs_embeds=None,
attention_mask=None,
object_queries=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
**kwargs,
):
class ConditionalDetrDecoder(ConditionalDetrPreTrainedModel):
"""
Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`ConditionalDetrDecoderLayer`].
The decoder updates the query embeddings through multiple self-attention and cross-attention layers.
Some small tweaks for Conditional DETR:
- object_queries and query_position_embeddings are added to the forward pass.
- if self.config.auxiliary_loss is set to True, also returns a stack of activations from all decoding layers.
Args:
config: ConditionalDetrConfig
"""
def __init__(self, config: ConditionalDetrConfig):
super().__init__(config)
self.dropout = config.dropout
self.layerdrop = config.decoder_layerdrop
self.layers = nn.ModuleList([ConditionalDetrDecoderLayer(config) for _ in range(config.decoder_layers)])
# in Conditional DETR, a layernorm is applied to the output of the last decoder layer
self.layernorm = nn.LayerNorm(config.d_model)
d_model = config.d_model
self.gradient_checkpointing = False
# query_scale is the FFN applied on f to generate transformation T
self.query_scale = MLP(d_model, d_model, d_model, 2)
self.ref_point_head = MLP(d_model, d_model, 2, 2)
# ca_qpos_proj is set to None for every decoder layer except the first
for layer_id in range(config.decoder_layers - 1):
self.layers[layer_id + 1].ca_qpos_proj = None
# Initialize weights and apply final processing
self.post_init()
def forward(
self,
inputs_embeds=None,
attention_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
object_queries=None,
query_position_embeddings=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
**kwargs,
):
@add_start_docstrings(
"""
The bare Conditional DETR Model (consisting of a backbone and encoder-decoder Transformer) outputting raw
hidden-states without any specific head on top.
""",
CONDITIONAL_DETR_START_DOCSTRING,
)
class ConditionalDetrModel(ConditionalDetrPreTrainedModel):
def __init__(self, config: ConditionalDetrConfig):
super().__init__(config)
# Create backbone + positional encoding
backbone = ConditionalDetrConvEncoder(config)
object_queries = build_position_encoding(config)
self.backbone = ConditionalDetrConvModel(backbone, object_queries)
# Create projection layer
self.input_projection = nn.Conv2d(backbone.intermediate_channel_sizes[-1], config.d_model, kernel_size=1)
self.query_position_embeddings = nn.Embedding(config.num_queries, config.d_model)
self.encoder = ConditionalDetrEncoder(config)
self.decoder = ConditionalDetrDecoder(config)
# Initialize weights and apply final processing
self.post_init()
def get_encoder(self):
return self.encoder
def get_decoder(self):
return self.decoder
def freeze_backbone(self):
# disable gradient updates for all backbone parameters
for name, param in self.backbone.conv_encoder.model.named_parameters():
param.requires_grad_(False)
def unfreeze_backbone(self):
# re-enable gradient updates for all backbone parameters
for name, param in self.backbone.conv_encoder.model.named_parameters():
param.requires_grad_(True)
@add_start_docstrings_to_model_forward(CONDITIONAL_DETR_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=ConditionalDetrModelOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
pixel_values: torch.FloatTensor,
pixel_mask: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.LongTensor] = None,
encoder_outputs: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
@add_start_docstrings(
"""
CONDITIONAL_DETR Model (consisting of a backbone and encoder-decoder Transformer) with object detection heads on
top, for tasks such as COCO detection.
""",
CONDITIONAL_DETR_START_DOCSTRING,
)
class ConditionalDetrForObjectDetection(ConditionalDetrPreTrainedModel):
def __init__(self, config: ConditionalDetrConfig):
super().__init__(config)
# CONDITIONAL DETR encoder-decoder model
self.model = ConditionalDetrModel(config)
# Object detection heads
self.class_labels_classifier = nn.Linear(
config.d_model, config.num_labels
)  # no "+ 1" for a no-object class here: Conditional DETR scores classes with sigmoid focal loss
self.bbox_predictor = ConditionalDetrMLPPredictionHead(
input_dim=config.d_model, hidden_dim=config.d_model, output_dim=4, num_layers=3
)
# Initialize weights and apply final processing
self.post_init()
# taken from https://github.com/Atten4Vis/conditionalDETR/blob/master/models/conditional_detr.py
@torch.jit.unused
def _set_aux_loss(self, outputs_class, outputs_coord):
# this is a workaround to make torchscript happy, as torchscript
# doesn't support dictionary with non-homogeneous values, such
# as a dict having both a Tensor and a list.
return [{"logits": a, "pred_boxes": b} for a, b in zip(outputs_class[:-1], outputs_coord[:-1])]
@add_start_docstrings_to_model_forward(CONDITIONAL_DETR_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=ConditionalDetrObjectDetectionOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
pixel_values: torch.FloatTensor,
pixel_mask: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.LongTensor] = None,
encoder_outputs: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[List[dict]] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
"""
Perform forward pass of the Conditional DETR model for object detection.
Args:
pixel_values (torch.FloatTensor): Tensor of pixel values of shape (batch_size, sequence_length, channels).
pixel_mask (Optional[torch.LongTensor]): Optional tensor of pixel masks with shape (batch_size, sequence_length).
decoder_attention_mask (Optional[torch.LongTensor]): Optional tensor indicating which positions should be
attended to by the decoder with shape (batch_size, sequence_length).
encoder_outputs (Optional[torch.FloatTensor]): Optional tensor with encoder outputs of shape
(batch_size, sequence_length, hidden_size).
inputs_embeds (Optional[torch.FloatTensor]): Optional tensor of embeddings to be used as inputs to the decoder
instead of pixel_values.
decoder_inputs_embeds (Optional[torch.FloatTensor]): Optional tensor of embeddings to be used as inputs to the
decoder.
labels (Optional[List[dict]]): Optional list of dictionaries containing labels for object detection.
output_attentions (Optional[bool]): Whether to output attentions weights.
output_hidden_states (Optional[bool]): Whether to output hidden states.
return_dict (Optional[bool]): Whether to return a dictionary as output.
Returns:
ConditionalDetrObjectDetectionOutput: Output object containing the logits and predicted boxes.
"""
"""
CONDITIONAL_DETR Model (consisting of a backbone and encoder-decoder Transformer) with a segmentation head on top,
for tasks such as COCO panoptic.
"""
@add_start_docstrings(
"""
CONDITIONAL_DETR Model (consisting of a backbone and encoder-decoder Transformer) with a segmentation head on top,
for tasks such as COCO panoptic.
""",
CONDITIONAL_DETR_START_DOCSTRING,
)
class ConditionalDetrForSegmentation(ConditionalDetrPreTrainedModel):
def __init__(self, config: ConditionalDetrConfig):
super().__init__(config)
# object detection model
self.conditional_detr = ConditionalDetrForObjectDetection(config)
# segmentation head
hidden_size, number_of_heads = config.d_model, config.encoder_attention_heads
intermediate_channel_sizes = self.conditional_detr.model.backbone.conv_encoder.intermediate_channel_sizes
self.mask_head = ConditionalDetrMaskHeadSmallConv(
hidden_size + number_of_heads, intermediate_channel_sizes[::-1][-3:], hidden_size
)
self.bbox_attention = ConditionalDetrMHAttentionMap(
hidden_size, hidden_size, number_of_heads, dropout=0.0, std=config.init_xavier_std
)
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(CONDITIONAL_DETR_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=ConditionalDetrSegmentationOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
pixel_values: torch.FloatTensor,
pixel_mask: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.FloatTensor] = None,
encoder_outputs: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[List[dict]] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
def _expand(tensor, length: int):
# insert a dim at position 1, repeat it `length` times, then fold it into the batch dim
return tensor.unsqueeze(1).repeat(1, int(length), 1, 1, 1).flatten(0, 1)
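# Sketch (assumed shapes): tiling a feature map once per query before the mask head.
features = torch.randn(2, 256, 16, 16)
print(_expand(features, 300).shape)  # torch.Size([600, 256, 16, 16])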
# Copied from transformers.models.detr.modeling_detr.DetrMaskHeadSmallConv with Detr->ConditionalDetr
class ConditionalDetrMaskHeadSmallConv(nn.Module):
"""
Simple convolutional head, using group norm. Upsampling is done using an FPN approach.
"""
def __init__(self, dim, fpn_dims, context_dim):
super().__init__()
if dim % 8 != 0:
raise ValueError(
"The hidden_size + number of attention heads must be divisible by 8 as the number of groups in GroupNorm is set to 8"
)
inter_dims = [dim, context_dim // 2, context_dim // 4, context_dim // 8, context_dim // 16, context_dim // 64]
# convolution + group-norm blocks of decreasing width
self.lay1 = nn.Conv2d(dim, dim, 3, padding=1)
self.gn1 = nn.GroupNorm(8, dim)
self.lay2 = nn.Conv2d(dim, inter_dims[1], 3, padding=1)
self.gn2 = nn.GroupNorm(min(8, inter_dims[1]), inter_dims[1])
self.lay3 = nn.Conv2d(inter_dims[1], inter_dims[2], 3, padding=1)
self.gn3 = nn.GroupNorm(min(8, inter_dims[2]), inter_dims[2])
self.lay4 = nn.Conv2d(inter_dims[2], inter_dims[3], 3, padding=1)
self.gn4 = nn.GroupNorm(min(8, inter_dims[3]), inter_dims[3])
self.lay5 = nn.Conv2d(inter_dims[3], inter_dims[4], 3, padding=1)
self.gn5 = nn.GroupNorm(min(8, inter_dims[4]), inter_dims[4])
self.out_lay = nn.Conv2d(inter_dims[4], 1, 3, padding=1)
self.dim = dim
# adapter layers project the FPN feature maps to the intermediate dims
self.adapter1 = nn.Conv2d(fpn_dims[0], inter_dims[1], 1)
self.adapter2 = nn.Conv2d(fpn_dims[1], inter_dims[2], 1)
self.adapter3 = nn.Conv2d(fpn_dims[2], inter_dims[3], 1)
# Kaiming initialization for conv weights, zeros for biases
for m in self.modules():
if isinstance(m, nn.Conv2d):
nn.init.kaiming_uniform_(m.weight, a=1)
nn.init.constant_(m.bias, 0)
def forward(self, x: Tensor, bbox_mask: Tensor, fpns: List[Tensor]):
# concatenate x, the projected feature map of shape (batch_size, d_model, height/32, width/32),
# with bbox_mask, the attention maps of shape (batch_size, n_queries, n_heads, height/32, width/32)
x = torch.cat([_expand(x, bbox_mask.shape[1]), bbox_mask.flatten(0, 1)], 1)
x = self.lay1(x)
x = self.gn1(x)
x = nn.functional.relu(x)
x = self.lay2(x)
x = self.gn2(x)
x = nn.functional.relu(x)
# adapt the first FPN branch, expand it if its batch size doesn't match x, then fuse and upsample
cur_fpn = self.adapter1(fpns[0])
if cur_fpn.size(0) != x.size(0):
cur_fpn = _expand(cur_fpn, x.size(0) // cur_fpn.size(0))
x = cur_fpn + nn.functional.interpolate(x, size=cur_fpn.shape[-2:], mode="nearest")
x = self.lay3(x)
x = self.gn3(x)
x = nn.functional.relu(x)
# same for the second FPN branch
cur_fpn = self.adapter2(fpns[1])
if cur_fpn.size(0) != x.size(0):
cur_fpn = _expand(cur_fpn, x.size(0) // cur_fpn.size(0))
x = cur_fpn + nn.functional.interpolate(x, size=cur_fpn.shape[-2:], mode="nearest")
x = self.lay4(x)
x = self.gn4(x)
x = nn.functional.relu(x)
# and for the third
cur_fpn = self.adapter3(fpns[2])
if cur_fpn.size(0) != x.size(0):
cur_fpn = _expand(cur_fpn, x.size(0) // cur_fpn.size(0))
x = cur_fpn + nn.functional.interpolate(x, size=cur_fpn.shape[-2:], mode="nearest")
x = self.lay5(x)
x = self.gn5(x)
x = nn.functional.relu(x)
x = self.out_lay(x)
return x
# Copied from transformers.models.detr.modeling_detr.DetrMHAttentionMap with Detr->ConditionalDetr
class ConditionalDetrMHAttentionMap(nn.Module):
"""This is a 2D attention module, which only returns the attention softmax (no multiplication by value)"""
def __init__(self, query_dim, hidden_dim, num_heads, dropout=0.0, bias=True, std=None):
super().__init__()
self.num_heads = num_heads
self.hidden_dim = hidden_dim
self.dropout = nn.Dropout(dropout)
self.q_linear = nn.Linear(query_dim, hidden_dim, bias=bias)
self.k_linear = nn.Linear(query_dim, hidden_dim, bias=bias)
# normalization factor that scales each head's attention scores
self.normalize_fact = float(hidden_dim / self.num_heads) ** -0.5
def forward(self, q, k, mask: Optional[Tensor] = None):
q = self.q_linear(q)
# apply the key projection as a 1x1 convolution over the feature map
k = nn.functional.conv2d(k, self.k_linear.weight.unsqueeze(-1).unsqueeze(-1), self.k_linear.bias)
# split queries and keys into per-head slices
queries_per_head = q.view(q.shape[0], q.shape[1], self.num_heads, self.hidden_dim // self.num_heads)
keys_per_head = k.view(k.shape[0], self.num_heads, self.hidden_dim // self.num_heads, k.shape[-2], k.shape[-1])
# scaled attention scores between every query and every spatial position
weights = torch.einsum("bqnc,bnchw->bqnhw", queries_per_head * self.normalize_fact, keys_per_head)
if mask is not None:
weights.masked_fill_(mask.unsqueeze(1).unsqueeze(1), torch.finfo(weights.dtype).min)
# softmax over the flattened spatial dimensions, then dropout
weights = nn.functional.softmax(weights.flatten(2), dim=-1).view(weights.size())
weights = self.dropout(weights)
return weights
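# Shape sketch (assumed sizes): attention maps of 300 queries over a 16x16 feature map.
attention_map = ConditionalDetrMHAttentionMap(query_dim=256, hidden_dim=256, num_heads=8)
q = torch.randn(2, 300, 256)     # decoder hidden states
k = torch.randn(2, 256, 16, 16)  # projected encoder feature map
print(attention_map(q, k).shape)  # torch.Size([2, 300, 8, 16, 16])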
# Copied from transformers.models.detr.modeling_detr.dice_loss
def dice_loss(inputs, targets, num_boxes):
"""
Compute the DICE loss, similar to generalized IOU for masks
Args:
inputs: A float tensor of arbitrary shape.
The predictions for each example.
targets: A float tensor with the same shape as inputs. Stores the binary
classification label for each element in inputs (0 for the negative class and 1 for the positive
class).
"""
inputs = inputs.sigmoid()
inputs = inputs.flatten(1)
numerator = 2 * (inputs * targets).sum(1)
denominator = inputs.sum(-1) + targets.sum(-1)
loss = 1 - (numerator + 1) / (denominator + 1)
return loss.sum() / num_boxes
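# Worked sketch: a near-perfect mask prediction yields a dice loss close to 0.
mask_logits = torch.tensor([[10.0, 10.0, -10.0, -10.0]])  # ~[1, 1, 0, 0] after sigmoid
mask_targets = torch.tensor([[1.0, 1.0, 0.0, 0.0]])
print(dice_loss(mask_logits, mask_targets, num_boxes=1))  # tensor close to 0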
# Copied from transformers.models.detr.modeling_detr.sigmoid_focal_loss
def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2):
"""
Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002.
"""
prob = inputs.sigmoid()
ce_loss = nn.functional.binary_cross_entropy_with_logits(inputs, targets, reduction="none")
# the modulating factor (1 - p_t) ** gamma down-weights well-classified (easy) examples
p_t = prob * targets + (1 - prob) * (1 - targets)
loss = ce_loss * ((1 - p_t) ** gamma)
# alpha_t balances the contribution of positive and negative examples
if alpha >= 0:
alpha_t = alpha * targets + (1 - alpha) * (1 - targets)
loss = alpha_t * loss
# average over elements per sample, sum over the batch, normalize by num_boxes
return loss.mean(1).sum() / num_boxes
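# Worked sketch: the modulating factor makes a confident correct prediction contribute
# orders of magnitude less than an uncertain one.
target = torch.tensor([[1.0]])
print(sigmoid_focal_loss(torch.tensor([[8.0]]), target, num_boxes=1))  # ~1e-11 (easy example)
print(sigmoid_focal_loss(torch.tensor([[0.0]]), target, num_boxes=1))  # ~0.043 (hard example)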
class ConditionalDetrLoss(nn.Module):
"""
This class computes the losses for ConditionalDetrForObjectDetection/ConditionalDetrForSegmentation. The process
happens in two steps: 1) we compute hungarian assignment between ground truth boxes and the outputs of the model 2)
we supervise each pair of matched ground-truth / prediction (supervise class and box).
Args:
matcher (`ConditionalDetrHungarianMatcher`):
Module able to compute a matching between targets and proposals.
num_classes (`int`):
Number of object categories, omitting the special no-object category.
focal_alpha (`float`):
Alpha parameter in focal loss.
losses (`List[str]`):
List of all the losses to be applied. See `get_loss` for a list of all available losses.
"""
# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrLoss.__init__
def __init__(self, matcher, num_classes, focal_alpha, losses):
super().__init__()
self.matcher = matcher
self.num_classes = num_classes
self.focal_alpha = focal_alpha
self.losses = losses
# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrLoss.loss_labels
def loss_labels(self, outputs, targets, indices, num_boxes):
"""
Classification loss (Binary focal loss) targets dicts must contain the key "class_labels" containing a tensor
of dim [nb_target_boxes]
"""
if "logits" not in outputs:
raise KeyError("No logits were found in the outputs")
source_logits = outputs["logits"]
idx = self._get_source_permutation_idx(indices)
target_classes_o = torch.cat([t["class_labels"][J] for t, (_, J) in zip(targets, indices)])
# default every query to the no-object class (index num_classes), then fill in the matched targets
target_classes = torch.full(
source_logits.shape[:2], self.num_classes, dtype=torch.int64, device=source_logits.device
)
target_classes[idx] = target_classes_o
# one-hot encode the targets with an extra column for the no-object class ...
target_classes_onehot = torch.zeros(
[source_logits.shape[0], source_logits.shape[1], source_logits.shape[2] + 1],
dtype=source_logits.dtype,
layout=source_logits.layout,
device=source_logits.device,
)
target_classes_onehot.scatter_(2, target_classes.unsqueeze(-1), 1)
# ... then drop that column so the shape matches source_logits
target_classes_onehot = target_classes_onehot[:, :, :-1]
# focal loss, scaled by the number of queries
loss_ce = (
sigmoid_focal_loss(source_logits, target_classes_onehot, num_boxes, alpha=self.focal_alpha, gamma=2)
* source_logits.shape[1]
)
losses = {"loss_ce": loss_ce}
return losses
# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrLoss.loss_cardinality
@torch.no_grad()
def loss_cardinality(self, outputs, targets, indices, num_boxes):
"""
Compute the cardinality error, i.e. the absolute error in the number of predicted non-empty boxes.
This is not really a loss, it is intended for logging purposes only. It doesn't propagate gradients.
"""
logits = outputs["logits"]
device = logits.device
target_lengths = torch.as_tensor([len(v["class_labels"]) for v in targets], device=device)
# count predictions whose argmax is not the "no-object" (last) class
card_pred = (logits.argmax(-1) != logits.shape[-1] - 1).sum(1)
card_err = nn.functional.l1_loss(card_pred.float(), target_lengths.float())
losses = {"cardinality_error": card_err}
return losses
# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrLoss.loss_boxes
def loss_boxes(self, outputs, targets, indices, num_boxes):
"""
Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss.
Targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4]. The target boxes
are expected in format (center_x, center_y, w, h), normalized by the image size.
"""
if "pred_boxes" not in outputs:
raise KeyError("No predicted boxes found in outputs")
idx = self._get_source_permutation_idx(indices)
source_boxes = outputs["pred_boxes"][idx]
target_boxes = torch.cat([t["boxes"][i] for t, (_, i) in zip(targets, indices)], dim=0)
# L1 regression loss between matched predicted and target boxes, averaged per box
loss_bbox = nn.functional.l1_loss(source_boxes, target_boxes, reduction="none")
losses = {}
losses["loss_bbox"] = loss_bbox.sum() / num_boxes
# GIoU loss; the diagonal pairs the i-th prediction with the i-th target
loss_giou = 1 - torch.diag(
generalized_box_iou(center_to_corners_format(source_boxes), center_to_corners_format(target_boxes))
)
losses["loss_giou"] = loss_giou.sum() / num_boxes
return losses
# Copied from transformers.models.detr.modeling_detr.DetrLoss.loss_masks
def loss_masks(self, outputs, targets, indices, num_boxes):
"""
Compute the losses related to the masks: the focal loss and the dice loss.
Targets dicts must contain the key "masks" containing a tensor of dim [nb_target_boxes, h, w].
"""
if "pred_masks" not in outputs:
raise KeyError("No predicted masks found in outputs")
source_idx = self._get_source_permutation_idx(indices)
target_idx = self._get_target_permutation_idx(indices)
source_masks = outputs["pred_masks"]
source_masks = source_masks[source_idx]
masks = [t["masks"] for t in targets]
# pad the target masks into a batch and keep track of the valid region
target_masks, valid = nested_tensor_from_tensor_list(masks).decompose()
target_masks = target_masks.to(source_masks)
target_masks = target_masks[target_idx]
# upsample predictions to the target size
source_masks = nn.functional.interpolate(
source_masks[:, None], size=target_masks.shape[-2:], mode="bilinear", align_corners=False
)
source_masks = source_masks[:, 0].flatten(1)
target_masks = target_masks.flatten(1)
target_masks = target_masks.view(source_masks.shape)
losses = {
"loss_mask": sigmoid_focal_loss(source_masks, target_masks, num_boxes),
"loss_dice": dice_loss(source_masks, target_masks, num_boxes),
}
return losses
# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrLoss._get_source_permutation_idx
def _get_source_permutation_idx(self, indices):
# permute predictions following indices
batch_idx = torch.cat([torch.full_like(source, i) for i, (source, _) in enumerate(indices)])
source_idx = torch.cat([source for (source, _) in indices])
return batch_idx, source_idx
# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrLoss._get_target_permutation_idx
def _get_target_permutation_idx(self, indices):
# permute targets following indices
batch_idx = torch.cat([torch.full_like(target, i) for i, (_, target) in enumerate(indices)])
target_idx = torch.cat([target for (_, target) in indices])
return batch_idx, target_idx
# Copied from transformers.models.detr.modeling_detr.DetrLoss.get_loss
def get_loss(self, loss, outputs, targets, indices, num_boxes):
# dispatch to the loss function registered under the given name
loss_map = {
"labels": self.loss_labels,
"cardinality": self.loss_cardinality,
"boxes": self.loss_boxes,
"masks": self.loss_masks,
}
if loss not in loss_map:
raise ValueError(f"Loss {loss} not supported")
return loss_map[loss](outputs, targets, indices, num_boxes)
# Copied from transformers.models.detr.modeling_detr.DetrLoss.forward
def forward(self, outputs, targets):
"""
This method computes the losses for the model during training.
Args:
outputs (`dict`, *optional*):
Dictionary containing tensors representing model predictions.
targets (`List[dict]`, *optional*):
List of dictionaries where each dictionary corresponds to target data for one sample in the batch.
Returns:
losses (`dict`):
Dictionary containing computed losses.
"""
# Exclude auxiliary outputs from outputs dictionary
outputs_without_aux = {k: v for k, v in outputs.items() if k != "auxiliary_outputs"}
# Match model outputs with target data
indices = self.matcher(outputs_without_aux, targets)
# Calculate the total number of target boxes across all samples
num_boxes = sum(len(t["class_labels"]) for t in targets)
num_boxes = torch.as_tensor([num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device)
# Determine the world size for distributed training
world_size = 1
if is_accelerate_available():
if PartialState._shared_state != {}:
num_boxes = reduce(num_boxes)
world_size = PartialState().num_processes
# Normalize num_boxes and ensure it is at least 1
num_boxes = torch.clamp(num_boxes / world_size, min=1).item()
# Compute losses for each specified loss type
losses = {}
for loss in self.losses:
losses.update(self.get_loss(loss, outputs, targets, indices, num_boxes))
# If there are auxiliary outputs, compute losses for each auxiliary output separately
if "auxiliary_outputs" in outputs:
for i, auxiliary_outputs in enumerate(outputs["auxiliary_outputs"]):
indices = self.matcher(auxiliary_outputs, targets)
for loss in self.losses:
if loss == "masks":
# Skip computing intermediate masks losses due to computational cost
continue
# Append index suffix to loss keys to distinguish between auxiliary losses
l_dict = self.get_loss(loss, auxiliary_outputs, targets, indices, num_boxes)
l_dict = {k + f"_{i}": v for k, v in l_dict.items()}
losses.update(l_dict)
return losses
# Copied from transformers.models.detr.modeling_detr.DetrMLPPredictionHead with Detr->ConditionalDetr
class ConditionalDetrMLPPredictionHead(nn.Module):
"""
Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates,
height and width of a bounding box w.r.t. an image.
Copied from https://github.com/facebookresearch/detr/blob/master/models/detr.py
"""
def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
super().__init__()
self.num_layers = num_layers
h = [hidden_dim] * (num_layers - 1)
# stack of linear layers that make up the MLP
self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))
def forward(self, x):
for i, layer in enumerate(self.layers):
# apply ReLU after every layer except the last
x = nn.functional.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
return x
# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrHungarianMatcher with DeformableDetr->ConditionalDetr
class ConditionalDetrHungarianMatcher(nn.Module):
"""
This class computes an assignment between the targets and the predictions of the network.
For efficiency reasons, the targets don't include the no_object. Because of this, in general, there are more
predictions than targets. In this case, we do a 1-to-1 matching of the best predictions, while the others are
un-matched (and thus treated as non-objects).
Args:
class_cost:
The relative weight of the classification error in the matching cost.
bbox_cost:
The relative weight of the L1 error of the bounding box coordinates in the matching cost.
giou_cost:
The relative weight of the giou loss of the bounding box in the matching cost.
"""
def __init__(self, class_cost: float = 1, bbox_cost: float = 1, giou_cost: float = 1):
super().__init__()
requires_backends(self, ["scipy"])
self.class_cost = class_cost
self.bbox_cost = bbox_cost
self.giou_cost = giou_cost
if class_cost == 0 and bbox_cost == 0 and giou_cost == 0:
raise ValueError("All costs of the Matcher can't be 0")
@torch.no_grad()
def forward(self, outputs, targets):
"""
Args:
outputs (`dict`):
A dictionary that contains at least these entries:
* "logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits
* "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates.
targets (`List[dict]`):
A list of targets (len(targets) = batch_size), where each target is a dict containing:
* "class_labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of
ground-truth objects in the target) containing the class labels
* "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates.
Returns:
`List[Tuple]`: A list of size `batch_size`, containing tuples of (index_i, index_j) where:
- index_i is the indices of the selected predictions (in order)
- index_j is the indices of the corresponding selected targets (in order)
For each batch element, it holds: len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
"""
batch_size, num_queries = outputs["logits"].shape[:2]
# We flatten to compute the cost matrices in a batch
out_prob = outputs["logits"].flatten(0, 1).sigmoid()  # [batch_size * num_queries, num_classes]
out_bbox = outputs["pred_boxes"].flatten(0, 1)  # [batch_size * num_queries, 4]
# Also concat the target labels and boxes
target_ids = torch.cat([v["class_labels"] for v in targets])
target_bbox = torch.cat([v["boxes"] for v in targets])
# Compute the classification cost (focal-loss style), keeping only the columns of the target classes
alpha = 0.25
gamma = 2.0
neg_cost_class = (1 - alpha) * (out_prob**gamma) * (-(1 - out_prob + 1e-8).log())
pos_cost_class = alpha * ((1 - out_prob) ** gamma) * (-(out_prob + 1e-8).log())
class_cost = pos_cost_class[:, target_ids] - neg_cost_class[:, target_ids]
# Compute the L1 cost between boxes
bbox_cost = torch.cdist(out_bbox, target_bbox, p=1)
# Compute the giou cost between boxes
giou_cost = -generalized_box_iou(center_to_corners_format(out_bbox), center_to_corners_format(target_bbox))
# Final cost matrix, combining the three weighted costs
cost_matrix = self.bbox_cost * bbox_cost + self.class_cost * class_cost + self.giou_cost * giou_cost
cost_matrix = cost_matrix.view(batch_size, num_queries, -1).cpu()
# Perform the Hungarian assignment per image, splitting the cost matrix by target sizes
sizes = [len(v["boxes"]) for v in targets]
indices = [linear_sum_assignment(c[i]) for i, c in enumerate(cost_matrix.split(sizes, -1))]
return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices]
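# Usage sketch (hypothetical shapes; 2.0/5.0/2.0 mirror the ConditionalDetrConfig defaults):
# matching 10 queries against 3 ground-truth boxes.
matcher = ConditionalDetrHungarianMatcher(class_cost=2.0, bbox_cost=5.0, giou_cost=2.0)
outputs = {"logits": torch.randn(1, 10, 91), "pred_boxes": torch.rand(1, 10, 4)}
targets = [{"class_labels": torch.tensor([3, 17, 42]), "boxes": torch.rand(3, 4)}]
(pred_idx, target_idx), = matcher(outputs, targets)
print(pred_idx, target_idx)  # e.g. tensor([1, 4, 9]) tensor([2, 0, 1])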
# Copied from transformers.models.detr.modeling_detr._upcast
def _upcast(t: Tensor) -> Tensor:
# Protects from numerical overflows in multiplications by upcasting to the equivalent higher type
if t.is_floating_point():
return t if t.dtype in (torch.float32, torch.float64) else t.float()
else:
return t if t.dtype in (torch.int32, torch.int64) else t.int()
# Copied from transformers.models.detr.modeling_detr.box_area
def box_area(boxes: Tensor) -> Tensor:
"""
Computes the area of a set of bounding boxes, which are specified by their (x1, y1, x2, y2) coordinates.
Args:
boxes (`torch.FloatTensor` of shape `(number_of_boxes, 4)`):
Boxes for which the area will be computed. They are expected to be in (x1, y1, x2, y2) format with
`0 <= x1 < x2` and `0 <= y1 < y2`.
Returns:
`torch.FloatTensor`: a tensor containing the area for each box.
"""
boxes = _upcast(boxes)
return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
# Copied from transformers.models.detr.modeling_detr.box_iou
# Compute pairwise IoU between two sets of boxes
def box_iou(boxes1, boxes2):
area1 = box_area(boxes1)
area2 = box_area(boxes2)
left_top = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2]
right_bottom = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2]
width_height = (right_bottom - left_top).clamp(min=0) # [N,M,2]
inter = width_height[:, :, 0] * width_height[:, :, 1] # [N,M]
union = area1[:, None] + area2 - inter
iou = inter / union
return iou, union
# Copied from transformers.models.detr.modeling_detr.generalized_box_iou
def generalized_box_iou(boxes1, boxes2):
"""
Generalized IoU from https://giou.stanford.edu/. The boxes should be in [x0, y0, x1, y1] (corners) format.
Returns:
`torch.FloatTensor`: a [N, M] pairwise matrix, where N = len(boxes1) and M = len(boxes2)
"""
# degenerate boxes give inf / nan results, so do an early check
if not (boxes1[:, 2:] >= boxes1[:, :2]).all():
raise ValueError(f"boxes1 must be in [x0, y0, x1, y1] (corner) format, but got {boxes1}")
if not (boxes2[:, 2:] >= boxes2[:, :2]).all():
raise ValueError(f"boxes2 must be in [x0, y0, x1, y1] (corner) format, but got {boxes2}")
iou, union = box_iou(boxes1, boxes2)
top_left = torch.min(boxes1[:, None, :2], boxes2[:, :2])
bottom_right = torch.max(boxes1[:, None, 2:], boxes2[:, 2:])
width_height = (bottom_right - top_left).clamp(min=0) # [N,M,2]
area = width_height[:, :, 0] * width_height[:, :, 1]
return iou - (area - union) / area
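# Worked sketch: two unit squares overlapping by half a side. The enclosing box equals the
# union here, so GIoU coincides with plain IoU.
boxes_a = torch.tensor([[0.0, 0.0, 1.0, 1.0]])
boxes_b = torch.tensor([[0.5, 0.0, 1.5, 1.0]])
iou, union = box_iou(boxes_a, boxes_b)
print(iou, generalized_box_iou(boxes_a, boxes_b))  # tensor([[0.3333]]) twice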
# Copied from transformers.models.detr.modeling_detr._max_by_axis
# Return, for each index, the maximum value across the sublists
def _max_by_axis(the_list):
# type: (List[List[int]]) -> List[int]
maxes = the_list[0]
for sublist in the_list[1:]:
for index, item in enumerate(sublist):
maxes[index] = max(maxes[index], item)
return maxes
# Nested tensor: holds a batched tensor together with an optional padding mask
class NestedTensor(object):
def __init__(self, tensors, mask: Optional[Tensor]):
self.tensors = tensors
self.mask = mask
def to(self, device):
# move both the tensors and the (optional) mask to the given device
cast_tensor = self.tensors.to(device)
mask = self.mask
if mask is not None:
cast_mask = mask.to(device)
else:
cast_mask = None
return NestedTensor(cast_tensor, cast_mask)
def decompose(self):
return self.tensors, self.mask
def __repr__(self):
return str(self.tensors)
# Copied from transformers.models.detr.modeling_detr.nested_tensor_from_tensor_list
def nested_tensor_from_tensor_list(tensor_list: List[Tensor]):
if tensor_list[0].ndim == 3:
# pad every image to the per-axis maximum size in the batch
max_size = _max_by_axis([list(img.shape) for img in tensor_list])
batch_shape = [len(tensor_list)] + max_size
batch_size, num_channels, height, width = batch_shape
dtype = tensor_list[0].dtype
device = tensor_list[0].device
tensor = torch.zeros(batch_shape, dtype=dtype, device=device)
mask = torch.ones((batch_size, height, width), dtype=torch.bool, device=device)
# copy each image into its zero-padded slot and mark its valid pixels as False in the mask
for img, pad_img, m in zip(tensor_list, tensor, mask):
pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img)
m[: img.shape[1], : img.shape[2]] = False
else:
raise ValueError("Only 3-dimensional tensors are supported")
return NestedTensor(tensor, mask)
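# Usage sketch: batching two images of different sizes into one padded tensor plus mask.
images = [torch.randn(3, 480, 600), torch.randn(3, 512, 576)]
padded, pad_mask = nested_tensor_from_tensor_list(images).decompose()
print(padded.shape)    # torch.Size([2, 3, 512, 600])
print(pad_mask.shape)  # torch.Size([2, 512, 600]); True marks padded pixels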
.\models\conditional_detr\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available
_import_structure = {
"configuration_conditional_detr": [
"CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP",
"ConditionalDetrConfig",
"ConditionalDetrOnnxConfig",
]
}
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["feature_extraction_conditional_detr"] = ["ConditionalDetrFeatureExtractor"]
_import_structure["image_processing_conditional_detr"] = ["ConditionalDetrImageProcessor"]
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_conditional_detr"] = [
"CONDITIONAL_DETR_PRETRAINED_MODEL_ARCHIVE_LIST",
"ConditionalDetrForObjectDetection",
"ConditionalDetrForSegmentation",
"ConditionalDetrModel",
"ConditionalDetrPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_conditional_detr import (
CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP,
ConditionalDetrConfig,
ConditionalDetrOnnxConfig,
)
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .feature_extraction_conditional_detr import ConditionalDetrFeatureExtractor
from .image_processing_conditional_detr import ConditionalDetrImageProcessor
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_conditional_detr import (
CONDITIONAL_DETR_PRETRAINED_MODEL_ARCHIVE_LIST,
ConditionalDetrForObjectDetection,
ConditionalDetrForSegmentation,
ConditionalDetrModel,
ConditionalDetrPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\convbert\configuration_convbert.py
from collections import OrderedDict
from typing import Mapping
from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfig
from ...utils import logging
logger = logging.get_logger(__name__)
CONVBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"YituTech/conv-bert-base": "https://huggingface.co/YituTech/conv-bert-base/resolve/main/config.json",
"YituTech/conv-bert-medium-small": (
"https://huggingface.co/YituTech/conv-bert-medium-small/resolve/main/config.json"
),
"YituTech/conv-bert-small": "https://huggingface.co/YituTech/conv-bert-small/resolve/main/config.json",
}
class ConvBertConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`ConvBertModel`]. It is used to instantiate a
ConvBERT model according to the specified arguments, defining the model architecture. Instantiating a configuration
with the defaults will yield a configuration similar to that of the ConvBERT
[YituTech/conv-bert-base](https://huggingface.co/YituTech/conv-bert-base) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
"""
def __init__(
self,
vocab_size=30522,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=2,
initializer_range=0.02,
layer_norm_eps=1e-12,
head_ratio=2,
num_groups=1,
conv_kernel_size=9,
classifier_dropout=None
):
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.hidden_act = hidden_act
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.type_vocab_size = type_vocab_size
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.head_ratio = head_ratio
self.num_groups = num_groups
self.conv_kernel_size = conv_kernel_size
self.classifier_dropout = classifier_dropout
# Usage example (from the class docstring):
from transformers import ConvBertConfig, ConvBertModel
# Initializing a ConvBERT convbert-base style configuration
configuration = ConvBertConfig()
# Initializing a model from the configuration (random weights)
model = ConvBertModel(configuration)
# Accessing the model configuration
configuration = model.config
class ConvBertOnnxConfig(OnnxConfig):
@property
def inputs(self) -> Mapping[str, Mapping[int, str]]:
if self.task == "multiple-choice":
dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"}
else:
dynamic_axis = {0: "batch", 1: "sequence"}
return OrderedDict(
[
("input_ids", dynamic_axis),
("attention_mask", dynamic_axis),
("token_type_ids", dynamic_axis),
]
)
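# Usage sketch (assuming the real transformers classes are importable): inspecting the
# dynamic axes the ONNX config declares for export.
onnx_config = ConvBertOnnxConfig(ConvBertConfig())
print(onnx_config.inputs)
# OrderedDict([('input_ids', {0: 'batch', 1: 'sequence'}), ('attention_mask', ...), ('token_type_ids', ...)])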
.\models\convbert\convert_convbert_original_tf1_checkpoint_to_pytorch_and_tf2.py
"""Convert ConvBERT checkpoint."""
import argparse
from transformers import ConvBertConfig, ConvBertModel, TFConvBertModel, load_tf_weights_in_convbert
from transformers.utils import logging
logging.set_verbosity_info()
def convert_orig_tf1_checkpoint_to_pytorch(tf_checkpoint_path, convbert_config_file, pytorch_dump_path):
conf = ConvBertConfig.from_json_file(convbert_config_file)
model = ConvBertModel(conf)
model = load_tf_weights_in_convbert(model, conf, tf_checkpoint_path)
model.save_pretrained(pytorch_dump_path)
tf_model = TFConvBertModel.from_pretrained(pytorch_dump_path, from_pt=True)
tf_model.save_pretrained(pytorch_dump_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path."
)
parser.add_argument(
"--convbert_config_file",
default=None,
type=str,
required=True,
help=(
"The config json file corresponding to the pre-trained ConvBERT model. \n"
"This specifies the model architecture."
),
)
parser.add_argument(
"--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
)
args = parser.parse_args()
convert_orig_tf1_checkpoint_to_pytorch(args.tf_checkpoint_path, args.convbert_config_file, args.pytorch_dump_path)
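For reference, an invocation might look like the following (the paths are placeholders, not from the original script). Afterwards both frameworks can load the dump, because the script saves PyTorch weights and then re-exports them through TFConvBertModel:

# python convert_convbert_original_tf1_checkpoint_to_pytorch_and_tf2.py \
#     --tf_checkpoint_path ./tf_ckpt/model.ckpt \
#     --convbert_config_file ./tf_ckpt/config.json \
#     --pytorch_dump_path ./convbert-base-converted
from transformers import ConvBertModel, TFConvBertModel

pt_model = ConvBertModel.from_pretrained("./convbert-base-converted")
tf_model = TFConvBertModel.from_pretrained("./convbert-base-converted")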
.\models\convbert\modeling_convbert.py
""" PyTorch ConvBERT 模型。"""
import math
import os
from operator import attrgetter
from typing import Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN, get_activation
from ...modeling_outputs import (
BaseModelOutputWithCrossAttentions,
MaskedLMOutput,
MultipleChoiceModelOutput,
QuestionAnsweringModelOutput,
SequenceClassifierOutput,
TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel, SequenceSummary
from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
from .configuration_convbert import ConvBertConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "YituTech/conv-bert-base"
_CONFIG_FOR_DOC = "ConvBertConfig"
CONVBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"YituTech/conv-bert-base",
"YituTech/conv-bert-medium-small",
"YituTech/conv-bert-small",
]
def load_tf_weights_in_convbert(model, config, tf_checkpoint_path):
"""从TensorFlow检查点加载权重到PyTorch模型中。"""
try:
import tensorflow as tf
except ImportError:
logger.error(
"在PyTorch中加载TensorFlow模型需要安装TensorFlow。请参阅 "
"https://www.tensorflow.org/install/ 获取安装说明。"
)
raise
tf_path = os.path.abspath(tf_checkpoint_path)
logger.info(f"从 {tf_path} 转换TensorFlow检查点")
init_vars = tf.train.list_variables(tf_path)
tf_data = {}
for name, shape in init_vars:
logger.info(f"加载TF权重 {name},形状为 {shape}")
array = tf.train.load_variable(tf_path, name)
tf_data[name] = array
param_mapping = {
"embeddings.word_embeddings.weight": "electra/embeddings/word_embeddings",
"embeddings.position_embeddings.weight": "electra/embeddings/position_embeddings",
"embeddings.token_type_embeddings.weight": "electra/embeddings/token_type_embeddings",
"embeddings.LayerNorm.weight": "electra/embeddings/LayerNorm/gamma",
"embeddings.LayerNorm.bias": "electra/embeddings/LayerNorm/beta",
"embeddings_project.weight": "electra/embeddings_project/kernel",
"embeddings_project.bias": "electra/embeddings_project/bias",
}
# (In the full source, per-layer entries for the attention, convolution and FFN weights are
# added to param_mapping in a loop over config.num_hidden_layers; that construction is
# omitted in this excerpt.)
if config.num_groups > 1:
group_dense_name = "g_dense"
else:
group_dense_name = "dense"
for param in model.named_parameters():
param_name = param[0]
retriever = attrgetter(param_name)
result = retriever(model)
tf_name = param_mapping[param_name]
value = torch.from_numpy(tf_data[tf_name])
logger.info(f"TF: {tf_name}, PT: {param_name} ")
if tf_name.endswith("/kernel"):
if not tf_name.endswith("/intermediate/g_dense/kernel"):
if not tf_name.endswith("/output/g_dense/kernel"):
value = value.T
elif tf_name.endswith("/depthwise_kernel"):
value = value.permute(1, 2, 0)
elif tf_name.endswith("/pointwise_kernel"):
value = value.permute(2, 1, 0)
elif tf_name.endswith("/conv_attn_key/bias"):
value = value.unsqueeze(-1)
result.data = value
return model
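The transposes above follow from the different weight layouts of the two frameworks; a small sketch of the dense-kernel case:

import numpy as np
import torch

# TF stores dense kernels as [in_features, out_features]; torch.nn.Linear.weight is
# [out_features, in_features], hence the plain .T for /kernel entries. Depthwise and
# pointwise conv kernels need the explicit permutations shown above instead.
tf_kernel = np.random.randn(768, 3072).astype(np.float32)
pt_weight = torch.from_numpy(tf_kernel).T
print(pt_weight.shape)  # torch.Size([3072, 768]) == nn.Linear(768, 3072).weight.shape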
class ConvBertEmbeddings(nn.Module):
"""Construct the embeddings from word, position and token_type embeddings."""
def __init__(self, config):
super().__init__()
self.word_embeddings = nn.Embedding(config.vocab_size, config.embedding_size, padding_idx=config.pad_token_id)
self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.embedding_size)
self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.embedding_size)
self.LayerNorm = nn.LayerNorm(config.embedding_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
self.register_buffer(
"token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
) -> torch.LongTensor:
if input_ids is not None:
input_shape = input_ids.size()
else:
input_shape = inputs_embeds.size()[:-1]
seq_length = input_shape[1]
if position_ids is None:
position_ids = self.position_ids[:, :seq_length]
if token_type_ids is None:
if hasattr(self, "token_type_ids"):
buffered_token_type_ids = self.token_type_ids[:, :seq_length]
buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
token_type_ids = buffered_token_type_ids_expanded
else:
token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
if inputs_embeds is None:
inputs_embeds = self.word_embeddings(input_ids)
position_embeddings = self.position_embeddings(position_ids)
token_type_embeddings = self.token_type_embeddings(token_type_ids)
embeddings = inputs_embeds + position_embeddings + token_type_embeddings
embeddings = self.LayerNorm(embeddings)
embeddings = self.dropout(embeddings)
return embeddings
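A minimal shape check for this module (standalone sketch; the direct import path is an internal detail):

import torch
from transformers import ConvBertConfig
from transformers.models.convbert.modeling_convbert import ConvBertEmbeddings

config = ConvBertConfig()
embeddings = ConvBertEmbeddings(config)
input_ids = torch.randint(0, config.vocab_size, (2, 8))
out = embeddings(input_ids=input_ids)
print(out.shape)  # torch.Size([2, 8, 768]) -- (batch, seq_len, embedding_size)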
class ConvBertPreTrainedModel(PreTrainedModel):
"""
# 用于处理权重初始化和简单接口以下载和加载预训练模型的抽象类。
# 指定配置类为ConvBertConfig
config_class = ConvBertConfig
# 加载 TensorFlow 权重的函数为load_tf_weights_in_convbert
load_tf_weights = load_tf_weights_in_convbert
# 基础模型前缀为"convbert"
base_model_prefix = "convbert"
# 支持梯度检查点
supports_gradient_checkpointing = True
def _init_weights(self, module):
"""初始化模型的权重"""
# 如果是线性层(nn.Linear)
if isinstance(module, nn.Linear):
# 使用正态分布初始化权重,均值为0,标准差为配置文件中的initializer_range
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
# 如果存在偏置,则将偏置初始化为0
if module.bias is not None:
module.bias.data.zero_()
# 如果是嵌入层(nn.Embedding)
elif isinstance(module, nn.Embedding):
# 使用正态分布初始化权重,均值为0,标准差为配置文件中的initializer_range
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
# 如果存在填充索引(padding_idx),则将对应位置的权重初始化为0
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
# 如果是层归一化层(nn.LayerNorm)
elif isinstance(module, nn.LayerNorm):
# 将偏置初始化为0
module.bias.data.zero_()
# 将权重初始化为1
module.weight.data.fill_(1.0)
class SeparableConv1D(nn.Module):
"""This class implements separable convolution, i.e. a depthwise and a pointwise layer"""
def __init__(self, config, input_filters, output_filters, kernel_size, **kwargs):
super().__init__()
# Depthwise convolution: groups=input_filters convolves each input channel separately
self.depthwise = nn.Conv1d(
input_filters,
input_filters,
kernel_size=kernel_size,
groups=input_filters,
padding=kernel_size // 2,
bias=False,
)
# Pointwise (1x1) convolution mixes channels and maps to output_filters
self.pointwise = nn.Conv1d(input_filters, output_filters, kernel_size=1, bias=False)
# Shared bias added after the pointwise step
self.bias = nn.Parameter(torch.zeros(output_filters, 1))
self.depthwise.weight.data.normal_(mean=0.0, std=config.initializer_range)
self.pointwise.weight.data.normal_(mean=0.0, std=config.initializer_range)
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
x = self.depthwise(hidden_states)
x = self.pointwise(x)
x += self.bias
return x
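The point of the depthwise/pointwise split is the parameter count; for the base model's dimensions:

# Full Conv1d over C=768 channels with kernel k=9 vs. depthwise + pointwise:
C, k = 768, 9
full = C * C * k           # 5,308,416 weights
separable = C * k + C * C  # 596,736 weights, roughly 8.9x fewer
print(full, separable)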
class ConvBertSelfAttention(nn.Module):
def __init__(self, config):
super().__init__()
# The hidden size must be a multiple of the number of attention heads
if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
raise ValueError(
f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
f"heads ({config.num_attention_heads})"
)
# head_ratio shrinks the number of self-attention heads; the saved capacity goes to the conv branch
new_num_attention_heads = config.num_attention_heads // config.head_ratio
if new_num_attention_heads < 1:
self.head_ratio = config.num_attention_heads
self.num_attention_heads = 1
else:
self.num_attention_heads = new_num_attention_heads
self.head_ratio = config.head_ratio
self.conv_kernel_size = config.conv_kernel_size
if config.hidden_size % self.num_attention_heads != 0:
raise ValueError("hidden_size should be divisible by num_attention_heads")
# Each head is half-width: the other half of the hidden size is handled by the convolution branch
self.attention_head_size = (config.hidden_size // self.num_attention_heads) // 2
self.all_head_size = self.num_attention_heads * self.attention_head_size
# Query/key/value projections for the self-attention branch
self.query = nn.Linear(config.hidden_size, self.all_head_size)
self.key = nn.Linear(config.hidden_size, self.all_head_size)
self.value = nn.Linear(config.hidden_size, self.all_head_size)
# Separable convolution producing the keys for the span-based dynamic convolution
self.key_conv_attn_layer = SeparableConv1D(
config, config.hidden_size, self.all_head_size, self.conv_kernel_size
)
# Linear layer predicting one convolution kernel per head and position
self.conv_kernel_layer = nn.Linear(self.all_head_size, self.num_attention_heads * self.conv_kernel_size)
self.conv_out_layer = nn.Linear(config.hidden_size, self.all_head_size)
# Unfold extracts sliding windows of conv_kernel_size tokens for the dynamic convolution
self.unfold = nn.Unfold(
kernel_size=[self.conv_kernel_size, 1], padding=[int((self.conv_kernel_size - 1) / 2), 0]
)
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
def transpose_for_scores(self, x):
# Reshape (batch, seq_len, all_head_size) to (batch, heads, seq_len, head_size)
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
x = x.view(*new_x_shape)
return x.permute(0, 2, 1, 3)
# Forward pass: takes hidden states plus optional attention/head masks and encoder hidden states
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor, Optional[torch.FloatTensor]]:
# (The forward body, which mixes the self-attention branch with the span-based dynamic
# convolution branch, is omitted in this excerpt.)
class ConvBertSelfOutput(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
# Project, apply dropout, then add the residual and normalize
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
class ConvBertAttention(nn.Module):
def __init__(self, config):
super().__init__()
self.self = ConvBertSelfAttention(config)
self.output = ConvBertSelfOutput(config)
# Keeps track of heads that have already been pruned
self.pruned_heads = set()
def prune_heads(self, heads):
if len(heads) == 0:
return
heads, index = find_pruneable_heads_and_indices(
heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
)
# Prune the query/key/value projections and the output projection
self.self.query = prune_linear_layer(self.self.query, index)
self.self.key = prune_linear_layer(self.self.key, index)
self.self.value = prune_linear_layer(self.self.value, index)
self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
# Update the hyperparameters and remember the pruned heads
self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
self.pruned_heads = self.pruned_heads.union(heads)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor, Optional[torch.FloatTensor]]:
self_outputs = self.self(
hidden_states,
attention_mask,
head_mask,
encoder_hidden_states,
output_attentions,
)
# Apply the output projection with a residual connection to the input
attention_output = self.output(self_outputs[0], hidden_states)
outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
return outputs
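With the base defaults (hidden size 768, 12 configured heads, head_ratio 2), the self-attention branch therefore runs 6 heads of size 64, so all_head_size is 384, half the hidden size. A shape sketch of transpose_for_scores:

import torch

batch, seq_len, heads, head_size = 2, 8, 6, 64
x = torch.randn(batch, seq_len, heads * head_size)  # (batch, seq_len, all_head_size)
x = x.view(batch, seq_len, heads, head_size).permute(0, 2, 1, 3)
print(x.shape)  # torch.Size([2, 6, 8, 64]) -- (batch, heads, seq_len, head_size)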
class GroupedLinearLayer(nn.Module):
def __init__(self, input_size, output_size, num_groups):
super().__init__()
self.input_size = input_size
self.output_size = output_size
self.num_groups = num_groups
# Per-group input/output widths
self.group_in_dim = self.input_size // self.num_groups
self.group_out_dim = self.output_size // self.num_groups
# One weight block per group, shape (num_groups, group_in_dim, group_out_dim)
self.weight = nn.Parameter(torch.empty(self.num_groups, self.group_in_dim, self.group_out_dim))
self.bias = nn.Parameter(torch.empty(output_size))
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
batch_size = list(hidden_states.size())[0]
# Flatten (batch, seq_len) and split the features into groups: (batch*seq_len, num_groups, group_in_dim)
x = torch.reshape(hidden_states, [-1, self.num_groups, self.group_in_dim])
# Move groups first so each group is multiplied by its own weight block
x = x.permute(1, 0, 2)
x = torch.matmul(x, self.weight)
x = x.permute(1, 0, 2)
# Restore (batch, seq_len, output_size) and add the bias
x = torch.reshape(x, [batch_size, -1, self.output_size])
x = x + self.bias
return x
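GroupedLinearLayer is effectively a block-diagonal linear layer: each group mixes only its own slice of the features, cutting the weight count by num_groups. A small usage sketch (the weights are created with torch.empty, so they are initialized here before use):

import torch
from transformers.models.convbert.modeling_convbert import GroupedLinearLayer

layer = GroupedLinearLayer(input_size=8, output_size=8, num_groups=2)
torch.nn.init.normal_(layer.weight)
torch.nn.init.zeros_(layer.bias)
x = torch.randn(3, 5, 8)  # (batch, seq_len, input_size)
print(layer(x).shape)  # torch.Size([3, 5, 8]); 2 * (4*4) = 32 weights instead of 64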
class ConvBertIntermediate(nn.Module):
def __init__(self, config):
super().__init__()
# Use a plain linear layer for a single group, a grouped (block-diagonal) one otherwise
if config.num_groups == 1:
self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
else:
self.dense = GroupedLinearLayer(
input_size=config.hidden_size, output_size=config.intermediate_size, num_groups=config.num_groups
)
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states
class ConvBertOutput(nn.Module):
def __init__(self, config):
super().__init__()
# Same grouped/plain choice as the intermediate layer, mapping back to hidden_size
if config.num_groups == 1:
self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
else:
self.dense = GroupedLinearLayer(
input_size=config.intermediate_size, output_size=config.hidden_size, num_groups=config.num_groups
)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
# Project, apply dropout, then add the residual and normalize
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
class ConvBertLayer(nn.Module):
def __init__(self, config):
super().__init__()
# Chunk size for the feed-forward pass along the sequence dimension
self.chunk_size_feed_forward = config.chunk_size_feed_forward
self.seq_len_dim = 1
self.attention = ConvBertAttention(config)
self.is_decoder = config.is_decoder
self.add_cross_attention = config.add_cross_attention
if self.add_cross_attention:
# Cross-attention only makes sense for a decoder
if not self.is_decoder:
raise TypeError(f"{self} should be used as a decoder model if cross attention is added")
self.crossattention = ConvBertAttention(config)
self.intermediate = ConvBertIntermediate(config)
self.output = ConvBertOutput(config)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor, Optional[torch.FloatTensor]]:
self_attention_outputs = self.attention(
hidden_states,
attention_mask,
head_mask,
output_attentions=output_attentions,
)
attention_output = self_attention_outputs[0]
outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights
if self.is_decoder and encoder_hidden_states is not None:
if not hasattr(self, "crossattention"):
raise AttributeError(
f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers"
" by setting `config.add_cross_attention=True`"
)
cross_attention_outputs = self.crossattention(
attention_output,
encoder_attention_mask,
head_mask,
encoder_hidden_states,
output_attentions,
)
attention_output = cross_attention_outputs[0]
outputs = outputs + cross_attention_outputs[1:]  # add cross attentions if we output attention weights
# Run the feed-forward network in chunks along the sequence dimension to cap peak memory
layer_output = apply_chunking_to_forward(
self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
)
outputs = (layer_output,) + outputs
return outputs
def feed_forward_chunk(self, attention_output):
intermediate_output = self.intermediate(attention_output)
layer_output = self.output(intermediate_output, attention_output)
return layer_output
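apply_chunking_to_forward splits the sequence dimension into chunks of chunk_size_feed_forward tokens and runs the FFN on each, which lowers peak memory without changing the result; a quick equivalence check:

import torch
from transformers.pytorch_utils import apply_chunking_to_forward

ffn = torch.nn.Linear(16, 16)
hidden = torch.randn(2, 128, 16)
chunked = apply_chunking_to_forward(ffn, 32, 1, hidden)  # 32 tokens at a time along dim 1
print(torch.allclose(ffn(hidden), chunked, atol=1e-6))  # True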
class ConvBertEncoder(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
# One ConvBertLayer per hidden layer
self.layer = nn.ModuleList([ConvBertLayer(config) for _ in range(config.num_hidden_layers)])
self.gradient_checkpointing = False
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = False,
output_hidden_states: Optional[bool] = False,
return_dict: Optional[bool] = True,
) -> Union[Tuple, BaseModelOutputWithCrossAttentions]:
# Accumulators for the optional outputs
all_hidden_states = () if output_hidden_states else None
all_self_attentions = () if output_attentions else None
all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
for i, layer_module in enumerate(self.layer):
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
layer_head_mask = head_mask[i] if head_mask is not None else None
if self.gradient_checkpointing and self.training:
# Trade compute for memory: recompute activations during the backward pass
layer_outputs = self._gradient_checkpointing_func(
layer_module.__call__,
hidden_states,
attention_mask,
layer_head_mask,
encoder_hidden_states,
encoder_attention_mask,
output_attentions,
)
else:
layer_outputs = layer_module(
hidden_states,
attention_mask,
layer_head_mask,
encoder_hidden_states,
encoder_attention_mask,
output_attentions,
)
hidden_states = layer_outputs[0]
if output_attentions:
all_self_attentions = all_self_attentions + (layer_outputs[1],)
if self.config.add_cross_attention:
all_cross_attentions = all_cross_attentions + (layer_outputs[2],)
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
return tuple(
v
for v in [hidden_states, all_hidden_states, all_self_attentions, all_cross_attentions]
if v is not None
)
return BaseModelOutputWithCrossAttentions(
last_hidden_state=hidden_states,
hidden_states=all_hidden_states,
attentions=all_self_attentions,
cross_attentions=all_cross_attentions,
)
class ConvBertPredictionHeadTransform(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
# Activation chosen from the config (string name or callable)
if isinstance(config.hidden_act, str):
self.transform_act_fn = ACT2FN[config.hidden_act]
else:
self.transform_act_fn = config.hidden_act
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
# Dense projection, activation, then LayerNorm
hidden_states = self.dense(hidden_states)
hidden_states = self.transform_act_fn(hidden_states)
hidden_states = self.LayerNorm(hidden_states)
return hidden_states
# CONVBERT_INPUTS_DOCSTRING documents the input arguments shared by all ConvBERT model heads.
CONVBERT_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `({0})`):
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
[What are attention masks?](../glossary#attention-mask)
token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
[What are token type IDs?](../glossary#token-type-ids)
position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
[What are position IDs?](../glossary#position-ids)
head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
output_attentions (`bool`, *optional*):
output_hidden_states (`bool`, *optional*):
return_dict (`bool`, *optional*):
"""
# ConvBertModel inherits from ConvBertPreTrainedModel and outputs raw hidden states without a task head.
@add_start_docstrings(
"The bare ConvBERT Model transformer outputting raw hidden-states without any specific head on top.",
CONVBERT_START_DOCSTRING,
)
class ConvBertModel(ConvBertPreTrainedModel):
def __init__(self, config):
super().__init__(config)
# Token/position/segment embeddings
self.embeddings = ConvBertEmbeddings(config)
# Project the embeddings up to hidden_size when the two dimensions differ
if config.embedding_size != config.hidden_size:
self.embeddings_project = nn.Linear(config.embedding_size, config.hidden_size)
self.encoder = ConvBertEncoder(config)
self.config = config
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.embeddings.word_embeddings
def set_input_embeddings(self, value):
self.embeddings.word_embeddings = value
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}. See
the base class PreTrainedModel.
"""
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
@add_start_docstrings_to_model_forward(CONVBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithCrossAttentions,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithCrossAttentions]:
# Fall back to the config defaults when the flags are not given
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
elif input_ids is not None:
self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
input_shape = input_ids.size()
elif inputs_embeds is not None:
input_shape = inputs_embeds.size()[:-1]
else:
raise ValueError("You have to specify either input_ids or inputs_embeds")
batch_size, seq_length = input_shape
device = input_ids.device if input_ids is not None else inputs_embeds.device
# Default to attending to every position
if attention_mask is None:
attention_mask = torch.ones(input_shape, device=device)
if token_type_ids is None:
# Reuse the registered all-zeros buffer when available, otherwise build one
if hasattr(self.embeddings, "token_type_ids"):
buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length]
buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length)
token_type_ids = buffered_token_type_ids_expanded
else:
token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
# Broadcast the 2D mask to the 4D additive mask used inside attention
extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape)
head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
hidden_states = self.embeddings(
input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds
)
# Map embedding_size up to hidden_size if the model was built with a projection
if hasattr(self, "embeddings_project"):
hidden_states = self.embeddings_project(hidden_states)
hidden_states = self.encoder(
hidden_states,
attention_mask=extended_attention_mask,
head_mask=head_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
return hidden_states
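End-to-end usage of the bare model (a minimal sketch using the public checkpoint):

import torch
from transformers import AutoTokenizer, ConvBertModel

tokenizer = AutoTokenizer.from_pretrained("YituTech/conv-bert-base")
model = ConvBertModel.from_pretrained("YituTech/conv-bert-base")
inputs = tokenizer("ConvBERT replaces some attention heads with convolutions.", return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)
print(outputs.last_hidden_state.shape)  # (1, seq_len, 768)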
# Prediction head used on top of the generator's hidden states
class ConvBertGeneratorPredictions(nn.Module):
"""Prediction module for the generator, made up of two dense layers."""
def __init__(self, config):
super().__init__()
self.activation = get_activation("gelu")
self.LayerNorm = nn.LayerNorm(config.embedding_size, eps=config.layer_norm_eps)
self.dense = nn.Linear(config.hidden_size, config.embedding_size)
def forward(self, generator_hidden_states: torch.FloatTensor) -> torch.FloatTensor:
# Dense projection down to embedding_size, GELU, then LayerNorm
hidden_states = self.dense(generator_hidden_states)
hidden_states = self.activation(hidden_states)
hidden_states = self.LayerNorm(hidden_states)
return hidden_states
@add_start_docstrings("""ConvBERT Model with a `language modeling` head on top.""", CONVBERT_START_DOCSTRING)
class ConvBertForMaskedLM(ConvBertPreTrainedModel):
_tied_weights_keys = ["generator.lm_head.weight"]
def __init__(self, config):
super().__init__(config)
self.convbert = ConvBertModel(config)
self.generator_predictions = ConvBertGeneratorPredictions(config)
# LM head projecting embedding_size back onto the vocabulary
self.generator_lm_head = nn.Linear(config.embedding_size, config.vocab_size)
# Initialize weights and apply final processing
self.post_init()
def get_output_embeddings(self):
return self.generator_lm_head
def set_output_embeddings(self, word_embeddings):
self.generator_lm_head = word_embeddings
@add_start_docstrings_to_model_forward(CONVBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=MaskedLMOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, MaskedLMOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
generator_hidden_states = self.convbert(
input_ids,
attention_mask,
token_type_ids,
position_ids,
head_mask,
inputs_embeds,
output_attentions,
output_hidden_states,
return_dict,
)
generator_sequence_output = generator_hidden_states[0]
# Two-step head: transform the sequence output, then project onto the vocabulary
prediction_scores = self.generator_predictions(generator_sequence_output)
prediction_scores = self.generator_lm_head(prediction_scores)
loss = None
if labels is not None:
# Cross-entropy over the vocabulary; label -100 marks ignored (unmasked/padding) positions
loss_fct = nn.CrossEntropyLoss()  # -100 index = padding token
loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
if not return_dict:
output = (prediction_scores,) + generator_hidden_states[1:]
return ((loss,) + output) if loss is not None else output
return MaskedLMOutput(
loss=loss,
logits=prediction_scores,
hidden_states=generator_hidden_states.hidden_states,
attentions=generator_hidden_states.attentions,
)
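A masked-LM usage sketch (the predicted token depends on the checkpoint, so the decoded output is only illustrative):

import torch
from transformers import AutoTokenizer, ConvBertForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("YituTech/conv-bert-base")
model = ConvBertForMaskedLM.from_pretrained("YituTech/conv-bert-base")
inputs = tokenizer("The capital of France is [MASK].", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits
mask_pos = (inputs.input_ids == tokenizer.mask_token_id).nonzero(as_tuple=True)[1]
print(tokenizer.decode(logits[0, mask_pos].argmax(-1)))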
class ConvBertClassificationHead(nn.Module):
"""Head for sentence-level classification tasks."""
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
# Fall back to the hidden dropout probability when no classifier dropout is configured
classifier_dropout = (
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
self.dropout = nn.Dropout(classifier_dropout)
self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
self.config = config
def forward(self, hidden_states: torch.Tensor, **kwargs) -> torch.Tensor:
x = hidden_states[:, 0, :]  # take <s> token (equiv. to [CLS])
x = self.dropout(x)
x = self.dense(x)
# Apply the configured hidden activation
x = ACT2FN[self.config.hidden_act](x)
x = self.dropout(x)
x = self.out_proj(x)
return x
@add_start_docstrings(
"""
ConvBERT Model transformer with a sequence classification/regression head on top (a linear layer on top of the
pooled output) e.g. for GLUE tasks.
""",
CONVBERT_START_DOCSTRING,
)
class ConvBertForSequenceClassification(ConvBertPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.config = config
self.convbert = ConvBertModel(config)
self.classifier = ConvBertClassificationHead(config)
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(CONVBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=SequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, SequenceClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.convbert(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
logits = self.classifier(sequence_output)
loss = None
if labels is not None:
# Infer the problem type from num_labels and the label dtype if it is not set
if self.config.problem_type is None:
if self.num_labels == 1:
self.config.problem_type = "regression"
elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
self.config.problem_type = "single_label_classification"
else:
self.config.problem_type = "multi_label_classification"
if self.config.problem_type == "regression":
loss_fct = MSELoss()
if self.num_labels == 1:
loss = loss_fct(logits.squeeze(), labels.squeeze())
else:
loss = loss_fct(logits, labels)
elif self.config.problem_type == "single_label_classification":
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(logits, labels)
if not return_dict:
output = (logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
return SequenceClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
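A self-contained sketch of the label-driven loss selection (random weights, so the loss value itself is meaningless): integer labels with num_labels=3 trigger the single-label cross-entropy path.

import torch
from transformers import ConvBertConfig, ConvBertForSequenceClassification

model = ConvBertForSequenceClassification(ConvBertConfig(num_labels=3))
input_ids = torch.randint(0, 30522, (2, 16))
labels = torch.tensor([0, 2])
out = model(input_ids=input_ids, labels=labels)
print(out.loss, out.logits.shape)  # scalar cross-entropy loss, torch.Size([2, 3])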
@add_start_docstrings(
"""
ConvBERT Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
softmax) e.g. for RocStories/SWAG tasks.
""",
CONVBERT_START_DOCSTRING,
)
class ConvBertForMultipleChoice(ConvBertPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.convbert = ConvBertModel(config)
# Pools the per-choice sequence output into a single vector
self.sequence_summary = SequenceSummary(config)
self.classifier = nn.Linear(config.hidden_size, 1)
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(
CONVBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")
)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=MultipleChoiceModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, MultipleChoiceModelOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
`input_ids` above)
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
# Flatten (batch, num_choices, ...) into (batch * num_choices, ...) so ConvBERT sees one sequence per row
input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
inputs_embeds = (
inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
if inputs_embeds is not None
else None
)
outputs = self.convbert(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
# Pool each sequence, score it, then regroup the scores by choice
pooled_output = self.sequence_summary(sequence_output)
logits = self.classifier(pooled_output)
reshaped_logits = logits.view(-1, num_choices)
loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
loss = loss_fct(reshaped_logits, labels)
if not return_dict:
output = (reshaped_logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
return MultipleChoiceModelOutput(
loss=loss,
logits=reshaped_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
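The flatten-then-regroup pattern is easiest to see from the shapes (random weights; a sketch only):

import torch
from transformers import ConvBertConfig, ConvBertForMultipleChoice

model = ConvBertForMultipleChoice(ConvBertConfig())
input_ids = torch.randint(0, 30522, (2, 4, 16))  # (batch, num_choices, seq_len)
labels = torch.tensor([1, 3])  # index of the correct choice per example
out = model(input_ids=input_ids, labels=labels)
print(out.logits.shape)  # torch.Size([2, 4]) -- one score per choice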
@add_start_docstrings(
"""
ConvBERT Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
Named-Entity-Recognition (NER) tasks.
""",
CONVBERT_START_DOCSTRING,
)
class ConvBertForTokenClassification(ConvBertPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.convbert = ConvBertModel(config)
classifier_dropout = (
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
self.dropout = nn.Dropout(classifier_dropout)
# Per-token classifier on top of the hidden states
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(CONVBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TokenClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, TokenClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.convbert(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
# Dropout, then classify every token position
sequence_output = self.dropout(sequence_output)
logits = self.classifier(sequence_output)
loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
# Flatten to (batch * seq_len, num_labels) vs (batch * seq_len,) for the loss
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
if not return_dict:
output = (logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
return TokenClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
@add_start_docstrings(
"""
ConvBERT Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
CONVBERT_START_DOCSTRING,
)
class ConvBertForQuestionAnswering(ConvBertPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.convbert = ConvBertModel(config)
# Linear layer producing the span-start and span-end logits
self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(CONVBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=QuestionAnsweringModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
start_positions: Optional[torch.LongTensor] = None,
end_positions: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, QuestionAnsweringModelOutput]:
r"""
start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.convbert(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
logits = self.qa_outputs(sequence_output)
# Split the last dimension into start and end logits
start_logits, end_logits = logits.split(1, dim=-1)
start_logits = start_logits.squeeze(-1).contiguous()
end_logits = end_logits.squeeze(-1).contiguous()
total_loss = None
if start_positions is not None and end_positions is not None:
# If the positions carry an extra dimension (e.g. from multi-GPU gather), squeeze it
if len(start_positions.size()) > 1:
start_positions = start_positions.squeeze(-1)
if len(end_positions.size()) > 1:
end_positions = end_positions.squeeze(-1)
# Clamp positions outside the sequence and ignore them in the loss
ignored_index = start_logits.size(1)
start_positions = start_positions.clamp(0, ignored_index)
end_positions = end_positions.clamp(0, ignored_index)
loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
start_loss = loss_fct(start_logits, start_positions)
end_loss = loss_fct(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2
if not return_dict:
output = (start_logits, end_logits) + outputs[1:]
return ((total_loss,) + output) if total_loss is not None else output
return QuestionAnsweringModelOutput(
loss=total_loss,
start_logits=start_logits,
end_logits=end_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
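A span-extraction sketch; note that loading the base checkpoint leaves qa_outputs randomly initialized, so the extracted span is meaningless until the model is fine-tuned on a QA dataset:

import torch
from transformers import AutoTokenizer, ConvBertForQuestionAnswering

tokenizer = AutoTokenizer.from_pretrained("YituTech/conv-bert-base")
model = ConvBertForQuestionAnswering.from_pretrained("YituTech/conv-bert-base")
inputs = tokenizer("Who wrote Hamlet?", "Hamlet was written by Shakespeare.", return_tensors="pt")
with torch.no_grad():
    out = model(**inputs)
start = out.start_logits.argmax(-1).item()
end = out.end_logits.argmax(-1).item()
print(tokenizer.decode(inputs.input_ids[0, start : end + 1]))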