Transformers Source Code Analysis (39)
.\models\detr\convert_detr_original_pytorch_checkpoint_to_pytorch.py
"""Convert DETR checkpoints with timm backbone."""
import argparse
import json
from collections import OrderedDict
from pathlib import Path
import requests
import torch
from huggingface_hub import hf_hub_download
from PIL import Image
from transformers import DetrConfig, DetrForObjectDetection, DetrForSegmentation, DetrImageProcessor
from transformers.utils import logging
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
rename_keys = []
for i in range(6):
rename_keys.append(
(f"transformer.encoder.layers.{i}.self_attn.out_proj.weight", f"encoder.layers.{i}.self_attn.out_proj.weight")
)
rename_keys.append(
(f"transformer.encoder.layers.{i}.self_attn.out_proj.bias", f"encoder.layers.{i}.self_attn.out_proj.bias")
)
rename_keys.append((f"transformer.encoder.layers.{i}.linear1.weight", f"encoder.layers.{i}.fc1.weight"))
rename_keys.append((f"transformer.encoder.layers.{i}.linear1.bias", f"encoder.layers.{i}.fc1.bias"))
rename_keys.append((f"transformer.encoder.layers.{i}.linear2.weight", f"encoder.layers.{i}.fc2.weight"))
rename_keys.append((f"transformer.encoder.layers.{i}.linear2.bias", f"encoder.layers.{i}.fc2.bias"))
rename_keys.append(
(f"transformer.encoder.layers.{i}.norm1.weight", f"encoder.layers.{i}.self_attn_layer_norm.weight")
)
rename_keys.append(
(f"transformer.encoder.layers.{i}.norm1.bias", f"encoder.layers.{i}.self_attn_layer_norm.bias")
)
rename_keys.append(
(f"transformer.encoder.layers.{i}.norm2.weight", f"encoder.layers.{i}.final_layer_norm.weight")
)
rename_keys.append(
(f"transformer.encoder.layers.{i}.norm2.bias", f"encoder.layers.{i}.final_layer_norm.bias")
)
rename_keys.append(
(f"transformer.decoder.layers.{i}.self_attn.out_proj.weight", f"decoder.layers.{i}.self_attn.out_proj.weight")
)
rename_keys.append(
(f"transformer.decoder.layers.{i}.self_attn.out_proj.bias", f"decoder.layers.{i}.self_attn.out_proj.bias")
)
rename_keys.append(
(
f"transformer.decoder.layers.{i}.multihead_attn.out_proj.weight",
f"decoder.layers.{i}.encoder_attn.out_proj.weight",
)
)
rename_keys.append(
(
f"transformer.decoder.layers.{i}.multihead_attn.out_proj.bias",
f"decoder.layers.{i}.encoder_attn.out_proj.bias",
)
)
rename_keys.append(
(f"transformer.decoder.layers.{i}.linear1.weight", f"decoder.layers.{i}.fc1.weight")
)
rename_keys.append(
(f"transformer.decoder.layers.{i}.linear1.bias", f"decoder.layers.{i}.fc1.bias")
)
rename_keys.append(
(f"transformer.decoder.layers.{i}.linear2.weight", f"decoder.layers.{i}.fc2.weight")
)
rename_keys.append(
(f"transformer.decoder.layers.{i}.linear2.bias", f"decoder.layers.{i}.fc2.bias")
)
rename_keys.append(
(f"transformer.decoder.layers.{i}.norm1.weight", f"decoder.layers.{i}.self_attn_layer_norm.weight")
)
rename_keys.append(
(f"transformer.decoder.layers.{i}.norm1.bias", f"decoder.layers.{i}.self_attn_layer_norm.bias")
)
rename_keys.append(
(f"transformer.decoder.layers.{i}.norm2.weight", f"decoder.layers.{i}.encoder_attn_layer_norm.weight")
)
rename_keys.append(
(f"transformer.decoder.layers.{i}.norm2.bias", f"decoder.layers.{i}.encoder_attn_layer_norm.bias")
)
rename_keys.append(
(f"transformer.decoder.layers.{i}.norm3.weight", f"decoder.layers.{i}.final_layer_norm.weight")
)
rename_keys.append(
(f"transformer.decoder.layers.{i}.norm3.bias", f"decoder.layers.{i}.final_layer_norm.bias")
)
rename_keys.extend(
[
("input_proj.weight", "input_projection.weight"),
("input_proj.bias", "input_projection.bias"),
("query_embed.weight", "query_position_embeddings.weight"),
("transformer.decoder.norm.weight", "decoder.layernorm.weight"),
("transformer.decoder.norm.bias", "decoder.layernorm.bias"),
("class_embed.weight", "class_labels_classifier.weight"),
("class_embed.bias", "class_labels_classifier.bias"),
("bbox_embed.layers.0.weight", "bbox_predictor.layers.0.weight"),
("bbox_embed.layers.0.bias", "bbox_predictor.layers.0.bias"),
("bbox_embed.layers.1.weight", "bbox_predictor.layers.1.weight"),
("bbox_embed.layers.1.bias", "bbox_predictor.layers.1.bias"),
("bbox_embed.layers.2.weight", "bbox_predictor.layers.2.weight"),
("bbox_embed.layers.2.bias", "bbox_predictor.layers.2.bias"),
]
)
def rename_key(state_dict, old, new):
val = state_dict.pop(old)
state_dict[new] = val
def rename_backbone_keys(state_dict):
new_state_dict = OrderedDict()
for key, value in state_dict.items():
if "backbone.0.body" in key:
new_key = key.replace("backbone.0.body", "backbone.conv_encoder.model")
new_state_dict[new_key] = value
else:
new_state_dict[key] = value
return new_state_dict
def read_in_q_k_v(state_dict, is_panoptic=False):
prefix = ""
if is_panoptic:
prefix = "detr."
for i in range(6):
in_proj_weight = state_dict.pop(f"{prefix}transformer.encoder.layers.{i}.self_attn.in_proj_weight")
in_proj_bias = state_dict.pop(f"{prefix}transformer.encoder.layers.{i}.self_attn.in_proj_bias")
state_dict[f"encoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:256, :]
state_dict[f"encoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:256]
state_dict[f"encoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[256:512, :]
state_dict[f"encoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[256:512]
state_dict[f"encoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-256:, :]
state_dict[f"encoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-256:]
for i in range(6):
in_proj_weight = state_dict.pop(f"{prefix}transformer.decoder.layers.{i}.self_attn.in_proj_weight")
in_proj_bias = state_dict.pop(f"{prefix}transformer.decoder.layers.{i}.self_attn.in_proj_bias")
state_dict[f"decoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:256, :]
state_dict[f"decoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:256]
state_dict[f"decoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[256:512, :]
state_dict[f"decoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[256:512]
state_dict[f"decoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-256:, :]
state_dict[f"decoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-256:]
in_proj_weight_cross_attn = state_dict.pop(
f"{prefix}transformer.decoder.layers.{i}.multihead_attn.in_proj_weight"
)
in_proj_bias_cross_attn = state_dict.pop(f"{prefix}transformer.decoder.layers.{i}.multihead_attn.in_proj_bias")
state_dict[f"decoder.layers.{i}.encoder_attn.q_proj.weight"] = in_proj_weight_cross_attn[:256, :]
state_dict[f"decoder.layers.{i}.encoder_attn.q_proj.bias"] = in_proj_bias_cross_attn[:256]
state_dict[f"decoder.layers.{i}.encoder_attn.k_proj.weight"] = in_proj_weight_cross_attn[256:512, :]
state_dict[f"decoder.layers.{i}.encoder_attn.k_proj.bias"] = in_proj_bias_cross_attn[256:512]
state_dict[f"decoder.layers.{i}.encoder_attn.v_proj.weight"] = in_proj_weight_cross_attn[-256:, :]
state_dict[f"decoder.layers.{i}.encoder_attn.v_proj.bias"] = in_proj_bias_cross_attn[-256:]
def prepare_img():
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
im = Image.open(requests.get(url, stream=True).raw)
return im
@torch.no_grad()
def convert_detr_checkpoint(model_name, pytorch_dump_folder_path):
"""
Copy/paste/tweak model's weights to our DETR structure.
"""
config = DetrConfig()
if "resnet101" in model_name:
config.backbone = "resnet101"
if "dc5" in model_name:
config.dilation = True
is_panoptic = "panoptic" in model_name
if is_panoptic:
config.num_labels = 250
else:
config.num_labels = 91
repo_id = "huggingface/label-files"
filename = "coco-detection-id2label.json"
id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r"))
id2label = {int(k): v for k, v in id2label.items()}
config.id2label = id2label
config.label2id = {v: k for k, v in id2label.items()}
format = "coco_panoptic" if is_panoptic else "coco_detection"
image_processor = DetrImageProcessor(format=format)
img = prepare_img()
encoding = image_processor(images=img, return_tensors="pt")
pixel_values = encoding["pixel_values"]
logger.info(f"Converting model {model_name}...")
detr = torch.hub.load("facebookresearch/detr", model_name, pretrained=True).eval()
state_dict = detr.state_dict()
for src, dest in rename_keys:
if is_panoptic:
src = "detr." + src
rename_key(state_dict, src, dest)
state_dict = rename_backbone_keys(state_dict)
read_in_q_k_v(state_dict, is_panoptic=is_panoptic)
prefix = "detr.model." if is_panoptic else "model."
for key in state_dict.copy().keys():
if is_panoptic:
if (
key.startswith("detr")
and not key.startswith("class_labels_classifier")
and not key.startswith("bbox_predictor")
):
val = state_dict.pop(key)
state_dict["detr.model" + key[4:]] = val
elif "class_labels_classifier" in key or "bbox_predictor" in key:
val = state_dict.pop(key)
state_dict["detr." + key] = val
elif key.startswith("bbox_attention") or key.startswith("mask_head"):
continue
else:
val = state_dict.pop(key)
state_dict[prefix + key] = val
else:
if not key.startswith("class_labels_classifier") and not key.startswith("bbox_predictor"):
val = state_dict.pop(key)
state_dict[prefix + key] = val
model = DetrForSegmentation(config) if is_panoptic else DetrForObjectDetection(config)
model.load_state_dict(state_dict)
model.eval()
original_outputs = detr(pixel_values)
outputs = model(pixel_values)
assert torch.allclose(outputs.logits, original_outputs["pred_logits"], atol=1e-4)
assert torch.allclose(outputs.pred_boxes, original_outputs["pred_boxes"], atol=1e-4)
if is_panoptic:
assert torch.allclose(outputs.pred_masks, original_outputs["pred_masks"], atol=1e-4)
logger.info(f"Saving PyTorch model and image processor to {pytorch_dump_folder_path}...")
Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
model.save_pretrained(pytorch_dump_folder_path)
image_processor.save_pretrained(pytorch_dump_folder_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--model_name", default="detr_resnet50", type=str, help="Name of the DETR model you'd like to convert."
)
parser.add_argument(
"--pytorch_dump_folder_path", default=None, type=str, help="Path to the folder to output PyTorch model."
)
args = parser.parse_args()
convert_detr_checkpoint(args.model_name, args.pytorch_dump_folder_path)
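Once the script has run, the dump folder can be reloaded like any other checkpoint. A hypothetical usage sketch, where the folder path is whatever was passed to `--pytorch_dump_folder_path`:

```python
# Hypothetical reload of the converted checkpoint; the path is an assumption.
from transformers import DetrForObjectDetection, DetrImageProcessor

dump_dir = "./detr_resnet50_converted"  # value passed to --pytorch_dump_folder_path
model = DetrForObjectDetection.from_pretrained(dump_dir)
processor = DetrImageProcessor.from_pretrained(dump_dir)
```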
.\models\detr\convert_detr_to_pytorch.py
"""Convert DETR checkpoints with native (Transformers) backbone."""
import argparse
import json
from pathlib import Path
import requests
import torch
from huggingface_hub import hf_hub_download
from PIL import Image
from transformers import DetrConfig, DetrForObjectDetection, DetrForSegmentation, DetrImageProcessor, ResNetConfig
from transformers.utils import logging
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
def get_detr_config(model_name):
if "resnet-50" in model_name:
backbone_config = ResNetConfig.from_pretrained("microsoft/resnet-50")
elif "resnet-101" in model_name:
backbone_config = ResNetConfig.from_pretrained("microsoft/resnet-101")
else:
raise ValueError("Model name should include either resnet50 or resnet101")
config = DetrConfig(use_timm_backbone=False, backbone_config=backbone_config)
is_panoptic = "panoptic" in model_name
if is_panoptic:
config.num_labels = 250
else:
config.num_labels = 91
repo_id = "huggingface/label-files"
filename = "coco-detection-id2label.json"
id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r"))
id2label = {int(k): v for k, v in id2label.items()}
config.id2label = id2label
config.label2id = {v: k for k, v in id2label.items()}
return config, is_panoptic
def create_rename_keys(config):
rename_keys = []
rename_keys.append(("backbone.0.body.conv1.weight", "backbone.conv_encoder.model.embedder.embedder.convolution.weight"))
rename_keys.append(("backbone.0.body.bn1.weight", "backbone.conv_encoder.model.embedder.embedder.normalization.weight"))
rename_keys.append(("backbone.0.body.bn1.bias", "backbone.conv_encoder.model.embedder.embedder.normalization.bias"))
rename_keys.append(("backbone.0.body.bn1.running_mean", "backbone.conv_encoder.model.embedder.embedder.normalization.running_mean"))
rename_keys.append(("backbone.0.body.bn1.running_var", "backbone.conv_encoder.model.embedder.embedder.normalization.running_var"))
rename_keys.extend(
[
("input_proj.weight", "input_projection.weight"),
("input_proj.bias", "input_projection.bias"),
("query_embed.weight", "query_position_embeddings.weight"),
("transformer.decoder.norm.weight", "decoder.layernorm.weight"),
("transformer.decoder.norm.bias", "decoder.layernorm.bias"),
("class_embed.weight", "class_labels_classifier.weight"),
("class_embed.bias", "class_labels_classifier.bias"),
("bbox_embed.layers.0.weight", "bbox_predictor.layers.0.weight"),
("bbox_embed.layers.0.bias", "bbox_predictor.layers.0.bias"),
("bbox_embed.layers.1.weight", "bbox_predictor.layers.1.weight"),
("bbox_embed.layers.1.bias", "bbox_predictor.layers.1.bias"),
("bbox_embed.layers.2.weight", "bbox_predictor.layers.2.weight"),
("bbox_embed.layers.2.bias", "bbox_predictor.layers.2.bias"),
]
)
return rename_keys
def rename_key(state_dict, old, new):
val = state_dict.pop(old)
state_dict[new] = val
def read_in_q_k_v(state_dict, is_panoptic=False):
prefix = ""
if is_panoptic:
prefix = "detr."
for i in range(6):
in_proj_weight = state_dict.pop(f"{prefix}transformer.encoder.layers.{i}.self_attn.in_proj_weight")
in_proj_bias = state_dict.pop(f"{prefix}transformer.encoder.layers.{i}.self_attn.in_proj_bias")
state_dict[f"encoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:256, :]
state_dict[f"encoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:256]
state_dict[f"encoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[256:512, :]
state_dict[f"encoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[256:512]
state_dict[f"encoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-256:, :]
state_dict[f"encoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-256:]
for i in range(6):
in_proj_weight = state_dict.pop(f"{prefix}transformer.decoder.layers.{i}.self_attn.in_proj_weight")
in_proj_bias = state_dict.pop(f"{prefix}transformer.decoder.layers.{i}.self_attn.in_proj_bias")
state_dict[f"decoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:256, :]
state_dict[f"decoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:256]
state_dict[f"decoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[256:512, :]
state_dict[f"decoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[256:512]
state_dict[f"decoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-256:, :]
state_dict[f"decoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-256:]
in_proj_weight_cross_attn = state_dict.pop(
f"{prefix}transformer.decoder.layers.{i}.multihead_attn.in_proj_weight"
)
in_proj_bias_cross_attn = state_dict.pop(
f"{prefix}transformer.decoder.layers.{i}.multihead_attn.in_proj_bias"
)
state_dict[f"decoder.layers.{i}.encoder_attn.q_proj.weight"] = in_proj_weight_cross_attn[:256, :]
state_dict[f"decoder.layers.{i}.encoder_attn.q_proj.bias"] = in_proj_bias_cross_attn[:256]
state_dict[f"decoder.layers.{i}.encoder_attn.k_proj.weight"] = in_proj_weight_cross_attn[256:512, :]
state_dict[f"decoder.layers.{i}.encoder_attn.k_proj.bias"] = in_proj_bias_cross_attn[256:512]
state_dict[f"decoder.layers.{i}.encoder_attn.v_proj.weight"] = in_proj_weight_cross_attn[-256:, :]
state_dict[f"decoder.layers.{i}.encoder_attn.v_proj.bias"] = in_proj_bias_cross_attn[-256:]
def prepare_img():
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
im = Image.open(requests.get(url, stream=True).raw)
return im
@torch.no_grad()
def convert_detr_checkpoint(model_name, pytorch_dump_folder_path=None, push_to_hub=False):
"""
Copy/paste/tweak model's weights to our DETR structure.
"""
config, is_panoptic = get_detr_config(model_name)
model_name_to_original_name = {
"detr-resnet-50": "detr_resnet50",
"detr-resnet-101": "detr_resnet101",
}
logger.info(f"Converting model {model_name}...")
detr = torch.hub.load("facebookresearch/detr", model_name_to_original_name[model_name], pretrained=True).eval()
state_dict = detr.state_dict()
for src, dest in create_rename_keys(config):
if is_panoptic:
src = "detr." + src
rename_key(state_dict, src, dest)
read_in_q_k_v(state_dict, is_panoptic=is_panoptic)
prefix = "detr.model." if is_panoptic else "model."
for key in state_dict.copy().keys():
if is_panoptic:
if (
key.startswith("detr")
and not key.startswith("class_labels_classifier")
and not key.startswith("bbox_predictor")
):
val = state_dict.pop(key)
state_dict["detr.model" + key[4:]] = val
elif "class_labels_classifier" in key or "bbox_predictor" in key:
val = state_dict.pop(key)
state_dict["detr." + key] = val
elif key.startswith("bbox_attention") or key.startswith("mask_head"):
continue
else:
val = state_dict.pop(key)
state_dict[prefix + key] = val
else:
if not key.startswith("class_labels_classifier") and not key.startswith("bbox_predictor"):
val = state_dict.pop(key)
state_dict[prefix + key] = val
model = DetrForSegmentation(config) if is_panoptic else DetrForObjectDetection(config)
model.load_state_dict(state_dict)
model.eval()
format = "coco_panoptic" if is_panoptic else "coco_detection"
processor = DetrImageProcessor(format=format)
encoding = processor(images=prepare_img(), return_tensors="pt")
pixel_values = encoding["pixel_values"]
original_outputs = detr(pixel_values)
outputs = model(pixel_values)
assert torch.allclose(outputs.logits, original_outputs["pred_logits"], atol=1e-3)
assert torch.allclose(outputs.pred_boxes, original_outputs["pred_boxes"], atol=1e-3)
if is_panoptic:
assert torch.allclose(outputs.pred_masks, original_outputs["pred_masks"], atol=1e-4)
print("Looks ok!")
if pytorch_dump_folder_path is not None:
logger.info(f"Saving PyTorch model and image processor to {pytorch_dump_folder_path}...")
Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
model.save_pretrained(pytorch_dump_folder_path)
processor.save_pretrained(pytorch_dump_folder_path)
if push_to_hub:
logger.info("Uploading PyTorch model and image processor to the hub...")
model.push_to_hub(f"nielsr/{model_name}")
processor.push_to_hub(f"nielsr/{model_name}")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--model_name",
default="detr-resnet-50",
type=str,
choices=["detr-resnet-50", "detr-resnet-101"],
help="Name of the DETR model you'd like to convert."
)
parser.add_argument(
"--pytorch_dump_folder_path",
default=None,
type=str,
help="Path to the folder to output PyTorch model."
)
parser.add_argument(
"--push_to_hub",
action="store_true",
help="Whether to push the model to the hub or not."
)
args = parser.parse_args()
convert_detr_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub)
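For an end-to-end sanity check of a converted checkpoint, the usual object-detection inference flow applies. The sketch below uses the publicly available `facebook/detr-resnet-50` checkpoint as a stand-in for whatever repo the converted model was pushed to:

```python
# Sketch of running inference with a (converted) DETR checkpoint.
import requests
import torch
from PIL import Image
from transformers import DetrForObjectDetection, DetrImageProcessor

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

processor = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50")
model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50")

inputs = processor(images=image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# Rescale boxes back to the original image size and keep confident predictions
target_sizes = torch.tensor([image.size[::-1]])  # (height, width)
results = processor.post_process_object_detection(outputs, threshold=0.9, target_sizes=target_sizes)[0]
for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
    print(model.config.id2label[label.item()], round(score.item(), 3), box.tolist())
```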
.\models\detr\feature_extraction_detr.py
"""DETR 的特征提取器类。"""
import warnings
from ...image_transforms import rgb_to_id as _rgb_to_id
from ...utils import logging
from .image_processing_detr import DetrImageProcessor
logger = logging.get_logger(__name__)
def rgb_to_id(x):
warnings.warn(
"rgb_to_id has moved and will not be importable from this module from v5. "
"Please import from transformers.image_transforms instead.",
FutureWarning,
)
return _rgb_to_id(x)
class DetrFeatureExtractor(DetrImageProcessor):
def __init__(self, *args, **kwargs) -> None:
warnings.warn(
"The class DetrFeatureExtractor is deprecated and will be removed in version 5 of Transformers."
" Please use DetrImageProcessor instead.",
FutureWarning,
)
super().__init__(*args, **kwargs)
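Since `DetrFeatureExtractor` is only a deprecation shim around `DetrImageProcessor`, both constructions below yield an equivalent processor; the second avoids the `FutureWarning`:

```python
from transformers import DetrFeatureExtractor, DetrImageProcessor

feature_extractor = DetrFeatureExtractor(format="coco_detection")   # deprecated, emits a FutureWarning
image_processor = DetrImageProcessor(format="coco_detection")       # preferred
```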
.\models\detr\image_processing_detr.py
"""Image processor class for DETR."""
import io
import pathlib
from collections import defaultdict
from typing import Any, Callable, Dict, Iterable, List, Optional, Set, Tuple, Union
import numpy as np
from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
from ...image_transforms import (
PaddingMode,
center_to_corners_format,
corners_to_center_format,
id_to_rgb,
pad,
rescale,
resize,
rgb_to_id,
to_channel_dimension_format,
)
from ...image_utils import (
IMAGENET_DEFAULT_MEAN,
IMAGENET_DEFAULT_STD,
AnnotationFormat,
AnnotationType,
ChannelDimension,
ImageInput,
PILImageResampling,
get_image_size,
infer_channel_dimension_format,
is_scaled_image,
make_list_of_images,
to_numpy_array,
valid_images,
validate_annotations,
validate_kwargs,
validate_preprocess_arguments,
)
from ...utils import (
TensorType,
is_flax_available,
is_jax_tensor,
is_scipy_available,
is_tf_available,
is_tf_tensor,
is_torch_available,
is_torch_tensor,
is_vision_available,
logging,
)
if is_torch_available():
import torch
from torch import nn
if is_vision_available():
import PIL
if is_scipy_available():
import scipy.special
import scipy.stats
logger = logging.get_logger(__name__)
SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)
def get_size_with_aspect_ratio(image_size, size, max_size=None) -> Tuple[int, int]:
"""
Computes the output image size given the input image size and the desired output size.
Args:
image_size (`Tuple[int, int]`):
The input image size.
size (`int`):
The desired output size.
max_size (`int`, *optional*):
The maximum allowed output size.
"""
height, width = image_size
if max_size is not None:
min_original_size = float(min((height, width)))
max_original_size = float(max((height, width)))
if max_original_size / min_original_size * size > max_size:
size = int(round(max_size * min_original_size / max_original_size))
if (height <= width and height == size) or (width <= height and width == size):
return height, width
if width < height:
ow = size
oh = int(size * height / width)
else:
oh = size
ow = int(size * width / height)
return (oh, ow)
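A small worked example of the rule above (not part of the module): the shorter edge is matched to `size` unless that would push the longer edge past `max_size`, in which case `size` is scaled down first.

```python
from transformers.models.detr.image_processing_detr import get_size_with_aspect_ratio

print(get_size_with_aspect_ratio((480, 640), size=800, max_size=1333))  # (800, 1066)
print(get_size_with_aspect_ratio((480, 640), size=800, max_size=1000))  # (750, 1000): longer edge capped
```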
def get_resize_output_image_size(
input_image: np.ndarray,
size: Union[int, Tuple[int, int], List[int]],
max_size: Optional[int] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> Tuple[int, int]:
"""
Args:
input_image (`np.ndarray`):
The image to resize.
size (`int` or `Tuple[int, int]` or `List[int]`):
The desired output size.
max_size (`int`, *optional*):
The maximum allowed output size.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred from the input image.
"""
image_size = get_image_size(input_image, input_data_format)
if isinstance(size, (list, tuple)):
return size
return get_size_with_aspect_ratio(image_size, size, max_size)
def get_numpy_to_framework_fn(arr) -> Callable:
"""
Args:
arr (`np.ndarray`): The array to convert.
"""
if isinstance(arr, np.ndarray):
return np.array
if is_tf_available() and is_tf_tensor(arr):
import tensorflow as tf
return tf.convert_to_tensor
if is_torch_available() and is_torch_tensor(arr):
import torch
return torch.tensor
if is_flax_available() and is_jax_tensor(arr):
import jax.numpy as jnp
return jnp.array
raise ValueError(f"Cannot convert arrays of type {type(arr)}")
def safe_squeeze(arr: np.ndarray, axis: Optional[int] = None) -> np.ndarray:
"""
Args:
arr (`np.ndarray`): The array to squeeze.
axis (`int`, *optional*): The axis to squeeze.
"""
if axis is None:
return arr.squeeze()
try:
return arr.squeeze(axis=axis)
except ValueError:
return arr
def normalize_annotation(annotation: Dict, image_size: Tuple[int, int]) -> Dict:
"""
Args:
annotation (`Dict`): The annotation to normalize.
image_size (`Tuple[int, int]`): The height and width of the image.
"""
image_height, image_width = image_size
norm_annotation = {}
for key, value in annotation.items():
if key == "boxes":
boxes = value
boxes = corners_to_center_format(boxes)
boxes /= np.asarray([image_width, image_height, image_width, image_height], dtype=np.float32)
norm_annotation[key] = boxes
else:
norm_annotation[key] = value
return norm_annotation
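A worked example of the normalization above, assuming the module-level function is imported directly: a corner-format box `[100, 50, 300, 250]` on a 400x600 (height x width) image becomes `[200, 150, 200, 200]` in center format and, after dividing by `[600, 400, 600, 400]`, roughly `[0.333, 0.375, 0.333, 0.5]`.

```python
import numpy as np
from transformers.models.detr.image_processing_detr import normalize_annotation

annotation = {"boxes": np.array([[100.0, 50.0, 300.0, 250.0]], dtype=np.float32)}
print(normalize_annotation(annotation, image_size=(400, 600))["boxes"])  # ~[[0.3333 0.375 0.3333 0.5]]
```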
def max_across_indices(values: Iterable[Any]) -> List[Any]:
"""
Args:
values (`Iterable[Any]`): The iterable of values to compare.
"""
return [max(values_i) for values_i in zip(*values)]
def get_max_height_width(
images: List[np.ndarray], input_data_format: Optional[Union[str, ChannelDimension]] = None
) -> List[int]:
"""
Get the maximum height and width across all images in a batch.
"""
if input_data_format is None:
input_data_format = infer_channel_dimension_format(images[0])
if input_data_format == ChannelDimension.FIRST:
_, max_height, max_width = max_across_indices([img.shape for img in images])
elif input_data_format == ChannelDimension.LAST:
max_height, max_width, _ = max_across_indices([img.shape for img in images])
else:
raise ValueError(f"Invalid channel dimension format: {input_data_format}")
return (max_height, max_width)
def make_pixel_mask(
image: np.ndarray, output_size: Tuple[int, int], input_data_format: Optional[Union[str, ChannelDimension]] = None
) -> np.ndarray:
"""
Make a pixel mask for the image, where 1 indicates a valid pixel and 0 indicates padding.
Args:
image (`np.ndarray`):
Image to make the pixel mask for.
output_size (`Tuple[int, int]`):
Output size of the mask.
"""
input_height, input_width = get_image_size(image, channel_dim=input_data_format)
mask = np.zeros(output_size, dtype=np.int64)
mask[:input_height, :input_width] = 1
return mask
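Together, `get_max_height_width` and `make_pixel_mask` implement the batching convention used later by `pad`: every image is padded to the largest height/width in the batch and the mask marks which pixels are real. A small sketch:

```python
import numpy as np
from transformers.models.detr.image_processing_detr import get_max_height_width, make_pixel_mask

images = [np.zeros((3, 480, 640)), np.zeros((3, 600, 400))]   # channels-first images
max_size = get_max_height_width(images)                        # (600, 640)
mask = make_pixel_mask(images[0], output_size=max_size)        # 1 inside the 480x640 region, 0 elsewhere
print(max_size, mask.shape, int(mask.sum()))                   # (600, 640) (600, 640) 307200
```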
def convert_coco_poly_to_mask(segmentations, height: int, width: int) -> np.ndarray:
"""
Convert a COCO polygon annotation to a mask.
Args:
segmentations (`List[List[float]]`):
List of polygons, each polygon represented by a list of x-y coordinates.
height (`int`):
Height of the mask.
width (`int`):
Width of the mask.
"""
try:
from pycocotools import mask as coco_mask
except ImportError:
raise ImportError("Pycocotools is not installed in your environment.")
masks = []
for polygons in segmentations:
rles = coco_mask.frPyObjects(polygons, height, width)
mask = coco_mask.decode(rles)
if len(mask.shape) < 3:
mask = mask[..., None]
mask = np.asarray(mask, dtype=np.uint8)
mask = np.any(mask, axis=2)
masks.append(mask)
if masks:
masks = np.stack(masks, axis=0)
else:
masks = np.zeros((0, height, width), dtype=np.uint8)
return masks
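A hedged example of the polygon rasterization above (it requires `pycocotools` to be installed): a single square polygon produces one binary mask.

```python
from transformers.models.detr.image_processing_detr import convert_coco_poly_to_mask

# One object with one polygon: a square with corners (2, 2) .. (6, 6) in x-y coordinates
segmentations = [[[2.0, 2.0, 6.0, 2.0, 6.0, 6.0, 2.0, 6.0]]]
masks = convert_coco_poly_to_mask(segmentations, height=10, width=10)
print(masks.shape)      # (1, 10, 10)
print(masks[0].sum())   # number of pixels inside the square
```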
def prepare_coco_detection_annotation(
image,
target,
return_segmentation_masks: bool = False,
input_data_format: Optional[Union[ChannelDimension, str]] = None,
):
"""
Convert the target in COCO format into the format expected by DETR.
"""
image_height, image_width = get_image_size(image, channel_dim=input_data_format)
image_id = target["image_id"]
image_id = np.asarray([image_id], dtype=np.int64)
annotations = target["annotations"]
annotations = [obj for obj in annotations if "iscrowd" not in obj or obj["iscrowd"] == 0]
classes = [obj["category_id"] for obj in annotations]
classes = np.asarray(classes, dtype=np.int64)
area = np.asarray([obj["area"] for obj in annotations], dtype=np.float32)
iscrowd = np.asarray([obj["iscrowd"] if "iscrowd" in obj else 0 for obj in annotations], dtype=np.int64)
boxes = [obj["bbox"] for obj in annotations]
boxes = np.asarray(boxes, dtype=np.float32).reshape(-1, 4)
boxes[:, 2:] += boxes[:, :2]
boxes[:, 0::2] = boxes[:, 0::2].clip(min=0, max=image_width)
boxes[:, 1::2] = boxes[:, 1::2].clip(min=0, max=image_height)
keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0])
new_target = {}
new_target["image_id"] = image_id
new_target["class_labels"] = classes[keep]
new_target["boxes"] = boxes[keep]
new_target["area"] = area[keep]
new_target["iscrowd"] = iscrowd[keep]
new_target["orig_size"] = np.asarray([int(image_height), int(image_width)], dtype=np.int64)
if annotations and "keypoints" in annotations[0]:
keypoints = [obj["keypoints"] for obj in annotations]
keypoints = np.asarray(keypoints, dtype=np.float32)
keypoints = keypoints[keep]
num_keypoints = keypoints.shape[0]
keypoints = keypoints.reshape((-1, 3)) if num_keypoints else keypoints
new_target["keypoints"] = keypoints
if return_segmentation_masks:
segmentation_masks = [obj["segmentation"] for obj in annotations]
masks = convert_coco_poly_to_mask(segmentation_masks, image_height, image_width)
new_target["masks"] = masks[keep]
return new_target
def masks_to_boxes(masks: np.ndarray) -> np.ndarray:
"""
Compute the bounding boxes around the provided panoptic segmentation masks.
Args:
masks: masks in format `[number_masks, height, width]`, where N is the number of masks
Returns:
boxes: bounding boxes in `[number_masks, 4]` format, in xyxy coordinates
"""
if masks.size == 0:
return np.zeros((0, 4))
h, w = masks.shape[-2:]
y = np.arange(0, h, dtype=np.float32)
x = np.arange(0, w, dtype=np.float32)
y, x = np.meshgrid(y, x, indexing="ij")
x_mask = masks * np.expand_dims(x, axis=0)
x_max = x_mask.reshape(x_mask.shape[0], -1).max(-1)
x = np.ma.array(x_mask, mask=~(np.array(masks, dtype=bool)))
x_min = x.filled(fill_value=1e8)
x_min = x_min.reshape(x_min.shape[0], -1).min(-1)
y_mask = masks * np.expand_dims(y, axis=0)
y_max = y_mask.reshape(x_mask.shape[0], -1).max(-1)
y = np.ma.array(y_mask, mask=~(np.array(masks, dtype=bool)))
y_min = y.filled(fill_value=1e8)
y_min = y_min.reshape(y_min.shape[0], -1).min(-1)
return np.stack([x_min, y_min, x_max, y_max], 1)
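A small check of `masks_to_boxes`: a single 10x10 mask whose foreground spans rows 2..4 and columns 3..6 yields the box `[3, 2, 6, 4]` in `(x_min, y_min, x_max, y_max)` format.

```python
import numpy as np
from transformers.models.detr.image_processing_detr import masks_to_boxes

mask = np.zeros((1, 10, 10), dtype=np.uint8)
mask[0, 2:5, 3:7] = 1
print(masks_to_boxes(mask))  # [[3. 2. 6. 4.]]
```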
def prepare_coco_panoptic_annotation(
image: np.ndarray,
target: Dict,
masks_path: Union[str, pathlib.Path],
return_masks: bool = True,
input_data_format: Union[ChannelDimension, str] = None,
) -> Dict:
"""
Prepare a COCO panoptic annotation for DETR.
Args:
image: the input image
target: the dictionary containing the target information
masks_path: the path to the directory containing the segmentation masks
return_masks: whether to return the masks
input_data_format: the channel dimension format of the input data
Returns:
new_target: the prepared COCO panoptic annotation dictionary
"""
image_height, image_width = get_image_size(image, channel_dim=input_data_format)
annotation_path = pathlib.Path(masks_path) / target["file_name"]
new_target = {}
new_target["image_id"] = np.asarray([target["image_id"] if "image_id" in target else target["id"]], dtype=np.int64)
new_target["size"] = np.asarray([image_height, image_width], dtype=np.int64)
new_target["orig_size"] = np.asarray([image_height, image_width], dtype=np.int64)
if "segments_info" in target:
masks = np.asarray(PIL.Image.open(annotation_path), dtype=np.uint32)
masks = rgb_to_id(masks)
ids = np.array([segment_info["id"] for segment_info in target["segments_info"]])
masks = masks == ids[:, None, None]
masks = masks.astype(np.uint8)
if return_masks:
new_target["masks"] = masks
new_target["boxes"] = masks_to_boxes(masks)
new_target["class_labels"] = np.array(
[segment_info["category_id"] for segment_info in target["segments_info"]], dtype=np.int64
)
new_target["iscrowd"] = np.asarray(
[segment_info["iscrowd"] for segment_info in target["segments_info"]], dtype=np.int64
)
new_target["area"] = np.asarray(
[segment_info["area"] for segment_info in target["segments_info"]], dtype=np.float32
)
return new_target
def get_segmentation_image(
masks: np.ndarray, input_size: Tuple, target_size: Tuple, stuff_equiv_classes, deduplicate=False
):
"""
Get the segmentation image.
Args:
masks: the array of segmentation masks
input_size: the input size
target_size: the target size
stuff_equiv_classes: the equivalence classes for "stuff" categories
deduplicate: whether to deduplicate segment ids
Returns:
segmentation_image: the segmentation image
"""
h, w = input_size
final_h, final_w = target_size
m_id = scipy.special.softmax(masks.transpose(0, 1), -1)
if m_id.shape[-1] == 0:
m_id = np.zeros((h, w), dtype=np.int64)
else:
m_id = m_id.argmax(-1).reshape(h, w)
if deduplicate:
for equiv in stuff_equiv_classes.values():
for eq_id in equiv:
m_id[m_id == eq_id] = equiv[0]
seg_img = id_to_rgb(m_id)
seg_img = resize(seg_img, (final_w, final_h), resample=PILImageResampling.NEAREST)
return seg_img
def post_process_panoptic_sample(
out_logits: np.ndarray,
masks: np.ndarray,
boxes: np.ndarray,
processed_size: Tuple[int, int],
target_size: Tuple[int, int],
is_thing_map: Dict,
threshold=0.85,
) -> Dict:
"""
Converts the output of [`DetrForSegmentation`] into panoptic segmentation predictions for a single sample.
Args:
out_logits (`torch.Tensor`):
The logits for this sample.
masks (`torch.Tensor`):
The predicted segmentation masks for this sample.
boxes (`torch.Tensor`):
The predicted bounding boxes for this sample. The boxes are in the normalized format `(center_x, center_y,
width, height)` and values between `[0, 1]`, relative to the size the image (disregarding padding).
processed_size (`Tuple[int, int]`):
The processed size of the image `(height, width)`, as returned by the preprocessing step i.e. the size
after data augmentation but before batching.
target_size (`Tuple[int, int]`):
The target size of the image, `(height, width)` corresponding to the requested final size of the
prediction.
is_thing_map (`Dict`):
A dictionary mapping class indices to a boolean value indicating whether the class is a thing or not.
threshold (`float`, *optional*, defaults to 0.85):
The threshold used to binarize the segmentation masks.
"""
scores, labels = score_labels_from_class_probabilities(out_logits)
keep = (labels != out_logits.shape[-1] - 1) & (scores > threshold)
cur_scores = scores[keep]
cur_classes = labels[keep]
cur_boxes = center_to_corners_format(boxes[keep])
if len(cur_boxes) != len(cur_classes):
raise ValueError("Not as many boxes as there are classes")
cur_masks = masks[keep]
cur_masks = resize(cur_masks[:, None], processed_size, resample=PILImageResampling.BILINEAR)
cur_masks = safe_squeeze(cur_masks, 1)
b, h, w = cur_masks.shape
cur_masks = cur_masks.reshape(b, -1)
stuff_equiv_classes = defaultdict(list)
for k, label in enumerate(cur_classes):
if not is_thing_map[label]:
stuff_equiv_classes[label].append(k)
seg_img = get_segmentation_image(cur_masks, processed_size, target_size, stuff_equiv_classes, deduplicate=True)
area = get_mask_area(cur_masks, processed_size, n_classes=len(cur_scores))
if cur_classes.size() > 0:
filtered_small = np.array([a <= 4 for a in area], dtype=bool)
while filtered_small.any():
cur_masks = cur_masks[~filtered_small]
cur_scores = cur_scores[~filtered_small]
cur_classes = cur_classes[~filtered_small]
seg_img = get_segmentation_image(cur_masks, (h, w), target_size, stuff_equiv_classes, deduplicate=True)
area = get_mask_area(seg_img, target_size, n_classes=len(cur_scores))
filtered_small = np.array([a <= 4 for a in area], dtype=bool)
else:
cur_classes = np.ones((1, 1), dtype=np.int64)
segments_info = [
{"id": i, "isthing": is_thing_map[cat], "category_id": int(cat), "area": a}
for i, (cat, a) in enumerate(zip(cur_classes, area))
]
del cur_classes
with io.BytesIO() as out:
PIL.Image.fromarray(seg_img).save(out, format="PNG")
predictions = {"png_string": out.getvalue(), "segments_info": segments_info}
return predictions
def resize_annotation(
annotation: Dict[str, Any],
orig_size: Tuple[int, int],
target_size: Tuple[int, int],
threshold: float = 0.5,
resample: PILImageResampling = PILImageResampling.NEAREST,
):
"""
Resizes an annotation to a target size.
Args:
annotation (`Dict[str, Any]`):
The annotation dictionary.
orig_size (`Tuple[int, int]`):
The original size of the input image.
target_size (`Tuple[int, int]`):
The target size of the image, as returned by the preprocessing `resize` step.
threshold (`float`, *optional*, defaults to 0.5):
The threshold used to binarize the segmentation masks.
resample (`PILImageResampling`, defaults to `PILImageResampling.NEAREST`):
The resampling filter to use when resizing the masks.
"""
ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(target_size, orig_size))
ratio_height, ratio_width = ratios
new_annotation = {}
new_annotation["size"] = target_size
for key, value in annotation.items():
if key == "boxes":
boxes = value
scaled_boxes = boxes * np.asarray([ratio_width, ratio_height, ratio_width, ratio_height], dtype=np.float32)
new_annotation["boxes"] = scaled_boxes
elif key == "area":
area = value
scaled_area = area * (ratio_width * ratio_height)
new_annotation["area"] = scaled_area
elif key == "masks":
masks = value[:, None]
masks = np.array([resize(mask, target_size, resample=resample) for mask in masks])
masks = masks.astype(np.float32)
masks = masks[:, 0] > threshold
new_annotation["masks"] = masks
elif key == "size":
new_annotation["size"] = target_size
else:
new_annotation[key] = value
return new_annotation
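A worked example of `resize_annotation` when the image is upscaled by a factor of 2 in both dimensions: boxes scale with the per-axis ratios and the area scales with their product.

```python
import numpy as np
from transformers.models.detr.image_processing_detr import resize_annotation

annotation = {"boxes": np.array([[10.0, 20.0, 30.0, 40.0]]), "area": np.array([600.0]), "size": (100, 200)}
resized = resize_annotation(annotation, orig_size=(100, 200), target_size=(200, 400))
print(resized["boxes"])  # [[20. 40. 60. 80.]]
print(resized["area"])   # [2400.]
print(resized["size"])   # (200, 400)
```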
def binary_mask_to_rle(mask):
"""
Converts given binary mask of shape `(height, width)` to the run-length encoding (RLE) format.
Args:
mask (`torch.Tensor` or `numpy.array`):
A binary mask tensor of shape `(height, width)` where 0 denotes background and 1 denotes the target
segment_id or class_id.
Returns:
`List`: Run-length encoded list of the binary mask. Refer to COCO API for more information about the RLE
format.
"""
if is_torch_tensor(mask):
mask = mask.numpy()
pixels = mask.flatten()
pixels = np.concatenate([[0], pixels, [0]])
runs = np.where(pixels[1:] != pixels[:-1])[0] + 1
runs[1::2] -= runs[::2]
return list(runs)
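A worked example of the RLE above: runs of ones are stored as alternating (start, length) pairs over the flattened mask, with 1-based start offsets.

```python
import numpy as np
from transformers.models.detr.image_processing_detr import binary_mask_to_rle

mask = np.array([[0, 1, 1], [0, 1, 0]])
print([int(r) for r in binary_mask_to_rle(mask)])  # [2, 2, 5, 1]: two 1s starting at pixel 2, one 1 at pixel 5
```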
def convert_segmentation_to_rle(segmentation):
"""
Converts given segmentation map of shape `(height, width)` to the run-length encoding (RLE) format.
"""
# Get the unique segment ids present in the segmentation map, i.e. the different classes or segments
segment_ids = torch.unique(segmentation)
# Collect the run-length encoding of each class or segment
run_length_encodings = []
# Iterate over every unique segment id
for idx in segment_ids:
# Build a binary mask where pixels belonging to the current segment id are 1 and all others are 0
mask = torch.where(segmentation == idx, 1, 0)
# Convert the binary mask to its run-length encoding (RLE)
rle = binary_mask_to_rle(mask)
# Append the RLE of the current class or segment to the result list
run_length_encodings.append(rle)
# Return the run-length encodings of all classes or segments
return run_length_encodings
# Remove low-scoring and no-object predictions, keeping only the qualifying `masks`, `scores` and `labels`
def remove_low_and_no_objects(masks, scores, labels, object_mask_threshold, num_labels):
"""
Binarize the given masks using `object_mask_threshold`, it returns the associated values of `masks`, `scores` and
`labels`.
Args:
masks (`torch.Tensor`):
A tensor of shape `(num_queries, height, width)`.
scores (`torch.Tensor`):
A tensor of shape `(num_queries)`.
labels (`torch.Tensor`):
A tensor of shape `(num_queries)`.
object_mask_threshold (`float`):
A number between 0 and 1 used to binarize the masks.
Raises:
`ValueError`: Raised when the first dimension doesn't match in all input tensors.
Returns:
`Tuple[`torch.Tensor`, `torch.Tensor`, `torch.Tensor`]`: The `masks`, `scores` and `labels` without the region
< `object_mask_threshold`.
"""
# Check that the first dimension matches across all input tensors; raise a ValueError otherwise
if not (masks.shape[0] == scores.shape[0] == labels.shape[0]):
raise ValueError("mask, scores and labels must have the same shape!")
# Build a boolean index selecting the predictions to keep
to_keep = labels.ne(num_labels) & (scores > object_mask_threshold)
# Return the filtered `masks`, `scores` and `labels`
return masks[to_keep], scores[to_keep], labels[to_keep]
# Check the validity of a segment mask; return whether a valid mask exists together with the mask for class k
def check_segment_validity(mask_labels, mask_probs, k, mask_threshold=0.5, overlap_mask_area_threshold=0.8):
# Get the mask associated with class k
mask_k = mask_labels == k
# Compute the area of the mask for class k
mask_k_area = mask_k.sum()
# Compute the original predicted area for class k
original_area = (mask_probs[k] >= mask_threshold).sum()
# A mask exists only if both areas are larger than 0
mask_exists = mask_k_area > 0 and original_area > 0
# If the mask exists, additionally check that the area ratio exceeds the given threshold
if mask_exists:
area_ratio = mask_k_area / original_area
if not area_ratio.item() > overlap_mask_area_threshold:
mask_exists = False
# Return whether the mask exists along with the mask for class k
return mask_exists, mask_k
# Compute the segments of a segmentation map and return the final segmentation together with per-segment metadata
def compute_segments(
mask_probs,
pred_scores,
pred_labels,
mask_threshold: float = 0.5,
overlap_mask_area_threshold: float = 0.8,
label_ids_to_fuse: Optional[Set[int]] = None,
target_size: Tuple[int, int] = None,
):
# Derive the output height and width from `target_size` if provided, otherwise from the mask shape
height = mask_probs.shape[1] if target_size is None else target_size[0]
width = mask_probs.shape[2] if target_size is None else target_size[1]
# Allocate an empty integer tensor to hold the final segmentation map
segmentation = torch.zeros((height, width), dtype=torch.int32, device=mask_probs.device)
# List of dictionaries holding the metadata of each segment
segments: List[Dict] = []
# If a `target_size` is given, bilinearly interpolate the mask probabilities to that size
if target_size is not None:
mask_probs = nn.functional.interpolate(
mask_probs.unsqueeze(0), size=target_size, mode="bilinear", align_corners=False
)[0]
# Id of the current segment
current_segment_id = 0
# Weight each mask by its predicted score
mask_probs *= pred_scores.view(-1, 1, 1)
# Predicted class label for each pixel location
mask_labels = mask_probs.argmax(0) # [height, width]
# Keep track of the segment id assigned to each "stuff" class
stuff_memory_list: Dict[str, int] = {}
# Loop over every predicted query
for k in range(pred_labels.shape[0]):
# Predicted class of the current query
pred_class = pred_labels[k].item()
# Whether this class should be fused with other instances of the same class
should_fuse = pred_class in label_ids_to_fuse
# Check whether the mask of the current query exists and is large enough to form a segment
mask_exists, mask_k = check_segment_validity(
mask_labels, mask_probs, k, mask_threshold, overlap_mask_area_threshold
)
# If a valid segmentation mask exists
if mask_exists:
# Reuse the existing segment id if this "stuff" class was already seen
if pred_class in stuff_memory_list:
current_segment_id = stuff_memory_list[pred_class]
else:
# Otherwise start a new segment id and assign it to the current class
current_segment_id += 1
# Add the current object segment to the final segmentation map
segmentation[mask_k] = current_segment_id
# Predicted score of the current query, rounded to 6 decimals
segment_score = round(pred_scores[k].item(), 6)
# Append the metadata of the current segment to the segments list
segments.append(
{
"id": current_segment_id,
"label_id": pred_class,
"was_fused": should_fuse,
"score": segment_score,
}
)
# If fusing, remember the segment id assigned to this class
if should_fuse:
stuff_memory_list[pred_class] = current_segment_id
# Return the final segmentation map and the list of segments
return segmentation, segments
class DetrImageProcessor(BaseImageProcessor):
r"""
Constructs a Detr image processor.
Args:
format (`str`, *optional*, defaults to `"coco_detection"`):
Data format of the annotations. One of "coco_detection" or "coco_panoptic".
do_resize (`bool`, *optional*, defaults to `True`):
Controls whether to resize the image's `(height, width)` dimensions to the specified `size`. Can be
overridden by the `do_resize` parameter in the `preprocess` method.
size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 800, "longest_edge": 1333}`):
Size of the image's `(height, width)` dimensions after resizing. Can be overridden by the `size` parameter
in the `preprocess` method.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
Resampling filter to use if resizing the image.
do_rescale (`bool`, *optional*, defaults to `True`):
Controls whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the
`do_rescale` parameter in the `preprocess` method.
rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the
`preprocess` method.
do_normalize (`bool`, *optional*, defaults to True):
Controls whether to normalize the image. Can be overridden by the `do_normalize` parameter in the
`preprocess` method.
image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_MEAN`):
Mean values to use when normalizing the image. Can be a single value or a list of values, one for each
channel. Can be overridden by the `image_mean` parameter in the `preprocess` method.
image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_STD`):
Standard deviation values to use when normalizing the image. Can be a single value or a list of values, one
for each channel. Can be overridden by the `image_std` parameter in the `preprocess` method.
do_convert_annotations (`bool`, *optional*, defaults to `True`):
Controls whether to convert the annotations to the format expected by the DETR model. Converts the
bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
do_pad (`bool`, *optional*, defaults to `True`):
Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess`
method. If `True` will pad the images in the batch to the largest height and width in the batch.
Padding will be applied to the bottom and right of the image with zeros.
"""
model_input_names = ["pixel_values", "pixel_mask"]
def __init__(
self,
format: Union[str, AnnotationFormat] = AnnotationFormat.COCO_DETECTION,
do_resize: bool = True,
size: Dict[str, int] = None,
resample: PILImageResampling = PILImageResampling.BILINEAR,
do_rescale: bool = True,
rescale_factor: Union[int, float] = 1 / 255,
do_normalize: bool = True,
image_mean: Union[float, List[float]] = None,
image_std: Union[float, List[float]] = None,
do_convert_annotations: Optional[bool] = None,
do_pad: bool = True,
**kwargs,
) -> None:
# If "pad_and_return_pixel_mask" is passed in `kwargs`, use it to set `do_pad` and remove it from the kwargs
if "pad_and_return_pixel_mask" in kwargs:
do_pad = kwargs.pop("pad_and_return_pixel_mask")
# If "max_size" is passed in `kwargs`, warn once and pop it; `size['longest_edge']` should be used instead
if "max_size" in kwargs:
logger.warning_once(
"The `max_size` parameter is deprecated and will be removed in v4.26. "
"Please specify in `size['longest_edge']` instead.",
)
max_size = kwargs.pop("max_size")
else:
max_size = None if size is None else 1333
# If `size` is `None`, fall back to the default size dict with "shortest_edge" and "longest_edge"
size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333}
# Resolve the size dict from the given `size` and `max_size`, without defaulting to a square
size = get_size_dict(size, max_size=max_size, default_to_square=False)
# Backwards compatibility: if `do_convert_annotations` is `None`, default it to the value of `do_normalize`
if do_convert_annotations is None:
do_convert_annotations = do_normalize
# Call the parent initializer with the remaining `kwargs`
super().__init__(**kwargs)
# Set the various attributes that control the image processing pipeline
self.format = format
self.do_resize = do_resize
self.size = size
self.resample = resample
self.do_rescale = do_rescale
self.rescale_factor = rescale_factor
self.do_normalize = do_normalize
self.do_convert_annotations = do_convert_annotations
self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
self.do_pad = do_pad
# List of valid keys used to validate the arguments passed to the processor
self._valid_processor_keys = [
"images",
"annotations",
"return_segmentation_masks",
"masks_path",
"do_resize",
"size",
"resample",
"do_rescale",
"rescale_factor",
"do_normalize",
"do_convert_annotations",
"image_mean",
"image_std",
"do_pad",
"format",
"return_tensors",
"data_format",
"input_data_format",
]
@classmethod
def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs):
"""
Overrides the base class `from_dict` so that parameters can still be updated when the image processor is
created from a dict with kwargs, e.g. `DetrImageProcessor.from_pretrained(checkpoint, size=600, max_size=800)`.
"""
# Copy the input dict so the original is not modified
image_processor_dict = image_processor_dict.copy()
# If `kwargs` contains "max_size", move it into `image_processor_dict` and remove it from `kwargs`
if "max_size" in kwargs:
image_processor_dict["max_size"] = kwargs.pop("max_size")
# If `kwargs` contains "pad_and_return_pixel_mask", move it into `image_processor_dict` and remove it from `kwargs`
if "pad_and_return_pixel_mask" in kwargs:
image_processor_dict["pad_and_return_pixel_mask"] = kwargs.pop("pad_and_return_pixel_mask")
# Delegate to the parent `from_dict` with the updated dict and remaining kwargs
return super().from_dict(image_processor_dict, **kwargs)
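A short sketch of instantiating the processor with the defaults documented in the class docstring:

```python
from transformers import DetrImageProcessor

processor = DetrImageProcessor()   # shortest_edge=800, longest_edge=1333, ImageNet mean/std
print(processor.size)              # {'shortest_edge': 800, 'longest_edge': 1333}
print(processor.do_resize, processor.do_rescale, processor.do_normalize, processor.do_pad)
```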
def prepare_annotation(
self,
image: np.ndarray,
target: Dict,
format: Optional[AnnotationFormat] = None,
return_segmentation_masks: bool = None,
masks_path: Optional[Union[str, pathlib.Path]] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> Dict:
"""
Prepare an annotation for feeding into the DETR model.
"""
# If no format is given, fall back to `self.format`
format = format if format is not None else self.format
# Dispatch to the preparation function matching the annotation format
if format == AnnotationFormat.COCO_DETECTION:
# For COCO detection, `return_segmentation_masks` defaults to False
return_segmentation_masks = False if return_segmentation_masks is None else return_segmentation_masks
# Prepare the annotation in COCO detection format
target = prepare_coco_detection_annotation(
image, target, return_segmentation_masks, input_data_format=input_data_format
)
elif format == AnnotationFormat.COCO_PANOPTIC:
# For COCO panoptic, `return_segmentation_masks` defaults to True
return_segmentation_masks = True if return_segmentation_masks is None else return_segmentation_masks
# Prepare the annotation in COCO panoptic format
target = prepare_coco_panoptic_annotation(
image,
target,
masks_path=masks_path,
return_masks=return_segmentation_masks,
input_data_format=input_data_format,
)
else:
# Raise if the requested annotation format is not supported
raise ValueError(f"Format {format} is not supported.")
# Return the prepared annotation
return target
def prepare(self, image, target, return_segmentation_masks=None, masks_path=None):
"""
Prepare an image and a target annotation by delegating to `prepare_annotation`.
"""
# Warn that `prepare` is deprecated in favor of `prepare_annotation`
logger.warning_once(
"The `prepare` method is deprecated and will be removed in a v4.33. "
"Please use `prepare_annotation` instead. Note: the `prepare_annotation` method "
"does not return the image anymore.",
)
# Prepare the target annotation
target = self.prepare_annotation(image, target, return_segmentation_masks, masks_path, self.format)
# Return the (unchanged) image together with the prepared annotation
return image, target
def convert_coco_poly_to_mask(self, *args, **kwargs):
"""
Deprecated wrapper around the module-level `convert_coco_poly_to_mask`; it will be removed in v4.33.
"""
logger.warning_once("The `convert_coco_poly_to_mask` method is deprecated and will be removed in v4.33. ")
# Delegate to the module-level `convert_coco_poly_to_mask` function
return convert_coco_poly_to_mask(*args, **kwargs)
# Deprecated: will be removed in v4.33
def prepare_coco_detection(self, *args, **kwargs):
logger.warning_once("The `prepare_coco_detection` method is deprecated and will be removed in v4.33. ")
# Delegate to `prepare_coco_detection_annotation`
return prepare_coco_detection_annotation(*args, **kwargs)
# Deprecated: will be removed in v4.33
def prepare_coco_panoptic(self, *args, **kwargs):
logger.warning_once("The `prepare_coco_panoptic` method is deprecated and will be removed in v4.33. ")
# Delegate to `prepare_coco_panoptic_annotation`
return prepare_coco_panoptic_annotation(*args, **kwargs)
# Resize an image according to the given size and resampling parameters
def resize(
self,
image: np.ndarray,
size: Dict[str, int],
resample: PILImageResampling = PILImageResampling.BILINEAR,
data_format: Optional[ChannelDimension] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
) -> np.ndarray:
"""
Resize the image to the given size. Size can be `min_size` (scalar) or `(height, width)` tuple. If size is an
int, smaller edge of the image will be matched to this number.
Args:
image (`np.ndarray`):
Image to resize.
size (`Dict[str, int]`):
Dictionary containing the size to resize to. Can contain the keys `shortest_edge` and `longest_edge` or
`height` and `width`.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
Resampling filter to use if resizing the image.
data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format for the output image. If unset, the channel dimension format of the input
image is used.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred.
"""
# If `max_size` is passed, warn once and pop it from the kwargs
if "max_size" in kwargs:
logger.warning_once(
"The `max_size` parameter is deprecated and will be removed in v4.26. "
"Please specify in `size['longest_edge']` instead.",
)
max_size = kwargs.pop("max_size")
else:
max_size = None
# Resolve the size dict, taking `max_size` into account and not defaulting to a square
size = get_size_dict(size, max_size=max_size, default_to_square=False)
# Compute the output size from the provided size specification
if "shortest_edge" in size and "longest_edge" in size:
size = get_resize_output_image_size(
image, size["shortest_edge"], size["longest_edge"], input_data_format=input_data_format
)
elif "height" in size and "width" in size:
size = (size["height"], size["width"])
else:
# Raise if the size dict does not contain a valid combination of keys
raise ValueError(
"Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got"
f" {size.keys()}."
)
# Resize the image with the resolved size and return it
image = resize(
image, size=size, resample=resample, data_format=data_format, input_data_format=input_data_format, **kwargs
)
return image
# Delegate to the module-level `resize_annotation` to resize an annotation to match the resized image
def resize_annotation(
self,
annotation,
orig_size,
size,
resample: PILImageResampling = PILImageResampling.NEAREST,
) -> Dict:
"""
Resize the annotation to match the resized image. If size is an int, smaller edge of the mask will be matched
to this number.
"""
return resize_annotation(annotation, orig_size=orig_size, target_size=size, resample=resample)
# TODO (Amy) - update to use `rescale_factor` instead of `scale`
# Rescale the image by the given `rescale_factor`: image = image * rescale_factor
def rescale(
self,
image: np.ndarray,
rescale_factor: float,
data_format: Optional[Union[str, ChannelDimension]] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> np.ndarray:
"""
Rescale the image by the given factor. image = image * rescale_factor.
Args:
image (`np.ndarray`):
Image to rescale.
rescale_factor (`float`):
The value to use for rescaling.
data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format for the output image. If unset, the channel dimension format of the input
image is used. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
input_data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format for the input image. If unset, is inferred from the input image. Can be
one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
"""
return rescale(image, rescale_factor, data_format=data_format, input_data_format=input_data_format)
# Normalize the boxes in an annotation from `[top_left_x, top_left_y, bottom_right_x, bottom_right_y]` to
# `[center_x, center_y, width, height]` format and from absolute to relative pixel values, given the image size
def normalize_annotation(self, annotation: Dict, image_size: Tuple[int, int]) -> Dict:
"""
Normalize the boxes in the annotation from `[top_left_x, top_left_y, bottom_right_x, bottom_right_y]` to
`[center_x, center_y, width, height]` format and from absolute to relative pixel values.
"""
return normalize_annotation(annotation, image_size=image_size)
# Update an annotation to match a padded image, based on the input/output image sizes, the padding and whether the boxes should be updated
def _update_annotation_for_padded_image(
self,
annotation: Dict,
input_image_size: Tuple[int, int],
output_image_size: Tuple[int, int],
padding,
update_bboxes,
) -> Dict:
"""
Update the annotation for a padded image.
"""
# New dict holding the updated annotation
new_annotation = {}
# Record the size of the padded output image
new_annotation["size"] = output_image_size
# Iterate over every key/value pair in the original annotation
for key, value in annotation.items():
# Masks are padded to the output size
if key == "masks":
# Get the masks
masks = value
# Pad the masks with zeros using constant padding
masks = pad(
masks,
padding,
mode=PaddingMode.CONSTANT,
constant_values=0,
input_data_format=ChannelDimension.FIRST,
)
# Safely squeeze away axis 1 of the padded masks
masks = safe_squeeze(masks, 1)
# Store the padded masks
new_annotation["masks"] = masks
# Boxes are rescaled if requested
elif key == "boxes" and update_bboxes:
# Get the boxes
boxes = value
# Rescale the normalized box coordinates by the ratio of input to output image size
boxes *= np.asarray(
[
input_image_size[1] / output_image_size[1],
input_image_size[0] / output_image_size[0],
input_image_size[1] / output_image_size[1],
input_image_size[0] / output_image_size[0],
]
)
# Store the rescaled boxes
new_annotation["boxes"] = boxes
# The size key holds the (already recorded) output image size
elif key == "size":
new_annotation["size"] = output_image_size
else:
# Copy any other key/value pair unchanged
new_annotation[key] = value
# Return the updated annotation
return new_annotation
def _pad_image(
self,
image: np.ndarray,
output_size: Tuple[int, int],
annotation: Optional[Dict[str, Any]] = None,
constant_values: Union[float, Iterable[float]] = 0,
data_format: Optional[ChannelDimension] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
update_bboxes: bool = True,
) -> np.ndarray:
"""
Pad an image with zeros to the given size.
"""
# Height and width of the input image
input_height, input_width = get_image_size(image, channel_dim=input_data_format)
# Height and width of the padded output image
output_height, output_width = output_size
# Number of pixels to pad at the bottom and on the right
pad_bottom = output_height - input_height
pad_right = output_width - input_width
# Padding configuration: ((top, bottom), (left, right))
padding = ((0, pad_bottom), (0, pad_right))
# Pad the input image with the given constant value
padded_image = pad(
image,
padding,
mode=PaddingMode.CONSTANT,
constant_values=constant_values,
data_format=data_format,
input_data_format=input_data_format,
)
# If an annotation is provided, update it to match the padded image
if annotation is not None:
annotation = self._update_annotation_for_padded_image(
annotation, (input_height, input_width), (output_height, output_width), padding, update_bboxes
)
# Return the padded image and, if provided, the updated annotation
return padded_image, annotation
# Pad a batch of images (and optionally their annotations) to the largest height and width in the batch
def pad(
self,
images: List[np.ndarray],  # list of images, as NumPy arrays, to pad
annotations: Optional[Union[AnnotationType, List[AnnotationType]]] = None,  # optional annotation or list of annotations
constant_values: Union[float, Iterable[float]] = 0,  # constant value(s) used for padding, defaults to 0
return_pixel_mask: bool = True,  # whether to return a pixel mask, defaults to True
return_tensors: Optional[Union[str, TensorType]] = None,  # type of tensors to return, if any
data_format: Optional[ChannelDimension] = None,  # channel dimension format of the output images
input_data_format: Optional[Union[str, ChannelDimension]] = None,  # channel dimension format of the input images
update_bboxes: bool = True,  # whether to update the bounding boxes to match the padded images, defaults to True
# Preprocess a batch of images and, optionally, their annotations
def preprocess(
self,
images: ImageInput,  # the input image(s) to preprocess
annotations: Optional[Union[AnnotationType, List[AnnotationType]]] = None,  # optional annotation or list of annotations
return_segmentation_masks: bool = None,  # whether to return segmentation masks
masks_path: Optional[Union[str, pathlib.Path]] = None,  # path to the directory containing the panoptic masks
do_resize: Optional[bool] = None,  # whether to resize the images
size: Optional[Dict[str, int]] = None,  # size dictionary used for resizing
resample=None,  # PIL resampling filter used when resizing
do_rescale: Optional[bool] = None,  # whether to rescale pixel values
rescale_factor: Optional[Union[int, float]] = None,  # factor used for rescaling
do_normalize: Optional[bool] = None,  # whether to normalize the images
do_convert_annotations: Optional[bool] = None,  # whether to convert the annotations to the DETR format
image_mean: Optional[Union[float, List[float]]] = None,  # mean used for normalization, single value or per channel
image_std: Optional[Union[float, List[float]]] = None,  # standard deviation used for normalization, single value or per channel
do_pad: Optional[bool] = None,  # whether to pad the images
format: Optional[Union[str, AnnotationFormat]] = None,  # annotation format, as a string or AnnotationFormat
return_tensors: Optional[Union[TensorType, str]] = None,  # type of tensors to return
data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,  # channel dimension format of the output, channels-first by default
input_data_format: Optional[Union[str, ChannelDimension]] = None,  # channel dimension format of the input images
**kwargs,  # remaining keyword arguments
# Post-processing methods - TODO: add support for other frameworks
# Inspired by https://github.com/facebookresearch/detr/blob/master/models/detr.py#L258
def post_process(self, outputs, target_sizes):
"""
Converts the raw output of [`DetrForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y,
bottom_right_x, bottom_right_y) format. Only supports PyTorch.
Args:
outputs ([`DetrObjectDetectionOutput`]):
Raw outputs of the model.
target_sizes (`torch.Tensor` of shape `(batch_size, 2)`):
Tensor containing the size (height, width) of each image of the batch. For evaluation, this must be the
original image size (before any data augmentation). For visualization, this should be the image size
after data augment, but before padding.
Returns:
`List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
in the batch as predicted by the model.
"""
# Warn that `post_process` is deprecated and will be removed
logger.warning_once(
"`post_process` is deprecated and will be removed in v5 of Transformers, please use"
" `post_process_object_detection` instead, with `threshold=0.` for equivalent results.",
)
# Grab the classification logits and the predicted boxes
out_logits, out_bbox = outputs.logits, outputs.pred_boxes
# The number of target sizes must match the batch dimension of the logits
if len(out_logits) != len(target_sizes):
raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits")
# Each target size must be a (height, width) pair
if target_sizes.shape[1] != 2:
raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch")
# Softmax over the classes, then keep the best non-"no object" class and its score
prob = nn.functional.softmax(out_logits, -1)
scores, labels = prob[..., :-1].max(-1)
# Convert boxes from (center_x, center_y, w, h) to (x0, y0, x1, y1)
boxes = center_to_corners_format(out_bbox)
# Rescale relative [0, 1] coordinates to absolute pixel coordinates
img_h, img_w = target_sizes.unbind(1)
scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device)
boxes = boxes * scale_fct[:, None, :]
# One dict of scores, labels and boxes per image
results = [{"scores": s, "labels": l, "boxes": b} for s, l, b in zip(scores, labels, boxes)]
return results
# Converts the output of the segmentation model into image segmentation predictions. Only supports PyTorch.
def post_process_segmentation(self, outputs, target_sizes, threshold=0.9, mask_threshold=0.5):
"""
Converts the output of [`DetrForSegmentation`] into image segmentation predictions. Only supports PyTorch.
Args:
outputs ([`DetrSegmentationOutput`]):
Raw outputs of the model.
target_sizes (`torch.Tensor` of shape `(batch_size, 2)` or `List[Tuple]` of length `batch_size`):
Torch Tensor (or list) corresponding to the requested final size (h, w) of each prediction.
threshold (`float`, *optional*, defaults to 0.9):
Threshold to use to filter out queries.
mask_threshold (`float`, *optional*, defaults to 0.5):
Threshold to use when turning the predicted masks into binary values.
Returns:
`List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels, and masks for an image
in the batch as predicted by the model.
"""
# Warn that this method is deprecated in favour of `post_process_semantic_segmentation`
logger.warning_once(
"`post_process_segmentation` is deprecated and will be removed in v5 of Transformers, please use"
" `post_process_semantic_segmentation`.",
)
# Class logits and raw predicted masks
out_logits, raw_masks = outputs.logits, outputs.pred_masks
# The last class index is the "no object" (empty) label
empty_label = out_logits.shape[-1] - 1
# Collected per-image predictions
preds = []
# Make sure the target size is a plain tuple
def to_tuple(tup):
if isinstance(tup, tuple):
return tup
return tuple(tup.cpu().tolist())
# Iterate over the logits, masks and target size of each image
for cur_logits, cur_masks, size in zip(out_logits, raw_masks, target_sizes):
# Softmax over classes, keep the best score and label per query
cur_scores, cur_labels = cur_logits.softmax(-1).max(-1)
# Drop empty queries and detections below the score threshold
keep = cur_labels.ne(empty_label) & (cur_scores > threshold)
cur_scores = cur_scores[keep]
cur_labels = cur_labels[keep]
cur_masks = cur_masks[keep]
# Resize the masks to the target size with bilinear interpolation
cur_masks = nn.functional.interpolate(cur_masks[:, None], to_tuple(size), mode="bilinear").squeeze(1)
# Binarize the masks with `mask_threshold`
cur_masks = (cur_masks.sigmoid() > mask_threshold) * 1
# Store the scores, labels and masks of this image
predictions = {"scores": cur_scores, "labels": cur_labels, "masks": cur_masks}
preds.append(predictions)
# Return the predictions of all images
return preds
# Adapted from https://github.com/facebookresearch/detr/blob/master/models/segmentation.py#L218
# Converts the model output into actual instance segmentation predictions. Only supports PyTorch.
def post_process_instance(self, results, outputs, orig_target_sizes, max_target_sizes, threshold=0.5):
"""
Converts the output of [`DetrForSegmentation`] into actual instance segmentation predictions. Only supports
PyTorch.
Args:
results (`List[Dict]`):
Results list obtained by [`~DetrImageProcessor.post_process`], to which "masks" results will be added.
outputs ([`DetrSegmentationOutput`]):
Raw outputs of the model.
orig_target_sizes (`torch.Tensor` of shape `(batch_size, 2)`):
Tensor containing the size (h, w) of each image of the batch. For evaluation, this must be the original
image size (before any data augmentation).
max_target_sizes (`torch.Tensor` of shape `(batch_size, 2)`):
Tensor containing the maximum size (h, w) of each image of the batch. For evaluation, this must be the
original image size (before any data augmentation).
threshold (`float`, *optional*, defaults to 0.5):
Threshold to use when turning the predicted masks into binary values.
Returns:
`List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels, boxes and masks for an
image in the batch as predicted by the model.
"""
# Warn that this method is deprecated in favour of `post_process_instance_segmentation`
logger.warning_once(
"`post_process_instance` is deprecated and will be removed in v5 of Transformers, please use"
" `post_process_instance_segmentation`.",
)
# There must be one max_target_size per orig_target_size
if len(orig_target_sizes) != len(max_target_sizes):
raise ValueError("Make sure to pass in as many orig_target_sizes as max_target_sizes")
# Largest height and width over the batch
max_h, max_w = max_target_sizes.max(0)[0].tolist()
# Squeeze the predicted masks and resize them to (max_h, max_w)
outputs_masks = outputs.pred_masks.squeeze(2)
outputs_masks = nn.functional.interpolate(
outputs_masks, size=(max_h, max_w), mode="bilinear", align_corners=False
)
# Binarize the masks with the threshold and move them to CPU
outputs_masks = (outputs_masks.sigmoid() > threshold).cpu()
# Crop each mask back to its image size and resize it to the original image size
for i, (cur_mask, t, tt) in enumerate(zip(outputs_masks, max_target_sizes, orig_target_sizes)):
img_h, img_w = t[0], t[1]
results[i]["masks"] = cur_mask[:, :img_h, :img_w].unsqueeze(1)
results[i]["masks"] = nn.functional.interpolate(
results[i]["masks"].float(), size=tuple(tt.tolist()), mode="nearest"
).byte()
# Return the updated results
return results
# Inspired by https://github.com/facebookresearch/detr/blob/master/models/segmentation.py#L241
# Inspired by https://github.com/facebookresearch/detr/blob/master/models/detr.py#L258
def post_process_object_detection(
self, outputs, threshold: float = 0.5, target_sizes: Union[TensorType, List[Tuple]] = None
):
"""
Converts the raw output of [`DetrForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y,
bottom_right_x, bottom_right_y) format. Only supports PyTorch.
Args:
outputs ([`DetrObjectDetectionOutput`]):
Raw outputs of the model.
threshold (`float`, *optional*):
Score threshold to keep object detection predictions.
target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*):
Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size
`(height, width)` of each image in the batch. If unset, predictions will not be resized.
Returns:
`List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
in the batch as predicted by the model.
"""
# Extract logits and bounding boxes from model outputs
out_logits, out_bbox = outputs.logits, outputs.pred_boxes
# Check if target_sizes is provided and validate its dimension
if target_sizes is not None:
if len(out_logits) != len(target_sizes):
raise ValueError(
"Make sure that you pass in as many target sizes as the batch dimension of the logits"
)
# Compute softmax probabilities and extract scores and labels
prob = nn.functional.softmax(out_logits, -1)
scores, labels = prob[..., :-1].max(-1)
# Convert bounding boxes from center format to [x0, y0, x1, y1]
boxes = center_to_corners_format(out_bbox)
# If target_sizes is provided, convert relative coordinates to absolute coordinates
if target_sizes is not None:
if isinstance(target_sizes, list):
img_h = torch.Tensor([i[0] for i in target_sizes])
img_w = torch.Tensor([i[1] for i in target_sizes])
else:
img_h, img_w = target_sizes.unbind(1)
# Compute scaling factors and apply to bounding boxes
scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device)
boxes = boxes * scale_fct[:, None, :]
# Filter predictions based on score threshold and construct results dictionary
results = []
for s, l, b in zip(scores, labels, boxes):
score = s[s > threshold]
label = l[s > threshold]
box = b[s > threshold]
results.append({"scores": score, "labels": label, "boxes": box})
return results
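# Example (not part of the original file): a typical inference pipeline ending in
# `post_process_object_detection`. The checkpoint and image URL are only illustrative.
import requests
import torch
from PIL import Image
from transformers import DetrForObjectDetection, DetrImageProcessor
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
processor = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50")
model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50")
inputs = processor(images=image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)
# target_sizes holds the original (height, width) of each image
target_sizes = torch.tensor([image.size[::-1]])
results = processor.post_process_object_detection(outputs, threshold=0.9, target_sizes=target_sizes)[0]
for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
    print(model.config.id2label[label.item()], round(score.item(), 3), box.tolist())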
# Converts the model output into semantic segmentation maps. Only supports PyTorch.
def post_process_semantic_segmentation(self, outputs, target_sizes: List[Tuple[int, int]] = None):
"""
Converts the output of [`DetrForSegmentation`] into semantic segmentation maps. Only supports PyTorch.
Args:
outputs ([`DetrForSegmentation`]):
Raw outputs of the model.
target_sizes (`List[Tuple[int, int]]`, *optional*):
A list of tuples (`Tuple[int, int]`) containing the target size (height, width) of each image in the
batch. If unset, predictions will not be resized.
Returns:
`List[torch.Tensor]`:
A list of length `batch_size`, where each item is a semantic segmentation map of shape (height, width)
corresponding to the target_sizes entry (if `target_sizes` is specified). Each entry of each
`torch.Tensor` correspond to a semantic class id.
"""
# Class query logits of shape [batch_size, num_queries, num_classes + 1]
class_queries_logits = outputs.logits
# Mask query logits of shape [batch_size, num_queries, height, width]
masks_queries_logits = outputs.pred_masks
# Softmax over classes and drop the last (null) class
masks_classes = class_queries_logits.softmax(dim=-1)[..., :-1]
# Turn the mask logits into probabilities, shape [batch_size, num_queries, height, width]
masks_probs = masks_queries_logits.sigmoid()
# Combine them into semantic segmentation logits of shape (batch_size, num_classes, height, width)
segmentation = torch.einsum("bqc, bqhw -> bchw", masks_classes, masks_probs)
batch_size = class_queries_logits.shape[0]
# Optionally resize the logits before taking the per-pixel argmax
if target_sizes is not None:
# One target size is required per image in the batch
if batch_size != len(target_sizes):
raise ValueError(
"Make sure that you pass in as many target sizes as the batch dimension of the logits"
)
semantic_segmentation = []
for idx in range(batch_size):
# Bilinearly resize the logits of this image to the requested size
resized_logits = nn.functional.interpolate(
segmentation[idx].unsqueeze(dim=0), size=target_sizes[idx], mode="bilinear", align_corners=False
)
# Per-pixel semantic class id
semantic_map = resized_logits[0].argmax(dim=0)
semantic_segmentation.append(semantic_map)
else:
# Per-pixel semantic class ids, split into a list over the batch
semantic_segmentation = segmentation.argmax(dim=1)
semantic_segmentation = [semantic_segmentation[i] for i in range(semantic_segmentation.shape[0])]
return semantic_segmentation
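# Example (not part of the original file): a shape sketch of the einsum above, which mixes per-query
# class probabilities with per-query mask probabilities into one map per class. Sizes are illustrative.
import torch
batch_size, num_queries, num_classes, height, width = 2, 100, 91, 32, 32
masks_classes = torch.rand(batch_size, num_queries, num_classes)  # softmax over classes, null class dropped
masks_probs = torch.rand(batch_size, num_queries, height, width)  # sigmoid of the mask logits
segmentation = torch.einsum("bqc, bqhw -> bchw", masks_classes, masks_probs)
print(segmentation.shape)  # torch.Size([2, 91, 32, 32])
print(segmentation.argmax(dim=1).shape)  # per-pixel class ids: torch.Size([2, 32, 32])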
# Inspired by https://github.com/facebookresearch/detr/blob/master/models/segmentation.py#L218
def post_process_instance_segmentation(
self,
outputs,
threshold: float = 0.5,
mask_threshold: float = 0.5,
overlap_mask_area_threshold: float = 0.8,
target_sizes: Optional[List[Tuple[int, int]]] = None,
return_coco_annotation: Optional[bool] = False,
# Adapted from https://github.com/facebookresearch/detr/blob/master/models/segmentation.py#L241
# Post-processing for panoptic segmentation
def post_process_panoptic_segmentation(
self,
outputs,
threshold: float = 0.5,
mask_threshold: float = 0.5,
overlap_mask_area_threshold: float = 0.8,
label_ids_to_fuse: Optional[Set[int]] = None,
target_sizes: Optional[List[Tuple[int, int]]] = None,
.\models\detr\modeling_detr.py
""" PyTorch DETR model."""
import math
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple, Union
import torch
from torch import Tensor, nn
from ...activations import ACT2FN
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithCrossAttentions, Seq2SeqModelOutput
from ...modeling_utils import PreTrainedModel
from ...utils import (
ModelOutput,
add_start_docstrings,
add_start_docstrings_to_model_forward,
is_accelerate_available,
is_scipy_available,
is_timm_available,
is_vision_available,
logging,
replace_return_docstrings,
requires_backends,
)
from ...utils.backbone_utils import load_backbone
from .configuration_detr import DetrConfig
if is_accelerate_available():
from accelerate import PartialState
from accelerate.utils import reduce
if is_scipy_available():
from scipy.optimize import linear_sum_assignment
if is_timm_available():
from timm import create_model
if is_vision_available():
from transformers.image_transforms import center_to_corners_format
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "DetrConfig"
_CHECKPOINT_FOR_DOC = "facebook/detr-resnet-50"
DETR_PRETRAINED_MODEL_ARCHIVE_LIST = [
"facebook/detr-resnet-50",
]
@dataclass
class DetrDecoderOutput(BaseModelOutputWithCrossAttentions):
"""
Base class for outputs of the DETR decoder. This class adds one attribute to BaseModelOutputWithCrossAttentions,
namely an optional stack of intermediate decoder activations, i.e. the output of each decoder layer, each of them
gone through a layernorm. This is useful when training the model with auxiliary decoding losses.
"""
intermediate_hidden_states: Optional[torch.FloatTensor] = None
@dataclass
class DetrModelOutput(Seq2SeqModelOutput):
"""
Base class for outputs of the DETR encoder-decoder model. This class adds one attribute to
`Seq2SeqModelOutput`, namely an optional stack of intermediate decoder activations, i.e. the output of each
decoder layer, each of them gone through a layernorm. This is useful when training the model with auxiliary decoding losses.
"""
intermediate_hidden_states: Optional[torch.FloatTensor] = None
@dataclass
class DetrObjectDetectionOutput(ModelOutput):
"""
Output type of [`DetrForObjectDetection`].
"""
loss: Optional[torch.FloatTensor] = None
loss_dict: Optional[Dict] = None
logits: torch.FloatTensor = None
pred_boxes: torch.FloatTensor = None
auxiliary_outputs: Optional[List[Dict]] = None
last_hidden_state: Optional[torch.FloatTensor] = None
decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
encoder_last_hidden_state: Optional[torch.FloatTensor] = None
encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
@dataclass
class DetrSegmentationOutput(ModelOutput):
"""
Output type of [`DetrForSegmentation`].
"""
loss: Optional[torch.FloatTensor] = None
loss_dict: Optional[Dict] = None
logits: torch.FloatTensor = None
pred_boxes: torch.FloatTensor = None
pred_masks: torch.FloatTensor = None
auxiliary_outputs: Optional[List[Dict]] = None
last_hidden_state: Optional[torch.FloatTensor] = None
decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
encoder_last_hidden_state: Optional[torch.FloatTensor] = None
encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
class DetrFrozenBatchNorm2d(nn.Module):
"""
BatchNorm2d where the batch statistics and the affine parameters are fixed.
Copy-paste from torchvision.misc.ops with an added eps before rsqrt, without which any model other than
torchvision.models.resnet[18,34,50,101] produces NaNs.
"""
def __init__(self, n):
super().__init__()
self.register_buffer("weight", torch.ones(n))
self.register_buffer("bias", torch.zeros(n))
self.register_buffer("running_mean", torch.zeros(n))
self.register_buffer("running_var", torch.ones(n))
def _load_from_state_dict(
self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
):
num_batches_tracked_key = prefix + "num_batches_tracked"
if num_batches_tracked_key in state_dict:
del state_dict[num_batches_tracked_key]
super()._load_from_state_dict(
state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
)
def forward(self, x):
weight = self.weight.reshape(1, -1, 1, 1)
bias = self.bias.reshape(1, -1, 1, 1)
running_var = self.running_var.reshape(1, -1, 1, 1)
running_mean = self.running_mean.reshape(1, -1, 1, 1)
epsilon = 1e-5
scale = weight * (running_var + epsilon).rsqrt()
bias = bias - running_mean * scale
return x * scale + bias
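# Example (not part of the original file): a quick sanity check that, with identical statistics and
# affine parameters, the frozen module above matches an eval-mode nn.BatchNorm2d (both use eps = 1e-5).
import torch
from torch import nn
bn = nn.BatchNorm2d(8).eval()
bn.running_mean.uniform_(-1, 1)
bn.running_var.uniform_(0.5, 2.0)
frozen = DetrFrozenBatchNorm2d(8)
frozen.weight.copy_(bn.weight.data)
frozen.bias.copy_(bn.bias.data)
frozen.running_mean.copy_(bn.running_mean)
frozen.running_var.copy_(bn.running_var)
x = torch.randn(2, 8, 4, 4)
print(torch.allclose(frozen(x), bn(x), atol=1e-6))  # True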
def replace_batch_norm(model):
r"""
Recursively replace all `torch.nn.BatchNorm2d` with `DetrFrozenBatchNorm2d`.
Args:
model (torch.nn.Module):
input model
"""
for name, module in model.named_children():
if isinstance(module, nn.BatchNorm2d):
new_module = DetrFrozenBatchNorm2d(module.num_features)
if not module.weight.device == torch.device("meta"):
new_module.weight.data.copy_(module.weight)
new_module.bias.data.copy_(module.bias)
new_module.running_mean.data.copy_(module.running_mean)
new_module.running_var.data.copy_(module.running_var)
model._modules[name] = new_module
if len(list(module.children())) > 0:
replace_batch_norm(module)
class DetrConvEncoder(nn.Module):
"""
Convolutional backbone, using either the AutoBackbone API or one from the timm library.
nn.BatchNorm2d layers are replaced by DetrFrozenBatchNorm2d as defined above.
"""
def __init__(self, config):
super().__init__()
self.config = config
if config.use_timm_backbone:
requires_backends(self, ["timm"])
kwargs = {}
if config.dilation:
kwargs["output_stride"] = 16
backbone = create_model(
config.backbone,
pretrained=config.use_pretrained_backbone,
features_only=True,
out_indices=(1, 2, 3, 4),
in_chans=config.num_channels,
**kwargs,
)
else:
backbone = load_backbone(config)
with torch.no_grad():
replace_batch_norm(backbone)
self.model = backbone
self.intermediate_channel_sizes = (
self.model.feature_info.channels() if config.use_timm_backbone else self.model.channels
)
backbone_model_type = config.backbone if config.use_timm_backbone else config.backbone_config.model_type
if "resnet" in backbone_model_type:
for name, parameter in self.model.named_parameters():
if config.use_timm_backbone:
if "layer2" not in name and "layer3" not in name and "layer4" not in name:
parameter.requires_grad_(False)
else:
if "stage.1" not in name and "stage.2" not in name and "stage.3" not in name:
parameter.requires_grad_(False)
def forward(self, pixel_values: torch.Tensor, pixel_mask: torch.Tensor):
features = self.model(pixel_values) if self.config.use_timm_backbone else self.model(pixel_values).feature_maps
out = []
for feature_map in features:
mask = nn.functional.interpolate(pixel_mask[None].float(), size=feature_map.shape[-2:]).to(torch.bool)[0]
out.append((feature_map, mask))
return out
class DetrConvModel(nn.Module):
"""
This module adds 2D position embeddings to all intermediate feature maps of the convolutional encoder.
"""
def __init__(self, conv_encoder, position_embedding):
super().__init__()
self.conv_encoder = conv_encoder
self.position_embedding = position_embedding
def forward(self, pixel_values, pixel_mask):
out = self.conv_encoder(pixel_values, pixel_mask)
pos = []
for feature_map, mask in out:
pos.append(self.position_embedding(feature_map, mask).to(feature_map.dtype))
return out, pos
class DetrSinePositionEmbedding(nn.Module):
"""
This is a more standard version of the position embedding, very similar to the one used by the Attention is all you
need paper, generalized to work on images.
"""
def __init__(self, embedding_dim=64, temperature=10000, normalize=False, scale=None):
super().__init__()
self.embedding_dim = embedding_dim
self.temperature = temperature
self.normalize = normalize
if scale is not None and normalize is False:
raise ValueError("normalize should be True if scale is passed")
if scale is None:
scale = 2 * math.pi
self.scale = scale
def forward(self, pixel_values, pixel_mask):
if pixel_mask is None:
raise ValueError("No pixel mask provided")
y_embed = pixel_mask.cumsum(1, dtype=torch.float32)
x_embed = pixel_mask.cumsum(2, dtype=torch.float32)
if self.normalize:
y_embed = y_embed / (y_embed[:, -1:, :] + 1e-6) * self.scale
x_embed = x_embed / (x_embed[:, :, -1:] + 1e-6) * self.scale
dim_t = torch.arange(self.embedding_dim, dtype=torch.int64, device=pixel_values.device).float()
dim_t = self.temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / self.embedding_dim)
pos_x = x_embed[:, :, :, None] / dim_t
pos_y = y_embed[:, :, :, None] / dim_t
pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3)
pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3)
pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
return pos
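# Example (not part of the original file): a shape sketch. With d_model = 256, build_position_encoding
# below creates this module with embedding_dim = 128, and the output concatenates 128 y-channels and
# 128 x-channels into a (batch_size, 256, height, width) map. Sizes are illustrative.
import torch
position_embedding = DetrSinePositionEmbedding(embedding_dim=128, normalize=True)
pixel_values = torch.randn(2, 3, 32, 40)
pixel_mask = torch.ones(2, 32, 40, dtype=torch.long)
print(position_embedding(pixel_values, pixel_mask).shape)  # torch.Size([2, 256, 32, 40])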
class DetrLearnedPositionEmbedding(nn.Module):
"""
This module learns positional embeddings up to a fixed maximum size.
"""
def __init__(self, embedding_dim=256):
super().__init__()
self.row_embeddings = nn.Embedding(50, embedding_dim)
self.column_embeddings = nn.Embedding(50, embedding_dim)
def forward(self, pixel_values, pixel_mask=None):
height, width = pixel_values.shape[-2:]
width_values = torch.arange(width, device=pixel_values.device)
height_values = torch.arange(height, device=pixel_values.device)
x_emb = self.column_embeddings(width_values)
y_emb = self.row_embeddings(height_values)
pos = torch.cat([x_emb.unsqueeze(0).repeat(height, 1, 1), y_emb.unsqueeze(1).repeat(1, width, 1)], dim=-1)
pos = pos.permute(2, 0, 1)
pos = pos.unsqueeze(0)
pos = pos.repeat(pixel_values.shape[0], 1, 1, 1)
return pos
def build_position_encoding(config):
n_steps = config.d_model // 2
if config.position_embedding_type == "sine":
position_embedding = DetrSinePositionEmbedding(n_steps, normalize=True)
elif config.position_embedding_type == "learned":
position_embedding = DetrLearnedPositionEmbedding(n_steps)
else:
raise ValueError(f"Not supported {config.position_embedding_type}")
return position_embedding
class DetrAttention(nn.Module):
"""
Multi-headed attention from 'Attention Is All You Need' paper.
Here, we add position embeddings to the queries and keys (as explained in the DETR paper).
"""
def __init__(
self,
embed_dim: int,
num_heads: int,
dropout: float = 0.0,
bias: bool = True,
):
super().__init__()
self.embed_dim = embed_dim
self.num_heads = num_heads
self.dropout = dropout
self.head_dim = embed_dim // num_heads
if self.head_dim * num_heads != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
f" {num_heads})."
)
self.scaling = self.head_dim**-0.5
self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
def _shape(self, tensor: torch.Tensor, seq_len: int, batch_size: int):
return tensor.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
def with_pos_embed(self, tensor: torch.Tensor, object_queries: Optional[Tensor], **kwargs):
position_embeddings = kwargs.pop("position_embeddings", None)
if kwargs:
raise ValueError(f"Unexpected arguments {kwargs.keys()}")
if position_embeddings is not None and object_queries is not None:
raise ValueError(
"Cannot specify both position_embeddings and object_queries. Please use just object_queries"
)
if position_embeddings is not None:
logger.warning_once(
"position_embeddings has been deprecated and will be removed in v4.34. Please use object_queries instead"
)
object_queries = position_embeddings
return tensor if object_queries is None else tensor + object_queries
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
object_queries: Optional[torch.Tensor] = None,
key_value_states: Optional[torch.Tensor] = None,
spatial_position_embeddings: Optional[torch.Tensor] = None,
output_attentions: bool = False,
**kwargs,
):
class DetrEncoderLayer(nn.Module):
def __init__(self, config: DetrConfig):
super().__init__()
self.embed_dim = config.d_model
self.self_attn = DetrAttention(
embed_dim=self.embed_dim,
num_heads=config.encoder_attention_heads,
dropout=config.attention_dropout,
)
self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
self.dropout = config.dropout
self.activation_fn = ACT2FN[config.activation_function]
self.activation_dropout = config.activation_dropout
self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
self.final_layer_norm = nn.LayerNorm(self.embed_dim)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: torch.Tensor,
object_queries: torch.Tensor = None,
output_attentions: bool = False,
**kwargs,
):
"""
Args:
hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
attention_mask (`torch.FloatTensor`): attention mask of size
`(batch, 1, target_len, source_len)` where padding elements are indicated by very large negative
values.
object_queries (`torch.FloatTensor`, *optional*):
Object queries (also called content embeddings), to be added to the hidden states.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
"""
position_embeddings = kwargs.pop("position_embeddings", None)
if kwargs:
raise ValueError(f"Unexpected arguments {kwargs.keys()}")
if position_embeddings is not None and object_queries is not None:
raise ValueError(
"Cannot specify both position_embeddings and object_queries. Please use just object_queries"
)
if position_embeddings is not None:
logger.warning_once(
"position_embeddings has been deprecated and will be removed in v4.34. Please use object_queries instead"
)
object_queries = position_embeddings
residual = hidden_states
hidden_states, attn_weights = self.self_attn(
hidden_states=hidden_states,
attention_mask=attention_mask,
object_queries=object_queries,
output_attentions=output_attentions,
)
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
hidden_states = residual + hidden_states
hidden_states = self.self_attn_layer_norm(hidden_states)
residual = hidden_states
hidden_states = self.activation_fn(self.fc1(hidden_states))
hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
hidden_states = self.fc2(hidden_states)
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
hidden_states = residual + hidden_states
hidden_states = self.final_layer_norm(hidden_states)
if self.training:
if torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any():
clamp_value = torch.finfo(hidden_states.dtype).max - 1000
hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
outputs = (hidden_states,)
if output_attentions:
outputs += (attn_weights,)
return outputs
class DetrDecoderLayer(nn.Module):
def __init__(self, config: DetrConfig):
super().__init__()
self.embed_dim = config.d_model
self.self_attn = DetrAttention(
embed_dim=self.embed_dim,
num_heads=config.decoder_attention_heads,
dropout=config.attention_dropout,
)
self.dropout = config.dropout
self.activation_fn = ACT2FN[config.activation_function]
self.activation_dropout = config.activation_dropout
self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
self.encoder_attn = DetrAttention(
self.embed_dim,
config.decoder_attention_heads,
dropout=config.attention_dropout,
)
self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)
self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)
self.final_layer_norm = nn.LayerNorm(self.embed_dim)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
object_queries: Optional[torch.Tensor] = None,
query_position_embeddings: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = False,
**kwargs,
class DetrClassificationHead(nn.Module):
"""用于句子级分类任务的头部模块。"""
def __init__(self, input_dim: int, inner_dim: int, num_classes: int, pooler_dropout: float):
super().__init__()
self.dense = nn.Linear(input_dim, inner_dim)
self.dropout = nn.Dropout(p=pooler_dropout)
self.out_proj = nn.Linear(inner_dim, num_classes)
def forward(self, hidden_states: torch.Tensor):
hidden_states = self.dropout(hidden_states)
hidden_states = self.dense(hidden_states)
hidden_states = torch.tanh(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.out_proj(hidden_states)
return hidden_states
class DetrPreTrainedModel(PreTrainedModel):
config_class = DetrConfig
base_model_prefix = "model"
main_input_name = "pixel_values"
_no_split_modules = [r"DetrConvEncoder", r"DetrEncoderLayer", r"DetrDecoderLayer"]
def _init_weights(self, module):
std = self.config.init_std
xavier_std = self.config.init_xavier_std
if isinstance(module, DetrMHAttentionMap):
nn.init.zeros_(module.k_linear.bias)
nn.init.zeros_(module.q_linear.bias)
nn.init.xavier_uniform_(module.k_linear.weight, gain=xavier_std)
nn.init.xavier_uniform_(module.q_linear.weight, gain=xavier_std)
elif isinstance(module, DetrLearnedPositionEmbedding):
nn.init.uniform_(module.row_embeddings.weight)
nn.init.uniform_(module.column_embeddings.weight)
if isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)):
module.weight.data.normal_(mean=0.0, std=std)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=std)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
DETR_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`DetrConfig`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
DETR_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Padding will be ignored by default should you provide it.
pixel_mask (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
Mask to avoid performing attention on padding pixel values. Mask values selected in `[0, 1]`:
- 1 for pixels that are real (i.e. **not masked**),
- 0 for pixels that are padding (i.e. **masked**).
[What are attention masks?](../glossary#attention-mask)
decoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*):
Not used by default. Can be used to mask object queries.
encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*):
Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`)
`last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of
hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you
can choose to directly pass a flattened representation of an image.
decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
Optionally, instead of initializing the queries with a tensor of zeros, you can choose to directly pass an
embedded representation.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
class DetrEncoder(DetrPreTrainedModel):
"""
Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
[`DetrEncoderLayer`].
The encoder updates the flattened feature map through multiple self-attention layers.
Small tweak for DETR:
- object_queries are added to the forward pass.
Args:
config: DetrConfig
"""
def __init__(self, config: DetrConfig):
super().__init__(config)
self.dropout = config.dropout  # dropout rate from the config
self.layerdrop = config.encoder_layerdrop  # LayerDrop probability for the encoder layers
self.layers = nn.ModuleList([DetrEncoderLayer(config) for _ in range(config.encoder_layers)])  # stack of encoder layers
# in the original DETR, no layernorm is used at the end of the encoder, as "normalize_before" is set to False by default
# Initialize weights and apply final processing
self.post_init()
def forward(
self,
inputs_embeds=None,
attention_mask=None,
object_queries=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
**kwargs,
class DetrDecoder(DetrPreTrainedModel):
"""
Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`DetrDecoderLayer`].
The decoder updates the query embeddings through multiple self-attention and cross-attention layers.
Some small tweaks for DETR:
- object_queries and query_position_embeddings are added to the forward pass.
- if self.config.auxiliary_loss is set to True, also returns a stack of activations from all decoding layers.
Args:
config: DetrConfig
"""
def __init__(self, config: DetrConfig):
super().__init__(config)
self.dropout = config.dropout  # dropout rate from the config
self.layerdrop = config.decoder_layerdrop  # LayerDrop probability for the decoder layers
self.layers = nn.ModuleList([DetrDecoderLayer(config) for _ in range(config.decoder_layers)])  # stack of decoder layers
# in DETR, the decoder uses layernorm after the last decoder layer output
self.layernorm = nn.LayerNorm(config.d_model)
self.gradient_checkpointing = False  # gradient checkpointing is disabled by default
# Initialize weights and apply final processing
self.post_init()
def forward(
self,
inputs_embeds=None,
attention_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
object_queries=None,
query_position_embeddings=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
**kwargs,
@add_start_docstrings(
"""
The bare DETR Model (consisting of a backbone and encoder-decoder Transformer) outputting raw hidden-states without
any specific head on top.
""",
DETR_START_DOCSTRING,
)
class DetrModel(DetrPreTrainedModel):
"""
DETR模型,包括骨干和编码器-解码器Transformer,输出没有特定头部的原始隐藏状态。
"""
# 初始化函数,接受一个DetrConfig类型的配置对象作为参数
def __init__(self, config: DetrConfig):
# 调用父类的初始化方法,传入配置对象
super().__init__(config)
# 创建backbone和位置编码
backbone = DetrConvEncoder(config)
object_queries = build_position_encoding(config)
# 使用创建的backbone和位置编码创建DetrConvModel对象,并赋给self.backbone属性
self.backbone = DetrConvModel(backbone, object_queries)
# 创建投影层,使用nn.Conv2d进行初始化
self.input_projection = nn.Conv2d(backbone.intermediate_channel_sizes[-1], config.d_model, kernel_size=1)
# 创建查询位置嵌入层,使用nn.Embedding进行初始化
self.query_position_embeddings = nn.Embedding(config.num_queries, config.d_model)
# 创建编码器和解码器对象
self.encoder = DetrEncoder(config)
self.decoder = DetrDecoder(config)
# 初始化权重并进行最终处理
self.post_init()
# 返回编码器对象
def get_encoder(self):
return self.encoder
# 返回解码器对象
def get_decoder(self):
return self.decoder
# 冻结backbone的参数,使其不可训练
def freeze_backbone(self):
# 遍历backbone的模型参数,并设置requires_grad为False
for name, param in self.backbone.conv_encoder.model.named_parameters():
param.requires_grad_(False)
# 解冻backbone的参数,使其可训练
def unfreeze_backbone(self):
# 遍历backbone的模型参数,并设置requires_grad为True
for name, param in self.backbone.conv_encoder.model.named_parameters():
param.requires_grad_(True)
# 前向传播函数,根据DETR的输入文档字符串进行注释
@add_start_docstrings_to_model_forward(DETR_INPUTS_DOCSTRING)
# 替换返回的文档字符串类型为DetrModelOutput,并使用_CONFIG_FOR_DOC作为配置类
@replace_return_docstrings(output_type=DetrModelOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
pixel_values: torch.FloatTensor,
pixel_mask: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.FloatTensor] = None,
encoder_outputs: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
"""
DETR Model (consisting of a backbone and encoder-decoder Transformer) with object detection heads on top, for tasks
such as COCO detection.
"""
# 导入所需模块和函数
@add_start_docstrings(
"""
DETR Model (consisting of a backbone and encoder-decoder Transformer) with a segmentation head on top, for tasks
such as COCO panoptic.
""",
DETR_START_DOCSTRING,
)
# DETR model with a segmentation head on top, e.g. for COCO panoptic segmentation
class DetrForSegmentation(DetrPreTrainedModel):
def __init__(self, config: DetrConfig):
super().__init__(config)
# The object detection model the segmentation head is built on
self.detr = DetrForObjectDetection(config)
# Segmentation head
# Hidden size and number of attention heads from the config
hidden_size, number_of_heads = config.d_model, config.encoder_attention_heads
# Channel sizes of the intermediate backbone feature maps
intermediate_channel_sizes = self.detr.model.backbone.conv_encoder.intermediate_channel_sizes
# Small convolutional mask head
self.mask_head = DetrMaskHeadSmallConv(
hidden_size + number_of_heads, intermediate_channel_sizes[::-1][-3:], hidden_size
)
# Multi-head attention map used to attend from each query to the feature map
self.bbox_attention = DetrMHAttentionMap(
hidden_size, hidden_size, number_of_heads, dropout=0.0, std=config.init_xavier_std
)
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(DETR_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=DetrSegmentationOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
pixel_values: torch.FloatTensor,
pixel_mask: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.FloatTensor] = None,
encoder_outputs: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[List[dict]] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
# `_expand` inserts a new dimension after the batch dimension, repeats the tensor `length` times along it, and flattens the two leading dimensions.
def _expand(tensor, length: int):
return tensor.unsqueeze(1).repeat(1, int(length), 1, 1, 1).flatten(0, 1)
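# Example (not part of the original file): a shape sketch of `_expand`, which repeats the projected
# feature map once per object query so it can be concatenated with the per-query attention maps.
import torch
feature_map = torch.randn(2, 256, 25, 34)  # (batch_size, d_model, height/32, width/32)
print(_expand(feature_map, 100).shape)  # torch.Size([200, 256, 25, 34])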
# Taken from https://github.com/facebookresearch/detr/blob/master/models/segmentation.py
# DetrMaskHeadSmallConv is a simple convolutional mask head with group norm; upsampling follows an FPN-style approach.
class DetrMaskHeadSmallConv(nn.Module):
"""
Simple convolutional head, using group norm. Upsampling is done using a FPN approach
"""
def __init__(self, dim, fpn_dims, context_dim):
super().__init__()
# GroupNorm is configured with 8 groups, so `dim` must be divisible by 8
if dim % 8 != 0:
raise ValueError(
"The hidden_size + number of attention heads must be divisible by 8 as the number of groups in"
" GroupNorm is set to 8"
)
# Intermediate channel sizes: dim, context_dim // 2, context_dim // 4, context_dim // 8, context_dim // 16, context_dim // 64
inter_dims = [dim, context_dim // 2, context_dim // 4, context_dim // 8, context_dim // 16, context_dim // 64]
# Convolution and group-normalization layers
self.lay1 = nn.Conv2d(dim, dim, 3, padding=1)
self.gn1 = nn.GroupNorm(8, dim)
self.lay2 = nn.Conv2d(dim, inter_dims[1], 3, padding=1)
self.gn2 = nn.GroupNorm(min(8, inter_dims[1]), inter_dims[1])
self.lay3 = nn.Conv2d(inter_dims[1], inter_dims[2], 3, padding=1)
self.gn3 = nn.GroupNorm(min(8, inter_dims[2]), inter_dims[2])
self.lay4 = nn.Conv2d(inter_dims[2], inter_dims[3], 3, padding=1)
self.gn4 = nn.GroupNorm(min(8, inter_dims[3]), inter_dims[3])
self.lay5 = nn.Conv2d(inter_dims[3], inter_dims[4], 3, padding=1)
self.gn5 = nn.GroupNorm(min(8, inter_dims[4]), inter_dims[4])
self.out_lay = nn.Conv2d(inter_dims[4], 1, 3, padding=1)
# Keep `dim` as an attribute
self.dim = dim
# Adapter layers projecting the FPN feature maps to the matching number of channels
self.adapter1 = nn.Conv2d(fpn_dims[0], inter_dims[1], 1)
self.adapter2 = nn.Conv2d(fpn_dims[1], inter_dims[2], 1)
self.adapter3 = nn.Conv2d(fpn_dims[2], inter_dims[3], 1)
# Initialize all convolutions with Kaiming-uniform weights and zero biases
for m in self.modules():
if isinstance(m, nn.Conv2d):
nn.init.kaiming_uniform_(m.weight, a=1)
nn.init.constant_(m.bias, 0)
# Forward pass: x is the projected feature map, bbox_mask the per-query attention maps, fpns the FPN feature maps
def forward(self, x: Tensor, bbox_mask: Tensor, fpns: List[Tensor]):
# Concatenate x (projected feature map of shape (batch_size, d_model, height/32, width/32)) with
# bbox_mask (attention maps of shape (batch_size, n_queries, n_heads, height/32, width/32)), expanding x once per query
x = torch.cat([_expand(x, bbox_mask.shape[1]), bbox_mask.flatten(0, 1)], 1)
# First conv + group-norm + ReLU block
x = self.lay1(x)
x = self.gn1(x)
x = nn.functional.relu(x)
# Second conv + group-norm + ReLU block
x = self.lay2(x)
x = self.gn2(x)
x = nn.functional.relu(x)
# Adapt the first FPN feature map, expand it to the batch size of x if needed,
# and add it to the upsampled x
cur_fpn = self.adapter1(fpns[0])
if cur_fpn.size(0) != x.size(0):
cur_fpn = _expand(cur_fpn, x.size(0) // cur_fpn.size(0))
x = cur_fpn + nn.functional.interpolate(x, size=cur_fpn.shape[-2:], mode="nearest")
# Third conv + group-norm + ReLU block
x = self.lay3(x)
x = self.gn3(x)
x = nn.functional.relu(x)
# Same FPN fusion with the second feature map
cur_fpn = self.adapter2(fpns[1])
if cur_fpn.size(0) != x.size(0):
cur_fpn = _expand(cur_fpn, x.size(0) // cur_fpn.size(0))
x = cur_fpn + nn.functional.interpolate(x, size=cur_fpn.shape[-2:], mode="nearest")
# Fourth conv + group-norm + ReLU block
x = self.lay4(x)
x = self.gn4(x)
x = nn.functional.relu(x)
# Same FPN fusion with the third feature map
cur_fpn = self.adapter3(fpns[2])
if cur_fpn.size(0) != x.size(0):
cur_fpn = _expand(cur_fpn, x.size(0) // cur_fpn.size(0))
x = cur_fpn + nn.functional.interpolate(x, size=cur_fpn.shape[-2:], mode="nearest")
# Fifth conv + group-norm + ReLU block, then the 1-channel output convolution
x = self.lay5(x)
x = self.gn5(x)
x = nn.functional.relu(x)
x = self.out_lay(x)
# Return the final mask logits
return x
class DetrMHAttentionMap(nn.Module):
"""This is a 2D attention module, which only returns the attention softmax (no multiplication by value)"""
def __init__(self, query_dim, hidden_dim, num_heads, dropout=0.0, bias=True, std=None):
super().__init__()
self.num_heads = num_heads
self.hidden_dim = hidden_dim
self.dropout = nn.Dropout(dropout)
# Linear transformation for queries
self.q_linear = nn.Linear(query_dim, hidden_dim, bias=bias)
# Linear transformation for keys
self.k_linear = nn.Linear(query_dim, hidden_dim, bias=bias)
# Normalization factor for scaling dot products in attention calculation
self.normalize_fact = float(hidden_dim / self.num_heads) ** -0.5
def forward(self, q, k, mask: Optional[Tensor] = None):
# Linear transformation of queries
q = self.q_linear(q)
# Convolutional transformation of keys
k = nn.functional.conv2d(k, self.k_linear.weight.unsqueeze(-1).unsqueeze(-1), self.k_linear.bias)
# Reshape queries and keys for multi-head attention computation
queries_per_head = q.view(q.shape[0], q.shape[1], self.num_heads, self.hidden_dim // self.num_heads)
keys_per_head = k.view(k.shape[0], self.num_heads, self.hidden_dim // self.num_heads, k.shape[-2], k.shape[-1])
# Compute scaled dot-product attention scores
weights = torch.einsum("bqnc,bnchw->bqnhw", queries_per_head * self.normalize_fact, keys_per_head)
# Apply mask to attention weights if provided
if mask is not None:
weights.masked_fill_(mask.unsqueeze(1).unsqueeze(1), torch.finfo(weights.dtype).min)
# Apply softmax to obtain attention distributions
weights = nn.functional.softmax(weights.flatten(2), dim=-1).view(weights.size())
# Apply dropout
weights = self.dropout(weights)
return weights
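# Example (not part of the original file): a shape sketch of the attention-map module above. It takes the
# decoder output for each query and the projected feature map, and returns one heat-map per query and head.
import torch
attention_map = DetrMHAttentionMap(query_dim=256, hidden_dim=256, num_heads=8)
q = torch.randn(2, 100, 256)  # decoder hidden states: (batch_size, num_queries, d_model)
k = torch.randn(2, 256, 25, 34)  # projected feature map: (batch_size, d_model, height/32, width/32)
print(attention_map(q, k).shape)  # torch.Size([2, 100, 8, 25, 34])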
def dice_loss(inputs, targets, num_boxes):
"""
Compute the DICE loss, similar to generalized IOU for masks
Args:
inputs: A float tensor of arbitrary shape.
The predictions for each example.
targets: A float tensor with the same shape as inputs. Stores the binary
classification label for each element in inputs (0 for the negative class and 1 for the positive
class).
"""
# Apply sigmoid function to inputs
inputs = inputs.sigmoid()
# Flatten the inputs
inputs = inputs.flatten(1)
# Compute numerator of DICE coefficient
numerator = 2 * (inputs * targets).sum(1)
# Compute denominator of DICE coefficient
denominator = inputs.sum(-1) + targets.sum(-1)
# Compute DICE loss
loss = 1 - (numerator + 1) / (denominator + 1)
return loss.sum() / num_boxes
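# Example (not part of the original file): a tiny numeric check of the DICE loss. Confident logits on
# the correct pixels drive the loss towards 0; fully wrong logits give roughly 1 - 1/5 = 0.8 here.
import torch
targets = torch.tensor([[1.0, 0.0, 1.0, 0.0]])
good_logits = torch.tensor([[10.0, -10.0, 10.0, -10.0]])
print(dice_loss(good_logits, targets, num_boxes=1))  # ~0.0
print(dice_loss(-good_logits, targets, num_boxes=1))  # ~0.8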
def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2):
"""
Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002.
Args:
inputs (`torch.FloatTensor` of arbitrary shape):
The predictions for each example.
targets (`torch.FloatTensor` with the same shape as `inputs`)
A tensor storing the binary classification label for each element in the `inputs` (0 for the negative class
and 1 for the positive class).
alpha (`float`, *optional*, defaults to `0.25`):
Optional weighting factor in the range (0,1) to balance positive vs. negative examples.
gamma (`int`, *optional*, defaults to `2`):
Exponent of the modulating factor (1 - p_t) to balance easy vs hard examples.
Returns:
Loss tensor
"""
# Sigmoid of the logits, giving probabilities in (0, 1)
prob = inputs.sigmoid()
# Per-element binary cross-entropy, kept unreduced
ce_loss = nn.functional.binary_cross_entropy_with_logits(inputs, targets, reduction="none")
# Modulating factor p_t: the probability assigned to the true class
p_t = prob * targets + (1 - prob) * (1 - targets)
# Down-weight easy examples with the focal term (1 - p_t) ** gamma
loss = ce_loss * ((1 - p_t) ** gamma)
# Optionally re-balance positives and negatives with alpha
if alpha >= 0:
alpha_t = alpha * targets + (1 - alpha) * (1 - targets)
loss = alpha_t * loss
# Average over elements per prediction, sum over predictions, then normalize by num_boxes
return loss.mean(1).sum() / num_boxes
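# Example (not part of the original file): with gamma = 0 and alpha disabled (alpha < 0), the focal
# loss reduces to plain binary cross-entropy averaged per prediction and divided by num_boxes.
import torch
from torch import nn
inputs = torch.randn(3, 5)
targets = torch.randint(0, 2, (3, 5)).float()
focal = sigmoid_focal_loss(inputs, targets, num_boxes=3, alpha=-1, gamma=0)
bce = nn.functional.binary_cross_entropy_with_logits(inputs, targets, reduction="none").mean(1).sum() / 3
print(torch.allclose(focal, bce))  # True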
"""
This class computes the losses for DetrForObjectDetection/DetrForSegmentation. The process happens in two steps: 1)
we compute hungarian assignment between ground truth boxes and the outputs of the model 2) we supervise each pair
of matched ground-truth / prediction (supervise class and box).
A note on the `num_classes` argument (copied from original repo in detr.py): "the naming of the `num_classes`
parameter of the criterion is somewhat misleading. It indeed corresponds to `max_obj_id` + 1, where `max_obj_id` is
the maximum id for a class in your dataset. For example, COCO has a `max_obj_id` of 90, so we pass `num_classes` to
be 91. As another example, for a dataset that has a single class with `id` 1, you should pass `num_classes` to be 2
(`max_obj_id` + 1). For more details on this, check the following discussion
https://github.com/facebookresearch/detr/issues/108#issuecomment-650269223"
Args:
matcher (`DetrHungarianMatcher`):
Module able to compute a matching between targets and proposals.
num_classes (`int`):
Number of object categories, omitting the special no-object category.
eos_coef (`float`):
Relative classification weight applied to the no-object category.
losses (`List[str]`):
List of all the losses to be applied. See `get_loss` for a list of all available losses.
"""
def __init__(self, matcher, num_classes, eos_coef, losses):
super().__init__()
self.matcher = matcher  # module computing the matching between targets and predictions
self.num_classes = num_classes  # number of object classes, without the special no-object class
self.eos_coef = eos_coef  # relative classification weight of the no-object class
self.losses = losses  # list of losses to apply
# Class weights for the cross-entropy; the last entry is the no-object weight
empty_weight = torch.ones(self.num_classes + 1)
empty_weight[-1] = self.eos_coef
self.register_buffer("empty_weight", empty_weight)
# removed logging parameter, which was part of the original implementation
def loss_labels(self, outputs, targets, indices, num_boxes):
"""
Classification loss (NLL) targets dicts must contain the key "class_labels" containing a tensor of dim
[nb_target_boxes]
"""
if "logits" not in outputs:
raise KeyError("No logits were found in the outputs")
source_logits = outputs["logits"]  # classification logits of the model
idx = self._get_source_permutation_idx(indices)  # (batch, query) indices of the matched predictions
target_classes_o = torch.cat([t["class_labels"][J] for t, (_, J) in zip(targets, indices)])  # matched target classes
target_classes = torch.full(
source_logits.shape[:2], self.num_classes, dtype=torch.int64, device=source_logits.device
)
target_classes[idx] = target_classes_o  # unmatched queries keep the no-object class
# Cross-entropy over classes, with the no-object class down-weighted by `empty_weight`
loss_ce = nn.functional.cross_entropy(source_logits.transpose(1, 2), target_classes, self.empty_weight)
losses = {"loss_ce": loss_ce}
return losses
@torch.no_grad()
def loss_cardinality(self, outputs, targets, indices, num_boxes):
"""
Compute the cardinality error, i.e. the absolute error in the number of predicted non-empty boxes.
This is not really a loss, it is intended for logging purposes only. It doesn't propagate gradients.
"""
# Classification logits of the model
logits = outputs["logits"]
device = logits.device
# Number of target boxes per image
target_lengths = torch.as_tensor([len(v["class_labels"]) for v in targets], device=device)
# Count predictions that are not the no-object class (the last class index)
card_pred = (logits.argmax(-1) != logits.shape[-1] - 1).sum(1)
# Absolute error between predicted and true number of boxes
card_err = nn.functional.l1_loss(card_pred.float(), target_lengths.float())
losses = {"cardinality_error": card_err}
return losses
def loss_boxes(self, outputs, targets, indices, num_boxes):
"""
Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss.
Targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4]. The target boxes
are expected in format (center_x, center_y, w, h), normalized by the image size.
"""
# Predicted boxes must be present in the outputs
if "pred_boxes" not in outputs:
raise KeyError("No predicted boxes found in outputs")
# (batch, query) indices of the matched predictions
idx = self._get_source_permutation_idx(indices)
# Matched predicted boxes and their target boxes
source_boxes = outputs["pred_boxes"][idx]
target_boxes = torch.cat([t["boxes"][i] for t, (_, i) in zip(targets, indices)], dim=0)
# L1 regression loss on the box coordinates
loss_bbox = nn.functional.l1_loss(source_boxes, target_boxes, reduction="none")
losses = {}
losses["loss_bbox"] = loss_bbox.sum() / num_boxes
# Generalized IoU loss between matched boxes, converted to corner format
loss_giou = 1 - torch.diag(
generalized_box_iou(center_to_corners_format(source_boxes), center_to_corners_format(target_boxes))
)
losses["loss_giou"] = loss_giou.sum() / num_boxes
return losses
def loss_masks(self, outputs, targets, indices, num_boxes):
"""
Compute the losses related to the masks: the focal loss and the dice loss.
Targets dicts must contain the key "masks" containing a tensor of dim [nb_target_boxes, h, w].
"""
# Predicted masks must be present in the outputs
if "pred_masks" not in outputs:
raise KeyError("No predicted masks found in outputs")
# Indices of the matched predictions and of the matched targets
source_idx = self._get_source_permutation_idx(indices)
target_idx = self._get_target_permutation_idx(indices)
# Select the predicted masks that were matched
source_masks = outputs["pred_masks"]
source_masks = source_masks[source_idx]
# Collect the target masks and pad them into a single tensor (with a validity mask)
masks = [t["masks"] for t in targets]
target_masks, valid = nested_tensor_from_tensor_list(masks).decompose()
target_masks = target_masks.to(source_masks)
target_masks = target_masks[target_idx]
# Upsample the predicted masks to the target mask size
source_masks = nn.functional.interpolate(
source_masks[:, None], size=target_masks.shape[-2:], mode="bilinear", align_corners=False
)
# Flatten predictions and targets to (num_masks, num_pixels)
source_masks = source_masks[:, 0].flatten(1)
target_masks = target_masks.flatten(1)
target_masks = target_masks.view(source_masks.shape)
# Focal loss and DICE loss on the masks
losses = {
"loss_mask": sigmoid_focal_loss(source_masks, target_masks, num_boxes),
"loss_dice": dice_loss(source_masks, target_masks, num_boxes),
}
return losses
def _get_source_permutation_idx(self, indices):
# Permute the predictions following `indices`; returns (batch indices, source/query indices)
batch_idx = torch.cat([torch.full_like(source, i) for i, (source, _) in enumerate(indices)])
source_idx = torch.cat([source for (source, _) in indices])
return batch_idx, source_idx
def _get_target_permutation_idx(self, indices):
# Permute the targets following `indices`; returns (batch indices, target indices)
batch_idx = torch.cat([torch.full_like(target, i) for i, (_, target) in enumerate(indices)])
target_idx = torch.cat([target for (_, target) in indices])
return batch_idx, target_idx
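# Example (not part of the original file): how the matcher's per-image (prediction, target) index pairs
# are flattened into (batch_idx, source_idx) tensors that can index (batch_size, num_queries, ...) tensors.
import torch
indices = [
    (torch.tensor([7, 42]), torch.tensor([1, 0])),  # image 0: queries 7 and 42 matched to targets 1 and 0
    (torch.tensor([3]), torch.tensor([0])),  # image 1: query 3 matched to target 0
]
batch_idx = torch.cat([torch.full_like(source, i) for i, (source, _) in enumerate(indices)])
source_idx = torch.cat([source for (source, _) in indices])
print(batch_idx.tolist(), source_idx.tolist())  # [0, 0, 1] [7, 42, 3]
pred_boxes = torch.randn(2, 100, 4)
print(pred_boxes[batch_idx, source_idx].shape)  # the 3 matched predictions: torch.Size([3, 4])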
def get_loss(self, loss, outputs, targets, indices, num_boxes):
# Map from loss name to the method computing it
loss_map = {
"labels": self.loss_labels,
"cardinality": self.loss_cardinality,
"boxes": self.loss_boxes,
"masks": self.loss_masks,
}
# The requested loss must be one of the supported ones
if loss not in loss_map:
raise ValueError(f"Loss {loss} not supported")
# Dispatch to the corresponding loss method
return loss_map[loss](outputs, targets, indices, num_boxes)
def forward(self, outputs, targets):
"""
This performs the loss computation.
Args:
outputs (`dict`, *optional*):
Dictionary of tensors, see the output specification of the model for the format.
targets (`List[dict]`, *optional*):
List of dicts, such that `len(targets) == batch_size`. The expected keys in each dict depends on the
losses applied, see each loss' doc.
"""
# Exclude auxiliary outputs from the outputs dictionary
outputs_without_aux = {k: v for k, v in outputs.items() if k != "auxiliary_outputs"}
# Retrieve indices that match outputs of the last layer with targets
indices = self.matcher(outputs_without_aux, targets)
# Compute the total number of target boxes across all samples for normalization
num_boxes = sum(len(t["class_labels"]) for t in targets)
# Convert num_boxes to a tensor of float type, and move it to the same device as outputs
num_boxes = torch.as_tensor([num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device)
world_size = 1
# Check if acceleration is available and adjust num_boxes and world_size accordingly
if is_accelerate_available():
# If PartialState._shared_state is not empty, reduce num_boxes
if PartialState._shared_state != {}:
num_boxes = reduce(num_boxes)
# Get the number of processes from PartialState
world_size = PartialState().num_processes
# Normalize num_boxes considering the number of processes
num_boxes = torch.clamp(num_boxes / world_size, min=1).item()
# Compute losses for each specified loss function
losses = {}
for loss in self.losses:
losses.update(self.get_loss(loss, outputs, targets, indices, num_boxes))
# If there are auxiliary outputs, compute losses for each auxiliary output separately
if "auxiliary_outputs" in outputs:
for i, auxiliary_outputs in enumerate(outputs["auxiliary_outputs"]):
indices = self.matcher(auxiliary_outputs, targets)
for loss in self.losses:
if loss == "masks":
# Skip computation of masks loss for auxiliary outputs due to cost
continue
# Append index to keys in losses dictionary for each auxiliary output
l_dict = self.get_loss(loss, auxiliary_outputs, targets, indices, num_boxes)
l_dict = {k + f"_{i}": v for k, v in l_dict.items()}
losses.update(l_dict)
# Return computed losses
return losses
# taken from https://github.com/facebookresearch/detr/blob/master/models/detr.py
class DetrMLPPredictionHead(nn.Module):
"""
Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates,
height and width of a bounding box w.r.t. an image.
Copied from https://github.com/facebookresearch/detr/blob/master/models/detr.py
"""
def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
super().__init__()
self.num_layers = num_layers
# Define a list of linear layers with ReLU activation for the MLP
h = [hidden_dim] * (num_layers - 1)
self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))
def forward(self, x):
# Feed forward through each linear layer with ReLU activation, except the last layer
for i, layer in enumerate(self.layers):
x = nn.functional.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
return x
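# Minimal usage sketch of the head above. The dimensions are illustrative (DETR typically uses a
# 3-layer MLP from the 256-dim decoder output to 4 box coordinates), not read from any config here.
import torch
head = DetrMLPPredictionHead(input_dim=256, hidden_dim=256, output_dim=4, num_layers=3)
decoder_output = torch.randn(2, 100, 256)     # (batch, num_queries, hidden)
pred_boxes = head(decoder_output).sigmoid()   # normalized (cx, cy, w, h) in [0, 1]
print(pred_boxes.shape)                       # torch.Size([2, 100, 4])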
# taken from https://github.com/facebookresearch/detr/blob/master/models/matcher.py
class DetrHungarianMatcher(nn.Module):
"""
This class computes an assignment between the targets and the predictions of the network.
For efficiency reasons, the targets don't include the no_object. Because of this, in general, there are more
predictions than targets. In this case, we do a 1-to-1 matching of the best predictions, while the others are
un-matched (and thus treated as non-objects).
Args:
class_cost:
The relative weight of the classification error in the matching cost.
bbox_cost:
The relative weight of the L1 error of the bounding box coordinates in the matching cost.
giou_cost:
The relative weight of the giou loss of the bounding box in the matching cost.
"""
def __init__(self, class_cost: float = 1, bbox_cost: float = 1, giou_cost: float = 1):
super().__init__()
# Ensure that the "scipy" library is available when initializing this module
requires_backends(self, ["scipy"])
self.class_cost = class_cost
self.bbox_cost = bbox_cost
self.giou_cost = giou_cost
# Check if all costs are non-zero; raise an error if they are all zero
if class_cost == 0 and bbox_cost == 0 and giou_cost == 0:
raise ValueError("All costs of the Matcher can't be 0")
@torch.no_grad()
def forward(self, outputs, targets):
"""
Args:
outputs (`dict`):
A dictionary that contains at least these entries:
* "logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits
* "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates.
targets (`List[dict]`):
A list of targets (len(targets) = batch_size), where each target is a dict containing:
* "class_labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of
ground-truth objects in the target) containing the class labels
* "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates.
Returns:
`List[Tuple]`: A list of size `batch_size`, containing tuples of (index_i, index_j) where:
- index_i is the indices of the selected predictions (in order)
- index_j is the indices of the corresponding selected targets (in order)
For each batch element, it holds: len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
"""
# Extract batch size and number of queries from the outputs
batch_size, num_queries = outputs["logits"].shape[:2]
# Flatten logits and apply softmax to get probabilities over classes
out_prob = outputs["logits"].flatten(0, 1).softmax(-1) # [batch_size * num_queries, num_classes]
# Flatten predicted boxes
out_bbox = outputs["pred_boxes"].flatten(0, 1) # [batch_size * num_queries, 4]
# Concatenate target class labels into a single tensor
target_ids = torch.cat([v["class_labels"] for v in targets])
# Concatenate target boxes into a single tensor
target_bbox = torch.cat([v["boxes"] for v in targets])
# Compute classification cost matrix based on negative log likelihood approximation
class_cost = -out_prob[:, target_ids]
# Compute L1 cost matrix between predicted and target boxes
bbox_cost = torch.cdist(out_bbox, target_bbox, p=1)
# Compute generalized IoU cost matrix between predicted and target boxes
giou_cost = -generalized_box_iou(center_to_corners_format(out_bbox), center_to_corners_format(target_bbox))
# Combine different costs into a final cost matrix using predefined weights
cost_matrix = self.bbox_cost * bbox_cost + self.class_cost * class_cost + self.giou_cost * giou_cost
# Reshape cost matrix to batch size x num_queries x (sum of all target boxes)
cost_matrix = cost_matrix.view(batch_size, num_queries, -1).cpu()
# Split cost matrix based on number of target boxes in each sample and perform linear sum assignment
sizes = [len(v["boxes"]) for v in targets]
indices = [linear_sum_assignment(c[i]) for i, c in enumerate(cost_matrix.split(sizes, -1))]
# Return indices as a list of tuples containing selected predictions and corresponding targets
return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices]
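# Minimal sketch of the assignment step on a toy cost matrix (random costs, illustrative sizes):
# a batch of 2 images with 4 queries each, where image 0 has 2 targets and image 1 has 1 target.
import torch
from scipy.optimize import linear_sum_assignment
cost_matrix = torch.rand(2, 4, 3)   # (batch_size, num_queries, total number of target boxes)
sizes = [2, 1]
indices = [linear_sum_assignment(c[i]) for i, c in enumerate(cost_matrix.split(sizes, -1))]
# indices[0]: 2 query indices matched to targets 0 and 1 of image 0
# indices[1]: 1 query index matched to target 0 of image 1
print([(i.tolist(), j.tolist()) for i, j in indices])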
# below: bounding box utilities taken from https://github.com/facebookresearch/detr/blob/master/util/box_ops.py
def _upcast(t: Tensor) -> Tensor:
"""
Protects from numerical overflows in multiplications by upcasting to the equivalent higher type.
Args:
t (`Tensor`): The input tensor to be upcasted.
Returns:
`Tensor`: The upcasted tensor.
"""
if t.is_floating_point():
return t if t.dtype in (torch.float32, torch.float64) else t.float()
else:
return t if t.dtype in (torch.int32, torch.int64) else t.int()
def box_area(boxes: Tensor) -> Tensor:
"""
Computes the area of a set of bounding boxes, specified by (x1, y1, x2, y2) coordinates.
Args:
boxes (`torch.FloatTensor` of shape `(number_of_boxes, 4)`):
Boxes for which the area will be computed. They are expected in (x1, y1, x2, y2) format with `0 <= x1 < x2` and `0 <= y1 < y2`.
Returns:
`torch.FloatTensor`: A tensor containing the area for each box.
"""
boxes = _upcast(boxes)
return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
# modified from torchvision to also return the union
def box_iou(boxes1, boxes2):
"""
Computes the Intersection over Union (IoU) between two sets of bounding boxes.
Args:
boxes1 (`Tensor`): Bounding boxes in format (x1, y1, x2, y2).
boxes2 (`Tensor`): Bounding boxes in format (x1, y1, x2, y2).
Returns:
`Tensor`: IoU scores for each pair of boxes.
`Tensor`: Union area for each pair of boxes.
"""
area1 = box_area(boxes1)
area2 = box_area(boxes2)
left_top = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2]
right_bottom = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2]
width_height = (right_bottom - left_top).clamp(min=0) # [N,M,2]
inter = width_height[:, :, 0] * width_height[:, :, 1] # [N,M]
union = area1[:, None] + area2 - inter
iou = inter / union
return iou, union
def generalized_box_iou(boxes1, boxes2):
"""
Computes the Generalized Intersection over Union (IoU) between two sets of bounding boxes.
Args:
boxes1 (`Tensor`): Bounding boxes in format [x0, y0, x1, y1].
boxes2 (`Tensor`): Bounding boxes in format [x0, y0, x1, y1].
Returns:
`Tensor`: Generalized IoU scores for each pair of boxes.
"""
# degenerate boxes gives inf / nan results
# so do an early check
if not (boxes1[:, 2:] >= boxes1[:, :2]).all():
raise ValueError(f"boxes1 must be in [x0, y0, x1, y1] (corner) format, but got {boxes1}")
if not (boxes2[:, 2:] >= boxes2[:, :2]).all():
raise ValueError(f"boxes2 must be in [x0, y0, x1, y1] (corner) format, but got {boxes2}")
iou, union = box_iou(boxes1, boxes2)
top_left = torch.min(boxes1[:, None, :2], boxes2[:, :2])
bottom_right = torch.max(boxes1[:, None, 2:], boxes2[:, 2:])
width_height = (bottom_right - top_left).clamp(min=0) # [N,M,2]
area = width_height[:, :, 0] * width_height[:, :, 1]
return iou - (area - union) / area
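# Worked example using box_iou / generalized_box_iou defined above (boxes in corner format):
# intersection = 1, union = 4 + 4 - 1 = 7, so IoU = 1/7 ≈ 0.143; the enclosing box [0, 0, 3, 3]
# has area 9, so GIoU = 1/7 - (9 - 7)/9 ≈ -0.079.
import torch
boxes1 = torch.tensor([[0.0, 0.0, 2.0, 2.0]])
boxes2 = torch.tensor([[1.0, 1.0, 3.0, 3.0]])
iou, union = box_iou(boxes1, boxes2)
giou = generalized_box_iou(boxes1, boxes2)
print(iou.item(), giou.item())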
# below: taken from https://github.com/facebookresearch/detr/blob/master/util/misc.py#L306
def _max_by_axis(the_list):
"""
Finds the maximum value along each axis of a list of lists.
Args:
the_list (`List[List[int]]`): A list of lists of integers.
Returns:
`List[int]`: A list containing the maximum value along each axis.
"""
maxes = the_list[0]
for sublist in the_list[1:]:
for index, item in enumerate(sublist):
maxes[index] = max(maxes[index], item)
return maxes
class NestedTensor(object):
# Stores a batch of tensors together with the corresponding padding mask
def __init__(self, tensors, mask: Optional[Tensor]):
self.tensors = tensors
self.mask = mask
# Move both the tensors and the mask (if any) to the given device
def to(self, device):
cast_tensor = self.tensors.to(device)
mask = self.mask
if mask is not None:
cast_mask = mask.to(device)
else:
cast_mask = None
return NestedTensor(cast_tensor, cast_mask)
# Return the underlying tensors and the mask
def decompose(self):
return self.tensors, self.mask
# The string representation delegates to the underlying tensors
def __repr__(self):
return str(self.tensors)
# Build a NestedTensor (padded batch + padding mask) from a list of image tensors
def nested_tensor_from_tensor_list(tensor_list: List[Tensor]):
if tensor_list[0].ndim == 3:
# Maximum size along each axis over all images in the list
max_size = _max_by_axis([list(img.shape) for img in tensor_list])
# Batch shape: (batch_size, num_channels, max_height, max_width)
batch_shape = [len(tensor_list)] + max_size
batch_size, num_channels, height, width = batch_shape
dtype = tensor_list[0].dtype
device = tensor_list[0].device
# Zero-initialized padded tensor and an all-True padding mask
tensor = torch.zeros(batch_shape, dtype=dtype, device=device)
mask = torch.ones((batch_size, height, width), dtype=torch.bool, device=device)
# Copy each image into the padded tensor and mark its real pixels as False in the mask
for img, pad_img, m in zip(tensor_list, tensor, mask):
pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img)
m[: img.shape[1], : img.shape[2]] = False
else:
raise ValueError("Only 3-dimensional tensors are supported")
return NestedTensor(tensor, mask)
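# Minimal sketch using the helpers above (illustrative sizes): two images of different resolutions
# are padded to a common (2, 3, 32, 48) batch, and the mask marks padded pixels with True.
import torch
images = [torch.randn(3, 32, 48), torch.randn(3, 24, 40)]
nested = nested_tensor_from_tensor_list(images)
tensors, mask = nested.decompose()
print(tensors.shape)            # torch.Size([2, 3, 32, 48])
print(mask.shape)               # torch.Size([2, 32, 48])
print(mask[1, :24, :40].any())  # tensor(False) -> real pixels
print(mask[1, 24:, :].all())    # tensor(True)  -> padding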
.\models\detr\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available
_import_structure = {"configuration_detr": ["DETR_PRETRAINED_CONFIG_ARCHIVE_MAP", "DetrConfig", "DetrOnnxConfig"]}
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["feature_extraction_detr"] = ["DetrFeatureExtractor"]
_import_structure["image_processing_detr"] = ["DetrImageProcessor"]
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_detr"] = [
"DETR_PRETRAINED_MODEL_ARCHIVE_LIST",
"DetrForObjectDetection",
"DetrForSegmentation",
"DetrModel",
"DetrPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_detr import DETR_PRETRAINED_CONFIG_ARCHIVE_MAP, DetrConfig, DetrOnnxConfig
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .feature_extraction_detr import DetrFeatureExtractor
from .image_processing_detr import DetrImageProcessor
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_detr import (
DETR_PRETRAINED_MODEL_ARCHIVE_LIST,
DetrForObjectDetection,
DetrForSegmentation,
DetrModel,
DetrPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\dialogpt\convert_dialogpt_original_pytorch_checkpoint_to_pytorch.py
import argparse
import os
import torch
from transformers.utils import WEIGHTS_NAME
DIALOGPT_MODELS = ["small", "medium", "large"]
OLD_KEY = "lm_head.decoder.weight"
NEW_KEY = "lm_head.weight"
def convert_dialogpt_checkpoint(checkpoint_path: str, pytorch_dump_folder_path: str):
d = torch.load(checkpoint_path)
d[NEW_KEY] = d.pop(OLD_KEY)
os.makedirs(pytorch_dump_folder_path, exist_ok=True)
torch.save(d, os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME))
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--dialogpt_path", default=".", type=str)
args = parser.parse_args()
for MODEL in DIALOGPT_MODELS:
checkpoint_path = os.path.join(args.dialogpt_path, f"{MODEL}_ft.pkl")
pytorch_dump_folder_path = f"./DialoGPT-{MODEL}"
convert_dialogpt_checkpoint(
checkpoint_path,
pytorch_dump_folder_path,
)
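# Toy illustration (not part of the script) of the single key rename the conversion performs: the
# fine-tuned DialoGPT checkpoints store the LM head under "lm_head.decoder.weight", while the
# Transformers GPT-2 implementation expects "lm_head.weight".
import torch
state_dict = {"lm_head.decoder.weight": torch.zeros(2, 3), "transformer.wte.weight": torch.zeros(2, 3)}
state_dict["lm_head.weight"] = state_dict.pop("lm_head.decoder.weight")
print(sorted(state_dict))  # ['lm_head.weight', 'transformer.wte.weight']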
.\models\dialogpt\__init__.py
.\models\dinat\configuration_dinat.py
"""
Dilated Neighborhood Attention Transformer model configuration
"""
from ...configuration_utils import PretrainedConfig
from ...utils import logging
from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices
logger = logging.get_logger(__name__)
DINAT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"shi-labs/dinat-mini-in1k-224": "https://huggingface.co/shi-labs/dinat-mini-in1k-224/resolve/main/config.json",
}
class DinatConfig(BackboneConfigMixin, PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`DinatModel`]. It is used to instantiate a Dinat
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the Dinat
[shi-labs/dinat-mini-in1k-224](https://huggingface.co/shi-labs/dinat-mini-in1k-224) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Example:
```
>>> from transformers import DinatConfig, DinatModel
>>> # Initializing a Dinat shi-labs/dinat-mini-in1k-224 style configuration
>>> configuration = DinatConfig()
>>> # Initializing a model (with random weights) from the shi-labs/dinat-mini-in1k-224 style configuration
>>> model = DinatModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
model_type = "dinat"
attribute_map = {
"num_attention_heads": "num_heads",
"num_hidden_layers": "num_layers",
}
super().__init__(**kwargs)
self.patch_size = patch_size
self.num_channels = num_channels
self.embed_dim = embed_dim
self.depths = depths
self.num_layers = len(depths)
self.num_heads = num_heads
self.kernel_size = kernel_size
self.dilations = dilations
self.mlp_ratio = mlp_ratio
self.qkv_bias = qkv_bias
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.drop_path_rate = drop_path_rate
self.hidden_act = hidden_act
self.layer_norm_eps = layer_norm_eps
self.initializer_range = initializer_range
self.hidden_size = int(embed_dim * 2 ** (len(depths) - 1))
self.layer_scale_init_value = layer_scale_init_value
self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(depths) + 1)]
self._out_features, self._out_indices = get_aligned_output_features_output_indices(
out_features=out_features, out_indices=out_indices, stage_names=self.stage_names
)
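# Illustrative arithmetic for the derived attributes above (the values are examples, not necessarily
# the actual defaults): with embed_dim=64 and depths=[3, 4, 6, 5] there are 4 stages, so
# hidden_size = 64 * 2 ** (4 - 1) = 512 and the stage names run from "stem" to "stage4".
embed_dim, depths = 64, [3, 4, 6, 5]
hidden_size = int(embed_dim * 2 ** (len(depths) - 1))
stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(depths) + 1)]
print(hidden_size, stage_names)  # 512 ['stem', 'stage1', 'stage2', 'stage3', 'stage4']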
.\models\dinat\modeling_dinat.py
""" PyTorch Dilated Neighborhood Attention Transformer model."""
import math
from dataclasses import dataclass
from typing import Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN
from ...modeling_outputs import BackboneOutput
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import (
ModelOutput,
OptionalDependencyNotAvailable,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
is_natten_available,
logging,
replace_return_docstrings,
requires_backends,
)
from ...utils.backbone_utils import BackboneMixin
from .configuration_dinat import DinatConfig
if is_natten_available():
from natten.functional import natten2dav, natten2dqkrpb
else:
def natten2dqkrpb(*args, **kwargs):
raise OptionalDependencyNotAvailable()
def natten2dav(*args, **kwargs):
raise OptionalDependencyNotAvailable()
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "DinatConfig"
_CHECKPOINT_FOR_DOC = "shi-labs/dinat-mini-in1k-224"
_EXPECTED_OUTPUT_SHAPE = [1, 7, 7, 512]
_IMAGE_CLASS_CHECKPOINT = "shi-labs/dinat-mini-in1k-224"
_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat"
DINAT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"shi-labs/dinat-mini-in1k-224",
]
@dataclass
class DinatEncoderOutput(ModelOutput):
"""
Dinat encoder's outputs, with potential hidden states and attentions.
"""
last_hidden_state: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
reshaped_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
@dataclass
class DinatModelOutput(ModelOutput):
"""
Dinat model's outputs that also contains a pooling of the last hidden states.
Args:
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*, returned when `add_pooling_layer=True` is passed):
Average pooling of the last layer hidden-state.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
shape `(batch_size, hidden_size, height, width)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
include the spatial dimensions.
"""
last_hidden_state: torch.FloatTensor = None
pooler_output: Optional[torch.FloatTensor] = None
hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
reshaped_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
@dataclass
class DinatImageClassifierOutput(ModelOutput):
"""
Dinat outputs for image classification.
"""
Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
分类(如果 `config.num_labels==1` 则为回归)的损失值。
logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
SoftMax 之前的分类(或回归,如果 `config.num_labels==1`)分数。
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
模型每一层的隐藏状态,包括初始嵌入输出。
包含形状为 `(batch_size, sequence_length, hidden_size)` 的 `torch.FloatTensor` 元组。
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
注意力权重经过注意力 SoftMax 后的值,用于计算自注意力头中的加权平均值。
包含形状为 `(batch_size, num_heads, sequence_length, sequence_length)` 的 `torch.FloatTensor` 元组。
reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
模型每一层的隐藏状态,包括初始嵌入输出,且包含空间维度。
包含形状为 `(batch_size, hidden_size, height, width)` 的 `torch.FloatTensor` 元组。
"""
loss: Optional[torch.FloatTensor] = None
logits: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
reshaped_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
# Copied from transformers.models.nat.modeling_nat.NatEmbeddings with Nat->Dinat
class DinatEmbeddings(nn.Module):
"""
Construct the patch and position embeddings.
"""
def __init__(self, config):
super().__init__()
# Patch embeddings followed by LayerNorm and dropout
self.patch_embeddings = DinatPatchEmbeddings(config)
self.norm = nn.LayerNorm(config.embed_dim)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, pixel_values: Optional[torch.FloatTensor]) -> Tuple[torch.Tensor]:
# Patchify, normalize, then apply dropout
embeddings = self.patch_embeddings(pixel_values)
embeddings = self.norm(embeddings)
embeddings = self.dropout(embeddings)
return embeddings
# Copied from transformers.models.nat.modeling_nat.NatPatchEmbeddings with Nat->Dinat
class DinatPatchEmbeddings(nn.Module):
"""
This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
`hidden_states` (patch embeddings) of shape `(batch_size, height, width, hidden_size)` to be consumed by a
Transformer.
"""
def __init__(self, config):
super().__init__()
patch_size = config.patch_size
num_channels, hidden_size = config.num_channels, config.embed_dim
self.num_channels = num_channels
if patch_size == 4:
pass
else:
# TODO: Support arbitrary patch sizes.
raise ValueError("Dinat only supports patch size of 4 at the moment.")
# The projection is implemented with two strided convolutions
self.projection = nn.Sequential(
nn.Conv2d(self.num_channels, hidden_size // 2, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)),
nn.Conv2d(hidden_size // 2, hidden_size, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)),
)
def forward(self, pixel_values: Optional[torch.FloatTensor]) -> torch.Tensor:
_, num_channels, height, width = pixel_values.shape
# Check that the channel dimension of the pixel values matches the configuration
if num_channels != self.num_channels:
raise ValueError(
"Make sure that the channel dimension of the pixel values match with the one set in the configuration."
)
# Project to patch embeddings, then move channels to the last dimension
embeddings = self.projection(pixel_values)
embeddings = embeddings.permute(0, 2, 3, 1)
return embeddings
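# Minimal shape sketch of the projection above with illustrative sizes (3-channel 224x224 input,
# embed_dim=64): two stride-2 convolutions reduce the resolution by 4 in each direction and the
# result is permuted to channels-last.
import torch
from torch import nn
projection = nn.Sequential(
nn.Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)),
nn.Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)),
)
pixel_values = torch.randn(1, 3, 224, 224)
embeddings = projection(pixel_values).permute(0, 2, 3, 1)
print(embeddings.shape)  # torch.Size([1, 56, 56, 64])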
# Copied from transformers.models.nat.modeling_nat.NatDownsampler with Nat->Dinat
class DinatDownsampler(nn.Module):
"""
Convolutional downsampling layer.
Args:
dim (`int`):
Number of input channels.
norm_layer (`nn.Module`, *optional*, defaults to `nn.LayerNorm`):
Normalization layer class.
"""
def __init__(self, dim: int, norm_layer: nn.Module = nn.LayerNorm) -> None:
super().__init__()
self.dim = dim
# A strided convolution halves the spatial resolution and doubles the channels
self.reduction = nn.Conv2d(dim, 2 * dim, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
self.norm = norm_layer(2 * dim)
# Takes a [batch_size, height, width, channels] tensor and returns the downsampled tensor in the same layout
def forward(self, input_feature: torch.Tensor) -> torch.Tensor:
# Permute to channels-first for the convolution, back to channels-last, then normalize
input_feature = self.reduction(input_feature.permute(0, 3, 1, 2)).permute(0, 2, 3, 1)
input_feature = self.norm(input_feature)
return input_feature
# Copied from transformers.models.beit.modeling_beit.drop_path
def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
"""
Drop paths (Stochastic Depth) per sample (when applied in the main path of residual blocks).
Comment by Ross Wightman: this is the same as the DropConnect impl I created for EfficientNet, etc. networks,
however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
See discussion: https://github.com/tensorflow/tpu/issues/494
I've opted to change the layer and argument names to 'drop path' rather than mixing DropConnect as a layer name
and using 'survival rate' as the argument.
"""
if drop_prob == 0.0 or not training:
return input
keep_prob = 1 - drop_prob
shape = (input.shape[0],) + (1,) * (input.ndim - 1)  # works with tensors of any dimension, not just 2D ConvNets
random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
random_tensor.floor_()  # binarize
output = input.div(keep_prob) * random_tensor
return output
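# Minimal sketch of stochastic depth using the function above (drop_prob=0.5 is illustrative):
# entire samples are either zeroed out or rescaled by 1 / keep_prob, so the expectation is preserved.
import torch
torch.manual_seed(0)
x = torch.ones(8, 4, 4, 16)
out = drop_path(x, drop_prob=0.5, training=True)
kept = out.flatten(1).sum(dim=1) > 0
print(kept.sum().item(), "of 8 samples kept; kept samples hold the value 1 / 0.5 = 2.0")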
# Copied from transformers.models.beit.modeling_beit.BeitDropPath with Beit->Dinat
class DinatDropPath(nn.Module):
"""Drop paths (Stochastic Depth) per sample (when applied in the main path of residual blocks)."""
def __init__(self, drop_prob: Optional[float] = None) -> None:
super().__init__()
self.drop_prob = drop_prob
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
return drop_path(hidden_states, self.drop_prob, self.training)
def extra_repr(self) -> str:
return "p={}".format(self.drop_prob)
class NeighborhoodAttention(nn.Module):
def __init__(self, config, dim, num_heads, kernel_size, dilation):
super().__init__()
if dim % num_heads != 0:
raise ValueError(
f"The hidden size ({dim}) is not a multiple of the number of attention heads ({num_heads})"
)
self.num_attention_heads = num_heads
self.attention_head_size = int(dim / num_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.kernel_size = kernel_size
self.dilation = dilation
# rpb is a learnable relative positional bias; the same concept is used in Swin.
self.rpb = nn.Parameter(torch.zeros(num_heads, (2 * self.kernel_size - 1), (2 * self.kernel_size - 1)))
self.query = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)
self.key = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)
self.value = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
# Copied from transformers.models.nat.modeling_nat.NeighborhoodAttention.transpose_for_scores with Nat->Dinat
# Reshape the input so the head dimension is split out for multi-head attention
def transpose_for_scores(self, x):
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
x = x.view(new_x_shape)
return x.permute(0, 3, 1, 2, 4)
def forward(
self,
hidden_states: torch.Tensor,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor]:
# Project to queries, keys and values, then split into heads
query_layer = self.transpose_for_scores(self.query(hidden_states))
key_layer = self.transpose_for_scores(self.key(hidden_states))
value_layer = self.transpose_for_scores(self.value(hidden_states))
# Scale the queries before computing attention scores
query_layer = query_layer / math.sqrt(self.attention_head_size)
# Compute attention scores, including the relative positional bias
attention_scores = natten2dqkrpb(query_layer, key_layer, self.rpb, self.kernel_size, self.dilation)
# Normalize the attention scores to probabilities
attention_probs = nn.functional.softmax(attention_scores, dim=-1)
# Dropout on the attention probabilities, as is standard in Transformers
attention_probs = self.dropout(attention_probs)
# Weighted sum of the values using the neighborhood attention kernel
context_layer = natten2dav(attention_probs, value_layer, self.kernel_size, self.dilation)
# Merge the heads back into the hidden dimension
context_layer = context_layer.permute(0, 2, 3, 1, 4).contiguous()
new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
context_layer = context_layer.view(new_context_layer_shape)
# Return the context tensor and, if requested, the attention probabilities
outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
return outputs
# Copied from transformers.models.nat.modeling_nat.NeighborhoodAttentionOutput
class NeighborhoodAttentionOutput(nn.Module):
def __init__(self, config, dim):
super().__init__()
# 定义一个全连接层,输入和输出维度都是 dim
self.dense = nn.Linear(dim, dim)
# 定义一个 Dropout 层,使用配置中的概率来丢弃注意力机制的概率
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
# 将输入的 hidden_states 经过全连接层 dense
hidden_states = self.dense(hidden_states)
# 对经过全连接层后的 hidden_states 进行 Dropout 处理
hidden_states = self.dropout(hidden_states)
return hidden_states
class NeighborhoodAttentionModule(nn.Module):
def __init__(self, config, dim, num_heads, kernel_size, dilation):
super().__init__()
# 创建一个邻域注意力模块,使用给定的参数
self.self = NeighborhoodAttention(config, dim, num_heads, kernel_size, dilation)
# 创建一个输出层,将邻域注意力模块的输出映射到指定维度上
self.output = NeighborhoodAttentionOutput(config, dim)
# 初始化一个空的集合,用于存储被剪枝的注意力头索引
self.pruned_heads = set()
# Copied from transformers.models.nat.modeling_nat.NeighborhoodAttentionModule.prune_heads
def prune_heads(self, heads):
# 如果 heads 列表为空,则直接返回
if len(heads) == 0:
return
# 查找可剪枝的注意力头和对应的索引
heads, index = find_pruneable_heads_and_indices(
heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
)
# 剪枝线性层
self.self.query = prune_linear_layer(self.self.query, index)
self.self.key = prune_linear_layer(self.self.key, index)
self.self.value = prune_linear_layer(self.self.value, index)
self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
# 更新超参数并存储被剪枝的头
self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
self.pruned_heads = self.pruned_heads.union(heads)
# Copied from transformers.models.nat.modeling_nat.NeighborhoodAttentionModule.forward
def forward(
self,
hidden_states: torch.Tensor,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor]:
# 执行邻域注意力模块的前向传播
self_outputs = self.self(hidden_states, output_attentions)
# 将邻域注意力模块的输出传递给输出层,同时传入原始的 hidden_states
attention_output = self.output(self_outputs[0], hidden_states)
# 如果需要输出注意力权重,则将它们加入到输出中
outputs = (attention_output,) + self_outputs[1:]
return outputs
# Copied from transformers.models.nat.modeling_nat.NatIntermediate with Nat->Dinat
class DinatIntermediate(nn.Module):
def __init__(self, config, dim):
super().__init__()
# 定义一个线性层,将输入维度 dim 映射到 config.mlp_ratio * dim 的输出维度
self.dense = nn.Linear(dim, int(config.mlp_ratio * dim))
# 根据配置中的激活函数类型选择对应的激活函数
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
# 将输入的 hidden_states 经过线性层 dense
hidden_states = self.dense(hidden_states)
# 将线性层的输出经过选择的激活函数 intermediate_act_fn 处理
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states
# 从transformers.models.nat.modeling_nat.NatOutput复制并将Nat->Dinat
class DinatOutput(nn.Module):
def __init__(self, config, dim):
super().__init__()
# 使用线性层将输入维度映射到指定维度,mlp_ratio为配置参数
self.dense = nn.Linear(int(config.mlp_ratio * dim), dim)
# 使用指定的dropout概率创建一个dropout层
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
# 将输入的hidden_states通过线性层映射
hidden_states = self.dense(hidden_states)
# 对映射后的结果进行dropout处理
hidden_states = self.dropout(hidden_states)
return hidden_states
class DinatLayer(nn.Module):
def __init__(self, config, dim, num_heads, dilation, drop_path_rate=0.0):
super().__init__()
# 设置用于分块前馈的块大小
self.chunk_size_feed_forward = config.chunk_size_feed_forward
# 设置卷积核大小
self.kernel_size = config.kernel_size
# 设置扩张率
self.dilation = dilation
# 计算窗口大小,是卷积核大小和扩张率的乘积
self.window_size = self.kernel_size * self.dilation
# 在LayerNorm之前应用LayerNorm进行归一化,eps是配置参数
self.layernorm_before = nn.LayerNorm(dim, eps=config.layer_norm_eps)
# 使用NeighborhoodAttentionModule创建注意力层,config为配置参数
self.attention = NeighborhoodAttentionModule(
config, dim, num_heads, kernel_size=self.kernel_size, dilation=self.dilation
)
# 如果drop_path_rate大于0,创建DropPath层,否则创建Identity层
self.drop_path = DinatDropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
# 在LayerNorm之后应用LayerNorm进行归一化,eps是配置参数
self.layernorm_after = nn.LayerNorm(dim, eps=config.layer_norm_eps)
# 创建DinatIntermediate层,处理中间状态
self.intermediate = DinatIntermediate(config, dim)
# 创建DinatOutput层,产生最终输出
self.output = DinatOutput(config, dim)
# 如果配置中的layer_scale_init_value大于0,则创建可训练参数,否则为None
self.layer_scale_parameters = (
nn.Parameter(config.layer_scale_init_value * torch.ones((2, dim)), requires_grad=True)
if config.layer_scale_init_value > 0
else None
)
def maybe_pad(self, hidden_states, height, width):
# 获取当前窗口大小
window_size = self.window_size
# 默认填充值为0
pad_values = (0, 0, 0, 0, 0, 0)
# 如果输入的高度或宽度小于窗口大小,则进行填充
if height < window_size or width < window_size:
pad_l = pad_t = 0
pad_r = max(0, window_size - width)
pad_b = max(0, window_size - height)
pad_values = (0, 0, pad_l, pad_r, pad_t, pad_b)
# 对隐藏状态进行填充
hidden_states = nn.functional.pad(hidden_states, pad_values)
return hidden_states, pad_values
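# Minimal sketch of the padding logic in maybe_pad above (channels-last input; kernel_size=7 and
# dilation=1 are illustrative): a 5x5 feature map is padded on the right/bottom up to the 7x7 window.
# nn.functional.pad pads the trailing dimensions first: (channels, width, height) for this layout.
import torch
from torch import nn
window_size = 7 * 1
hidden_states = torch.randn(1, 5, 5, 32)
pad_r = max(0, window_size - hidden_states.shape[2])
pad_b = max(0, window_size - hidden_states.shape[1])
padded = nn.functional.pad(hidden_states, (0, 0, 0, pad_r, 0, pad_b))
print(padded.shape)  # torch.Size([1, 7, 7, 32])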
def forward(
self,
hidden_states: torch.Tensor,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor, torch.Tensor]:
# 获取隐藏状态的批量大小、高度、宽度和通道数
batch_size, height, width, channels = hidden_states.size()
# 保存隐藏状态的快捷方式
shortcut = hidden_states
# 对隐藏状态进行 layer normalization
hidden_states = self.layernorm_before(hidden_states)
# 如果隐藏状态小于卷积核大小乘以膨胀率,则进行填充
hidden_states, pad_values = self.maybe_pad(hidden_states, height, width)
# 获取填充后的高度和宽度
_, height_pad, width_pad, _ = hidden_states.shape
# 执行注意力机制,获取注意力输出
attention_outputs = self.attention(hidden_states, output_attentions=output_attentions)
# 从注意力输出中提取主要的注意力输出
attention_output = attention_outputs[0]
# 检查是否进行了填充
was_padded = pad_values[3] > 0 or pad_values[5] > 0
if was_padded:
# 如果有填充,则裁剪注意力输出以匹配原始尺寸
attention_output = attention_output[:, :height, :width, :].contiguous()
# 如果存在层缩放参数,则应用第一个参数到注意力输出
if self.layer_scale_parameters is not None:
attention_output = self.layer_scale_parameters[0] * attention_output
# 将注意力输出与快捷方式相加,应用 drop path 操作
hidden_states = shortcut + self.drop_path(attention_output)
# 对层输出进行 layer normalization
layer_output = self.layernorm_after(hidden_states)
# 经过中间层和输出层的处理
layer_output = self.output(self.intermediate(layer_output))
# 如果存在层缩放参数,则应用第二个参数到层输出
if self.layer_scale_parameters is not None:
layer_output = self.layer_scale_parameters[1] * layer_output
# 将层输出与隐藏状态相加,再应用 drop path 操作
layer_output = hidden_states + self.drop_path(layer_output)
# 构造层输出元组,可能包含注意力权重
layer_outputs = (layer_output, attention_outputs[1]) if output_attentions else (layer_output,)
return layer_outputs
# 定义了一个名为 DinatStage 的自定义神经网络模块,继承自 nn.Module
class DinatStage(nn.Module):
# 初始化函数,接收多个参数用于配置模块
def __init__(self, config, dim, depth, num_heads, dilations, drop_path_rate, downsample):
super().__init__()
self.config = config # 存储配置参数
self.dim = dim # 存储维度参数
# 使用 nn.ModuleList 存储 DinatLayer 层的列表
self.layers = nn.ModuleList(
[
DinatLayer(
config=config,
dim=dim,
num_heads=num_heads,
dilation=dilations[i],
drop_path_rate=drop_path_rate[i],
)
for i in range(depth)
]
)
# 如果 downsample 参数不为 None,则创建 downsample 层
if downsample is not None:
self.downsample = downsample(dim=dim, norm_layer=nn.LayerNorm)
else:
self.downsample = None
self.pointing = False # 初始化 pointing 属性为 False
# 重写 forward 方法,执行前向传播计算
# 从 transformers.models.nat.modeling_nat.NatStage.forward 复制而来
def forward(
self,
hidden_states: torch.Tensor,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor]:
_, height, width, _ = hidden_states.size()
# 遍历 self.layers 列表中的每个 DinatLayer 层,依次计算输出
for i, layer_module in enumerate(self.layers):
layer_outputs = layer_module(hidden_states, output_attentions)
hidden_states = layer_outputs[0] # 更新 hidden_states
hidden_states_before_downsampling = hidden_states
# 如果存在 downsample 层,则对计算前的 hidden_states 进行下采样
if self.downsample is not None:
hidden_states = self.downsample(hidden_states_before_downsampling)
# 返回计算后的 hidden_states 和计算前的 hidden_states_before_downsampling
stage_outputs = (hidden_states, hidden_states_before_downsampling)
# 如果需要输出注意力矩阵,则将其加入 stage_outputs 中
if output_attentions:
stage_outputs += layer_outputs[1:]
return stage_outputs
# 定义了一个名为 DinatEncoder 的自定义神经网络模块,继承自 nn.Module
class DinatEncoder(nn.Module):
# 初始化函数,接收配置参数 config
def __init__(self, config):
super().__init__()
self.num_levels = len(config.depths) # 计算深度级别数量
self.config = config # 存储配置参数
# 根据配置参数创建多层 DinatStage 模块,并存储在 nn.ModuleList 中
dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths))]
self.levels = nn.ModuleList(
[
DinatStage(
config=config,
dim=int(config.embed_dim * 2**i_layer),
depth=config.depths[i_layer],
num_heads=config.num_heads[i_layer],
dilations=config.dilations[i_layer],
drop_path_rate=dpr[sum(config.depths[:i_layer]) : sum(config.depths[: i_layer + 1])],
downsample=DinatDownsampler if (i_layer < self.num_levels - 1) else None,
)
for i_layer in range(self.num_levels)
]
)
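# Illustrative drop-path schedule matching the dpr computation above (depths=[3, 4, 6, 5] and
# drop_path_rate=0.2 are example values): one linspace over all blocks is sliced per stage, so
# later blocks receive higher stochastic-depth rates.
import torch
depths, drop_path_rate = [3, 4, 6, 5], 0.2
dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]
per_stage = [dpr[sum(depths[:i]) : sum(depths[: i + 1])] for i in range(len(depths))]
print([len(rates) for rates in per_stage])                      # [3, 4, 6, 5]
print(round(per_stage[0][0], 3), round(per_stage[-1][-1], 3))   # 0.0 0.2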
# 重写 forward 方法,执行前向传播计算
# 从 transformers.models.nat.modeling_nat.NatEncoder.forward 复制而来,Nat->Dinat
def forward(
self,
hidden_states: torch.Tensor,
output_attentions: Optional[bool] = False,
output_hidden_states: Optional[bool] = False,
output_hidden_states_before_downsampling: Optional[bool] = False,
return_dict: Optional[bool] = True,
) -> Union[Tuple, DinatEncoderOutput]:
# 如果没有要输出的隐藏状态,则置空
all_hidden_states = () if output_hidden_states else None
# 如果没有要输出的重塑后的隐藏状态,则置空
all_reshaped_hidden_states = () if output_hidden_states else None
# 如果没有要输出的注意力权重,则置空
all_self_attentions = () if output_attentions else None
if output_hidden_states:
# 重新排列隐藏状态的维度顺序:从 b h w c 到 b c h w
reshaped_hidden_state = hidden_states.permute(0, 3, 1, 2)
# 将当前隐藏状态添加到所有隐藏状态的元组中
all_hidden_states += (hidden_states,)
# 将重塑后的隐藏状态添加到所有重塑后的隐藏状态的元组中
all_reshaped_hidden_states += (reshaped_hidden_state,)
for i, layer_module in enumerate(self.levels):
# 对每一层模块进行前向传播
layer_outputs = layer_module(hidden_states, output_attentions)
# 更新当前隐藏状态为当前层的输出的第一个元素
hidden_states = layer_outputs[0]
# 如果需要输出隐藏状态且需要输出下采样前的隐藏状态
hidden_states_before_downsampling = layer_outputs[1]
if output_hidden_states and output_hidden_states_before_downsampling:
# 重新排列下采样前的隐藏状态的维度顺序:从 b h w c 到 b c h w
reshaped_hidden_state = hidden_states_before_downsampling.permute(0, 3, 1, 2)
# 将下采样前的隐藏状态添加到所有隐藏状态的元组中
all_hidden_states += (hidden_states_before_downsampling,)
# 将重塑后的隐藏状态添加到所有重塑后的隐藏状态的元组中
all_reshaped_hidden_states += (reshaped_hidden_state,)
elif output_hidden_states and not output_hidden_states_before_downsampling:
# 重新排列当前隐藏状态的维度顺序:从 b h w c 到 b c h w
reshaped_hidden_state = hidden_states.permute(0, 3, 1, 2)
# 将当前隐藏状态添加到所有隐藏状态的元组中
all_hidden_states += (hidden_states,)
# 将重塑后的隐藏状态添加到所有重塑后的隐藏状态的元组中
all_reshaped_hidden_states += (reshaped_hidden_state,)
if output_attentions:
# 将当前层的注意力权重添加到所有注意力权重的元组中
all_self_attentions += layer_outputs[2:]
if not return_dict:
# 如果不返回字典,则返回非空值的元组
return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
# 返回 DinatEncoderOutput 对象,包含最终的隐藏状态、所有隐藏状态、所有注意力权重和所有重塑后的隐藏状态
return DinatEncoderOutput(
last_hidden_state=hidden_states,
hidden_states=all_hidden_states,
attentions=all_self_attentions,
reshaped_hidden_states=all_reshaped_hidden_states,
)
# DinatPreTrainedModel 类的子类,用于处理权重初始化以及下载和加载预训练模型的简单接口
class DinatPreTrainedModel(PreTrainedModel):
# 模型的配置类,指定为 DinatConfig
config_class = DinatConfig
# 基础模型的前缀名称为 "dinat"
base_model_prefix = "dinat"
# 主要输入的名称为 "pixel_values"
main_input_name = "pixel_values"
def _init_weights(self, module):
"""初始化模型的权重"""
# 如果是 nn.Linear 或 nn.Conv2d 模块
if isinstance(module, (nn.Linear, nn.Conv2d)):
# 使用正态分布初始化权重数据,标准差为 self.config.initializer_range
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
# 如果存在偏置项,则将其初始化为零
if module.bias is not None:
module.bias.data.zero_()
# 如果是 nn.LayerNorm 模块
elif isinstance(module, nn.LayerNorm):
# 将偏置项初始化为零
module.bias.data.zero_()
# 将权重初始化为全1
module.weight.data.fill_(1.0)
# DINAT_START_DOCSTRING holds the shared docstring prepended to the Dinat model classes
DINAT_START_DOCSTRING = r"""
This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html) sub-class. Use
it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
Parameters:
config ([`DinatConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
# DINAT_INPUTS_DOCSTRING documents the forward arguments shared by the Dinat models
DINAT_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`ViTImageProcessor.__call__`]
for details.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
# The add_start_docstrings decorator prepends the shared docstring to DinatModel
@add_start_docstrings(
"The bare Dinat Model transformer outputting raw hidden-states without any specific head on top.",
DINAT_START_DOCSTRING,
)
# DinatModel is the bare backbone model, inheriting from DinatPreTrainedModel
# Copied from transformers.models.nat.modeling_nat.NatModel with Nat->Dinat, NAT->DINAT
class DinatModel(DinatPreTrainedModel):
def __init__(self, config, add_pooling_layer=True):
super().__init__(config)
requires_backends(self, ["natten"]) # 要求后端支持 "natten" 模块
self.config = config # 保存配置信息
self.num_levels = len(config.depths) # 确定金字塔层数
self.num_features = int(config.embed_dim * 2 ** (self.num_levels - 1)) # 计算特征数量
self.embeddings = DinatEmbeddings(config) # 初始化嵌入层
self.encoder = DinatEncoder(config) # 初始化编码器
self.layernorm = nn.LayerNorm(self.num_features, eps=config.layer_norm_eps) # 初始化层归一化层
self.pooler = nn.AdaptiveAvgPool1d(1) if add_pooling_layer else None # 根据参数决定是否添加池化层
# 初始化权重并进行最终处理
self.post_init()
def get_input_embeddings(self):
return self.embeddings.patch_embeddings # 返回输入嵌入
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads) # 剪枝模型的注意力头部
@add_start_docstrings_to_model_forward(DINAT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=DinatModelOutput,
config_class=_CONFIG_FOR_DOC,
modality="vision",
expected_output=_EXPECTED_OUTPUT_SHAPE,
)
def forward(
self,
pixel_values: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, DinatModelOutput]:
# 设置是否输出注意力权重,默认从模型配置中获取
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
# 设置是否输出隐藏状态,默认从模型配置中获取
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
# 设置是否返回字典格式的输出,默认从模型配置中获取
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# 如果未提供像素值,则抛出数值错误异常
if pixel_values is None:
raise ValueError("You have to specify pixel_values")
# 将像素值传入嵌入层进行处理
embedding_output = self.embeddings(pixel_values)
# 使用编码器处理嵌入输出
encoder_outputs = self.encoder(
embedding_output,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# 获取编码器的序列输出,并进行 LayerNormalization
sequence_output = encoder_outputs[0]
sequence_output = self.layernorm(sequence_output)
# 初始化池化输出为 None
pooled_output = None
# 如果模型有池化层,则对序列输出进行池化操作
if self.pooler is not None:
pooled_output = self.pooler(sequence_output.flatten(1, 2).transpose(1, 2))
pooled_output = torch.flatten(pooled_output, 1)
# 如果不要求以字典格式返回结果,则返回元组形式的输出
if not return_dict:
output = (sequence_output, pooled_output) + encoder_outputs[1:]
return output
# 否则,以自定义的输出对象形式返回结果,包括最后的隐藏状态、池化输出以及各层的隐藏状态和注意力权重
return DinatModelOutput(
last_hidden_state=sequence_output,
pooler_output=pooled_output,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
reshaped_hidden_states=encoder_outputs.reshaped_hidden_states,
)
@add_start_docstrings(
"""
Dinat Model transformer with an image classification head on top (a linear layer on top of the final hidden state
of the [CLS] token) e.g. for ImageNet.
""",
DINAT_START_DOCSTRING,
)
class DinatForImageClassification(DinatPreTrainedModel):
def __init__(self, config):
super().__init__(config)
# 检查后端库是否已经加载
requires_backends(self, ["natten"])
# 设置分类任务的类别数目
self.num_labels = config.num_labels
# 初始化 DinatModel 模型
self.dinat = DinatModel(config)
# 分类器头部
self.classifier = (
nn.Linear(self.dinat.num_features, config.num_labels) if config.num_labels > 0 else nn.Identity()
)
# 初始化权重并进行最终处理
self.post_init()
@add_start_docstrings_to_model_forward(DINAT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=DinatImageClassifierOutput,
config_class=_CONFIG_FOR_DOC,
expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
)
def forward(
self,
pixel_values: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, DinatImageClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
# 确保返回字典存在,如果未提供则使用配置中的默认设置
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# 调用自注意力网络模型(DINAT),传入像素值和其他选项参数
outputs = self.dinat(
pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# 提取汇聚后的输出特征
pooled_output = outputs[1]
# 将汇聚后的特征输入分类器,生成预测 logits
logits = self.classifier(pooled_output)
# 初始化损失值为 None
loss = None
# 如果提供了标签
if labels is not None:
# 如果问题类型未定义,则根据标签类型设置问题类型
if self.config.problem_type is None:
if self.num_labels == 1:
self.config.problem_type = "regression"
elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
self.config.problem_type = "single_label_classification"
else:
self.config.problem_type = "multi_label_classification"
# 根据问题类型计算损失
if self.config.problem_type == "regression":
loss_fct = MSELoss()
if self.num_labels == 1:
loss = loss_fct(logits.squeeze(), labels.squeeze())
else:
loss = loss_fct(logits, labels)
elif self.config.problem_type == "single_label_classification":
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(logits, labels)
# 如果不要求返回字典,则组装输出元组
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
# 返回 DINAT 图像分类器输出对象,包括损失、logits、隐藏状态、注意力等
return DinatImageClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
reshaped_hidden_states=outputs.reshaped_hidden_states,
)
# Documented (via add_start_docstrings) as the Dinat backbone, to be used with frameworks like DETR and MaskFormer.
# Inherits from both DinatPreTrainedModel and BackboneMixin.
class DinatBackbone(DinatPreTrainedModel, BackboneMixin):
def __init__(self, config):
super().__init__(config)
# Initialize the backbone bookkeeping from the config
super()._init_backbone(config)
# Make sure the required backend library is available
requires_backends(self, ["natten"])
# Embeddings and encoder
self.embeddings = DinatEmbeddings(config)
self.encoder = DinatEncoder(config)
# Per-stage feature dimensions, derived from the embedding dimension and the number of stages
self.num_features = [config.embed_dim] + [int(config.embed_dim * 2**i) for i in range(len(config.depths))]
# LayerNorm applied to the hidden states of the requested output stages
hidden_states_norms = {}
for stage, num_channels in zip(self._out_features, self.channels):
hidden_states_norms[stage] = nn.LayerNorm(num_channels)
self.hidden_states_norms = nn.ModuleDict(hidden_states_norms)
# Weight initialization and final processing
self.post_init()
# Return the input (patch) embeddings
def get_input_embeddings(self):
return self.embeddings.patch_embeddings
# The decorators (add_start_docstrings_to_model_forward / replace_return_docstrings) document the forward
# inputs and set the return type to BackboneOutput with _CONFIG_FOR_DOC as the config class
def forward(
self,
pixel_values: torch.Tensor,
output_hidden_states: Optional[bool] = None,
output_attentions: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> BackboneOutput:
# Fall back to the config defaults when the flags are not given explicitly
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
# Turn the pixel values into patch embeddings
embedding_output = self.embeddings(pixel_values)
# Run the encoder; the hidden states before downsampling are needed to build the feature maps
outputs = self.encoder(
embedding_output,
output_attentions=output_attentions,
output_hidden_states=True,
output_hidden_states_before_downsampling=True,
return_dict=True,
)
# Channels-first hidden states of every stage
hidden_states = outputs.reshaped_hidden_states
# Collect the feature maps of the requested output stages
feature_maps = ()
for stage, hidden_state in zip(self.stage_names, hidden_states):
if stage in self.out_features:
batch_size, num_channels, height, width = hidden_state.shape
# Apply the per-stage LayerNorm in channels-last layout, then go back to channels-first
hidden_state = hidden_state.permute(0, 2, 3, 1).contiguous()
hidden_state = hidden_state.view(batch_size, height * width, num_channels)
hidden_state = self.hidden_states_norms[stage](hidden_state)
hidden_state = hidden_state.view(batch_size, height, width, num_channels)
hidden_state = hidden_state.permute(0, 3, 1, 2).contiguous()
feature_maps += (hidden_state,)
# Tuple output when return_dict is disabled
if not return_dict:
output = (feature_maps,)
if output_hidden_states:
output += (outputs.hidden_states,)
return output
# Otherwise return a BackboneOutput with feature maps, hidden states and attentions
return BackboneOutput(
feature_maps=feature_maps,
hidden_states=outputs.hidden_states if output_hidden_states else None,
attentions=outputs.attentions,
)