Transformers 源码解析(三十八)
.\models\depth_anything\convert_depth_anything_to_hf.py
"""从原始仓库转换 Depth Anything 检查点。URL:
https://github.com/LiheYoung/Depth-Anything"""
import argparse
from pathlib import Path
import requests
import torch
from huggingface_hub import hf_hub_download
from PIL import Image
from transformers import DepthAnythingConfig, DepthAnythingForDepthEstimation, Dinov2Config, DPTImageProcessor
from transformers.utils import logging
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
def get_dpt_config(model_name):
if "small" in model_name:
backbone_config = Dinov2Config.from_pretrained(
"facebook/dinov2-small", out_indices=[9, 10, 11, 12], apply_layernorm=True, reshape_hidden_states=False
)
fusion_hidden_size = 64
neck_hidden_sizes = [48, 96, 192, 384]
elif "base" in model_name:
backbone_config = Dinov2Config.from_pretrained(
"facebook/dinov2-base", out_indices=[9, 10, 11, 12], apply_layernorm=True, reshape_hidden_states=False
)
fusion_hidden_size = 128
neck_hidden_sizes = [96, 192, 384, 768]
elif "large" in model_name:
backbone_config = Dinov2Config.from_pretrained(
"facebook/dinov2-large", out_indices=[21, 22, 23, 24], apply_layernorm=True, reshape_hidden_states=False
)
fusion_hidden_size = 256
neck_hidden_sizes = [256, 512, 1024, 1024]
else:
raise NotImplementedError("To do")
config = DepthAnythingConfig(
reassemble_hidden_size=backbone_config.hidden_size,
patch_size=backbone_config.patch_size,
backbone_config=backbone_config,
fusion_hidden_size=fusion_hidden_size,
neck_hidden_sizes=neck_hidden_sizes,
)
return config
def create_rename_keys(config):
rename_keys = []
rename_keys.append(("pretrained.cls_token", "backbone.embeddings.cls_token"))
rename_keys.append(("pretrained.mask_token", "backbone.embeddings.mask_token"))
rename_keys.append(("pretrained.pos_embed", "backbone.embeddings.position_embeddings"))
rename_keys.append(("pretrained.patch_embed.proj.weight", "backbone.embeddings.patch_embeddings.projection.weight"))
rename_keys.append(("pretrained.patch_embed.proj.bias", "backbone.embeddings.patch_embeddings.projection.bias"))
for i in range(config.backbone_config.num_hidden_layers):
rename_keys.append((f"pretrained.blocks.{i}.ls1.gamma", f"backbone.encoder.layer.{i}.layer_scale1.lambda1"))
rename_keys.append((f"pretrained.blocks.{i}.ls2.gamma", f"backbone.encoder.layer.{i}.layer_scale2.lambda1"))
rename_keys.append((f"pretrained.blocks.{i}.norm1.weight", f"backbone.encoder.layer.{i}.norm1.weight"))
rename_keys.append((f"pretrained.blocks.{i}.norm1.bias", f"backbone.encoder.layer.{i}.norm1.bias"))
rename_keys.append((f"pretrained.blocks.{i}.norm2.weight", f"backbone.encoder.layer.{i}.norm2.weight"))
rename_keys.append((f"pretrained.blocks.{i}.norm2.bias", f"backbone.encoder.layer.{i}.norm2.bias"))
rename_keys.append((f"pretrained.blocks.{i}.mlp.fc1.weight", f"backbone.encoder.layer.{i}.mlp.fc1.weight"))
rename_keys.append((f"pretrained.blocks.{i}.mlp.fc1.bias", f"backbone.encoder.layer.{i}.mlp.fc1.bias"))
rename_keys.append((f"pretrained.blocks.{i}.mlp.fc2.weight", f"backbone.encoder.layer.{i}.mlp.fc2.weight"))
rename_keys.append((f"pretrained.blocks.{i}.mlp.fc2.bias", f"backbone.encoder.layer.{i}.mlp.fc2.bias"))
rename_keys.append((f"pretrained.blocks.{i}.attn.proj.weight", f"backbone.encoder.layer.{i}.attention.output.dense.weight"))
rename_keys.append((f"pretrained.blocks.{i}.attn.proj.bias", f"backbone.encoder.layer.{i}.attention.output.dense.bias"))
rename_keys.append(("pretrained.norm.weight", "backbone.layernorm.weight"))
rename_keys.append(("pretrained.norm.bias", "backbone.layernorm.bias"))
for i in range(4):
rename_keys.append((f"depth_head.projects.{i}.weight", f"neck.reassemble_stage.layers.{i}.projection.weight"))
rename_keys.append((f"depth_head.projects.{i}.bias", f"neck.reassemble_stage.layers.{i}.projection.bias"))
if i != 2:
rename_keys.append((f"depth_head.resize_layers.{i}.weight", f"neck.reassemble_stage.layers.{i}.resize.weight"))
rename_keys.append((f"depth_head.resize_layers.{i}.bias", f"neck.reassemble_stage.layers.{i}.resize.bias"))
mapping = {1:3, 2:2, 3:1, 4:0}
for i in range(1, 5):
j = mapping[i]
rename_keys.append((f"depth_head.scratch.refinenet{i}.out_conv.weight", f"neck.fusion_stage.layers.{j}.projection.weight"))
rename_keys.append((f"depth_head.scratch.refinenet{i}.out_conv.bias", f"neck.fusion_stage.layers.{j}.projection.bias"))
rename_keys.append((f"depth_head.scratch.refinenet{i}.resConfUnit1.conv1.weight", f"neck.fusion_stage.layers.{j}.residual_layer1.convolution1.weight"))
rename_keys.append((f"depth_head.scratch.refinenet{i}.resConfUnit1.conv1.bias", f"neck.fusion_stage.layers.{j}.residual_layer1.convolution1.bias"))
rename_keys.append((f"depth_head.scratch.refinenet{i}.resConfUnit1.conv2.weight", f"neck.fusion_stage.layers.{j}.residual_layer1.convolution2.weight"))
rename_keys.append((f"depth_head.scratch.refinenet{i}.resConfUnit1.conv2.bias", f"neck.fusion_stage.layers.{j}.residual_layer1.convolution2.bias"))
rename_keys.append((f"depth_head.scratch.refinenet{i}.resConfUnit2.conv1.weight", f"neck.fusion_stage.layers.{j}.residual_layer2.convolution1.weight"))
rename_keys.append((f"depth_head.scratch.refinenet{i}.resConfUnit2.conv1.bias", f"neck.fusion_stage.layers.{j}.residual_layer2.convolution1.bias"))
rename_keys.append((f"depth_head.scratch.refinenet{i}.resConfUnit2.conv2.weight", f"neck.fusion_stage.layers.{j}.residual_layer2.convolution2.weight"))
rename_keys.append((f"depth_head.scratch.refinenet{i}.resConfUnit2.conv2.bias", f"neck.fusion_stage.layers.{j}.residual_layer2.convolution2.bias"))
for i in range(4):
rename_keys.append((f"depth_head.scratch.layer{i+1}_rn.weight", f"neck.convs.{i}.weight"))
rename_keys.append(("depth_head.scratch.output_conv1.weight", "head.conv1.weight"))
rename_keys.append(("depth_head.scratch.output_conv1.bias", "head.conv1.bias"))
rename_keys.append(("depth_head.scratch.output_conv2.0.weight", "head.conv2.weight"))
rename_keys.append(("depth_head.scratch.output_conv2.0.bias", "head.conv2.bias"))
rename_keys.append(("depth_head.scratch.output_conv2.2.weight", "head.conv3.weight"))
rename_keys.append(("depth_head.scratch.output_conv2.2.bias", "head.conv3.bias"))
return rename_keys
def read_in_q_k_v(state_dict, config):
hidden_size = config.backbone_config.hidden_size
for i in range(config.backbone_config.num_hidden_layers):
in_proj_weight = state_dict.pop(f"pretrained.blocks.{i}.attn.qkv.weight")
in_proj_bias = state_dict.pop(f"pretrained.blocks.{i}.attn.qkv.bias")
state_dict[f"backbone.encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[:hidden_size, :]
state_dict[f"backbone.encoder.layer.{i}.attention.attention.query.bias"] = in_proj_bias[:hidden_size]
state_dict[f"backbone.encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[hidden_size: hidden_size * 2, :]
state_dict[f"backbone.encoder.layer.{i}.attention.attention.key.bias"] = in_proj_bias[hidden_size: hidden_size * 2]
state_dict[f"backbone.encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[-hidden_size:, :]
state_dict[f"backbone.encoder.layer.{i}.attention.attention.value.bias"] = in_proj_bias[-hidden_size:]
def rename_key(dct, old, new):
val = dct.pop(old)
dct[new] = val
def prepare_img():
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
im = Image.open(requests.get(url, stream=True).raw)
return im
@torch.no_grad()
def convert_dpt_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub, verify_logits):
"""
Copy/paste/tweak model's weights to our DPT structure.
"""
config = get_dpt_config(model_name)
model_name_to_filename = {
"depth-anything-small": "depth_anything_vits14.pth",
"depth-anything-base": "depth_anything_vitb14.pth",
"depth-anything-large": "depth_anything_vitl14.pth",
}
filename = model_name_to_filename[model_name]
filepath = hf_hub_download(
repo_id="LiheYoung/Depth-Anything", filename=f"checkpoints/{filename}", repo_type="space"
)
state_dict = torch.load(filepath, map_location="cpu")
rename_keys = create_rename_keys(config)
for src, dest in rename_keys:
rename_key(state_dict, src, dest)
read_in_q_k_v(state_dict, config)
model = DepthAnythingForDepthEstimation(config)
model.load_state_dict(state_dict)
model.eval()
processor = DPTImageProcessor(
do_resize=True,
size={"height": 518, "width": 518},
ensure_multiple_of=14,
keep_aspect_ratio=True,
do_rescale=True,
do_normalize=True,
image_mean=[0.485, 0.456, 0.406],
image_std=[0.229, 0.224, 0.225],
)
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
pixel_values = processor(image, return_tensors="pt").pixel_values
with torch.no_grad():
outputs = model(pixel_values)
predicted_depth = outputs.predicted_depth
print("Shape of predicted depth:", predicted_depth.shape)
print("First values:", predicted_depth[0, :3, :3])
if verify_logits:
expected_shape = torch.Size([1, 518, 686])
if model_name == "depth-anything-small":
expected_slice = torch.tensor(
[[8.8204, 8.6468, 8.6195], [8.3313, 8.6027, 8.7526], [8.6526, 8.6866, 8.7453]],
)
elif model_name == "depth-anything-base":
expected_slice = torch.tensor(
[[26.3997, 26.3004, 26.3928], [26.2260, 26.2092, 26.3427], [26.0719, 26.0483, 26.1254]],
)
elif model_name == "depth-anything-large":
expected_slice = torch.tensor(
[[87.9968, 87.7493, 88.2704], [87.1927, 87.6611, 87.3640], [86.7789, 86.9469, 86.7991]]
)
else:
raise ValueError("Not supported")
assert predicted_depth.shape == torch.Size(expected_shape)
assert torch.allclose(predicted_depth[0, :3, :3], expected_slice, atol=1e-6)
print("Looks ok!")
if pytorch_dump_folder_path is not None:
Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
print(f"Saving model and processor to {pytorch_dump_folder_path}")
model.save_pretrained(pytorch_dump_folder_path)
processor.save_pretrained(pytorch_dump_folder_path)
if push_to_hub:
print("Pushing model and processor to hub...")
model.push_to_hub(repo_id=f"LiheYoung/{model_name}-hf")
processor.push_to_hub(repo_id=f"LiheYoung/{model_name}-hf")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--model_name",
default="depth-anything-small",
type=str,
choices=name_to_checkpoint.keys(),
help="Name of the model you'd like to convert.",
)
parser.add_argument(
"--pytorch_dump_folder_path",
default=None,
type=str,
help="Path to the output PyTorch model directory.",
)
parser.add_argument(
"--push_to_hub",
action="store_true",
help="Whether to push the model to the hub after conversion.",
)
parser.add_argument(
"--verify_logits",
action="store_false",
required=False,
help="Whether to verify the logits after conversion.",
)
args = parser.parse_args()
convert_dpt_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub, args.verify_logits)
.\models\depth_anything\modeling_depth_anything.py
""" PyTorch Depth Anything model."""
from typing import List, Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from ...file_utils import (
add_start_docstrings,
add_start_docstrings_to_model_forward,
replace_return_docstrings,
)
from ...modeling_outputs import DepthEstimatorOutput
from ...modeling_utils import PreTrainedModel
from ...utils import logging
from ..auto import AutoBackbone
from .configuration_depth_anything import DepthAnythingConfig
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "DepthAnythingConfig"
DEPTH_ANYTHING_PRETRAINED_MODEL_ARCHIVE_LIST = [
"LiheYoung/depth-anything-small-hf",
]
DEPTH_ANYTHING_START_DOCSTRING = r"""
This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
Parameters:
config ([`DepthAnythingConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
DEPTH_ANYTHING_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`DPTImageProcessor.__call__`]
for details.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
class DepthAnythingReassembleLayer(nn.Module):
"""
PyTorch 模块用于 Depth Anything 重组层的定义
"""
def __init__(self, config, channels, factor):
super().__init__()
self.projection = nn.Conv2d(in_channels=config.reassemble_hidden_size, out_channels=channels, kernel_size=1)
if factor > 1:
self.resize = nn.ConvTranspose2d(channels, channels, kernel_size=factor, stride=factor, padding=0)
elif factor == 1:
self.resize = nn.Identity()
elif factor < 1:
self.resize = nn.Conv2d(channels, channels, kernel_size=3, stride=int(1 / factor), padding=1)
def forward(self, hidden_state):
hidden_state = self.projection(hidden_state)
hidden_state = self.resize(hidden_state)
return hidden_state
class DepthAnythingReassembleStage(nn.Module):
"""
This class reassembles the hidden states of the backbone into image-like feature representations at various
resolutions.
This happens in 3 stages:
1. Take the patch embeddings and reshape them to image-like feature representations.
2. Project the channel dimension of the hidden states according to `config.neck_hidden_sizes`.
3. Resizing the spatial dimensions (height, width).
Args:
config (`[DepthAnythingConfig]`):
Model configuration class defining the model architecture.
"""
def __init__(self, config):
super().__init__()
self.config = config
self.layers = nn.ModuleList()
for channels, factor in zip(config.neck_hidden_sizes, config.reassemble_factors):
self.layers.append(DepthAnythingReassembleLayer(config, channels=channels, factor=factor))
def forward(self, hidden_states: List[torch.Tensor], patch_height=None, patch_width=None) -> List[torch.Tensor]:
"""
Args:
hidden_states (`List[torch.FloatTensor]`, each of shape `(batch_size, sequence_length + 1, hidden_size)`):
List of hidden states from the backbone.
"""
out = []
for i, hidden_state in enumerate(hidden_states):
hidden_state = hidden_state[:, 1:]
batch_size, _, num_channels = hidden_state.shape
hidden_state = hidden_state.reshape(batch_size, patch_height, patch_width, num_channels)
hidden_state = hidden_state.permute(0, 3, 1, 2).contiguous()
hidden_state = self.layers[i](hidden_state)
out.append(hidden_state)
return out
class DepthAnythingPreActResidualLayer(nn.Module):
"""
ResidualConvUnit, pre-activate residual unit.
Args:
config (`[DepthAnythingConfig]`):
Model configuration class defining the model architecture.
"""
def __init__(self, config):
super().__init__()
self.activation1 = nn.ReLU()
self.convolution1 = nn.Conv2d(
config.fusion_hidden_size,
config.fusion_hidden_size,
kernel_size=3,
stride=1,
padding=1,
bias=True,
)
self.activation2 = nn.ReLU()
self.convolution2 = nn.Conv2d(
config.fusion_hidden_size,
config.fusion_hidden_size,
kernel_size=3,
stride=1,
padding=1,
bias=True,
)
def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
residual = hidden_state
hidden_state = self.activation1(hidden_state)
hidden_state = self.convolution1(hidden_state)
hidden_state = self.activation2(hidden_state)
hidden_state = self.convolution2(hidden_state)
return hidden_state + residual
class DepthAnythingFeatureFusionLayer(nn.Module):
"""Feature fusion layer, merges feature maps from different stages.
Args:
config (`[DepthAnythingConfig]`):
Model configuration class defining the model architecture.
"""
def __init__(self, config):
super().__init__()
self.projection = nn.Conv2d(config.fusion_hidden_size, config.fusion_hidden_size, kernel_size=1, bias=True)
self.residual_layer1 = DepthAnythingPreActResidualLayer(config)
self.residual_layer2 = DepthAnythingPreActResidualLayer(config)
def forward(self, hidden_state, residual=None, size=None):
if residual is not None:
if hidden_state.shape != residual.shape:
residual = nn.functional.interpolate(
residual, size=(hidden_state.shape[2], hidden_state.shape[3]), mode="bilinear", align_corners=False
)
hidden_state = hidden_state + self.residual_layer1(residual)
hidden_state = self.residual_layer2(hidden_state)
modifier = {"scale_factor": 2} if size is None else {"size": size}
hidden_state = nn.functional.interpolate(
hidden_state,
**modifier,
mode="bilinear",
align_corners=True,
)
hidden_state = self.projection(hidden_state)
return hidden_state
class DepthAnythingFeatureFusionStage(nn.Module):
def __init__(self, config):
super().__init__()
self.layers = nn.ModuleList()
for _ in range(len(config.neck_hidden_sizes)):
self.layers.append(DepthAnythingFeatureFusionLayer(config))
def forward(self, hidden_states, size=None):
hidden_states = hidden_states[::-1]
fused_hidden_states = []
size = hidden_states[1].shape[2:]
fused_hidden_state = self.layers[0](hidden_states[0], size=size)
fused_hidden_states.append(fused_hidden_state)
for idx, (hidden_state, layer) in enumerate(zip(hidden_states[1:], self.layers[1:])):
size = hidden_states[1:][idx + 1].shape[2:] if idx != (len(hidden_states[1:]) - 1) else None
fused_hidden_state = layer(fused_hidden_state, hidden_state, size=size)
fused_hidden_states.append(fused_hidden_state)
return fused_hidden_states
class DepthAnythingPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = DepthAnythingConfig
base_model_prefix = "depth_anything"
main_input_name = "pixel_values"
supports_gradient_checkpointing = True
def _init_weights(self, module):
"""Initialize the weights"""
if isinstance(module, (nn.Linear, nn.Conv2d, nn.ConvTranspose2d)):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
class DepthAnythingNeck(nn.Module):
"""
DepthAnythingNeck. A neck is a module that is normally used between the backbone and the head. It takes a list of tensors as
input and produces another list of tensors as output. For DepthAnything, it includes 2 stages:
* DepthAnythingReassembleStage
* DepthAnythingFeatureFusionStage.
Args:
config (dict): config dict.
"""
def __init__(self, config):
super().__init__()
self.config = config
self.reassemble_stage = DepthAnythingReassembleStage(config)
self.convs = nn.ModuleList()
for channel in config.neck_hidden_sizes:
self.convs.append(nn.Conv2d(channel, config.fusion_hidden_size, kernel_size=3, padding=1, bias=False))
self.fusion_stage = DepthAnythingFeatureFusionStage(config)
def forward(self, hidden_states: List[torch.Tensor], patch_height=None, patch_width=None) -> List[torch.Tensor]:
"""
Args:
hidden_states (`List[torch.FloatTensor]`, each of shape `(batch_size, sequence_length, hidden_size)` or `(batch_size, hidden_size, height, width)`):
List of hidden states from the backbone.
"""
if not isinstance(hidden_states, (tuple, list)):
raise ValueError("hidden_states should be a tuple or list of tensors")
if len(hidden_states) != len(self.config.neck_hidden_sizes):
raise ValueError("The number of hidden states should be equal to the number of neck hidden sizes.")
hidden_states = self.reassemble_stage(hidden_states, patch_height, patch_width)
features = [self.convs[i](feature) for i, feature in enumerate(hidden_states)]
output = self.fusion_stage(features)
return output
class DepthAnythingDepthEstimationHead(nn.Module):
"""
Output head consisting of 3 convolutional layers. It progressively halves the feature dimension and upsamples
the predictions to the input resolution after the first convolutional layer (details can be found in the DPT paper's
supplementary material).
"""
def __init__(self, config):
super().__init__()
self.head_in_index = config.head_in_index
self.patch_size = config.patch_size
features = config.fusion_hidden_size
self.conv1 = nn.Conv2d(features, features // 2, kernel_size=3, stride=1, padding=1)
self.conv2 = nn.Conv2d(features // 2, config.head_hidden_size, kernel_size=3, stride=1, padding=1)
self.activation1 = nn.ReLU()
self.conv3 = nn.Conv2d(config.head_hidden_size, 1, kernel_size=1, stride=1, padding=0)
self.activation2 = nn.ReLU()
hidden_states = hidden_states[self.head_in_index]
predicted_depth = self.conv1(hidden_states)
predicted_depth = nn.functional.interpolate(
predicted_depth,
(int(patch_height * self.patch_size), int(patch_width * self.patch_size)),
mode="bilinear",
align_corners=True,
)
predicted_depth = self.conv2(predicted_depth)
predicted_depth = self.activation1(predicted_depth)
predicted_depth = self.conv3(predicted_depth)
predicted_depth = self.activation2(predicted_depth)
predicted_depth = predicted_depth.squeeze(dim=1)
return predicted_depth
@add_start_docstrings(
"""
Depth Anything Model with a depth estimation head on top (consisting of 3 convolutional layers) e.g. for KITTI, NYUv2.
""",
DEPTH_ANYTHING_START_DOCSTRING,
)
class DepthAnythingForDepthEstimation(DepthAnythingPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.backbone = AutoBackbone.from_config(config.backbone_config)
self.neck = DepthAnythingNeck(config)
self.head = DepthAnythingDepthEstimationHead(config)
self.post_init()
@add_start_docstrings_to_model_forward(DEPTH_ANYTHING_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=DepthEstimatorOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
pixel_values: torch.FloatTensor,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
.\models\depth_anything\__init__.py
from typing import TYPE_CHECKING
from ...file_utils import _LazyModule, is_torch_available
from ...utils import OptionalDependencyNotAvailable
_import_structure = {
"configuration_depth_anything": ["DEPTH_ANYTHING_PRETRAINED_CONFIG_ARCHIVE_MAP", "DepthAnythingConfig"]
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_depth_anything"] = [
"DEPTH_ANYTHING_PRETRAINED_MODEL_ARCHIVE_LIST",
"DepthAnythingForDepthEstimation",
"DepthAnythingPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_depth_anything import DEPTH_ANYTHING_PRETRAINED_CONFIG_ARCHIVE_MAP, DepthAnythingConfig
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_depth_anything import (
DEPTH_ANYTHING_PRETRAINED_MODEL_ARCHIVE_LIST,
DepthAnythingForDepthEstimation,
DepthAnythingPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\deta\configuration_deta.py
""" DETA model configuration"""
from ...configuration_utils import PretrainedConfig
from ...utils import logging
from ..auto import CONFIG_MAPPING
logger = logging.get_logger(__name__)
DETA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"ut/deta": "https://huggingface.co/ut/deta/resolve/main/config.json",
}
class DetaConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`DetaModel`]. It is used to instantiate a DETA
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the DETA
[SenseTime/deformable-detr](https://huggingface.co/SenseTime/deformable-detr) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Examples:
```
>>> from transformers import DetaConfig, DetaModel
>>> # Initializing a DETA SenseTime/deformable-detr style configuration
>>> configuration = DetaConfig()
>>> # Initializing a model (with random weights) from the SenseTime/deformable-detr style configuration
>>> model = DetaModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
model_type = "deta"
attribute_map = {
"hidden_size": "d_model",
"num_attention_heads": "encoder_attention_heads",
}
def __init__(
self,
backbone_config=None,
backbone=None,
use_pretrained_backbone=False,
use_timm_backbone=False,
backbone_kwargs=None,
num_queries=900,
max_position_embeddings=2048,
encoder_layers=6,
encoder_ffn_dim=2048,
encoder_attention_heads=8,
decoder_layers=6,
decoder_ffn_dim=1024,
decoder_attention_heads=8,
encoder_layerdrop=0.0,
is_encoder_decoder=True,
activation_function="relu",
d_model=256,
dropout=0.1,
attention_dropout=0.0,
activation_dropout=0.0,
init_std=0.02,
init_xavier_std=1.0,
return_intermediate=True,
auxiliary_loss=False,
position_embedding_type="sine",
num_feature_levels=5,
encoder_n_points=4,
decoder_n_points=4,
two_stage=True,
two_stage_num_proposals=300,
with_box_refine=True,
assign_first_stage=True,
assign_second_stage=True,
class_cost=1,
bbox_cost=5,
giou_cost=2,
mask_loss_coefficient=1,
dice_loss_coefficient=1,
bbox_loss_coefficient=5,
giou_loss_coefficient=2,
eos_coefficient=0.1,
focal_alpha=0.25,
disable_custom_kernels=True,
**kwargs,
):
@property
def num_attention_heads(self) -> int:
return self.encoder_attention_heads
@property
def hidden_size(self) -> int:
return self.d_model
.\models\deta\convert_deta_resnet_to_pytorch.py
"""Convert DETA checkpoints from the original repository.
URL: https://github.com/jozhang97/DETA/tree/master"""
import argparse
import json
from pathlib import Path
import requests
import torch
from huggingface_hub import cached_download, hf_hub_download, hf_hub_url
from PIL import Image
from transformers import DetaConfig, DetaForObjectDetection, DetaImageProcessor
from transformers.utils import logging
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
def get_deta_config():
config = DetaConfig(
num_queries=900,
encoder_ffn_dim=2048,
decoder_ffn_dim=2048,
num_feature_levels=5,
assign_first_stage=True,
with_box_refine=True,
two_stage=True,
)
config.num_labels = 91
repo_id = "huggingface/label-files"
filename = "coco-detection-id2label.json"
id2label = json.load(open(cached_download(hf_hub_url(repo_id, filename, repo_type="dataset")), "r"))
id2label = {int(k): v for k, v in id2label.items()}
config.id2label = id2label
config.label2id = {v: k for k, v in id2label.items()}
return config
def create_rename_keys(config):
rename_keys = []
rename_keys.append(("backbone.0.body.conv1.weight", "model.backbone.model.embedder.embedder.convolution.weight"))
rename_keys.append(("backbone.0.body.bn1.weight", "model.backbone.model.embedder.embedder.normalization.weight"))
rename_keys.append(("backbone.0.body.bn1.bias", "model.backbone.model.embedder.embedder.normalization.bias"))
rename_keys.append(("backbone.0.body.bn1.running_mean", "model.backbone.model.embedder.embedder.normalization.running_mean"))
rename_keys.append(("backbone.0.body.bn1.running_var", "model.backbone.model.embedder.embedder.normalization.running_var"))
for i in range(config.encoder_layers):
rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.sampling_offsets.weight", f"model.encoder.layers.{i}.self_attn.sampling_offsets.weight"))
rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.sampling_offsets.bias", f"model.encoder.layers.{i}.self_attn.sampling_offsets.bias"))
rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.attention_weights.weight", f"model.encoder.layers.{i}.self_attn.attention_weights.weight"))
rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.attention_weights.bias", f"model.encoder.layers.{i}.self_attn.attention_weights.bias"))
rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.value_proj.weight", f"model.encoder.layers.{i}.self_attn.value_proj.weight"))
rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.value_proj.bias", f"model.encoder.layers.{i}.self_attn.value_proj.bias"))
rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.output_proj.weight", f"model.encoder.layers.{i}.self_attn.output_proj.weight"))
rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.output_proj.bias", f"model.encoder.layers.{i}.self_attn.output_proj.bias"))
rename_keys.append((f"transformer.encoder.layers.{i}.norm1.weight", f"model.encoder.layers.{i}.self_attn_layer_norm.weight"))
rename_keys.append((f"transformer.encoder.layers.{i}.norm1.bias", f"model.encoder.layers.{i}.self_attn_layer_norm.bias"))
rename_keys.append((f"transformer.encoder.layers.{i}.linear1.weight", f"model.encoder.layers.{i}.fc1.weight"))
rename_keys.append((f"transformer.encoder.layers.{i}.linear1.bias", f"model.encoder.layers.{i}.fc1.bias"))
rename_keys.append((f"transformer.encoder.layers.{i}.linear2.weight", f"model.encoder.layers.{i}.fc2.weight"))
rename_keys.append((f"transformer.encoder.layers.{i}.linear2.bias", f"model.encoder.layers.{i}.fc2.bias"))
rename_keys.append((f"transformer.encoder.layers.{i}.norm2.weight", f"model.encoder.layers.{i}.final_layer_norm.weight"))
rename_keys.append((f"transformer.encoder.layers.{i}.norm2.bias", f"model.encoder.layers.{i}.final_layer_norm.bias"))
for i in range(config.decoder_layers):
rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.sampling_offsets.weight", f"model.decoder.layers.{i}.encoder_attn.sampling_offsets.weight"))
rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.sampling_offsets.bias", f"model.decoder.layers.{i}.encoder_attn.sampling_offsets.bias"))
rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.attention_weights.weight", f"model.decoder.layers.{i}.encoder_attn.attention_weights.weight"))
rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.attention_weights.bias", f"model.decoder.layers.{i}.encoder_attn.attention_weights.bias"))
rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.value_proj.weight", f"model.decoder.layers.{i}.encoder_attn.value_proj.weight"))
rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.value_proj.bias", f"model.decoder.layers.{i}.encoder_attn.value_proj.bias"))
rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.output_proj.weight", f"model.decoder.layers.{i}.encoder_attn.output_proj.weight"))
rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.output_proj.bias", f"model.decoder.layers.{i}.encoder_attn.output_proj.bias"))
rename_keys.append((f"transformer.decoder.layers.{i}.norm1.weight", f"model.decoder.layers.{i}.encoder_attn_layer_norm.weight"))
rename_keys.append((f"transformer.decoder.layers.{i}.norm1.bias", f"model.decoder.layers.{i}.encoder_attn_layer_norm.bias"))
rename_keys.append((f"transformer.decoder.layers.{i}.self_attn.out_proj.weight", f"model.decoder.layers.{i}.self_attn.out_proj.weight"))
rename_keys.append((f"transformer.decoder.layers.{i}.self_attn.out_proj.bias", f"model.decoder.layers.{i}.self_attn.out_proj.bias"))
rename_keys.append((f"transformer.decoder.layers.{i}.norm2.weight", f"model.decoder.layers.{i}.self_attn_layer_norm.weight"))
rename_keys.append((f"transformer.decoder.layers.{i}.norm2.bias", f"model.decoder.layers.{i}.self_attn_layer_norm.bias"))
rename_keys.append((f"transformer.decoder.layers.{i}.linear1.weight", f"model.decoder.layers.{i}.fc1.weight"))
rename_keys.append((f"transformer.decoder.layers.{i}.linear1.bias", f"model.decoder.layers.{i}.fc1.bias"))
rename_keys.append((f"transformer.decoder.layers.{i}.linear2.weight", f"model.decoder.layers.{i}.fc2.weight"))
rename_keys.append((f"transformer.decoder.layers.{i}.linear2.bias", f"model.decoder.layers.{i}.fc2.bias"))
rename_keys.append((f"transformer.decoder.layers.{i}.norm3.weight", f"model.decoder.layers.{i}.final_layer_norm.weight"))
rename_keys.append((f"transformer.decoder.layers.{i}.norm3.bias", f"model.decoder.layers.{i}.final_layer_norm.bias"))
return rename_keys
def rename_key(dct, old, new):
val = dct.pop(old)
dct[new] = val
def read_in_decoder_q_k_v(state_dict, config):
hidden_size = config.d_model
for i in range(config.decoder_layers):
in_proj_weight = state_dict.pop(f"transformer.decoder.layers.{i}.self_attn.in_proj_weight")
in_proj_bias = state_dict.pop(f"transformer.decoder.layers.{i}.self_attn.in_proj_bias")
state_dict[f"model.decoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:hidden_size, :]
state_dict[f"model.decoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:hidden_size]
state_dict[f"model.decoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[hidden_size:2*hidden_size, :]
state_dict[f"model.decoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[hidden_size:2*hidden_size]
state_dict[f"model.decoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-hidden_size:, :]
state_dict[f"model.decoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-hidden_size:]
def prepare_img():
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
im = Image.open(requests.get(url, stream=True).raw)
return im
@torch.no_grad()
def convert_deta_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub):
"""
复制/粘贴/调整模型权重到我们的 DETA 结构中。
"""
config = get_deta_config()
if model_name == "deta-resnet-50":
filename = "adet_checkpoint0011.pth"
elif model_name == "deta-resnet-50-24-epochs":
filename = "adet_2x_checkpoint0023.pth"
else:
raise ValueError(f"Model name {model_name} not supported")
checkpoint_path = hf_hub_download(repo_id="nielsr/deta-checkpoints", filename=filename)
state_dict = torch.load(checkpoint_path, map_location="cpu")["model"]
rename_keys = create_rename_keys(config)
for src, dest in rename_keys:
rename_key(state_dict, src, dest)
read_in_decoder_q_k_v(state_dict, config)
for key in state_dict.copy().keys():
if "transformer.decoder.class_embed" in key or "transformer.decoder.bbox_embed" in key:
val = state_dict.pop(key)
state_dict[key.replace("transformer.decoder", "model.decoder")] = val
if "input_proj" in key:
val = state_dict.pop(key)
state_dict["model." + key] = val
if "level_embed" in key or "pos_trans" in key or "pix_trans" in key or "enc_output" in key:
val = state_dict.pop(key)
state_dict[key.replace("transformer", "model")] = val
model = DetaForObjectDetection(config)
model.load_state_dict(state_dict)
model.eval()
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
processor = DetaImageProcessor(format="coco_detection")
img = prepare_img()
encoding = processor(images=img, return_tensors="pt")
pixel_values = encoding["pixel_values"]
outputs = model(pixel_values.to(device))
if model_name == "deta-resnet-50":
expected_logits = torch.tensor(
[[-7.3978, -2.5406, -4.1668], [-8.2684, -3.9933, -3.8096], [-7.0515, -3.7973, -5.8516]]
)
expected_boxes = torch.tensor([[0.5043, 0.4973, 0.9998], [0.2542, 0.5489, 0.4748], [0.5490, 0.2765, 0.0570]])
elif model_name == "deta-resnet-50-24-epochs":
expected_logits = torch.tensor(
[[-7.1688, -2.4857, -4.8669], [-7.8630, -3.8154, -4.2674], [-7.2730, -4.1865, -5.5323]]
)
expected_boxes = torch.tensor([[0.5021, 0.4971, 0.9994], [0.2546, 0.5486, 0.4731], [0.1686, 0.1986, 0.2142]])
assert torch.allclose(outputs.logits[0, :3, :3], expected_logits.to(device), atol=1e-4)
assert torch.allclose(outputs.pred_boxes[0, :3, :3], expected_boxes.to(device), atol=1e-4)
print("Everything ok!")
if pytorch_dump_folder_path:
logger.info(f"Saving PyTorch model and processor to {pytorch_dump_folder_path}...")
Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
model.save_pretrained(pytorch_dump_folder_path)
processor.save_pretrained(pytorch_dump_folder_path)
if push_to_hub:
print("Pushing model and processor to hub...")
model.push_to_hub(f"jozhang97/{model_name}")
processor.push_to_hub(f"jozhang97/{model_name}")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--model_name",
type=str,
default="deta-resnet-50",
choices=["deta-resnet-50", "deta-resnet-50-24-epochs"],
help="Name of the model you'd like to convert.",
)
parser.add_argument(
"--pytorch_dump_folder_path",
default=None,
type=str,
help="Path to the folder to output PyTorch model.",
)
parser.add_argument(
"--push_to_hub",
action="store_true",
help="Whether or not to push the converted model to the 🤗 hub."
)
args = parser.parse_args()
convert_deta_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub)
.\models\deta\convert_deta_swin_to_pytorch.py
def get_deta_config(model_name):
backbone_config = SwinConfig(
embed_dim=192,
depths=(2, 2, 18, 2),
num_heads=(6, 12, 24, 48),
window_size=12,
out_features=["stage2", "stage3", "stage4"],
)
config = DetaConfig(
backbone_config=backbone_config,
num_queries=900,
encoder_ffn_dim=2048,
decoder_ffn_dim=2048,
num_feature_levels=5,
assign_first_stage=True,
with_box_refine=True,
two_stage=True,
)
repo_id = "huggingface/label-files"
if "o365" in model_name:
num_labels = 366
filename = "object365-id2label.json"
else:
num_labels = 91
filename = "coco-detection-id2label.json"
id2label = json.load(open(cached_download(hf_hub_url(repo_id, filename, repo_type="dataset")), "r"))
id2label = {int(k): v for k, v in id2label.items()}
config.num_labels = num_labels
config.id2label = id2label
config.label2id = {v: k for k, v in id2label.items()}
return config
def create_rename_keys(config):
rename_keys = []
rename_keys.append(("backbone.0.body.patch_embed.proj.weight", "model.backbone.model.embeddings.patch_embeddings.projection.weight"))
rename_keys.append(("backbone.0.body.patch_embed.proj.bias", "model.backbone.model.embeddings.patch_embeddings.projection.bias"))
rename_keys.append(("backbone.0.body.patch_embed.norm.weight", "model.backbone.model.embeddings.norm.weight"))
rename_keys.append(("backbone.0.body.patch_embed.norm.bias", "model.backbone.model.embeddings.norm.bias"))
for i in range(len(config.backbone_config.depths)):
for j in range(config.backbone_config.depths[i]):
rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.norm1.weight", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.layernorm_before.weight"))
rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.norm1.bias", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.layernorm_before.bias"))
rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.attn.relative_position_bias_table", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.self.relative_position_bias_table"))
rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.attn.relative_position_index", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.self.relative_position_index"))
rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.attn.proj.weight", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.output.dense.weight"))
rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.attn.proj.bias", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.output.dense.bias"))
rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.norm2.weight", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.layernorm_after.weight"))
rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.norm2.bias", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.layernorm_after.bias"))
rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.mlp.fc1.weight", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.intermediate.dense.weight"))
rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.mlp.fc1.bias", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.intermediate.dense.bias"))
rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.mlp.fc2.weight", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.output.dense.weight"))
rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.mlp.fc2.bias", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.output.dense.bias"))
if i < 3:
rename_keys.append((f"backbone.0.body.layers.{i}.downsample.reduction.weight", f"model.backbone.model.encoder.layers.{i}.downsample.reduction.weight"))
rename_keys.append((f"backbone.0.body.layers.{i}.downsample.norm.weight", f"model.backbone.model.encoder.layers.{i}.downsample.norm.weight"))
rename_keys.append((f"backbone.0.body.layers.{i}.downsample.norm.bias", f"model.backbone.model.encoder.layers.{i}.downsample.norm.bias"))
rename_keys.append(("backbone.0.body.norm1.weight", "model.backbone.model.hidden_states_norms.stage2.weight"))
rename_keys.append(("backbone.0.body.norm1.bias", "model.backbone.model.hidden_states_norms.stage2.bias"))
rename_keys.append(("backbone.0.body.norm2.weight", "model.backbone.model.hidden_states_norms.stage3.weight"))
rename_keys.append(("backbone.0.body.norm2.bias", "model.backbone.model.hidden_states_norms.stage3.bias"))
rename_keys.append(("backbone.0.body.norm3.weight", "model.backbone.model.hidden_states_norms.stage4.weight"))
rename_keys.append(("backbone.0.body.norm3.bias", "model.backbone.model.hidden_states_norms.stage4.bias"))
for i in range(config.encoder_layers):
rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.sampling_offsets.weight", f"model.encoder.layers.{i}.self_attn.sampling_offsets.weight"))
rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.sampling_offsets.bias", f"model.encoder.layers.{i}.self_attn.sampling_offsets.bias"))
rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.attention_weights.weight", f"model.encoder.layers.{i}.self_attn.attention_weights.weight"))
rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.attention_weights.bias", f"model.encoder.layers.{i}.self_attn.attention_weights.bias"))
rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.value_proj.weight", f"model.encoder.layers.{i}.self_attn.value_proj.weight"))
rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.value_proj.bias", f"model.encoder.layers.{i}.self_attn.value_proj.bias"))
rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.output_proj.weight", f"model.encoder.layers.{i}.self_attn.output_proj.weight"))
rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.output_proj.bias", f"model.encoder.layers.{i}.self_attn.output_proj.bias"))
rename_keys.append((f"transformer.encoder.layers.{i}.norm1.weight", f"model.encoder.layers.{i}.self_attn_layer_norm.weight"))
rename_keys.append((f"transformer.encoder.layers.{i}.norm1.bias", f"model.encoder.layers.{i}.self_attn_layer_norm.bias"))
rename_keys.append((f"transformer.encoder.layers.{i}.linear1.weight", f"model.encoder.layers.{i}.fc1.weight"))
rename_keys.append((f"transformer.encoder.layers.{i}.linear1.bias", f"model.encoder.layers.{i}.fc1.bias"))
rename_keys.append((f"transformer.encoder.layers.{i}.linear2.weight", f"model.encoder.layers.{i}.fc2.weight"))
rename_keys.append((f"transformer.encoder.layers.{i}.linear2.bias", f"model.encoder.layers.{i}.fc2.bias"))
rename_keys.append((f"transformer.encoder.layers.{i}.norm2.weight", f"model.encoder.layers.{i}.final_layer_norm.weight"))
rename_keys.append((f"transformer.encoder.layers.{i}.norm2.bias", f"model.encoder.layers.{i}.final_layer_norm.bias"))
for i in range(config.decoder_layers):
rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.sampling_offsets.weight", f"model.decoder.layers.{i}.encoder_attn.sampling_offsets.weight"))
rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.sampling_offsets.bias", f"model.decoder.layers.{i}.encoder_attn.sampling_offsets.bias"))
rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.attention_weights.weight", f"model.decoder.layers.{i}.encoder_attn.attention_weights.weight"))
rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.attention_weights.bias", f"model.decoder.layers.{i}.encoder_attn.attention_weights.bias"))
rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.value_proj.weight", f"model.decoder.layers.{i}.encoder_attn.value_proj.weight"))
rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.value_proj.bias", f"model.decoder.layers.{i}.encoder_attn.value_proj.bias"))
rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.output_proj.weight", f"model.decoder.layers.{i}.encoder_attn.output_proj.weight"))
rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.output_proj.bias", f"model.decoder.layers.{i}.encoder_attn.output_proj.bias"))
rename_keys.append((f"transformer.decoder.layers.{i}.norm1.weight", f"model.decoder.layers.{i}.encoder_attn_layer_norm.weight"))
rename_keys.append((f"transformer.decoder.layers.{i}.norm1.bias", f"model.decoder.layers.{i}.encoder_attn_layer_norm.bias"))
rename_keys.append((f"transformer.decoder.layers.{i}.self_attn.out_proj.weight", f"model.decoder.layers.{i}.self_attn.out_proj.weight"))
rename_keys.append((f"transformer.decoder.layers.{i}.self_attn.out_proj.bias", f"model.decoder.layers.{i}.self_attn.out_proj.bias"))
rename_keys.append((f"transformer.decoder.layers.{i}.norm2.weight", f"model.decoder.layers.{i}.self_attn_layer_norm.weight"))
rename_keys.append((f"transformer.decoder.layers.{i}.norm2.bias", f"model.decoder.layers.{i}.self_attn_layer_norm.bias"))
rename_keys.append((f"transformer.decoder.layers.{i}.linear1.weight", f"model.decoder.layers.{i}.fc1.weight"))
rename_keys.append((f"transformer.decoder.layers.{i}.linear1.bias", f"model.decoder.layers.{i}.fc1.bias"))
rename_keys.append((f"transformer.decoder.layers.{i}.linear2.weight", f"model.decoder.layers.{i}.fc2.weight"))
rename_keys.append((f"transformer.decoder.layers.{i}.linear2.bias", f"model.decoder.layers.{i}.fc2.bias"))
rename_keys.append((f"transformer.decoder.layers.{i}.norm3.weight", f"model.decoder.layers.{i}.final_layer_norm.weight"))
rename_keys.append((f"transformer.decoder.layers.{i}.norm3.bias", f"model.decoder.layers.{i}.final_layer_norm.bias"))
return rename_keys
def rename_key(dct, old, new):
val = dct.pop(old)
dct[new] = val
def read_in_swin_q_k_v(state_dict, backbone_config):
num_features = [int(backbone_config.embed_dim * 2**i) for i in range(len(backbone_config.depths))]
for i in range(len(backbone_config.depths)):
dim = num_features[i]
for j in range(backbone_config.depths[i]):
in_proj_weight = state_dict.pop(f"backbone.0.body.layers.{i}.blocks.{j}.attn.qkv.weight")
in_proj_bias = state_dict.pop(f"backbone.0.body.layers.{i}.blocks.{j}.attn.qkv.bias")
state_dict[f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.self.query.weight"] = in_proj_weight[:dim, :]
state_dict[f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.self.query.bias"] = in_proj_bias[: dim]
state_dict[f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.self.key.weight"] = in_proj_weight[
dim : dim * 2, :
]
state_dict[f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.self.key.bias"] = in_proj_bias[
dim : dim * 2
]
state_dict[f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.self.value.weight"] = in_proj_weight[
-dim :, :
]
state_dict[f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.self.value.bias"] = in_proj_bias[-dim :]
def read_in_decoder_q_k_v(state_dict, config):
hidden_size = config.d_model
for i in range(config.decoder_layers):
in_proj_weight = state_dict.pop(f"transformer.decoder.layers.{i}.self_attn.in_proj_weight")
in_proj_bias = state_dict.pop(f"transformer.decoder.layers.{i}.self_attn.in_proj_bias")
state_dict[f"model.decoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:hidden_size, :]
state_dict[f"model.decoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:hidden_size]
state_dict[f"model.decoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[
hidden_size : hidden_size * 2, :
]
state_dict[f"model.decoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[hidden_size : hidden_size * 2]
state_dict[f"model.decoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-hidden_size:, :]
state_dict[f"model.decoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-hidden_size:]
def prepare_img():
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
im = Image.open(requests.get(url, stream=True).raw)
return im
@torch.no_grad()
def convert_deta_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub):
"""
复制/粘贴/调整模型的权重到我们的 DETA 结构中。
"""
config = get_deta_config(model_name)
if model_name == "deta-swin-large":
checkpoint_path = hf_hub_download(repo_id="nielsr/deta-checkpoints", filename="adet_swin_ft.pth")
elif model_name == "deta-swin-large-o365":
checkpoint_path = hf_hub_download(repo_id="jozhang97/deta-swin-l-o365", filename="deta_swin_pt_o365.pth")
else:
raise ValueError(f"Model name {model_name} not supported")
state_dict = torch.load(checkpoint_path, map_location="cpu")["model"]
for name, param in state_dict.items():
print(name, param.shape)
rename_keys = create_rename_keys(config)
for src, dest in rename_keys:
rename_key(state_dict, src, dest)
read_in_swin_q_k_v(state_dict, config.backbone_config)
read_in_decoder_q_k_v(state_dict, config)
for key in state_dict.copy().keys():
if "transformer.decoder.class_embed" in key or "transformer.decoder.bbox_embed" in key:
val = state_dict.pop(key)
state_dict[key.replace("transformer.decoder", "model.decoder")] = val
if "input_proj" in key:
val = state_dict.pop(key)
state_dict["model." + key] = val
if "level_embed" in key or "pos_trans" in key or "pix_trans" in key or "enc_output" in key:
val = state_dict.pop(key)
state_dict[key.replace("transformer", "model")] = val
model = DetaForObjectDetection(config)
model.load_state_dict(state_dict)
model.eval()
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
processor = DetaImageProcessor(format="coco_detection")
img = prepare_img()
encoding = processor(images=img, return_tensors="pt")
pixel_values = encoding["pixel_values"]
outputs = model(pixel_values.to(device))
print("Logits:", outputs.logits[0, :3, :3])
print("Boxes:", outputs.pred_boxes[0, :3, :3])
if model_name == "deta-swin-large":
expected_logits = torch.tensor(
[[-7.6308, -2.8485, -5.3737], [-7.2037, -4.5505, -4.8027], [-7.2943, -4.2611, -4.6617]]
)
expected_boxes = torch.tensor([[0.4987, 0.4969, 0.9999], [0.2549, 0.5498, 0.4805], [0.5498, 0.2757, 0.0569]])
expected_logits = torch.tensor(
[[-8.0122, -3.5720, -4.9717], [-8.1547, -3.6886, -4.6389], [-7.6610, -3.6194, -5.0134]]
)
expected_boxes = torch.tensor([[0.2523, 0.5549, 0.4881], [0.7715, 0.4149, 0.4601], [0.5503, 0.2753, 0.0575]])
assert torch.allclose(outputs.logits[0, :3, :3], expected_logits.to(device), atol=1e-4)
assert torch.allclose(outputs.pred_boxes[0, :3, :3], expected_boxes.to(device), atol=1e-4)
print("Everything ok!")
if pytorch_dump_folder_path:
logger.info(f"Saving PyTorch model and processor to {pytorch_dump_folder_path}...")
Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
model.save_pretrained(pytorch_dump_folder_path)
processor.save_pretrained(pytorch_dump_folder_path)
if push_to_hub:
print("Pushing model and processor to hub...")
model.push_to_hub(f"jozhang97/{model_name}")
processor.push_to_hub(f"jozhang97/{model_name}")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--model_name",
type=str,
default="deta-swin-large",
choices=["deta-swin-large", "deta-swin-large-o365"],
help="Name of the model you'd like to convert.",
)
parser.add_argument(
"--pytorch_dump_folder_path",
default=None,
type=str,
help="Path to the folder to output PyTorch model.",
)
parser.add_argument(
"--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
)
args = parser.parse_args()
convert_deta_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub)
.\models\deta\image_processing_deta.py
"""Image processor class for Deformable DETR."""
import pathlib
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
import numpy as np
from ...feature_extraction_utils import BatchFeature
from ...image_processing_utils import BaseImageProcessor, get_size_dict
from ...image_transforms import (
PaddingMode,
center_to_corners_format,
corners_to_center_format,
pad,
rescale,
resize,
rgb_to_id,
to_channel_dimension_format,
)
from ...image_utils import (
IMAGENET_DEFAULT_MEAN,
IMAGENET_DEFAULT_STD,
AnnotationFormat,
AnnotationType,
ChannelDimension,
ImageInput,
PILImageResampling,
get_image_size,
infer_channel_dimension_format,
is_batched,
is_scaled_image,
to_numpy_array,
valid_images,
validate_annotations,
validate_preprocess_arguments,
)
from ...utils import (
is_flax_available,
is_jax_tensor,
is_tf_available,
is_tf_tensor,
is_torch_available,
is_torch_tensor,
is_torchvision_available,
is_vision_available,
logging,
)
from ...utils.generic import TensorType
if is_torch_available():
import torch
if is_torchvision_available():
from torchvision.ops.boxes import batched_nms
if is_vision_available():
import PIL
logger = logging.get_logger(__name__)
SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)
def get_size_with_aspect_ratio(image_size, size, max_size=None) -> Tuple[int, int]:
"""
Computes the output image size given the input image size and the desired output size.
Args:
image_size (`Tuple[int, int]`):
The input image size.
size (`int`):
The desired output size.
max_size (`int`, *optional*):
The maximum allowed output size.
"""
height, width = image_size
if max_size is not None:
min_original_size = float(min((height, width)))
max_original_size = float(max((height, width)))
if max_original_size / min_original_size * size > max_size:
size = int(round(max_size * min_original_size / max_original_size))
return int(round(height * size / min((height, width)))), int(round(width * size / min((height, width))))
if (height <= width and height == size) or (width <= height and width == size):
return height, width
if width < height:
ow = size
oh = int(size * height / width)
else:
oh = size
ow = int(size * width / height)
return (oh, ow)
def get_resize_output_image_size(
input_image: np.ndarray,
size: Union[int, Tuple[int, int], List[int]],
max_size: Optional[int] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> Tuple[int, int]:
"""
Computes the output image size given the input image size and the desired output size. If the desired output size
is a tuple or list, the output image size is returned as is. If the desired output size is an integer, the output
image size is computed by keeping the aspect ratio of the input image size.
Args:
input_image (`np.ndarray`):
The image to resize.
size (`int` or `Tuple[int, int]` or `List[int]`):
The desired output size.
max_size (`int`, *optional*):
The maximum allowed output size.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred from the input image.
"""
image_size = get_image_size(input_image, input_data_format)
if isinstance(size, (list, tuple)):
return size
return get_size_with_aspect_ratio(image_size, size, max_size)
def get_numpy_to_framework_fn(arr) -> Callable:
"""
Returns a function that converts a numpy array to the framework of the input array.
Args:
arr (`np.ndarray`): The array to convert.
"""
if isinstance(arr, np.ndarray):
return np.array
if is_tf_available() and is_tf_tensor(arr):
import tensorflow as tf
return tf.convert_to_tensor
if is_torch_available() and is_torch_tensor(arr):
import torch
return torch.tensor
if is_flax_available() and is_jax_tensor(arr):
import jax.numpy as jnp
return jnp.array
raise ValueError(f"Cannot convert arrays of type {type(arr)}")
def safe_squeeze(arr: np.ndarray, axis: Optional[int] = None) -> np.ndarray:
"""
Squeezes an array, but only if the axis specified has dim 1.
"""
if axis is None:
return arr.squeeze()
try:
return arr.squeeze(axis=axis)
except ValueError:
return arr
def normalize_annotation(annotation: Dict, image_size: Tuple[int, int]) -> Dict:
image_height, image_width = image_size
norm_annotation = {}
for key, value in annotation.items():
if key == "boxes":
boxes = value
boxes = corners_to_center_format(boxes)
boxes /= np.asarray([image_width, image_height, image_width, image_height], dtype=np.float32)
norm_annotation[key] = boxes
else:
norm_annotation[key] = value
return norm_annotation
def max_across_indices(values: Iterable[Any]) -> List[Any]:
"""
Return the maximum value across all indices of an iterable of values.
"""
return [max(values_i) for values_i in zip(*values)]
def get_max_height_width(
images: List[np.ndarray], input_data_format: Optional[Union[str, ChannelDimension]] = None
) -> List[int]:
"""
Get the maximum height and width across all images in a batch.
"""
if input_data_format is None:
input_data_format = infer_channel_dimension_format(images[0])
if input_data_format == ChannelDimension.FIRST:
_, max_height, max_width = max_across_indices([img.shape for img in images])
elif input_data_format == ChannelDimension.LAST:
max_height, max_width, _ = max_across_indices([img.shape for img in images])
else:
raise ValueError(f"Invalid channel dimension format: {input_data_format}")
return (max_height, max_width)
def make_pixel_mask(
image: np.ndarray, output_size: Tuple[int, int], input_data_format: Optional[Union[str, ChannelDimension]] = None
) -> np.ndarray:
"""
Make a pixel mask for the image, where 1 indicates a valid pixel and 0 indicates padding.
Args:
image (`np.ndarray`):
Image to make the pixel mask for.
output_size (`Tuple[int, int]`):
Output size of the mask.
"""
input_height, input_width = get_image_size(image, channel_dim=input_data_format)
mask = np.zeros(output_size, dtype=np.int64)
mask[:input_height, :input_width] = 1
return mask
def convert_coco_poly_to_mask(segmentations, height: int, width: int) -> np.ndarray:
"""
Convert a COCO polygon annotation to a mask.
Args:
segmentations (`List[List[float]]`):
List of polygons, each polygon represented by a list of x-y coordinates.
height (`int`):
Height of the mask.
width (`int`):
Width of the mask.
"""
try:
from pycocotools import mask as coco_mask
except ImportError:
raise ImportError("Pycocotools is not installed in your environment.")
masks = []
for polygons in segmentations:
rles = coco_mask.frPyObjects(polygons, height, width)
mask = coco_mask.decode(rles)
if len(mask.shape) < 3:
mask = mask[..., None]
mask = np.asarray(mask, dtype=np.uint8)
mask = np.any(mask, axis=2)
masks.append(mask)
if masks:
masks = np.stack(masks, axis=0)
else:
masks = np.zeros((0, height, width), dtype=np.uint8)
return masks
def prepare_coco_detection_annotation(
image,
target,
return_segmentation_masks: bool = False,
input_data_format: Optional[Union[ChannelDimension, str]] = None,
):
"""
将COCO格式的目标转换为DETA期望的格式。
"""
image_height, image_width = get_image_size(image, channel_dim=input_data_format)
image_id = target["image_id"]
image_id = np.asarray([image_id], dtype=np.int64)
annotations = target["annotations"]
annotations = [obj for obj in annotations if "iscrowd" not in obj or obj["iscrowd"] == 0]
classes = [obj["category_id"] for obj in annotations]
classes = np.asarray(classes, dtype=np.int64)
area = np.asarray([obj["area"] for obj in annotations], dtype=np.float32)
iscrowd = np.asarray([obj["iscrowd"] if "iscrowd" in obj else 0 for obj in annotations], dtype=np.int64)
boxes = [obj["bbox"] for obj in annotations]
boxes = np.asarray(boxes, dtype=np.float32).reshape(-1, 4)
boxes[:, 2:] += boxes[:, :2]
boxes[:, 0::2] = boxes[:, 0::2].clip(min=0, max=image_width)
boxes[:, 1::2] = boxes[:, 1::2].clip(min=0, max=image_height)
keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0])
new_target = {}
new_target["image_id"] = image_id
new_target["class_labels"] = classes[keep]
new_target["boxes"] = boxes[keep]
new_target["area"] = area[keep]
new_target["iscrowd"] = iscrowd[keep]
new_target["orig_size"] = np.asarray([int(image_height), int(image_width)], dtype=np.int64)
if annotations and "keypoints" in annotations[0]:
keypoints = [obj["keypoints"] for obj in annotations]
keypoints = np.asarray(keypoints, dtype=np.float32)
keypoints = keypoints[keep]
num_keypoints = keypoints.shape[0]
keypoints = keypoints.reshape((-1, 3)) if num_keypoints else keypoints
new_target["keypoints"] = keypoints
if return_segmentation_masks:
segmentation_masks = [obj["segmentation"] for obj in annotations]
masks = convert_coco_poly_to_mask(segmentation_masks, image_height, image_width)
new_target["masks"] = masks[keep]
return new_target
def masks_to_boxes(masks: np.ndarray) -> np.ndarray:
"""
计算提供的全景分割掩码周围的边界框。
Args:
masks: 格式为`[number_masks, height, width]`的掩码,其中N是掩码的数量
Returns:
boxes: 格式为`[number_masks, 4]`的边界框,使用xyxy格式
"""
if masks.size == 0:
return np.zeros((0, 4))
h, w = masks.shape[-2:]
y = np.arange(0, h, dtype=np.float32)
x = np.arange(0, w, dtype=np.float32)
y, x = np.meshgrid(y, x, indexing="ij")
x_mask = masks * np.expand_dims(x, axis=0)
x_max = x_mask.reshape(x_mask.shape[0], -1).max(-1)
x = np.ma.array(x_mask, mask=~(np.array(masks, dtype=bool)))
x_min = x.filled(fill_value=1e8)
x_min = x_min.reshape(x_min.shape[0], -1).min(-1)
y_mask = masks * np.expand_dims(y, axis=0)
y_max = y_mask.reshape(x_mask.shape[0], -1).max(-1)
y = np.ma.array(y_mask, mask=~(np.array(masks, dtype=bool)))
y_min = y.filled(fill_value=1e8)
y_min = y_min.reshape(y_min.shape[0], -1).min(-1)
return np.stack([x_min, y_min, x_max, y_max], 1)
def prepare_coco_panoptic_annotation(
image: np.ndarray,
target: Dict,
masks_path: Union[str, pathlib.Path],
return_masks: bool = True,
input_data_format: Union[ChannelDimension, str] = None,
) -> Dict:
"""
Prepare a coco panoptic annotation for DETA.
"""
image_height, image_width = get_image_size(image, channel_dim=input_data_format)
annotation_path = pathlib.Path(masks_path) / target["file_name"]
new_target = {}
new_target["image_id"] = np.asarray([target["image_id"] if "image_id" in target else target["id"]], dtype=np.int64)
new_target["size"] = np.asarray([image_height, image_width], dtype=np.int64)
new_target["orig_size"] = np.asarray([image_height, image_width], dtype=np.int64)
if "segments_info" in target:
masks = np.asarray(PIL.Image.open(annotation_path), dtype=np.uint32)
masks = rgb_to_id(masks)
ids = np.array([segment_info["id"] for segment_info in target["segments_info"]])
masks = masks == ids[:, None, None]
masks = masks.astype(np.uint8)
if return_masks:
new_target["masks"] = masks
new_target["boxes"] = masks_to_boxes(masks)
new_target["class_labels"] = np.array(
[segment_info["category_id"] for segment_info in target["segments_info"]], dtype=np.int64
)
new_target["iscrowd"] = np.asarray(
[segment_info["iscrowd"] for segment_info in target["segments_info"]], dtype=np.int64
)
new_target["area"] = np.asarray(
[segment_info["area"] for segment_info in target["segments_info"]], dtype=np.float32
)
return new_target
def resize_annotation(
annotation: Dict[str, Any],
orig_size: Tuple[int, int],
target_size: Tuple[int, int],
threshold: float = 0.5,
resample: PILImageResampling = PILImageResampling.NEAREST,
):
"""
Resizes an annotation to a target size.
Args:
annotation (`Dict[str, Any]`):
The annotation dictionary.
orig_size (`Tuple[int, int]`):
The original size of the input image.
target_size (`Tuple[int, int]`):
The target size of the image, as returned by the preprocessing `resize` step.
threshold (`float`, *optional*, defaults to 0.5):
The threshold used to binarize the segmentation masks.
resample (`PILImageResampling`, defaults to `PILImageResampling.NEAREST`):
The resampling filter to use when resizing the masks.
"""
ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(target_size, orig_size))
ratio_height, ratio_width = ratios
new_annotation = {}
new_annotation["size"] = target_size
for key, value in annotation.items():
if key == "boxes":
boxes = value
scaled_boxes = boxes * np.asarray([ratio_width, ratio_height, ratio_width, ratio_height], dtype=np.float32)
new_annotation["boxes"] = scaled_boxes
elif key == "area":
area = value
scaled_area = area * (ratio_width * ratio_height)
new_annotation["area"] = scaled_area
elif key == "masks":
masks = value[:, None]
masks = np.array([resize(mask, target_size, resample=resample) for mask in masks])
masks = masks.astype(np.float32)
masks = masks[:, 0] > threshold
new_annotation["masks"] = masks
elif key == "size":
new_annotation["size"] = target_size
else:
new_annotation[key] = value
return new_annotation
class DetaImageProcessor(BaseImageProcessor):
r"""
Constructs a Deformable DETR image processor.
Args:
format (`str`, *optional*, defaults to `"coco_detection"`):
Data format of the annotations. One of "coco_detection" or "coco_panoptic".
do_resize (`bool`, *optional*, defaults to `True`):
Controls whether to resize the image's (height, width) dimensions to the specified `size`. Can be
overridden by the `do_resize` parameter in the `preprocess` method.
size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 800, "longest_edge": 1333}`):
Size of the image's (height, width) dimensions after resizing. Can be overridden by the `size` parameter in
the `preprocess` method.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
Resampling filter to use if resizing the image.
do_rescale (`bool`, *optional*, defaults to `True`):
Controls whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the
`do_rescale` parameter in the `preprocess` method.
rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the
`preprocess` method.
do_normalize:
Controls whether to normalize the image. Can be overridden by the `do_normalize` parameter in the
`preprocess` method.
image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_MEAN`):
Mean values to use when normalizing the image. Can be a single value or a list of values, one for each
channel. Can be overridden by the `image_mean` parameter in the `preprocess` method.
image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_STD`):
Standard deviation values to use when normalizing the image. Can be a single value or a list of values, one
for each channel. Can be overridden by the `image_std` parameter in the `preprocess` method.
do_convert_annotations (`bool`, *optional*, defaults to `True`):
Controls whether to convert the annotations to the format expected by the DETR model. Converts the
bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
do_pad (`bool`, *optional*, defaults to `True`):
Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess`
method. If `True` will pad the images in the batch to the largest height and width in the batch.
Padding will be applied to the bottom and right of the image with zeros.
"""
model_input_names = ["pixel_values", "pixel_mask"]
def __init__(
self,
format: Union[str, AnnotationFormat] = AnnotationFormat.COCO_DETECTION,
do_resize: bool = True,
size: Dict[str, int] = None,
resample: PILImageResampling = PILImageResampling.BILINEAR,
do_rescale: bool = True,
rescale_factor: Union[int, float] = 1 / 255,
do_normalize: bool = True,
image_mean: Union[float, List[float]] = None,
image_std: Union[float, List[float]] = None,
do_convert_annotations: bool = True,
do_pad: bool = True,
**kwargs,
) -> None:
if "pad_and_return_pixel_mask" in kwargs:
do_pad = kwargs.pop("pad_and_return_pixel_mask")
size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333}
size = get_size_dict(size, default_to_square=False)
if do_convert_annotations is None:
do_convert_annotations = do_normalize
super().__init__(**kwargs)
self.format = format
self.do_resize = do_resize
self.size = size
self.resample = resample
self.do_rescale = do_rescale
self.rescale_factor = rescale_factor
self.do_normalize = do_normalize
self.do_convert_annotations = do_convert_annotations
self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
self.do_pad = do_pad
def prepare_annotation(
self,
image: np.ndarray,
target: Dict,
format: Optional[AnnotationFormat] = None,
return_segmentation_masks: bool = None,
masks_path: Optional[Union[str, pathlib.Path]] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> Dict:
"""
Prepare an annotation for feeding into DETA model.
"""
format = format if format is not None else self.format
if format == AnnotationFormat.COCO_DETECTION:
return_segmentation_masks = False if return_segmentation_masks is None else return_segmentation_masks
target = prepare_coco_detection_annotation(
image, target, return_segmentation_masks, input_data_format=input_data_format
)
elif format == AnnotationFormat.COCO_PANOPTIC:
return_segmentation_masks = True if return_segmentation_masks is None else return_segmentation_masks
target = prepare_coco_panoptic_annotation(
image,
target,
masks_path=masks_path,
return_masks=return_segmentation_masks,
input_data_format=input_data_format,
)
else:
raise ValueError(f"Format {format} is not supported.")
return target
def prepare(self, image, target, return_segmentation_masks=None, masks_path=None):
logger.warning_once(
"The `prepare` method is deprecated and will be removed in a v4.33. "
"Please use `prepare_annotation` instead. Note: the `prepare_annotation` method "
"does not return the image anymore.",
)
target = self.prepare_annotation(image, target, return_segmentation_masks, masks_path, self.format)
return image, target
def convert_coco_poly_to_mask(self, *args, **kwargs):
logger.warning_once("The `convert_coco_poly_to_mask` method is deprecated and will be removed in v4.33. ")
return convert_coco_poly_to_mask(*args, **kwargs)
def prepare_coco_detection(self, *args, **kwargs):
logger.warning_once("The `prepare_coco_detection` method is deprecated and will be removed in v4.33. ")
return prepare_coco_detection_annotation(*args, **kwargs)
def prepare_coco_panoptic(self, *args, **kwargs):
logger.warning_once("The `prepare_coco_panoptic` method is deprecated and will be removed in v4.33. ")
return prepare_coco_panoptic_annotation(*args, **kwargs)
def resize(
self,
image: np.ndarray,
size: Dict[str, int],
resample: PILImageResampling = PILImageResampling.BILINEAR,
data_format: Optional[ChannelDimension] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
) -> np.ndarray:
"""
Resize the image to the given size. Size can be `min_size` (scalar) or `(height, width)` tuple. If size is an
int, smaller edge of the image will be matched to this number.
Args:
image (`np.ndarray`):
Image to resize.
size (`Dict[str, int]`):
The desired output size. Can contain keys `shortest_edge` and `longest_edge` or `height` and `width`.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
Resampling filter to use if resizing the image.
data_format (`ChannelDimension`, *optional*):
The channel dimension format for the output image. If unset, the channel dimension format of the input
image is used.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred from the input
image.
"""
size = get_size_dict(size, default_to_square=False)
if "shortest_edge" in size and "longest_edge" in size:
size = get_resize_output_image_size(
image, size["shortest_edge"], size["longest_edge"], input_data_format=input_data_format
)
elif "height" in size and "width" in size:
size = (size["height"], size["width"])
else:
raise ValueError(
"Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got"
f" {size.keys()}."
)
image = resize(
image, size=size, resample=resample, data_format=data_format, input_data_format=input_data_format
)
return image
def resize_annotation(
self,
annotation,
orig_size,
size,
resample: PILImageResampling = PILImageResampling.NEAREST,
) -> Dict:
"""
Resize the annotation to match the resized image. If size is an int, smaller edge of the mask will be matched
to this number.
"""
return resize_annotation(annotation, orig_size=orig_size, target_size=size, resample=resample)
def rescale(
self,
image: np.ndarray,
rescale_factor: float,
data_format: Optional[Union[str, ChannelDimension]] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> np.ndarray:
"""
Rescale the image by the given factor. image = image * rescale_factor.
Args:
image (`np.ndarray`):
Image to rescale.
rescale_factor (`float`):
The value to use for rescaling.
data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format for the output image. If unset, the channel dimension format of the input
image is used. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
input_data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format for the input image. If unset, is inferred from the input image. Can be
one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
"""
return rescale(image, rescale_factor, data_format=data_format, input_data_format=input_data_format)
def normalize_annotation(self, annotation: Dict, image_size: Tuple[int, int]) -> Dict:
"""
Normalize the boxes in the annotation from `[top_left_x, top_left_y, bottom_right_x, bottom_right_y]` to
`[center_x, center_y, width, height]` format and from absolute to relative pixel values.
"""
return normalize_annotation(annotation, image_size=image_size)
def _update_annotation_for_padded_image(
self,
annotation: Dict,
input_image_size: Tuple[int, int],
output_image_size: Tuple[int, int],
padding,
update_bboxes,
) -> Dict:
"""
Update the annotation for a padded image.
"""
new_annotation = {}
new_annotation["size"] = output_image_size
for key, value in annotation.items():
if key == "masks":
masks = value
masks = pad(
masks,
padding,
mode=PaddingMode.CONSTANT,
constant_values=0,
input_data_format=ChannelDimension.FIRST,
)
masks = safe_squeeze(masks, 1)
new_annotation["masks"] = masks
elif key == "boxes" and update_bboxes:
boxes = value
boxes *= np.asarray(
[
input_image_size[1] / output_image_size[1],
input_image_size[0] / output_image_size[0],
input_image_size[1] / output_image_size[1],
input_image_size[0] / output_image_size[0],
]
)
new_annotation["boxes"] = boxes
elif key == "size":
new_annotation["size"] = output_image_size
else:
new_annotation[key] = value
return new_annotation
def _pad_image(
self,
image: np.ndarray,
output_size: Tuple[int, int],
annotation: Optional[Dict[str, Any]] = None,
constant_values: Union[float, Iterable[float]] = 0,
data_format: Optional[ChannelDimension] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
update_bboxes: bool = True,
) -> np.ndarray:
"""
Pad an image with zeros to the given size.
"""
input_height, input_width = get_image_size(image, channel_dim=input_data_format)
output_height, output_width = output_size
pad_bottom = output_height - input_height
pad_right = output_width - input_width
padding = ((0, pad_bottom), (0, pad_right))
padded_image = pad(
image,
padding,
mode=PaddingMode.CONSTANT,
constant_values=constant_values,
data_format=data_format,
input_data_format=input_data_format,
)
if annotation is not None:
annotation = self._update_annotation_for_padded_image(
annotation, (input_height, input_width), (output_height, output_width), padding, update_bboxes
)
return padded_image, annotation
def pad(
self,
images: List[np.ndarray],
annotations: Optional[Union[AnnotationType, List[AnnotationType]]] = None,
constant_values: Union[float, Iterable[float]] = 0,
return_pixel_mask: bool = True,
return_tensors: Optional[Union[str, TensorType]] = None,
data_format: Optional[ChannelDimension] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
update_bboxes: bool = True,
):
def preprocess(
self,
images: ImageInput,
annotations: Optional[Union[List[Dict], List[List[Dict]]]] = None,
return_segmentation_masks: bool = None,
masks_path: Optional[Union[str, pathlib.Path]] = None,
do_resize: Optional[bool] = None,
size: Optional[Dict[str, int]] = None,
resample=None,
do_rescale: Optional[bool] = None,
rescale_factor: Optional[Union[int, float]] = None,
do_normalize: Optional[bool] = None,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
do_convert_annotations: Optional[bool] = None,
do_pad: Optional[bool] = None,
format: Optional[Union[str, AnnotationFormat]] = None,
return_tensors: Optional[Union[TensorType, str]] = None,
data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
):
def post_process_object_detection(
self,
outputs,
threshold: float = 0.5,
target_sizes: Union[TensorType, List[Tuple]] = None,
nms_threshold: float = 0.7,
.\models\deta\modeling_deta.py
""" PyTorch DETA model. """
import copy
import math
import os
import warnings
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Union
import torch
import torch.nn.functional as F
from torch import Tensor, nn
from torch.autograd import Function
from torch.autograd.function import once_differentiable
from ...activations import ACT2FN
from ...file_utils import (
ModelOutput,
add_start_docstrings,
add_start_docstrings_to_model_forward,
is_scipy_available,
is_torch_cuda_available,
is_vision_available,
replace_return_docstrings,
)
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask
from ...modeling_outputs import BaseModelOutput
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import meshgrid
from ...utils import is_accelerate_available, is_ninja_available, is_torchvision_available, logging, requires_backends
from ...utils.backbone_utils import load_backbone
from .configuration_deta import DetaConfig
logger = logging.get_logger(__name__)
MultiScaleDeformableAttention = None
def load_cuda_kernels():
from torch.utils.cpp_extension import load
global MultiScaleDeformableAttention
root = Path(__file__).resolve().parent.parent.parent / "kernels" / "deta"
src_files = [
root / filename
for filename in [
"vision.cpp",
os.path.join("cpu", "ms_deform_attn_cpu.cpp"),
os.path.join("cuda", "ms_deform_attn_cuda.cu"),
]
]
load(
"MultiScaleDeformableAttention",
src_files,
with_cuda=True,
extra_include_paths=[str(root)],
extra_cflags=["-DWITH_CUDA=1"],
extra_cuda_cflags=[
"-DCUDA_HAS_FP16=1",
"-D__CUDA_NO_HALF_OPERATORS__",
"-D__CUDA_NO_HALF_CONVERSIONS__",
"-D__CUDA_NO_HALF2_OPERATORS__",
],
)
class MultiScaleDeformableAttentionFunction(Function):
@staticmethod
def forward(
context,
value,
value_spatial_shapes,
value_level_start_index,
sampling_locations,
attention_weights,
im2col_step,
):
context.im2col_step = im2col_step
output = MultiScaleDeformableAttention.ms_deform_attn_forward(
value,
value_spatial_shapes,
value_level_start_index,
sampling_locations,
attention_weights,
context.im2col_step,
)
context.save_for_backward(
value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights
)
return output
@staticmethod
@once_differentiable
def backward(context, grad_output):
(
value,
value_spatial_shapes,
value_level_start_index,
sampling_locations,
attention_weights,
) = context.saved_tensors
grad_value, grad_sampling_loc, grad_attn_weight = MultiScaleDeformableAttention.ms_deform_attn_backward(
value,
value_spatial_shapes,
value_level_start_index,
sampling_locations,
attention_weights,
grad_output,
context.im2col_step,
)
return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None
if is_accelerate_available():
from accelerate import PartialState
from accelerate.utils import reduce
if is_vision_available():
from transformers.image_transforms import center_to_corners_format
if is_torchvision_available():
from torchvision.ops.boxes import batched_nms
if is_scipy_available():
from scipy.optimize import linear_sum_assignment
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "DetaConfig"
_CHECKPOINT_FOR_DOC = "jozhang97/deta-swin-large-o365"
DETA_PRETRAINED_MODEL_ARCHIVE_LIST = [
"jozhang97/deta-swin-large-o365",
]
@dataclass
class DetaDecoderOutput(ModelOutput):
"""
DetaDecoder 的输出基类。这个类在 BaseModelOutputWithCrossAttentions 基础上增加了两个属性:
- 中间解码器隐藏状态的堆叠张量(即每个解码器层的输出)
- 中间参考点的堆叠张量。
"""
last_hidden_state: torch.FloatTensor = None
intermediate_hidden_states: torch.FloatTensor = None
intermediate_reference_points: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
@dataclass
class DetaModelOutput(ModelOutput):
"""
Deformable DETR编码器-解码器模型输出的基类。
"""
init_reference_points: torch.FloatTensor = None
last_hidden_state: torch.FloatTensor = None
intermediate_hidden_states: torch.FloatTensor = None
intermediate_reference_points: torch.FloatTensor = None
decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
encoder_last_hidden_state: Optional[torch.FloatTensor] = None
encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
enc_outputs_class: Optional[torch.FloatTensor] = None
enc_outputs_coord_logits: Optional[torch.FloatTensor] = None
output_proposals: Optional[torch.FloatTensor] = None
@dataclass
class DetaObjectDetectionOutput(ModelOutput):
"""
DetaForObjectDetection模型的输出类型。
"""
loss: Optional[torch.FloatTensor] = None
loss_dict: Optional[Dict] = None
logits: torch.FloatTensor = None
pred_boxes: torch.FloatTensor = None
auxiliary_outputs: Optional[List[Dict]] = None
init_reference_points: Optional[torch.FloatTensor] = None
last_hidden_state: Optional[torch.FloatTensor] = None
intermediate_hidden_states: Optional[torch.FloatTensor] = None
intermediate_reference_points: Optional[torch.FloatTensor] = None
decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
encoder_last_hidden_state: Optional[torch.FloatTensor] = None
encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
enc_outputs_class: Optional = None
enc_outputs_coord_logits: Optional = None
output_proposals: Optional[torch.FloatTensor] = None
def _get_clones(module, N):
return nn.ModuleList([copy.deepcopy(module) for i in range(N)])
def inverse_sigmoid(x, eps=1e-5):
x = x.clamp(min=0, max=1)
x1 = x.clamp(min=eps)
x2 = (1 - x).clamp(min=eps)
return torch.log(x1 / x2)
class DetaFrozenBatchNorm2d(nn.Module):
"""
批量归一化层,其中批次统计信息和仿射参数被固定。
从torchvision.misc.ops中复制粘贴,添加eps以在没有此项的情况下保证任何模型(而不仅仅是torchvision.models.resnet[18,34,50,101])不产生nan。
"""
def __init__(self, n):
super().__init__()
self.register_buffer("weight", torch.ones(n))
self.register_buffer("bias", torch.zeros(n))
self.register_buffer("running_mean", torch.zeros(n))
self.register_buffer("running_var", torch.ones(n))
def _load_from_state_dict(
self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
):
num_batches_tracked_key = prefix + "num_batches_tracked"
if num_batches_tracked_key in state_dict:
del state_dict[num_batches_tracked_key]
super()._load_from_state_dict(
state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
)
def forward(self, x):
weight = self.weight.reshape(1, -1, 1, 1)
bias = self.bias.reshape(1, -1, 1, 1)
running_var = self.running_var.reshape(1, -1, 1, 1)
running_mean = self.running_mean.reshape(1, -1, 1, 1)
epsilon = 1e-5
scale = weight * (running_var + epsilon).rsqrt()
bias = bias - running_mean * scale
return x * scale + bias
def replace_batch_norm(model):
r"""
递归替换所有的 `torch.nn.BatchNorm2d` 层为 `DetaFrozenBatchNorm2d`。
Args:
model (torch.nn.Module):
输入的模型
"""
for name, module in model.named_children():
if isinstance(module, nn.BatchNorm2d):
new_module = DetaFrozenBatchNorm2d(module.num_features)
if not module.weight.device == torch.device("meta"):
new_module.weight.data.copy_(module.weight)
new_module.bias.data.copy_(module.bias)
new_module.running_mean.data.copy_(module.running_mean)
new_module.running_var.data.copy_(module.running_var)
model._modules[name] = new_module
if len(list(module.children())) > 0:
replace_batch_norm(module)
class DetaBackboneWithPositionalEncodings(nn.Module):
"""
带有位置编码的主干模型。
nn.BatchNorm2d 层被上述定义的 DetaFrozenBatchNorm2d 替换。
"""
def __init__(self, config):
super().__init__()
backbone = load_backbone(config)
with torch.no_grad():
replace_batch_norm(backbone)
self.model = backbone
self.intermediate_channel_sizes = self.model.channels
if config.backbone_config.model_type == "resnet":
for name, parameter in self.model.named_parameters():
if "stages.1" not in name and "stages.2" not in name and "stages.3" not in name:
parameter.requires_grad_(False)
self.position_embedding = build_position_encoding(config)
def forward(self, pixel_values: torch.Tensor, pixel_mask: torch.Tensor):
"""
如果 `config.num_feature_levels > 1`,则输出 ResNet 中 C_3 到 C_5 的后续阶段的特征图,否则输出 C_5 的特征图。
"""
features = self.model(pixel_values).feature_maps
out = []
pos = []
for feature_map in features:
mask = nn.functional.interpolate(pixel_mask[None].float(), size=feature_map.shape[-2:]).to(torch.bool)[0]
position_embeddings = self.position_embedding(feature_map, mask).to(feature_map.dtype)
out.append((feature_map, mask))
pos.append(position_embeddings)
return out, pos
class DetaSinePositionEmbedding(nn.Module):
"""
这是一种更标准的位置编码版本,与 Attention is all you
"""
需要纸张,通用于处理图像。
初始化函数,设置模型参数并进行必要的检查。
"""
def __init__(self, embedding_dim=64, temperature=10000, normalize=False, scale=None):
super().__init__()
# 设置嵌入维度
self.embedding_dim = embedding_dim
# 温度参数,影响位置编码的范围
self.temperature = temperature
# 是否进行归一化处理
self.normalize = normalize
# 如果传入了scale但未开启normalize,则引发错误
if scale is not None and normalize is False:
raise ValueError("normalize should be True if scale is passed")
# 如果未传入scale,则默认设置为2π
if scale is None:
scale = 2 * math.pi
self.scale = scale
def forward(self, pixel_values, pixel_mask):
# 如果未提供像素掩码,则引发错误
if pixel_mask is None:
raise ValueError("No pixel mask provided")
# 在y方向上对掩码进行累积求和,作为位置编码的一部分
y_embed = pixel_mask.cumsum(1, dtype=torch.float32)
# 在x方向上对掩码进行累积求和,作为位置编码的一部分
x_embed = pixel_mask.cumsum(2, dtype=torch.float32)
# 如果设置了归一化,则对位置编码进行归一化处理
if self.normalize:
eps = 1e-6
y_embed = (y_embed - 0.5) / (y_embed[:, -1:, :] + eps) * self.scale
x_embed = (x_embed - 0.5) / (x_embed[:, :, -1:] + eps) * self.scale
# 生成维度向量,用于计算位置编码
dim_t = torch.arange(self.embedding_dim, dtype=torch.int64, device=pixel_values.device).float()
dim_t = self.temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / self.embedding_dim)
# 计算x和y方向上的位置编码
pos_x = x_embed[:, :, :, None] / dim_t
pos_y = y_embed[:, :, :, None] / dim_t
# 对位置编码进行正弦和余弦变换,然后展平处理
pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3)
pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3)
# 拼接x和y方向的位置编码,并将通道维度放到正确的位置
pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
# 返回位置编码结果
return pos
# Copied from transformers.models.detr.modeling_detr.DetrLearnedPositionEmbedding
class DetaLearnedPositionEmbedding(nn.Module):
"""
This module learns positional embeddings up to a fixed maximum size.
"""
def __init__(self, embedding_dim=256):
super().__init__()
# 创建一个嵌入层用于行位置编码,嵌入维度为 embedding_dim,总共有 50 个位置
self.row_embeddings = nn.Embedding(50, embedding_dim)
# 创建一个嵌入层用于列位置编码,嵌入维度为 embedding_dim,总共有 50 个位置
self.column_embeddings = nn.Embedding(50, embedding_dim)
def forward(self, pixel_values, pixel_mask=None):
# 获取输入像素值的高度和宽度
height, width = pixel_values.shape[-2:]
# 创建一个张量,包含从 0 到 width-1 的整数,设备类型与 pixel_values 相同
width_values = torch.arange(width, device=pixel_values.device)
# 创建一个张量,包含从 0 到 height-1 的整数,设备类型与 pixel_values 相同
height_values = torch.arange(height, device=pixel_values.device)
# 获取列位置的嵌入向量
x_emb = self.column_embeddings(width_values)
# 获取行位置的嵌入向量
y_emb = self.row_embeddings(height_values)
# 拼接列和行位置的嵌入向量,形成位置编码矩阵
pos = torch.cat([x_emb.unsqueeze(0).repeat(height, 1, 1), y_emb.unsqueeze(1).repeat(1, width, 1)], dim=-1)
# 将位置编码矩阵进行维度置换,变为 (embedding_dim, height, width) 的形式
pos = pos.permute(2, 0, 1)
# 在最前面添加一个维度,变为 (1, embedding_dim, height, width) 的形式
pos = pos.unsqueeze(0)
# 将位置编码矩阵扩展为与输入像素值相同的张量形状
pos = pos.repeat(pixel_values.shape[0], 1, 1, 1)
# 返回位置编码张量
return pos
# Copied from transformers.models.detr.modeling_detr.build_position_encoding with Detr->Deta
def build_position_encoding(config):
# 计算位置编码的步数,为模型维度除以 2
n_steps = config.d_model // 2
if config.position_embedding_type == "sine":
# 若使用正弦位置编码类型,则创建 DetaSinePositionEmbedding 对象
# 此处暂未暴露其他参数的更好方法
position_embedding = DetaSinePositionEmbedding(n_steps, normalize=True)
elif config.position_embedding_type == "learned":
# 若使用学习得到的位置编码类型,则创建 DetaLearnedPositionEmbedding 对象
position_embedding = DetaLearnedPositionEmbedding(n_steps)
else:
# 抛出异常,指出不支持的位置编码类型
raise ValueError(f"Not supported {config.position_embedding_type}")
# 返回创建的位置编码对象
return position_embedding
# Copied from transformers.models.deformable_detr.modeling_deformable_detr.multi_scale_deformable_attention
def multi_scale_deformable_attention(
value: Tensor, value_spatial_shapes: Tensor, sampling_locations: Tensor, attention_weights: Tensor
) -> Tensor:
# 获取输入 value 张量的维度信息
batch_size, _, num_heads, hidden_dim = value.shape
# 获取 sampling_locations 张量的维度信息
_, num_queries, num_heads, num_levels, num_points, _ = sampling_locations.shape
# 将 value 张量按照空间形状进行分割,并存储到 value_list 中
value_list = value.split([height.item() * width.item() for height, width in value_spatial_shapes], dim=1)
# 计算采样网格,将采样位置放大为 2 倍并减去 1
sampling_grids = 2 * sampling_locations - 1
# 初始化采样值列表为空列表
sampling_value_list = []
for level_id, (height, width) in enumerate(value_spatial_shapes):
# 遍历每个级别的空间形状,level_id 是级别索引,(height, width) 是高度和宽度元组
# 扁平化 value_list[level_id],将其转置,然后重塑为指定形状
# 得到形状为 batch_size*num_heads, hidden_dim, height, width 的 value_l_
value_l_ = (
value_list[level_id].flatten(2).transpose(1, 2).reshape(batch_size * num_heads, hidden_dim, height, width)
)
# 提取 sampling_grids 中特定 level_id 的数据,进行转置和扁平化操作
# 得到形状为 batch_size*num_heads, num_queries, num_points, 2 的 sampling_grid_l_
sampling_grid_l_ = sampling_grids[:, :, :, level_id].transpose(1, 2).flatten(0, 1)
# 使用双线性插值的方式,根据 sampling_grid_l_ 对 value_l_ 进行采样
# 得到形状为 batch_size*num_heads, hidden_dim, num_queries, num_points 的 sampling_value_l_
sampling_value_l_ = nn.functional.grid_sample(
value_l_, sampling_grid_l_, mode="bilinear", padding_mode="zeros", align_corners=False
)
# 将当前级别的采样值列表添加到 sampling_value_list 中
sampling_value_list.append(sampling_value_l_)
# 调整注意力权重的形状,转置以匹配后续计算的需求
# 最终形状为 batch_size, num_heads, num_queries, num_levels * num_points
attention_weights = attention_weights.transpose(1, 2).reshape(
batch_size * num_heads, 1, num_queries, num_levels * num_points
)
# 计算最终的输出,对采样值列表进行堆叠和扁平化操作,并乘以注意力权重,最后重新调整形状
output = (
(torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights)
.sum(-1)
.view(batch_size, num_heads * hidden_dim, num_queries)
)
# 返回输出,并重新调整其形状以匹配预期的输出格式
return output.transpose(1, 2).contiguous()
# 从transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrMultiscaleDeformableAttention复制而来,修改为DetaMultiscaleDeformableAttention
class DetaMultiscaleDeformableAttention(nn.Module):
"""
在Deformable DETR中提出的多尺度可变形注意力机制。
"""
def __init__(self, config: DetaConfig, num_heads: int, n_points: int):
super().__init__()
# 检查是否加载了CUDA内核函数并且Ninja库可用,如果没有加载则尝试加载
kernel_loaded = MultiScaleDeformableAttention is not None
if is_torch_cuda_available() and is_ninja_available() and not kernel_loaded:
try:
load_cuda_kernels()
except Exception as e:
logger.warning(f"Could not load the custom kernel for multi-scale deformable attention: {e}")
# 检查d_model是否能被num_heads整除,如果不能则抛出错误
if config.d_model % num_heads != 0:
raise ValueError(
f"embed_dim (d_model) must be divisible by num_heads, but got {config.d_model} and {num_heads}"
)
# 计算每个注意力头的维度
dim_per_head = config.d_model // num_heads
# 检查dim_per_head是否是2的幂
if not ((dim_per_head & (dim_per_head - 1) == 0) and dim_per_head != 0):
warnings.warn(
"You'd better set embed_dim (d_model) in DetaMultiscaleDeformableAttention to make the"
" dimension of each attention head a power of 2 which is more efficient in the authors' CUDA"
" implementation."
)
# 初始化im2col步长为64
self.im2col_step = 64
# 保存配置参数
self.d_model = config.d_model
self.n_levels = config.num_feature_levels
self.n_heads = num_heads
self.n_points = n_points
# 初始化偏移量线性层、注意力权重线性层、值投影线性层和输出投影线性层
self.sampling_offsets = nn.Linear(config.d_model, num_heads * self.n_levels * n_points * 2)
self.attention_weights = nn.Linear(config.d_model, num_heads * self.n_levels * n_points)
self.value_proj = nn.Linear(config.d_model, config.d_model)
self.output_proj = nn.Linear(config.d_model, config.d_model)
# 根据配置参数决定是否禁用自定义内核函数
self.disable_custom_kernels = config.disable_custom_kernels
# 重置模型参数
self._reset_parameters()
# 重置模型参数的方法
def _reset_parameters(self):
# 初始化采样偏移权重为常数 0.0
nn.init.constant_(self.sampling_offsets.weight.data, 0.0)
# 获取默认的数据类型
default_dtype = torch.get_default_dtype()
# 创建角度列表 thetas,作为每个注意力头的角度偏移量
thetas = torch.arange(self.n_heads, dtype=torch.int64).to(default_dtype) * (2.0 * math.pi / self.n_heads)
# 初始化网格 grid_init,其中包含每个头的位置编码
grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
# 标准化网格使其范围在 [-1, 1] 内,并重复以匹配所有级别和点的数量
grid_init = (
(grid_init / grid_init.abs().max(-1, keepdim=True)[0])
.view(self.n_heads, 1, 1, 2)
.repeat(1, self.n_levels, self.n_points, 1)
)
# 根据点的索引调整网格初始化值
for i in range(self.n_points):
grid_init[:, :, i, :] *= i + 1
# 使用 torch.no_grad() 上下文管理器,将初始化的网格作为偏移量的偏置参数
with torch.no_grad():
self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1))
# 初始化注意力权重为常数 0.0
nn.init.constant_(self.attention_weights.weight.data, 0.0)
nn.init.constant_(self.attention_weights.bias.data, 0.0)
# 使用 xavier_uniform 方法初始化值投影权重和偏置
nn.init.xavier_uniform_(self.value_proj.weight.data)
nn.init.constant_(self.value_proj.bias.data, 0.0)
nn.init.xavier_uniform_(self.output_proj.weight.data)
nn.init.constant_(self.output_proj.bias.data, 0.0)
# 从transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrMultiheadAttention复制代码,将DeformableDetr->Deta,Deformable DETR->DETA
class DetaMultiheadAttention(nn.Module):
"""
'Attention Is All You Need'论文中的多头注意力机制。
这里,我们根据Deformable DETR论文的说明,为查询和键添加位置嵌入。
"""
def __init__(
self,
embed_dim: int,
num_heads: int,
dropout: float = 0.0,
bias: bool = True,
):
super().__init__()
self.embed_dim = embed_dim
self.num_heads = num_heads
self.dropout = dropout
self.head_dim = embed_dim // num_heads
if self.head_dim * num_heads != self.embed_dim:
raise ValueError(
f"embed_dim必须能被num_heads整除 (得到 `embed_dim`: {self.embed_dim} 和 `num_heads`: {num_heads})."
)
self.scaling = self.head_dim**-0.5
# 线性层,用于投影键(key)、值(value)和查询(query)
self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
def _shape(self, tensor: torch.Tensor, seq_len: int, batch_size: int):
# 重新形状张量,以便进行多头注意力计算
return tensor.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
def with_pos_embed(self, tensor: torch.Tensor, position_embeddings: Optional[Tensor]):
# 如果提供了位置嵌入,则将其添加到张量中
return tensor if position_embeddings is None else tensor + position_embeddings
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_embeddings: Optional[torch.Tensor] = None,
output_attentions: bool = False,
):
# 前向传播函数,计算多头注意力机制
class DetaEncoderLayer(nn.Module):
def __init__(self, config: DetaConfig):
super().__init__()
self.embed_dim = config.d_model
self.self_attn = DetaMultiscaleDeformableAttention(
config,
num_heads=config.encoder_attention_heads,
n_points=config.encoder_n_points,
)
self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
self.dropout = config.dropout
self.activation_fn = ACT2FN[config.activation_function]
self.activation_dropout = config.activation_dropout
self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
self.final_layer_norm = nn.LayerNorm(self.embed_dim)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: torch.Tensor,
position_embeddings: torch.Tensor = None,
reference_points=None,
spatial_shapes=None,
level_start_index=None,
output_attentions: bool = False,
):
# 前向传播函数,包括多尺度可变形注意力和前馈神经网络层的计算
# 定义一个方法,用于处理多尺度变形注意力模块的计算
def forward(
self,
hidden_states, # 输入到层的隐藏状态张量,形状为(batch_size, sequence_length, hidden_size)
attention_mask, # 注意力掩码张量,形状为(batch_size, sequence_length)
position_embeddings=None, # 位置嵌入张量,可选参数,将添加到hidden_states中
reference_points=None, # 参考点张量,可选参数
spatial_shapes=None, # 主干特征图的空间形状张量,可选参数
level_start_index=None, # 级别开始索引张量,可选参数
output_attentions=False, # 是否返回所有注意力层的注意力张量的标志,详见返回的张量中的'attentions'
):
residual = hidden_states # 保存初始输入的残差连接
# 在多尺度特征图上应用多尺度变形注意力模块
hidden_states, attn_weights = self.self_attn(
hidden_states=hidden_states,
attention_mask=attention_mask,
encoder_hidden_states=hidden_states,
encoder_attention_mask=attention_mask,
position_embeddings=position_embeddings,
reference_points=reference_points,
spatial_shapes=spatial_shapes,
level_start_index=level_start_index,
output_attentions=output_attentions,
)
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) # 使用dropout进行正则化
hidden_states = residual + hidden_states # 添加残差连接
hidden_states = self.self_attn_layer_norm(hidden_states) # 对结果进行层归一化处理
residual = hidden_states # 保存第一次处理后的残差连接
hidden_states = self.activation_fn(self.fc1(hidden_states)) # 使用激活函数处理全连接层fc1的结果
hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) # 使用dropout进行正则化
hidden_states = self.fc2(hidden_states) # 进行第二个全连接层fc2的线性变换
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) # 使用dropout进行正则化
hidden_states = residual + hidden_states # 添加第二次残差连接
hidden_states = self.final_layer_norm(hidden_states) # 对结果进行最终的层归一化处理
if self.training: # 如果处于训练模式
# 检查是否存在无穷大或NaN值,如果有,则进行数值截断
if torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any():
clamp_value = torch.finfo(hidden_states.dtype).max - 1000
hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
outputs = (hidden_states,) # 输出结果为隐藏状态张量的元组
if output_attentions: # 如果需要返回注意力张量
outputs += (attn_weights,) # 将注意力张量添加到输出元组中
return outputs # 返回最终输出
class DetaDecoderLayer(nn.Module):
def __init__(self, config: DetaConfig):
super().__init__()
self.embed_dim = config.d_model
# self-attention
self.self_attn = DetaMultiheadAttention(
embed_dim=self.embed_dim,
num_heads=config.decoder_attention_heads,
dropout=config.attention_dropout,
)
self.dropout = config.dropout
self.activation_fn = ACT2FN[config.activation_function]
self.activation_dropout = config.activation_dropout
# Layer normalization for self-attention output
self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
# cross-attention
self.encoder_attn = DetaMultiscaleDeformableAttention(
config,
num_heads=config.decoder_attention_heads,
n_points=config.decoder_n_points,
)
# Layer normalization for cross-attention output
self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)
# feedforward neural networks
self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)
# Layer normalization for final output after feedforward networks
self.final_layer_norm = nn.LayerNorm(self.embed_dim)
def forward(
self,
hidden_states: torch.Tensor,
position_embeddings: Optional[torch.Tensor] = None,
reference_points=None,
spatial_shapes=None,
level_start_index=None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = False,
):
# Forward pass through the decoder layer
# Self-attention mechanism
self_attn_output = self.self_attn(
hidden_states,
hidden_states,
hidden_states,
attn_mask=None, # Optional masking
output_attentions=output_attentions,
)
self_attn_output = self.dropout(self_attn_output[0]) # Apply dropout
hidden_states = hidden_states + self_attn_output # Residual connection
hidden_states = self.self_attn_layer_norm(hidden_states) # Layer normalization
# Cross-attention mechanism
encoder_attn_output = self.encoder_attn(
hidden_states,
encoder_hidden_states,
spatial_shapes=spatial_shapes,
level_start_index=level_start_index,
attn_mask=encoder_attention_mask,
output_attentions=output_attentions,
)
encoder_attn_output = self.dropout(encoder_attn_output[0]) # Apply dropout
hidden_states = hidden_states + encoder_attn_output # Residual connection
hidden_states = self.encoder_attn_layer_norm(hidden_states) # Layer normalization
# Feedforward neural network
intermediate_output = self.activation_fn(self.fc1(hidden_states)) # First linear layer + activation
intermediate_output = self.dropout(intermediate_output) # Apply dropout
ffn_output = self.fc2(intermediate_output) # Second linear layer
ffn_output = self.dropout(ffn_output) # Apply dropout
hidden_states = hidden_states + ffn_output # Residual connection
hidden_states = self.final_layer_norm(hidden_states) # Layer normalization
return hidden_states
# Copied from transformers.models.detr.modeling_detr.DetrClassificationHead
class DetaClassificationHead(nn.Module):
"""Head for sentence-level classification tasks."""
def __init__(self, input_dim: int, inner_dim: int, num_classes: int, pooler_dropout: float):
super().__init__()
# Fully connected layer for dimension reduction
self.dense = nn.Linear(input_dim, inner_dim)
# Dropout for regularization
self.dropout = nn.Dropout(p=pooler_dropout)
# Output projection layer for classification
self.out_proj = nn.Linear(inner_dim, num_classes)
def forward(self, hidden_states: torch.Tensor):
hidden_states = self.dropout(hidden_states) # Apply dropout
hidden_states = self.dense(hidden_states) # Linear transformation
hidden_states = torch.tanh(hidden_states) # Apply activation function (tanh)
hidden_states = self.dropout(hidden_states) # Apply dropout
hidden_states = self.out_proj(hidden_states) # Final linear transformation for classification
return hidden_states
class DetaPreTrainedModel(PreTrainedModel):
config_class = DetaConfig
base_model_prefix = "model"
main_input_name = "pixel_values"
_no_split_modules = [r"DetaBackboneWithPositionalEncodings", r"DetaEncoderLayer", r"DetaDecoderLayer"]
supports_gradient_checkpointing = True
# 初始化神经网络模块的权重
def _init_weights(self, module):
std = self.config.init_std # 获取初始化标准差参数
if isinstance(module, DetaLearnedPositionEmbedding):
nn.init.uniform_(module.row_embeddings.weight) # 均匀分布初始化行嵌入权重
nn.init.uniform_(module.column_embeddings.weight) # 均匀分布初始化列嵌入权重
elif isinstance(module, DetaMultiscaleDeformableAttention):
module._reset_parameters() # 重置多尺度可变形注意力模块的参数
elif isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)):
# 对于线性层、二维卷积层、批归一化层,使用正态分布初始化权重,偏置初始化为零
# 与 TF 版本稍有不同,TF 版本使用截断正态分布进行初始化
module.weight.data.normal_(mean=0.0, std=std)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
# 对于嵌入层,使用正态分布初始化权重,如果定义了填充索引,则对应权重置零
module.weight.data.normal_(mean=0.0, std=std)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
if hasattr(module, "reference_points") and not self.config.two_stage:
# 如果模块具有 reference_points 属性且非两阶段配置,则使用 Xavier 均匀分布初始化权重,偏置初始化为零
nn.init.xavier_uniform_(module.reference_points.weight.data, gain=1.0)
nn.init.constant_(module.reference_points.bias.data, 0.0)
if hasattr(module, "level_embed"):
# 如果模块具有 level_embed 属性,则使用正态分布初始化权重
nn.init.normal_(module.level_embed)
# DETA_START_DOCSTRING 是一个原始文档字符串,描述了这个模型的继承和一般行为。
# 它继承自 PreTrainedModel 类,可以查阅该超类的文档以了解其实现的通用方法,
# 如下载或保存模型、调整输入嵌入大小、修剪头部等。
DETA_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`DetaConfig`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
# DETA_INPUTS_DOCSTRING 是一个空的文档字符串,将用于描述输入参数的信息。
DETA_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Padding will be ignored by default should you provide it.
像素数值。默认情况下将忽略填充部分的像素值。
Pixel values can be obtained using [`AutoImageProcessor`]. See [`AutoImageProcessor.__call__`] for details.
可以使用 [`AutoImageProcessor`] 获得像素值。详细信息请参见 [`AutoImageProcessor.__call__`]。
pixel_mask (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
Mask to avoid performing attention on padding pixel values. Mask values selected in `[0, 1]`:
遮罩,用于避免在填充像素值上执行注意力操作。遮罩的取值范围为 `[0, 1]`:
- 1 for pixels that are real (i.e. **not masked**),
- 0 for pixels that are padding (i.e. **masked**).
- 值为 1 表示真实像素(即**未遮罩**),
- 值为 0 表示填充像素(即**已遮罩**)。
[What are attention masks?](../glossary
[注意力遮罩是什么?](../glossary
decoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*):
Not used by default. Can be used to mask object queries.
默认情况下不使用。可以用于屏蔽对象查询。
encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*):
Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`)
元组包含 (`last_hidden_state`, *可选*: `hidden_states`, *可选*: `attentions`)
`last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of
hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
`last_hidden_state` 的形状为 `(batch_size, sequence_length, hidden_size)`,*可选*,是编码器最后一层的输出隐藏状态序列。用于解码器的交叉注意力。
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you
can choose to directly pass a flattened representation of an image.
可选项,可以直接传递一个展平的图像表示,而不是传递扁平化的特征图(骨干网络输出 + 投影层的输出)。
decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
Optionally, instead of initializing the queries with a tensor of zeros, you can choose to directly pass an
embedded representation.
可选项,可以直接传递嵌入表示,而不是用零张量初始化查询。
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
是否返回所有注意力层的注意力张量。有关更多详细信息,请参阅返回张量中的 `attentions`。
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
是否返回所有层的隐藏状态。有关更多详细信息,请参阅返回张量中的 `hidden_states`。
return_dict (`bool`, *optional*):
Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
是否返回 [`~file_utils.ModelOutput`] 而不是普通元组。
"""
class DetaEncoder(DetaPreTrainedModel):
"""
Transformer encoder consisting of *config.encoder_layers* deformable attention layers. Each layer is a
[`DetaEncoderLayer`].
The encoder updates the flattened multi-scale feature maps through multiple deformable attention layers.
Args:
config: DetaConfig
"""
def __init__(self, config: DetaConfig):
super().__init__(config)
self.dropout = config.dropout
# 创建多个 DetaEncoderLayer 组成的层列表
self.layers = nn.ModuleList([DetaEncoderLayer(config) for _ in range(config.encoder_layers)])
self.gradient_checkpointing = False
# 初始化权重并应用最终处理
self.post_init()
@staticmethod
def get_reference_points(spatial_shapes, valid_ratios, device):
"""
Get reference points for each feature map. Used in decoder.
Args:
spatial_shapes (`torch.LongTensor` of shape `(num_feature_levels, 2)`):
Spatial shapes of each feature map.
valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`):
Valid ratios of each feature map.
device (`torch.device`):
Device on which to create the tensors.
Returns:
`torch.FloatTensor` of shape `(batch_size, num_queries, num_feature_levels, 2)`
"""
reference_points_list = []
# 遍历每个特征图的空间形状
for level, (height, width) in enumerate(spatial_shapes):
# 创建网格矩阵,生成参考点
ref_y, ref_x = meshgrid(
torch.linspace(0.5, height - 0.5, height, dtype=torch.float32, device=device),
torch.linspace(0.5, width - 0.5, width, dtype=torch.float32, device=device),
indexing="ij",
)
# 重新形状和缩放参考点,考虑有效比率
ref_y = ref_y.reshape(-1)[None] / (valid_ratios[:, None, level, 1] * height)
ref_x = ref_x.reshape(-1)[None] / (valid_ratios[:, None, level, 0] * width)
ref = torch.stack((ref_x, ref_y), -1)
reference_points_list.append(ref)
# 拼接所有参考点并应用有效比率
reference_points = torch.cat(reference_points_list, 1)
reference_points = reference_points[:, :, None] * valid_ratios[:, None]
return reference_points
def forward(
self,
inputs_embeds=None,
attention_mask=None,
position_embeddings=None,
spatial_shapes=None,
level_start_index=None,
valid_ratios=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
# 将 `position_embeddings`, `reference_points`, `spatial_shapes` 和 `valid_ratios` 添加到前向传播中。
# 同时返回所有解码层的中间输出和参考点的堆栈。
Args:
config: DetaConfig
"""
def __init__(self, config: DetaConfig):
super().__init__(config)
self.dropout = config.dropout
self.layers = nn.ModuleList([DetaDecoderLayer(config) for _ in range(config.decoder_layers)])
self.gradient_checkpointing = False
self.bbox_embed = None
self.class_embed = None
self.post_init()
def forward(
self,
inputs_embeds=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
position_embeddings=None,
reference_points=None,
spatial_shapes=None,
level_start_index=None,
valid_ratios=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
"""
The bare DETA Model (consisting of a backbone and encoder-decoder Transformer) outputting raw hidden-states without
any specific head on top.
"""
@add_start_docstrings(
"""
The bare DETA Model (consisting of a backbone and encoder-decoder Transformer) outputting raw hidden-states without
any specific head on top.
""",
DETA_START_DOCSTRING,
)
class DetaModel(DetaPreTrainedModel):
def __init__(self, config: DetaConfig):
super().__init__(config)
if config.two_stage:
requires_backends(self, ["torchvision"])
self.backbone = DetaBackboneWithPositionalEncodings(config)
intermediate_channel_sizes = self.backbone.intermediate_channel_sizes
if config.num_feature_levels > 1:
num_backbone_outs = len(intermediate_channel_sizes)
input_proj_list = []
for _ in range(num_backbone_outs):
in_channels = intermediate_channel_sizes[_]
input_proj_list.append(
nn.Sequential(
nn.Conv2d(in_channels, config.d_model, kernel_size=1),
nn.GroupNorm(32, config.d_model),
)
)
for _ in range(config.num_feature_levels - num_backbone_outs):
input_proj_list.append(
nn.Sequential(
nn.Conv2d(in_channels, config.d_model, kernel_size=3, stride=2, padding=1),
nn.GroupNorm(32, config.d_model),
)
)
in_channels = config.d_model
self.input_proj = nn.ModuleList(input_proj_list)
else:
self.input_proj = nn.ModuleList(
[
nn.Sequential(
nn.Conv2d(intermediate_channel_sizes[-1], config.d_model, kernel_size=1),
nn.GroupNorm(32, config.d_model),
)
]
)
if not config.two_stage:
self.query_position_embeddings = nn.Embedding(config.num_queries, config.d_model * 2)
self.encoder = DetaEncoder(config)
self.decoder = DetaDecoder(config)
self.level_embed = nn.Parameter(torch.Tensor(config.num_feature_levels, config.d_model))
if config.two_stage:
self.enc_output = nn.Linear(config.d_model, config.d_model)
self.enc_output_norm = nn.LayerNorm(config.d_model)
self.pos_trans = nn.Linear(config.d_model * 2, config.d_model * 2)
self.pos_trans_norm = nn.LayerNorm(config.d_model * 2)
self.pix_trans = nn.Linear(config.d_model, config.d_model)
self.pix_trans_norm = nn.LayerNorm(config.d_model)
else:
self.reference_points = nn.Linear(config.d_model, 2)
self.assign_first_stage = config.assign_first_stage
self.two_stage_num_proposals = config.two_stage_num_proposals
self.post_init()
def get_encoder(self):
return self.encoder
def get_decoder(self):
return self.decoder
def freeze_backbone(self):
for name, param in self.backbone.model.named_parameters():
param.requires_grad_(False)
def unfreeze_backbone(self):
for name, param in self.backbone.model.named_parameters():
param.requires_grad_(True)
def get_valid_ratio(self, mask, dtype=torch.float32):
"""获取所有特征图的有效比例。"""
_, height, width = mask.shape
valid_height = torch.sum(mask[:, :, 0], 1)
valid_width = torch.sum(mask[:, 0, :], 1)
valid_ratio_height = valid_height.to(dtype) / height
valid_ratio_width = valid_width.to(dtype) / width
valid_ratio = torch.stack([valid_ratio_width, valid_ratio_height], -1)
return valid_ratio
def get_proposal_pos_embed(self, proposals):
"""获取提议的位置嵌入。"""
num_pos_feats = self.config.d_model // 2
temperature = 10000
scale = 2 * math.pi
dim_t = torch.arange(num_pos_feats, dtype=torch.int64, device=proposals.device).float()
dim_t = temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / num_pos_feats)
proposals = proposals.sigmoid() * scale
pos = proposals[:, :, :, None] / dim_t
pos = torch.stack((pos[:, :, :, 0::2].sin(), pos[:, :, :, 1::2].cos()), dim=4).flatten(2)
return pos
@add_start_docstrings_to_model_forward(DETA_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=DetaModelOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
pixel_values: torch.FloatTensor,
pixel_mask: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.FloatTensor] = None,
encoder_outputs: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
@add_start_docstrings(
"""
DETA Model (consisting of a backbone and encoder-decoder Transformer) with object detection heads on top, for tasks
such as COCO detection.
""",
DETA_START_DOCSTRING,
)
class DetaForObjectDetection(DetaPreTrainedModel):
_tied_weights_keys = [r"bbox_embed\.\d+"]
_no_split_modules = None
def __init__(self, config: DetaConfig):
super().__init__(config)
self.model = DetaModel(config)
self.class_embed = nn.Linear(config.d_model, config.num_labels)
self.bbox_embed = DetaMLPPredictionHead(
input_dim=config.d_model, hidden_dim=config.d_model, output_dim=4, num_layers=3
)
prior_prob = 0.01
bias_value = -math.log((1 - prior_prob) / prior_prob)
self.class_embed.bias.data = torch.ones(config.num_labels) * bias_value
nn.init.constant_(self.bbox_embed.layers[-1].weight.data, 0)
nn.init.constant_(self.bbox_embed.layers[-1].bias.data, 0)
nn.init.constant_(self.bbox_embed.layers[-1].bias.data[2:], -2.0)
num_pred = (config.decoder_layers + 1) if config.two_stage else config.decoder_layers
if config.with_box_refine:
self.class_embed = _get_clones(self.class_embed, num_pred)
self.bbox_embed = _get_clones(self.bbox_embed, num_pred)
nn.init.constant_(self.bbox_embed[0].layers[-1].bias.data[2:], -2.0)
self.model.decoder.bbox_embed = self.bbox_embed
else:
nn.init.constant_(self.bbox_embed.layers[-1].bias.data[2:], -2.0)
self.class_embed = nn.ModuleList([self.class_embed for _ in range(num_pred)])
self.bbox_embed = nn.ModuleList([self.bbox_embed for _ in range(num_pred)])
self.model.decoder.bbox_embed = None
if config.two_stage:
self.model.decoder.class_embed = self.class_embed
for box_embed in self.bbox_embed:
nn.init.constant_(box_embed.layers[-1].bias.data[2:], 0.0)
self.post_init()
@torch.jit.unused
def _set_aux_loss(self, outputs_class, outputs_coord):
aux_loss = [
{"logits": logits, "pred_boxes": pred_boxes}
for logits, pred_boxes in zip(outputs_class.transpose(0, 1)[:-1], outputs_coord.transpose(0, 1)[:-1])
]
return aux_loss
@add_start_docstrings_to_model_forward(DETA_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=DetaObjectDetectionOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
pixel_values: torch.FloatTensor,
pixel_mask: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.FloatTensor] = None,
encoder_outputs: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[List[dict]] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
def dice_loss(inputs, targets, num_boxes):
"""
Compute the DICE loss, similar to generalized IOU for masks
Args:
inputs: A float tensor of arbitrary shape.
The predictions for each example.
targets: A float tensor with the same shape as inputs. Stores the binary
classification label for each element in inputs (0 for the negative class and 1 for the positive
class).
num_boxes: Number of boxes (or examples) to compute the average loss over.
"""
inputs = inputs.sigmoid()
inputs = inputs.flatten(1)
numerator = 2 * (inputs * targets).sum(1)
denominator = inputs.sum(-1) + targets.sum(-1)
loss = 1 - (numerator + 1) / (denominator + 1)
return loss.sum() / num_boxes
def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2):
"""
Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002.
Args:
inputs (`torch.FloatTensor` of arbitrary shape):
The predictions for each example.
targets (`torch.FloatTensor` with the same shape as `inputs`)
A tensor storing the binary classification label for each element in the `inputs` (0 for the negative class
and 1 for the positive class).
num_boxes: Number of boxes (or examples) to compute the average loss over.
alpha (`float`, *optional*, defaults to `0.25`):
Optional weighting factor in the range (0,1) to balance positive vs. negative examples.
gamma (`int`, *optional*, defaults to `2`):
Exponent of the modulating factor (1 - p_t) to balance easy vs hard examples.
Returns:
Loss tensor
"""
prob = inputs.sigmoid()
ce_loss = nn.functional.binary_cross_entropy_with_logits(inputs, targets, reduction="none")
p_t = prob * targets + (1 - prob) * (1 - targets)
loss = ce_loss * ((1 - p_t) ** gamma)
if alpha >= 0:
alpha_t = alpha * targets + (1 - alpha) * (1 - targets)
loss = alpha_t * loss
return loss.mean(1).sum() / num_boxes
class DetaLoss(nn.Module):
"""
This class computes the losses for `DetaForObjectDetection`. The process happens in two steps: 1) we compute
hungarian assignment between ground truth boxes and the outputs of the model 2) we supervise each pair of matched
ground-truth / prediction (supervised class and box).
Args:
matcher (`DetaHungarianMatcher`):
Module able to compute a matching between targets and proposals.
num_classes (`int`):
Number of object categories, omitting the special no-object category.
focal_alpha (`float`):
Alpha parameter in focal loss.
losses (`List[str]`):
List of all the losses to be applied. See `get_loss` for a list of all available losses.
"""
pass
def __init__(
self,
matcher,
num_classes,
focal_alpha,
losses,
num_queries,
assign_first_stage=False,
assign_second_stage=False,
):
super().__init__()
self.matcher = matcher
self.num_classes = num_classes
self.focal_alpha = focal_alpha
self.losses = losses
self.assign_first_stage = assign_first_stage
self.assign_second_stage = assign_second_stage
if self.assign_first_stage:
self.stg1_assigner = DetaStage1Assigner()
if self.assign_second_stage:
self.stg2_assigner = DetaStage2Assigner(num_queries)
def loss_labels(self, outputs, targets, indices, num_boxes):
"""
Classification loss (Binary focal loss) targets dicts must contain the key "class_labels" containing a tensor
of dim [nb_target_boxes]
"""
if "logits" not in outputs:
raise KeyError("No logits were found in the outputs")
source_logits = outputs["logits"]
idx = self._get_source_permutation_idx(indices)
target_classes_o = torch.cat([t["class_labels"][J] for t, (_, J) in zip(targets, indices)])
target_classes = torch.full(
source_logits.shape[:2], self.num_classes, dtype=torch.int64, device=source_logits.device
)
target_classes[idx] = target_classes_o
target_classes_onehot = torch.zeros(
[source_logits.shape[0], source_logits.shape[1], source_logits.shape[2] + 1],
dtype=source_logits.dtype,
layout=source_logits.layout,
device=source_logits.device,
)
target_classes_onehot.scatter_(2, target_classes.unsqueeze(-1), 1)
target_classes_onehot = target_classes_onehot[:, :, :-1]
loss_ce = (
sigmoid_focal_loss(source_logits, target_classes_onehot, num_boxes, alpha=self.focal_alpha, gamma=2)
* source_logits.shape[1]
)
losses = {"loss_ce": loss_ce}
return losses
@torch.no_grad()
def loss_cardinality(self, outputs, targets, indices, num_boxes):
logits = outputs["logits"]
device = logits.device
target_lengths = torch.as_tensor([len(v["class_labels"]) for v in targets], device=device)
card_pred = (logits.argmax(-1) != logits.shape[-1] - 1).sum(1)
card_err = nn.functional.l1_loss(card_pred.float(), target_lengths.float())
losses = {"cardinality_error": card_err}
return losses
def loss_boxes(self, outputs, targets, indices, num_boxes):
"""
Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss.
Targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4]. The target boxes
are expected in format (center_x, center_y, w, h), normalized by the image size.
"""
if "pred_boxes" not in outputs:
raise KeyError("No predicted boxes found in outputs")
idx = self._get_source_permutation_idx(indices)
source_boxes = outputs["pred_boxes"][idx]
target_boxes = torch.cat([t["boxes"][i] for t, (_, i) in zip(targets, indices)], dim=0)
loss_bbox = nn.functional.l1_loss(source_boxes, target_boxes, reduction="none")
losses = {}
losses["loss_bbox"] = loss_bbox.sum() / num_boxes
loss_giou = 1 - torch.diag(
generalized_box_iou(center_to_corners_format(source_boxes), center_to_corners_format(target_boxes))
)
losses["loss_giou"] = loss_giou.sum() / num_boxes
return losses
def _get_source_permutation_idx(self, indices):
batch_idx = torch.cat([torch.full_like(source, i) for i, (source, _) in enumerate(indices)])
source_idx = torch.cat([source for (source, _) in indices])
return batch_idx, source_idx
def _get_target_permutation_idx(self, indices):
batch_idx = torch.cat([torch.full_like(target, i) for i, (_, target) in enumerate(indices)])
target_idx = torch.cat([target for (_, target) in indices])
return batch_idx, target_idx
def get_loss(self, loss, outputs, targets, indices, num_boxes):
loss_map = {
"labels": self.loss_labels,
"cardinality": self.loss_cardinality,
"boxes": self.loss_boxes,
}
if loss not in loss_map:
raise ValueError(f"Loss {loss} not supported")
return loss_map[loss](outputs, targets, indices, num_boxes)
def forward(self, outputs, targets):
"""
This performs the loss computation.
Args:
outputs (`dict`, *optional*):
Dictionary of tensors, see the output specification of the model for the format.
targets (`List[dict]`, *optional*):
List of dicts, such that `len(targets) == batch_size`. The expected keys in each dict depends on the
losses applied, see each loss' doc.
"""
outputs_without_aux = {k: v for k, v in outputs.items() if k not in ("auxiliary_outputs", "enc_outputs")}
if self.assign_second_stage:
indices = self.stg2_assigner(outputs_without_aux, targets)
else:
indices = self.matcher(outputs_without_aux, targets)
num_boxes = sum(len(t["class_labels"]) for t in targets)
num_boxes = torch.as_tensor([num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device)
world_size = 1
if is_accelerate_available():
if PartialState._shared_state != {}:
num_boxes = reduce(num_boxes)
world_size = PartialState().num_processes
num_boxes = torch.clamp(num_boxes / world_size, min=1).item()
losses = {}
for loss in self.losses:
losses.update(self.get_loss(loss, outputs, targets, indices, num_boxes))
if "auxiliary_outputs" in outputs:
for i, auxiliary_outputs in enumerate(outputs["auxiliary_outputs"]):
if not self.assign_second_stage:
indices = self.matcher(auxiliary_outputs, targets)
for loss in self.losses:
l_dict = self.get_loss(loss, auxiliary_outputs, targets, indices, num_boxes)
l_dict = {k + f"_{i}": v for k, v in l_dict.items()}
losses.update(l_dict)
if "enc_outputs" in outputs:
enc_outputs = outputs["enc_outputs"]
bin_targets = copy.deepcopy(targets)
for bt in bin_targets:
bt["class_labels"] = torch.zeros_like(bt["class_labels"])
if self.assign_first_stage:
indices = self.stg1_assigner(enc_outputs, bin_targets)
else:
indices = self.matcher(enc_outputs, bin_targets)
for loss in self.losses:
l_dict = self.get_loss(loss, enc_outputs, bin_targets, indices, num_boxes)
l_dict = {k + "_enc": v for k, v in l_dict.items()}
losses.update(l_dict)
return losses
class DetaMLPPredictionHead(nn.Module):
"""
Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates,
height and width of a bounding box w.r.t. an image.
Copied from https://github.com/facebookresearch/detr/blob/master/models/detr.py
"""
def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
super().__init__()
self.num_layers = num_layers
h = [hidden_dim] * (num_layers - 1)
self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))
def forward(self, x):
for i, layer in enumerate(self.layers):
x = nn.functional.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
return x
class DetaHungarianMatcher(nn.Module):
"""
This class computes an assignment between the targets and the predictions of the network.
For efficiency reasons, the targets don't include the no_object. Because of this, in general, there are more
predictions than targets. In this case, we do a 1-to-1 matching of the best predictions, while the others are
un-matched (and thus treated as non-objects).
Args:
class_cost:
The relative weight of the classification error in the matching cost.
bbox_cost:
The relative weight of the L1 error of the bounding box coordinates in the matching cost.
giou_cost:
The relative weight of the giou loss of the bounding box in the matching cost.
"""
def __init__(self, class_cost: float = 1, bbox_cost: float = 1, giou_cost: float = 1):
super().__init__()
requires_backends(self, ["scipy"])
self.class_cost = class_cost
self.bbox_cost = bbox_cost
self.giou_cost = giou_cost
if class_cost == 0 and bbox_cost == 0 and giou_cost == 0:
raise ValueError("All costs of the Matcher can't be 0")
@torch.no_grad()
def forward(self, outputs, targets):
"""
Args:
outputs (`dict`):
A dictionary that contains at least these entries:
* "logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits
* "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates.
targets (`List[dict]`):
A list of targets (len(targets) = batch_size), where each target is a dict containing:
* "class_labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of
ground-truth objects in the target) containing the class labels
* "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates.
Returns:
`List[Tuple]`: A list of size `batch_size`, containing tuples of (index_i, index_j) where:
- index_i is the indices of the selected predictions (in order)
- index_j is the indices of the corresponding selected targets (in order)
For each batch element, it holds: len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
"""
batch_size, num_queries = outputs["logits"].shape[:2]
out_prob = outputs["logits"].flatten(0, 1).sigmoid()
out_bbox = outputs["pred_boxes"].flatten(0, 1)
target_ids = torch.cat([v["class_labels"] for v in targets])
target_bbox = torch.cat([v["boxes"] for v in targets])
alpha = 0.25
gamma = 2.0
neg_cost_class = (1 - alpha) * (out_prob**gamma) * (-(1 - out_prob + 1e-8).log())
pos_cost_class = alpha * ((1 - out_prob) ** gamma) * (-(out_prob + 1e-8).log())
class_cost = pos_cost_class[:, target_ids] - neg_cost_class[:, target_ids]
bbox_cost = torch.cdist(out_bbox, target_bbox, p=1)
giou_cost = -generalized_box_iou(center_to_corners_format(out_bbox), center_to_corners_format(target_bbox))
cost_matrix = self.bbox_cost * bbox_cost + self.class_cost * class_cost + self.giou_cost * giou_cost
cost_matrix = cost_matrix.view(batch_size, num_queries, -1).cpu()
sizes = [len(v["boxes"]) for v in targets]
indices = [linear_sum_assignment(c[i]) for i, c in enumerate(cost_matrix.split(sizes, -1))]
return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices]
def _upcast(t: Tensor) -> Tensor:
"""
Protects from numerical overflows in multiplications by upcasting to the equivalent higher type.
Args:
t (`torch.Tensor`): The input tensor to be upcasted.
Returns:
`torch.Tensor`: The upcasted tensor.
"""
if t.is_floating_point():
return t if t.dtype in (torch.float32, torch.float64) else t.float()
else:
return t if t.dtype in (torch.int32, torch.int64) else t.int()
def box_area(boxes: Tensor) -> Tensor:
"""
Computes the area of a set of bounding boxes, which are specified by its (x1, y1, x2, y2) coordinates.
Args:
boxes (`torch.FloatTensor` of shape `(number_of_boxes, 4)`):
Boxes for which the area will be computed. They are expected to be in (x1, y1, x2, y2) format with `0 <= x1
< x2` and `0 <= y1 < y2`.
Returns:
`torch.FloatTensor`: a tensor containing the area for each box.
"""
boxes = _upcast(boxes)
return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
def box_iou(boxes1, boxes2):
"""
Computes the IoU (Intersection over Union) between two sets of bounding boxes.
Args:
boxes1 (`torch.Tensor`): Bounding boxes in format (x1, y1, x2, y2).
boxes2 (`torch.Tensor`): Bounding boxes in format (x1, y1, x2, y2).
Returns:
`torch.Tensor`: A tensor containing IoU values for each pair of boxes.
"""
area1 = box_area(boxes1)
area2 = box_area(boxes2)
left_top = torch.max(boxes1[:, None, :2], boxes2[:, :2])
right_bottom = torch.min(boxes1[:, None, 2:], boxes2[:, 2:])
width_height = (right_bottom - left_top).clamp(min=0)
inter = width_height[:, :, 0] * width_height[:, :, 1]
union = area1[:, None] + area2 - inter
iou = inter / union
return iou, union
def generalized_box_iou(boxes1, boxes2):
"""
Computes the Generalized IoU (GIoU) between two sets of bounding boxes.
Args:
boxes1 (`torch.Tensor`): Bounding boxes in format (x1, y1, x2, y2).
boxes2 (`torch.Tensor`): Bounding boxes in format (x1, y1, x2, y2).
Returns:
`torch.Tensor`: A [N, M] pairwise matrix containing GIoU values.
"""
if not (boxes1[:, 2:] >= boxes1[:, :2]).all():
raise ValueError(f"boxes1 must be in [x0, y0, x1, y1] (corner) format, but got {boxes1}")
if not (boxes2[:, 2:] >= boxes2[:, :2]).all():
raise ValueError(f"boxes2 must be in [x0, y0, x1, y1] (corner) format, but got {boxes2}")
iou, union = box_iou(boxes1, boxes2)
top_left = torch.min(boxes1[:, None, :2], boxes2[:, :2])
bottom_right = torch.max(boxes1[:, None, 2:], boxes2[:, 2:])
width_height = (bottom_right - top_left).clamp(min=0)
area = width_height[:, :, 0] * width_height[:, :, 1]
return iou - (area - union) / area
def nonzero_tuple(x):
"""
A 'as_tuple=True' version of torch.nonzero to support torchscript. because of
https://github.com/pytorch/pytorch/issues/38718
"""
if torch.jit.is_scripting():
if x.dim() == 0:
return x.unsqueeze(0).nonzero().unbind(1)
return x.nonzero().unbind(1)
else:
return x.nonzero(as_tuple=True)
class DetaMatcher(object):
"""
This class assigns to each predicted "element" (e.g., a box) a ground-truth element. Each predicted element will
have exactly zero or one matches; each ground-truth element may be matched to zero or more predicted elements.
The matching is determined by the MxN match_quality_matrix, that characterizes how well each (ground-truth,
prediction)-pair match each other. For example, if the elements are boxes, this matrix may contain box
intersection-over-union overlap values.
The matcher returns (a) a vector of length N containing the index of the ground-truth element m in [0, M) that
matches to prediction n in [0, N). (b) a vector of length N containing the labels for each prediction.
"""
def __init__(self, thresholds: List[float], labels: List[int], allow_low_quality_matches: bool = False):
"""
Args:
thresholds (`list[float]`):
A list of thresholds used to stratify predictions into levels.
labels (`list[int`):
A list of values to label predictions belonging at each level. A label can be one of {-1, 0, 1}
signifying {ignore, negative class, positive class}, respectively.
allow_low_quality_matches (`bool`, *optional*, defaults to `False`):
If `True`, produce additional matches for predictions with maximum match quality lower than
high_threshold. See `set_low_quality_matches_` for more details.
For example,
thresholds = [0.3, 0.5] labels = [0, -1, 1] All predictions with iou < 0.3 will be marked with 0 and
thus will be considered as false positives while training. All predictions with 0.3 <= iou < 0.5 will
be marked with -1 and thus will be ignored. All predictions with 0.5 <= iou will be marked with 1 and
thus will be considered as true positives.
"""
thresholds = thresholds[:]
if thresholds[0] < 0:
raise ValueError("Thresholds should be positive")
thresholds.insert(0, -float("inf"))
thresholds.append(float("inf"))
if not all(low <= high for (low, high) in zip(thresholds[:-1], thresholds[1:])):
raise ValueError("Thresholds should be sorted.")
if not all(l in [-1, 0, 1] for l in labels):
raise ValueError("All labels should be either -1, 0 or 1")
if len(labels) != len(thresholds) - 1:
raise ValueError("Number of labels should be equal to number of thresholds - 1")
self.thresholds = thresholds
self.labels = labels
self.allow_low_quality_matches = allow_low_quality_matches
def __call__(self, match_quality_matrix):
"""
Args:
match_quality_matrix (Tensor[float]): an MxN tensor, containing the
pairwise quality between M ground-truth elements and N predicted elements. All elements must be >= 0
(due to the use of `torch.nonzero` for selecting indices in `set_low_quality_matches_`).
Returns:
matches (Tensor[int64]): a vector of length N, where matches[i] is a matched
ground-truth index in [0, M)
match_labels (Tensor[int8]): a vector of length N, where pred_labels[i] indicates
whether a prediction is a true or false positive or ignored
"""
assert match_quality_matrix.dim() == 2
if match_quality_matrix.numel() == 0:
default_matches = match_quality_matrix.new_full((match_quality_matrix.size(1),), 0, dtype=torch.int64)
default_match_labels = match_quality_matrix.new_full(
(match_quality_matrix.size(1),), self.labels[0], dtype=torch.int8
)
return default_matches, default_match_labels
assert torch.all(match_quality_matrix >= 0)
matched_vals, matches = match_quality_matrix.max(dim=0)
match_labels = matches.new_full(matches.size(), 1, dtype=torch.int8)
for l, low, high in zip(self.labels, self.thresholds[:-1], self.thresholds[1:]):
low_high = (matched_vals >= low) & (matched_vals < high)
match_labels[low_high] = l
if self.allow_low_quality_matches:
self.set_low_quality_matches_(match_labels, match_quality_matrix)
return matches, match_labels
highest_quality_foreach_gt, _ = match_quality_matrix.max(dim=1)
_, pred_inds_with_highest_quality = nonzero_tuple(match_quality_matrix == highest_quality_foreach_gt[:, None])
match_labels[pred_inds_with_highest_quality] = 1
import torch
def subsample_labels(labels: torch.Tensor, num_samples: int, positive_fraction: float, bg_label: int):
"""
Return `num_samples` (or fewer, if not enough found) random samples from `labels` which is a mixture of positives &
negatives. It will try to return as many positives as possible without exceeding `positive_fraction * num_samples`,
and then try to fill the remaining slots with negatives.
Args:
labels (Tensor): (N, ) label vector with values:
* -1: ignore
* bg_label: background ("negative") class
* otherwise: one or more foreground ("positive") classes
num_samples (int): The total number of labels with value >= 0 to return.
Values that are not sampled will be filled with -1 (ignore).
positive_fraction (float): The number of subsampled labels with values > 0
is `min(num_positives, int(positive_fraction * num_samples))`. The number of negatives sampled is
`min(num_negatives, num_samples - num_positives_sampled)`. In order words, if there are not enough
positives, the sample is filled with negatives. If there are also not enough negatives, then as many
elements are sampled as is possible.
bg_label (int): label index of background ("negative") class.
Returns:
pos_idx, neg_idx (Tensor):
1D vector of indices. The total length of both is `num_samples` or fewer.
"""
positive = torch.nonzero((labels != -1) & (labels != bg_label)).squeeze(1)
negative = torch.nonzero(labels == bg_label).squeeze(1)
num_pos = int(num_samples * positive_fraction)
num_pos = min(positive.numel(), num_pos)
num_neg = num_samples - num_pos
num_neg = min(negative.numel(), num_neg)
perm1 = torch.randperm(positive.numel(), device=positive.device)[:num_pos]
perm2 = torch.randperm(negative.numel(), device=negative.device)[:num_neg]
pos_idx = positive[perm1]
neg_idx = negative[perm2]
return pos_idx, neg_idx
def sample_topk_per_gt(pr_inds, gt_inds, iou, k):
if len(gt_inds) == 0:
return pr_inds, gt_inds
gt_inds2, counts = gt_inds.unique(return_counts=True)
scores, pr_inds2 = iou[gt_inds2].topk(k, dim=1)
gt_inds2 = gt_inds2[:, None].repeat(1, k)
pr_inds3 = torch.cat([pr[:c] for c, pr in zip(counts, pr_inds2)])
gt_inds3 = torch.cat([gt[:c] for c, gt in zip(counts, gt_inds2)])
return pr_inds3, gt_inds3
class DetaStage2Assigner(nn.Module):
def __init__(self, num_queries, max_k=4):
super().__init__()
self.positive_fraction = 0.25
self.bg_label = 400
self.batch_size_per_image = num_queries
self.proposal_matcher = DetaMatcher(thresholds=[0.6], labels=[0, 1], allow_low_quality_matches=True)
self.k = max_k
def _sample_proposals(self, matched_idxs: torch.Tensor, matched_labels: torch.Tensor, gt_classes: torch.Tensor):
"""
根据 N 个提议与 M 个真实数据的匹配情况,采样提议并设置它们的分类标签。
Args:
matched_idxs (Tensor): 长度为 N 的向量,每个元素是每个提议最佳匹配的真实数据索引,取值范围为 [0, M)。
matched_labels (Tensor): 长度为 N 的向量,每个元素是提议的分类标签(来自 cfg.MODEL.ROI_HEADS.IOU_LABELS)。
gt_classes (Tensor): 长度为 M 的向量,每个元素是真实数据的类别。
Returns:
Tensor: 采样提议的索引向量,每个元素在 [0, N) 范围内。
Tensor: 与采样提议对应的分类标签向量,每个元素与采样提议的索引向量一一对应。每个样本被标记为一个类别在 [0, num_classes) 或背景 (num_classes)。
"""
has_gt = gt_classes.numel() > 0
if has_gt:
gt_classes = gt_classes[matched_idxs]
gt_classes[matched_labels == 0] = self.bg_label
gt_classes[matched_labels == -1] = -1
else:
gt_classes = torch.zeros_like(matched_idxs) + self.bg_label
sampled_fg_idxs, sampled_bg_idxs = subsample_labels(
gt_classes, self.batch_size_per_image, self.positive_fraction, self.bg_label
)
sampled_idxs = torch.cat([sampled_fg_idxs, sampled_bg_idxs], dim=0)
return sampled_idxs, gt_classes[sampled_idxs]
def forward(self, outputs, targets, return_cost_matrix=False):
bs = len(targets)
indices = []
ious = []
for b in range(bs):
iou, _ = box_iou(
center_to_corners_format(targets[b]["boxes"]),
center_to_corners_format(outputs["init_reference"][b].detach()),
)
matched_idxs, matched_labels = self.proposal_matcher(
iou
)
(
sampled_idxs,
sampled_gt_classes,
) = self._sample_proposals(
matched_idxs, matched_labels, targets[b]["class_labels"]
)
pos_pr_inds = sampled_idxs[sampled_gt_classes != self.bg_label]
pos_gt_inds = matched_idxs[pos_pr_inds]
pos_pr_inds, pos_gt_inds = self.postprocess_indices(pos_pr_inds, pos_gt_inds, iou)
indices.append((pos_pr_inds, pos_gt_inds))
ious.append(iou)
if return_cost_matrix:
return indices, ious
return indices
def postprocess_indices(self, pr_inds, gt_inds, iou):
return sample_topk_per_gt(pr_inds, gt_inds, iou, self.k)
class DetaStage1Assigner(nn.Module):
def __init__(self, t_low=0.3, t_high=0.7, max_k=4):
super().__init__()
self.positive_fraction = 0.5
self.batch_size_per_image = 256
self.k = max_k
self.t_low = t_low
self.t_high = t_high
self.anchor_matcher = DetaMatcher(
thresholds=[t_low, t_high], labels=[0, -1, 1], allow_low_quality_matches=True
)
def _subsample_labels(self, label):
"""
随机抽样一部分正负样本,并将标签向量中未包含在抽样中的元素设置为忽略标签(-1)。
Args:
labels (Tensor): 包含标签值-1, 0, 1的向量。将在原地被修改并返回。
"""
pos_idx, neg_idx = subsample_labels(label, self.batch_size_per_image, self.positive_fraction, 0)
label.fill_(-1)
label.scatter_(0, pos_idx, 1)
label.scatter_(0, neg_idx, 0)
return label
def forward(self, outputs, targets):
bs = len(targets)
indices = []
for b in range(bs):
anchors = outputs["anchors"][b]
if len(targets[b]["boxes"]) == 0:
indices.append(
(
torch.tensor([], dtype=torch.long, device=anchors.device),
torch.tensor([], dtype=torch.long, device=anchors.device),
)
)
continue
iou, _ = box_iou(
center_to_corners_format(targets[b]["boxes"]),
center_to_corners_format(anchors),
)
matched_idxs, matched_labels = self.anchor_matcher(
iou
)
matched_labels = self._subsample_labels(matched_labels)
all_pr_inds = torch.arange(len(anchors), device=matched_labels.device)
pos_pr_inds = all_pr_inds[matched_labels == 1]
pos_gt_inds = matched_idxs[pos_pr_inds]
pos_pr_inds, pos_gt_inds = self.postprocess_indices(pos_pr_inds, pos_gt_inds, iou)
pos_pr_inds, pos_gt_inds = pos_pr_inds.to(anchors.device), pos_gt_inds.to(anchors.device)
indices.append((pos_pr_inds, pos_gt_inds))
return indices
def postprocess_indices(self, pr_inds, gt_inds, iou):
return sample_topk_per_gt(pr_inds, gt_inds, iou, self.k)
.\models\deta\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available
_import_structure = {
"configuration_deta": ["DETA_PRETRAINED_CONFIG_ARCHIVE_MAP", "DetaConfig"],
}
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["image_processing_deta"] = ["DetaImageProcessor"]
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_deta"] = [
"DETA_PRETRAINED_MODEL_ARCHIVE_LIST",
"DetaForObjectDetection",
"DetaModel",
"DetaPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_deta import DETA_PRETRAINED_CONFIG_ARCHIVE_MAP, DetaConfig
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .image_processing_deta import DetaImageProcessor
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_deta import (
DETA_PRETRAINED_MODEL_ARCHIVE_LIST,
DetaForObjectDetection,
DetaModel,
DetaPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\detr\configuration_detr.py
""" DETR model configuration"""
from collections import OrderedDict
from typing import Mapping
from packaging import version
from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfig
from ...utils import logging
from ..auto import CONFIG_MAPPING
logger = logging.get_logger(__name__)
DETR_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"facebook/detr-resnet-50": "https://huggingface.co/facebook/detr-resnet-50/resolve/main/config.json",
}
class DetrConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`DetrModel`]. It is used to instantiate a DETR
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the DETR
[facebook/detr-resnet-50](https://huggingface.co/facebook/detr-resnet-50) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Examples:
```
>>> from transformers import DetrConfig, DetrModel
>>> # Initializing a DETR facebook/detr-resnet-50 style configuration
>>> configuration = DetrConfig()
>>> # Initializing a model (with random weights) from the facebook/detr-resnet-50 style configuration
>>> model = DetrModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
model_type = "detr"
keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {
"hidden_size": "d_model",
"num_attention_heads": "encoder_attention_heads",
}
def __init__(
self,
use_timm_backbone=True,
backbone_config=None,
num_channels=3,
num_queries=100,
encoder_layers=6,
encoder_ffn_dim=2048,
encoder_attention_heads=8,
decoder_layers=6,
decoder_ffn_dim=2048,
decoder_attention_heads=8,
encoder_layerdrop=0.0,
decoder_layerdrop=0.0,
is_encoder_decoder=True,
activation_function="relu",
d_model=256,
dropout=0.1,
attention_dropout=0.0,
activation_dropout=0.0,
init_std=0.02,
init_xavier_std=1.0,
auxiliary_loss=False,
position_embedding_type="sine",
backbone="resnet50",
use_pretrained_backbone=True,
backbone_kwargs=None,
dilation=False,
class_cost=1,
bbox_cost=5,
giou_cost=2,
mask_loss_coefficient=1,
dice_loss_coefficient=1,
bbox_loss_coefficient=5,
giou_loss_coefficient=2,
eos_coefficient=0.1,
**kwargs,
):
pass
@property
def num_attention_heads(self) -> int:
return self.encoder_attention_heads
@property
def hidden_size(self) -> int:
return self.d_model
@classmethod
def from_backbone_config(cls, backbone_config: PretrainedConfig, **kwargs):
"""从预训练的backbone模型配置中实例化一个DetrConfig(或其派生类)对象。
Args:
backbone_config ([PretrainedConfig]):
预训练的backbone模型的配置对象。
Returns:
[DetrConfig]: DetrConfig对象的一个实例
"""
return cls(backbone_config=backbone_config, **kwargs)
class DetrOnnxConfig(OnnxConfig):
torch_onnx_minimum_version = version.parse("1.11")
@property
def inputs(self) -> Mapping[str, Mapping[int, str]]:
return OrderedDict(
[
("pixel_values", {0: "batch", 1: "num_channels", 2: "height", 3: "width"}),
("pixel_mask", {0: "batch"}),
]
)
@property
def atol_for_validation(self) -> float:
return 1e-5
@property
def default_onnx_opset(self) -> int:
return 12