Transformers Source Code Analysis (Part 129)
.\models\xmod\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
_import_structure = {
"configuration_xmod": [
"XMOD_PRETRAINED_CONFIG_ARCHIVE_MAP",
"XmodConfig",
"XmodOnnxConfig",
],
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_xmod"] = [
"XMOD_PRETRAINED_MODEL_ARCHIVE_LIST",
"XmodForCausalLM",
"XmodForMaskedLM",
"XmodForMultipleChoice",
"XmodForQuestionAnswering",
"XmodForSequenceClassification",
"XmodForTokenClassification",
"XmodModel",
"XmodPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_xmod import XMOD_PRETRAINED_CONFIG_ARCHIVE_MAP, XmodConfig, XmodOnnxConfig
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_xmod import (
XMOD_PRETRAINED_MODEL_ARCHIVE_LIST,
XmodForCausalLM,
XmodForMaskedLM,
XmodForMultipleChoice,
XmodForQuestionAnswering,
XmodForSequenceClassification,
XmodForTokenClassification,
XmodModel,
XmodPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
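The `_LazyModule` indirection keeps `import transformers` cheap: `modeling_xmod` is only imported the first time one of its attributes is accessed. A minimal usage sketch (assuming a working PyTorch install; the default `XmodConfig` builds a randomly initialized model, no download needed):

```python
# Accessing the names below triggers the lazy import of configuration_xmod /
# modeling_xmod; merely importing the transformers package does not.
from transformers import XmodConfig, XmodModel

config = XmodConfig()        # default X-MOD configuration
model = XmodModel(config)    # randomly initialized weights
print(type(model).__name__)  # XmodModel
```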
.\models\x_clip\configuration_x_clip.py
import os
from typing import Union
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
XCLIP_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"microsoft/xclip-base-patch32": "https://huggingface.co/microsoft/xclip-base-patch32/resolve/main/config.json",
}
class XCLIPTextConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`XCLIPModel`]. It is used to instantiate an X-CLIP
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the X-CLIP
[microsoft/xclip-base-patch32](https://huggingface.co/microsoft/xclip-base-patch32) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
"""
# Model type identifier for the X-CLIP text model
model_type = "xclip_text_model"
# Constructor setting the various parameters of the X-CLIP text model
def __init__(
self,
vocab_size=49408,  # vocabulary size; number of distinct tokens the model can represent
hidden_size=512,  # dimensionality of the encoder layers and the pooler layer
intermediate_size=2048,  # dimensionality of the "intermediate" (i.e. feed-forward) layer in the Transformer encoder
num_hidden_layers=12,  # number of hidden layers in the Transformer encoder
num_attention_heads=8,  # number of attention heads for each attention layer in the Transformer encoder
max_position_embeddings=77,  # maximum sequence length the model may be used with
hidden_act="quick_gelu",  # non-linear activation in encoder and pooler; "gelu", "relu", "selu" and "quick_gelu" are supported
layer_norm_eps=1e-5,  # epsilon used by the layer normalization layers
attention_dropout=0.0,  # dropout ratio for the attention probabilities
initializer_range=0.02,  # standard deviation of the truncated normal initializer for all weight matrices
initializer_factor=1.0,  # factor for initializing all weight matrices (keep at 1, used internally for initialization testing)
pad_token_id=1,  # id of the padding token
bos_token_id=0,  # id of the beginning-of-sequence token
eos_token_id=2,  # id of the end-of-sequence token
**kwargs,  # additional keyword arguments
):
# Call the parent constructor, passing the padding, beginning-of-sequence and end-of-sequence token ids
super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
# Store vocabulary size, hidden size, intermediate size, layer and head counts, maximum position embeddings,
# layer-norm epsilon, activation function, initializer range/factor and attention dropout on the instance
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.max_position_embeddings = max_position_embeddings
self.layer_norm_eps = layer_norm_eps
self.hidden_act = hidden_act
self.initializer_range = initializer_range
self.initializer_factor = initializer_factor
self.attention_dropout = attention_dropout
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
# Propagate token-related arguments into kwargs
cls._set_token_in_kwargs(kwargs)
# Fetch the config dict and updated kwargs from the pretrained model name or path
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
# If the loaded config is a full "xclip" config, use its nested text config
if config_dict.get("model_type") == "xclip":
config_dict = config_dict["text_config"]
# Warn if the config declares a model type different from this class's model_type
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
logger.warning(
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
)
# Build an instance of this class from the config dict and the remaining kwargs
return cls.from_dict(config_dict, **kwargs)
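Because `from_pretrained` special-cases `model_type == "xclip"`, the text configuration can be loaded straight from a full X-CLIP checkpoint; the nested `text_config` dict is extracted automatically. A small sketch (downloads only the config file from the Hub):

```python
from transformers import XCLIPTextConfig

text_config = XCLIPTextConfig.from_pretrained("microsoft/xclip-base-patch32")
print(text_config.hidden_size, text_config.max_position_embeddings)  # 512 77
```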
# XCLIPVisionConfig, a subclass of PretrainedConfig, stores the configuration of the X-CLIP vision model
class XCLIPVisionConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`XCLIPModel`]. It is used to instantiate an X-CLIP
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the X-CLIP
[microsoft/xclip-base-patch32](https://huggingface.co/microsoft/xclip-base-patch32) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
hidden_size (`int`, *optional*, defaults to 768):
Dimensionality of the encoder layers and the pooler layer.
intermediate_size (`int`, *optional*, defaults to 3072):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
num_hidden_layers (`int`, *optional*, defaults to 12):
Number of hidden layers in the Transformer encoder.
num_attention_heads (`int`, *optional*, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
mit_hidden_size (`int`, *optional*, defaults to 512):
Dimensionality of the encoder layers of the Multiframe Integration Transformer (MIT).
mit_intermediate_size (`int`, *optional*, defaults to 2048):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Multiframe Integration Transformer (MIT).
mit_num_hidden_layers (`int`, *optional*, defaults to 1):
Number of hidden layers in the Multiframe Integration Transformer (MIT).
mit_num_attention_heads (`int`, *optional*, defaults to 8):
Number of attention heads for each attention layer in the Multiframe Integration Transformer (MIT).
image_size (`int`, *optional*, defaults to 224):
The size (resolution) of each image.
patch_size (`int`, *optional*, defaults to 32):
The size (resolution) of each patch.
hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"` are supported.
layer_norm_eps (`float`, *optional*, defaults to 1e-5):
The epsilon used by the layer normalization layers.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
initializer_factor (`float`, *optional*, defaults to 1):
A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
testing).
drop_path_rate (`float`, *optional*, defaults to 0.0):
Stochastic depth rate.
Example:
```
>>> from transformers import XCLIPVisionModel, XCLIPVisionConfig
>>> # Initializing a XCLIPVisionConfig with the microsoft/xclip-base-patch32 style configuration
>>> configuration = XCLIPVisionConfig()
>>> # Initializing a XCLIPVisionModel (with random weights) from that configuration
>>> model = XCLIPVisionModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
# Model type identifier for the X-CLIP vision model
model_type = "xclip_vision_model"
# Constructor setting the various model parameters
def __init__(
self,
hidden_size=768,
intermediate_size=3072,
num_hidden_layers=12,
num_attention_heads=12,
mit_hidden_size=512,
mit_intermediate_size=2048,
mit_num_hidden_layers=1,
mit_num_attention_heads=8,
num_channels=3,
image_size=224,
patch_size=32,
num_frames=8,
hidden_act="quick_gelu",
layer_norm_eps=1e-5,
attention_dropout=0.0,
initializer_range=0.02,
initializer_factor=1.0,
drop_path_rate=0.0,
**kwargs,
):
# Call the parent constructor with the remaining keyword arguments
super().__init__(**kwargs)
# Store the model parameters on the instance
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.mit_hidden_size = mit_hidden_size
self.mit_intermediate_size = mit_intermediate_size
self.mit_num_hidden_layers = mit_num_hidden_layers
self.mit_num_attention_heads = mit_num_attention_heads
self.num_channels = num_channels
self.patch_size = patch_size
self.num_frames = num_frames
self.image_size = image_size
self.initializer_range = initializer_range
self.initializer_factor = initializer_factor
self.attention_dropout = attention_dropout
self.layer_norm_eps = layer_norm_eps
self.hidden_act = hidden_act
self.drop_path_rate = drop_path_rate
@classmethod
# Load the configuration from a pretrained model and return a PretrainedConfig instance
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
# Propagate token-related arguments into kwargs
cls._set_token_in_kwargs(kwargs)
# Fetch the config dict and updated kwargs
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
# If the loaded config is a full "xclip" config, use its nested vision config
if config_dict.get("model_type") == "xclip":
config_dict = config_dict["vision_config"]
# Warn if the config declares a model type different from this class's model_type
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
logger.warning(
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
)
# Build an instance of this class from the config dict and the remaining kwargs
return cls.from_dict(config_dict, **kwargs)
# XCLIPConfig, a subclass of PretrainedConfig, stores the configuration of the full X-CLIP model.
# It bundles everything needed to initialize X-CLIP, i.e. the text-model and vision-model configurations.
# Instantiating a configuration with the defaults yields a configuration similar to the microsoft/xclip-base-patch32 architecture.
# Configuration objects inherit from PretrainedConfig and can be used to control the model outputs; see the PretrainedConfig documentation for details.
class XCLIPConfig(PretrainedConfig):
r"""
[`XCLIPConfig`] is the configuration class to store the configuration of a [`XCLIPModel`]. It is used to
instantiate X-CLIP model according to the specified arguments, defining the text model and vision model configs.
Instantiating a configuration with the defaults will yield a similar configuration to that of the X-CLIP
[microsoft/xclip-base-patch32](https://huggingface.co/microsoft/xclip-base-patch32) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
text_config (`dict`, *optional*):
Dictionary of configuration options used to initialize [`XCLIPTextConfig`].
vision_config (`dict`, *optional*):
Dictionary of configuration options used to initialize [`XCLIPVisionConfig`].
projection_dim (`int`, *optional*, defaults to 512):
Dimensionality of text and vision projection layers.
prompt_layers (`int`, *optional*, defaults to 2):
Number of layers in the video specific prompt generator.
prompt_alpha (`float`, *optional*, defaults to 0.1):
Alpha value to use in the video specific prompt generator.
prompt_hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
The non-linear activation function (function or string) in the video specific prompt generator. If string,
`"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
prompt_num_attention_heads (`int`, *optional*, defaults to 8):
Number of attention heads in the cross-attention of the video specific prompt generator.
prompt_attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout probability for the attention layers in the video specific prompt generator.
prompt_projection_dropout (`float`, *optional*, defaults to 0.0):
The dropout probability for the projection layers in the video specific prompt generator.
logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
The initial value of the *logit_scale* parameter. Default is used as per the original XCLIP implementation.
kwargs (*optional*):
Dictionary of keyword arguments.
"""
model_type = "xclip"
def __init__(
self,
text_config=None,
vision_config=None,
projection_dim=512,
prompt_layers=2,
prompt_alpha=0.1,
prompt_hidden_act="quick_gelu",
prompt_num_attention_heads=8,
prompt_attention_dropout=0.0,
prompt_projection_dropout=0.0,
logit_scale_init_value=2.6592,
**kwargs,
):
...
# Instantiate an XCLIPConfig (or a derived class) from a text model configuration and a vision model configuration
@classmethod
def from_text_vision_configs(cls, text_config: XCLIPTextConfig, vision_config: XCLIPVisionConfig, **kwargs):
r"""
Instantiate a [`XCLIPConfig`] (or a derived class) from an X-CLIP text model configuration and an X-CLIP vision
model configuration.
Returns:
[`XCLIPConfig`]: An instance of a configuration object
"""
# Build the config from the dict representations of text_config and vision_config, forwarding extra keyword arguments
return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs)
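A short sketch of composing a full config from the two sub-configs, using defaults except for an overridden patch size:

```python
from transformers import XCLIPConfig, XCLIPTextConfig, XCLIPVisionConfig

text_config = XCLIPTextConfig()                   # defaults of the text tower
vision_config = XCLIPVisionConfig(patch_size=16)  # override a single field
config = XCLIPConfig.from_text_vision_configs(text_config, vision_config)
print(config.model_type, config.vision_config.patch_size)  # xclip 16
```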
.\models\x_clip\convert_x_clip_original_pytorch_to_hf.py
import argparse
import gdown
import numpy as np
import torch
from huggingface_hub import hf_hub_download
from transformers import (
CLIPTokenizer,
CLIPTokenizerFast,
VideoMAEImageProcessor,
XCLIPConfig,
XCLIPModel,
XCLIPProcessor,
XCLIPTextConfig,
XCLIPVisionConfig,
)
def get_xclip_config(model_name, num_frames):
text_config = XCLIPTextConfig()
start_idx = model_name.find("patch")
patch_size = int(model_name[start_idx + len("patch"): start_idx + len("patch") + 2])
vision_config = XCLIPVisionConfig(patch_size=patch_size, num_frames=num_frames)
if "large" in model_name:
text_config.hidden_size = 768
text_config.intermediate_size = 3072
text_config.num_attention_heads = 12
vision_config.hidden_size = 1024
vision_config.intermediate_size = 4096
vision_config.num_attention_heads = 16
vision_config.num_hidden_layers = 24
vision_config.mit_hidden_size = 768
vision_config.mit_intermediate_size = 3072
if model_name == "xclip-large-patch14-16-frames":
vision_config.image_size = 336
config = XCLIPConfig.from_text_vision_configs(text_config, vision_config)
if "large" in model_name:
config.projection_dim = 768
return config
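`get_xclip_config` recovers the patch size purely by slicing the two digits that follow "patch" in the checkpoint name; for example:

```python
model_name = "xclip-base-patch32-16-frames"
start_idx = model_name.find("patch")
patch_size = int(model_name[start_idx + len("patch"): start_idx + len("patch") + 2])
print(patch_size)  # 32
```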
def rename_key(name):
if name == "token_embedding.weight":
name = name.replace("token_embedding.weight", "text_model.embeddings.token_embedding.weight")
if name == "positional_embedding":
name = name.replace("positional_embedding", "text_model.embeddings.position_embedding.weight")
if "ln_1" in name:
name = name.replace("ln_1", "layer_norm1")
if "ln_2" in name:
name = name.replace("ln_2", "layer_norm2")
if "c_fc" in name:
name = name.replace("c_fc", "fc1")
if "c_proj" in name:
name = name.replace("c_proj", "fc2")
if name.startswith("transformer.resblocks"):
name = name.replace("transformer.resblocks", "text_model.encoder.layers")
if "attn.out_proj" in name and "message" not in name:
name = name.replace("attn.out_proj", "self_attn.out_proj")
if "ln_final" in name:
name = name.replace("ln_final", "text_model.final_layer_norm")
if name == "visual.class_embedding":
name = name.replace("visual.class_embedding", "vision_model.embeddings.class_embedding")
if name == "visual.positional_embedding":
name = name.replace("visual.positional_embedding", "vision_model.embeddings.position_embedding.weight")
if name.startswith("visual.transformer.resblocks"):
name = name.replace("visual.transformer.resblocks", "vision_model.encoder.layers")
if "visual.conv1" in name:
name = name.replace("visual.conv1", "vision_model.embeddings.patch_embedding")
if "visual.ln_pre" in name:
name = name.replace("visual.ln_pre", "vision_model.pre_layernorm")
if "visual.ln_post" in name:
name = name.replace("visual.ln_post", "vision_model.post_layernorm")
if "visual.proj" in name:
name = name.replace("visual.proj", "visual_projection.weight")
if "text_projection" in name:
name = name.replace("text_projection", "text_projection.weight")
if "prompts_visual_proj" in name:
name = name.replace("prompts_visual_proj", "prompts_visual_projection")
if "prompts_visual_ln" in name:
name = name.replace("prompts_visual_ln", "prompts_visual_layernorm")
if name == "mit.positional_embedding":
name = name.replace("positional", "position")
if name.startswith("mit.resblocks"):
name = name.replace("mit.resblocks", "mit.encoder.layers")
if name.startswith("prompts_generator.norm"):
name = name.replace("prompts_generator.norm", "prompts_generator.layernorm")
return name
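For instance, running `rename_key` on one key of the original checkpoint (assuming the function above is in scope) maps the OpenAI-CLIP-style name to the Hugging Face name:

```python
# "ln_1" -> "layer_norm1", then "visual.transformer.resblocks" -> "vision_model.encoder.layers"
print(rename_key("visual.transformer.resblocks.0.ln_1.weight"))
# vision_model.encoder.layers.0.layer_norm1.weight
```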
def convert_state_dict(orig_state_dict, config):
return orig_state_dict
def prepare_video(num_frames):
if num_frames == 8:
filename = "eating_spaghetti_8_frames.npy"
elif num_frames == 16:
filename = "eating_spaghetti.npy"
elif num_frames == 32:
filename = "eating_spaghetti_32_frames.npy"
file = hf_hub_download(
repo_id="hf-internal-testing/spaghetti-video",
filename=filename,
repo_type="dataset",
)
video = np.load(file)
return list(video)
def convert_xclip_checkpoint(model_name, pytorch_dump_folder_path=None, push_to_hub=False):
model_to_url = {
"xclip-base-patch32": "https://github.com/nbl97/X-CLIP_Model_Zoo/releases/download/v1.0/k400_32_8.pth",
"xclip-base-patch32-16-frames": (
"https://github.com/nbl97/X-CLIP_Model_Zoo/releases/download/v1.0/k400_32_16.pth"
),
"xclip-base-patch16": "https://github.com/nbl97/X-CLIP_Model_Zoo/releases/download/v1.0/k400_16_8.pth",
"xclip-base-patch16-16-frames": (
"https://github.com/nbl97/X-CLIP_Model_Zoo/releases/download/v1.0/k400_16_16.pth"
),
"xclip-large-patch14": "https://drive.google.com/u/0/uc?id=1NUOImq0o5DlQTST17iIP3vG7DgmHQuCx&export=download&confirm=t&uuid=b26caedc-88e2-473e-830a-9d158b653cdb",
"xclip-large-patch14-16-frames": "https://drive.google.com/u/0/uc?id=1FOYgnJc097OJ4lGwtRCCydQyVPJEOH7d&export=download&confirm=t&uuid=538fa810-e671-4050-b385-9a623f89804f",
"xclip-base-patch16-kinetics-600": (
"https://github.com/nbl97/X-CLIP_Model_Zoo/releases/download/v1.0/k600_16_8.pth"
),
"xclip-base-patch16-kinetics-600-16-frames": (
"https://github.com/nbl97/X-CLIP_Model_Zoo/releases/download/v1.0/k600_16_16.pth"
),
"xclip-large-patch14-kinetics-600": "https://drive.google.com/u/0/uc?id=1FV8C1INuM91sLAN4ImjzePLIlpMSihwV&export=download&confirm=t&uuid=141d4977-4a65-44ae-864f-4b0c19f838be",
"xclip-base-patch16-hmdb-2-shot": (
"https://github.com/nbl97/X-CLIP_Model_Zoo/releases/download/v1.0/few_hmdb_2.pth"
),
"xclip-base-patch16-hmdb-4-shot": (
"https://github.com/nbl97/X-CLIP_Model_Zoo/releases/download/v1.0/few_hmdb_4.pth"
),
"xclip-base-patch16-hmdb-8-shot": (
"https://github.com/nbl97/X-CLIP_Model_Zoo/releases/download/v1.0/few_hmdb_8.pth"
),
"xclip-base-patch16-hmdb-16-shot": (
"https://github.com/nbl97/X-CLIP_Model_Zoo/releases/download/v1.0/few_hmdb_16.pth"
),
"xclip-base-patch16-ucf-2-shot": (
"https://github.com/nbl97/X-CLIP_Model_Zoo/releases/download/v1.0/few_ucf_2.pth"
),
"xclip-base-patch16-ucf-4-shot": (
"https://github.com/nbl97/X-CLIP_Model_Zoo/releases/download/v1.0/few_ucf_4.pth"
),
"xclip-base-patch16-ucf-8-shot": (
"https://github.com/nbl97/X-CLIP_Model_Zoo/releases/download/v1.0/few_ucf_8.pth"
),
"xclip-base-patch16-ucf-16-shot": (
"https://github.com/nbl97/X-CLIP_Model_Zoo/releases/download/v1.0/few_ucf_16.pth"
),
"xclip-base-patch16-zero-shot": "https://github.com/nbl97/X-CLIP_Model_Zoo/releases/download/v1.0/zero.pth",
}
checkpoint_url = model_to_url[model_name]
num_frames = 8
if "16-frames" in model_name:
num_frames = 16
elif "shot" in model_name:
num_frames = 32
config = get_xclip_config(model_name, num_frames)
model = XCLIPModel(config)
model.eval()
if "drive" in checkpoint_url:
output = "pytorch_model.bin"
gdown.cached_download(checkpoint_url, output, quiet=False)
state_dict = torch.load(output, map_location="cpu")["model"]
else:
state_dict = torch.hub.load_state_dict_from_url(checkpoint_url)["model"]
state_dict = convert_state_dict(state_dict, config)
model = XCLIPModel(config)
missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)
assert missing_keys == ["text_model.embeddings.position_ids", "vision_model.embeddings.position_ids"]
model.eval()
size = 336 if model_name == "xclip-large-patch14-16-frames" else 224
image_processor = VideoMAEImageProcessor(size=size)
slow_tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")
fast_tokenizer = CLIPTokenizerFast.from_pretrained("openai/clip-vit-base-patch32")
processor = XCLIPProcessor(image_processor=image_processor, tokenizer=fast_tokenizer)
video = prepare_video(num_frames)
inputs = processor(
text=["playing sports", "eating spaghetti", "go shopping"], videos=video, return_tensors="pt", padding=True
)
print("Shape of pixel values:", inputs.pixel_values.shape)
with torch.no_grad():
outputs = model(**inputs)
logits_per_video = outputs.logits_per_video
probs = logits_per_video.softmax(dim=1)
print("Probs:", probs)
if model_name == "xclip-base-patch32":
expected_probs = torch.tensor([[0.0019, 0.9951, 0.0030]])
elif model_name == "xclip-base-patch32-16-frames":
expected_probs = torch.tensor([[7.0999e-04, 9.9883e-01, 4.5580e-04]])
elif model_name == "xclip-base-patch16":
expected_probs = torch.tensor([[0.0083, 0.9681, 0.0236]])
elif model_name == "xclip-base-patch16-16-frames":
expected_probs = torch.tensor([[7.6937e-04, 9.9728e-01, 1.9473e-03]])
elif model_name == "xclip-large-patch14":
expected_probs = torch.tensor([[0.0062, 0.9864, 0.0075]])
elif model_name == "xclip-large-patch14-16-frames":
expected_probs = torch.tensor([[3.3877e-04, 9.9937e-01, 2.8888e-04]])
elif model_name == "xclip-base-patch16-kinetics-600":
expected_probs = torch.tensor([[0.0555, 0.8914, 0.0531]])
elif model_name == "xclip-base-patch16-kinetics-600-16-frames":
expected_probs = torch.tensor([[3.8554e-04, 9.9929e-01, 3.2754e-04]])
elif model_name == "xclip-large-patch14-kinetics-600":
expected_probs = torch.tensor([[0.0036, 0.9920, 0.0045]])
elif model_name == "xclip-base-patch16-hmdb-2-shot":
expected_probs = torch.tensor([[7.1890e-06, 9.9994e-01, 5.6559e-05]])
elif model_name == "xclip-base-patch16-hmdb-4-shot":
expected_probs = torch.tensor([[1.0320e-05, 9.9993e-01, 6.2435e-05]])
elif model_name == "xclip-base-patch16-hmdb-8-shot":
expected_probs = torch.tensor([[4.1377e-06, 9.9990e-01, 9.8386e-05]])
elif model_name == "xclip-base-patch16-hmdb-16-shot":
expected_probs = torch.tensor([[4.1347e-05, 9.9962e-01, 3.3411e-04]])
elif model_name == "xclip-base-patch16-ucf-2-shot":
expected_probs = torch.tensor([[8.5857e-05, 9.9928e-01, 6.3291e-04]])
elif model_name == "xclip-base-patch16-ucf-4-shot":
expected_probs = torch.tensor([[8.5857e-05, 9.9928e-01, 6.3291e-04]])
elif model_name == "xclip-base-patch16-ucf-8-shot":
expected_probs = torch.tensor([[0.0027, 0.9904, 0.0070]])
elif model_name == "xclip-base-patch16-ucf-16-shot":
expected_probs = torch.tensor([[9.8219e-04, 9.9593e-01, 3.0863e-03]])
elif model_name == "xclip-base-patch16-zero-shot":
expected_probs = torch.tensor([[3.5082e-04, 9.9785e-01, 1.7966e-03]])
else:
raise ValueError(f"Model name {model_name} not supported")
assert torch.allclose(probs, expected_probs, atol=1e-3)
print("Looks ok!")
if pytorch_dump_folder_path is not None:
print(f"Saving model {model_name} to {pytorch_dump_folder_path}")
model.save_pretrained(pytorch_dump_folder_path)
if push_to_hub:
print("Pushing model, processor and slow tokenizer files to the hub...")
model.push_to_hub(model_name, organization="nielsr")
processor.push_to_hub(model_name, organization="nielsr")
slow_tokenizer.push_to_hub(model_name, organization="nielsr")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--model_name",
default="xclip-base-patch32",
type=str,
help="Name of the model.",
)
parser.add_argument(
"--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory."
)
parser.add_argument(
"--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
)
args = parser.parse_args()
convert_xclip_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub)
.\models\x_clip\modeling_x_clip.py
""" PyTorch X-CLIP model."""
from copy import copy
from dataclasses import dataclass
from typing import Any, Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from ...activations import ACT2FN
from ...modeling_attn_mask_utils import _create_4d_causal_attention_mask, _prepare_4d_attention_mask
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
from ...modeling_utils import PreTrainedModel
from ...utils import (
ModelOutput,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "microsoft/xclip-base-patch32"
XCLIP_PRETRAINED_MODEL_ARCHIVE_LIST = [
"microsoft/xclip-base-patch32",
]
def contrastive_loss(logits: torch.Tensor) -> torch.Tensor:
"""
Computes the contrastive loss given logits.
Args:
logits (torch.Tensor): The logits from the model.
Returns:
torch.Tensor: The computed contrastive loss.
"""
return nn.functional.cross_entropy(logits, torch.arange(len(logits), device=logits.device))
def x_clip_loss(similarity: torch.Tensor) -> torch.Tensor:
"""
Computes the X-CLIP loss given similarity scores.
Args:
similarity (torch.Tensor): The similarity scores between captions and images.
Returns:
torch.Tensor: The computed X-CLIP loss.
"""
caption_loss = contrastive_loss(similarity)
image_loss = contrastive_loss(similarity.t())
return (caption_loss + image_loss) / 2.0
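The loss is the symmetric cross-entropy over a square similarity matrix whose diagonal holds the matching video/text pairs. A toy sketch (assuming `x_clip_loss` above is in scope):

```python
import torch

# 3 videos vs. 3 texts; the diagonal already dominates, so the loss is small.
similarity = torch.tensor([[4.0, 0.1, 0.2],
                           [0.0, 3.5, 0.3],
                           [0.1, 0.2, 5.0]])
print(x_clip_loss(similarity))  # small positive scalar
```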
@dataclass
class XCLIPOutput(ModelOutput):
"""
Placeholder class for model outputs specific to X-CLIP.
"""
Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
Contrastive loss for video-text similarity.
logits_per_video (`torch.FloatTensor` of shape `(video_batch_size, text_batch_size)`):
The scaled dot product scores between `video_embeds` and `text_embeds`. This represents the video-text
similarity scores.
logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, video_batch_size)`):
The scaled dot product scores between `text_embeds` and `video_embeds`. This represents the text-video
similarity scores.
text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
The text embeddings obtained by applying the projection layer to the pooled output of [`XCLIPTextModel`].
video_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
The video embeddings obtained by applying the projection layer to the pooled output of
[`XCLIPVisionModel`].
text_model_output (`BaseModelOutputWithPooling`):
The output of the [`XCLIPTextModel`].
vision_model_output (`BaseModelOutputWithPooling`):
The output of the [`XCLIPVisionModel`].
mit_output (`BaseModelOutputWithPooling`):
The output of `XCLIPMultiframeIntegrationTransformer` (MIT for short).
"""
# Optional attributes initialized to None
loss: Optional[torch.FloatTensor] = None
logits_per_video: torch.FloatTensor = None
logits_per_text: torch.FloatTensor = None
text_embeds: torch.FloatTensor = None
video_embeds: torch.FloatTensor = None
text_model_output: BaseModelOutputWithPooling = None
vision_model_output: BaseModelOutputWithPooling = None
mit_output: BaseModelOutputWithPooling = None
# Method to convert instance to a tuple, handling special cases for complex types
def to_tuple(self) -> Tuple[Any]:
return tuple(
# Return self[k] for basic attributes, or convert and return tuple for complex attributes
self[k]
if k not in ["text_model_output", "vision_model_output", "mit_output"]
else getattr(self, k).to_tuple() # Convert complex attribute to tuple
for k in self.keys() # Iterate through all attribute keys
)
# Copied from transformers.models.clip.modeling_clip.CLIPVisionEmbeddings with CLIP->XCLIP
class XCLIPVisionEmbeddings(nn.Module):
def __init__(self, config: XCLIPVisionConfig):
super().__init__()
self.config = config
self.embed_dim = config.hidden_size
self.image_size = config.image_size
self.patch_size = config.patch_size
# Learnable class (CLS) embedding vector
self.class_embedding = nn.Parameter(torch.randn(self.embed_dim))
# 2D convolution that extracts patch features from the image
self.patch_embedding = nn.Conv2d(
in_channels=config.num_channels,
out_channels=self.embed_dim,
kernel_size=self.patch_size,
stride=self.patch_size,
bias=False,
)
# Number of patches in the image
self.num_patches = (self.image_size // self.patch_size) ** 2
self.num_positions = self.num_patches + 1
# Position embedding, one vector per position (patches + class token)
self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
# Buffer holding the position ids for every position in the sequence
self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)
def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
batch_size = pixel_values.shape[0]
target_dtype = self.patch_embedding.weight.dtype
# Extract patch features with the convolution
patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))  # shape = [*, width, grid, grid]
patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
# Expand the class embedding to every sample in the batch
class_embeds = self.class_embedding.expand(batch_size, 1, -1)
# Concatenate the class embedding and the patch features into the visual embedding
embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
# Add the position embeddings
embeddings = embeddings + self.position_embedding(self.position_ids)
return embeddings
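With the default vision config (224x224 images, 32x32 patches) each image yields 7x7 = 49 patches plus one class token, i.e. 50 positions. A quick shape check, as a sketch; `XCLIPVisionEmbeddings` is imported via the module's full path since it is not exported at the top level:

```python
import torch
from transformers import XCLIPVisionConfig
from transformers.models.x_clip.modeling_x_clip import XCLIPVisionEmbeddings

embeddings = XCLIPVisionEmbeddings(XCLIPVisionConfig())
pixel_values = torch.randn(2, 3, 224, 224)
print(embeddings(pixel_values).shape)  # torch.Size([2, 50, 768])
```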
# Copied from transformers.models.clip.modeling_clip.CLIPTextEmbeddings with CLIP->XCLIP
class XCLIPTextEmbeddings(nn.Module):
def __init__(self, config: XCLIPTextConfig):
super().__init__()
embed_dim = config.hidden_size
# Token (word) embedding layer
self.token_embedding = nn.Embedding(config.vocab_size, embed_dim)
# Position embedding layer
self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim)
# Buffer holding the position ids for every position in the sequence
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
) -> torch.Tensor:
seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2]
# Fall back to the pre-registered position ids when none are provided
if position_ids is None:
position_ids = self.position_ids[:, :seq_length]
# Look up the token embeddings from input_ids when no embeddings are provided
if inputs_embeds is None:
inputs_embeds = self.token_embedding(input_ids)
# Look up the position embeddings
position_embeddings = self.position_embedding(position_ids)
# The text embedding is the sum of token and position embeddings
embeddings = inputs_embeds + position_embeddings
return embeddings
# Copied from transformers.models.clip.modeling_clip.CLIPAttention with CLIP->XCLIP
# XCLIPAttention implements the multi-head attention mechanism from 'Attention Is All You Need'
class XCLIPAttention(nn.Module):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
def __init__(self, config):
super().__init__()
self.config = config
self.embed_dim = config.hidden_size  # embedding dimension (hidden size)
self.num_heads = config.num_attention_heads  # number of attention heads
self.head_dim = self.embed_dim // self.num_heads  # dimension of each attention head
if self.head_dim * self.num_heads != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
f" {self.num_heads})."
)
self.scale = self.head_dim**-0.5  # attention scaling factor
self.dropout = config.attention_dropout  # dropout probability applied to the attention weights
# Linear projections for query, key, value and output
self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)
# Reshape a tensor for multi-head attention computation
def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
# Forward pass performing multi-head attention
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
causal_attention_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = False,
# XCLIPMLP, copied from the CLIP model, implements the feed-forward (MLP) block
class XCLIPMLP(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
self.activation_fn = ACT2FN[config.hidden_act]  # activation function
self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)  # first fully connected layer
self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)  # second fully connected layer
# Forward pass through the two fully connected layers
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.fc1(hidden_states)  # first linear layer
hidden_states = self.activation_fn(hidden_states)  # apply the activation
hidden_states = self.fc2(hidden_states)  # second linear layer
return hidden_states
# XCLIPEncoderLayer, copied from the CLIP model, implements one Transformer encoder layer
class XCLIPEncoderLayer(nn.Module):
def __init__(self, config: XCLIPConfig):
super().__init__()
self.embed_dim = config.hidden_size  # embedding dimension (hidden size)
self.self_attn = XCLIPAttention(config)  # self-attention layer
self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)  # first layer norm
self.mlp = XCLIPMLP(config)  # feed-forward (MLP) block
self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)  # second layer norm
# Forward pass through the encoder layer
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: torch.Tensor,
causal_attention_mask: torch.Tensor,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.FloatTensor]:
"""
Args:
hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
attention_mask (`torch.FloatTensor`): attention mask of shape `(batch, 1, tgt_len, src_len)` where
padding elements are indicated by very large negative values
output_attentions (`bool`, *optional*): whether or not to return the attentions tensors of all attention
layers. See `attentions` under returned tensors for more detail.
"""
residual = hidden_states  # keep the input for the residual connection
hidden_states = self.layer_norm1(hidden_states)  # first layer norm
# Run self-attention; returns the new hidden states and, optionally, the attention weights
hidden_states, attn_weights = self.self_attn(
hidden_states=hidden_states,
attention_mask=attention_mask,
causal_attention_mask=causal_attention_mask,
output_attentions=output_attentions,
)
hidden_states = residual + hidden_states  # residual connection after attention
residual = hidden_states  # new residual base
hidden_states = self.layer_norm2(hidden_states)  # second layer norm
hidden_states = self.mlp(hidden_states)  # feed-forward block
hidden_states = residual + hidden_states  # residual connection after the MLP
outputs = (hidden_states,)
if output_attentions:
outputs += (attn_weights,)  # append the attention weights when requested
return outputs
# Copied from transformers.models.beit.modeling_beit.drop_path
def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
"""
Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
argument.
"""
if drop_prob == 0.0 or not training:
return input
keep_prob = 1 - drop_prob
shape = (input.shape[0],) + (1,) * (input.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
random_tensor.floor_() # binarize
output = input.div(keep_prob) * random_tensor
return output
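A quick sanity check of the behavior (assuming `drop_path` above is in scope): in eval mode the input passes through unchanged, while in training mode each sample is either zeroed or rescaled by `1 / keep_prob`, so the expectation is preserved:

```python
import torch

x = torch.ones(4, 3)
print(torch.equal(drop_path(x, drop_prob=0.5, training=False), x))  # True
out = drop_path(x, drop_prob=0.5, training=True)
print(out[:, 0])  # each entry is 0.0 or 2.0, decided per sample
```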
# Copied from transformers.models.beit.modeling_beit.BeitDropPath with Beit->XCLIP
class XCLIPDropPath(nn.Module):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
def __init__(self, drop_prob: Optional[float] = None) -> None:
super().__init__()
self.drop_prob = drop_prob
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
return drop_path(hidden_states, self.drop_prob, self.training)
def extra_repr(self) -> str:
return "p={}".format(self.drop_prob)
class XCLIPVisionEncoderLayer(nn.Module):
"""
This corresponds to the `CrossFramelAttentionBlock` class in the original implementation.
"""
def __init__(self, config: XCLIPConfig):
super().__init__()
self.num_frames = config.num_frames
self.embed_dim = config.hidden_size
# Message passing components
self.message_fc = nn.Linear(self.embed_dim, self.embed_dim) # Linear transformation for message passing
self.message_ln = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) # Layer normalization for messages
self.message_attn = XCLIPAttention(config) # Attention mechanism for message passing
# Drop path implementation
self.drop_path = XCLIPDropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity()
# Self-attention mechanism
self.self_attn = XCLIPAttention(config) # Self-attention mechanism
self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) # Layer normalization after self-attention
# MLP (Feedforward neural network)
self.mlp = XCLIPMLP(config) # Multilayer perceptron for feature transformation
self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) # Layer normalization after MLP
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: torch.Tensor,
causal_attention_mask: torch.Tensor,
output_attentions: Optional[bool] = False,
"""
Args:
hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
输入到层的隐藏状态张量,形状为 `(batch, seq_len, embed_dim)`
attention_mask (`torch.FloatTensor`): attention mask of size
`(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
`(config.encoder_attention_heads,)`.
注意力掩码张量,大小为 `(batch, 1, tgt_len, src_len)`,其中填充元素由非常大的负值表示。
causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Causal mask for the text model. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary
文本模型的因果关注掩码。掩码值选定在 `[0, 1]` 之间:
- 1 表示 **未屏蔽** 的标记,
- 0 表示 **屏蔽** 的标记。
[什么是注意力掩码?](../glossary
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
是否返回所有注意力层的注意力张量。详细信息请参见返回的张量下的 `attentions`。
"""
# Dimensions of the input tensor
batch_time, seq_length, hidden_size = hidden_states.size()
# Recover the batch size from the combined batch*frames dimension
batch_size = batch_time // self.num_frames
# Build the message token from the CLS (first) token of every frame
msg_token = self.message_fc(hidden_states[:, 0, :])
# Reshape so that frames of the same video sit along one dimension
msg_token = msg_token.view(batch_size, self.num_frames, hidden_size)
# Let the message tokens attend to each other across frames, with drop-path regularization
msg_token = msg_token + self.drop_path(self.message_attn(self.message_ln(msg_token))[0])
# Add a dummy sequence dimension
msg_token = msg_token.view(-1, 1, hidden_size)
# Append the message token to the original sequence
hidden_states = torch.cat([hidden_states, msg_token], dim=1)
# Keep the residual
residual = hidden_states
# Layer norm before self-attention
hidden_states = self.layer_norm1(hidden_states)
# Self-attention, optionally returning attention weights
hidden_states, attn_weights = self.self_attn(
hidden_states=hidden_states,
attention_mask=attention_mask,
causal_attention_mask=causal_attention_mask,
output_attentions=output_attentions,
)
# Residual connection after attention
hidden_states = residual + hidden_states
# Drop the appended message token, restoring the original sequence length
hidden_states = hidden_states[:, :seq_length, :]
# Keep the residual
residual = hidden_states
# Layer norm before the MLP
hidden_states = self.layer_norm2(hidden_states)
# Feed-forward block
hidden_states = self.mlp(hidden_states)
# Residual connection after the MLP
hidden_states = residual + hidden_states
# Output tuple containing the processed hidden states
outputs = (hidden_states,)
# Append the attention weights when requested
if output_attentions:
outputs += (attn_weights,)
return outputs
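A shape-only sketch of the message-token bookkeeping above (attention and MLP omitted), with hypothetical sizes batch_size=2, num_frames=8, 50 tokens per frame and hidden size 768:

```python
import torch

batch_size, num_frames, seq_length, hidden_size = 2, 8, 50, 768
hidden_states = torch.randn(batch_size * num_frames, seq_length, hidden_size)

# The CLS token of every frame becomes a "message" token, regrouped per video ...
msg_token = hidden_states[:, 0, :].view(batch_size, num_frames, hidden_size)
# ... exchanged across frames (attention omitted here), then appended as one extra token per frame.
msg_token = msg_token.view(-1, 1, hidden_size)
extended = torch.cat([hidden_states, msg_token], dim=1)
print(extended.shape)                     # torch.Size([16, 51, 768])
print(extended[:, :seq_length, :].shape)  # sliced back to torch.Size([16, 50, 768])
```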
# XCLIPPreTrainedModel handles weight initialization and provides a simple interface for downloading and loading pretrained models
class XCLIPPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
# Configuration class used by this model
config_class = XCLIPConfig
# Prefix identifying the base model inside the state dict
base_model_prefix = "x_clip"
# The model supports gradient checkpointing during training
supports_gradient_checkpointing = True
def _init_weights(self, module):
"""Initialize the weights"""
# Initialization factor from the config
factor = self.config.initializer_factor
if isinstance(module, XCLIPTextEmbeddings):
# Initialize the token and position embedding weights
module.token_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02)
module.position_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02)
elif isinstance(module, XCLIPVisionEmbeddings):
factor = self.config.initializer_factor
# Initialize the class, patch and position embeddings
nn.init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor)
nn.init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor)
nn.init.normal_(module.position_embedding.weight, std=module.config.initializer_range * factor)
elif isinstance(module, XCLIPAttention):
factor = self.config.initializer_factor
# Standard deviations for the input and output projections
in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
out_proj_std = (module.embed_dim**-0.5) * factor
# Initialize the query, key, value and output projections
nn.init.normal_(module.q_proj.weight, std=in_proj_std)
nn.init.normal_(module.k_proj.weight, std=in_proj_std)
nn.init.normal_(module.v_proj.weight, std=in_proj_std)
nn.init.normal_(module.out_proj.weight, std=out_proj_std)
elif isinstance(module, XCLIPMLP):
factor = self.config.initializer_factor
# Standard deviations for the projection and fully connected layers
in_proj_std = (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
fc_std = (2 * module.config.hidden_size) ** -0.5 * factor
# Initialize fc1 and fc2
nn.init.normal_(module.fc1.weight, std=fc_std)
nn.init.normal_(module.fc2.weight, std=in_proj_std)
elif isinstance(module, XCLIPModel):
factor = self.config.initializer_factor
# Initialize the text projection
nn.init.normal_(
module.text_projection.weight,
std=module.text_embed_dim**-0.5 * factor,
)
# Initialize the visual projection
nn.init.normal_(
module.visual_projection.weight,
std=module.vision_embed_dim**-0.5 * factor,
)
# Initialize the visual prompts projection
nn.init.normal_(module.prompts_visual_projection, mean=0.0, std=module.vision_embed_dim**-0.5 * factor)
elif isinstance(module, XCLIPMultiframeIntegrationTransformer):
# Initialize the frame position embedding
nn.init.normal_(module.position_embedding, std=self.config.initializer_factor)
if isinstance(module, nn.LayerNorm):
# Zero the bias and set the weight to 1.0
module.bias.data.zero_()
module.weight.data.fill_(1.0)
if isinstance(module, nn.Linear):
# Initialize the weight and zero the bias (if any)
module.weight.data.normal_(mean=0.0, std=self.config.initializer_factor)
if module.bias is not None:
module.bias.data.zero_()
# X_CLIP_START_DOCSTRING is the shared raw docstring describing the model class
X_CLIP_START_DOCSTRING = r"""
This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
Parameters:
config ([`XCLIPConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
# X_CLIP_TEXT_INPUTS_DOCSTRING documents the text inputs
X_CLIP_TEXT_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
config.max_position_embeddings - 1]`.
[What are position IDs?](../glossary#position-ids)
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
# X_CLIP_VISION_INPUTS_DOCSTRING documents the vision inputs
X_CLIP_VISION_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
[`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
X_CLIP_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
config.max_position_embeddings - 1]`.
[What are position IDs?](../glossary#position-ids)
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
[`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
return_loss (`bool`, *optional*):
Whether or not to return the contrastive loss.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
class XCLIPEncoder(nn.Module):
"""
Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
[`XCLIPEncoderLayer`].
Args:
config: XCLIPConfig
"""
def __init__(self, config: XCLIPConfig):
super().__init__()
self.config = config
self.layers = nn.ModuleList([XCLIPEncoderLayer(config) for _ in range(config.num_hidden_layers)])
self.gradient_checkpointing = False
def forward(
self,
inputs_embeds,
attention_mask: Optional[torch.Tensor] = None,
causal_attention_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
class XCLIPTextTransformer(nn.Module):
def __init__(self, config: XCLIPTextConfig):
super().__init__()
self.config = config
embed_dim = config.hidden_size
self.embeddings = XCLIPTextEmbeddings(config)
self.encoder = XCLIPEncoder(config)
self.final_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
@add_start_docstrings_to_model_forward(X_CLIP_TEXT_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=XCLIPTextConfig)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPooling]:
r"""
Returns:
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if input_ids is None:
raise ValueError("You have to specify input_ids")
input_shape = input_ids.size()
input_ids = input_ids.view(-1, input_shape[-1])
hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids)
causal_attention_mask = _create_4d_causal_attention_mask(
input_shape, hidden_states.dtype, device=hidden_states.device
)
if attention_mask is not None:
attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_states.dtype)
encoder_outputs = self.encoder(
inputs_embeds=hidden_states,
attention_mask=attention_mask,
causal_attention_mask=causal_attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
last_hidden_state = encoder_outputs[0]
last_hidden_state = self.final_layer_norm(last_hidden_state)
pooled_output = last_hidden_state[torch.arange(last_hidden_state.shape[0]), input_ids.argmax(dim=-1)]
if not return_dict:
return (last_hidden_state, pooled_output) + encoder_outputs[1:]
return BaseModelOutputWithPooling(
last_hidden_state=last_hidden_state,
pooler_output=pooled_output,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
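The pooled output picks, for every sequence, the hidden state at `input_ids.argmax(dim=-1)`: in the CLIP vocabulary the end-of-text token has the highest id (49407), so the argmax locates the EOS position. A small sketch with dummy values (the trailing 0 ids stand in for padding, purely for illustration):

```python
import torch

input_ids = torch.tensor([[49406, 320, 1125, 539, 320, 2368, 49407, 0, 0]])
last_hidden_state = torch.randn(1, 9, 512)
eos_positions = input_ids.argmax(dim=-1)                   # tensor([6])
pooled = last_hidden_state[torch.arange(1), eos_positions]
print(pooled.shape)                                        # torch.Size([1, 512])
```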
class XCLIPTextModel(XCLIPPreTrainedModel):
config_class = XCLIPTextConfig
def __init__(self, config: XCLIPTextConfig):
super().__init__(config)
self.text_model = XCLIPTextTransformer(config)
self.post_init()
def get_input_embeddings(self) -> nn.Module:
return self.text_model.embeddings.token_embedding
def set_input_embeddings(self, value):
self.text_model.embeddings.token_embedding = value
@add_start_docstrings_to_model_forward(X_CLIP_TEXT_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=XCLIPTextConfig)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPooling]:
r"""
Returns:
Examples:
```
>>> from transformers import AutoTokenizer, XCLIPTextModel
>>> model = XCLIPTextModel.from_pretrained("microsoft/xclip-base-patch32")
>>> tokenizer = AutoTokenizer.from_pretrained("microsoft/xclip-base-patch32")
>>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
>>> outputs = model(**inputs)
>>> last_hidden_state = outputs.last_hidden_state
>>> pooled_output = outputs.pooler_output # pooled (EOS token) states
```
Run the text model on the given inputs and return its outputs.
"""
return self.text_model(
input_ids=input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
class XCLIPVisionEncoder(nn.Module):
"""
Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
[`XCLIPVisionEncoderLayer`].
Args:
config: XCLIPConfig
"""
def __init__(self, config: XCLIPConfig):
super().__init__()
self.config = config
self.layers = nn.ModuleList([XCLIPVisionEncoderLayer(config) for _ in range(config.num_hidden_layers)])
self.gradient_checkpointing = False
def forward(
self,
inputs_embeds,
attention_mask: Optional[torch.Tensor] = None,
causal_attention_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
pass
class XCLIPVisionTransformer(nn.Module):
"""
This corresponds to the `CrossFrameCommunicationTransformer` class in the original implementation.
"""
def __init__(self, config: XCLIPVisionConfig):
super().__init__()
self.config = config
embed_dim = config.hidden_size
self.embeddings = XCLIPVisionEmbeddings(config)
self.pre_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
self.encoder = XCLIPVisionEncoder(config)
self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
@add_start_docstrings_to_model_forward(X_CLIP_VISION_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=XCLIPVisionConfig)
def forward(
self,
pixel_values: torch.FloatTensor,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPooling]:
r"""
Returns:
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
hidden_states = self.embeddings(pixel_values)
hidden_states = self.pre_layernorm(hidden_states)
encoder_outputs = self.encoder(
inputs_embeds=hidden_states,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
last_hidden_state = encoder_outputs[0]
pooled_output = last_hidden_state[:, 0, :]
pooled_output = self.post_layernorm(pooled_output)
if not return_dict:
return (last_hidden_state, pooled_output) + encoder_outputs[1:]
return BaseModelOutputWithPooling(
last_hidden_state=last_hidden_state,
pooler_output=pooled_output,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
class XCLIPVisionModel(XCLIPPreTrainedModel):
config_class = XCLIPVisionConfig
main_input_name = "pixel_values"
def __init__(self, config: XCLIPVisionConfig):
super().__init__(config)
self.vision_model = XCLIPVisionTransformer(config)
self.post_init()
def get_input_embeddings(self) -> nn.Module:
return self.vision_model.embeddings.patch_embedding
@add_start_docstrings_to_model_forward(X_CLIP_VISION_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=XCLIPVisionConfig)
def forward(
self,
pixel_values: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPooling]:
...
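A usage sketch of the vision tower with random weights: the vision model treats every frame as an independent image, so pixel values are passed with shape `(batch_size * num_frames, 3, H, W)` (the cross-frame message tokens inside each layer require the first dimension to be a multiple of `num_frames`):

```python
import torch
from transformers import XCLIPVisionConfig, XCLIPVisionModel

config = XCLIPVisionConfig(num_frames=8)
model = XCLIPVisionModel(config).eval()
pixel_values = torch.randn(2 * config.num_frames, 3, config.image_size, config.image_size)
with torch.no_grad():
    outputs = model(pixel_values)
print(outputs.last_hidden_state.shape)  # torch.Size([16, 50, 768])
print(outputs.pooler_output.shape)      # torch.Size([16, 768])
```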
class XCLIPMultiframeIntegrationTransformer(nn.Module):
"""
This corresponds to the `MultiframeIntegrationTransformer` class in the original implementation.
"""
def __init__(self, config: XCLIPVisionConfig):
super().__init__()
self.position_embedding = nn.Parameter(torch.empty(1, config.num_frames, config.hidden_size))
self.encoder = XCLIPEncoder(config)
def forward(
self,
hidden_states,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutput]:
residual = hidden_states
hidden_states = hidden_states + self.position_embedding
encoder_outputs = self.encoder(
inputs_embeds=hidden_states,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
last_hidden_state = encoder_outputs[0]
last_hidden_state = last_hidden_state.type(hidden_states.dtype) + residual
pooled_output = last_hidden_state.mean(dim=1, keepdim=False)
if not return_dict:
return (last_hidden_state, pooled_output) + encoder_outputs[1:]
return BaseModelOutputWithPooling(
last_hidden_state=last_hidden_state,
pooler_output=pooled_output,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
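To make the multi-frame integration step concrete, here is a minimal sketch that feeds random per-frame CLS features through the module. It is illustrative only: it assumes `torch` and a `transformers` version with X-CLIP support are installed, uses the private module path `transformers.models.x_clip.modeling_x_clip` as an implementation detail, and picks small made-up hyperparameters.
```
import torch
from transformers import XCLIPVisionConfig
from transformers.models.x_clip.modeling_x_clip import XCLIPMultiframeIntegrationTransformer

# Small, made-up hyperparameters purely for illustration.
config = XCLIPVisionConfig(hidden_size=64, intermediate_size=128, num_hidden_layers=1, num_attention_heads=4, num_frames=8)
mit = XCLIPMultiframeIntegrationTransformer(config)
# The temporal position embedding is allocated with torch.empty, so initialize it by hand
# when using the module outside of XCLIPModel (XCLIPPreTrainedModel normally does this).
mit.position_embedding.data.normal_(mean=0.0, std=0.02)

frame_features = torch.randn(2, config.num_frames, config.hidden_size)  # (batch, num_frames, hidden)
last_hidden_state, pooled_output = mit(frame_features, return_dict=False)
print(last_hidden_state.shape, pooled_output.shape)  # torch.Size([2, 8, 64]) torch.Size([2, 64])
```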
class XCLIPCrossAttention(nn.Module):
"""来自'Attention Is All You Need'论文的多头注意力"""
def __init__(self, config):
super().__init__()
self.num_heads = config.prompt_num_attention_heads
dim = config.projection_dim
self.head_dim = dim // self.num_heads
self.scale = self.head_dim**-0.5
self.q_proj = nn.Linear(dim, dim, bias=False)
self.k_proj = nn.Linear(dim, dim, bias=False)
self.v_proj = nn.Linear(dim, dim, bias=False)
self.attn_drop = nn.Dropout(config.prompt_attention_dropout)
self.proj = nn.Linear(dim, dim)
self.proj_drop = nn.Dropout(config.prompt_projection_dropout)
def _shape(self, tensor: torch.Tensor, seq_len: int, batch_size: int):
"""调整张量形状以便注意力计算"""
return tensor.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
def forward(self, queries, keys, values):
"""模型的前向传播方法"""
batch_size, query_seq_len, hidden_size = queries.shape
batch_size, key_seq_len, hidden_size = keys.shape
queries = (
self.q_proj(queries)
.reshape(batch_size, query_seq_len, self.num_heads, hidden_size // self.num_heads)
.permute(0, 2, 1, 3)
)
keys = (
self.k_proj(keys)
.reshape(batch_size, key_seq_len, self.num_heads, hidden_size // self.num_heads)
.permute(0, 2, 1, 3)
)
values = (
self.v_proj(values)
.reshape(batch_size, key_seq_len, self.num_heads, hidden_size // self.num_heads)
.permute(0, 2, 1, 3)
)
attn = (queries @ keys.transpose(-2, -1)) * self.scale
attn = attn.softmax(dim=-1)
attn = self.attn_drop(attn)
x = (attn @ values).transpose(1, 2).reshape(batch_size, query_seq_len, hidden_size)
x = self.proj(x)
x = self.proj_drop(x)
return x
class PromptGeneratorLayer(nn.Module):
def __init__(self, config):
super().__init__()
embed_dim = config.projection_dim
self.cross_attn = XCLIPCrossAttention(config)
self.norm1 = nn.LayerNorm(embed_dim, eps=config.text_config.layer_norm_eps)
self.norm3 = nn.LayerNorm(embed_dim, eps=config.text_config.layer_norm_eps)
self.mlp = nn.Sequential(
nn.Linear(embed_dim, embed_dim * 4),
ACT2FN[config.prompt_hidden_act],
nn.Dropout(config.prompt_attention_dropout),
nn.Linear(embed_dim * 4, embed_dim),
)
def forward(self, x, visual):
x = x + self.cross_attn(self.norm1(x), visual, visual)
x = x + self.mlp(self.norm3(x))
return x
class XCLIPPromptGenerator(nn.Module):
"""This corresponds to the `VideoSpecificPrompt` class in the original implementation."""
def __init__(self, config):
super().__init__()
embed_dim = config.projection_dim
self.layernorm = nn.LayerNorm(embed_dim, eps=config.vision_config.layer_norm_eps)
self.decoder = nn.ModuleList([PromptGeneratorLayer(config) for _ in range(config.prompt_layers)])
self.alpha = nn.Parameter(torch.ones(embed_dim) * config.prompt_alpha)
def forward(self, text, visual):
visual = self.layernorm(visual)
for layer in self.decoder:
text = layer(text, visual)
return self.alpha * text
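As a rough illustration of the shapes involved, the sketch below runs randomly initialized text and frame features through the prompt generator. It assumes a `transformers` version with X-CLIP support and imports from the private module path as an implementation detail; the hyperparameters are made up.
```
import torch
from transformers import XCLIPConfig
from transformers.models.x_clip.modeling_x_clip import XCLIPPromptGenerator

config = XCLIPConfig(projection_dim=64, prompt_layers=1, prompt_num_attention_heads=4)
prompt_generator = XCLIPPromptGenerator(config)

text_embeds = torch.randn(2, 5, config.projection_dim)   # (batch, num_text_prompts, projection_dim)
frame_embeds = torch.randn(2, 8, config.projection_dim)  # (batch, num_frames, projection_dim)
video_specific_prompts = prompt_generator(text_embeds, frame_embeds)
print(video_specific_prompts.shape)  # torch.Size([2, 5, 64])
```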
@add_start_docstrings(X_CLIP_START_DOCSTRING)
class XCLIPModel(XCLIPPreTrainedModel):
config_class = XCLIPConfig
def __init__(self, config: XCLIPConfig):
super().__init__(config)
if not isinstance(config.text_config, XCLIPTextConfig):
raise ValueError(
"config.text_config is expected to be of type XCLIPTextConfig but is of type"
f" {type(config.text_config)}."
)
if not isinstance(config.vision_config, XCLIPVisionConfig):
raise ValueError(
"config.vision_config is expected to be of type XCLIPVisionConfig but is of type"
f" {type(config.vision_config)}."
)
text_config = config.text_config
vision_config = config.vision_config
self.projection_dim = config.projection_dim
self.text_embed_dim = text_config.hidden_size
self.vision_embed_dim = vision_config.hidden_size
self.text_model = XCLIPTextTransformer(text_config)
self.vision_model = XCLIPVisionTransformer(vision_config)
self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)
self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))
self.prompts_visual_layernorm = nn.LayerNorm(self.vision_embed_dim, eps=config.vision_config.layer_norm_eps)
self.prompts_visual_projection = nn.Parameter(torch.randn(self.vision_embed_dim, self.projection_dim))
mit_config = copy(vision_config)
mit_config.hidden_size = vision_config.mit_hidden_size
mit_config.intermediate_size = vision_config.mit_intermediate_size
mit_config.num_hidden_layers = vision_config.mit_num_hidden_layers
mit_config.num_attention_heads = vision_config.mit_num_attention_heads
self.mit = XCLIPMultiframeIntegrationTransformer(mit_config)
self.prompts_generator = XCLIPPromptGenerator(config)
self.post_init()
@add_start_docstrings_to_model_forward(X_CLIP_TEXT_INPUTS_DOCSTRING)
def get_text_features(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> torch.FloatTensor:
r"""
Returns:
text_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The text embeddings obtained by
applying the projection layer to the pooled output of [`XCLIPTextModel`].
Examples:
```
>>> from transformers import AutoTokenizer, AutoModel
>>> tokenizer = AutoTokenizer.from_pretrained("microsoft/xclip-base-patch32")
>>> model = AutoModel.from_pretrained("microsoft/xclip-base-patch32")
>>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
>>> text_features = model.get_text_features(**inputs)
```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
text_outputs = self.text_model(
input_ids=input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
text_embeds = text_outputs[1]
text_embeds = self.text_projection(text_embeds)
return text_embeds
@add_start_docstrings_to_model_forward(X_CLIP_VISION_INPUTS_DOCSTRING)
def get_video_features(
self,
pixel_values: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> torch.FloatTensor:
r"""
Returns:
video_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The video embeddings obtained by
applying the projection layer to the pooled output of [`XCLIPVisionModel`].
Examples:
```
>>> import torch
>>> from transformers import AutoModel
>>> model = AutoModel.from_pretrained("microsoft/xclip-base-patch32")
>>> # Dummy clip of 8 frames: (batch_size, num_frames, num_channels, height, width)
>>> video_inputs = torch.randn(1, 8, 3, 224, 224)
>>> video_features = model.get_video_features(pixel_values=video_inputs)
```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
video_outputs = self.vision_model(
pixel_values=pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
video_embeds = video_outputs[1]
video_embeds = self.visual_projection(video_embeds)
return video_embeds
@add_start_docstrings_to_model_forward(X_CLIP_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=XCLIPOutput, config_class=XCLIPConfig)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
pixel_values: Optional[torch.FloatTensor] = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
return_loss: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[XCLIPOutput, Tuple[torch.FloatTensor]]:
r"""
Returns:
[`XCLIPOutput`] or `tuple(torch.FloatTensor)`: an [`XCLIPOutput`] when `return_dict=True` (or when
`config.use_return_dict=True`), otherwise a tuple of `torch.FloatTensor` elements.
"""
.\models\x_clip\processing_x_clip.py
"""
Image/Text processor class for XCLIP
"""
import warnings
from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import BatchEncoding
class XCLIPProcessor(ProcessorMixin):
r"""
Constructs an X-CLIP processor which wraps a VideoMAE image processor and a CLIP tokenizer into a single processor.
[`XCLIPProcessor`] offers all the functionalities of [`VideoMAEImageProcessor`] and [`CLIPTokenizerFast`]. See the
[`~XCLIPProcessor.__call__`] and [`~XCLIPProcessor.decode`] for more information.
Args:
image_processor ([`VideoMAEImageProcessor`], *optional*):
The image processor is a required input.
tokenizer ([`CLIPTokenizerFast`], *optional*):
The tokenizer is a required input.
"""
attributes = ["image_processor", "tokenizer"]
image_processor_class = "VideoMAEImageProcessor"
tokenizer_class = ("CLIPTokenizer", "CLIPTokenizerFast")
def __init__(self, image_processor=None, tokenizer=None, **kwargs):
feature_extractor = None
if "feature_extractor" in kwargs:
warnings.warn(
"The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
" instead.",
FutureWarning,
)
feature_extractor = kwargs.pop("feature_extractor")
image_processor = image_processor if image_processor is not None else feature_extractor
if image_processor is None:
raise ValueError("You need to specify an `image_processor`.")
if tokenizer is None:
raise ValueError("You need to specify a `tokenizer`.")
super().__init__(image_processor, tokenizer)
self.current_processor = self.image_processor
def batch_decode(self, *args, **kwargs):
"""
This method forwards all its arguments to CLIPTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
refer to the docstring of this method for more information.
"""
return self.tokenizer.batch_decode(*args, **kwargs)
def decode(self, *args, **kwargs):
"""
This method forwards all its arguments to CLIPTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
the docstring of this method for more information.
"""
return self.tokenizer.decode(*args, **kwargs)
@property
def model_input_names(self):
return ["input_ids", "attention_mask", "position_ids", "pixel_values"]
@property
def feature_extractor_class(self):
warnings.warn(
"`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.",
FutureWarning,
)
return self.image_processor_class
@property
def feature_extractor(self):
warnings.warn(
"`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.",
FutureWarning,
)
return self.image_processor
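A minimal usage sketch of the processor (it assumes network access to fetch the `microsoft/xclip-base-patch32` processor files and uses random frames in place of a decoded video clip):
```
import numpy as np
from transformers import XCLIPProcessor

processor = XCLIPProcessor.from_pretrained("microsoft/xclip-base-patch32")

# Eight dummy RGB frames standing in for one decoded 8-frame video clip.
video = [np.random.randint(0, 256, (224, 224, 3), dtype=np.uint8) for _ in range(8)]
inputs = processor(text=["playing guitar", "dancing"], videos=video, return_tensors="pt", padding=True)
print({name: tuple(tensor.shape) for name, tensor in inputs.items()})
# e.g. input_ids / attention_mask of shape (2, sequence_length) and pixel_values of shape (1, 8, 3, 224, 224)
```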
.\models\x_clip\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
_import_structure = {
"configuration_x_clip": [
"XCLIP_PRETRAINED_CONFIG_ARCHIVE_MAP",
"XCLIPConfig",
"XCLIPTextConfig",
"XCLIPVisionConfig",
],
"processing_x_clip": ["XCLIPProcessor"],
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_x_clip"] = [
"XCLIP_PRETRAINED_MODEL_ARCHIVE_LIST",
"XCLIPModel",
"XCLIPPreTrainedModel",
"XCLIPTextModel",
"XCLIPVisionModel",
]
if TYPE_CHECKING:
from .configuration_x_clip import (
XCLIP_PRETRAINED_CONFIG_ARCHIVE_MAP,
XCLIPConfig,
XCLIPTextConfig,
XCLIPVisionConfig,
)
from .processing_x_clip import XCLIPProcessor
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_x_clip import (
XCLIP_PRETRAINED_MODEL_ARCHIVE_LIST,
XCLIPModel,
XCLIPPreTrainedModel,
XCLIPTextModel,
XCLIPVisionModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\yolos\configuration_yolos.py
""" YOLOS 模型配置"""
from collections import OrderedDict
from typing import Mapping
from packaging import version
from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfig
from ...utils import logging
logger = logging.get_logger(__name__)
YOLOS_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"hustvl/yolos-small": "https://huggingface.co/hustvl/yolos-small/resolve/main/config.json",
}
class YolosConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`YolosModel`]. It is used to instantiate a YOLOS
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the YOLOS
[hustvl/yolos-base](https://huggingface.co/hustvl/yolos-base) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Example:
```
>>> from transformers import YolosConfig, YolosModel
>>> # Initializing a YOLOS hustvl/yolos-base style configuration
>>> configuration = YolosConfig()
>>> # Initializing a model (with random weights) from the hustvl/yolos-base style configuration
>>> model = YolosModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
model_type = "yolos"
def __init__(
self,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act="gelu",
hidden_dropout_prob=0.0,
attention_probs_dropout_prob=0.0,
initializer_range=0.02,
layer_norm_eps=1e-12,
image_size=[512, 864],
patch_size=16,
num_channels=3,
qkv_bias=True,
num_detection_tokens=100,
use_mid_position_embeddings=True,
auxiliary_loss=False,
class_cost=1,
bbox_cost=5,
giou_cost=2,
bbox_loss_coefficient=5,
giou_loss_coefficient=2,
eos_coefficient=0.1,
**kwargs,
):
super().__init__(**kwargs)
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.hidden_act = hidden_act
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.image_size = image_size
self.patch_size = patch_size
self.num_channels = num_channels
self.qkv_bias = qkv_bias
self.num_detection_tokens = num_detection_tokens
self.use_mid_position_embeddings = use_mid_position_embeddings
self.auxiliary_loss = auxiliary_loss
self.class_cost = class_cost
self.bbox_cost = bbox_cost
self.giou_cost = giou_cost
self.bbox_loss_coefficient = bbox_loss_coefficient
self.giou_loss_coefficient = giou_loss_coefficient
self.eos_coefficient = eos_coefficient
class YolosOnnxConfig(OnnxConfig):
torch_onnx_minimum_version = version.parse("1.11")
@property
def inputs(self) -> Mapping[str, Mapping[int, str]]:
return OrderedDict(
[
("pixel_values", {0: "batch", 1: "num_channels", 2: "height", 3: "width"}),
]
)
@property
def atol_for_validation(self) -> float:
return 1e-4
@property
def default_onnx_opset(self) -> int:
return 12
.\models\yolos\convert_yolos_to_pytorch.py
"""Convert YOLOS checkpoints from the original repository. URL: https://github.com/hustvl/YOLOS"""
import argparse
import json
from pathlib import Path
import requests
import torch
from huggingface_hub import hf_hub_download
from PIL import Image
from transformers import YolosConfig, YolosForObjectDetection, YolosImageProcessor
from transformers.utils import logging
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
def get_yolos_config(yolos_name: str) -> YolosConfig:
config = YolosConfig()
if "yolos_ti" in yolos_name:
config.hidden_size = 192
config.intermediate_size = 768
config.num_hidden_layers = 12
config.num_attention_heads = 3
config.image_size = [800, 1333]
config.use_mid_position_embeddings = False
elif yolos_name == "yolos_s_dWr":
config.hidden_size = 330
config.num_hidden_layers = 14
config.num_attention_heads = 6
config.intermediate_size = 1320
elif "yolos_s" in yolos_name:
config.hidden_size = 384
config.intermediate_size = 1536
config.num_hidden_layers = 12
config.num_attention_heads = 6
elif "yolos_b" in yolos_name:
config.image_size = [800, 1344]
config.num_labels = 91
repo_id = "huggingface/label-files"
filename = "coco-detection-id2label.json"
id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r"))
id2label = {int(k): v for k, v in id2label.items()}
config.id2label = id2label
config.label2id = {v: k for k, v in id2label.items()}
return config
def read_in_q_k_v(state_dict: dict, config: YolosConfig, base_model: bool = False):
for i in range(config.num_hidden_layers):
in_proj_weight = state_dict.pop(f"blocks.{i}.attn.qkv.weight")
in_proj_bias = state_dict.pop(f"blocks.{i}.attn.qkv.bias")
state_dict[f"encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[: config.hidden_size, :]
state_dict[f"encoder.layer.{i}.attention.attention.query.bias"] = in_proj_bias[: config.hidden_size]
state_dict[f"encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[
config.hidden_size : config.hidden_size * 2, :
]
state_dict[f"encoder.layer.{i}.attention.attention.key.bias"] = in_proj_bias[
config.hidden_size : config.hidden_size * 2
]
state_dict[f"encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[-config.hidden_size :, :]
state_dict[f"encoder.layer.{i}.attention.attention.value.bias"] = in_proj_bias[-config.hidden_size :]
def rename_key(name: str) -> str:
if "backbone" in name:
name = name.replace("backbone", "vit")
if "cls_token" in name:
name = name.replace("cls_token", "embeddings.cls_token")
if "det_token" in name:
name = name.replace("det_token", "embeddings.detection_tokens")
if "mid_pos_embed" in name:
name = name.replace("mid_pos_embed", "encoder.mid_position_embeddings")
if "pos_embed" in name:
name = name.replace("pos_embed", "embeddings.position_embeddings")
if "patch_embed.proj" in name:
name = name.replace("patch_embed.proj", "embeddings.patch_embeddings.projection")
if "blocks" in name:
name = name.replace("blocks", "encoder.layer")
if "attn.proj" in name:
name = name.replace("attn.proj", "attention.output.dense")
if "attn" in name:
name = name.replace("attn", "attention.self")
if "norm1" in name:
name = name.replace("norm1", "layernorm_before")
if "norm2" in name:
name = name.replace("norm2", "layernorm_after")
if "mlp.fc1" in name:
name = name.replace("mlp.fc1", "intermediate.dense")
if "mlp.fc2" in name:
name = name.replace("mlp.fc2", "output.dense")
if "class_embed" in name:
name = name.replace("class_embed", "class_labels_classifier")
if "bbox_embed" in name:
name = name.replace("bbox_embed", "bbox_predictor")
if "vit.norm" in name:
name = name.replace("vit.norm", "vit.layernorm")
return name
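For reference, two hypothetical checkpoint keys (assuming `rename_key` above is in scope) map onto the Transformers naming scheme as follows:
```
>>> rename_key("backbone.blocks.0.attn.proj.weight")
'vit.encoder.layer.0.attention.output.dense.weight'
>>> rename_key("backbone.patch_embed.proj.weight")
'vit.embeddings.patch_embeddings.projection.weight'
```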
def convert_state_dict(orig_state_dict: dict, model: YolosForObjectDetection) -> dict:
for key in orig_state_dict.copy().keys():
val = orig_state_dict.pop(key)
if "qkv" in key:
key_split = key.split(".")
layer_num = int(key_split[2])
dim = model.vit.encoder.layer[layer_num].attention.attention.all_head_size
if "weight" in key:
orig_state_dict[f"vit.encoder.layer.{layer_num}.attention.attention.query.weight"] = val[:dim, :]
orig_state_dict[f"vit.encoder.layer.{layer_num}.attention.attention.key.weight"] = val[dim:dim * 2, :]
orig_state_dict[f"vit.encoder.layer.{layer_num}.attention.attention.value.weight"] = val[-dim:, :]
else:
orig_state_dict[f"vit.encoder.layer.{layer_num}.attention.attention.query.bias"] = val[:dim]
orig_state_dict[f"vit.encoder.layer.{layer_num}.attention.attention.key.bias"] = val[dim:dim * 2]
orig_state_dict[f"vit.encoder.layer.{layer_num}.attention.attention.value.bias"] = val[-dim:]
else:
orig_state_dict[rename_key(key)] = val
return orig_state_dict
def prepare_img() -> torch.Tensor:
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
im = Image.open(requests.get(url, stream=True).raw)
return im
@torch.no_grad()
def convert_yolos_checkpoint(
yolos_name: str, checkpoint_path: str, pytorch_dump_folder_path: str, push_to_hub: bool = False
):
"""
Copy/paste/tweak model's weights to our YOLOS structure.
"""
config = get_yolos_config(yolos_name)
state_dict = torch.load(checkpoint_path, map_location="cpu")["model"]
model = YolosForObjectDetection(config)
model.eval()
new_state_dict = convert_state_dict(state_dict, model)
model.load_state_dict(new_state_dict)
size = 800 if yolos_name != "yolos_ti" else 512
image_processor = YolosImageProcessor(format="coco_detection", size=size)
encoding = image_processor(images=prepare_img(), return_tensors="pt")
outputs = model(**encoding)
logits, pred_boxes = outputs.logits, outputs.pred_boxes
expected_slice_logits, expected_slice_boxes = None, None
if yolos_name == "yolos_ti":
expected_slice_logits = torch.tensor(
[[-39.5022, -11.9820, -17.6888], [-29.9574, -9.9769, -17.7691], [-42.3281, -20.7200, -30.6294]]
)
expected_slice_boxes = torch.tensor(
[[0.4021, 0.0836, 0.7979], [0.0184, 0.2609, 0.0364], [0.1781, 0.2004, 0.2095]]
)
elif yolos_name == "yolos_s_200_pre":
expected_slice_logits = torch.tensor(
[[-24.0248, -10.3024, -14.8290], [-42.0392, -16.8200, -27.4334], [-27.2743, -11.8154, -18.7148]]
)
expected_slice_boxes = torch.tensor(
[[0.2559, 0.5455, 0.4706], [0.2989, 0.7279, 0.1875], [0.7732, 0.4017, 0.4462]]
)
elif yolos_name == "yolos_s_300_pre":
expected_slice_logits = torch.tensor(
[[-36.2220, -14.4385, -23.5457], [-35.6970, -14.7583, -21.3935], [-31.5939, -13.6042, -16.8049]]
)
expected_slice_boxes = torch.tensor(
[[0.7614, 0.2316, 0.4728], [0.7168, 0.4495, 0.3855], [0.4996, 0.1466, 0.9996]]
)
elif yolos_name == "yolos_s_dWr":
expected_slice_logits = torch.tensor(
[[-42.8668, -24.1049, -41.1690], [-34.7456, -14.1274, -24.9194], [-33.7898, -12.1946, -25.6495]]
)
expected_slice_boxes = torch.tensor(
[[0.5587, 0.2773, 0.0605], [0.5004, 0.3014, 0.9994], [0.4999, 0.1548, 0.9994]]
)
elif yolos_name == "yolos_base":
expected_slice_logits = torch.tensor(
[[-40.6064, -24.3084, -32.6447], [-55.1990, -30.7719, -35.5877], [-51.4311, -33.3507, -35.6462]]
)
expected_slice_boxes = torch.tensor(
[[0.5555, 0.2794, 0.0655], [0.9049, 0.2664, 0.1894], [0.9183, 0.1984, 0.1635]]
)
else:
raise ValueError(f"Unknown yolos_name: {yolos_name}")
assert torch.allclose(logits[0, :3, :3], expected_slice_logits, atol=1e-4)
assert torch.allclose(pred_boxes[0, :3, :3], expected_slice_boxes, atol=1e-4)
Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
print(f"Saving model {yolos_name} to {pytorch_dump_folder_path}")
model.save_pretrained(pytorch_dump_folder_path)
print(f"Saving image processor to {pytorch_dump_folder_path}")
image_processor.save_pretrained(pytorch_dump_folder_path)
if push_to_hub:
model_mapping = {
"yolos_ti": "yolos-tiny",
"yolos_s_200_pre": "yolos-small",
"yolos_s_300_pre": "yolos-small-300",
"yolos_s_dWr": "yolos-small-dwr",
"yolos_base": "yolos-base",
}
print("Pushing to the hub...")
model_name = model_mapping[yolos_name]
image_processor.push_to_hub(model_name, organization="hustvl")
model.push_to_hub(model_name, organization="hustvl")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--yolos_name",
default="yolos_s_200_pre",
type=str,
help=(
"Name of the YOLOS model you'd like to convert. Should be one of 'yolos_ti', 'yolos_s_200_pre',"
" 'yolos_s_300_pre', 'yolos_s_dWr', 'yolos_base'."
),
)
parser.add_argument(
"--checkpoint_path", default=None, type=str, help="Path to the original state dict (.pth file)."
)
parser.add_argument(
"--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory."
)
parser.add_argument(
"--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
)
args = parser.parse_args()
convert_yolos_checkpoint(args.yolos_name, args.checkpoint_path, args.pytorch_dump_folder_path, args.push_to_hub)
.\models\yolos\feature_extraction_yolos.py
"""YOLOS 的特征提取器类。"""
import warnings
from ...image_transforms import rgb_to_id as _rgb_to_id
from ...utils import logging
from .image_processing_yolos import YolosImageProcessor
logger = logging.get_logger(__name__)
def rgb_to_id(x):
warnings.warn(
"rgb_to_id has moved and will not be importable from this module from v5. "
"Please import from transformers.image_transforms instead.",
FutureWarning,
)
return _rgb_to_id(x)
class YolosFeatureExtractor(YolosImageProcessor):
def __init__(self, *args, **kwargs) -> None:
warnings.warn(
"The class YolosFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please"
" use YolosImageProcessor instead.",
FutureWarning,
)
super().__init__(*args, **kwargs)
.\models\yolos\image_processing_yolos.py
"""
Compute the size of the image while maintaining the aspect ratio based on the given size and optional maximum size.
"""
aspect_ratio = float(image_size[0]) / image_size[1]
if max_size is None or (size[0] <= max_size[0] and size[1] <= max_size[1]):
return size
new_height = int(round(size[0] / aspect_ratio))
if new_height <= max_size[0]:
return size[0], new_height
new_width = int(round(size[1] * aspect_ratio))
return new_width, size[1]
height, width = image_size
if max_size is not None:
min_original_size = float(min((height, width)))
max_original_size = float(max((height, width)))
if max_original_size / min_original_size * size > max_size:
size = int(round(max_size * min_original_size / max_original_size))
if width < height and width != size:
height = int(size * height / width)
width = size
elif height < width and height != size:
width = int(size * width / height)
height = size
width_mod = np.mod(width, 16)
height_mod = np.mod(height, 16)
width = width - width_mod
height = height - height_mod
return (height, width)
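A quick worked example (assuming the helper above is in scope): a 480x640 image resized with `size=800` and `max_size=1333` keeps its aspect ratio, and both edges are then rounded down to multiples of 16:
```
>>> get_size_with_aspect_ratio((480, 640), size=800, max_size=1333)
(800, 1056)
```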
def get_resize_output_image_size(
input_image: np.ndarray,
size: Union[int, Tuple[int, int], List[int]],
max_size: Optional[int] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> Tuple[int, int]:
"""
Computes the output image size given the input image size and the desired output size. If the desired output size
is a tuple or list, the output image size is returned as is. If the desired output size is an integer, the output
image size is computed by keeping the aspect ratio of the input image size.
Args:
input_image (`np.ndarray`):
The image to resize.
size (`int` or `Tuple[int, int]` or `List[int]`):
The desired output size.
max_size (`int`, *optional*):
The maximum allowed output size.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred from the input image.
"""
image_size = get_image_size(input_image, input_data_format)
if isinstance(size, (list, tuple)):
return size
return get_size_with_aspect_ratio(image_size, size, max_size)
def get_numpy_to_framework_fn(arr) -> Callable:
"""
Returns a function that converts a numpy array to the framework of the input array.
Args:
arr (`np.ndarray`): The array to convert.
"""
if isinstance(arr, np.ndarray):
return np.array
if is_tf_available() and is_tf_tensor(arr):
import tensorflow as tf
return tf.convert_to_tensor
if is_torch_available() and is_torch_tensor(arr):
import torch
return torch.tensor
if is_flax_available() and is_jax_tensor(arr):
import jax.numpy as jnp
return jnp.array
raise ValueError(f"Cannot convert arrays of type {type(arr)}")
def safe_squeeze(arr: np.ndarray, axis: Optional[int] = None) -> np.ndarray:
"""
Squeezes an array, but only if the axis specified has dim 1.
"""
if axis is None:
return arr.squeeze()
try:
return arr.squeeze(axis=axis)
except ValueError:
return arr
def normalize_annotation(annotation: Dict, image_size: Tuple[int, int]) -> Dict:
image_height, image_width = image_size
norm_annotation = {}
for key, value in annotation.items():
if key == "boxes":
boxes = value
boxes = corners_to_center_format(boxes)
boxes /= np.asarray([image_width, image_height, image_width, image_height], dtype=np.float32)
norm_annotation[key] = boxes
else:
norm_annotation[key] = value
return norm_annotation
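A sketch of the effect on a single box (assuming the module-level `normalize_annotation` above is in scope): absolute corner coordinates on a 480x640 image become relative `(center_x, center_y, width, height)` values:
```
>>> import numpy as np
>>> annotation = {"boxes": np.array([[100.0, 50.0, 300.0, 250.0]]), "class_labels": np.array([3])}
>>> normalize_annotation(annotation, image_size=(480, 640))["boxes"].round(4).tolist()
[[0.3125, 0.3125, 0.3125, 0.4167]]
```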
def max_across_indices(values: Iterable[Any]) -> List[Any]:
"""
Return the maximum value across all indices of an iterable of values.
"""
return [max(values_i) for values_i in zip(*values)]
def make_pixel_mask(
image: np.ndarray, output_size: Tuple[int, int], input_data_format: Optional[Union[str, ChannelDimension]] = None
) -> np.ndarray:
"""
Make a pixel mask for the image, where 1 indicates a valid pixel and 0 indicates padding.
Args:
image (`np.ndarray`):
The image for which to make the pixel mask.
output_size (`Tuple[int, int]`):
Output size of the mask.
"""
input_height, input_width = get_image_size(image, channel_dim=input_data_format)
mask = np.zeros(output_size, dtype=np.int64)
mask[:input_height, :input_width] = 1
return mask
def convert_coco_poly_to_mask(segmentations, height: int, width: int) -> np.ndarray:
"""
Convert a COCO polygon annotation to a mask.
Args:
segmentations (`List[List[float]]`):
List of polygons, each polygon represented by a list of x-y coordinates.
height (`int`):
Height of the mask.
width (`int`):
Width of the mask.
"""
try:
from pycocotools import mask as coco_mask
except ImportError:
raise ImportError("Pycocotools未安装在您的环境中。")
masks = []
for polygons in segmentations:
rles = coco_mask.frPyObjects(polygons, height, width)
mask = coco_mask.decode(rles)
if len(mask.shape) < 3:
mask = mask[..., None]
mask = np.asarray(mask, dtype=np.uint8)
mask = np.any(mask, axis=2)
masks.append(mask)
if masks:
masks = np.stack(masks, axis=0)
else:
masks = np.zeros((0, height, width), dtype=np.uint8)
return masks
def prepare_coco_detection_annotation(
image,
target,
return_segmentation_masks: bool = False,
input_data_format: Optional[Union[ChannelDimension, str]] = None,
):
"""
Convert the target in COCO format into the format expected by DETR.
"""
image_height, image_width = get_image_size(image, channel_dim=input_data_format)
image_id = target["image_id"]
image_id = np.asarray([image_id], dtype=np.int64)
annotations = target["annotations"]
annotations = [obj for obj in annotations if "iscrowd" not in obj or obj["iscrowd"] == 0]
classes = [obj["category_id"] for obj in annotations]
classes = np.asarray(classes, dtype=np.int64)
area = np.asarray([obj["area"] for obj in annotations], dtype=np.float32)
iscrowd = np.asarray([obj["iscrowd"] if "iscrowd" in obj else 0 for obj in annotations], dtype=np.int64)
boxes = [obj["bbox"] for obj in annotations]
boxes = np.asarray(boxes, dtype=np.float32).reshape(-1, 4)
boxes[:, 2:] += boxes[:, :2]
boxes[:, 0::2] = boxes[:, 0::2].clip(min=0, max=image_width)
boxes[:, 1::2] = boxes[:, 1::2].clip(min=0, max=image_height)
keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0])
new_target = {}
new_target["image_id"] = image_id
new_target["class_labels"] = classes[keep]
new_target["boxes"] = boxes[keep]
new_target["area"] = area[keep]
new_target["iscrowd"] = iscrowd[keep]
new_target["orig_size"] = np.asarray([int(image_height), int(image_width)], dtype=np.int64)
if annotations and "keypoints" in annotations[0]:
keypoints = [obj["keypoints"] for obj in annotations]
keypoints = np.asarray(keypoints, dtype=np.float32)
keypoints = keypoints[keep]
num_keypoints = keypoints.shape[0]
keypoints = keypoints.reshape((-1, 3)) if num_keypoints else keypoints
new_target["keypoints"] = keypoints
if return_segmentation_masks:
segmentation_masks = [obj["segmentation"] for obj in annotations]
masks = convert_coco_poly_to_mask(segmentation_masks, image_height, image_width)
new_target["masks"] = masks[keep]
return new_target
def masks_to_boxes(masks: np.ndarray) -> np.ndarray:
"""
Compute the bounding boxes around the provided panoptic segmentation masks.
Args:
masks: masks in format `[number_masks, height, width]` where N is the number of masks
Returns:
boxes: bounding boxes in format `[number_masks, 4]` in xyxy format
"""
if masks.size == 0:
return np.zeros((0, 4))
h, w = masks.shape[-2:]
y = np.arange(0, h, dtype=np.float32)
x = np.arange(0, w, dtype=np.float32)
y, x = np.meshgrid(y, x, indexing="ij")
x_mask = masks * np.expand_dims(x, axis=0)
x_max = x_mask.reshape(x_mask.shape[0], -1).max(-1)
x = np.ma.array(x_mask, mask=~(np.array(masks, dtype=bool)))
x_min = x.filled(fill_value=1e8)
x_min = x_min.reshape(x_min.shape[0], -1).min(-1)
y_mask = masks * np.expand_dims(y, axis=0)
y_max = y_mask.reshape(x_mask.shape[0], -1).max(-1)
y = np.ma.array(y_mask, mask=~(np.array(masks, dtype=bool)))
y_min = y.filled(fill_value=1e8)
y_min = y_min.reshape(y_min.shape[0], -1).min(-1)
return np.stack([x_min, y_min, x_max, y_max], 1)
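For intuition, a single rectangular mask yields the expected `(x_min, y_min, x_max, y_max)` box (a sketch assuming the function above is in scope):
```
>>> import numpy as np
>>> mask = np.zeros((1, 4, 6), dtype=np.uint8)  # one mask of height 4 and width 6
>>> mask[0, 1:3, 2:5] = 1                       # foreground covers rows 1-2 and columns 2-4
>>> masks_to_boxes(mask).tolist()
[[2.0, 1.0, 4.0, 2.0]]
```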
def prepare_coco_panoptic_annotation(
image: np.ndarray,
target: Dict,
masks_path: Union[str, pathlib.Path],
return_masks: bool = True,
input_data_format: Union[ChannelDimension, str] = None,
) -> Dict:
"""
Prepare a coco panoptic annotation for YOLOS.
"""
image_height, image_width = get_image_size(image, channel_dim=input_data_format)
annotation_path = pathlib.Path(masks_path) / target["file_name"]
new_target = {}
new_target["image_id"] = np.asarray([target["image_id"] if "image_id" in target else target["id"]], dtype=np.int64)
new_target["size"] = np.asarray([image_height, image_width], dtype=np.int64)
new_target["orig_size"] = np.asarray([image_height, image_width], dtype=np.int64)
if "segments_info" in target:
masks = np.asarray(PIL.Image.open(annotation_path), dtype=np.uint32)
masks = rgb_to_id(masks)
ids = np.array([segment_info["id"] for segment_info in target["segments_info"]])
masks = masks == ids[:, None, None]
masks = masks.astype(np.uint8)
if return_masks:
new_target["masks"] = masks
new_target["boxes"] = masks_to_boxes(masks)
new_target["class_labels"] = np.array(
[segment_info["category_id"] for segment_info in target["segments_info"]], dtype=np.int64
)
new_target["iscrowd"] = np.asarray(
[segment_info["iscrowd"] for segment_info in target["segments_info"]], dtype=np.int64
)
new_target["area"] = np.asarray(
[segment_info["area"] for segment_info in target["segments_info"]], dtype=np.float32
)
return new_target
def get_segmentation_image(
masks: np.ndarray, input_size: Tuple, target_size: Tuple, stuff_equiv_classes, deduplicate=False
):
h, w = input_size
final_h, final_w = target_size
m_id = scipy.special.softmax(masks.transpose(0, 1), -1)
if m_id.shape[-1] == 0:
m_id = np.zeros((h, w), dtype=np.int64)
else:
m_id = m_id.argmax(-1).reshape(h, w)
if deduplicate:
for equiv in stuff_equiv_classes.values():
for eq_id in equiv:
m_id[m_id == eq_id] = equiv[0]
seg_img = id_to_rgb(m_id)
seg_img = resize(seg_img, (final_w, final_h), resample=PILImageResampling.NEAREST)
return seg_img
def get_mask_area(seg_img: np.ndarray, target_size: Tuple[int, int], n_classes: int) -> np.ndarray:
final_h, final_w = target_size
np_seg_img = seg_img.astype(np.uint8)
np_seg_img = np_seg_img.reshape(final_h, final_w, 3)
m_id = rgb_to_id(np_seg_img)
area = [(m_id == i).sum() for i in range(n_classes)]
return area
def score_labels_from_class_probabilities(logits: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
probs = scipy.special.softmax(logits, axis=-1)
labels = probs.argmax(-1, keepdims=True)
scores = np.take_along_axis(probs, labels, axis=-1)
scores, labels = scores.squeeze(-1), labels.squeeze(-1)
return scores, labels
def resize_annotation(
annotation: Dict[str, Any],
orig_size: Tuple[int, int],
target_size: Tuple[int, int],
threshold: float = 0.5,
resample: PILImageResampling = PILImageResampling.NEAREST,
):
"""
Resizes an annotation to a target size.
Args:
annotation (`Dict[str, Any]`):
The annotation dictionary.
orig_size (`Tuple[int, int]`):
The original size of the input image.
target_size (`Tuple[int, int]`):
The target size of the image, as returned by the preprocessing `resize` step.
threshold (`float`, *optional*, defaults to 0.5):
The threshold used to binarize the segmentation masks.
resample (`PILImageResampling`, defaults to `PILImageResampling.NEAREST`):
The resampling filter to use when resizing the masks.
"""
ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(target_size, orig_size))
ratio_height, ratio_width = ratios
new_annotation = {}
new_annotation["size"] = target_size
for key, value in annotation.items():
if key == "boxes":
boxes = value
scaled_boxes = boxes * np.asarray([ratio_width, ratio_height, ratio_width, ratio_height], dtype=np.float32)
new_annotation["boxes"] = scaled_boxes
elif key == "area":
area = value
scaled_area = area * (ratio_width * ratio_height)
new_annotation["area"] = scaled_area
elif key == "masks":
masks = value[:, None]
masks = np.array([resize(mask, target_size, resample=resample) for mask in masks])
masks = masks.astype(np.float32)
masks = masks[:, 0] > threshold
new_annotation["masks"] = masks
elif key == "size":
new_annotation["size"] = target_size
else:
new_annotation[key] = value
return new_annotation
def binary_mask_to_rle(mask):
"""
Converts given binary mask of shape `(height, width)` to the run-length encoding (RLE) format.
Args:
mask (`torch.Tensor` or `numpy.array`):
A binary mask tensor of shape `(height, width)` where 0 denotes background and 1 denotes the target
segment_id or class_id.
Returns:
`List`: Run-length encoded list of the binary mask. Refer to COCO API for more information about the RLE
format.
"""
if is_torch_tensor(mask):
mask = mask.numpy()
pixels = mask.flatten()
pixels = np.concatenate([[0], pixels, [0]])
runs = np.where(pixels[1:] != pixels[:-1])[0] + 1
runs[1::2] -= runs[::2]
return list(runs)
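A tiny worked example (assuming the function above is in scope): the encoding is a flat list of start/length pairs over the flattened mask, with 1-based start positions:
```
>>> import numpy as np
>>> mask = np.array([[0, 1, 1], [0, 1, 0]])
>>> [int(v) for v in binary_mask_to_rle(mask)]
[2, 2, 5, 1]
```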
def convert_segmentation_to_rle(segmentation):
"""
Converts given segmentation map of shape `(height, width)` to the run-length encoding (RLE) format.
Args:
segmentation (`torch.Tensor` or `numpy.array`):
A segmentation map of shape `(height, width)` where each value denotes a segment or class id.
Returns:
`List[List]`: A list of lists, where each list is the run-length encoding of a segment / class id.
"""
segment_ids = torch.unique(segmentation)
run_length_encodings = []
for idx in segment_ids:
mask = torch.where(segmentation == idx, 1, 0)
rle = binary_mask_to_rle(mask)
run_length_encodings.append(rle)
return run_length_encodings
def remove_low_and_no_objects(masks, scores, labels, object_mask_threshold, num_labels):
"""
Binarize the given masks using `object_mask_threshold`, it returns the associated values of `masks`, `scores` and
`labels`.
Args:
masks (`torch.Tensor`):
A tensor of shape `(num_queries, height, width)`.
scores (`torch.Tensor`):
A tensor of shape `(num_queries)`.
labels (`torch.Tensor`):
A tensor of shape `(num_queries)`.
object_mask_threshold (`float`):
A number between 0 and 1 used to binarize the masks.
Raises:
`ValueError`: Raised when the first dimension doesn't match in all input tensors.
Returns:
`Tuple[`torch.Tensor`, `torch.Tensor`, `torch.Tensor`]`: The `masks`, `scores` and `labels` without the region
< `object_mask_threshold`.
"""
if not (masks.shape[0] == scores.shape[0] == labels.shape[0]):
raise ValueError("mask, scores and labels must have the same shape!")
to_keep = labels.ne(num_labels) & (scores > object_mask_threshold)
return masks[to_keep], scores[to_keep], labels[to_keep]
def check_segment_validity(mask_labels, mask_probs, k, mask_threshold=0.5, overlap_mask_area_threshold=0.8):
mask_k = mask_labels == k
mask_k_area = mask_k.sum()
original_area = (mask_probs[k] >= mask_threshold).sum()
mask_exists = mask_k_area > 0 and original_area > 0
if mask_exists:
area_ratio = mask_k_area / original_area
if not area_ratio.item() > overlap_mask_area_threshold:
mask_exists = False
return mask_exists, mask_k
def compute_segments(
mask_probs,
pred_scores,
pred_labels,
mask_threshold: float = 0.5,
overlap_mask_area_threshold: float = 0.8,
label_ids_to_fuse: Optional[Set[int]] = None,
target_size: Tuple[int, int] = None,
):
height = mask_probs.shape[1] if target_size is None else target_size[0]
width = mask_probs.shape[2] if target_size is None else target_size[1]
segmentation = torch.zeros((height, width), dtype=torch.int32, device=mask_probs.device)
segments: List[Dict] = []
if target_size is not None:
mask_probs = nn.functional.interpolate(
mask_probs.unsqueeze(0), size=target_size, mode="bilinear", align_corners=False
)[0]
current_segment_id = 0
mask_probs *= pred_scores.view(-1, 1, 1)
mask_labels = mask_probs.argmax(0)
stuff_memory_list: Dict[str, int] = {}
for k in range(pred_labels.shape[0]):
pred_class = pred_labels[k].item()
should_fuse = pred_class in label_ids_to_fuse
mask_exists, mask_k = check_segment_validity(
mask_labels, mask_probs, k, mask_threshold, overlap_mask_area_threshold
)
if mask_exists:
if pred_class in stuff_memory_list:
current_segment_id = stuff_memory_list[pred_class]
else:
current_segment_id += 1
segmentation[mask_k] = current_segment_id
segment_score = round(pred_scores[k].item(), 6)
segments.append(
{
"id": current_segment_id,
"label_id": pred_class,
"was_fused": should_fuse,
"score": segment_score,
}
)
if should_fuse:
stuff_memory_list[pred_class] = current_segment_id
return segmentation, segments
class YolosImageProcessor(BaseImageProcessor):
r"""
Constructs a YOLOS image processor.
Args:
format (`str`, *optional*, defaults to `"coco_detection"`):
Data format of the annotations. One of "coco_detection" or "coco_panoptic".
do_resize (`bool`, *optional*, defaults to `True`):
Controls whether to resize the image's (height, width) dimensions to the specified `size`. Can be
overridden by the `do_resize` parameter in the `preprocess` method.
size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 800, "longest_edge": 1333}`):
Size of the image's (height, width) dimensions after resizing. Can be overridden by the `size` parameter in
the `preprocess` method.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
Resampling filter to use if resizing the image.
do_rescale (`bool`, *optional*, defaults to `True`):
Controls whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the
`do_rescale` parameter in the `preprocess` method.
rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the
`preprocess` method.
do_normalize:
Controls whether to normalize the image. Can be overridden by the `do_normalize` parameter in the
`preprocess` method.
image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_MEAN`):
Mean values to use when normalizing the image. Can be a single value or a list of values, one for each
channel. Can be overridden by the `image_mean` parameter in the `preprocess` method.
image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_STD`):
Standard deviation values to use when normalizing the image. Can be a single value or a list of values, one
for each channel. Can be overridden by the `image_std` parameter in the `preprocess` method.
do_pad (`bool`, *optional*, defaults to `True`):
Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess`
method. If `True` will pad the images in the batch to the largest height and width in the batch.
Padding will be applied to the bottom and right of the image with zeros.
"""
model_input_names = ["pixel_values", "pixel_mask"]
def __init__(
self,
format: Union[str, AnnotationFormat] = AnnotationFormat.COCO_DETECTION,
do_resize: bool = True,
size: Dict[str, int] = None,
resample: PILImageResampling = PILImageResampling.BILINEAR,
do_rescale: bool = True,
rescale_factor: Union[int, float] = 1 / 255,
do_normalize: bool = True,
image_mean: Union[float, List[float]] = None,
image_std: Union[float, List[float]] = None,
do_convert_annotations: Optional[bool] = None,
do_pad: bool = True,
**kwargs,
) -> None:
if "pad_and_return_pixel_mask" in kwargs:
do_pad = kwargs.pop("pad_and_return_pixel_mask")
if "max_size" in kwargs:
logger.warning_once(
"The `max_size` parameter is deprecated and will be removed in v4.26. "
"Please specify in `size['longest_edge'] instead`.",
)
max_size = kwargs.pop("max_size")
else:
max_size = None if size is None else 1333
size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333}
size = get_size_dict(size, max_size=max_size, default_to_square=False)
if do_convert_annotations is None:
do_convert_annotations = do_normalize
super().__init__(**kwargs)
self.format = format
self.do_resize = do_resize
self.size = size
self.resample = resample
self.do_rescale = do_rescale
self.rescale_factor = rescale_factor
self.do_normalize = do_normalize
self.do_convert_annotations = do_convert_annotations
self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
self.do_pad = do_pad
self._valid_processor_keys = [
"images",
"annotations",
"return_segmentation_masks",
"masks_path",
"do_resize",
"size",
"resample",
"do_rescale",
"rescale_factor",
"do_normalize",
"image_mean",
"image_std",
"do_convert_annotations",
"do_pad",
"format",
"return_tensors",
"data_format",
"input_data_format",
]
@classmethod
def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs):
image_processor_dict = image_processor_dict.copy()
if "max_size" in kwargs:
image_processor_dict["max_size"] = kwargs.pop("max_size")
if "pad_and_return_pixel_mask" in kwargs:
image_processor_dict["pad_and_return_pixel_mask"] = kwargs.pop("pad_and_return_pixel_mask")
return super().from_dict(image_processor_dict, **kwargs)
def prepare_annotation(
self,
image: np.ndarray,
target: Dict,
format: Optional[AnnotationFormat] = None,
return_segmentation_masks: bool = None,
masks_path: Optional[Union[str, pathlib.Path]] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> Dict:
"""
Prepare an annotation for feeding into DETR model.
"""
format = format if format is not None else self.format
if format == AnnotationFormat.COCO_DETECTION:
return_segmentation_masks = False if return_segmentation_masks is None else return_segmentation_masks
target = prepare_coco_detection_annotation(
image, target, return_segmentation_masks, input_data_format=input_data_format
)
elif format == AnnotationFormat.COCO_PANOPTIC:
return_segmentation_masks = True if return_segmentation_masks is None else return_segmentation_masks
target = prepare_coco_panoptic_annotation(
image,
target,
masks_path=masks_path,
return_masks=return_segmentation_masks,
input_data_format=input_data_format,
)
else:
raise ValueError(f"Format {format} is not supported.")
return target
def prepare(self, image, target, return_segmentation_masks=None, masks_path=None):
logger.warning_once(
"The `prepare` method is deprecated and will be removed in a v4.33. "
"Please use `prepare_annotation` instead. Note: the `prepare_annotation` method "
"does not return the image anymore.",
)
target = self.prepare_annotation(image, target, return_segmentation_masks, masks_path, self.format)
return image, target
def convert_coco_poly_to_mask(self, *args, **kwargs):
logger.warning_once("The `convert_coco_poly_to_mask` method is deprecated and will be removed in v4.33. ")
return convert_coco_poly_to_mask(*args, **kwargs)
def prepare_coco_detection(self, *args, **kwargs):
logger.warning_once("The `prepare_coco_detection` method is deprecated and will be removed in v4.33. ")
return prepare_coco_detection_annotation(*args, **kwargs)
def prepare_coco_panoptic(self, *args, **kwargs):
logger.warning_once("The `prepare_coco_panoptic` method is deprecated and will be removed in v4.33. ")
return prepare_coco_panoptic_annotation(*args, **kwargs)
def resize(
self,
image: np.ndarray,
size: Dict[str, int],
resample: PILImageResampling = PILImageResampling.BILINEAR,
data_format: Optional[Union[str, ChannelDimension]] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
) -> np.ndarray:
"""
Resize the image to the given size. Size can be `min_size` (scalar) or `(height, width)` tuple. If size is an
int, smaller edge of the image will be matched to this number.
Args:
image (`np.ndarray`):
Image to resize.
size (`Dict[str, int]`):
Dictionary containing the size to resize to. Can contain the keys `shortest_edge` and `longest_edge` or
`height` and `width`.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
Resampling filter to use if resizing the image.
data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format for the output image. If unset, the channel dimension format of the input
image is used.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred.
"""
if "max_size" in kwargs:
logger.warning_once(
"The `max_size` parameter is deprecated and will be removed in v4.26. "
"Please specify in `size['longest_edge'] instead`.",
)
max_size = kwargs.pop("max_size")
else:
max_size = None
size = get_size_dict(size, max_size=max_size, default_to_square=False)
if "shortest_edge" in size and "longest_edge" in size:
size = get_resize_output_image_size(
image, size["shortest_edge"], size["longest_edge"], input_data_format=input_data_format
)
elif "height" in size and "width" in size:
size = (size["height"], size["width"])
else:
raise ValueError(
"Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got"
f" {size.keys()}."
)
image = resize(
image, size=size, resample=resample, data_format=data_format, input_data_format=input_data_format, **kwargs
)
return image
def resize_annotation(
self,
annotation,
orig_size,
size,
resample: PILImageResampling = PILImageResampling.NEAREST,
) -> Dict:
"""
Resize the annotation to match the resized image. If size is an int, smaller edge of the mask will be matched
to this number.
"""
return resize_annotation(annotation, orig_size=orig_size, target_size=size, resample=resample)
def rescale(
self,
image: np.ndarray,
rescale_factor: float,
data_format: Optional[Union[str, ChannelDimension]] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> np.ndarray:
"""
Rescale the image by the given factor. image = image * rescale_factor.
Args:
image (`np.ndarray`):
Image to rescale.
rescale_factor (`float`):
The value to use for rescaling.
data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format for the output image. If unset, the channel dimension format of the input
image is used. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
input_data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format for the input image. If unset, is inferred from the input image. Can be
one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
"""
return rescale(image, rescale_factor, data_format=data_format, input_data_format=input_data_format)
def normalize_annotation(self, annotation: Dict, image_size: Tuple[int, int]) -> Dict:
"""
Normalize the boxes in the annotation from `[top_left_x, top_left_y, bottom_right_x, bottom_right_y]` to
`[center_x, center_y, width, height]` format and from absolute to relative pixel values.
"""
return normalize_annotation(annotation, image_size=image_size)
def _update_annotation_for_padded_image(
self,
annotation: Dict,
input_image_size: Tuple[int, int],
output_image_size: Tuple[int, int],
padding,
update_bboxes,
) -> Dict:
"""
Update the annotation to reflect padding changes in the input image.
Args:
annotation (`Dict`):
The original annotation to update.
input_image_size (`Tuple[int, int]`):
Size of the original input image before padding.
output_image_size (`Tuple[int, int]`):
Size of the padded output image after padding.
padding:
Details of padding applied to the image.
update_bboxes:
Boolean flag indicating whether to update bounding boxes in the annotation.
Returns:
`Dict`: Updated annotation reflecting changes due to padding.
"""
new_annotation = {}
new_annotation["size"] = output_image_size
for key, value in annotation.items():
if key == "masks":
masks = value
masks = pad(
masks,
padding,
mode=PaddingMode.CONSTANT,
constant_values=0,
input_data_format=ChannelDimension.FIRST,
)
masks = safe_squeeze(masks, 1)
new_annotation["masks"] = masks
elif key == "boxes" and update_bboxes:
boxes = value
boxes *= np.asarray(
[
input_image_size[1] / output_image_size[1],
input_image_size[0] / output_image_size[0],
input_image_size[1] / output_image_size[1],
input_image_size[0] / output_image_size[0],
]
)
new_annotation["boxes"] = boxes
elif key == "size":
new_annotation["size"] = output_image_size
else:
new_annotation[key] = value
return new_annotation
def _pad_image(
self,
image: np.ndarray,
output_size: Tuple[int, int],
annotation: Optional[Dict[str, Any]] = None,
constant_values: Union[float, Iterable[float]] = 0,
data_format: Optional[ChannelDimension] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
update_bboxes: bool = True,
) -> np.ndarray:
"""
Pad an image with zeros to the given size.
"""
input_height, input_width = get_image_size(image, channel_dim=input_data_format)
output_height, output_width = output_size
pad_bottom = output_height - input_height
pad_right = output_width - input_width
padding = ((0, pad_bottom), (0, pad_right))
padded_image = pad(
image,
padding,
mode=PaddingMode.CONSTANT,
constant_values=constant_values,
data_format=data_format,
input_data_format=input_data_format,
)
if annotation is not None:
annotation = self._update_annotation_for_padded_image(
annotation, (input_height, input_width), (output_height, output_width), padding, update_bboxes
)
return padded_image, annotation
def pad(
self,
images: List[np.ndarray],
annotations: Optional[List[Dict[str, Any]]] = None,
constant_values: Union[float, Iterable[float]] = 0,
return_pixel_mask: bool = False,
return_tensors: Optional[Union[str, TensorType]] = None,
data_format: Optional[ChannelDimension] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
update_bboxes: bool = True,
) -> BatchFeature:
...
def preprocess(
self,
images: ImageInput,
annotations: Optional[Union[AnnotationType, List[AnnotationType]]] = None,
return_segmentation_masks: bool = None,
masks_path: Optional[Union[str, pathlib.Path]] = None,
do_resize: Optional[bool] = None,
size: Optional[Dict[str, int]] = None,
resample=None,
do_rescale: Optional[bool] = None,
rescale_factor: Optional[Union[int, float]] = None,
do_normalize: Optional[bool] = None,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
do_convert_annotations: Optional[bool] = None,
do_pad: Optional[bool] = None,
format: Optional[Union[str, AnnotationFormat]] = None,
return_tensors: Optional[Union[TensorType, str]] = None,
data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
) -> BatchFeature:
...
def post_process(self, outputs, target_sizes):
"""
Converts the raw output of [`YolosForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y,
bottom_right_x, bottom_right_y) format. Only supports PyTorch.
Args:
outputs ([`YolosObjectDetectionOutput`]):
Raw outputs of the model.
target_sizes (`torch.Tensor` of shape `(batch_size, 2)`):
Tensor containing the size (height, width) of each image of the batch. For evaluation, this must be the
original image size (before any data augmentation). For visualization, this should be the image size
after data augment, but before padding.
Returns:
`List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
in the batch as predicted by the model.
"""
logger.warning_once(
"`post_process` is deprecated and will be removed in v5 of Transformers, please use"
" `post_process_object_detection` instead, with `threshold=0.` for equivalent results.",
)
out_logits, out_bbox = outputs.logits, outputs.pred_boxes
if len(out_logits) != len(target_sizes):
raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits")
if target_sizes.shape[1] != 2:
raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch")
prob = nn.functional.softmax(out_logits, -1)
scores, labels = prob[..., :-1].max(-1)
boxes = center_to_corners_format(out_bbox)
img_h, img_w = target_sizes.unbind(1)
scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device)
boxes = boxes * scale_fct[:, None, :]
results = [{"scores": s, "labels": l, "boxes": b} for s, l, b in zip(scores, labels, boxes)]
return results
def post_process_object_detection(
self, outputs, threshold: float = 0.5, target_sizes: Union[TensorType, List[Tuple]] = None
):
"""
Converts the raw output of [`YolosForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y,
bottom_right_x, bottom_right_y) format. Only supports PyTorch.
Args:
outputs ([`YolosObjectDetectionOutput`]):
Raw outputs of the model.
threshold (`float`, *optional*):
Score threshold to keep object detection predictions.
target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*):
Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size
`(height, width)` of each image in the batch. If unset, predictions will not be resized.
Returns:
`List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
in the batch as predicted by the model.
"""
out_logits, out_bbox = outputs.logits, outputs.pred_boxes
if target_sizes is not None:
if len(out_logits) != len(target_sizes):
raise ValueError(
"Make sure that you pass in as many target sizes as the batch dimension of the logits"
)
prob = nn.functional.softmax(out_logits, -1)
scores, labels = prob[..., :-1].max(-1)
boxes = center_to_corners_format(out_bbox)
if target_sizes is not None:
if isinstance(target_sizes, list):
img_h = torch.Tensor([i[0] for i in target_sizes])
img_w = torch.Tensor([i[1] for i in target_sizes])
else:
img_h, img_w = target_sizes.unbind(1)
scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device)
boxes = boxes * scale_fct[:, None, :]
results = []
for s, l, b in zip(scores, labels, boxes):
score = s[s > threshold]
label = l[s > threshold]
box = b[s > threshold]
results.append({"scores": score, "labels": label, "boxes": box})
return results
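# A rough usage sketch (not part of the original file) of how post_process_object_detection is
# typically driven end to end, assuming the "hustvl/yolos-small" checkpoint; the image path and
# threshold are illustrative.
import torch
from PIL import Image
from transformers import AutoImageProcessor, YolosForObjectDetection

image = Image.open("example.jpg")  # hypothetical input image
image_processor = AutoImageProcessor.from_pretrained("hustvl/yolos-small")
model = YolosForObjectDetection.from_pretrained("hustvl/yolos-small")

inputs = image_processor(images=image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# target_sizes must be the original (height, width) of each image, as documented above.
target_sizes = torch.tensor([image.size[::-1]])
results = image_processor.post_process_object_detection(outputs, threshold=0.9, target_sizes=target_sizes)[0]
for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
    print(model.config.id2label[label.item()], round(score.item(), 3), box.tolist())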
.\models\yolos\modeling_yolos.py
""" PyTorch YOLOS 模型."""
import collections.abc
import math
from dataclasses import dataclass
from typing import Dict, List, Optional, Set, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import Tensor, nn
from ...activations import ACT2FN
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import (
ModelOutput,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
is_accelerate_available,
is_scipy_available,
is_vision_available,
logging,
replace_return_docstrings,
requires_backends,
)
from .configuration_yolos import YolosConfig
if is_scipy_available():
from scipy.optimize import linear_sum_assignment
if is_vision_available():
from transformers.image_transforms import center_to_corners_format
if is_accelerate_available():
from accelerate import PartialState
from accelerate.utils import reduce
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "YolosConfig"
_CHECKPOINT_FOR_DOC = "hustvl/yolos-small"
_EXPECTED_OUTPUT_SHAPE = [1, 3401, 384]
YOLOS_PRETRAINED_MODEL_ARCHIVE_LIST = [
"hustvl/yolos-small",
]
@dataclass
class YolosObjectDetectionOutput(ModelOutput):
"""
Output type of [`YolosForObjectDetection`].
Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided):
Total loss as a linear combination of a negative log-likelihood (cross-entropy) for class prediction and a
bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized
scale-invariant IoU loss.
loss_dict (`Dict`, *optional*):
A dictionary containing the individual losses. Useful for logging.
logits (`torch.FloatTensor` of shape `(batch_size, num_queries, num_classes + 1)`):
Classification logits (including no-object) for all queries.
pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
possible padding). You can use [`~YolosImageProcessor.post_process`] to retrieve the unnormalized bounding
boxes.
auxiliary_outputs (`list[Dict]`, *optional*):
Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`)
and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and
`pred_boxes`) for each decoder layer.
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the decoder of the model.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of
the model at the output of each layer plus the optional initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
the self-attention heads.
"""
loss: Optional[torch.FloatTensor] = None
loss_dict: Optional[Dict] = None
logits: torch.FloatTensor = None
pred_boxes: torch.FloatTensor = None
auxiliary_outputs: Optional[List[Dict]] = None
last_hidden_state: Optional[torch.FloatTensor] = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
class YolosEmbeddings(nn.Module):
"""
Construct the CLS token, detection tokens, position and patch embeddings.
"""
def __init__(self, config: YolosConfig) -> None:
super().__init__()
self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
self.detection_tokens = nn.Parameter(torch.zeros(1, config.num_detection_tokens, config.hidden_size))
self.patch_embeddings = YolosPatchEmbeddings(config)
num_patches = self.patch_embeddings.num_patches
self.position_embeddings = nn.Parameter(
torch.zeros(1, num_patches + config.num_detection_tokens + 1, config.hidden_size)
)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.interpolation = InterpolateInitialPositionEmbeddings(config)
self.config = config
def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
batch_size, num_channels, height, width = pixel_values.shape
embeddings = self.patch_embeddings(pixel_values)
batch_size, seq_len, _ = embeddings.size()
cls_tokens = self.cls_token.expand(batch_size, -1, -1)
detection_tokens = self.detection_tokens.expand(batch_size, -1, -1)
embeddings = torch.cat((cls_tokens, embeddings, detection_tokens), dim=1)
position_embeddings = self.interpolation(self.position_embeddings, (height, width))
embeddings = embeddings + position_embeddings
embeddings = self.dropout(embeddings)
return embeddings
class InterpolateInitialPositionEmbeddings(nn.Module):
def __init__(self, config) -> None:
super().__init__()
self.config = config
def forward(self, pos_embed, img_size=(800, 1344)) -> torch.Tensor:
cls_pos_embed = pos_embed[:, 0, :]
cls_pos_embed = cls_pos_embed[:, None]
det_pos_embed = pos_embed[:, -self.config.num_detection_tokens :, :]
patch_pos_embed = pos_embed[:, 1 : -self.config.num_detection_tokens, :]
patch_pos_embed = patch_pos_embed.transpose(1, 2)
batch_size, hidden_size, seq_len = patch_pos_embed.shape
patch_height, patch_width = (
self.config.image_size[0] // self.config.patch_size,
self.config.image_size[1] // self.config.patch_size,
)
patch_pos_embed = patch_pos_embed.view(batch_size, hidden_size, patch_height, patch_width)
height, width = img_size
new_patch_height, new_patch_width = height // self.config.patch_size, width // self.config.patch_size
patch_pos_embed = nn.functional.interpolate(
patch_pos_embed, size=(new_patch_height, new_patch_width), mode="bicubic", align_corners=False
)
patch_pos_embed = patch_pos_embed.flatten(2).transpose(1, 2)
scale_pos_embed = torch.cat((cls_pos_embed, patch_pos_embed, det_pos_embed), dim=1)
return scale_pos_embed
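# A toy shape walkthrough (not part of the original file) of the split / interpolate /
# re-concatenate flow above, assuming image_size=(32, 48), patch_size=16, hidden_size=8 and
# 2 detection tokens; all numbers are illustrative.
import torch
from torch import nn

hidden_size, num_det = 8, 2
patch_h, patch_w = 32 // 16, 48 // 16          # 2 x 3 patch grid at the pre-training resolution
pos_embed = torch.zeros(1, 1 + patch_h * patch_w + num_det, hidden_size)

cls_pos = pos_embed[:, :1, :]
det_pos = pos_embed[:, -num_det:, :]
patch_pos = pos_embed[:, 1:-num_det, :].transpose(1, 2).view(1, hidden_size, patch_h, patch_w)

new_h, new_w = 64 // 16, 80 // 16              # patch grid for a 64 x 80 input image
patch_pos = nn.functional.interpolate(patch_pos, size=(new_h, new_w), mode="bicubic", align_corners=False)
patch_pos = patch_pos.flatten(2).transpose(1, 2)

out = torch.cat((cls_pos, patch_pos, det_pos), dim=1)
assert out.shape == (1, 1 + new_h * new_w + num_det, hidden_size)  # (1, 23, 8)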
class InterpolateMidPositionEmbeddings(nn.Module):
"""
模块用于在Transformer模型中插值中间位置的位置嵌入。
Args:
config: 模型配置参数对象
Attributes:
config: 存储模型配置参数的对象
Methods:
forward(pos_embed, img_size=(800, 1344)): 前向传播方法,用于计算位置嵌入的插值结果。
"""
def __init__(self, config) -> None:
super().__init__()
self.config = config
def forward(self, pos_embed, img_size=(800, 1344)) -> torch.Tensor:
"""
执行前向传播计算插值后的位置嵌入。
Args:
pos_embed: 位置嵌入张量,形状为(batch_size, seq_length, hidden_size, seq_len)
img_size: 图像大小元组,默认为(800, 1344)
Returns:
scale_pos_embed: 插值后的位置嵌入张量,形状为(batch_size, seq_length, hidden_size)
"""
cls_pos_embed = pos_embed[:, :, 0, :]
cls_pos_embed = cls_pos_embed[:, None]
det_pos_embed = pos_embed[:, :, -self.config.num_detection_tokens :, :]
patch_pos_embed = pos_embed[:, :, 1 : -self.config.num_detection_tokens, :]
patch_pos_embed = patch_pos_embed.transpose(2, 3)
depth, batch_size, hidden_size, seq_len = patch_pos_embed.shape
patch_height, patch_width = (
self.config.image_size[0] // self.config.patch_size,
self.config.image_size[1] // self.config.patch_size,
)
patch_pos_embed = patch_pos_embed.view(depth * batch_size, hidden_size, patch_height, patch_width)
height, width = img_size
new_patch_height, new_patch_width = height // self.config.patch_size, width // self.config.patch_size
patch_pos_embed = nn.functional.interpolate(
patch_pos_embed, size=(new_patch_height, new_patch_width), mode="bicubic", align_corners=False
)
patch_pos_embed = (
patch_pos_embed.flatten(2)
.transpose(1, 2)
.contiguous()
.view(depth, batch_size, new_patch_height * new_patch_width, hidden_size)
)
scale_pos_embed = torch.cat((cls_pos_embed, patch_pos_embed, det_pos_embed), dim=2)
return scale_pos_embed
class YolosPatchEmbeddings(nn.Module):
"""
此类将输入的`pixel_values`(形状为(batch_size, num_channels, height, width))转换为Transformer模型消费的初始隐藏状态(补丁嵌入),
形状为(batch_size, seq_length, hidden_size)。
Args:
config: 模型配置参数对象
Attributes:
image_size: 图像大小元组
patch_size: 补丁大小元组
num_channels: 输入图像的通道数
num_patches: 图像中的补丁数量
Methods:
__init__(config): 初始化方法,设置类属性和卷积投影
"""
def __init__(self, config):
super().__init__()
image_size, patch_size = config.image_size, config.patch_size
num_channels, hidden_size = config.num_channels, config.hidden_size
image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
self.image_size = image_size
self.patch_size = patch_size
self.num_channels = num_channels
self.num_patches = num_patches
self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)
def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
batch_size, num_channels, height, width = pixel_values.shape
if num_channels != self.num_channels:
raise ValueError(
"Make sure that the channel dimension of the pixel values match with the one set in the configuration."
)
embeddings = self.projection(pixel_values).flatten(2).transpose(1, 2)
return embeddings
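# A small shape check (not part of the original file): the Conv2d projection above turns
# (batch, channels, H, W) into (batch, num_patches, hidden_size), since a stride equal to the
# kernel size maps every patch_size x patch_size tile to one token. Sizes are illustrative.
import torch
from torch import nn

batch, channels, height, width = 2, 3, 224, 224
patch_size, hidden_size = 16, 64
projection = nn.Conv2d(channels, hidden_size, kernel_size=patch_size, stride=patch_size)

pixel_values = torch.randn(batch, channels, height, width)
embeddings = projection(pixel_values).flatten(2).transpose(1, 2)
assert embeddings.shape == (2, (224 // 16) * (224 // 16), hidden_size)  # (2, 196, 64)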
class YolosSelfAttention(nn.Module):
def __init__(self, config: YolosConfig) -> None:
super().__init__()
if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
raise ValueError(
f"The hidden size {config.hidden_size,} is not a multiple of the number of attention "
f"heads {config.num_attention_heads}."
)
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
x = x.view(new_x_shape)
return x.permute(0, 2, 1, 3)
def forward(
self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False
) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
mixed_query_layer = self.query(hidden_states)
key_layer = self.transpose_for_scores(self.key(hidden_states))
value_layer = self.transpose_for_scores(self.value(hidden_states))
query_layer = self.transpose_for_scores(mixed_query_layer)
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
attention_scores = attention_scores / math.sqrt(self.attention_head_size)
attention_probs = nn.functional.softmax(attention_scores, dim=-1)
attention_probs = self.dropout(attention_probs)
if head_mask is not None:
attention_probs = attention_probs * head_mask
context_layer = torch.matmul(attention_probs, value_layer)
context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
context_layer = context_layer.view(new_context_layer_shape)
outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
return outputs
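# A toy reshaping sketch (not part of the original file) of what transpose_for_scores does:
# hidden_size is split into (num_heads, head_size) and the head axis is moved in front so the
# matmuls above run independently per head. Sizes are illustrative; q = k = v here for brevity.
import torch

batch, seq_len, num_heads, head_size = 2, 5, 4, 8
hidden = torch.randn(batch, seq_len, num_heads * head_size)

x = hidden.view(batch, seq_len, num_heads, head_size).permute(0, 2, 1, 3)  # (batch, heads, seq, head_size)
scores = torch.matmul(x, x.transpose(-1, -2)) / (head_size ** 0.5)         # (batch, heads, seq, seq)
probs = torch.softmax(scores, dim=-1)
context = torch.matmul(probs, x).permute(0, 2, 1, 3).reshape(batch, seq_len, num_heads * head_size)
assert context.shape == hidden.shape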
class YolosSelfOutput(nn.Module):
"""
The residual connection is defined in YolosLayer instead of here (as is the case with other models), due to the
layernorm applied before each block.
"""
def __init__(self, config: YolosConfig) -> None:
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
return hidden_states
class YolosAttention(nn.Module):
def __init__(self, config: YolosConfig) -> None:
super().__init__()
self.attention = YolosSelfAttention(config)
self.output = YolosSelfOutput(config)
self.pruned_heads = set()
def prune_heads(self, heads: Set[int]) -> None:
if len(heads) == 0:
return
heads, index = find_pruneable_heads_and_indices(
heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads
)
self.attention.query = prune_linear_layer(self.attention.query, index)
self.attention.key = prune_linear_layer(self.attention.key, index)
self.attention.value = prune_linear_layer(self.attention.value, index)
self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads)
self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads
self.pruned_heads = self.pruned_heads.union(heads)
def forward(
self,
hidden_states: torch.Tensor,
head_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
self_outputs = self.attention(hidden_states, head_mask, output_attentions)
attention_output = self.output(self_outputs[0], hidden_states)
outputs = (attention_output,) + self_outputs[1:]
return outputs
class YolosIntermediate(nn.Module):
def __init__(self, config: YolosConfig) -> None:
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states
class YolosOutput(nn.Module):
def __init__(self, config: YolosConfig) -> None:
super().__init__()
self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = hidden_states + input_tensor
return hidden_states
class YolosLayer(nn.Module):
"""这对应于timm实现中的Block类。"""
def __init__(self, config: YolosConfig) -> None:
super().__init__()
self.chunk_size_feed_forward = config.chunk_size_feed_forward
self.seq_len_dim = 1
self.attention = YolosAttention(config)
self.intermediate = YolosIntermediate(config)
self.output = YolosOutput(config)
self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
def forward(
self,
hidden_states: torch.Tensor,
head_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
self_attention_outputs = self.attention(
self.layernorm_before(hidden_states),
head_mask,
output_attentions=output_attentions,
)
attention_output = self_attention_outputs[0]
outputs = self_attention_outputs[1:]
hidden_states = attention_output + hidden_states
layer_output = self.layernorm_after(hidden_states)
layer_output = self.intermediate(layer_output)
layer_output = self.output(layer_output, hidden_states)
outputs = (layer_output,) + outputs
return outputs
class YolosEncoder(nn.Module):
    def __init__(self, config: YolosConfig) -> None:
super().__init__()
self.config = config
self.layer = nn.ModuleList([YolosLayer(config) for _ in range(config.num_hidden_layers)])
self.gradient_checkpointing = False
seq_length = (
1 + (config.image_size[0] * config.image_size[1] // config.patch_size**2) + config.num_detection_tokens
)
self.mid_position_embeddings = (
nn.Parameter(
torch.zeros(
config.num_hidden_layers - 1,
1,
seq_length,
config.hidden_size,
)
)
if config.use_mid_position_embeddings
else None
)
self.interpolation = InterpolateMidPositionEmbeddings(config) if config.use_mid_position_embeddings else None
def forward(
self,
hidden_states: torch.Tensor,
height,
width,
head_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
) -> Union[tuple, BaseModelOutput]:
all_hidden_states = () if output_hidden_states else None
all_self_attentions = () if output_attentions else None
if self.config.use_mid_position_embeddings:
interpolated_mid_position_embeddings = self.interpolation(self.mid_position_embeddings, (height, width))
for i, layer_module in enumerate(self.layer):
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
layer_head_mask = head_mask[i] if head_mask is not None else None
if self.gradient_checkpointing and self.training:
layer_outputs = self._gradient_checkpointing_func(
layer_module.__call__,
hidden_states,
layer_head_mask,
output_attentions,
)
else:
layer_outputs = layer_module(hidden_states, layer_head_mask, output_attentions)
hidden_states = layer_outputs[0]
if self.config.use_mid_position_embeddings:
if i < (self.config.num_hidden_layers - 1):
hidden_states = hidden_states + interpolated_mid_position_embeddings[i]
if output_attentions:
all_self_attentions = all_self_attentions + (layer_outputs[1],)
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
return BaseModelOutput(
last_hidden_state=hidden_states,
hidden_states=all_hidden_states,
attentions=all_self_attentions,
)
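# A quick sanity check (not part of the original file) of the seq_length formula used for the
# mid position embeddings in the encoder above: 1 CLS token + one token per patch + the
# detection tokens. The numbers below are illustrative, not a specific checkpoint's config.
image_height, image_width, patch_size, num_detection_tokens = 512, 864, 16, 100
seq_length = 1 + (image_height * image_width) // patch_size**2 + num_detection_tokens
print(seq_length)  # 1 + 1728 + 100 = 1829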
@add_start_docstrings(
"The bare YOLOS Model transformer outputting raw hidden-states without any specific head on top.",
YOLOS_START_DOCSTRING,
)
class YolosModel(YolosPreTrainedModel):
"""
The bare YOLOS transformer model, outputting raw hidden-states without any task-specific head on top.
See `YOLOS_START_DOCSTRING` for the shared documentation.
"""
def __init__(self, config: YolosConfig, add_pooling_layer: bool = True):
super().__init__(config)
self.config = config
self.embeddings = YolosEmbeddings(config)
self.encoder = YolosEncoder(config)
self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.pooler = YolosPooler(config) if add_pooling_layer else None
self.post_init()
def get_input_embeddings(self) -> YolosPatchEmbeddings:
return self.embeddings.patch_embeddings
def _prune_heads(self, heads_to_prune: Dict[int, List[int]]) -> None:
"""
Prunes heads of the model.
Args:
heads_to_prune (`dict` of {layer_num: list of heads to prune in this layer}):
See base class `PreTrainedModel`.
"""
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
@add_start_docstrings_to_model_forward(YOLOS_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPooling,
config_class=_CONFIG_FOR_DOC,
modality="vision",
expected_output=_EXPECTED_OUTPUT_SHAPE,
)
def forward(
self,
pixel_values: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPooling]:
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if pixel_values is None:
raise ValueError("You have to specify pixel_values")
head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
embedding_output = self.embeddings(pixel_values)
encoder_outputs = self.encoder(
embedding_output,
height=pixel_values.shape[-2],
width=pixel_values.shape[-1],
head_mask=head_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = encoder_outputs[0]
sequence_output = self.layernorm(sequence_output)
pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
if not return_dict:
head_outputs = (sequence_output, pooled_output) if pooled_output is not None else (sequence_output,)
return head_outputs + encoder_outputs[1:]
return BaseModelOutputWithPooling(
last_hidden_state=sequence_output,
pooler_output=pooled_output,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
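# A minimal usage sketch (not part of the original file) of the bare YolosModel above, assuming
# the "hustvl/yolos-small" checkpoint referenced in this file; the image path is illustrative.
import torch
from PIL import Image
from transformers import AutoImageProcessor, YolosModel

image = Image.open("example.jpg")  # hypothetical input image
image_processor = AutoImageProcessor.from_pretrained("hustvl/yolos-small")
model = YolosModel.from_pretrained("hustvl/yolos-small")

inputs = image_processor(images=image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# last_hidden_state covers the CLS token, every patch token and the detection tokens.
print(outputs.last_hidden_state.shape)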
class YolosPooler(nn.Module):
def __init__(self, config: YolosConfig):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.activation = nn.Tanh()
def forward(self, hidden_states):
first_token_tensor = hidden_states[:, 0]
pooled_output = self.dense(first_token_tensor)
pooled_output = self.activation(pooled_output)
return pooled_output
@add_start_docstrings(
"""
YOLOS Model (consisting of a ViT encoder) with object detection heads on top, for tasks such as COCO detection.
""",
YOLOS_START_DOCSTRING,
)
class YolosForObjectDetection(YolosPreTrainedModel):
def __init__(self, config: YolosConfig):
super().__init__(config)
self.vit = YolosModel(config, add_pooling_layer=False)
self.class_labels_classifier = YolosMLPPredictionHead(
input_dim=config.hidden_size, hidden_dim=config.hidden_size, output_dim=config.num_labels + 1, num_layers=3
)
self.bbox_predictor = YolosMLPPredictionHead(
input_dim=config.hidden_size, hidden_dim=config.hidden_size, output_dim=4, num_layers=3
)
self.post_init()
@torch.jit.unused
def _set_aux_loss(self, outputs_class, outputs_coord):
return [{"logits": a, "pred_boxes": b} for a, b in zip(outputs_class[:-1], outputs_coord[:-1])]
@add_start_docstrings_to_model_forward(YOLOS_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=YolosObjectDetectionOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
pixel_values: torch.FloatTensor,
labels: Optional[List[Dict]] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
pass
def dice_loss(inputs, targets, num_boxes):
"""
Compute the DICE loss, similar to generalized IOU for masks
Args:
inputs: A float tensor of arbitrary shape.
The predictions for each example.
targets: A float tensor with the same shape as inputs. Stores the binary
classification label for each element in inputs (0 for the negative class and 1 for the positive
class).
"""
inputs = inputs.sigmoid()
inputs = inputs.flatten(1)
numerator = 2 * (inputs * targets).sum(1)
denominator = inputs.sum(-1) + targets.sum(-1)
loss = 1 - (numerator + 1) / (denominator + 1)
return loss.sum() / num_boxes
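# A tiny numeric check (not part of the original file) of the dice loss above: confident,
# correct masks drive the loss towards 0, while completely wrong masks drive it towards 1
# (up to the +1 smoothing terms). The helper name is hypothetical.
import torch

def _dice_demo(logits, targets, num_boxes):
    inputs = logits.sigmoid().flatten(1)
    numerator = 2 * (inputs * targets).sum(1)
    denominator = inputs.sum(-1) + targets.sum(-1)
    return ((1 - (numerator + 1) / (denominator + 1)).sum() / num_boxes).item()

targets = torch.tensor([[1.0, 0.0, 1.0, 0.0]])
print(_dice_demo(torch.tensor([[10.0, -10.0, 10.0, -10.0]]), targets, 1))  # ~0.0
print(_dice_demo(torch.tensor([[-10.0, 10.0, -10.0, 10.0]]), targets, 1))  # ~0.8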
def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2):
"""
Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002.
Args:
inputs (`torch.FloatTensor` of arbitrary shape):
The predictions for each example.
targets (`torch.FloatTensor` with the same shape as `inputs`)
A tensor storing the binary classification label for each element in the `inputs` (0 for the negative class
and 1 for the positive class).
alpha (`float`, *optional*, defaults to `0.25`):
Optional weighting factor in the range (0,1) to balance positive vs. negative examples.
gamma (`int`, *optional*, defaults to `2`):
Exponent of the modulating factor (1 - p_t) to balance easy vs hard examples.
Returns:
Loss tensor
"""
prob = inputs.sigmoid()
ce_loss = nn.functional.binary_cross_entropy_with_logits(inputs, targets, reduction="none")
p_t = prob * targets + (1 - prob) * (1 - targets)
loss = ce_loss * ((1 - p_t) ** gamma)
if alpha >= 0:
alpha_t = alpha * targets + (1 - alpha) * (1 - targets)
loss = alpha_t * loss
return loss.mean(1).sum() / num_boxes
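# A small numeric illustration (not part of the original file) of the focal modulation above:
# the (1 - p_t) ** gamma factor shrinks the plain BCE loss of an easy, confidently correct
# prediction by several orders of magnitude. Values are illustrative.
import torch
from torch import nn

logits = torch.tensor([3.0])   # confident positive prediction
targets = torch.tensor([1.0])
prob = logits.sigmoid()
ce = nn.functional.binary_cross_entropy_with_logits(logits, targets, reduction="none")
p_t = prob * targets + (1 - prob) * (1 - targets)
focal = ce * ((1 - p_t) ** 2)  # gamma = 2
print(ce.item(), focal.item())  # ~0.0486 vs ~1.1e-4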
class YolosLoss(nn.Module):
"""
This class computes the losses for YolosForObjectDetection/YolosForSegmentation. The process happens in two steps: 1)
we compute hungarian assignment between ground truth boxes and the outputs of the model 2) we supervise each pair
of matched ground-truth / prediction (supervise class and box).
A note on the `num_classes` argument (copied from original repo in detr.py): "the naming of the `num_classes`
parameter of the criterion is somewhat misleading. It indeed corresponds to `max_obj_id` + 1, where `max_obj_id` is
the maximum id for a class in your dataset. For example, COCO has a `max_obj_id` of 90, so we pass `num_classes` to
be 91. As another example, for a dataset that has a single class with `id` 1, you should pass `num_classes` to be 2
(`max_obj_id` + 1). For more details on this, check the following discussion
https://github.com/facebookresearch/detr/issues/108#issuecomment-650269223"
Args:
matcher (`YolosHungarianMatcher`):
Module able to compute a matching between targets and proposals.
num_classes (`int`):
Number of object categories, omitting the special no-object category.
eos_coef (`float`):
Relative classification weight applied to the no-object category.
losses (`List[str]`):
List of all the losses to be applied. See `get_loss` for a list of all available losses.
"""
def __init__(self, matcher, num_classes, eos_coef, losses):
super().__init__()
self.matcher = matcher
self.num_classes = num_classes
self.eos_coef = eos_coef
self.losses = losses
empty_weight = torch.ones(self.num_classes + 1)
empty_weight[-1] = self.eos_coef
self.register_buffer("empty_weight", empty_weight)
def loss_labels(self, outputs, targets, indices, num_boxes):
"""
Classification loss (NLL). Targets dicts must contain the key "class_labels" containing a tensor of dim
[nb_target_boxes].
"""
if "logits" not in outputs:
raise KeyError("No logits were found in the outputs")
source_logits = outputs["logits"]
idx = self._get_source_permutation_idx(indices)
target_classes_o = torch.cat([t["class_labels"][J] for t, (_, J) in zip(targets, indices)])
target_classes = torch.full(
source_logits.shape[:2], self.num_classes, dtype=torch.int64, device=source_logits.device
)
target_classes[idx] = target_classes_o
loss_ce = nn.functional.cross_entropy(source_logits.transpose(1, 2), target_classes, self.empty_weight)
losses = {"loss_ce": loss_ce}
return losses
@torch.no_grad()
def loss_cardinality(self, outputs, targets, indices, num_boxes):
"""
Compute the cardinality error, i.e. the absolute error in the number of predicted non-empty boxes.
This is not really a loss, it is intended for logging purposes only. It doesn't propagate gradients.
"""
logits = outputs["logits"]
device = logits.device
target_lengths = torch.as_tensor([len(v["class_labels"]) for v in targets], device=device)
card_pred = (logits.argmax(-1) != logits.shape[-1] - 1).sum(1)
card_err = nn.functional.l1_loss(card_pred.float(), target_lengths.float())
losses = {"cardinality_error": card_err}
return losses
def loss_boxes(self, outputs, targets, indices, num_boxes):
"""
Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss.
Targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4]. The target boxes
are expected in format (center_x, center_y, w, h), normalized by the image size.
"""
if "pred_boxes" not in outputs:
raise KeyError("No predicted boxes found in outputs")
idx = self._get_source_permutation_idx(indices)
source_boxes = outputs["pred_boxes"][idx]
target_boxes = torch.cat([t["boxes"][i] for t, (_, i) in zip(targets, indices)], dim=0)
loss_bbox = nn.functional.l1_loss(source_boxes, target_boxes, reduction="none")
losses = {}
losses["loss_bbox"] = loss_bbox.sum() / num_boxes
loss_giou = 1 - torch.diag(
generalized_box_iou(center_to_corners_format(source_boxes), center_to_corners_format(target_boxes))
)
losses["loss_giou"] = loss_giou.sum() / num_boxes
return losses
def loss_masks(self, outputs, targets, indices, num_boxes):
"""
Compute the losses related to the masks: the focal loss and the dice loss.
Targets dicts must contain the key "masks" containing a tensor of dim [nb_target_boxes, h, w].
"""
if "pred_masks" not in outputs:
raise KeyError("No predicted masks found in outputs")
source_idx = self._get_source_permutation_idx(indices)
target_idx = self._get_target_permutation_idx(indices)
source_masks = outputs["pred_masks"]
source_masks = source_masks[source_idx]
masks = [t["masks"] for t in targets]
target_masks, valid = nested_tensor_from_tensor_list(masks).decompose()
target_masks = target_masks.to(source_masks)
target_masks = target_masks[target_idx]
source_masks = nn.functional.interpolate(
source_masks[:, None], size=target_masks.shape[-2:], mode="bilinear", align_corners=False
)
source_masks = source_masks[:, 0].flatten(1)
target_masks = target_masks.flatten(1)
target_masks = target_masks.view(source_masks.shape)
losses = {
"loss_mask": sigmoid_focal_loss(source_masks, target_masks, num_boxes),
"loss_dice": dice_loss(source_masks, target_masks, num_boxes),
}
return losses
def _get_source_permutation_idx(self, indices):
batch_idx = torch.cat([torch.full_like(source, i) for i, (source, _) in enumerate(indices)])
source_idx = torch.cat([source for (source, _) in indices])
return batch_idx, source_idx
def _get_target_permutation_idx(self, indices):
batch_idx = torch.cat([torch.full_like(target, i) for i, (_, target) in enumerate(indices)])
target_idx = torch.cat([target for (_, target) in indices])
return batch_idx, target_idx
def get_loss(self, loss, outputs, targets, indices, num_boxes):
loss_map = {
"labels": self.loss_labels,
"cardinality": self.loss_cardinality,
"boxes": self.loss_boxes,
"masks": self.loss_masks,
}
if loss not in loss_map:
raise ValueError(f"Loss {loss} not supported")
return loss_map[loss](outputs, targets, indices, num_boxes)
def forward(self, outputs, targets):
"""
This performs the loss computation.
Args:
outputs (`dict`, *optional*):
Dictionary of tensors, see the output specification of the model for the format.
targets (`List[dict]`, *optional*):
List of dicts, such that `len(targets) == batch_size`. The expected keys in each dict depends on the
losses applied, see each loss' doc.
"""
outputs_without_aux = {k: v for k, v in outputs.items() if k != "auxiliary_outputs"}
indices = self.matcher(outputs_without_aux, targets)
num_boxes = sum(len(t["class_labels"]) for t in targets)
num_boxes = torch.as_tensor([num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device)
world_size = 1
if is_accelerate_available():
if PartialState._shared_state != {}:
num_boxes = reduce(num_boxes)
world_size = PartialState().num_processes
num_boxes = torch.clamp(num_boxes / world_size, min=1).item()
losses = {}
for loss in self.losses:
losses.update(self.get_loss(loss, outputs, targets, indices, num_boxes))
if "auxiliary_outputs" in outputs:
for i, auxiliary_outputs in enumerate(outputs["auxiliary_outputs"]):
indices = self.matcher(auxiliary_outputs, targets)
for loss in self.losses:
if loss == "masks":
continue
l_dict = self.get_loss(loss, auxiliary_outputs, targets, indices, num_boxes)
l_dict = {k + f"_{i}": v for k, v in l_dict.items()}
losses.update(l_dict)
return losses
class YolosMLPPredictionHead(nn.Module):
"""
Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates,
height and width of a box with respect to an image.
Copied from https://github.com/facebookresearch/detr/blob/master/models/detr.py
"""
def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
super().__init__()
self.num_layers = num_layers
h = [hidden_dim] * (num_layers - 1)
self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))
def forward(self, x):
for i, layer in enumerate(self.layers):
x = nn.functional.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
return x
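# A toy instantiation (not part of the original file) showing the layer sizes produced by the
# MLP head above, with illustrative dimensions. In DETR-style detection heads, a sigmoid is
# applied to the 4 outputs afterwards to obtain normalized (center_x, center_y, width, height).
from torch import nn

input_dim, hidden_dim, output_dim, num_layers = 16, 16, 4, 3
h = [hidden_dim] * (num_layers - 1)
layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))
print([(layer.in_features, layer.out_features) for layer in layers])  # [(16, 16), (16, 16), (16, 4)]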
class YolosHungarianMatcher(nn.Module):
"""
This class computes an assignment between the targets and the predictions of the network.
For efficiency reasons, the targets don't include the no-object class. Because of this, in general there are
more predictions than targets. In this case, we do a 1-to-1 matching of the best predictions, while the others
are un-matched (and thus treated as non-objects).
Args:
    class_cost:
        The relative weight of the classification error in the matching cost.
    bbox_cost:
        The relative weight of the L1 error of the bounding box coordinates in the matching cost.
    giou_cost:
        The relative weight of the giou loss of the bounding box in the matching cost.
"""
def __init__(self, class_cost: float = 1, bbox_cost: float = 1, giou_cost: float = 1):
super().__init__()
requires_backends(self, ["scipy"])
self.class_cost = class_cost
self.bbox_cost = bbox_cost
self.giou_cost = giou_cost
if class_cost == 0 and bbox_cost == 0 and giou_cost == 0:
raise ValueError("All costs of the Matcher can't be 0")
@torch.no_grad()
def forward(self, outputs, targets):
"""
Args:
outputs (`dict`):
A dictionary that contains at least these entries:
* "logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits
* "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates.
targets (`List[dict]`):
A list of targets (len(targets) = batch_size), where each target is a dict containing:
* "class_labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of
ground-truth objects in the target) containing the class labels
* "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates.
Returns:
`List[Tuple]`: A list of size `batch_size`, containing tuples of (index_i, index_j) where:
- index_i is the indices of the selected predictions (in order)
- index_j is the indices of the corresponding selected targets (in order)
For each batch element, it holds: len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
"""
batch_size, num_queries = outputs["logits"].shape[:2]
out_prob = outputs["logits"].flatten(0, 1).softmax(-1)
out_bbox = outputs["pred_boxes"].flatten(0, 1)
target_ids = torch.cat([v["class_labels"] for v in targets])
target_bbox = torch.cat([v["boxes"] for v in targets])
class_cost = -out_prob[:, target_ids]
bbox_cost = torch.cdist(out_bbox, target_bbox, p=1)
giou_cost = -generalized_box_iou(center_to_corners_format(out_bbox), center_to_corners_format(target_bbox))
cost_matrix = self.bbox_cost * bbox_cost + self.class_cost * class_cost + self.giou_cost * giou_cost
cost_matrix = cost_matrix.view(batch_size, num_queries, -1).cpu()
sizes = [len(v["boxes"]) for v in targets]
indices = [linear_sum_assignment(c[i]) for i, c in enumerate(cost_matrix.split(sizes, -1))]
return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices]
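# A hand-sized example (not part of the original file) of the assignment step above: the matcher
# builds a (num_queries x num_targets) cost matrix and hands it to scipy's linear_sum_assignment.
import numpy as np
from scipy.optimize import linear_sum_assignment

# 3 queries, 2 ground-truth boxes; lower cost = better match.
cost = np.array([[0.9, 0.1],
                 [0.2, 0.8],
                 [0.5, 0.5]])
row_ind, col_ind = linear_sum_assignment(cost)
print(row_ind, col_ind)  # [0 1] [1 0] -> query 0 is matched to target 1, query 1 to target 0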
def _upcast(t: Tensor) -> Tensor:
if t.is_floating_point():
return t if t.dtype in (torch.float32, torch.float64) else t.float()
else:
return t if t.dtype in (torch.int32, torch.int64) else t.int()
def box_area(boxes: Tensor) -> Tensor:
"""
Computes the area of a set of bounding boxes, which are specified by their (x1, y1, x2, y2) coordinates.
Args:
    boxes (`torch.FloatTensor` of shape `(number_of_boxes, 4)`):
        Boxes for which the area will be computed. They are expected to be in (x1, y1, x2, y2) format with
        `0 <= x1 < x2` and `0 <= y1 < y2`.
Returns:
    `torch.FloatTensor`: a tensor containing the area for each box.
"""
boxes = _upcast(boxes)
return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
def box_iou(boxes1, boxes2):
area1 = box_area(boxes1)
area2 = box_area(boxes2)
left_top = torch.max(boxes1[:, None, :2], boxes2[:, :2])
right_bottom = torch.min(boxes1[:, None, 2:], boxes2[:, 2:])
width_height = (right_bottom - left_top).clamp(min=0)
inter = width_height[:, :, 0] * width_height[:, :, 1]
union = area1[:, None] + area2 - inter
iou = inter / union
return iou, union
def generalized_box_iou(boxes1, boxes2):
"""
Generalized IoU from https://giou.stanford.edu/. The boxes should be in [x0, y0, x1, y1] (top-left and
bottom-right corner) format.
Returns:
    `torch.FloatTensor`: a [N, M] pairwise matrix, where N = len(boxes1) and M = len(boxes2)
"""
if not (boxes1[:, 2:] >= boxes1[:, :2]).all():
    raise ValueError(f"boxes1 must be in [x0, y0, x1, y1] (corner) format, but got {boxes1}")
if not (boxes2[:, 2:] >= boxes2[:, :2]).all():
    raise ValueError(f"boxes2 must be in [x0, y0, x1, y1] (corner) format, but got {boxes2}")
iou, union = box_iou(boxes1, boxes2)
top_left = torch.min(boxes1[:, None, :2], boxes2[:, :2])
bottom_right = torch.max(boxes1[:, None, 2:], boxes2[:, 2:])
width_height = (bottom_right - top_left).clamp(min=0)
area = width_height[:, :, 0] * width_height[:, :, 1]
return iou - (area - union) / area
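# A quick numeric check (not part of the original file) of box_iou / generalized_box_iou defined
# above, using corner-format boxes: a disjoint pair gets IoU 0, but its GIoU is negative and still
# reflects how far apart the boxes are.
import torch

boxes1 = torch.tensor([[0.0, 0.0, 2.0, 2.0]])
boxes2 = torch.tensor([[1.0, 1.0, 3.0, 3.0],   # overlaps boxes1 in a 1x1 square
                       [3.0, 3.0, 4.0, 4.0]])  # disjoint from boxes1
iou, _ = box_iou(boxes1, boxes2)
giou = generalized_box_iou(boxes1, boxes2)
print(iou)   # tensor([[0.1429, 0.0000]]) -> 1 / (4 + 4 - 1) for the overlapping pair
print(giou)  # roughly [[-0.0794, -0.6875]]: IoU minus the empty share of the enclosing box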
def _max_by_axis(the_list):
maxes = the_list[0]
for sublist in the_list[1:]:
for index, item in enumerate(sublist):
maxes[index] = max(maxes[index], item)
return maxes
class NestedTensor(object):
def __init__(self, tensors, mask: Optional[Tensor]):
self.tensors = tensors
self.mask = mask
def to(self, device):
cast_tensor = self.tensors.to(device)
mask = self.mask
if mask is not None:
cast_mask = mask.to(device)
else:
cast_mask = None
return NestedTensor(cast_tensor, cast_mask)
def decompose(self):
return self.tensors, self.mask
def __repr__(self):
return str(self.tensors)
def nested_tensor_from_tensor_list(tensor_list: List[Tensor]):
if tensor_list[0].ndim == 3:
max_size = _max_by_axis([list(img.shape) for img in tensor_list])
batch_shape = [len(tensor_list)] + max_size
batch_size, num_channels, height, width = batch_shape
dtype = tensor_list[0].dtype
device = tensor_list[0].device
tensor = torch.zeros(batch_shape, dtype=dtype, device=device)
mask = torch.ones((batch_size, height, width), dtype=torch.bool, device=device)
for img, pad_img, m in zip(tensor_list, tensor, mask):
pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img)
m[: img.shape[1], : img.shape[2]] = False
else:
raise ValueError("Only 3-dimensional tensors are supported")
return NestedTensor(tensor, mask)
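# A short demo (not part of the original file) of nested_tensor_from_tensor_list defined above:
# two (C, H, W) images of different sizes are padded to a common shape, and the boolean mask
# marks padded pixels with True.
import torch

images = [torch.ones(3, 2, 2), torch.ones(3, 3, 4)]
tensors, mask = nested_tensor_from_tensor_list(images).decompose()
print(tensors.shape)  # torch.Size([2, 3, 3, 4]) -> padded to the per-axis maximum
print(mask[0])        # False where the first 2x2 image lives, True over the padding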