Transformers 源码解析(二十五)
.\models\clap\processing_clap.py
"""
Audio/Text processor class for CLAP
"""
from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import BatchEncoding
class ClapProcessor(ProcessorMixin):
    r"""
    Bundles a CLAP feature extractor and a RoBERTa tokenizer behind one processor object.

    Decoding (`batch_decode`, `decode`) is delegated to the wrapped tokenizer, and `model_input_names` merges the
    input names reported by the tokenizer and by the feature extractor.

    Args:
        feature_extractor ([`ClapFeatureExtractor`]):
            The audio feature extractor. Required.
        tokenizer ([`RobertaTokenizerFast`]):
            The tokenizer. Required.
    """

    feature_extractor_class = "ClapFeatureExtractor"
    tokenizer_class = ("RobertaTokenizer", "RobertaTokenizerFast")

    def __init__(self, feature_extractor, tokenizer):
        super().__init__(feature_extractor, tokenizer)

    def batch_decode(self, *args, **kwargs):
        """
        Pass every positional and keyword argument straight through to the wrapped tokenizer's `batch_decode`
        and return its result unchanged.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """
        Pass every positional and keyword argument straight through to the wrapped tokenizer's `decode` and
        return its result unchanged.
        """
        return self.tokenizer.decode(*args, **kwargs)

    @property
    def model_input_names(self):
        """Tokenizer input names followed by feature-extractor input names, de-duplicated in first-seen order."""
        combined_names = self.tokenizer.model_input_names + self.feature_extractor.model_input_names
        # dict.fromkeys drops duplicates while keeping insertion order.
        return list(dict.fromkeys(combined_names))
.\models\clap\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
# Submodule name -> public names it provides. The mapping is handed to `_LazyModule` at the bottom of the file.
_import_structure = {
    "configuration_clap": [
        "CLAP_PRETRAINED_MODEL_ARCHIVE_LIST",
        "ClapAudioConfig",
        "ClapConfig",
        "ClapTextConfig",
    ],
    "processing_clap": ["ClapProcessor"],
}

# The modeling and feature-extraction submodules are registered only when torch is available.
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    _import_structure["modeling_clap"] = [
        "CLAP_PRETRAINED_MODEL_ARCHIVE_LIST",
        "ClapModel",
        "ClapPreTrainedModel",
        "ClapTextModel",
        "ClapTextModelWithProjection",
        "ClapAudioModel",
        "ClapAudioModelWithProjection",
    ]
    _import_structure["feature_extraction_clap"] = ["ClapFeatureExtractor"]

if TYPE_CHECKING:
    # Static type checkers get real imports so every name resolves.
    from .configuration_clap import (
        CLAP_PRETRAINED_MODEL_ARCHIVE_LIST,
        ClapAudioConfig,
        ClapConfig,
        ClapTextConfig,
    )
    from .processing_clap import ClapProcessor

    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .feature_extraction_clap import ClapFeatureExtractor
        from .modeling_clap import (
            CLAP_PRETRAINED_MODEL_ARCHIVE_LIST,
            ClapAudioModel,
            ClapAudioModelWithProjection,
            ClapModel,
            ClapPreTrainedModel,
            ClapTextModel,
            ClapTextModelWithProjection,
        )
else:
    import sys

    # At runtime this module object is replaced in sys.modules by a `_LazyModule` built from the mapping above.
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\clip\configuration_clip.py
""" CLIP model configuration"""
import os
from collections import OrderedDict
from typing import TYPE_CHECKING, Any, Mapping, Optional, Union
if TYPE_CHECKING:
from ...processing_utils import ProcessorMixin
from ...utils import TensorType
from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfig
from ...utils import logging
# Module-level logger, named after this module.
logger = logging.get_logger(__name__)
# Maps a pretrained checkpoint identifier to the URL of its config.json.
CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"openai/clip-vit-base-patch32": "https://huggingface.co/openai/clip-vit-base-patch32/resolve/main/config.json",
}
class CLIPTextConfig(PretrainedConfig):
    r"""
    Configuration class that stores the settings of a [`CLIPTextModel`]. It is used to instantiate a CLIP text
    encoder according to the given arguments, which define the model architecture. Instantiating it with the
    defaults yields a configuration similar to that of the CLIP [openai/clip-vit-base-patch32] architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation of [`PretrainedConfig`] for more information.

    Every constructor argument other than the three token ids is stored on the instance under its own name; the
    token ids and any extra keyword arguments are forwarded to [`PretrainedConfig`].
    """

    model_type = "clip_text_model"

    def __init__(
        self,
        vocab_size=49408,
        hidden_size=512,
        intermediate_size=2048,
        projection_dim=512,
        num_hidden_layers=12,
        num_attention_heads=8,
        max_position_embeddings=77,
        hidden_act="quick_gelu",
        layer_norm_eps=1e-5,
        attention_dropout=0.0,
        initializer_range=0.02,
        initializer_factor=1.0,
        pad_token_id=1,
        bos_token_id=49406,
        eos_token_id=49407,
        **kwargs,
    ):
        # Special-token ids and unknown keyword arguments are handled by the base class.
        super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)

        # Architecture hyper-parameters, stored verbatim.
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.projection_dim = projection_dim
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.max_position_embeddings = max_position_embeddings
        self.layer_norm_eps = layer_norm_eps
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.initializer_factor = initializer_factor
        self.attention_dropout = attention_dropout

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
        """
        Build a text configuration from a pretrained name or path.

        When the loaded config dict has `model_type == "clip"`, only its `"text_config"` entry is used. A warning
        is logged if the resulting dict declares a `model_type` different from this class's.
        """
        cls._set_token_in_kwargs(kwargs)

        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)

        # A full CLIP config nests the text settings under "text_config".
        if config_dict.get("model_type") == "clip":
            config_dict = config_dict["text_config"]

        type_is_comparable = "model_type" in config_dict and hasattr(cls, "model_type")
        if type_is_comparable and config_dict["model_type"] != cls.model_type:
            logger.warning(
                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
                f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
            )

        return cls.from_dict(config_dict, **kwargs)
class CLIPVisionConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`CLIPVisionModel`]. It is used to instantiate a
    CLIP vision encoder according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the vision encoder of the CLIP
    [openai/clip-vit-base-patch32](https://huggingface.co/openai/clip-vit-base-patch32) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        hidden_size (`int`, *optional*, defaults to 768):
            Dimensionality of the encoder layers and the pooler layer.
        intermediate_size (`int`, *optional*, defaults to 3072):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        projection_dim (`int`, *optional*, defaults to 512):
            Dimensionality of text and vision projection layers.
        num_hidden_layers (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 12):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_channels (`int`, *optional*, defaults to 3):
            The number of input channels.
        image_size (`int`, *optional*, defaults to 224):
            The size (resolution) of each image.
        patch_size (`int`, *optional*, defaults to 32):
            The size (resolution) of each patch.
        hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"` are supported.
        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the layer normalization layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        initializer_factor (`float`, *optional*, defaults to 1.0):
            A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
            testing).

    Example:

    ```python
    >>> from transformers import CLIPVisionConfig, CLIPVisionModel

    >>> # Initializing a CLIPVisionConfig with openai/clip-vit-base-patch32 style configuration
    >>> configuration = CLIPVisionConfig()

    >>> # Initializing a CLIPVisionModel (with random weights) from the openai/clip-vit-base-patch32 style configuration
    >>> model = CLIPVisionModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "clip_vision_model"

    def __init__(
        self,
        hidden_size=768,
        intermediate_size=3072,
        projection_dim=512,
        num_hidden_layers=12,
        num_attention_heads=12,
        num_channels=3,
        image_size=224,
        patch_size=32,
        hidden_act="quick_gelu",
        layer_norm_eps=1e-5,
        attention_dropout=0.0,
        initializer_range=0.02,
        initializer_factor=1.0,
        **kwargs,
    ):
        # Remaining keyword arguments are handled by the base configuration class.
        super().__init__(**kwargs)

        # Store the architecture hyper-parameters verbatim.
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.projection_dim = projection_dim
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_channels = num_channels
        self.patch_size = patch_size
        self.image_size = image_size
        self.initializer_range = initializer_range
        self.initializer_factor = initializer_factor
        self.attention_dropout = attention_dropout
        self.layer_norm_eps = layer_norm_eps
        self.hidden_act = hidden_act

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
        """
        Build a vision configuration from a pretrained name or path.

        When the loaded config dict has `model_type == "clip"`, only its `"vision_config"` entry is used. A
        warning is logged if the resulting dict declares a `model_type` different from this class's.
        """
        cls._set_token_in_kwargs(kwargs)

        # Fetch the config dict together with the keyword arguments that remain.
        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)

        # When loading from a full CLIP config, keep only the nested vision settings.
        if config_dict.get("model_type") == "clip":
            config_dict = config_dict["vision_config"]

        # Warn when the stored model type does not match this class.
        if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
            logger.warning(
                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
                f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
            )

        return cls.from_dict(config_dict, **kwargs)
class CLIPConfig(PretrainedConfig):
    r"""
    [`CLIPConfig`] is the configuration class to store the configuration of a [`CLIPModel`]. It is used to instantiate
    a CLIP model according to the specified arguments, defining the text model and vision model configs. Instantiating
    a configuration with the defaults will yield a similar configuration to that of the CLIP
    [openai/clip-vit-base-patch32](https://huggingface.co/openai/clip-vit-base-patch32) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        text_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize [`CLIPTextConfig`]. `None` means "use the
            [`CLIPTextConfig`] defaults"; an existing [`CLIPTextConfig`] instance is stored as-is.
        vision_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize [`CLIPVisionConfig`]. `None` means "use the
            [`CLIPVisionConfig`] defaults"; an existing [`CLIPVisionConfig`] instance is stored as-is.
        projection_dim (`int`, *optional*, defaults to 512):
            Dimensionality of text and vision projection layers.
        logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
            The initial value of the *logit_scale* parameter. Default is used as per the original CLIP
            implementation.
        kwargs (*optional*):
            Dictionary of keyword arguments.

    Example:

    ```python
    >>> from transformers import CLIPConfig, CLIPModel

    >>> configuration = CLIPConfig()

    >>> model = CLIPModel(configuration)

    >>> configuration = model.config

    >>> from transformers import CLIPTextConfig, CLIPVisionConfig

    >>> config_text = CLIPTextConfig()
    >>> config_vision = CLIPVisionConfig()
    >>> config = CLIPConfig.from_text_vision_configs(config_text, config_vision)
    ```"""

    model_type = "clip"

    def __init__(
        self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs
    ):
        # Remaining keyword arguments are handled by the base configuration class.
        super().__init__(**kwargs)

        # Fall back to the sub-config defaults when nothing was supplied.
        if text_config is None:
            text_config = {}
            logger.info("`text_config` is `None`. Initializing the `CLIPTextConfig` with default values.")
        if vision_config is None:
            vision_config = {}
            logger.info("`vision_config` is `None`. Initializing the `CLIPVisionConfig` with default values.")

        # Materialize real sub-config objects so consumers can read attributes such as
        # `config.text_config.hidden_size`; already-built instances are kept untouched.
        self.text_config = (
            text_config if isinstance(text_config, CLIPTextConfig) else CLIPTextConfig(**text_config)
        )
        self.vision_config = (
            vision_config if isinstance(vision_config, CLIPVisionConfig) else CLIPVisionConfig(**vision_config)
        )

        self.projection_dim = projection_dim
        self.logit_scale_init_value = logit_scale_init_value

    @classmethod
    def from_text_vision_configs(cls, text_config: CLIPTextConfig, vision_config: CLIPVisionConfig, **kwargs):
        r"""
        Instantiate a [`CLIPConfig`] (or a derived class) from clip text model configuration and clip vision model
        configuration.

        Returns:
            [`CLIPConfig`]: An instance of a configuration object
        """
        # Both sub-configs are serialized to dicts and rebuilt by `__init__`.
        return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs)
class CLIPOnnxConfig(OnnxConfig):
    """ONNX export description for CLIP: named inputs/outputs with their dynamic axes, tolerance, dummy inputs, opset."""

    @property
    def inputs(self) -> Mapping[str, Mapping[int, str]]:
        """Ordered mapping from input name to `{axis index: axis name}`."""
        axes_by_input = [
            ("input_ids", {0: "batch", 1: "sequence"}),
            ("pixel_values", {0: "batch", 1: "num_channels", 2: "height", 3: "width"}),
            ("attention_mask", {0: "batch", 1: "sequence"}),
        ]
        return OrderedDict(axes_by_input)

    @property
    def outputs(self) -> Mapping[str, Mapping[int, str]]:
        """Ordered mapping from output name to `{axis index: axis name}`; only axis 0 ("batch") is named."""
        axes_by_output = [
            ("logits_per_image", {0: "batch"}),
            ("logits_per_text", {0: "batch"}),
            ("text_embeds", {0: "batch"}),
            ("image_embeds", {0: "batch"}),
        ]
        return OrderedDict(axes_by_output)

    @property
    def atol_for_validation(self) -> float:
        """Absolute tolerance used for validation."""
        return 1e-4

    def generate_dummy_inputs(
        self,
        processor: "ProcessorMixin",
        batch_size: int = -1,
        seq_length: int = -1,
        framework: Optional["TensorType"] = None,
    ) -> Mapping[str, Any]:
        """
        Build dummy inputs by calling the parent implementation once with `processor.tokenizer` and once with
        `processor.image_processor`, then merging the two results (image entries win on a key clash).
        """
        text_inputs = super().generate_dummy_inputs(
            processor.tokenizer, batch_size=batch_size, seq_length=seq_length, framework=framework
        )
        # `seq_length` is not forwarded for the image branch.
        image_inputs = super().generate_dummy_inputs(
            processor.image_processor, batch_size=batch_size, framework=framework
        )
        return {**text_inputs, **image_inputs}

    @property
    def default_onnx_opset(self) -> int:
        """Default ONNX opset version."""
        return 14
.\models\clip\convert_clip_original_pytorch_to_hf.py
import argparse
import torch
from clip import load
from transformers import CLIPConfig, CLIPModel
def copy_attn_layer(hf_attn_layer, pt_attn_layer):
    """
    Copy attention weights from `pt_attn_layer` into `hf_attn_layer`.

    The source's fused `in_proj_weight` / `in_proj_bias` are split into three equal chunks along dim 0 and written,
    in order, to the `q_proj`, `k_proj` and `v_proj` sub-layers. The output projection's `weight` and `bias`
    attributes are re-bound to the source's objects.
    """
    q_weight, k_weight, v_weight = pt_attn_layer.in_proj_weight.chunk(3, dim=0)
    q_bias, k_bias, v_bias = pt_attn_layer.in_proj_bias.chunk(3, dim=0)

    split_params = (
        ("q_proj", q_weight, q_bias),
        ("k_proj", k_weight, k_bias),
        ("v_proj", v_weight, v_bias),
    )
    for proj_name, weight_chunk, bias_chunk in split_params:
        projection = getattr(hf_attn_layer, proj_name)
        projection.weight.data = weight_chunk
        projection.bias.data = bias_chunk

    source_out = pt_attn_layer.out_proj
    hf_attn_layer.out_proj.weight = source_out.weight
    hf_attn_layer.out_proj.bias = source_out.bias
def copy_mlp(hf_mlp, pt_mlp):
    """Copy the two MLP linear layers: source `c_fc` -> target `fc1`, then source `c_proj` -> target `fc2`."""
    layer_pairs = (
        (hf_mlp.fc1, pt_mlp.c_fc),
        (hf_mlp.fc2, pt_mlp.c_proj),
    )
    for target, source in layer_pairs:
        copy_linear(target, source)
def copy_linear(hf_linear, pt_linear):
    """Re-bind `hf_linear.weight` and `hf_linear.bias` to the very objects held by `pt_linear` (no copy is made)."""
    for attr_name in ("weight", "bias"):
        setattr(hf_linear, attr_name, getattr(pt_linear, attr_name))
def copy_layer(hf_layer, pt_layer):
    """Copy one transformer block: both layer norms first, then the MLP, then the self-attention."""
    # Layer norms expose `weight`/`bias`, so the linear-layer helper is reused for them.
    norm_pairs = (
        (hf_layer.layer_norm1, pt_layer.ln_1),
        (hf_layer.layer_norm2, pt_layer.ln_2),
    )
    for target_norm, source_norm in norm_pairs:
        copy_linear(target_norm, source_norm)

    copy_mlp(hf_layer.mlp, pt_layer.mlp)

    copy_attn_layer(hf_layer.self_attn, pt_layer.attn)
def copy_layers(hf_layers, pt_layers):
    """Copy layers pairwise, in order; pairing stops at the shorter of the two sequences."""
    for layer_pair in zip(hf_layers, pt_layers):
        copy_layer(*layer_pair)
def copy_encoder(hf_encoder, pt_model):
    """Copy the text encoder: token and position embeddings, final layer norm, then every transformer block."""
    target_embeddings = hf_encoder.embeddings
    target_embeddings.token_embedding.weight = pt_model.token_embedding.weight
    target_embeddings.position_embedding.weight.data = pt_model.positional_embedding

    # The final layer norm exposes `weight`/`bias`, so the linear-layer helper is reused.
    copy_linear(hf_encoder.final_layer_norm, pt_model.ln_final)

    copy_layers(hf_encoder.encoder.layers, pt_model.transformer.resblocks)
def copy_text_model_and_projection(hf_model, pt_model):
    """
    Copy the text projection and the text encoder from `pt_model` into `hf_model`.

    The source projection is stored transposed relative to the target weight, hence the `.T`.
    """
    # `.T` yields a non-contiguous view; make it contiguous because the converted model is later
    # serialized with `save_pretrained`, and safetensors rejects non-contiguous tensors.
    hf_model.text_projection.weight.data = pt_model.text_projection.data.T.contiguous()

    copy_encoder(hf_model.text_model, pt_model)
def copy_vison_model_and_projection(hf_model, pt_model):
    """
    Copy the visual projection and the vision tower from `pt_model` into `hf_model`.

    Copies, in order: the projection matrix (transposed), the pre/post layer norms, the patch, class and
    position embeddings, and every transformer block.
    """
    # `.T` yields a non-contiguous view; make it contiguous because the converted model is later
    # serialized with `save_pretrained`, and safetensors rejects non-contiguous tensors.
    hf_model.visual_projection.weight.data = pt_model.visual.proj.data.T.contiguous()

    # Layer norms. `pre_layrnorm` (sic) is spelled the way the target attribute is referenced in this file.
    # The former extra copy to `hf_model.visual_model.layer_norm` from `pt_model.visual.ln` was dropped:
    # every other access here goes through `vision_model` / `ln_pre` / `ln_post`.
    copy_linear(hf_model.vision_model.pre_layrnorm, pt_model.visual.ln_pre)
    copy_linear(hf_model.vision_model.post_layernorm, pt_model.visual.ln_post)

    # Embeddings.
    hf_embeddings = hf_model.vision_model.embeddings
    hf_embeddings.patch_embedding.weight.data = pt_model.visual.conv1.weight.data
    hf_embeddings.class_embedding = pt_model.visual.class_embedding
    hf_embeddings.position_embedding.weight.data = pt_model.visual.positional_embedding.data

    # Transformer blocks.
    copy_layers(hf_model.vision_model.encoder.layers, pt_model.visual.transformer.resblocks)
@torch.no_grad()
def convert_clip_checkpoint(checkpoint_path, pytorch_dump_folder_path, config_path=None):
    """
    Copy/paste/tweak model's weights to transformers design.

    Builds a `CLIPModel` (from `config_path` when given, otherwise from a default config), copies the weights of
    the checkpoint loaded from `checkpoint_path` into it, checks that both models produce matching logits on one
    dummy batch, and saves the result to `pytorch_dump_folder_path`.
    """
    if config_path is None:
        config = CLIPConfig(projection_dim=512, text_config={}, vision_config={})
    else:
        config = CLIPConfig.from_pretrained(config_path)

    hf_model = CLIPModel(config).eval()

    pt_model, _ = load(checkpoint_path, device="cpu", jit=False)
    pt_model = pt_model.eval()

    # Transfer the text tower, the vision tower, then the logit scale.
    copy_text_model_and_projection(hf_model, pt_model)
    copy_vison_model_and_projection(hf_model, pt_model)
    hf_model.logit_scale = pt_model.logit_scale

    # One dummy batch: token ids 0..76 and a random 3x224x224 image.
    input_ids = torch.arange(0, 77).unsqueeze(0)
    pixel_values = torch.randn(1, 3, 224, 224)

    hf_outputs = hf_model(input_ids=input_ids, pixel_values=pixel_values, return_dict=True)
    pt_logits_per_image, pt_logits_per_text = pt_model(pixel_values, input_ids)

    # Both implementations must agree before anything is written to disk.
    assert torch.allclose(hf_outputs.logits_per_image, pt_logits_per_image, atol=1e-3)
    assert torch.allclose(hf_outputs.logits_per_text, pt_logits_per_text, atol=1e-3)

    hf_model.save_pretrained(pytorch_dump_folder_path)
if __name__ == "__main__":
    # Command-line entry point: three optional string arguments, each defaulting to None.
    parser = argparse.ArgumentParser()
    parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.")
    parser.add_argument("--checkpoint_path", default=None, type=str, help="Path to fairseq checkpoint")
    parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert")
    args = parser.parse_args()

    convert_clip_checkpoint(
        checkpoint_path=args.checkpoint_path,
        pytorch_dump_folder_path=args.pytorch_dump_folder_path,
        config_path=args.config_path,
    )
.\models\clip\feature_extraction_clip.py
"""Feature extractor class for CLIP."""
import warnings
from ...utils import logging
from .image_processing_clip import CLIPImageProcessor
# Module-level logger, named after this module.
logger = logging.get_logger(__name__)
class CLIPFeatureExtractor(CLIPImageProcessor):
    """Deprecated name for [`CLIPImageProcessor`]; constructing it emits a `FutureWarning` and otherwise defers to the parent."""

    def __init__(self, *args, **kwargs) -> None:
        deprecation_message = (
            "The class CLIPFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please"
            " use CLIPImageProcessor instead."
        )
        warnings.warn(deprecation_message, FutureWarning)
        super().__init__(*args, **kwargs)
.\models\clip\image_processing_clip.py
from typing import Dict, List, Optional, Union
import numpy as np
from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
from ...image_transforms import (
convert_to_rgb,
get_resize_output_image_size,
resize,
to_channel_dimension_format,
)
from ...image_utils import (
OPENAI_CLIP_MEAN,
OPENAI_CLIP_STD,
ChannelDimension,
ImageInput,
PILImageResampling,
infer_channel_dimension_format,
is_scaled_image,
make_list_of_images,
to_numpy_array,
valid_images,
validate_kwargs,
validate_preprocess_arguments,
)
from ...utils import TensorType, is_vision_available, logging
# Module-level logger, named after this module.
logger = logging.get_logger(__name__)
if is_vision_available():
import PIL
class CLIPImageProcessor(BaseImageProcessor):
    r"""
    Constructs a CLIP image processor.

    Args:
        do_resize (`bool`, *optional*, defaults to `True`):
            Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by
            `do_resize` in the `preprocess` method.
        size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 224}`):
            Size of the image after resizing. The shortest edge of the image is resized to size["shortest_edge"], with
            the longest edge resized to keep the input aspect ratio. Can be overridden by `size` in the `preprocess`
            method.
        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
            Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method.
        do_center_crop (`bool`, *optional*, defaults to `True`):
            Whether to center crop the image to the specified `crop_size`. Can be overridden by `do_center_crop` in the
            `preprocess` method.
        crop_size (`Dict[str, int]` *optional*, defaults to 224):
            Size of the output image after applying `center_crop`. Can be overridden by `crop_size` in the `preprocess`
            method.
        do_rescale (`bool`, *optional*, defaults to `True`):
            Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in
            the `preprocess` method.
        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
            Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess`
            method.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether to normalize the image. Can be overridden by `do_normalize` in the `preprocess` method.
        image_mean (`float` or `List[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`):
            Mean to use if normalizing the image. This is a float or list of floats the length of the number of
            channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
        image_std (`float` or `List[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`):
            Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
            number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
        do_convert_rgb (`bool`, *optional*, defaults to `True`):
            Whether to convert the image to RGB.
    """

    model_input_names = ["pixel_values"]

    def __init__(
        self,
        do_resize: bool = True,
        size: Optional[Dict[str, int]] = None,
        resample: PILImageResampling = PILImageResampling.BICUBIC,
        do_center_crop: bool = True,
        crop_size: Optional[Dict[str, int]] = None,
        do_rescale: bool = True,
        rescale_factor: Union[int, float] = 1 / 255,
        do_normalize: bool = True,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: bool = True,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)

        # Fill in defaults, then normalize both sizes through `get_size_dict`.
        size = size if size is not None else {"shortest_edge": 224}
        size = get_size_dict(size, default_to_square=False)
        crop_size = crop_size if crop_size is not None else {"height": 224, "width": 224}
        crop_size = get_size_dict(crop_size, default_to_square=True, param_name="crop_size")

        self.do_resize = do_resize
        self.size = size
        self.resample = resample
        self.do_center_crop = do_center_crop
        self.crop_size = crop_size
        self.do_rescale = do_rescale
        self.rescale_factor = rescale_factor
        self.do_normalize = do_normalize
        self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
        self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
        self.do_convert_rgb = do_convert_rgb
        # Names of the keyword arguments this processor recognizes.
        self._valid_processor_keys = [
            "images",
            "do_resize",
            "size",
            "resample",
            "do_center_crop",
            "crop_size",
            "do_rescale",
            "rescale_factor",
            "do_normalize",
            "image_mean",
            "image_std",
            "do_convert_rgb",
            "return_tensors",
            "data_format",
            "input_data_format",
        ]

        # When the `use_square_size` keyword is present, the resize target becomes a square whose side is
        # the configured shortest edge.
        if "use_square_size" in kwargs:
            self.size = {"height": size["shortest_edge"], "width": size["shortest_edge"]}

    def resize(
        self,
        image: np.ndarray,
        size: Dict[str, int],
        resample: PILImageResampling = PILImageResampling.BICUBIC,
        data_format: Optional[Union[str, ChannelDimension]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,
    ) -> np.ndarray:
        """
        Resize an image. The shortest edge of the image is resized to size["shortest_edge"], with the longest edge
        resized to keep the input aspect ratio.

        Args:
            image (`np.ndarray`):
                Image to resize.
            size (`Dict[str, int]`):
                Size of the output image. Can specify 'shortest_edge' or 'height' and 'width'.
            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
                Resampling filter to use when resizing the image.
            data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format of the image. If not provided, it will be the same as the input image.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format of the input image. If not provided, it will be inferred.

        Returns:
            `np.ndarray`: The resized image.

        Raises:
            ValueError: If `size` contains neither 'shortest_edge' nor both 'height' and 'width'.
        """
        default_to_square = True
        if "shortest_edge" in size:
            # A single number: the output size is derived from it while keeping the aspect ratio.
            size = size["shortest_edge"]
            default_to_square = False
        elif "height" in size and "width" in size:
            size = (size["height"], size["width"])
        else:
            raise ValueError("Size must contain either 'shortest_edge' or 'height' and 'width'.")

        output_size = get_resize_output_image_size(
            image,
            size=size,
            default_to_square=default_to_square,
            input_data_format=input_data_format,
        )
        # Inside the method body `resize` resolves to the module-level image transform, not this method.
        return resize(
            image,
            size=output_size,
            resample=resample,
            data_format=data_format,
            input_data_format=input_data_format,
            **kwargs,
        )

    # The method was also spelled `resize_image` in the damaged source; keep that name working as an alias.
    resize_image = resize
.\models\clip\modeling_clip.py
""" PyTorch CLIP model."""
from dataclasses import dataclass
from typing import Any, Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN
from ...modeling_attn_mask_utils import _create_4d_causal_attention_mask, _prepare_4d_attention_mask
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput
from ...modeling_utils import PreTrainedModel
from ...utils import (
ModelOutput,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_clip import CLIPConfig, CLIPTextConfig, CLIPVisionConfig
# Module-level logger, named after this module.
logger = logging.get_logger(__name__)
# Names and values presumably consumed by the docstring helpers imported above — confirm against their use
# further down the file.
_CONFIG_FOR_DOC = "CLIPConfig"
_CHECKPOINT_FOR_DOC = "openai/clip-vit-base-patch32"
_IMAGE_CLASS_CHECKPOINT = "openai/clip-vit-base-patch32"
_IMAGE_CLASS_EXPECTED_OUTPUT = "LABEL_0"
# Identifiers of the pretrained CLIP checkpoints listed by this module.
CLIP_PRETRAINED_MODEL_ARCHIVE_LIST = [
"openai/clip-vit-base-patch32",
]
def contrastive_loss(logits: torch.Tensor) -> torch.Tensor:
    """Cross-entropy of `logits` against the targets `0, 1, ..., len(logits) - 1`, i.e. row `i` should select class `i`."""
    diagonal_targets = torch.arange(len(logits), device=logits.device)
    return nn.functional.cross_entropy(logits, diagonal_targets)
def clip_loss(similarity: torch.Tensor) -> torch.Tensor:
    """Average of the contrastive loss over `similarity` and over its transpose."""
    loss_over_rows = contrastive_loss(similarity)
    loss_over_columns = contrastive_loss(similarity.t())
    return (loss_over_rows + loss_over_columns) / 2.0
@dataclass
class CLIPVisionModelOutput(ModelOutput):
    """
    Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden
    states.

    Args:
        image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
            The image embeddings obtained by applying the projection layer to the pooler_output.
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    image_embeds: Optional[torch.FloatTensor] = None
    last_hidden_state: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
@dataclass
class CLIPTextModelOutput(ModelOutput):
    """
    Output container for the text model: the usual encoder outputs plus projected text embeddings.

    Args:
        text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
            Text embeddings produced by applying the projection layer to the pooler_output.
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Hidden-states at the output of the model's last layer.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            One `torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)` for the embedding output
            (if the model has an embedding layer) and one for the output of each layer.

            These are the per-layer hidden-states plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            One `torch.FloatTensor` per layer, of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the
            self-attention heads.
    """

    # Field order is part of the interface (positional / tuple access) and is kept as declared.
    text_embeds: Optional[torch.FloatTensor] = None
    last_hidden_state: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
@dataclass
class CLIPOutput(ModelOutput):
    """
    Output container for the combined CLIP model.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
            Contrastive loss for image-text similarity.
        logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
            The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
            similarity scores.
        logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
            The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
            similarity scores.
        text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
            The text embeddings obtained by applying the projection layer to the pooled output of [`CLIPTextModel`].
        image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
            The image embeddings obtained by applying the projection layer to the pooled output of [`CLIPVisionModel`].
        text_model_output (`BaseModelOutputWithPooling`):
            The output of the [`CLIPTextModel`].
        vision_model_output (`BaseModelOutputWithPooling`):
            The output of the [`CLIPVisionModel`].
    """

    # Contrastive loss; the only field annotated as optional.
    loss: Optional[torch.FloatTensor] = None
    # Image-to-text similarity scores.
    logits_per_image: torch.FloatTensor = None
    # Text-to-image similarity scores.
    logits_per_text: torch.FloatTensor = None
    # Projected text embeddings.
    text_embeds: torch.FloatTensor = None
    # Projected image embeddings.
    image_embeds: torch.FloatTensor = None
    # Full output object of the text model.
    text_model_output: BaseModelOutputWithPooling = None
    # Full output object of the vision model.
    vision_model_output: BaseModelOutputWithPooling = None

    def to_tuple(self) -> Tuple[Any]:
        """
        Return the fields listed by `self.keys()` as a tuple.

        The two nested model outputs are themselves flattened through their own `to_tuple()`; every other field
        is included as-is.
        """
        nested_outputs = ["text_model_output", "vision_model_output"]
        return tuple(
            self[k] if k not in nested_outputs else getattr(self, k).to_tuple()
            for k in self.keys()
        )
class CLIPVisionEmbeddings(nn.Module):
    """Turns pixel values into a sequence of patch embeddings with a leading class token and position embeddings."""

    def __init__(self, config: CLIPVisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.image_size = config.image_size
        self.patch_size = config.patch_size

        # Learnable class-token vector, prepended to every patch sequence in `forward`.
        self.class_embedding = nn.Parameter(torch.randn(self.embed_dim))

        # Kernel size equals stride, so each output position of the convolution covers one
        # non-overlapping patch of the input.
        self.patch_embedding = nn.Conv2d(
            in_channels=config.num_channels,
            out_channels=self.embed_dim,
            kernel_size=self.patch_size,
            stride=self.patch_size,
            bias=False,
        )

        # One position per patch, plus one for the class token.
        patches_per_side = self.image_size // self.patch_size
        self.num_patches = patches_per_side**2
        self.num_positions = self.num_patches + 1
        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)

        # Non-persistent buffer holding the indices 0..num_positions-1 with shape (1, num_positions).
        all_positions = torch.arange(self.num_positions).expand((1, -1))
        self.register_buffer("position_ids", all_positions, persistent=False)

    def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
        """
        Embed `pixel_values` and return the class-token-prefixed sequence with position embeddings added.
        """
        # Cast the input to the dtype of the convolution weights before projecting.
        weight_dtype = self.patch_embedding.weight.dtype
        patches = self.patch_embedding(pixel_values.to(dtype=weight_dtype))  # shape = [*, width, grid, grid]
        # Collapse the spatial grid into one sequence axis and move channels last.
        patches = patches.flatten(2).transpose(1, 2)

        # Broadcast the single class vector to one token per sample.
        cls_tokens = self.class_embedding.expand(pixel_values.shape[0], 1, -1)
        sequence = torch.cat([cls_tokens, patches], dim=1)
        return sequence + self.position_embedding(self.position_ids)
class CLIPTextEmbeddings(nn.Module):
    """Sums token embeddings and learned position embeddings for the text tower."""

    def __init__(self, config: CLIPTextConfig):
        super().__init__()
        embed_dim = config.hidden_size

        self.token_embedding = nn.Embedding(config.vocab_size, embed_dim)
        self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim)

        # Non-persistent buffer holding the indices 0..max_position_embeddings-1 with shape (1, max_positions).
        all_positions = torch.arange(config.max_position_embeddings).expand((1, -1))
        self.register_buffer("position_ids", all_positions, persistent=False)

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
    ) -> torch.Tensor:
        """
        Return `inputs_embeds` (looked up from `input_ids` when not given) plus the position embeddings.

        When `position_ids` is omitted, the first `seq_length` entries of the registered buffer are used.
        """
        # The sequence length comes from whichever of the two inputs was supplied.
        if input_ids is not None:
            seq_length = input_ids.shape[-1]
        else:
            seq_length = inputs_embeds.shape[-2]

        if position_ids is None:
            position_ids = self.position_ids[:, :seq_length]

        if inputs_embeds is None:
            inputs_embeds = self.token_embedding(input_ids)

        return inputs_embeds + self.position_embedding(position_ids)
class CLIPAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        # The heads must tile the embedding dimension exactly.
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        # 1/sqrt(head_dim), applied to the queries before the dot product.
        self.scale = self.head_dim**-0.5
        self.dropout = config.attention_dropout

        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)

    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        """Reshape `(bsz, seq_len, embed_dim)` into `(bsz, num_heads, seq_len, head_dim)`."""
        per_head = tensor.view(bsz, seq_len, self.num_heads, self.head_dim)
        return per_head.transpose(1, 2).contiguous()

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        causal_attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        """
        Self-attention over `hidden_states`.

        Args:
            hidden_states (`torch.Tensor`): input of shape `(batch, seq_len, embed_dim)`.
            attention_mask (`torch.Tensor`, *optional*): additive mask of shape `(batch, 1, tgt_len, src_len)`.
            causal_attention_mask (`torch.Tensor`, *optional*): additive mask of the same shape; it is added
                before `attention_mask`.
            output_attentions (`bool`, *optional*): whether to also return the attention probabilities.

        Returns:
            A pair `(attn_output, attn_weights)`. `attn_output` has the shape of `hidden_states`; `attn_weights`
            has shape `(batch, num_heads, tgt_len, src_len)` or is `None` when `output_attentions` is falsy.

        Raises:
            ValueError: if a provided mask does not have shape `(batch, 1, tgt_len, src_len)`.
        """
        bsz, tgt_len, embed_dim = hidden_states.size()

        # Project and split into heads; the queries are pre-scaled by 1/sqrt(head_dim).
        query_states = self._shape(self.q_proj(hidden_states) * self.scale, tgt_len, bsz)
        key_states = self._shape(self.k_proj(hidden_states), tgt_len, bsz)
        value_states = self._shape(self.v_proj(hidden_states), tgt_len, bsz)
        src_len = key_states.size(2)

        # (bsz, num_heads, tgt_len, src_len) raw scores.
        attn_weights = torch.matmul(query_states, key_states.transpose(-1, -2))

        # Both masks are additive and broadcast over the head axis.
        for mask in (causal_attention_mask, attention_mask):
            if mask is None:
                continue
            if mask.size() != (bsz, 1, tgt_len, src_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {mask.size()}"
                )
            attn_weights = attn_weights + mask

        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
        # Dropout applies to the probabilities used for mixing; the returned weights are pre-dropout.
        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)

        attn_output = torch.matmul(attn_probs, value_states)
        # Merge the heads back: (bsz, num_heads, tgt_len, head_dim) -> (bsz, tgt_len, embed_dim).
        attn_output = attn_output.transpose(1, 2).reshape(bsz, tgt_len, embed_dim)
        attn_output = self.out_proj(attn_output)

        return attn_output, (attn_weights if output_attentions else None)
class CLIPMLP(nn.Module):
    """Feed-forward sub-block: `fc1`, the configured activation, then `fc2`."""

    def __init__(self, config):
        super().__init__()
        self.config = config
        # The activation is looked up by name in the ACT2FN table.
        self.activation_fn = ACT2FN[config.hidden_act]
        # Expand to the intermediate width, then project back to the hidden width.
        self.fc1 = nn.Linear(in_features=config.hidden_size, out_features=config.intermediate_size)
        self.fc2 = nn.Linear(in_features=config.intermediate_size, out_features=config.hidden_size)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Apply `fc1`, the activation and `fc2` in sequence and return the result."""
        activated = self.activation_fn(self.fc1(hidden_states))
        return self.fc2(activated)
class CLIPEncoderLayer(nn.Module):
    """One pre-norm transformer layer: self-attention and an MLP, each wrapped in a residual connection."""

    def __init__(self, config: CLIPConfig):
        super().__init__()
        self.embed_dim = config.hidden_size
        self.self_attn = CLIPAttention(config)
        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
        self.mlp = CLIPMLP(config)
        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        causal_attention_mask: torch.Tensor,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.FloatTensor]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            causal_attention_mask (`torch.FloatTensor`): second mask, forwarded unchanged to the attention module.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.

        Returns:
            `(hidden_states,)`, or `(hidden_states, attn_weights)` when `output_attentions` is truthy.
        """
        # Attention sub-block: normalize, attend, add the input back.
        attn_output, attn_weights = self.self_attn(
            hidden_states=self.layer_norm1(hidden_states),
            attention_mask=attention_mask,
            causal_attention_mask=causal_attention_mask,
            output_attentions=output_attentions,
        )
        hidden_states = hidden_states + attn_output

        # Feed-forward sub-block: normalize, transform, add the input back.
        mlp_output = self.mlp(self.layer_norm2(hidden_states))
        hidden_states = hidden_states + mlp_output

        if output_attentions:
            return (hidden_states, attn_weights)
        return (hidden_states,)
class CLIPPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = CLIPConfig
    base_model_prefix = "clip"
    supports_gradient_checkpointing = True

    def _init_weights(self, module):
        """Initialize the weights of `module` according to its type."""
        factor = self.config.initializer_factor

        if isinstance(module, CLIPTextEmbeddings):
            # Both embedding tables share the same normal distribution.
            for table in (module.token_embedding, module.position_embedding):
                table.weight.data.normal_(mean=0.0, std=factor * 0.02)
        elif isinstance(module, CLIPVisionEmbeddings):
            range_std = module.config.initializer_range * factor
            nn.init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor)
            nn.init.normal_(module.patch_embedding.weight, std=range_std)
            nn.init.normal_(module.position_embedding.weight, std=range_std)
        elif isinstance(module, CLIPAttention):
            # q/k/v use a std that also shrinks with the number of layers; the output projection does not.
            qkv_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
            output_std = (module.embed_dim**-0.5) * factor
            nn.init.normal_(module.q_proj.weight, std=qkv_std)
            nn.init.normal_(module.k_proj.weight, std=qkv_std)
            nn.init.normal_(module.v_proj.weight, std=qkv_std)
            nn.init.normal_(module.out_proj.weight, std=output_std)
        elif isinstance(module, CLIPMLP):
            hidden_size = module.config.hidden_size
            fc2_std = (hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
            fc1_std = (2 * hidden_size) ** -0.5 * factor
            nn.init.normal_(module.fc1.weight, std=fc1_std)
            nn.init.normal_(module.fc2.weight, std=fc2_std)
        elif isinstance(module, CLIPModel):
            # Each projection is scaled by the width of the tower it reads from.
            nn.init.normal_(module.text_projection.weight, std=module.text_embed_dim**-0.5 * factor)
            nn.init.normal_(module.visual_projection.weight, std=module.vision_embed_dim**-0.5 * factor)
        elif isinstance(module, CLIPVisionModelWithProjection):
            nn.init.normal_(module.visual_projection.weight, std=self.config.hidden_size**-0.5 * factor)
        elif isinstance(module, CLIPTextModelWithProjection):
            nn.init.normal_(module.text_projection.weight, std=self.config.hidden_size**-0.5 * factor)

        # The two checks below are independent of the chain above and run for every module.
        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        if isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.data.zero_()
# Raw docstring text introducing the model and its `config` parameter; it is passed to
# `add_start_docstrings` on the model classes.
CLIP_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`CLIPConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
# Raw docstring text describing the text-side forward arguments; it is passed to
# `add_start_docstrings_to_model_forward` on the text forward methods.
CLIP_TEXT_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
config.max_position_embeddings - 1]`.
[What are position IDs?](../glossary#position-ids)
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
# Raw docstring text describing the vision-side forward arguments (`pixel_values` plus the
# output-control flags); it is passed to `add_start_docstrings_to_model_forward` on the vision forward methods.
CLIP_VISION_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
[`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
# Raw docstring text describing the arguments of the joint text+image forward pass
# (text inputs, `pixel_values`, `return_loss` and the output-control flags).
CLIP_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
config.max_position_embeddings - 1]`.
[What are position IDs?](../glossary#position-ids)
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
[`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
return_loss (`bool`, *optional*):
Whether or not to return the contrastive loss.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
class CLIPEncoder(nn.Module):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`CLIPEncoderLayer`].

    Args:
        config: CLIPConfig
    """

    def __init__(self, config: CLIPConfig):
        super().__init__()
        self.config = config
        # Build the stack first, then register it as a ModuleList.
        stacked_layers = [CLIPEncoderLayer(config) for _ in range(config.num_hidden_layers)]
        self.layers = nn.ModuleList(stacked_layers)
        self.gradient_checkpointing = False

    def forward(
        self,
        inputs_embeds,
        attention_mask: Optional[torch.Tensor] = None,
        causal_attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        """
        Intended to run `inputs_embeds` through every encoder layer.

        Args:
            inputs_embeds: the embedded input tokens.
            attention_mask: masks padding tokens from the attention calculation.
            causal_attention_mask: masks future tokens for autoregressive tasks.
            output_attentions: controls whether to output the attention tensors.
            output_hidden_states: controls whether to output the hidden states of the layers.
            return_dict: controls whether to return a ModelOutput or a tuple.
        """
        # NOTE(review): this body is a placeholder and returns None; the layers in `self.layers`
        # are never invoked here. Callers that index the result will fail until it is implemented.
        return None
class CLIPTextTransformer(nn.Module):
    """Text tower: token/position embeddings, the encoder stack and a final layer norm."""

    def __init__(self, config: CLIPTextConfig):
        super().__init__()
        self.config = config

        self.embeddings = CLIPTextEmbeddings(config)
        self.encoder = CLIPEncoder(config)
        # The final normalization works on the hidden width of the model.
        self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

        # Kept for computing `pooled_output`.
        self.eos_token_id = config.eos_token_id
# 前向传播方法,使用装饰器将其文档字符串添加到模型的前向传播方法中
# 使用 CLIP_TEXT_INPUTS_DOCSTRING 描述输入参数
# 使用 replace_return_docstrings 装饰器,指定输出类型为 BaseModelOutputWithPooling,并使用 CLIPTextConfig 类描述配置
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
# Text-only CLIP model: a thin wrapper that delegates to `CLIPTextTransformer`.
@add_start_docstrings(
    """The text model from CLIP without any head or projection on top.""",
    CLIP_START_DOCSTRING,
)
class CLIPTextModel(CLIPPreTrainedModel):
    config_class = CLIPTextConfig

    # Module class names listed as not to be split.
    _no_split_modules = ["CLIPTextEmbeddings", "CLIPEncoderLayer"]

    def __init__(self, config: CLIPTextConfig):
        super().__init__(config)
        self.text_model = CLIPTextTransformer(config)
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        """Return the token embedding module of the wrapped text transformer."""
        text_embeddings = self.text_model.embeddings
        return text_embeddings.token_embedding

    def set_input_embeddings(self, value):
        """Replace the token embedding module of the wrapped text transformer with `value`."""
        text_embeddings = self.text_model.embeddings
        text_embeddings.token_embedding = value

    @add_start_docstrings_to_model_forward(CLIP_TEXT_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPTextConfig)
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        r"""
        Returns:

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, CLIPTextModel

        >>> model = CLIPTextModel.from_pretrained("openai/clip-vit-base-patch32")
        >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output
        ```"""
        # Only `return_dict` is resolved here; the other flags are passed through as given.
        if return_dict is None:
            return_dict = self.config.use_return_dict

        forward_kwargs = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "position_ids": position_ids,
            "output_attentions": output_attentions,
            "output_hidden_states": output_hidden_states,
            "return_dict": return_dict,
        }
        return self.text_model(**forward_kwargs)
class CLIPVisionTransformer(nn.Module):
    """Vision tower: patch embeddings, a pre-encoder layer norm, the encoder stack and a post layer norm."""

    def __init__(self, config: CLIPVisionConfig):
        super().__init__()
        self.config = config
        norm_width = config.hidden_size

        self.embeddings = CLIPVisionEmbeddings(config)
        # The attribute name `pre_layrnorm` is kept exactly as spelled; renaming it would change
        # the parameter names this module exposes.
        self.pre_layrnorm = nn.LayerNorm(norm_width, eps=config.layer_norm_eps)
        self.encoder = CLIPEncoder(config)
        self.post_layernorm = nn.LayerNorm(norm_width, eps=config.layer_norm_eps)

    @add_start_docstrings_to_model_forward(CLIP_VISION_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPVisionConfig)
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        r"""
        Run the vision tower on `pixel_values`.

        Returns:

        """
        # Any flag left as None falls back to the value stored on the config.
        cfg = self.config
        if output_attentions is None:
            output_attentions = cfg.output_attentions
        if output_hidden_states is None:
            output_hidden_states = cfg.output_hidden_states
        if return_dict is None:
            return_dict = cfg.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        # Embed, then normalize before entering the encoder.
        embedded = self.pre_layrnorm(self.embeddings(pixel_values))

        encoder_outputs = self.encoder(
            inputs_embeds=embedded,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        last_hidden_state = encoder_outputs[0]
        # The pooled output is the first sequence position, passed through the post layer norm.
        pooled_output = self.post_layernorm(last_hidden_state[:, 0, :])

        if return_dict:
            return BaseModelOutputWithPooling(
                last_hidden_state=last_hidden_state,
                pooler_output=pooled_output,
                hidden_states=encoder_outputs.hidden_states,
                attentions=encoder_outputs.attentions,
            )
        # Tuple form: the two main tensors followed by whatever else the encoder returned.
        return (last_hidden_state, pooled_output) + encoder_outputs[1:]
@add_start_docstrings(
    """The vision model from CLIP without any head or projection on top.""",
    CLIP_START_DOCSTRING,
)
class CLIPVisionModel(CLIPPreTrainedModel):
    # Vision-only CLIP model: a thin wrapper that delegates to `CLIPVisionTransformer`.
    config_class = CLIPVisionConfig
    main_input_name = "pixel_values"
    # Module class names listed as not to be split.
    _no_split_modules = ["CLIPEncoderLayer"]

    def __init__(self, config: CLIPVisionConfig):
        super().__init__(config)
        self.vision_model = CLIPVisionTransformer(config)
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        """Return the patch embedding module of the wrapped vision transformer."""
        vision_embeddings = self.vision_model.embeddings
        return vision_embeddings.patch_embedding

    @add_start_docstrings_to_model_forward(CLIP_VISION_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPVisionConfig)
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        r"""
        Returns:

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, CLIPVisionModel

        >>> model = CLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32")
        >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output
        ```"""
        # Only `return_dict` is resolved here; the other flags are passed through as given.
        if return_dict is None:
            return_dict = self.config.use_return_dict

        forward_kwargs = {
            "pixel_values": pixel_values,
            "output_attentions": output_attentions,
            "output_hidden_states": output_hidden_states,
            "return_dict": return_dict,
        }
        return self.vision_model(**forward_kwargs)
# Joint CLIP model: a text tower and a vision tower, each followed by a bias-free linear projection.
@add_start_docstrings(CLIP_START_DOCSTRING)
class CLIPModel(CLIPPreTrainedModel):
    config_class = CLIPConfig
    # Module class names listed as not to be split.
    _no_split_modules = ["CLIPTextEmbeddings", "CLIPEncoderLayer"]

    def __init__(self, config: CLIPConfig):
        super().__init__(config)

        # Both sub-configs must be of the exact expected config classes.
        if not isinstance(config.text_config, CLIPTextConfig):
            raise ValueError(
                "config.text_config is expected to be of type CLIPTextConfig but is of type"
                f" {type(config.text_config)}."
            )
        if not isinstance(config.vision_config, CLIPVisionConfig):
            raise ValueError(
                "config.vision_config is expected to be of type CLIPVisionConfig but is of type"
                f" {type(config.vision_config)}."
            )

        text_config, vision_config = config.text_config, config.vision_config

        self.projection_dim = config.projection_dim
        self.text_embed_dim = text_config.hidden_size
        self.vision_embed_dim = vision_config.hidden_size

        self.text_model = CLIPTextTransformer(text_config)
        self.vision_model = CLIPVisionTransformer(vision_config)

        # Both projections map a tower's hidden width onto the shared `projection_dim`.
        self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
        self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)

        # Learnable scalar, initialized from the config.
        initial_scale = torch.tensor(self.config.logit_scale_init_value)
        self.logit_scale = nn.Parameter(initial_scale)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(CLIP_TEXT_INPUTS_DOCSTRING)
    def get_text_features(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> torch.FloatTensor:
        r"""
        Returns:
            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The text embeddings obtained by
            applying the projection layer to the pooled output of [`CLIPTextModel`].

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, CLIPModel

        >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
        >>> text_features = model.get_text_features(**inputs)
        ```"""
        # Use CLIP model's config for some fields (if specified) instead of those of vision & text components.
        cfg = self.config
        if output_attentions is None:
            output_attentions = cfg.output_attentions
        if output_hidden_states is None:
            output_hidden_states = cfg.output_hidden_states
        if return_dict is None:
            return_dict = cfg.use_return_dict

        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Element 1 of the text output is used as the pooled representation.
        return self.text_projection(text_outputs[1])

    @add_start_docstrings_to_model_forward(CLIP_VISION_INPUTS_DOCSTRING)
    def get_image_features(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> torch.FloatTensor:
        r"""
        Returns:
            image_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The image embeddings obtained
            by applying the projection layer to the pooled output of [`CLIPVisionModel`].

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, CLIPModel

        >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> image_features = model.get_image_features(**inputs)
        ```"""
        # Use CLIP model's config for some fields (if specified) instead of those of vision & text components.
        cfg = self.config
        if output_attentions is None:
            output_attentions = cfg.output_attentions
        if output_hidden_states is None:
            output_hidden_states = cfg.output_hidden_states
        if return_dict is None:
            return_dict = cfg.use_return_dict

        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Element 1 of the vision output is the pooled representation.
        return self.visual_projection(vision_outputs[1])
"""
CLIP Vision Model with a projection layer on top (a linear layer on top of the pooled output).
"""
@add_start_docstrings(
    """
    CLIP Text Model with a projection layer on top (a linear layer on top of the pooled output).
    """,
    CLIP_START_DOCSTRING,
)
class CLIPTextModelWithProjection(CLIPPreTrainedModel):
    config_class = CLIPTextConfig
    # Module class names listed as not to be split.
    _no_split_modules = ["CLIPTextEmbeddings", "CLIPEncoderLayer"]

    def __init__(self, config: CLIPTextConfig):
        super().__init__(config)

        self.text_model = CLIPTextTransformer(config)
        # Bias-free projection from the hidden width to `projection_dim`.
        self.text_projection = nn.Linear(config.hidden_size, config.projection_dim, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        """Return the token embedding module of the wrapped text transformer."""
        text_embeddings = self.text_model.embeddings
        return text_embeddings.token_embedding

    def set_input_embeddings(self, value):
        """Replace the token embedding module of the wrapped text transformer with `value`."""
        text_embeddings = self.text_model.embeddings
        text_embeddings.token_embedding = value

    @add_start_docstrings_to_model_forward(CLIP_TEXT_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=CLIPTextModelOutput, config_class=CLIPTextConfig)
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, CLIPTextModelOutput]:
        r"""
        Returns:

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, CLIPTextModelWithProjection

        >>> model = CLIPTextModelWithProjection.from_pretrained("openai/clip-vit-base-patch32")
        >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> text_embeds = outputs.text_embeds
        ```"""
        if return_dict is None:
            return_dict = self.config.use_return_dict

        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Element 1 of the text output is the pooled representation that gets projected.
        text_embeds = self.text_projection(text_outputs[1])

        if return_dict:
            return CLIPTextModelOutput(
                text_embeds=text_embeds,
                last_hidden_state=text_outputs.last_hidden_state,
                hidden_states=text_outputs.hidden_states,
                attentions=text_outputs.attentions,
            )

        # Tuple form: embeddings first, then the last hidden state and any extras; None entries are dropped.
        candidates = (text_embeds, text_outputs[0], *text_outputs[2:])
        return tuple(item for item in candidates if item is not None)
"""
将字符串 CLIP_START_DOCSTRING 插入到三引号字符串中
CLIP_START_DOCSTRING 通常是一个文档字符串的起始标记
"""
CLIP_START_DOCSTRING,
# Vision tower followed by a bias-free linear projection of its pooled output.
class CLIPVisionModelWithProjection(CLIPPreTrainedModel):
    config_class = CLIPVisionConfig
    main_input_name = "pixel_values"

    def __init__(self, config: CLIPVisionConfig):
        super().__init__(config)

        self.vision_model = CLIPVisionTransformer(config)
        # Maps hidden_size -> projection_dim without a bias term.
        self.visual_projection = nn.Linear(config.hidden_size, config.projection_dim, bias=False)

        # Weight initialisation and final processing are delegated to post_init().
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        """Return the patch-embedding module of the vision tower."""
        return self.vision_model.embeddings.patch_embedding

    @add_start_docstrings_to_model_forward(CLIP_VISION_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=CLIPVisionModelOutput, config_class=CLIPVisionConfig)
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, CLIPVisionModelOutput]:
        """
        Returns:
        Examples:
        ```
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, CLIPVisionModelWithProjection
        >>> model = CLIPVisionModelWithProjection.from_pretrained("openai/clip-vit-base-patch32")
        >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")
        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> inputs = processor(images=image, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> image_embeds = outputs.image_embeds
        ```
        """
        # Use the config default when no return format was requested.
        if return_dict is None:
            return_dict = self.config.use_return_dict

        tower_out = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Position 1 holds the pooled output in both return formats.
        image_embeds = self.visual_projection(tower_out[1])

        if return_dict:
            return CLIPVisionModelOutput(
                image_embeds=image_embeds,
                last_hidden_state=tower_out.last_hidden_state,
                hidden_states=tower_out.hidden_states,
                attentions=tower_out.attentions,
            )

        # Tuple format: drop entries that were not produced (None).
        flat = (image_embeds, tower_out[0]) + tower_out[2:]
        return tuple(item for item in flat if item is not None)
# Image-classification model: CLIP vision tower plus a linear head over mean-pooled tokens.
@add_start_docstrings(
    """
CLIP vision encoder with an image classification head on top (a linear layer on top of the pooled final hidden states of
the patch tokens) e.g. for ImageNet.
""",
    CLIP_START_DOCSTRING,
)
class CLIPForImageClassification(CLIPPreTrainedModel):
    main_input_name = "pixel_values"

    def __init__(self, config: CLIPConfig) -> None:
        super().__init__(config)

        self.num_labels = config.num_labels
        self.vision_model = CLIPVisionTransformer(config.vision_config)

        # With no labels configured the head degenerates to a pass-through.
        if config.num_labels > 0:
            self.classifier = nn.Linear(config.vision_config.hidden_size, config.num_labels)
        else:
            self.classifier = nn.Identity()

        # Weight initialisation and final processing.
        self.post_init()

    @add_start_docstrings_to_model_forward(CLIP_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_IMAGE_CLASS_CHECKPOINT,
        output_type=ImageClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
        expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
    )
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, ImageClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
        Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
        config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
        `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        # Any flag left as None inherits the model configuration's value.
        if output_attentions is None:
            output_attentions = self.config.output_attentions
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states
        if return_dict is None:
            return_dict = self.config.use_return_dict

        outputs = self.vision_model(
            pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Average-pool the patch tokens: every position except index 0 is averaged.
        patch_tokens = outputs[0][:, 1:, :]
        logits = self.classifier(torch.mean(patch_tokens, dim=1))

        loss = None
        if labels is not None:
            # Keep labels on the logits' device so model parallelism works.
            labels = labels.to(logits.device)

            # Infer the problem type once and cache it on the config.
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and labels.dtype in (torch.long, torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            kind = self.config.problem_type
            if kind == "regression":
                criterion = MSELoss()
                if self.num_labels == 1:
                    loss = criterion(logits.squeeze(), labels.squeeze())
                else:
                    loss = criterion(logits, labels)
            elif kind == "single_label_classification":
                criterion = CrossEntropyLoss()
                loss = criterion(logits.view(-1, self.num_labels), labels.view(-1))
            elif kind == "multi_label_classification":
                criterion = BCEWithLogitsLoss()
                loss = criterion(logits, labels)

        if return_dict:
            return ImageClassifierOutput(
                loss=loss,
                logits=logits,
                hidden_states=outputs.hidden_states,
                attentions=outputs.attentions,
            )

        # Tuple format: logits plus whatever follows the pooled output, with the
        # loss prepended only when it was computed.
        output = (logits,) + outputs[2:]
        return output if loss is None else (loss,) + output
.\models\clip\modeling_flax_clip.py
from typing import Any, Optional, Tuple, Union
import flax
import flax.linen as nn
import jax
import jax.numpy as jnp
from flax.core.frozen_dict import FrozenDict, freeze, unfreeze
from flax.linen import combine_masks, make_causal_mask
from flax.linen.attention import dot_product_attention_weights
from flax.traverse_util import flatten_dict, unflatten_dict
from jax import lax
from ...modeling_flax_outputs import FlaxBaseModelOutput, FlaxBaseModelOutputWithPooling
from ...modeling_flax_utils import (
ACT2FN,
FlaxPreTrainedModel,
append_replace_return_docstrings,
overwrite_call_docstring,
)
from ...utils import ModelOutput, add_start_docstrings, logging
from .configuration_clip import CLIPConfig, CLIPTextConfig, CLIPVisionConfig
logger = logging.get_logger(__name__)
# Introductory docstring fragment shared by the Flax CLIP model classes.
# The `Parameters:` section is part of the literal (not a code comment) so that
# it shows up in the generated documentation.
CLIP_START_DOCSTRING = r"""
This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading, saving and converting weights from PyTorch models)
This model is also a
[flax.linen.Module](https://flax.readthedocs.io/en/latest/api_reference/flax.linen/module.html) subclass. Use it as
a regular Flax linen Module and refer to the Flax documentation for all matter related to general usage and
behavior.
Finally, this model supports inherent JAX features such as:
- [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
- [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
- [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
- [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)
Parameters:
config ([`CLIPConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights.
dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and
`jax.numpy.bfloat16` (on TPUs).
This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If
specified all the computation will be performed with the given `dtype`.
**Note that this only specifies the dtype of the computation and does not influence the dtype of model
parameters.**
If you wish to change the dtype of the model parameters, see [`~FlaxPreTrainedModel.to_fp16`] and
[`~FlaxPreTrainedModel.to_bf16`].
"""
# `Args:` documentation fragment for text-only inputs (input_ids, attention_mask,
# position_ids and the output flags). Presumably attached to the text model's
# call docstring; the call site is not visible in this part of the file.
CLIP_TEXT_INPUTS_DOCSTRING = r"""
Args:
input_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
config.max_position_embeddings - 1]`.
[What are position IDs?](../glossary#position-ids)
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
# `Args:` documentation fragment for vision-only inputs (pixel_values and the
# output flags). Presumably attached to the vision model's call docstring; the
# call site is not visible in this part of the file.
CLIP_VISION_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`numpy.ndarray` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
[`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
# `Args:` documentation fragment for the combined text + vision inputs.
CLIP_INPUTS_DOCSTRING = r"""
Args:
input_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
config.max_position_embeddings - 1]`.
[What are position IDs?](../glossary#position-ids)
pixel_values (`numpy.ndarray` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
[`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""


@flax.struct.dataclass
class FlaxCLIPTextModelOutput(ModelOutput):
    """
    Base class for text model's outputs that also contains a pooling of the last hidden states.

    Args:
        text_embeds (`jnp.ndarray` of shape `(batch_size, output_dim)`):
            The text embeddings obtained by applying the projection layer to the pooled output of
            [`FlaxCLIPTextModel`].
        last_hidden_state (`jnp.ndarray` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    # Projected text embeddings.
    text_embeds: jnp.ndarray = None
    # Hidden states of the last layer.
    last_hidden_state: jnp.ndarray = None
    # Per-layer hidden states; only populated when hidden states are requested.
    hidden_states: Optional[Tuple[jnp.ndarray, ...]] = None
    # Per-layer attention weights; only populated when attentions are requested.
    attentions: Optional[Tuple[jnp.ndarray, ...]] = None
@flax.struct.dataclass
class FlaxCLIPOutput(ModelOutput):
    """
    Args:
        logits_per_image: (`jnp.ndarray` of shape `(image_batch_size, text_batch_size)`):
            The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
            similarity scores.
        logits_per_text: (`jnp.ndarray` of shape `(text_batch_size, image_batch_size)`):
            The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
            similarity scores.
        text_embeds: (`jnp.ndarray` of shape `(batch_size, output_dim)`):
            The text embeddings obtained by applying the projection layer to the pooled output of
            [`FlaxCLIPTextModel`].
        image_embeds: (`jnp.ndarray` of shape `(batch_size, output_dim)`):
            The image embeddings obtained by applying the projection layer to the pooled output of
            [`FlaxCLIPVisionModel`].
        text_model_output: (`FlaxBaseModelOutputWithPooling`):
            The output of the [`FlaxCLIPTextModel`].
        vision_model_output: (`FlaxBaseModelOutputWithPooling`):
            The output of the [`FlaxCLIPVisionModel`].
    """

    # Image-to-text and text-to-image similarity scores.
    logits_per_image: jnp.ndarray = None
    logits_per_text: jnp.ndarray = None
    # Projected embeddings of each modality.
    text_embeds: jnp.ndarray = None
    image_embeds: jnp.ndarray = None
    # Raw outputs of the two underlying models.
    text_model_output: FlaxBaseModelOutputWithPooling = None
    vision_model_output: FlaxBaseModelOutputWithPooling = None

    def to_tuple(self) -> Tuple[Any]:
        """Flatten to a tuple, converting the two nested model outputs via their own `to_tuple()`."""
        nested_fields = ("text_model_output", "vision_model_output")
        flattened = []
        for key in self.keys():
            if key in nested_fields:
                flattened.append(getattr(self, key).to_tuple())
            else:
                flattened.append(self[key])
        return tuple(flattened)
# Turns pixel values into a token sequence: class token + patch tokens, plus position embeddings.
class FlaxCLIPVisionEmbeddings(nn.Module):
    """Patch, class-token and position embeddings for the CLIP vision tower.

    Attributes:
        config: Vision configuration; `hidden_size`, `image_size` and `patch_size` are read.
        dtype: Computation dtype handed to the patch convolution.
    """

    config: CLIPVisionConfig
    # Must be declared as a module field: `setup` reads `self.dtype` and callers
    # construct this module with `dtype=...`.
    dtype: jnp.dtype = jnp.float32

    def setup(self):
        embed_dim = self.config.hidden_size
        image_size = self.config.image_size
        patch_size = self.config.patch_size

        # Learned class-token vector of shape (embed_dim,).
        self.class_embedding = self.param("class_embedding", jax.nn.initializers.normal(stddev=0.02), (embed_dim,))

        # Non-overlapping patches: kernel and stride both equal the patch size.
        self.patch_embedding = nn.Conv(
            embed_dim,
            kernel_size=(patch_size, patch_size),
            strides=(patch_size, patch_size),
            padding="VALID",
            use_bias=False,
            dtype=self.dtype,
            kernel_init=jax.nn.initializers.normal(),
        )

        self.num_patches = (image_size // patch_size) ** 2
        # One extra position for the class token.
        num_positions = self.num_patches + 1
        self.position_embedding = nn.Embed(num_positions, embed_dim, embedding_init=jax.nn.initializers.normal())
        # Shape (1, num_positions) so it broadcasts over the batch.
        self.position_ids = jnp.expand_dims(jnp.arange(0, num_positions, dtype="i4"), axis=0)

    def __call__(self, pixel_values):
        """Embed `pixel_values` and return the token sequence with position embeddings added."""
        patch_embeds = self.patch_embedding(pixel_values)
        batch_size, height, width, channels = patch_embeds.shape
        # Flatten the spatial grid into a sequence of patch tokens.
        patch_embeds = jnp.reshape(patch_embeds, (batch_size, height * width, channels))

        # Replicate the class token once per batch element: (batch_size, 1, embed_dim).
        class_embeds = jnp.expand_dims(self.class_embedding, axis=(0, 1))
        class_embeds = jnp.tile(class_embeds, (batch_size, 1, 1))

        # Class token goes first, followed by the patch tokens.
        embeddings = jnp.concatenate([class_embeds, patch_embeds], axis=1)
        embeddings = embeddings + self.position_embedding(self.position_ids)
        return embeddings
# Token + position embeddings for the CLIP text tower.
class FlaxCLIPTextEmbeddings(nn.Module):
    """Sum of token embeddings and position embeddings.

    Attributes:
        config: Text configuration; `hidden_size`, `vocab_size` and
            `max_position_embeddings` are read.
        dtype: Computation dtype supplied by the parent module.
    """

    config: CLIPTextConfig
    # Must be declared as a module field: the parent constructs this module
    # with `dtype=...`.
    dtype: jnp.dtype = jnp.float32

    def setup(self):
        embed_dim = self.config.hidden_size

        self.token_embedding = nn.Embed(self.config.vocab_size, embed_dim, embedding_init=jax.nn.initializers.normal())
        self.position_embedding = nn.Embed(
            self.config.max_position_embeddings, embed_dim, embedding_init=jax.nn.initializers.normal()
        )
        # Default position indices 0..max_position_embeddings-1 with two leading
        # singleton axes. `__call__` below uses the `position_ids` argument instead.
        self.position_ids = jnp.expand_dims(
            jnp.arange(0, self.config.max_position_embeddings, dtype="i4"), axis=(0, 1)
        )

    def __call__(self, input_ids, position_ids):
        """Return token embeddings of `input_ids` plus position embeddings of `position_ids`."""
        # Both index arrays are cast to int32 before the lookup.
        input_embeds = self.token_embedding(input_ids.astype("i4"))
        position_embeds = self.position_embedding(position_ids.astype("i4"))

        embeddings = input_embeds + position_embeds
        return embeddings
# Multi-head self-attention; causal when configured with a text config.
class FlaxCLIPAttention(nn.Module):
    """Multi-head self-attention used by both CLIP towers.

    Attributes:
        config: Text or vision configuration. A `CLIPTextConfig` switches on
            causal masking.
        dtype: Computation dtype for the projections and attention weights.
    """

    config: Union[CLIPTextConfig, CLIPVisionConfig]
    # Must be declared as a module field: it is read throughout this class and
    # passed as `dtype=...` by the encoder layer.
    dtype: jnp.dtype = jnp.float32

    def setup(self):
        self.embed_dim = self.config.hidden_size
        self.num_heads = self.config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        # The integer division above must be exact, otherwise heads cannot tile the embedding.
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        self.scale = self.head_dim**-0.5
        self.dropout = self.config.attention_dropout

        self.k_proj = nn.Dense(self.embed_dim, dtype=self.dtype, kernel_init=jax.nn.initializers.normal(0.01))
        self.v_proj = nn.Dense(self.embed_dim, dtype=self.dtype, kernel_init=jax.nn.initializers.normal(0.01))
        self.q_proj = nn.Dense(self.embed_dim, dtype=self.dtype, kernel_init=jax.nn.initializers.normal(0.01))
        self.out_proj = nn.Dense(self.embed_dim, dtype=self.dtype, kernel_init=jax.nn.initializers.normal(0.01))

        # Only the text tower attends causally; its mask is precomputed for the
        # maximum sequence length and sliced per call.
        self.causal = isinstance(self.config, CLIPTextConfig)
        if self.causal:
            self.causal_mask = make_causal_mask(jnp.ones((1, self.config.max_position_embeddings), dtype="i4"))

    def _split_heads(self, hidden_states):
        """Reshape the last axis from (embed_dim,) to (num_heads, head_dim)."""
        return hidden_states.reshape(hidden_states.shape[:2] + (self.num_heads, self.head_dim))

    def _merge_heads(self, hidden_states):
        """Inverse of `_split_heads`: collapse the trailing axes back to (embed_dim,)."""
        return hidden_states.reshape(hidden_states.shape[:2] + (self.embed_dim,))

    def __call__(
        self,
        hidden_states,
        attention_mask=None,
        deterministic: bool = True,
        output_attentions: bool = False,
    ):
        """Apply self-attention to `hidden_states`.

        Args:
            hidden_states: Input sequence; its first two axes are kept and the
                last is reshaped into heads.
            attention_mask: Optional mask; positions with values `> 0` are
                attended to.
            deterministic: When `False` and the dropout rate is positive,
                attention dropout is applied using the "dropout" RNG stream.
            output_attentions: Whether to also return the attention weights.

        Returns:
            `(attn_output,)`, or `(attn_output, attn_weights)` when
            `output_attentions` is true.
        """
        query = self.q_proj(hidden_states)
        key = self.k_proj(hidden_states)
        value = self.v_proj(hidden_states)

        query = self._split_heads(query)
        key = self._split_heads(key)
        value = self._split_heads(value)

        causal_attention_mask = None
        if self.causal:
            # Slice the precomputed mask down to the current sequence length.
            query_length, key_length = query.shape[1], key.shape[1]
            causal_attention_mask = self.causal_mask[:, :, key_length - query_length : key_length, :key_length]

        # Merge the caller's mask with the causal mask, whichever are present.
        # The caller's mask gains two axes so it broadcasts against the 4-D causal mask.
        if attention_mask is not None and causal_attention_mask is not None:
            attention_mask = jnp.expand_dims(attention_mask, axis=(-3, -2))
            attention_mask = combine_masks(attention_mask, causal_attention_mask, dtype="i4")
        elif causal_attention_mask is not None:
            attention_mask = causal_attention_mask
        elif attention_mask is not None:
            attention_mask = jnp.expand_dims(attention_mask, axis=(-3, -2))

        # Convert the boolean-like mask into an additive bias: 0 where allowed,
        # the dtype's most negative finite value where blocked.
        if attention_mask is not None:
            attention_bias = lax.select(
                attention_mask > 0,
                jnp.full(attention_mask.shape, 0.0).astype(self.dtype),
                jnp.full(attention_mask.shape, jnp.finfo(self.dtype).min).astype(self.dtype),
            )
        else:
            attention_bias = None

        # A dropout RNG is only drawn when dropout can actually fire.
        dropout_rng = None
        if not deterministic and self.dropout > 0.0:
            dropout_rng = self.make_rng("dropout")

        attn_weights = dot_product_attention_weights(
            query,
            key,
            bias=attention_bias,
            dropout_rng=dropout_rng,
            dropout_rate=self.dropout,
            deterministic=deterministic,
            dtype=self.dtype,
            precision=None,
        )

        # Weighted sum of values per head, then merge heads and project out.
        attn_output = jnp.einsum("...hqk,...khd->...qhd", attn_weights, value)
        attn_output = self._merge_heads(attn_output)
        attn_output = self.out_proj(attn_output)

        outputs = (attn_output, attn_weights) if output_attentions else (attn_output,)
        return outputs
# Position-wise feed-forward block: Dense -> activation -> Dense.
class FlaxCLIPMLP(nn.Module):
    """Two-layer feed-forward network used inside each encoder layer."""

    config: Union[CLIPTextConfig, CLIPVisionConfig]
    dtype: jnp.dtype = jnp.float32

    def setup(self):
        # Activation is looked up by the name stored in the config.
        self.activation_fn = ACT2FN[self.config.hidden_act]
        # Expands to `intermediate_size` output features.
        self.fc1 = nn.Dense(
            self.config.intermediate_size,
            dtype=self.dtype,
            kernel_init=jax.nn.initializers.normal(0.01),
        )
        # Projects back to `hidden_size` output features.
        self.fc2 = nn.Dense(
            self.config.hidden_size,
            dtype=self.dtype,
            kernel_init=jax.nn.initializers.normal(0.01),
        )

    def __call__(self, hidden_states):
        """Apply fc1, the activation, then fc2."""
        expanded = self.fc1(hidden_states)
        activated = self.activation_fn(expanded)
        return self.fc2(activated)
# One pre-norm transformer layer: LayerNorm -> attention -> residual, LayerNorm -> MLP -> residual.
class FlaxCLIPEncoderLayer(nn.Module):
    """Single encoder layer combining self-attention and a feed-forward block."""

    config: Union[CLIPTextConfig, CLIPVisionConfig]
    dtype: jnp.dtype = jnp.float32

    def setup(self):
        self.self_attn = FlaxCLIPAttention(self.config, dtype=self.dtype)
        self.layer_norm1 = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype)
        self.mlp = FlaxCLIPMLP(self.config, dtype=self.dtype)
        self.layer_norm2 = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype)

    def __call__(
        self,
        hidden_states,
        attention_mask,
        deterministic: bool = True,
        output_attentions: bool = False,
    ):
        """Run the layer and return `(hidden_states,)` plus attention weights when requested."""
        # Attention sub-block: normalise first, add the un-normalised input back afterwards.
        attn_outputs = self.self_attn(
            hidden_states=self.layer_norm1(hidden_states),
            attention_mask=attention_mask,
            deterministic=deterministic,
            output_attentions=output_attentions,
        )
        after_attention = hidden_states + attn_outputs[0]

        # Feed-forward sub-block with the same normalise-then-residual shape.
        layer_output = after_attention + self.mlp(self.layer_norm2(after_attention))

        if output_attentions:
            return (layer_output,) + attn_outputs[1:]
        return (layer_output,)
# Stack of `num_hidden_layers` encoder layers applied in sequence.
class FlaxCLIPLayerCollection(nn.Module):
    """Sequential stack of `FlaxCLIPEncoderLayer` modules."""

    config: Union[CLIPTextConfig, CLIPVisionConfig]
    dtype: jnp.dtype = jnp.float32

    def setup(self):
        # Layers are named "0", "1", ... explicitly.
        self.layers = [
            FlaxCLIPEncoderLayer(self.config, name=str(i), dtype=self.dtype)
            for i in range(self.config.num_hidden_layers)
        ]

    def __call__(
        self,
        hidden_states,
        attention_mask=None,
        deterministic: bool = True,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ):
        """Run every layer in order.

        Args:
            hidden_states: Input to the first layer.
            attention_mask: Optional mask forwarded unchanged to each layer.
            deterministic: Forwarded to each layer.
            output_attentions: Collect each layer's attention weights.
            output_hidden_states: Collect the input of every layer plus the
                final output.
            return_dict: Return a `FlaxBaseModelOutput` instead of a tuple.

        Returns:
            A `FlaxBaseModelOutput`, or, when `return_dict` is false, a
            one-element tuple holding only the final hidden states.
        """
        # Accumulators stay None when the corresponding output was not requested.
        all_attentions = () if output_attentions else None
        all_hidden_states = () if output_hidden_states else None

        for layer in self.layers:
            if output_hidden_states:
                # Record the state *entering* this layer.
                all_hidden_states += (hidden_states,)

            layer_outputs = layer(
                hidden_states, attention_mask, deterministic=deterministic, output_attentions=output_attentions
            )
            # Each layer returns a tuple; element 0 is the new hidden state.
            hidden_states = layer_outputs[0]

            if output_attentions:
                all_attentions += (layer_outputs[1],)

        if output_hidden_states:
            # Finally record the output of the last layer.
            all_hidden_states += (hidden_states,)

        outputs = (hidden_states,)

        if not return_dict:
            return tuple(v for v in outputs if v is not None)

        return FlaxBaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions
        )
class FlaxCLIPEncoder(nn.Module):
    """Thin wrapper that owns the layer stack under the attribute name `layers`."""

    config: Union[CLIPTextConfig, CLIPVisionConfig]
    dtype: jnp.dtype = jnp.float32

    def setup(self):
        self.layers = FlaxCLIPLayerCollection(self.config, dtype=self.dtype)

    def __call__(
        self,
        inputs_embeds,
        attention_mask=None,
        deterministic: bool = True,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ):
        """Forward `inputs_embeds` and all flags to the layer stack and return its result unchanged."""
        # The embeddings become the stack's initial hidden states.
        stack_result = self.layers(
            hidden_states=inputs_embeds,
            attention_mask=attention_mask,
            deterministic=deterministic,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        return stack_result
class FlaxCLIPTextTransformer(nn.Module):
    """Text tower: embeddings, encoder stack, final layer norm and end-of-sequence pooling."""

    config: CLIPTextConfig
    dtype: jnp.dtype = jnp.float32

    def setup(self):
        self.embeddings = FlaxCLIPTextEmbeddings(self.config, dtype=self.dtype)
        self.encoder = FlaxCLIPEncoder(self.config, dtype=self.dtype)
        self.final_layer_norm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype)

        # Needed later to locate the token whose hidden state becomes the pooled output.
        self.eos_token_id = self.config.eos_token_id

    def __call__(
        self,
        input_ids,
        attention_mask,
        position_ids,
        deterministic: bool = True,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ):
        """Encode `input_ids` and return the last hidden state plus a pooled per-sequence vector."""
        # Flags explicitly passed as None fall back to the configuration.
        if output_attentions is None:
            output_attentions = self.config.output_attentions
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states
        if return_dict is None:
            return_dict = self.config.use_return_dict

        embedded = self.embeddings(input_ids=input_ids, position_ids=position_ids)

        encoder_outputs = self.encoder(
            inputs_embeds=embedded,
            attention_mask=attention_mask,
            deterministic=deterministic,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        last_hidden_state = self.final_layer_norm(encoder_outputs[0])

        # Pick one position per sequence to pool from.
        if self.eos_token_id == 2:
            # With eos_token_id == 2 the position of the largest token id is used.
            pool_positions = input_ids.argmax(axis=-1)
        else:
            # Otherwise use the first position whose id equals eos_token_id.
            pool_positions = (input_ids == self.eos_token_id).argmax(axis=-1)
        batch_index = jnp.arange(last_hidden_state.shape[0])
        pooled_output = last_hidden_state[batch_index, pool_positions]

        if return_dict:
            return FlaxBaseModelOutputWithPooling(
                last_hidden_state=last_hidden_state,
                pooler_output=pooled_output,
                hidden_states=encoder_outputs.hidden_states,
                attentions=encoder_outputs.attentions,
            )

        # Tuple format: the two new tensors followed by whatever the encoder returned after its first element.
        return (last_hidden_state, pooled_output) + encoder_outputs[1:]
# Vision tower: embeddings -> pre layer norm -> encoder stack -> pooled first token -> post layer norm.
class FlaxCLIPVisionTransformer(nn.Module):
    """Vision transformer that pools by taking the first token of the final hidden states."""

    config: CLIPVisionConfig
    dtype: jnp.dtype = jnp.float32

    def setup(self):
        self.embeddings = FlaxCLIPVisionEmbeddings(self.config, dtype=self.dtype)
        # NOTE: the attribute name `pre_layrnorm` is kept verbatim; renaming an
        # attribute would rename the submodule.
        self.pre_layrnorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype)
        self.encoder = FlaxCLIPEncoder(self.config, dtype=self.dtype)
        self.post_layernorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype)

    def __call__(
        self,
        pixel_values=None,
        deterministic: bool = True,
        output_attentions=None,
        output_hidden_states=None,
        return_dict: bool = True,
    ):
        """Encode `pixel_values` and return the last hidden state plus the pooled first-token vector."""
        # Flags left as None inherit the configuration's value.
        if output_attentions is None:
            output_attentions = self.config.output_attentions
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states
        if return_dict is None:
            return_dict = self.config.use_return_dict

        normed_embeddings = self.pre_layrnorm(self.embeddings(pixel_values))

        encoder_outputs = self.encoder(
            inputs_embeds=normed_embeddings,
            deterministic=deterministic,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        last_hidden_state = encoder_outputs[0]
        # Pool by taking sequence position 0, then normalise only that vector.
        pooled_output = self.post_layernorm(last_hidden_state[:, 0, :])

        if return_dict:
            return FlaxBaseModelOutputWithPooling(
                last_hidden_state=last_hidden_state,
                pooler_output=pooled_output,
                hidden_states=encoder_outputs.hidden_states,
                attentions=encoder_outputs.attentions,
            )

        # Tuple format: the two tensors followed by the encoder's remaining outputs.
        return (last_hidden_state, pooled_output) + encoder_outputs[1:]
class FlaxCLIPTextPreTrainedModel(FlaxPreTrainedModel):
    """Base wrapper for text-only CLIP Flax models.

    Subclasses set `module_class` to the Flax module that is instantiated in `__init__`.
    """

    config_class = CLIPTextConfig
    module_class: nn.Module = None

    def __init__(
        self,
        config: CLIPTextConfig,
        input_shape=(1, 1),
        seed: int = 0,
        dtype: jnp.dtype = jnp.float32,
        _do_init: bool = True,
        **kwargs,
    ):
        """Build the Flax module from `config` and hand it to the base-class constructor."""
        module = self.module_class(config=config, dtype=dtype, **kwargs)
        super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init)

    def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple, params: FrozenDict = None) -> FrozenDict:
        """Initialise parameters by running `module.init` on dummy inputs.

        Args:
            rng: PRNG key, split into a "params" key and a "dropout" key.
            input_shape: shape of the dummy `input_ids`.
            params: optional existing parameters; only entries listed in `self._missing_keys`
                are filled in from the freshly initialised ones.

        Returns:
            The random parameters when `params` is None, otherwise the completed, frozen `params`.
        """
        # Dummy inputs: zero token ids, positions 0..seq_len-1, all-ones attention mask.
        input_ids = jnp.zeros(input_shape, dtype="i4")
        position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_shape)
        attention_mask = jnp.ones_like(input_ids)

        params_rng, dropout_rng = jax.random.split(rng)
        rngs = {"params": params_rng, "dropout": dropout_rng}
        random_params = self.module.init(rngs, input_ids, attention_mask, position_ids)["params"]

        if params is None:
            return random_params

        random_params = flatten_dict(unfreeze(random_params))
        params = flatten_dict(unfreeze(params))
        for missing_key in self._missing_keys:
            params[missing_key] = random_params[missing_key]
        self._missing_keys = set()
        return freeze(unflatten_dict(params))

    def __call__(
        self,
        input_ids,
        attention_mask=None,
        position_ids=None,
        params: dict = None,
        dropout_rng: jax.random.PRNGKey = None,
        train: bool = False,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        """Run the wrapped module.

        Args:
            input_ids: token ids; cast to int32 before being passed on.
            attention_mask: defaults to an all-ones mask shaped like `input_ids`.
            position_ids: defaults to 0..seq_len-1 broadcast to the shape of `input_ids`.
            params: parameters to use instead of `self.params`.
            dropout_rng: PRNG key for dropout; when given it is passed as the "dropout" rng.
            train: the module is called with `deterministic = not train`.
            output_attentions, output_hidden_states, return_dict: when None, the value from
                `self.config` is used.
        """
        # Resolve unset flags here, as the vision wrapper does, so the module never receives None
        # (a module testing `not return_dict` would otherwise treat None as "return a tuple").
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.return_dict

        if position_ids is None:
            position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape)
        if attention_mask is None:
            attention_mask = jnp.ones_like(input_ids)

        # Only register a dropout rng when the caller supplied one.
        rngs = {}
        if dropout_rng is not None:
            rngs["dropout"] = dropout_rng

        return self.module.apply(
            {"params": params or self.params},
            jnp.array(input_ids, dtype="i4"),
            jnp.array(attention_mask, dtype="i4"),
            jnp.array(position_ids, dtype="i4"),
            not train,
            output_attentions,
            output_hidden_states,
            return_dict,
            rngs=rngs,
        )
class FlaxCLIPVisionPreTrainedModel(FlaxPreTrainedModel):
    """Base wrapper for vision-only CLIP Flax models.

    Subclasses set `module_class` to the Flax module that is instantiated in `__init__`.
    """

    config_class = CLIPVisionConfig
    main_input_name = "pixel_values"
    module_class: nn.Module = None

    def __init__(
        self,
        config: CLIPVisionConfig,
        input_shape: Optional[Tuple] = None,
        seed: int = 0,
        dtype: jnp.dtype = jnp.float32,
        _do_init: bool = True,
        **kwargs,
    ):
        """Build the Flax module; `input_shape` defaults to `(1, image_size, image_size, 3)`."""
        if input_shape is None:
            side = config.image_size
            input_shape = (1, side, side, 3)
        module = self.module_class(config=config, dtype=dtype, **kwargs)
        super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init)

    def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple, params: FrozenDict = None) -> FrozenDict:
        """Initialise parameters by running `module.init` on random dummy pixel values.

        Returns the random parameters when `params` is None; otherwise fills the entries named in
        `self._missing_keys` into `params` and returns the frozen result.
        """
        # The dummy image is drawn from `rng` itself, before the key is split.
        pixel_values = jax.random.normal(rng, input_shape)
        params_rng, dropout_rng = jax.random.split(rng)
        init_rngs = {"params": params_rng, "dropout": dropout_rng}
        random_params = self.module.init(init_rngs, pixel_values)["params"]

        if params is None:
            return random_params

        random_params = flatten_dict(unfreeze(random_params))
        params = flatten_dict(unfreeze(params))
        for missing_key in self._missing_keys:
            params[missing_key] = random_params[missing_key]
        self._missing_keys = set()
        return freeze(unflatten_dict(params))

    def __call__(
        self,
        pixel_values,
        params: dict = None,
        dropout_rng: jax.random.PRNGKey = None,
        train: bool = False,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        """Run the wrapped module on `pixel_values`.

        Unset output flags fall back to `self.config`; the module is called with
        `deterministic = not train`.
        """
        if output_attentions is None:
            output_attentions = self.config.output_attentions
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states
        if return_dict is None:
            return_dict = self.config.return_dict

        # Move axis 1 to the end (axes reordered to 0, 2, 3, 1) before calling the module.
        pixel_values = jnp.transpose(pixel_values, (0, 2, 3, 1))

        # Only register a dropout rng when the caller supplied one.
        rngs = {} if dropout_rng is None else {"dropout": dropout_rng}

        return self.module.apply(
            {"params": params or self.params},
            jnp.array(pixel_values, dtype=jnp.float32),
            not train,
            output_attentions,
            output_hidden_states,
            return_dict,
            rngs=rngs,
        )
class FlaxCLIPPreTrainedModel(FlaxPreTrainedModel):
    """Base wrapper for the joint text + vision CLIP Flax model.

    Subclasses set `module_class` to the Flax module that is instantiated in `__init__`.
    """

    config_class = CLIPConfig
    module_class: nn.Module = None

    def __init__(
        self,
        config: CLIPConfig,
        input_shape: Optional[Tuple] = None,
        seed: int = 0,
        dtype: jnp.dtype = jnp.float32,
        _do_init: bool = True,
        **kwargs,
    ):
        """Build the Flax module.

        `input_shape` is a pair `(text_shape, image_shape)` and defaults to
        `((1, 1), (1, image_size, image_size, 3))` using the vision config's `image_size`.
        """
        if input_shape is None:
            input_shape = ((1, 1), (1, config.vision_config.image_size, config.vision_config.image_size, 3))
        module = self.module_class(config=config, dtype=dtype, **kwargs)
        super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init)

    def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple, params: FrozenDict = None) -> FrozenDict:
        """Initialise parameters by running `module.init` on dummy text and image inputs.

        Returns the random parameters when `params` is None; otherwise fills the entries named in
        `self._missing_keys` into `params` and returns the frozen result.
        """
        # Dummy text inputs from input_shape[0], dummy image from input_shape[1].
        input_ids = jnp.zeros(input_shape[0], dtype="i4")
        position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_shape[0])
        attention_mask = jnp.ones_like(input_ids)
        pixel_values = jax.random.normal(rng, input_shape[1])

        params_rng, dropout_rng = jax.random.split(rng)
        rngs = {"params": params_rng, "dropout": dropout_rng}
        random_params = self.module.init(rngs, input_ids, pixel_values, attention_mask, position_ids)["params"]

        if params is None:
            return random_params

        random_params = flatten_dict(unfreeze(random_params))
        params = flatten_dict(unfreeze(params))
        for missing_key in self._missing_keys:
            params[missing_key] = random_params[missing_key]
        self._missing_keys = set()
        return freeze(unflatten_dict(params))

    def __call__(
        self,
        input_ids,
        pixel_values,
        attention_mask=None,
        position_ids=None,
        params: dict = None,
        dropout_rng: jax.random.PRNGKey = None,
        train: bool = False,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        """Run the wrapped module on a batch of token ids and images.

        Args:
            input_ids: token ids; cast to int32.
            pixel_values: images; axes are reordered to (0, 2, 3, 1) and cast to float32.
            attention_mask: defaults to an all-ones mask shaped like `input_ids`.
            position_ids: defaults to 0..seq_len-1 broadcast to the shape of `input_ids`.
            params: parameters to use instead of `self.params`.
            dropout_rng: PRNG key for dropout; when given it is passed as the "dropout" rng.
            train: the module is called with `deterministic = not train`.
            output_attentions, output_hidden_states, return_dict: when None, the value from
                `self.config` is used.
        """
        # Resolve unset flags from the config, mirroring the vision-only wrapper.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.return_dict

        if position_ids is None:
            position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape)
        if attention_mask is None:
            attention_mask = jnp.ones_like(input_ids)

        pixel_values = jnp.transpose(pixel_values, (0, 2, 3, 1))

        # Only register a dropout rng when the caller supplied one.
        rngs = {}
        if dropout_rng is not None:
            rngs["dropout"] = dropout_rng

        return self.module.apply(
            {"params": params or self.params},
            jnp.array(input_ids, dtype="i4"),
            jnp.array(pixel_values, dtype=jnp.float32),
            jnp.array(attention_mask, dtype="i4"),
            jnp.array(position_ids, dtype="i4"),
            not train,
            output_attentions,
            output_hidden_states,
            return_dict,
            rngs=rngs,
        )

    def get_text_features(
        self,
        input_ids,
        attention_mask=None,
        position_ids=None,
        params: dict = None,
        dropout_rng: jax.random.PRNGKey = None,
        train=False,
    ):
        r"""
        Args:
            input_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
                provide it.

                Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
                [`PreTrainedTokenizer.__call__`] for details.

                [What are input IDs?](../glossary#input-ids)

        Returns:
            text_features (`jnp.ndarray` of shape `(batch_size, output_dim)`): The text embeddings obtained by
            applying the projection layer to the pooled output of [`FlaxCLIPTextModel`].

        Examples:

        ```
        >>> from transformers import AutoTokenizer, FlaxCLIPModel

        >>> model = FlaxCLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="np")
        >>> text_features = model.get_text_features(**inputs)
        ```"""
        if position_ids is None:
            position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape)
        if attention_mask is None:
            attention_mask = jnp.ones_like(input_ids)

        # Only register a dropout rng when the caller supplied one.
        rngs = {}
        if dropout_rng is not None:
            rngs["dropout"] = dropout_rng

        def _get_features(module, input_ids, attention_mask, position_ids, deterministic):
            # Runs inside `module.apply`: text tower, then projection of its pooled output (index 1).
            text_outputs = module.text_model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                position_ids=position_ids,
                deterministic=deterministic,
            )
            pooled_output = text_outputs[1]
            return module.text_projection(pooled_output)

        return self.module.apply(
            {"params": params or self.params},
            jnp.array(input_ids, dtype="i4"),
            jnp.array(attention_mask, dtype="i4"),
            jnp.array(position_ids, dtype="i4"),
            not train,
            method=_get_features,
            rngs=rngs,
        )

    def get_image_features(
        self, pixel_values, params: dict = None, dropout_rng: jax.random.PRNGKey = None, train=False
    ):
        r"""
        Args:
            pixel_values (`numpy.ndarray` of shape `(batch_size, num_channels, height, width)`):
                Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained
                using [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.

        Returns:
            image_features (`jnp.ndarray` of shape `(batch_size, output_dim)`): The image embeddings obtained by
            applying the projection layer to the pooled output of [`FlaxCLIPVisionModel`]

        Examples:

        ```
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, FlaxCLIPModel

        >>> model = FlaxCLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="np")

        >>> image_features = model.get_image_features(**inputs)
        ```"""
        # Reorder axes to (0, 2, 3, 1), i.e. channels last for the documented input layout.
        pixel_values = jnp.transpose(pixel_values, (0, 2, 3, 1))

        # Only register a dropout rng when the caller supplied one.
        rngs = {}
        if dropout_rng is not None:
            rngs["dropout"] = dropout_rng

        def _get_features(module, pixel_values, deterministic):
            # Runs inside `module.apply`: vision tower, then projection of its pooled output (index 1).
            vision_outputs = module.vision_model(pixel_values=pixel_values, deterministic=deterministic)
            pooled_output = vision_outputs[1]
            return module.visual_projection(pooled_output)

        return self.module.apply(
            {"params": params or self.params},
            jnp.array(pixel_values, dtype=jnp.float32),
            not train,
            method=_get_features,
            rngs=rngs,
        )
class FlaxCLIPTextModule(nn.Module):
    """Thin Flax module that holds a `FlaxCLIPTextTransformer` under the name `text_model`."""

    config: CLIPTextConfig
    dtype: jnp.dtype = jnp.float32

    def setup(self):
        """Create the wrapped text transformer."""
        self.text_model = FlaxCLIPTextTransformer(self.config, dtype=self.dtype)

    def __call__(
        self,
        input_ids,
        attention_mask,
        position_ids,
        deterministic: bool = True,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ):
        """Forward every argument unchanged to `self.text_model` and return its result."""
        forwarded = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "position_ids": position_ids,
            "deterministic": deterministic,
            "output_attentions": output_attentions,
            "output_hidden_states": output_hidden_states,
            "return_dict": return_dict,
        }
        return self.text_model(**forwarded)
class FlaxCLIPTextModel(FlaxCLIPTextPreTrainedModel):
    # Flax module class that the base-class constructor instantiates for this model.
    module_class = FlaxCLIPTextModule
# Return/example section that is appended to CLIP_TEXT_INPUTS_DOCSTRING below.
FLAX_CLIP_TEXT_MODEL_DOCSTRING = """
Returns:
Example:
```
>>> from transformers import AutoTokenizer, FlaxCLIPTextModel
>>> model = FlaxCLIPTextModel.from_pretrained("openai/clip-vit-base-patch32")
>>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")
>>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="np")
>>> outputs = model(**inputs)
>>> last_hidden_state = outputs.last_hidden_state
>>> pooler_output = outputs.pooler_output # pooled (EOS token) states
```
"""
# Register the combined inputs + example text as the call docstring of FlaxCLIPTextModel.
overwrite_call_docstring(FlaxCLIPTextModel, CLIP_TEXT_INPUTS_DOCSTRING + FLAX_CLIP_TEXT_MODEL_DOCSTRING)
# Register the return-type documentation, given the output type and config class.
append_replace_return_docstrings(
FlaxCLIPTextModel, output_type=FlaxBaseModelOutputWithPooling, config_class=CLIPTextConfig
)
class FlaxCLIPTextModelWithProjectionModule(nn.Module):
    """Text transformer followed by a bias-free dense projection of its pooled output."""

    config: CLIPTextConfig
    dtype: jnp.dtype = jnp.float32

    def setup(self):
        """Create the text transformer and the projection layer (`config.projection_dim` units)."""
        self.text_model = FlaxCLIPTextTransformer(self.config, dtype=self.dtype)
        self.text_projection = nn.Dense(self.config.projection_dim, use_bias=False, dtype=self.dtype)

    def __call__(
        self,
        input_ids,
        attention_mask,
        position_ids,
        deterministic: bool = True,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ):
        """Run the text transformer and project its pooled output.

        Returns:
            A `FlaxCLIPTextModelOutput` when `return_dict` is truthy, otherwise the tuple
            `(text_embeds, last_hidden_state, *text_outputs[2:])`.
        """
        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            deterministic=deterministic,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Index 1 of the text model output is its pooled output.
        text_embeds = self.text_projection(text_outputs[1])

        if return_dict:
            return FlaxCLIPTextModelOutput(
                text_embeds=text_embeds,
                last_hidden_state=text_outputs.last_hidden_state,
                hidden_states=text_outputs.hidden_states,
                attentions=text_outputs.attentions,
            )
        return (text_embeds, text_outputs[0]) + text_outputs[2:]
# Text model with a projection head on top of the pooled output.
# (The class was previously declared twice in a row with identical bodies; one definition suffices.)
class FlaxCLIPTextModelWithProjection(FlaxCLIPTextPreTrainedModel):
    # Flax module class that the base-class constructor instantiates for this model.
    module_class = FlaxCLIPTextModelWithProjectionModule
# Return/example section that is appended to CLIP_TEXT_INPUTS_DOCSTRING below.
FLAX_CLIP_TEXT_MODEL_WITH_PROJECTION_DOCSTRING = """
Returns:
Example:
```
>>> from transformers import AutoTokenizer, FlaxCLIPTextModelWithProjection
>>> model = FlaxCLIPTextModelWithProjection.from_pretrained("openai/clip-vit-base-patch32")
>>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")
>>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="np")
>>> outputs = model(**inputs)
>>> text_embeds = outputs.text_embeds
```
"""
# Register the combined inputs + example text as the call docstring of FlaxCLIPTextModelWithProjection.
overwrite_call_docstring(
FlaxCLIPTextModelWithProjection, CLIP_TEXT_INPUTS_DOCSTRING + FLAX_CLIP_TEXT_MODEL_WITH_PROJECTION_DOCSTRING
)
# Register the return-type documentation, given the output type and config class.
append_replace_return_docstrings(
FlaxCLIPTextModelWithProjection, output_type=FlaxCLIPTextModelOutput, config_class=CLIPTextConfig
)
class FlaxCLIPVisionModule(nn.Module):
    """Thin Flax module that holds a `FlaxCLIPVisionTransformer` under the name `vision_model`."""

    config: CLIPVisionConfig
    dtype: jnp.dtype = jnp.float32

    def setup(self):
        """Create the wrapped vision transformer."""
        self.vision_model = FlaxCLIPVisionTransformer(self.config, dtype=self.dtype)

    def __call__(
        self,
        pixel_values,
        deterministic: bool = True,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ):
        """Forward every argument unchanged to `self.vision_model` and return its result."""
        forwarded = {
            "pixel_values": pixel_values,
            "deterministic": deterministic,
            "output_attentions": output_attentions,
            "output_hidden_states": output_hidden_states,
            "return_dict": return_dict,
        }
        return self.vision_model(**forwarded)
class FlaxCLIPVisionModel(FlaxCLIPVisionPreTrainedModel):
    # Flax module class that the base-class constructor instantiates for this model.
    module_class = FlaxCLIPVisionModule
# Return/example section that is appended to CLIP_VISION_INPUTS_DOCSTRING below.
FLAX_CLIP_VISION_MODEL_DOCSTRING = """
Returns:
Example:
```
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, FlaxCLIPVisionModel
>>> model = FlaxCLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32")
>>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> inputs = processor(images=image, return_tensors="np")
>>> outputs = model(**inputs)
>>> last_hidden_state = outputs.last_hidden_state
>>> pooler_output = outputs.pooler_output # pooled CLS states
```
"""
# Register the combined inputs + example text as the call docstring of FlaxCLIPVisionModel.
overwrite_call_docstring(FlaxCLIPVisionModel, CLIP_VISION_INPUTS_DOCSTRING + FLAX_CLIP_VISION_MODEL_DOCSTRING)
# Register the return-type documentation, given the output type and config class.
append_replace_return_docstrings(
FlaxCLIPVisionModel, output_type=FlaxBaseModelOutputWithPooling, config_class=CLIPVisionConfig
)
class FlaxCLIPModule(nn.Module):
    """Joint CLIP module: text tower, vision tower, one projection per tower and a learned logit scale.

    NOTE(review): the `setup` header and the `__call__` signature were missing from this listing.
    The `__call__` parameter order below follows the positional `module.init(...)` and
    `module.apply(...)` calls made by `FlaxCLIPPreTrainedModel`; the defaults are reconstructed
    and should be confirmed against the original module.
    """

    config: CLIPConfig
    dtype: jnp.dtype = jnp.float32

    def setup(self):
        """Create both towers, both projections and the `logit_scale` parameter."""
        text_config = self.config.text_config
        vision_config = self.config.vision_config

        self.projection_dim = self.config.projection_dim
        self.text_embed_dim = text_config.hidden_size
        self.vision_embed_dim = vision_config.hidden_size

        self.text_model = FlaxCLIPTextTransformer(text_config, dtype=self.dtype)
        self.vision_model = FlaxCLIPVisionTransformer(vision_config, dtype=self.dtype)

        # Bias-free projections into the shared space; kernels drawn from normal(stddev=0.02).
        self.visual_projection = nn.Dense(
            self.projection_dim,
            dtype=self.dtype,
            kernel_init=jax.nn.initializers.normal(0.02),
            use_bias=False,
        )
        self.text_projection = nn.Dense(
            self.projection_dim,
            dtype=self.dtype,
            kernel_init=jax.nn.initializers.normal(0.02),
            use_bias=False,
        )

        # Scalar parameter initialised to config.logit_scale_init_value; exponentiated in __call__.
        self.logit_scale = self.param(
            "logit_scale", lambda _, shape: jnp.ones(shape) * self.config.logit_scale_init_value, []
        )

    def __call__(
        self,
        input_ids=None,
        pixel_values=None,
        attention_mask=None,
        position_ids=None,
        deterministic: bool = True,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        """Embed text and images and compute their scaled similarity logits.

        Returns:
            A `FlaxCLIPOutput` when `return_dict` is truthy, otherwise the tuple
            `(logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs)`.
        """
        return_dict = return_dict if return_dict is not None else self.config.return_dict

        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            deterministic=deterministic,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            deterministic=deterministic,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Index 1 of each tower's output is its pooled output; project it into the shared space.
        image_embeds = self.visual_projection(vision_outputs[1])
        text_embeds = self.text_projection(text_outputs[1])

        # L2-normalise along the last axis so the matmul below yields cosine similarities.
        image_embeds = image_embeds / jnp.linalg.norm(image_embeds, axis=-1, keepdims=True)
        text_embeds = text_embeds / jnp.linalg.norm(text_embeds, axis=-1, keepdims=True)

        logit_scale = jnp.exp(self.logit_scale)
        logits_per_text = jnp.matmul(text_embeds, image_embeds.T) * logit_scale
        logits_per_image = logits_per_text.T

        if not return_dict:
            return (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs)

        return FlaxCLIPOutput(
            logits_per_image=logits_per_image,
            logits_per_text=logits_per_text,
            text_embeds=text_embeds,
            image_embeds=image_embeds,
            text_model_output=text_outputs,
            vision_model_output=vision_outputs,
        )
# The decorator is applied with the shared CLIP_START_DOCSTRING text.
@add_start_docstrings(CLIP_START_DOCSTRING)
class FlaxCLIPModel(FlaxCLIPPreTrainedModel):
    # Flax module class that the base-class constructor instantiates for this model.
    module_class = FlaxCLIPModule
# Return/example section that is appended to CLIP_INPUTS_DOCSTRING below.
# NOTE(review): the two non-English lines inside this literal look like annotation text rather
# than part of the intended docstring -- confirm against the original before shipping.
FLAX_CLIP_MODEL_DOCSTRING = """
Returns:
描述函数返回的内容。
Example:
给出一个使用示例,展示模型如何使用。
```
>>> import jax
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, FlaxCLIPModel
>>> model = FlaxCLIPModel.from_pretrained("openai/clip-vit-base-patch32")
>>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> inputs = processor(
... text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="np", padding=True
... )
>>> outputs = model(**inputs)
>>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score
>>> probs = jax.nn.softmax(logits_per_image, axis=1) # we can take the softmax to get the label probabilities
```
"""
# Register the combined inputs + example text as the call docstring of FlaxCLIPModel.
overwrite_call_docstring(FlaxCLIPModel, CLIP_INPUTS_DOCSTRING + FLAX_CLIP_MODEL_DOCSTRING)
# Register the return-type documentation, given the output type and config class.
append_replace_return_docstrings(FlaxCLIPModel, output_type=FlaxCLIPOutput, config_class=CLIPConfig)
.\models\clip\modeling_tf_clip.py
"""
TF 2.0 CLIP model.
"""
from __future__ import annotations
import math
from dataclasses import dataclass
from typing import Any, Optional, Tuple, Union
import numpy as np
import tensorflow as tf
from ...activations_tf import get_tf_activation
from ...modeling_tf_outputs import TFBaseModelOutput, TFBaseModelOutputWithPooling
from ...modeling_tf_utils import (
TFModelInputType,
TFPreTrainedModel,
get_initializer,
keras,
keras_serializable,
unpack_inputs,
)
from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax
from ...utils import (
ModelOutput,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_clip import CLIPConfig, CLIPTextConfig, CLIPVisionConfig
# Module-level logger.
logger = logging.get_logger(__name__)
# Checkpoint name used in documentation examples.
_CHECKPOINT_FOR_DOC = "openai/clip-vit-base-patch32"
# Names of the checkpoints listed for this architecture.
TF_CLIP_PRETRAINED_MODEL_ARCHIVE_LIST = [
"openai/clip-vit-base-patch32",
]
# Value that `_expand_mask` assigns to positions whose mask entry is 0.
LARGE_NEGATIVE = -1e8
def _expand_mask(mask: tf.Tensor, tgt_len: Optional[int] = None):
    """
    Expand an attention mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.

    Entries equal to 1 become 0 and entries equal to 0 become `LARGE_NEGATIVE`. `tgt_len` defaults
    to the source length when not given.
    """
    src_len = shape_list(mask)[1]
    if tgt_len is None:
        tgt_len = src_len

    one_cst = tf.constant(1.0)
    float_mask = tf.cast(mask, dtype=one_cst.dtype)
    # Insert two singleton axes, then repeat along the target-length axis.
    tiled_mask = tf.tile(float_mask[:, None, None, :], (1, 1, tgt_len, 1))

    return (one_cst - tiled_mask) * LARGE_NEGATIVE
def contrastive_loss(logits: tf.Tensor) -> tf.Tensor:
    """
    Mean sparse categorical cross-entropy of `logits`, where the target class of row `i` is `i`.
    """
    row_count = shape_list(logits)[0]
    per_row_loss = keras.metrics.sparse_categorical_crossentropy(
        y_true=tf.range(row_count), y_pred=logits, from_logits=True
    )
    return tf.math.reduce_mean(per_row_loss)
def clip_loss(similarity: tf.Tensor) -> tf.Tensor:
    """
    Average of the contrastive loss of `similarity` and of its transpose.
    """
    return (contrastive_loss(similarity) + contrastive_loss(tf.transpose(similarity))) / 2.0
@dataclass
class TFCLIPOutput(ModelOutput):
    """
    Output container of the TF CLIP model.

    Args:
        loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
            Contrastive loss for image-text similarity.
        logits_per_image:(`tf.Tensor` of shape `(image_batch_size, text_batch_size)`):
            The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
            similarity scores.
        logits_per_text:(`tf.Tensor` of shape `(text_batch_size, image_batch_size)`):
            The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
            similarity scores.
        text_embeds(`tf.Tensor` of shape `(batch_size, output_dim`):
            The text embeddings obtained by applying the projection layer to the pooled output of [`TFCLIPTextModel`].
        image_embeds(`tf.Tensor` of shape `(batch_size, output_dim`):
            The image embeddings obtained by applying the projection layer to the pooled output of
            [`TFCLIPVisionModel`].
        text_model_output([`~modeling_tf_utils.TFBaseModelOutputWithPooling`]):
            The output of the [`TFCLIPTextModel`].
        vision_model_output([`~modeling_tf_utils.TFBaseModelOutputWithPooling`]):
            The output of the [`TFCLIPVisionModel`].
    """

    loss: tf.Tensor | None = None
    logits_per_image: tf.Tensor = None
    logits_per_text: tf.Tensor = None
    text_embeds: tf.Tensor = None
    image_embeds: tf.Tensor = None
    text_model_output: TFBaseModelOutputWithPooling = None
    vision_model_output: TFBaseModelOutputWithPooling = None

    def to_tuple(self) -> Tuple[Any]:
        """Return the fields as a tuple, converting the two nested model outputs with their own `to_tuple`."""
        nested_keys = ("text_model_output", "vision_model_output")
        return tuple(
            getattr(self, key).to_tuple() if key in nested_keys else self[key] for key in self.keys()
        )
class TFCLIPVisionEmbeddings(keras.layers.Layer):
    """Patch embedding (strided convolution) plus a class embedding and position embeddings."""

    def __init__(self, config: CLIPVisionConfig, **kwargs):
        """Store sizes derived from `config` and create the patch-embedding convolution."""
        super().__init__(**kwargs)

        self.embed_dim = config.hidden_size
        self.image_size = config.image_size
        self.patch_size = config.patch_size

        # One position per patch plus one extra for the class embedding.
        self.num_patches = (self.image_size // self.patch_size) ** 2
        self.num_positions = self.num_patches + 1

        self.config = config

        self.patch_embedding = keras.layers.Conv2D(
            filters=self.embed_dim,
            kernel_size=self.patch_size,
            strides=self.patch_size,
            padding="valid",
            data_format="channels_last",
            use_bias=False,
            kernel_initializer=get_initializer(self.config.initializer_range * self.config.initializer_factor),
            name="patch_embedding",
        )

    def build(self, input_shape: tf.TensorShape = None):
        """Create the class/position embedding weights and build the patch convolution.

        The `built` guard runs first so that a repeated call does not add the weights again.
        """
        if self.built:
            return

        factor = self.config.initializer_factor

        self.class_embedding = self.add_weight(
            shape=(self.embed_dim,),
            initializer=get_initializer(self.embed_dim**-0.5 * factor),
            trainable=True,
            name="class_embedding",
        )

        with tf.name_scope("position_embedding"):
            self.position_embedding = self.add_weight(
                shape=(self.num_positions, self.embed_dim),
                initializer=get_initializer(self.config.initializer_range * factor),
                trainable=True,
                name="embeddings",
            )

        self.built = True
        if getattr(self, "patch_embedding", None) is not None:
            with tf.name_scope(self.patch_embedding.name):
                self.patch_embedding.build([None, None, None, self.config.num_channels])

    def call(self, pixel_values: tf.Tensor) -> tf.Tensor:
        """`pixel_values` is expected to be of NCHW format."""
        # Unpacking four values requires a rank-4 input; only the batch size is used afterwards.
        batch_size, _num_channels, _height, _width = shape_list(pixel_values)

        # The convolution is configured as channels_last, so move the channel axis to the end.
        pixel_values = tf.transpose(pixel_values, perm=(0, 2, 3, 1))

        patch_embeds = self.patch_embedding(pixel_values)
        # Flatten the patch grid into a sequence of `num_patches` vectors.
        patch_embeds = tf.reshape(tensor=patch_embeds, shape=(batch_size, self.num_patches, -1))

        # Prepend the class embedding to every sequence, then add the position embeddings.
        class_embeds = tf.broadcast_to(self.class_embedding, shape=(batch_size, 1, self.embed_dim))
        embeddings = tf.concat((class_embeds, patch_embeds), axis=1)
        embeddings = embeddings + self.position_embedding

        return embeddings
# NOTE(review): the class header was missing from this listing, which left these methods attached
# to the preceding class. The name and base class below are reconstructed by analogy with
# TFCLIPVisionEmbeddings -- confirm against the original module.
class TFCLIPTextEmbeddings(keras.layers.Layer):
    """Token embedding plus position embedding for the text model."""

    def __init__(self, config: CLIPTextConfig, **kwargs):
        """Store the embedding width and the configuration."""
        super().__init__(**kwargs)

        self.embed_dim = config.hidden_size
        self.config = config

    def build(self, input_shape: tf.TensorShape = None):
        """Create the token table (`vocab_size` rows) and the position table (`max_position_embeddings` rows)."""
        with tf.name_scope("token_embedding"):
            self.weight = self.add_weight(
                shape=(self.config.vocab_size, self.embed_dim),
                initializer=get_initializer(self.config.initializer_factor * self.config.initializer_range),
                trainable=True,
                name="weight",
            )

        with tf.name_scope("position_embedding"):
            self.position_embedding = self.add_weight(
                shape=(self.config.max_position_embeddings, self.embed_dim),
                initializer=get_initializer(self.config.initializer_factor * self.config.initializer_range),
                trainable=True,
                name="embeddings",
            )

        super().build(input_shape)

    def call(
        self,
        input_ids: tf.Tensor = None,
        position_ids: tf.Tensor = None,
        inputs_embeds: tf.Tensor = None,
    ) -> tf.Tensor:
        """
        Applies embedding based on inputs tensor.

        Args:
            input_ids: token ids; looked up in the token table when `inputs_embeds` is not given.
            position_ids: position indices; defaults to `0..seq_len-1` with a leading axis of size 1.
            inputs_embeds: pre-computed token embeddings, used instead of `input_ids` when given.

        Returns:
            final_embeddings (`tf.Tensor`): output embedding tensor.

        Raises:
            ValueError: if both `input_ids` and `inputs_embeds` are None.
        """
        if input_ids is None and inputs_embeds is None:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if inputs_embeds is None:
            check_embeddings_within_bounds(input_ids, self.config.vocab_size)
            inputs_embeds = tf.gather(params=self.weight, indices=input_ids)

        input_shape = shape_list(inputs_embeds)[:-1]

        if position_ids is None:
            position_ids = tf.expand_dims(tf.range(start=0, limit=input_shape[-1]), axis=0)

        position_embeds = tf.gather(params=self.position_embedding, indices=position_ids)
        # Rely on broadcasting over the batch axis instead of tiling by the batch size: tiling
        # multiplied the batch axis again when the caller passed already-batched position_ids,
        # which made the addition fail. The result is unchanged for a leading axis of size 1.
        final_embeddings = inputs_embeds + position_embeds

        return final_embeddings
"""Multi-headed attention from 'Attention Is All You Need' paper"""
def __init__(self, config: CLIPConfig, **kwargs):
    """Create the q/k/v/out projections and the attention dropout.

    Raises:
        ValueError: if `config.hidden_size` is not divisible by `config.num_attention_heads`.
    """
    super().__init__(**kwargs)

    self.embed_dim = config.hidden_size
    self.num_attention_heads = config.num_attention_heads
    self.attention_head_size = self.embed_dim // self.num_attention_heads
    if self.attention_head_size * self.num_attention_heads != self.embed_dim:
        raise ValueError(
            f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
            f" {self.num_attention_heads})."
        )

    # Initialiser standard deviations for the input (q/k/v) and output projections.
    factor = config.initializer_factor
    in_proj_std = (self.embed_dim**-0.5) * ((2 * config.num_hidden_layers) ** -0.5) * factor
    out_proj_std = (self.embed_dim**-0.5) * factor

    self.sqrt_att_head_size = math.sqrt(self.attention_head_size)

    def _projection(layer_name, std):
        # Dense layer of width embed_dim with its own initializer instance.
        return keras.layers.Dense(units=self.embed_dim, kernel_initializer=get_initializer(std), name=layer_name)

    self.q_proj = _projection("q_proj", in_proj_std)
    self.k_proj = _projection("k_proj", in_proj_std)
    self.v_proj = _projection("v_proj", in_proj_std)

    self.dropout = keras.layers.Dropout(rate=config.attention_dropout)

    self.out_proj = _projection("out_proj", out_proj_std)
def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor:
tensor = tf.reshape(tensor=tensor, shape=(batch_size, -1, self.num_attention_heads, self.attention_head_size))
return tf.transpose(tensor, perm=[0, 2, 1, 3])
def call(
self,
hidden_states: tf.Tensor,
attention_mask: tf.Tensor,
causal_attention_mask: tf.Tensor,
output_attentions: bool,
training: bool = False,
) -> Tuple[tf.Tensor]:
"""Input shape: Batch x Time x Channel"""
batch_size = shape_list(hidden_states)[0]
mixed_query_layer = self.q_proj(inputs=hidden_states)
mixed_key_layer = self.k_proj(inputs=hidden_states)
mixed_value_layer = self.v_proj(inputs=hidden_states)
query_layer = self.transpose_for_scores(mixed_query_layer, batch_size)
key_layer = self.transpose_for_scores(mixed_key_layer, batch_size)
value_layer = self.transpose_for_scores(mixed_value_layer, batch_size)
attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
dk = tf.cast(self.sqrt_att_head_size, dtype=attention_scores.dtype)
attention_scores = tf.divide(attention_scores, dk)
if causal_attention_mask is not None:
attention_scores = tf.add(attention_scores, causal_attention_mask)
if attention_mask is not None:
attention_scores = tf.add(attention_scores, attention_mask)
_attention_probs = stable_softmax(logits=attention_scores, axis=-1)
attention_probs = self.dropout(inputs=_attention_probs, training=training)
attention_output = tf.matmul(attention_probs, value_layer)
attention_output = tf.transpose(attention_output, perm=[0, 2, 1, 3])
attention_output = tf.reshape(tensor=attention_output, shape=(batch_size, -1, self.embed_dim))
attention_output = self.out_proj(attention_output, training=training)
outputs = (attention_output, _attention_probs) if output_attentions else (attention_output,)
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "q_proj", None) is not None:
with tf.name_scope(self.q_proj.name):
self.q_proj.build([None, None, self.embed_dim])
if getattr(self, "k_proj", None) is not None:
with tf.name_scope(self.k_proj.name):
self.k_proj.build([None, None, self.embed_dim])
if getattr(self, "v_proj", None) is not None:
with tf.name_scope(self.v_proj.name):
self.v_proj.build([None, None, self.embed_dim])
if getattr(self, "out_proj", None) is not None:
with tf.name_scope(self.out_proj.name):
self.out_proj.build([None, None, self.embed_dim])
class TFCLIPMLP(keras.layers.Layer):
    """Feed-forward block: dense `fc1`, the configured activation, then dense `fc2`."""

    def __init__(self, config: CLIPConfig, **kwargs):
        """
        Args:
            config (`CLIPConfig`):
                Read for `hidden_act`, `initializer_factor`, `hidden_size`, `num_hidden_layers` and
                `intermediate_size`.
            **kwargs: Forwarded to `keras.layers.Layer.__init__`.
        """
        super().__init__(**kwargs)

        self.activation_fn = get_tf_activation(config.hidden_act)

        # Initializer standard deviations: one for the expanding layer, one for the contracting layer.
        scale = config.initializer_factor
        contract_std = (config.hidden_size**-0.5) * ((2 * config.num_hidden_layers) ** -0.5) * scale
        expand_std = (2 * config.hidden_size) ** -0.5 * scale

        self.fc1 = keras.layers.Dense(
            units=config.intermediate_size,
            kernel_initializer=get_initializer(expand_std),
            name="fc1",
        )
        self.fc2 = keras.layers.Dense(
            units=config.hidden_size,
            kernel_initializer=get_initializer(contract_std),
            name="fc2",
        )
        self.config = config

    def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
        """Return `fc2(activation(fc1(hidden_states)))`."""
        expanded = self.fc1(inputs=hidden_states)
        activated = self.activation_fn(expanded)
        return self.fc2(inputs=activated)

    def build(self, input_shape=None):
        """Build `fc1` and `fc2` once, each under a name scope equal to its layer name."""
        if self.built:
            return
        self.built = True

        # (attribute name, size of the last input axis of that dense layer)
        dense_specs = (
            ("fc1", self.config.hidden_size),
            ("fc2", self.config.intermediate_size),
        )
        for attr_name, in_features in dense_specs:
            dense = getattr(self, attr_name, None)
            if dense is None:
                continue
            with tf.name_scope(dense.name):
                dense.build([None, None, in_features])
class TFCLIPEncoderLayer(keras.layers.Layer):
    """One encoder block: layer norm + self-attention and layer norm + MLP, each wrapped in a residual add."""

    def __init__(self, config: CLIPConfig, **kwargs):
        super().__init__(**kwargs)

        self.embed_dim = config.hidden_size
        self.self_attn = TFCLIPAttention(config, name="self_attn")
        self.layer_norm1 = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm1")
        self.mlp = TFCLIPMLP(config, name="mlp")
        self.layer_norm2 = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm2")

    def call(
        self,
        hidden_states: tf.Tensor,
        attention_mask: tf.Tensor,
        causal_attention_mask: tf.Tensor,
        output_attentions: bool,
        training: bool = False,
    ) -> Tuple[tf.Tensor]:
        """
        Args:
            hidden_states (`tf.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`tf.Tensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            causal_attention_mask (`tf.Tensor`): causal attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            output_attentions (`bool`):
                Whether or not to return the attentions tensors of all attention layers. See `outputs` under returned
                tensors for more detail.
            training (`bool`): forwarded to the self-attention sub-layer.

        Returns:
            A tuple whose first item is the block output, followed by whatever the attention sub-layer returned after
            its own first item.
        """
        # Attention sub-block: normalize, attend, add the un-normalized input back.
        attn_results = self.self_attn(
            hidden_states=self.layer_norm1(inputs=hidden_states),
            attention_mask=attention_mask,
            causal_attention_mask=causal_attention_mask,
            output_attentions=output_attentions,
            training=training,
        )
        after_attention = hidden_states + attn_results[0]

        # Feed-forward sub-block, same residual pattern.
        mlp_out = self.mlp(hidden_states=self.layer_norm2(inputs=after_attention))
        block_out = after_attention + mlp_out

        return (block_out,) + attn_results[1:]

    def build(self, input_shape=None):
        """Build the sub-layers once, in declaration order, each under its own name scope."""
        if self.built:
            return
        self.built = True

        # (attribute name, argument passed to that sub-layer's `build`)
        build_plan = (
            ("self_attn", None),
            ("layer_norm1", [None, None, self.embed_dim]),
            ("mlp", None),
            ("layer_norm2", [None, None, self.embed_dim]),
        )
        for attr_name, build_arg in build_plan:
            sublayer = getattr(self, attr_name, None)
            if sublayer is None:
                continue
            with tf.name_scope(sublayer.name):
                sublayer.build(build_arg)
class TFCLIPEncoder(keras.layers.Layer):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`TFCLIPEncoderLayer`].

    Args:
        config: CLIPConfig
    """

    def __init__(self, config: CLIPConfig, **kwargs):
        super().__init__(**kwargs)

        self.layers = [TFCLIPEncoderLayer(config, name=f"layers_._{i}") for i in range(config.num_hidden_layers)]

    def call(
        self,
        hidden_states: tf.Tensor,
        attention_mask: tf.Tensor,
        causal_attention_mask: tf.Tensor,
        output_attentions: bool,
        output_hidden_states: bool,
        return_dict: bool,
        training: bool = False,
    ) -> Union[TFBaseModelOutput, Tuple[tf.Tensor]]:
        """
        Run `hidden_states` through every encoder layer in order.

        When `output_hidden_states` is truthy, the input of every layer plus the final output are collected. When
        `output_attentions` is truthy, item 1 of every layer's output is collected.

        Returns:
            A `TFBaseModelOutput` when `return_dict` is truthy, otherwise a tuple of the non-`None` values among
            `(last hidden state, collected hidden states, collected attentions)`.
        """
        states_trace = () if output_hidden_states else None
        attn_trace = () if output_attentions else None

        for encoder_block in self.layers:
            if output_hidden_states:
                states_trace += (hidden_states,)

            block_out = encoder_block(
                hidden_states=hidden_states,
                attention_mask=attention_mask,
                causal_attention_mask=causal_attention_mask,
                output_attentions=output_attentions,
                training=training,
            )
            hidden_states = block_out[0]

            if output_attentions:
                attn_trace += (block_out[1],)

        # The output of the last layer closes the hidden-state trace.
        if output_hidden_states:
            states_trace += (hidden_states,)

        if return_dict:
            return TFBaseModelOutput(
                last_hidden_state=hidden_states, hidden_states=states_trace, attentions=attn_trace
            )
        return tuple(item for item in (hidden_states, states_trace, attn_trace) if item is not None)

    def build(self, input_shape=None):
        """Build every encoder layer once, each under a name scope equal to its layer name."""
        if self.built:
            return
        self.built = True

        stacked = getattr(self, "layers", None)
        if stacked is None:
            return
        for encoder_block in stacked:
            with tf.name_scope(encoder_block.name):
                encoder_block.build(None)
class TFCLIPTextTransformer(keras.layers.Layer):
    """Text tower: token/position embeddings, causal encoder stack, final layer norm and EOS-based pooling."""

    def __init__(self, config: CLIPTextConfig, **kwargs):
        super().__init__(**kwargs)

        self.embeddings = TFCLIPTextEmbeddings(config, name="embeddings")
        self.encoder = TFCLIPEncoder(config, name="encoder")
        self.final_layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="final_layer_norm")

        # Needed in `call` to locate the pooled position.
        self.eos_token_id = config.eos_token_id
        self.embed_dim = config.hidden_size

    def call(
        self,
        input_ids: TFModelInputType,
        attention_mask: tf.Tensor,
        position_ids: tf.Tensor,
        output_attentions: bool,
        output_hidden_states: bool,
        return_dict: bool,
        training: bool = False,
    ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]:
        """
        Encode `input_ids` and pool one hidden state per sequence.

        Args:
            input_ids: Token ids; unpacked below as `(batch_size, seq_length)`.
            attention_mask: Padding mask, passed through `_expand_mask` before reaching the encoder.
            position_ids: Forwarded to the embedding layer.
            output_attentions / output_hidden_states / return_dict / training: Forwarded to the encoder.

        Returns:
            `TFBaseModelOutputWithPooling` when `return_dict` is truthy, otherwise
            `(sequence_output, pooled_output) + encoder_outputs[1:]`. `sequence_output` is the encoder output after
            `final_layer_norm`.
        """
        input_shape = shape_list(input_ids)

        embedding_output = self.embeddings(input_ids=input_ids, position_ids=position_ids)

        batch_size, seq_length = input_shape
        # CLIP's text model uses a causal mask, prepared here in the dtype of the embeddings.
        causal_attention_mask = self._build_causal_attention_mask(batch_size, seq_length, dtype=embedding_output.dtype)

        # NOTE(review): presumably turns the padding mask into an additive mask of shape
        # (batch, 1, tgt_len, src_len) - confirm against `_expand_mask`.
        attention_mask = _expand_mask(attention_mask)

        encoder_outputs = self.encoder(
            hidden_states=embedding_output,
            attention_mask=attention_mask,
            causal_attention_mask=causal_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        sequence_output = encoder_outputs[0]
        sequence_output = self.final_layer_norm(inputs=sequence_output)

        if self.eos_token_id == 2:
            # Pool at the position holding the largest token id in each sequence.
            # NOTE(review): presumably a legacy-config path in which the end-of-text token has the highest id in
            # the vocabulary - confirm before changing.
            pooled_output = tf.gather_nd(
                params=sequence_output,
                indices=tf.stack(
                    values=(tf.range(input_shape[0], dtype=tf.int64), tf.math.argmax(input_ids, axis=-1)), axis=1
                ),
            )
        else:
            # Pool at the first position whose id equals `eos_token_id`: argmax over the 0/1 match vector returns
            # the first index holding the maximum.
            pooled_output = tf.gather_nd(
                params=sequence_output,
                indices=tf.stack(
                    values=(
                        tf.range(input_shape[0], dtype=tf.int64),
                        tf.math.argmax(tf.cast(input_ids == self.eos_token_id, dtype=tf.int8), axis=-1),
                    ),
                    axis=1,
                ),
            )

        if not return_dict:
            return (sequence_output, pooled_output) + encoder_outputs[1:]

        return TFBaseModelOutputWithPooling(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )

    def _build_causal_attention_mask(self, batch_size, seq_length, dtype=tf.float32):
        """
        Return an additive mask of shape `(batch_size, 1, seq_length, seq_length)` holding -10000.0 strictly above
        the main diagonal and 0 everywhere else.
        """
        # Built in two steps because the diagonal handling differs between eager and graph mode: `band_part`
        # keeps the diagonal, so it is overwritten with zeros explicitly afterwards.
        diag = tf.cast(tf.fill((seq_length,), 0.0), dtype)

        # Start from a matrix filled with the masking value ...
        to_mask = tf.cast(tf.fill((seq_length, seq_length), -10000.0), dtype)

        # ... keep only the upper triangle (diagonal included), zeroing the lower triangle ...
        to_mask = tf.linalg.band_part(to_mask, 0, -1)
        # ... and reset the diagonal so every position can attend to itself.
        to_mask = tf.linalg.set_diag(to_mask, diagonal=diag)

        return tf.broadcast_to(input=to_mask, shape=(batch_size, 1, seq_length, seq_length))

    def build(self, input_shape=None):
        """Build embeddings, encoder and final layer norm once, each under its own name scope."""
        if self.built:
            return
        self.built = True
        if getattr(self, "embeddings", None) is not None:
            with tf.name_scope(self.embeddings.name):
                self.embeddings.build(None)
        if getattr(self, "encoder", None) is not None:
            with tf.name_scope(self.encoder.name):
                self.encoder.build(None)
        if getattr(self, "final_layer_norm", None) is not None:
            with tf.name_scope(self.final_layer_norm.name):
                self.final_layer_norm.build([None, None, self.embed_dim])
@keras_serializable
class TFCLIPTextMainLayer(keras.layers.Layer):
    """Keras-serializable wrapper that owns a `TFCLIPTextTransformer` under the name `text_model`."""

    config_class = CLIPTextConfig

    def __init__(self, config: CLIPTextConfig, **kwargs):
        super().__init__(**kwargs)
        self.config = config
        self.text_model = TFCLIPTextTransformer(config, name="text_model")

    def get_input_embeddings(self) -> keras.layers.Layer:
        """Return the embedding layer of the wrapped text model."""
        return self.text_model.embeddings

    def set_input_embeddings(self, value: tf.Variable):
        """Replace the embedding weight and record its first dimension as the new `vocab_size`."""
        embedding_layer = self.text_model.embeddings
        embedding_layer.weight = value
        embedding_layer.vocab_size = shape_list(value)[0]

    @unpack_inputs
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        position_ids: np.ndarray | tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: bool = False,
    ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]:
        """
        Validate the inputs, default the attention mask to all ones and delegate to `text_model`.

        Raises:
            ValueError: If `input_ids` is `None`.
        """
        if input_ids is None:
            raise ValueError("You have to specify input_ids")

        if attention_mask is None:
            # No mask supplied: attend to every position.
            attention_mask = tf.fill(dims=shape_list(input_ids), value=1)

        return self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

    def build(self, input_shape=None):
        """Build the wrapped text model once, under a name scope equal to its layer name."""
        if self.built:
            return
        self.built = True

        inner = getattr(self, "text_model", None)
        if inner is not None:
            with tf.name_scope(inner.name):
                inner.build(None)
class TFCLIPVisionTransformer(keras.layers.Layer):
    """Vision tower: patch embeddings, pre layer norm, encoder stack and a layer-normed first-token pooling."""

    def __init__(self, config: CLIPVisionConfig, **kwargs):
        super().__init__(**kwargs)

        self.embeddings = TFCLIPVisionEmbeddings(config, name="embeddings")
        # NOTE(review): the layer name "pre_layrnorm" looks misspelled, but it is a runtime name and is
        # presumably expected by existing checkpoints - do not correct it without confirming.
        self.pre_layernorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="pre_layrnorm")
        self.encoder = TFCLIPEncoder(config, name="encoder")
        self.post_layernorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="post_layernorm")
        self.embed_dim = config.hidden_size

    def call(
        self,
        pixel_values: TFModelInputType,
        output_attentions: bool,
        output_hidden_states: bool,
        return_dict: bool,
        training: bool = False,
    ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]:
        """
        Encode `pixel_values` and pool the hidden state at sequence index 0.

        Returns:
            `TFBaseModelOutputWithPooling` when `return_dict` is truthy, otherwise
            `(sequence_output, pooled_output) + encoder_outputs[1:]`. `sequence_output` is the raw encoder output;
            only `pooled_output` goes through `post_layernorm`.
        """
        embedding_output = self.embeddings(pixel_values=pixel_values)
        embedding_output = self.pre_layernorm(inputs=embedding_output)

        # The vision encoder runs without any attention mask.
        encoder_outputs = self.encoder(
            hidden_states=embedding_output,
            attention_mask=None,
            causal_attention_mask=None,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        sequence_output = encoder_outputs[0]
        # Pool by taking the first position along the sequence axis.
        pooled_output = sequence_output[:, 0, :]
        pooled_output = self.post_layernorm(inputs=pooled_output)

        if not return_dict:
            return (sequence_output, pooled_output) + encoder_outputs[1:]

        return TFBaseModelOutputWithPooling(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )

    def build(self, input_shape=None):
        """Build all sub-layers once, each under a name scope equal to its layer name."""
        if self.built:
            return
        self.built = True
        if getattr(self, "embeddings", None) is not None:
            with tf.name_scope(self.embeddings.name):
                self.embeddings.build(None)
        if getattr(self, "pre_layernorm", None) is not None:
            with tf.name_scope(self.pre_layernorm.name):
                self.pre_layernorm.build([None, None, self.embed_dim])
        if getattr(self, "encoder", None) is not None:
            with tf.name_scope(self.encoder.name):
                self.encoder.build(None)
        if getattr(self, "post_layernorm", None) is not None:
            with tf.name_scope(self.post_layernorm.name):
                # Rank-2 shape: this norm only ever sees the pooled (sequence axis removed) tensor.
                self.post_layernorm.build([None, self.embed_dim])
@keras_serializable
class TFCLIPVisionMainLayer(keras.layers.Layer):
    """Keras-serializable wrapper that owns a `TFCLIPVisionTransformer` under the name `vision_model`."""

    config_class = CLIPVisionConfig

    def __init__(self, config: CLIPVisionConfig, **kwargs):
        super().__init__(**kwargs)
        self.config = config
        self.vision_model = TFCLIPVisionTransformer(config, name="vision_model")

    def get_input_embeddings(self) -> keras.layers.Layer:
        """Return the embedding layer of the wrapped vision model."""
        return self.vision_model.embeddings

    @unpack_inputs
    def call(
        self,
        pixel_values: TFModelInputType | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: bool = False,
    ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]:
        """
        Validate the input and delegate to `vision_model`.

        Raises:
            ValueError: If `pixel_values` is `None`.
        """
        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        return self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

    def build(self, input_shape=None):
        """Build the wrapped vision model once, under a name scope equal to its layer name."""
        if self.built:
            return
        self.built = True

        inner = getattr(self, "vision_model", None)
        if inner is not None:
            with tf.name_scope(inner.name):
                inner.build(None)
@keras_serializable
class TFCLIPMainLayer(keras.layers.Layer):
    """
    Joint text/vision layer: runs both towers, projects their pooled outputs to `projection_dim` and produces
    scaled similarity logits between every text and every image of the batch.
    """

    config_class = CLIPConfig

    def __init__(self, config: CLIPConfig, **kwargs):
        """
        Args:
            config (`CLIPConfig`): Must carry a `CLIPTextConfig` in `text_config` and a `CLIPVisionConfig` in
                `vision_config`.
            **kwargs: Forwarded to `keras.layers.Layer.__init__`.

        Raises:
            ValueError: If either sub-config has the wrong type.
        """
        super().__init__(**kwargs)

        if not isinstance(config.text_config, CLIPTextConfig):
            raise ValueError(
                "config.text_config is expected to be of type CLIPTextConfig but is of type"
                f" {type(config.text_config)}."
            )

        if not isinstance(config.vision_config, CLIPVisionConfig):
            raise ValueError(
                "config.vision_config is expected to be of type CLIPVisionConfig but is of type"
                f" {type(config.vision_config)}."
            )

        self.config = config

        text_config = config.text_config
        vision_config = config.vision_config

        self.projection_dim = config.projection_dim

        self.text_model = TFCLIPTextTransformer(text_config, name="text_model")
        self.vision_model = TFCLIPVisionTransformer(vision_config, name="vision_model")

        # Bias-free linear maps from each tower's hidden size to the shared projection size.
        self.visual_projection = keras.layers.Dense(
            units=self.projection_dim,
            kernel_initializer=get_initializer(vision_config.hidden_size**-0.5 * self.config.initializer_factor),
            use_bias=False,
            name="visual_projection",
        )

        self.text_projection = keras.layers.Dense(
            units=self.projection_dim,
            kernel_initializer=get_initializer(text_config.hidden_size**-0.5 * self.config.initializer_factor),
            use_bias=False,
            name="text_projection",
        )
        self.text_embed_dim = text_config.hidden_size
        self.vision_embed_dim = vision_config.hidden_size

    def build(self, input_shape: tf.TensorShape = None):
        """Create the `logit_scale` weight and build all sub-layers; a no-op once the layer is built."""
        # The guard comes first so that a repeated `build` call cannot register `logit_scale` a second time.
        if self.built:
            return

        # Stored in log space: `call` applies `exp` before using it.
        self.logit_scale = self.add_weight(
            shape=(1,),
            initializer=keras.initializers.Constant(self.config.logit_scale_init_value),
            trainable=True,
            name="logit_scale",
        )
        self.built = True

        if getattr(self, "text_model", None) is not None:
            with tf.name_scope(self.text_model.name):
                self.text_model.build(None)
        if getattr(self, "vision_model", None) is not None:
            with tf.name_scope(self.vision_model.name):
                self.vision_model.build(None)
        if getattr(self, "visual_projection", None) is not None:
            with tf.name_scope(self.visual_projection.name):
                self.visual_projection.build([None, None, self.vision_embed_dim])
        if getattr(self, "text_projection", None) is not None:
            with tf.name_scope(self.text_projection.name):
                self.text_projection.build([None, None, self.text_embed_dim])

    @unpack_inputs
    def get_text_features(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        position_ids: np.ndarray | tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: bool = False,
    ) -> tf.Tensor:
        """
        Return the text tower's pooled output after `text_projection` (not normalized).

        Raises:
            ValueError: If `input_ids` is `None`.
        """
        if input_ids is None:
            raise ValueError("You have to specify either input_ids")

        input_shape = shape_list(input_ids)

        if attention_mask is None:
            # No mask supplied: attend to every position.
            attention_mask = tf.fill(dims=input_shape, value=1)

        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        # Index 1 is the pooled output in both the tuple and the model-output form.
        pooled_output = text_outputs[1]
        text_features = self.text_projection(inputs=pooled_output)

        return text_features

    @unpack_inputs
    def get_image_features(
        self,
        pixel_values: TFModelInputType | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: bool = False,
    ) -> tf.Tensor:
        """
        Return the vision tower's pooled output after `visual_projection` (not normalized).

        Raises:
            ValueError: If `pixel_values` is `None`.
        """
        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        # Index 1 is the pooled output.
        pooled_output = vision_outputs[1]
        image_features = self.visual_projection(inputs=pooled_output)

        return image_features

    @unpack_inputs
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        pixel_values: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        position_ids: np.ndarray | tf.Tensor | None = None,
        return_loss: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: bool = False,
    ) -> Union[TFCLIPOutput, Tuple[tf.Tensor]]:
        """
        Run both towers and compute text/image similarity logits.

        Both `input_ids` and `pixel_values` are required. The projected pooled outputs are L2-normalized along
        the last axis, multiplied together and scaled by `exp(logit_scale)`.

        Returns:
            `TFCLIPOutput` when `return_dict` is truthy. Otherwise the tuple `(logits_per_image, logits_per_text,
            text_embeds, image_embeds, text_outputs, vision_outputs)`, prefixed with the loss when `return_loss`
            is truthy.

        Raises:
            ValueError: If `input_ids` or `pixel_values` is `None`.
        """
        if input_ids is None:
            raise ValueError("You have to specify either input_ids")
        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        input_shape = shape_list(input_ids)

        if attention_mask is None:
            # No mask supplied: attend to every position.
            attention_mask = tf.fill(dims=input_shape, value=1)

        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        image_embeds = vision_outputs[1]
        image_embeds = self.visual_projection(inputs=image_embeds)

        text_embeds = text_outputs[1]
        text_embeds = self.text_projection(inputs=text_embeds)

        # Normalize so that the matmul below yields cosine similarities.
        image_embeds = image_embeds / tf.norm(tensor=image_embeds, ord="euclidean", axis=-1, keepdims=True)
        text_embeds = text_embeds / tf.norm(tensor=text_embeds, ord="euclidean", axis=-1, keepdims=True)

        # Cosine similarity as logits; the image view is the transpose of the text view.
        logit_scale = tf.math.exp(self.logit_scale)
        logits_per_text = tf.matmul(text_embeds, image_embeds, transpose_b=True) * logit_scale
        logits_per_image = tf.transpose(logits_per_text)

        loss = None
        if return_loss:
            loss = clip_loss(logits_per_text)
            loss = tf.reshape(loss, (1,))

        if not return_dict:
            output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs)
            return (loss,) + output if loss is not None else output

        return TFCLIPOutput(
            loss=loss,
            logits_per_image=logits_per_image,
            logits_per_text=logits_per_text,
            text_embeds=text_embeds,
            image_embeds=image_embeds,
            text_model_output=text_outputs,
            vision_model_output=vision_outputs,
        )
# Argument documentation for the text model's forward pass. `{0}` is a placeholder for the input shape and is
# filled in with `str.format` where the docstring is attached.
CLIP_TEXT_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]`, `Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and
            [`PreTrainedTokenizer.encode`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        position_ids (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
            config will be used instead.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
            used instead.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in
            eager mode, in graph mode the value will always be set to True.
        training (`bool`, *optional*, defaults to `False`):
            Whether or not to use the model in training mode (some modules like dropout modules have different
            behaviors between training and evaluation).
"""
# Notes on the arguments documented above:
# - input_ids: indices of the input sequence tokens in the vocabulary; may be an np.ndarray, tf.Tensor,
#   List[tf.Tensor], Dict[str, tf.Tensor] or Dict[str, np.ndarray]; every example must have the shape ({0}).
# - attention_mask: optional mask that prevents attention on padding token indices; values are chosen in [0, 1]:
#   - 1 marks a token that is not masked,
#   - 0 marks a token that is masked.
# - position_ids: optional position index of every input token in the position embeddings, selected in the
#   range [0, config.max_position_embeddings - 1].
# - output_attentions: optional; whether to return the attention tensors of all attention layers (see
#   `attentions` in the returned tensors). Only honoured in eager mode; graph mode uses the config value.
# - output_hidden_states: optional; whether to return the hidden states of all layers (see `hidden_states` in
#   the returned tensors). Only honoured in eager mode; graph mode uses the config value.
# - return_dict: optional; whether to return a [`~utils.ModelOutput`] instead of a plain tuple. Usable in eager
#   mode; always True in graph mode.
# - training: optional, defaults to `False`; whether the model runs in training mode (some modules such as
#   dropout behave differently between training and evaluation).
# CLIP_VISION_INPUTS_DOCSTRING is a raw string that documents the input arguments of the CLIP vision model.
CLIP_VISION_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]`, `Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
            [`CLIPImageProcessor.__call__`] for details.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
            config will be used instead.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
            used instead.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in
            eager mode, in graph mode the value will always be set to True.
        training (`bool`, *optional*, defaults to `False`):
            Whether or not to use the model in training mode (some modules like dropout modules have different
            behaviors between training and evaluation).
"""
# CLIP_INPUTS_DOCSTRING is a raw string that documents the input arguments of the combined CLIP model; unlike
# CLIP_VISION_INPUTS_DOCSTRING it covers both the text and the image inputs. `{0}` is a placeholder for the shape
# of the text inputs.
#
# Notes on the arguments documented below:
# - input_ids: token indices of the input sequence; may be an np.ndarray, tf.Tensor, List[tf.Tensor],
#   Dict[str, tf.Tensor] or Dict[str, np.ndarray]; every example must have the shape ({0}).
# - pixel_values: pixel values, accepted in the same container types; every example must have the shape
#   (batch_size, num_channels, height, width).
# - attention_mask: optional mask that prevents attention on padding token indices.
# - position_ids: optional position index of every input token in the position embeddings.
# - return_loss: optional; whether to return the contrastive loss.
# - output_attentions: optional; whether to return the attention tensors of all attention layers (eager mode).
# - output_hidden_states: optional; whether to return the hidden states of all layers (eager mode).
# - return_dict: optional; whether to return a `~utils.ModelOutput` instead of a plain tuple. Usable in eager
#   mode; always True in graph mode.
# - training: optional; whether to run the model in training mode (some modules such as dropout behave
#   differently between training and evaluation).
CLIP_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]`, `Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and
            [`PreTrainedTokenizer.encode`] for details.

            [What are input IDs?](../glossary#input-ids)
        pixel_values (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]`, `Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
            [`CLIPImageProcessor.__call__`] for details.
        attention_mask (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        position_ids (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
            config will be used instead.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
            used instead.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in
            eager mode, in graph mode the value will always be set to True.
        training (`bool`, *optional*, defaults to `False`):
            Whether or not to use the model in training mode (some modules like dropout modules have different
            behaviors between training and evaluation).
"""

# Define TFCLIPTextModel class inheriting from TFCLIPPreTrainedModel.
class TFCLIPTextModel(TFCLIPPreTrainedModel):
# Specify the configuration class for text CLIP
config_class = CLIPTextConfig
def __init__(self, config: CLIPTextConfig, *inputs, **kwargs):
"""
Initialize TFCLIPTextModel.
Args:
config (CLIPTextConfig): Model configuration object.
*inputs: Variable length input arguments.
**kwargs: Keyword arguments for additional configuration.
"""
# Call superclass initialization
super().__init__(config, *inputs, **kwargs)
# Initialize the main CLIP text layer
self.clip = TFCLIPTextMainLayer(config, name="clip")
@unpack_inputs
@add_start_docstrings_to_model_forward(CLIP_TEXT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=TFBaseModelOutputWithPooling, config_class=CLIPTextConfig)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: Optional[bool] = False,
) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]:
"""
Perform the forward pass of the model.
Args:
input_ids (TFModelInputType, optional): Input tensor of token ids.
attention_mask (np.ndarray or tf.Tensor, optional): Attention mask for masking padded tokens.
position_ids (np.ndarray or tf.Tensor, optional): Position indices for the input tokens.
output_attentions (bool, optional): Whether to output attentions.
output_hidden_states (bool, optional): Whether to output hidden states.
return_dict (bool, optional): Whether to return a dictionary.
training (bool, optional): Whether the model is in training mode.
Returns:
TFBaseModelOutputWithPooling or Tuple[tf.Tensor]: Model outputs.
Examples:
Example usage of the model:
```
>>> from transformers import AutoTokenizer, TFCLIPTextModel
>>> model = TFCLIPTextModel.from_pretrained("openai/clip-vit-base-patch32")
>>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")
>>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="tf")
>>> outputs = model(**inputs)
>>> last_hidden_state = outputs.last_hidden_state
>>> pooled_output = outputs.pooler_output
```
"""
# Forward pass through the CLIP model
outputs = self.clip(
input_ids=input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
return outputs
def build(self, input_shape=None):
    """
    Build the wrapped `clip` layer once, inside a TensorFlow name scope named after that layer.

    The `built` flag is checked first and set before any sub-layer is built, so repeated calls
    return immediately.

    Args:
        input_shape: Accepted for signature compatibility; this method never reads it.
    """
    if self.built:
        return
    self.built = True

    clip_layer = getattr(self, "clip", None)
    if clip_layer is None:
        # No main layer attached: marking the model as built is all there is to do.
        return
    with tf.name_scope(clip_layer.name):
        clip_layer.build(None)
"""
Define TFCLIPVisionModel class inheriting from TFCLIPPreTrainedModel.
"""
class TFCLIPVisionModel(TFCLIPPreTrainedModel):
    """Vision-only CLIP model: a thin wrapper that delegates to a `TFCLIPVisionMainLayer` stored as `self.clip`."""

    # Configuration class associated with this model.
    config_class = CLIPVisionConfig
    # Name of the model's primary input.
    main_input_name = "pixel_values"

    def __init__(self, config: CLIPVisionConfig, *inputs, **kwargs):
        """
        Initialize the base model, then attach the vision main layer.

        Args:
            config (CLIPVisionConfig): Model configuration object; also handed to the main layer.
            *inputs: Extra positional arguments passed through to the parent initializer.
            **kwargs: Extra keyword arguments passed through to the parent initializer.
        """
        super().__init__(config, *inputs, **kwargs)
        self.clip = TFCLIPVisionMainLayer(config, name="clip")

    @unpack_inputs
    @add_start_docstrings_to_model_forward(CLIP_VISION_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=TFBaseModelOutputWithPooling, config_class=CLIPVisionConfig)
    def call(
        self,
        pixel_values: TFModelInputType | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: Optional[bool] = False,
    ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]:
        r"""
        Run the vision model by delegating to the wrapped `self.clip` layer.

        Every argument is forwarded by keyword, unchanged, and the layer's result is returned as-is.

        Args:
            pixel_values (TFModelInputType, optional): Input tensor of pixel values.
            output_attentions (bool, optional): Whether to output attentions.
            output_hidden_states (bool, optional): Whether to output hidden states.
            return_dict (bool, optional): Whether to return a dictionary.
            training (bool, optional): Whether the model is in training mode.

        Returns:
            TFBaseModelOutputWithPooling or Tuple[tf.Tensor]: Model outputs.

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, TFCLIPVisionModel

        >>> model = TFCLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32")
        >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="tf")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output
        ```"""
        # Single forward path. The excerpt previously carried a second, orphaned copy of this
        # method's tail (signature remnant + identical body); it has been folded into this one.
        outputs = self.clip(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
        return outputs

    def build(self, input_shape=None):
        """
        Build the wrapped `clip` layer once, inside a TensorFlow name scope named after that layer.

        Args:
            input_shape: Accepted for signature compatibility; this method never reads it.
        """
        # Return immediately if the model has already been built.
        if self.built:
            return
        # Mark the model as built before building the sub-layer.
        self.built = True
        # Build the main layer, if present, under its own name scope.
        if getattr(self, "clip", None) is not None:
            with tf.name_scope(self.clip.name):
                self.clip.build(None)
# 使用装饰器为类添加文档字符串,使用CLIP_START_DOCSTRING作为模板
@add_start_docstrings(CLIP_START_DOCSTRING)
class TFCLIPModel(TFCLIPPreTrainedModel):
# 设置配置类为CLIPConfig
config_class = CLIPConfig
# 初始化方法,接受配置对象config和任意额外输入
def __init__(self, config: CLIPConfig, *inputs, **kwargs):
    """
    Initialize the base model, then attach the CLIP main layer as `self.clip`.

    Args:
        config (CLIPConfig): Model configuration; passed to the parent initializer and to the main layer.
        *inputs: Extra positional arguments passed through to the parent initializer.
        **kwargs: Extra keyword arguments passed through to the parent initializer.
    """
    super().__init__(config, *inputs, **kwargs)

    # The main layer is created with the fixed name "clip".
    clip_layer = TFCLIPMainLayer(config, name="clip")
    self.clip = clip_layer
# 使用装饰器解包输入并添加文档字符串,使用CLIP_TEXT_INPUTS_DOCSTRING作为模板
@unpack_inputs
@add_start_docstrings_to_model_forward(CLIP_TEXT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
def get_text_features(
    self,
    input_ids: TFModelInputType | None = None,
    attention_mask: np.ndarray | tf.Tensor | None = None,
    position_ids: np.ndarray | tf.Tensor | None = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
    training: bool = False,
) -> tf.Tensor:
    r"""
    Compute text embeddings by delegating to `self.clip.get_text_features`.

    Every argument, including `training`, is forwarded by keyword.

    Returns:
        text_features (`tf.Tensor` of shape `(batch_size, output_dim)`): The text embeddings obtained by applying
        the projection layer to the pooled output of [`TFCLIPTextModel`].

    Examples:

    ```python
    >>> from transformers import AutoTokenizer, TFCLIPModel

    >>> model = TFCLIPModel.from_pretrained("openai/clip-vit-base-patch32")
    >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")

    >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="tf")
    >>> text_features = model.get_text_features(**inputs)
    ```"""
    # `training` used to be accepted by this method but never passed on, which made the flag a
    # silent no-op for callers. It is now forwarded like the other arguments.
    # NOTE(review): relies on the main layer's `get_text_features` accepting `training`; that
    # method is defined outside this excerpt -- confirm against its signature.
    text_features = self.clip.get_text_features(
        input_ids=input_ids,
        attention_mask=attention_mask,
        position_ids=position_ids,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
        training=training,
    )
    return text_features
# 使用装饰器解包输入并添加文档字符串,使用CLIP_VISION_INPUTS_DOCSTRING作为模板
@unpack_inputs
@add_start_docstrings_to_model_forward(CLIP_VISION_INPUTS_DOCSTRING)
def get_image_features(
    self,
    pixel_values: TFModelInputType | None = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
    training: bool = False,
) -> tf.Tensor:
    r"""
    Compute image embeddings by delegating to `self.clip.get_image_features`.

    Every argument, including `training`, is forwarded by keyword.

    Returns:
        image_features (`tf.Tensor` of shape `(batch_size, output_dim)`): The image embeddings obtained by applying
        the projection layer to the pooled output of [`TFCLIPVisionModel`].

    Examples:

    ```python
    >>> from PIL import Image
    >>> import requests
    >>> from transformers import AutoProcessor, TFCLIPModel

    >>> model = TFCLIPModel.from_pretrained("openai/clip-vit-base-patch32")
    >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

    >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    >>> image = Image.open(requests.get(url, stream=True).raw)

    >>> inputs = processor(images=image, return_tensors="tf")

    >>> image_features = model.get_image_features(**inputs)
    ```"""
    # `training` used to be accepted by this method but never passed on, which made the flag a
    # silent no-op for callers. It is now forwarded like the other arguments.
    # NOTE(review): relies on the main layer's `get_image_features` accepting `training`; that
    # method is defined outside this excerpt -- confirm against its signature.
    image_features = self.clip.get_image_features(
        pixel_values=pixel_values,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
        training=training,
    )
    return image_features
@unpack_inputs
@add_start_docstrings_to_model_forward(CLIP_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=TFCLIPOutput, config_class=CLIPConfig)
def call(
self,
input_ids: TFModelInputType | None = None,
pixel_values: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
return_loss: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: bool = False,
def serving_output(self, output: TFCLIPOutput) -> TFCLIPOutput:
    """
    Return the model output to be used for serving; the object is handed back unchanged.

    Args:
        output (TFCLIPOutput): The model output to prepare for serving.

    Returns:
        TFCLIPOutput: The very same `output` object that was passed in.
    """
    # TODO: As is, this currently fails with saved_model=True, because TensorFlow cannot trace
    # through nested dataclasses.
    # Reference: https://github.com/huggingface/transformers/pull/16886
    return output
def build(self, input_shape=None):
    """
    Build the wrapped `clip` layer once, inside a TensorFlow name scope named after that layer.

    The `built` flag is checked first and set before any sub-layer is built, so repeated calls
    return immediately.

    Args:
        input_shape: Accepted for signature compatibility (defaults to None); never read here.
    """
    if self.built:
        return
    self.built = True

    clip_layer = getattr(self, "clip", None)
    if clip_layer is None:
        # No main layer attached: marking the model as built is all there is to do.
        return
    with tf.name_scope(clip_layer.name):
        clip_layer.build(None)