Transformers 源码解析（二十五）

`.\models\clap\processing_clap.py`

"""
Audio/Text processor class for CLAP
"""

from ...processing_utils import ProcessorMixin  # 导入ProcessorMixin，用于处理混合功能
from ...tokenization_utils_base import BatchEncoding  # 导入BatchEncoding，用于批量编码

class ClapProcessor(ProcessorMixin):
    r"""
    Bundles a CLAP feature extractor and a RoBERTa tokenizer behind a single processor object.

    [`ClapProcessor`] exposes the functionality of both [`ClapFeatureExtractor`] and [`RobertaTokenizerFast`]. The
    methods defined here, [`~ClapProcessor.decode`] and [`~ClapProcessor.batch_decode`], simply forward to the
    tokenizer.

    Args:
        feature_extractor ([`ClapFeatureExtractor`]):
            The audio feature extractor. Required.
        tokenizer ([`RobertaTokenizerFast`]):
            The tokenizer. Required.
    """

    # Names of the component classes this processor wraps.
    feature_extractor_class = "ClapFeatureExtractor"
    tokenizer_class = ("RobertaTokenizer", "RobertaTokenizerFast")

    def __init__(self, feature_extractor, tokenizer):
        # Both components are handed to the base class positionally, feature extractor first.
        super().__init__(feature_extractor, tokenizer)

    def batch_decode(self, *args, **kwargs):
        """
        Forward every argument unchanged to the wrapped tokenizer's `batch_decode` and return its result. See the
        tokenizer's own docstring ([`~PreTrainedTokenizer.batch_decode`]) for details.
        """
        decoded_batch = self.tokenizer.batch_decode(*args, **kwargs)
        return decoded_batch

    def decode(self, *args, **kwargs):
        """
        Forward every argument unchanged to the wrapped tokenizer's `decode` and return its result. See the
        tokenizer's own docstring ([`~PreTrainedTokenizer.decode`]) for details.
        """
        decoded = self.tokenizer.decode(*args, **kwargs)
        return decoded

    @property
    def model_input_names(self):
        """Tokenizer input names followed by feature-extractor input names, with duplicates removed."""
        combined_names = self.tokenizer.model_input_names + self.feature_extractor.model_input_names
        # dict.fromkeys keeps the first occurrence of each name and preserves order.
        return list(dict.fromkeys(combined_names))

`.\models\clap\__init__.py`

# 引入必要的模块和类型检查功能
from typing import TYPE_CHECKING
# 引入自定义的异常和懒加载模块
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available

# Import structure: maps each submodule name to the public names it provides (configuration and processor entries)
_import_structure = {
    "configuration_clap": [
        "CLAP_PRETRAINED_MODEL_ARCHIVE_LIST",
        "ClapAudioConfig",
        "ClapConfig",
        "ClapTextConfig",
    ],
    "processing_clap": ["ClapProcessor"],
}

# Guarded torch check: OptionalDependencyNotAvailable is raised when torch is missing and is caught and ignored right away
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # torch is available: also register the modeling and feature-extraction submodules
    _import_structure["modeling_clap"] = [
        "CLAP_PRETRAINED_MODEL_ARCHIVE_LIST",
        "ClapModel",
        "ClapPreTrainedModel",
        "ClapTextModel",
        "ClapTextModelWithProjection",
        "ClapAudioModel",
        "ClapAudioModelWithProjection",
    ]
    _import_structure["feature_extraction_clap"] = ["ClapFeatureExtractor"]

# Under static type checking, import the symbols eagerly (the torch-dependent ones only when torch is available)
if TYPE_CHECKING:
    from .configuration_clap import (
        CLAP_PRETRAINED_MODEL_ARCHIVE_LIST,
        ClapAudioConfig,
        ClapConfig,
        ClapTextConfig,
    )
    from .processing_clap import ClapProcessor

    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .feature_extraction_clap import ClapFeatureExtractor
        from .modeling_clap import (
            CLAP_PRETRAINED_MODEL_ARCHIVE_LIST,
            ClapAudioModel,
            ClapAudioModelWithProjection,
            ClapModel,
            ClapPreTrainedModel,
            ClapTextModel,
            ClapTextModelWithProjection,
        )

# At runtime, replace this module's entry in sys.modules with a _LazyModule built from _import_structure
else:
    import sys

    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)

`.\models\clip\configuration_clip.py`

# 设置文件编码为 UTF-8
# 版权声明和许可信息
# 根据 Apache 许可证 2.0 版本，许可文件的链接
# 如果符合许可证的条件，可以使用该文件，否则禁止使用
""" CLIP 模型配置"""

# 导入标准库和模块
import os
from collections import OrderedDict
from typing import TYPE_CHECKING, Any, Mapping, Optional, Union

# 如果在类型检查模式下
if TYPE_CHECKING:
    # 导入处理工具混合类和张量类型
    from ...processing_utils import ProcessorMixin
    from ...utils import TensorType

# 导入配置工具类和 ONNX 配置
from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfig
from ...utils import logging

# Module-level logger
logger = logging.get_logger(__name__)

# Maps pretrained CLIP checkpoint names to the URL of their config.json
CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "openai/clip-vit-base-patch32": "https://huggingface.co/openai/clip-vit-base-patch32/resolve/main/config.json",
    # See all CLIP models at https://huggingface.co/models?filter=clip
}

# CLIPTextConfig 类，继承自 PretrainedConfig
class CLIPTextConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`CLIPTextModel`]. It is used to instantiate a
    CLIP text encoder according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a configuration similar to that of the text encoder of the CLIP
    [openai/clip-vit-base-patch32](https://huggingface.co/openai/clip-vit-base-patch32) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Every constructor argument except the three token ids is stored unchanged as an attribute of the same name. The
    token ids (`pad_token_id`, `bos_token_id`, `eos_token_id`) and any extra keyword arguments are forwarded to
    `PretrainedConfig.__init__`.
    """

    # Identifier compared against the `model_type` entry of loaded config dictionaries.
    model_type = "clip_text_model"

    def __init__(
        self,
        vocab_size=49408,
        hidden_size=512,
        intermediate_size=2048,
        projection_dim=512,
        num_hidden_layers=12,
        num_attention_heads=8,
        max_position_embeddings=77,
        hidden_act="quick_gelu",
        layer_norm_eps=1e-5,
        attention_dropout=0.0,
        initializer_range=0.02,
        initializer_factor=1.0,
        # This differs from `CLIPTokenizer`'s default and from openai/clip
        # See https://github.com/huggingface/transformers/pull/24773#issuecomment-1632287538
        pad_token_id=1,
        bos_token_id=49406,
        eos_token_id=49407,
        **kwargs,
    ):
        # The special-token ids and any remaining keyword arguments are handled by the base class.
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            **kwargs,
        )

        # Architecture hyper-parameters, stored in signature order.
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.projection_dim = projection_dim
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.max_position_embeddings = max_position_embeddings
        self.hidden_act = hidden_act
        self.layer_norm_eps = layer_norm_eps
        self.attention_dropout = attention_dropout
        self.initializer_range = initializer_range
        self.initializer_factor = initializer_factor

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
        """
        Build a text config from a pretrained name or path.

        If the loaded dictionary has `model_type == "clip"`, its `"text_config"` entry is used instead. A warning is
        logged when the resulting dictionary declares a `model_type` different from this class's.
        """
        cls._set_token_in_kwargs(kwargs)

        # `get_config_dict` returns the raw dictionary together with the kwargs to pass on.
        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)

        # A composite CLIP config nests the text settings under "text_config".
        loaded_from_composite = config_dict.get("model_type") == "clip"
        if loaded_from_composite:
            config_dict = config_dict["text_config"]

        type_mismatch = (
            "model_type" in config_dict
            and hasattr(cls, "model_type")
            and config_dict["model_type"] != cls.model_type
        )
        if type_mismatch:
            logger.warning(
                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
                f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
            )

        return cls.from_dict(config_dict, **kwargs)
# 定义 CLIPVisionConfig 类，继承自 PretrainedConfig，用于存储 CLIPVisionModel 的配置信息
class CLIPVisionConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`CLIPVisionModel`]. It is used to instantiate a
    CLIP vision encoder according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the vision encoder of the CLIP
    [openai/clip-vit-base-patch32](https://huggingface.co/openai/clip-vit-base-patch32) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        hidden_size (`int`, *optional*, defaults to 768):
            Dimensionality of the encoder layers and the pooler layer.
        intermediate_size (`int`, *optional*, defaults to 3072):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        projection_dim (`int`, *optional*, defaults to 512):
            Dimensionality of text and vision projection layers.
        num_hidden_layers (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 12):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_channels (`int`, *optional*, defaults to 3):
            The number of input channels.
        image_size (`int`, *optional*, defaults to 224):
            The size (resolution) of each image.
        patch_size (`int`, *optional*, defaults to 32):
            The size (resolution) of each patch.
        hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"` are supported.
        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the layer normalization layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        initializer_factor (`float`, *optional*, defaults to 1.0):
            A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
            testing).

    Example:

    ```
    >>> from transformers import CLIPVisionConfig, CLIPVisionModel

    >>> # Initializing a CLIPVisionConfig with openai/clip-vit-base-patch32 style configuration
    >>> configuration = CLIPVisionConfig()

    >>> # Initializing a CLIPVisionModel (with random weights) from the openai/clip-vit-base-patch32 style configuration
    >>> model = CLIPVisionModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    # Identifier compared against the `model_type` entry of loaded config dictionaries.
    model_type = "clip_vision_model"

    def __init__(
        self,
        hidden_size=768,
        intermediate_size=3072,
        projection_dim=512,
        num_hidden_layers=12,
        num_attention_heads=12,
        num_channels=3,
        image_size=224,
        patch_size=32,
        hidden_act="quick_gelu",
        layer_norm_eps=1e-5,
        attention_dropout=0.0,
        initializer_range=0.02,
        initializer_factor=1.0,
        **kwargs,
    ):
        # Extra keyword arguments are handled by the base configuration class.
        super().__init__(**kwargs)

        # Store every hyper-parameter unchanged under an attribute of the same name.
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.projection_dim = projection_dim
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_channels = num_channels
        self.patch_size = patch_size
        self.image_size = image_size
        self.initializer_range = initializer_range
        self.initializer_factor = initializer_factor
        self.attention_dropout = attention_dropout
        self.layer_norm_eps = layer_norm_eps
        self.hidden_act = hidden_act

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
        """
        Build a vision config from a pretrained name or path.

        If the loaded dictionary has `model_type == "clip"`, its `"vision_config"` entry is used instead. A warning
        is logged when the resulting dictionary declares a `model_type` different from this class's.
        """
        cls._set_token_in_kwargs(kwargs)

        # Raw configuration dictionary plus the kwargs to pass on.
        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)

        # get the vision config dict if we are loading from CLIPConfig
        if config_dict.get("model_type") == "clip":
            config_dict = config_dict["vision_config"]

        # Warn (but do not fail) when the stored model type differs from this class's.
        if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
            logger.warning(
                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
                f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
            )

        return cls.from_dict(config_dict, **kwargs)
class CLIPConfig(PretrainedConfig):
    r"""
    [`CLIPConfig`] is the configuration class to store the configuration of a [`CLIPModel`]. It is used to instantiate
    a CLIP model according to the specified arguments, defining the text model and vision model configs. Instantiating
    a configuration with the defaults will yield a similar configuration to that of the CLIP
    [openai/clip-vit-base-patch32](https://huggingface.co/openai/clip-vit-base-patch32) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        text_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize [`CLIPTextConfig`].
        vision_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize [`CLIPVisionConfig`].
        projection_dim (`int`, *optional*, defaults to 512):
            Dimensionality of text and vision projection layers.
        logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
            The initial value of the *logit_scale* parameter. Default is used as per the original CLIP implementation.
        kwargs (*optional*):
            Dictionary of keyword arguments.

    Example:

    ```
    >>> from transformers import CLIPConfig, CLIPModel

    >>> # Initializing a CLIPConfig with openai/clip-vit-base-patch32 style configuration
    >>> configuration = CLIPConfig()

    >>> # Initializing a CLIPModel (with random weights) from the openai/clip-vit-base-patch32 style configuration
    >>> model = CLIPModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config

    >>> # We can also initialize a CLIPConfig from a CLIPTextConfig and a CLIPVisionConfig
    >>> from transformers import CLIPTextConfig, CLIPVisionConfig

    >>> # Initializing a CLIPText and CLIPVision configuration
    >>> config_text = CLIPTextConfig()
    >>> config_vision = CLIPVisionConfig()

    >>> config = CLIPConfig.from_text_vision_configs(config_text, config_vision)
    ```"""

    model_type = "clip"

    def __init__(
        self,
        text_config=None,
        vision_config=None,
        projection_dim=512,
        logit_scale_init_value=2.6592,
        **kwargs,
    ):
        # Remaining keyword arguments are handled by the base configuration class.
        super().__init__(**kwargs)

        # NOTE(review): the sub-configs are stored exactly as passed (a dict or None). The class docstring says they
        # are "used to initialize" CLIPTextConfig / CLIPVisionConfig -- confirm whether a conversion is expected here.
        self.text_config = text_config
        self.vision_config = vision_config

        # Shared projection size and the starting value for the logit scale.
        self.projection_dim = projection_dim
        self.logit_scale_init_value = logit_scale_init_value

    @classmethod
    def from_text_vision_configs(cls, text_config: CLIPTextConfig, vision_config: CLIPVisionConfig, **kwargs):
        r"""
        Instantiate a [`CLIPConfig`] (or a derived class) from a CLIP text model configuration and a CLIP vision model
        configuration. Both are converted with `to_dict()` before being passed to the constructor.

        Returns:
            [`CLIPConfig`]: An instance of a configuration object
        """
        text_dict = text_config.to_dict()
        vision_dict = vision_config.to_dict()
        return cls(text_config=text_dict, vision_config=vision_dict, **kwargs)


class CLIPOnnxConfig(OnnxConfig):
    """ONNX configuration for CLIP: names the axes of the inputs and outputs and builds combined dummy inputs."""

    @property
    def inputs(self) -> Mapping[str, Mapping[int, str]]:
        """Ordered mapping from input name to `{axis index: axis name}`."""
        axes_by_input = OrderedDict()
        axes_by_input["input_ids"] = {0: "batch", 1: "sequence"}
        axes_by_input["pixel_values"] = {0: "batch", 1: "num_channels", 2: "height", 3: "width"}
        axes_by_input["attention_mask"] = {0: "batch", 1: "sequence"}
        return axes_by_input

    @property
    def outputs(self) -> Mapping[str, Mapping[int, str]]:
        """Ordered mapping from output name to `{axis index: axis name}`; only axis 0 ("batch") is named."""
        output_names = ("logits_per_image", "logits_per_text", "text_embeds", "image_embeds")
        # A fresh axis dictionary is created for every output name.
        return OrderedDict((output_name, {0: "batch"}) for output_name in output_names)

    @property
    def atol_for_validation(self) -> float:
        """Absolute tolerance value for validation."""
        return 1e-4

    def generate_dummy_inputs(
        self,
        processor: "ProcessorMixin",
        batch_size: int = -1,
        seq_length: int = -1,
        framework: Optional["TensorType"] = None,
    ) -> Mapping[str, Any]:
        """
        Build dummy inputs by calling the parent implementation twice, once with `processor.tokenizer` and once with
        `processor.image_processor`, and merging the two results into one dictionary. On a key collision the
        image-processor entry wins.
        """
        tokenizer_dummies = super().generate_dummy_inputs(
            processor.tokenizer, batch_size=batch_size, seq_length=seq_length, framework=framework
        )
        image_dummies = super().generate_dummy_inputs(
            processor.image_processor, batch_size=batch_size, framework=framework
        )
        merged_dummies = dict(tokenizer_dummies)
        merged_dummies.update(image_dummies)
        return merged_dummies

    @property
    def default_onnx_opset(self) -> int:
        """Default ONNX opset version."""
        return 14

`.\models\clip\convert_clip_original_pytorch_to_hf.py`

# 引入 argparse 模块，用于处理命令行参数
import argparse

# 引入 PyTorch 库
import torch
# 从 clip 模块中导入 load 函数
from clip import load
# 从 transformers 库中导入 CLIPConfig 和 CLIPModel 类
from transformers import CLIPConfig, CLIPModel


def copy_attn_layer(hf_attn_layer, pt_attn_layer):
    """
    Copy attention parameters from `pt_attn_layer` into `hf_attn_layer`.

    The source's fused `in_proj_weight` / `in_proj_bias` are each split into three chunks along dim 0 and written
    into the `.data` of the target's `q_proj`, `k_proj` and `v_proj` (in that order). The output projection's
    `weight` and `bias` attributes are rebound to the source's objects.
    """
    q_weight, k_weight, v_weight = pt_attn_layer.in_proj_weight.chunk(3, dim=0)
    q_bias, k_bias, v_bias = pt_attn_layer.in_proj_bias.chunk(3, dim=0)

    chunks_by_projection = (
        ("q_proj", q_weight, q_bias),
        ("k_proj", k_weight, k_bias),
        ("v_proj", v_weight, v_bias),
    )
    for projection_name, weight_chunk, bias_chunk in chunks_by_projection:
        projection = getattr(hf_attn_layer, projection_name)
        projection.weight.data = weight_chunk
        projection.bias.data = bias_chunk

    # The output projection is not fused, so its parameters are taken over directly.
    hf_attn_layer.out_proj.weight = pt_attn_layer.out_proj.weight
    hf_attn_layer.out_proj.bias = pt_attn_layer.out_proj.bias


def copy_mlp(hf_mlp, pt_mlp):
    """Copy the MLP's two linear layers: `pt_mlp.c_fc` into `hf_mlp.fc1`, then `pt_mlp.c_proj` into `hf_mlp.fc2`."""
    for hf_name, pt_name in (("fc1", "c_fc"), ("fc2", "c_proj")):
        copy_linear(getattr(hf_mlp, hf_name), getattr(pt_mlp, pt_name))


def copy_linear(hf_linear, pt_linear):
    """Rebind `hf_linear.weight` and `hf_linear.bias` to the corresponding attributes of `pt_linear`."""
    for attribute in ("weight", "bias"):
        setattr(hf_linear, attribute, getattr(pt_linear, attribute))


def copy_layer(hf_layer, pt_layer):
    """
    Copy one transformer block from `pt_layer` into `hf_layer`: the two layer norms first, then the MLP, then the
    attention layer.
    """
    # Layer norms expose `weight` / `bias`, so the linear copier is reused for them.
    for hf_norm_name, pt_norm_name in (("layer_norm1", "ln_1"), ("layer_norm2", "ln_2")):
        copy_linear(getattr(hf_layer, hf_norm_name), getattr(pt_layer, pt_norm_name))

    # Feed-forward block.
    copy_mlp(hf_layer.mlp, pt_layer.mlp)

    # Self-attention block.
    copy_attn_layer(hf_layer.self_attn, pt_layer.attn)


def copy_layers(hf_layers, pt_layers):
    """Copy layers pairwise, in order; iteration stops at the end of the shorter sequence (`zip` semantics)."""
    for layer_pair in zip(hf_layers, pt_layers):
        copy_layer(*layer_pair)


def copy_encoder(hf_encoder, pt_model):
    """
    Copy the text encoder from `pt_model` into `hf_encoder`: token and position embeddings, the final layer norm,
    and every transformer block.
    """
    # Embeddings: the token table is rebound as a whole; the position table only has its `.data` replaced.
    token_table = pt_model.token_embedding.weight
    hf_encoder.embeddings.token_embedding.weight = token_table
    position_table = pt_model.positional_embedding
    hf_encoder.embeddings.position_embedding.weight.data = position_table

    # Final layer norm.
    copy_linear(hf_encoder.final_layer_norm, pt_model.ln_final)

    # Hidden layers.
    copy_layers(hf_encoder.encoder.layers, pt_model.transformer.resblocks)


def copy_text_model_and_projection(hf_model, pt_model):
    """Copy the text projection (transposed) and then the whole text encoder from `pt_model` into `hf_model`."""
    # The source projection is transposed before being written into the target weight.
    transposed_projection = pt_model.text_projection.data.T
    hf_model.text_projection.weight.data = transposed_projection

    # Text encoder weights.
    copy_encoder(hf_model.text_model, pt_model)


def copy_vison_model_and_projection(hf_model, pt_model):
    """
    Copy the vision tower from `pt_model.visual` into `hf_model`.

    Copies, in order: the visual projection (transposed), the pre- and post- layer norms, the patch / class /
    position embeddings, and every transformer block.

    Args:
        hf_model: target model exposing `visual_projection` and `vision_model`.
        pt_model: source model exposing `visual`.
    """
    # Projection: the source matrix is transposed before being written into the target weight.
    hf_model.visual_projection.weight.data = pt_model.visual.proj.data.T

    # Layer norms applied before and after the encoder.
    # NOTE(review): `pre_layrnorm` is spelled this way on purpose here -- presumably it matches the target model's
    # attribute name; confirm before "fixing" the spelling.
    copy_linear(hf_model.vision_model.pre_layrnorm, pt_model.visual.ln_pre)
    copy_linear(hf_model.vision_model.post_layernorm, pt_model.visual.ln_post)

    # Embeddings: patch convolution weight, class embedding, and position table.
    hf_model.vision_model.embeddings.patch_embedding.weight.data = pt_model.visual.conv1.weight.data
    hf_model.vision_model.embeddings.class_embedding = pt_model.visual.class_embedding
    hf_model.vision_model.embeddings.position_embedding.weight.data = pt_model.visual.positional_embedding.data

    # Transformer blocks.
    copy_layers(hf_model.vision_model.encoder.layers, pt_model.visual.transformer.resblocks)
@torch.no_grad()
def convert_clip_checkpoint(checkpoint_path, pytorch_dump_folder_path, config_path=None):
    """
    Convert an original CLIP checkpoint into a `CLIPModel` and save it.

    Args:
        checkpoint_path: passed to `clip.load` to obtain the source model.
        pytorch_dump_folder_path: destination handed to `save_pretrained`.
        config_path: optional location of a config to load with `CLIPConfig.from_pretrained`; when `None`, a
            `CLIPConfig` with `projection_dim=512` and empty text/vision config dictionaries is used.

    Raises:
        AssertionError: if the converted model's logits differ from the source model's by more than `atol=1e-3`.
    """
    config = (
        CLIPConfig(projection_dim=512, text_config={}, vision_config={})
        if config_path is None
        else CLIPConfig.from_pretrained(config_path)
    )

    # Target model, in eval mode.
    hf_model = CLIPModel(config).eval()

    # `load` returns a pair; only the model (first element) is needed.
    pt_model, _ = load(checkpoint_path, device="cpu", jit=False)
    pt_model = pt_model.eval()

    # Transfer the weights: text tower, vision tower, then the logit scale.
    copy_text_model_and_projection(hf_model, pt_model)
    copy_vison_model_and_projection(hf_model, pt_model)
    hf_model.logit_scale = pt_model.logit_scale

    # Sanity check: run both models on the same inputs (token ids 0..76 and one random image).
    input_ids = torch.arange(0, 77).unsqueeze(0)
    pixel_values = torch.randn(1, 3, 224, 224)

    hf_outputs = hf_model(input_ids=input_ids, pixel_values=pixel_values, return_dict=True)
    pt_logits_per_image, pt_logits_per_text = pt_model(pixel_values, input_ids)

    assert torch.allclose(hf_outputs.logits_per_image, pt_logits_per_image, atol=1e-3)
    assert torch.allclose(hf_outputs.logits_per_text, pt_logits_per_text, atol=1e-3)

    # Persist the converted model.
    hf_model.save_pretrained(pytorch_dump_folder_path)


if __name__ == "__main__":
    # Command-line entry point: three optional string arguments, all defaulting to None.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--pytorch_dump_folder_path",
        default=None,
        type=str,
        help="Path to the output PyTorch model.",
    )
    parser.add_argument(
        "--checkpoint_path",
        default=None,
        type=str,
        help="Path to fairseq checkpoint",
    )
    parser.add_argument(
        "--config_path",
        default=None,
        type=str,
        help="Path to hf config.json of model to convert",
    )
    args = parser.parse_args()

    # Run the conversion with the parsed arguments.
    convert_clip_checkpoint(
        checkpoint_path=args.checkpoint_path,
        pytorch_dump_folder_path=args.pytorch_dump_folder_path,
        config_path=args.config_path,
    )

`.\models\clip\feature_extraction_clip.py`

# coding=utf-8
# 指定文件编码为UTF-8

# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
# 版权声明，保留所有权利

# Licensed under the Apache License, Version 2.0 (the "License");
# 根据 Apache 许可证 2.0 版本授权

# you may not use this file except in compliance with the License.
# 除非符合许可证规定，否则不得使用此文件。

# You may obtain a copy of the License at
# 您可以在以下网址获取许可证副本：

#     http://www.apache.org/licenses/LICENSE-2.0
#     http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# 除非适用法律要求或书面同意，否则依据“原样”分发软件，无论是明示的还是暗示的保证或条件。

# See the License for the specific language governing permissions and
# limitations under the License.
# 请查阅许可证以了解特定的语言授权和限制。

"""Feature extractor class for CLIP."""
# 用于 CLIP 的特征提取器类。

import warnings
# 导入警告模块

from ...utils import logging
# 导入日志记录工具模块

from .image_processing_clip import CLIPImageProcessor
# 导入 CLIP 图像处理模块中的 CLIPImageProcessor 类

logger = logging.get_logger(__name__)
# 获取当前模块的日志记录器

class CLIPFeatureExtractor(CLIPImageProcessor):
    # Deprecated subclass of CLIPImageProcessor: construction emits a FutureWarning, then defers to the parent.
    def __init__(self, *args, **kwargs) -> None:
        deprecation_message = (
            "The class CLIPFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please"
            " use CLIPImageProcessor instead."
        )
        warnings.warn(deprecation_message, FutureWarning)

        # All arguments are passed through unchanged to CLIPImageProcessor.
        super().__init__(*args, **kwargs)

`.\models\clip\image_processing_clip.py`

# 导入必要的模块和类
from typing import Dict, List, Optional, Union  # 导入类型提示所需的模块

import numpy as np  # 导入 NumPy 库，用于处理数组和数值计算

# 导入图像处理相关的工具和函数
from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
from ...image_transforms import (
    convert_to_rgb,  # 导入将图像转换为 RGB 格式的函数
    get_resize_output_image_size,  # 导入获取调整大小后图像尺寸的函数
    resize,  # 导入调整图像大小的函数
    to_channel_dimension_format,  # 导入将图像转换为指定通道格式的函数
)
from ...image_utils import (
    OPENAI_CLIP_MEAN,  # 导入 CLIP 模型期望的图像均值
    OPENAI_CLIP_STD,  # 导入 CLIP 模型期望的图像标准差
    ChannelDimension,  # 导入通道维度类型
    ImageInput,  # 导入图像输入类型
    PILImageResampling,  # 导入 PIL 图像的重采样方式
    infer_channel_dimension_format,  # 推断图像通道维度格式的函数
    is_scaled_image,  # 判断图像是否已经缩放的函数
    make_list_of_images,  # 创建图像列表的函数
    to_numpy_array,  # 将图像转换为 NumPy 数组的函数
    valid_images,  # 验证图像有效性的函数
    validate_kwargs,  # 验证关键字参数的函数
    validate_preprocess_arguments,  # 验证预处理参数的函数
)
from ...utils import TensorType, is_vision_available, logging  # 导入相关工具和日志记录模块

logger = logging.get_logger(__name__)  # 获取当前模块的日志记录器


# 检查视觉处理模块是否可用
if is_vision_available():
    import PIL  # 如果视觉处理可用，导入 PIL 库用于图像操作


class CLIPImageProcessor(BaseImageProcessor):
    r"""
    Constructs a CLIP image processor.

    Args:
        do_resize (`bool`, *optional*, defaults to `True`):
            Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by
            `do_resize` in the `preprocess` method.
        size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 224}`):
            Size of the image after resizing. The shortest edge of the image is resized to size["shortest_edge"], with
            the longest edge resized to keep the input aspect ratio. Can be overridden by `size` in the `preprocess`
            method.
        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
            Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method.
        do_center_crop (`bool`, *optional*, defaults to `True`):
            Whether to center crop the image to the specified `crop_size`. Can be overridden by `do_center_crop` in the
            `preprocess` method.
        crop_size (`Dict[str, int]` *optional*, defaults to 224):
            Size of the output image after applying `center_crop`. Can be overridden by `crop_size` in the `preprocess`
            method.
        do_rescale (`bool`, *optional*, defaults to `True`):
            Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in
            the `preprocess` method.
        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
            Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess`
            method.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether to normalize the image. Can be overridden by `do_normalize` in the `preprocess` method.
        image_mean (`float` or `List[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`):
            Mean to use if normalizing the image. This is a float or list of floats the length of the number of
            channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
        image_std (`float` or `List[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`):
            Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
            number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
        do_convert_rgb (`bool`, *optional*, defaults to `True`):
            Whether to convert the image to RGB.
    """

    # NOTE(review): the docstring above refers to a `preprocess` method that is not defined in this class body as
    # shown -- confirm it is provided elsewhere (e.g. inherited) before relying on it.

    model_input_names = ["pixel_values"]

    def __init__(
        self,
        do_resize: bool = True,
        size: Optional[Dict[str, int]] = None,
        resample: PILImageResampling = PILImageResampling.BICUBIC,
        do_center_crop: bool = True,
        crop_size: Optional[Dict[str, int]] = None,
        do_rescale: bool = True,
        rescale_factor: Union[int, float] = 1 / 255,
        do_normalize: bool = True,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: bool = True,
        **kwargs,
    ) -> None:
        # Extra keyword arguments are handled by the base image processor.
        super().__init__(**kwargs)

        # Default to resizing the shortest edge to 224; normalize through get_size_dict without forcing a square.
        size = size if size is not None else {"shortest_edge": 224}
        size = get_size_dict(size, default_to_square=False)

        # Default to a 224x224 crop; here a single value is expanded to a square.
        crop_size = crop_size if crop_size is not None else {"height": 224, "width": 224}
        crop_size = get_size_dict(crop_size, default_to_square=True, param_name="crop_size")

        self.do_resize = do_resize
        self.size = size
        self.resample = resample
        self.do_center_crop = do_center_crop
        self.crop_size = crop_size
        self.do_rescale = do_rescale
        self.rescale_factor = rescale_factor
        self.do_normalize = do_normalize
        # Fall back to the OpenAI CLIP statistics when no mean / std is supplied.
        self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
        self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
        self.do_convert_rgb = do_convert_rgb

        # Names of the recognised processing arguments.
        # NOTE(review): presumably consumed by `validate_kwargs` during preprocessing (not visible here) -- confirm.
        self._valid_processor_keys = [
            "images",
            "do_resize",
            "size",
            "resample",
            "do_center_crop",
            "crop_size",
            "do_rescale",
            "rescale_factor",
            "do_normalize",
            "image_mean",
            "image_std",
            "do_convert_rgb",
            "return_tensors",
            "data_format",
            "input_data_format",
        ]

        # Backward compatibility with KOSMOS-2: when `use_square_size` is passed, use a square size built from the
        # shortest edge. This lookup raises KeyError if `size` has no "shortest_edge" entry.
        if "use_square_size" in kwargs:
            self.size = {"height": size["shortest_edge"], "width": size["shortest_edge"]}

    def resize(
        self,
        image: np.ndarray,
        size: Dict[str, int],
        resample: PILImageResampling = PILImageResampling.BICUBIC,
        data_format: Optional[Union[str, ChannelDimension]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,
    ) -> np.ndarray:
        """
        Resize an image. The shortest edge of the image is resized to size["shortest_edge"], with the longest edge
        resized to keep the input aspect ratio.

        Args:
            image (`np.ndarray`):
                Image to resize.
            size (`Dict[str, int]`):
                Size of the output image. Can specify 'shortest_edge' or 'height' and 'width'.
            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
                Resampling filter to use when resizing the image.
            data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format of the image. If not provided, it will be the same as the input image.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format of the input image. If not provided, it will be inferred.

        Returns:
            `np.ndarray`: The resized image.

        Raises:
            ValueError: If `size` contains neither 'shortest_edge' nor both 'height' and 'width'.
        """
        # 'shortest_edge' takes precedence over 'height'/'width' and disables the square default.
        default_to_square = True
        if "shortest_edge" in size:
            size = size["shortest_edge"]
            default_to_square = False
        elif "height" in size and "width" in size:
            size = (size["height"], size["width"])
        else:
            raise ValueError("Size must contain either 'shortest_edge' or 'height' and 'width'.")

        # Compute the concrete output size for this image.
        output_size = get_resize_output_image_size(
            image,
            size=size,
            default_to_square=default_to_square,
            input_data_format=input_data_format,
        )

        # Inside this method, `resize` refers to the module-level function imported from image_transforms.
        return resize(
            image,
            size=output_size,
            resample=resample,
            data_format=data_format,
            input_data_format=input_data_format,
            **kwargs,
        )

    # Alias: the original text also spelled this method `resize_image`; both names resolve to the same function.
    resize_image = resize

`.\models\clip\modeling_clip.py`

# 设置文件编码为 UTF-8

# 版权声明，2021年由OpenAI团队和HuggingFace团队版权所有
#
# 根据Apache许可证2.0版（"许可证"）授权；
# 除非符合许可证要求，否则不得使用此文件。
# 您可以在以下网址获取许可证副本：
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# 除非适用法律要求或书面同意，否则本软件按"原样"提供，不提供任何明示或暗示的担保或条件。
# 请查阅许可证了解具体法律权限和限制。
""" PyTorch CLIP模型。"""

# 导入必要的模块和库
from dataclasses import dataclass
from typing import Any, Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

# 导入内部模块和函数
from ...activations import ACT2FN
from ...modeling_attn_mask_utils import _create_4d_causal_attention_mask, _prepare_4d_attention_mask
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput
from ...modeling_utils import PreTrainedModel
from ...utils import (
    ModelOutput,
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
from .configuration_clip import CLIPConfig, CLIPTextConfig, CLIPVisionConfig

# Module-level logger obtained through the library's logging helper.
logger = logging.get_logger(__name__)

# General constants used when building docstrings.
_CONFIG_FOR_DOC = "CLIPConfig"
_CHECKPOINT_FOR_DOC = "openai/clip-vit-base-patch32"

# Constants used by the image-classification docstrings.
_IMAGE_CLASS_CHECKPOINT = "openai/clip-vit-base-patch32"
_IMAGE_CLASS_EXPECTED_OUTPUT = "LABEL_0"

# List of pretrained CLIP checkpoints.
CLIP_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "openai/clip-vit-base-patch32",
    # See all CLIP models at https://huggingface.co/models?filter=clip
]

# Contrastive loss function, adapted from
# https://sachinruk.github.io/blog/2021-03-07-clip.html
def contrastive_loss(logits: torch.Tensor) -> torch.Tensor:
    """Return the cross-entropy of `logits` against the targets `0 .. len(logits) - 1`.

    Row `i` of `logits` is scored against class index `i`. The target indices are
    created on the same device as `logits`.
    """
    diagonal_targets = torch.arange(len(logits), device=logits.device)
    return nn.functional.cross_entropy(logits, diagonal_targets)

def clip_loss(similarity: torch.Tensor) -> torch.Tensor:
    """Average the contrastive loss of `similarity` and of its transpose.

    The first term scores the matrix as given (caption direction), the second
    scores `similarity.t()` (image direction); the result is their mean.
    """
    caption_term, image_term = (contrastive_loss(view) for view in (similarity, similarity.t()))
    return (caption_term + image_term) / 2.0

@dataclass
class CLIPVisionModelOutput(ModelOutput):
    """
    Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.

    Args:
        image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
            The image embeddings obtained by applying the projection layer to the pooler_output.
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    # Projected image embeddings; stays None unless a projection layer produced them.
    image_embeds: Optional[torch.FloatTensor] = None
    # Hidden states of the last layer; declared with a None default like every other field.
    last_hidden_state: torch.FloatTensor = None
    # Per-layer hidden states (plus the embedding output, if any); only filled when requested.
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    # Per-layer attention weights; only filled when requested.
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
# Dataclass describing the output of the CLIP text model; it subclasses ModelOutput.
@dataclass
class CLIPTextModelOutput(ModelOutput):
    """
    Base class for text model's outputs that also contains a pooling of the last hidden states.

    Args:
        text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
            The text embeddings obtained by applying the projection layer to the pooler_output.
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    # Optional field: projected text embeddings, documented shape (batch_size, output_dim).
    text_embeds: Optional[torch.FloatTensor] = None
    # Hidden states of the last layer, documented shape (batch_size, sequence_length, hidden_size);
    # declared with a None default like the other fields.
    last_hidden_state: torch.FloatTensor = None
    # Optional field: tuple of per-layer hidden states, each documented as (batch_size, sequence_length, hidden_size).
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    # Optional field: tuple of per-layer attention weights, each documented as
    # (batch_size, num_heads, sequence_length, sequence_length).
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None


# Dataclass describing the output of the full CLIP model; it subclasses ModelOutput.
@dataclass
class CLIPOutput(ModelOutput):
    """
    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
            Contrastive loss for image-text similarity.
        logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
            The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
            similarity scores.
        logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
            The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
            similarity scores.
        text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
            The text embeddings obtained by applying the projection layer to the pooled output of [`CLIPTextModel`].
        image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
            The image embeddings obtained by applying the projection layer to the pooled output of [`CLIPVisionModel`].
        text_model_output (`BaseModelOutputWithPooling`):
            The output of the [`CLIPTextModel`].
        vision_model_output (`BaseModelOutputWithPooling`):
            The output of the [`CLIPVisionModel`].
    """

    # Contrastive loss for image-text similarity; only set when a loss was requested.
    loss: Optional[torch.FloatTensor] = None
    # Image-to-text similarity scores, documented shape (image_batch_size, text_batch_size).
    logits_per_image: torch.FloatTensor = None
    # Text-to-image similarity scores, documented shape (text_batch_size, image_batch_size).
    logits_per_text: torch.FloatTensor = None
    # Projected text embeddings.
    text_embeds: torch.FloatTensor = None
    # Projected image embeddings.
    image_embeds: torch.FloatTensor = None
    # Nested output object of the text model.
    text_model_output: BaseModelOutputWithPooling = None
    # Nested output object of the vision model.
    vision_model_output: BaseModelOutputWithPooling = None

    def to_tuple(self) -> Tuple[Any]:
        """Return the stored values as a tuple, in key order.

        The two nested model outputs (`text_model_output`, `vision_model_output`) are
        themselves converted with their own `to_tuple()`; every other key contributes
        `self[key]` unchanged.
        """
        return tuple(
            self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
            for k in self.keys()
        )
class CLIPVisionEmbeddings(nn.Module):
    """Turn a batch of images into a token sequence: one class token followed by one token per patch.

    Patches are extracted with a strided `nn.Conv2d` (kernel size and stride both equal to
    `config.patch_size`), and a learned position embedding is added to every token.
    """

    def __init__(self, config: CLIPVisionConfig):
        """Create the class token, the patch projection and the position table.

        Args:
            config: read for `hidden_size`, `image_size`, `patch_size` and `num_channels`.
        """
        super().__init__()
        self.config = config
        width = self.embed_dim = config.hidden_size
        image_side = self.image_size = config.image_size
        patch_side = self.patch_size = config.patch_size

        # Learnable class token, drawn from a standard normal.
        self.class_embedding = nn.Parameter(torch.randn(width))

        # Non-overlapping patch projection (stride == kernel size), without bias.
        self.patch_embedding = nn.Conv2d(
            in_channels=config.num_channels,
            out_channels=width,
            kernel_size=patch_side,
            stride=patch_side,
            bias=False,
        )

        # One position per patch, plus one for the class token.
        self.num_patches = (image_side // patch_side) ** 2
        self.num_positions = self.num_patches + 1
        self.position_embedding = nn.Embedding(self.num_positions, width)

        # Index row of shape (1, num_positions); registered as non-persistent, so it is not in the state dict.
        index_row = torch.arange(self.num_positions).expand((1, -1))
        self.register_buffer("position_ids", index_row, persistent=False)

    def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
        """Embed `pixel_values` and return the token sequence with position embeddings added.

        The input is first cast to the dtype of the patch projection weights. The result has one
        leading class token per sample followed by the flattened patch tokens.
        """
        conv_dtype = self.patch_embedding.weight.dtype
        patch_tokens = self.patch_embedding(pixel_values.to(dtype=conv_dtype))  # shape = [*, width, grid, grid]
        patch_tokens = patch_tokens.flatten(2).transpose(1, 2)

        # Broadcast the single class token across the batch dimension.
        class_tokens = self.class_embedding.expand(pixel_values.shape[0], 1, -1)
        tokens = torch.cat([class_tokens, patch_tokens], dim=1)
        return tokens + self.position_embedding(self.position_ids)


class CLIPTextEmbeddings(nn.Module):
    """Sum of a token embedding and a learned position embedding for text inputs."""

    def __init__(self, config: CLIPTextConfig):
        """Create the token and position tables.

        Args:
            config: read for `hidden_size`, `vocab_size` and `max_position_embeddings`.
        """
        super().__init__()
        width = config.hidden_size

        self.token_embedding = nn.Embedding(config.vocab_size, width)
        self.position_embedding = nn.Embedding(config.max_position_embeddings, width)

        # Index row of shape (1, max_position_embeddings); non-persistent, so not part of the state dict.
        all_positions = torch.arange(config.max_position_embeddings).expand((1, -1))
        self.register_buffer("position_ids", all_positions, persistent=False)

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
    ) -> torch.Tensor:
        """Return token embeddings plus position embeddings.

        Args:
            input_ids: token indices; looked up in the token table when `inputs_embeds` is not given.
            position_ids: position indices; defaults to the first `seq_length` entries of the
                registered `position_ids` buffer.
            inputs_embeds: precomputed token embeddings, used instead of looking up `input_ids`.
        """
        # Sequence length comes from the ids when present, otherwise from the embeddings.
        if input_ids is not None:
            seq_length = input_ids.shape[-1]
        else:
            seq_length = inputs_embeds.shape[-2]

        if position_ids is None:
            position_ids = self.position_ids[:, :seq_length]

        if inputs_embeds is None:
            inputs_embeds = self.token_embedding(input_ids)

        return inputs_embeds + self.position_embedding(position_ids)


class CLIPAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""
    # CLIP 注意力模块，继承自 nn.Module 类
    # 初始化函数，用于初始化一个注意力机制模型实例
    def __init__(self, config):
        """Store the head geometry taken from `config` and create the four projection layers.

        Args:
            config: read for `hidden_size`, `num_attention_heads` and `attention_dropout`.

        Raises:
            ValueError: if `hidden_size` is not an exact multiple of `num_attention_heads`.
        """
        super().__init__()
        self.config = config

        embed_dim = self.embed_dim = config.hidden_size
        num_heads = self.num_heads = config.num_attention_heads
        head_dim = self.head_dim = embed_dim // num_heads
        # Floor division followed by multiplication only round-trips when the split is exact.
        if head_dim * num_heads != embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )

        # Scaling factor 1 / sqrt(head_dim) and the dropout rate for attention.
        self.scale = head_dim**-0.5
        self.dropout = config.attention_dropout

        # Key, value, query and output projections, all square; created in this fixed order.
        for proj_name in ("k_proj", "v_proj", "q_proj", "out_proj"):
            setattr(self, proj_name, nn.Linear(embed_dim, embed_dim))

    # 辅助函数，用于调整张量形状以适应多头注意力的计算
    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        # 将张量重新形状为 [bsz, seq_len, num_heads, head_dim]
        reshaped_tensor = tensor.view(bsz, seq_len, self.num_heads, self.head_dim)
        # 交换维度，变成 [bsz, num_heads, seq_len, head_dim]
        transposed_tensor = reshaped_tensor.transpose(1, 2).contiguous()
        return transposed_tensor

    # 前向传播函数，实现注意力机制的计算过程
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        causal_attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
class CLIPMLP(nn.Module):
    """Two-layer feed-forward block: linear, activation, linear."""

    def __init__(self, config):
        """Create the activation and the two linear layers.

        Args:
            config: read for `hidden_act`, `hidden_size` and `intermediate_size`.
        """
        super().__init__()
        self.config = config
        # The activation is looked up by name in the ACT2FN table.
        self.activation_fn = ACT2FN[config.hidden_act]
        # hidden_size -> intermediate_size -> hidden_size
        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Apply `fc1`, the activation, then `fc2` to `hidden_states`."""
        expanded = self.fc1(hidden_states)
        activated = self.activation_fn(expanded)
        return self.fc2(activated)


class CLIPEncoderLayer(nn.Module):
    """One encoder block: self-attention and an MLP, each preceded by a layer norm and wrapped in a residual add."""

    def __init__(self, config: CLIPConfig):
        """Create the attention module, the MLP and their two layer norms.

        Args:
            config: read for `hidden_size` and `layer_norm_eps`; also forwarded to
                `CLIPAttention` and `CLIPMLP`.
        """
        super().__init__()
        width = self.embed_dim = config.hidden_size
        self.self_attn = CLIPAttention(config)
        self.layer_norm1 = nn.LayerNorm(width, eps=config.layer_norm_eps)
        self.mlp = CLIPMLP(config)
        self.layer_norm2 = nn.LayerNorm(width, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        causal_attention_mask: torch.Tensor,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.FloatTensor]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            causal_attention_mask (`torch.FloatTensor`): forwarded unchanged to the attention module.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.

        Returns:
            A 1-tuple with the layer output, or a 2-tuple that also carries the attention weights when
            `output_attentions` is truthy.
        """
        # Attention sub-layer: normalise, attend, then add the untouched input back.
        attn_output, attn_weights = self.self_attn(
            hidden_states=self.layer_norm1(hidden_states),
            attention_mask=attention_mask,
            causal_attention_mask=causal_attention_mask,
            output_attentions=output_attentions,
        )
        after_attention = hidden_states + attn_output

        # Feed-forward sub-layer, same normalise / transform / add pattern.
        mlp_output = self.mlp(self.layer_norm2(after_attention))
        layer_output = after_attention + mlp_output

        if output_attentions:
            return (layer_output, attn_weights)
        return (layer_output,)
        

class CLIPPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    # Configuration class associated with this model family.
    config_class = CLIPConfig
    # Attribute-name prefix used for the base model.
    base_model_prefix = "clip"
    # Declares that gradient checkpointing is supported.
    supports_gradient_checkpointing = True

    def _init_weights(self, module):
        """Initialize the weights of `module` with a scheme chosen by its type.

        All standard deviations are scaled by `self.config.initializer_factor`. After the
        type-specific branch, every `nn.LayerNorm` is reset to weight 1 / bias 0 and every
        `nn.Linear` bias (when present) is zeroed.
        """
        scale = self.config.initializer_factor

        if isinstance(module, CLIPTextEmbeddings):
            # Token table first, then position table, both with std = scale * 0.02.
            for table in (module.token_embedding, module.position_embedding):
                table.weight.data.normal_(mean=0.0, std=scale * 0.02)

        elif isinstance(module, CLIPVisionEmbeddings):
            nn.init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * scale)
            for layer in (module.patch_embedding, module.position_embedding):
                nn.init.normal_(layer.weight, std=module.config.initializer_range * scale)

        elif isinstance(module, CLIPAttention):
            # Input projections shrink with both width and depth; the output projection only with width.
            width_term = module.embed_dim**-0.5
            depth_term = (2 * module.config.num_hidden_layers) ** -0.5
            in_proj_std = width_term * depth_term * scale
            out_proj_std = width_term * scale
            for projection in (module.q_proj, module.k_proj, module.v_proj):
                nn.init.normal_(projection.weight, std=in_proj_std)
            nn.init.normal_(module.out_proj.weight, std=out_proj_std)

        elif isinstance(module, CLIPMLP):
            mlp_config = module.config
            in_proj_std = (mlp_config.hidden_size**-0.5) * ((2 * mlp_config.num_hidden_layers) ** -0.5) * scale
            fc_std = (2 * mlp_config.hidden_size) ** -0.5 * scale
            nn.init.normal_(module.fc1.weight, std=fc_std)
            nn.init.normal_(module.fc2.weight, std=in_proj_std)

        elif isinstance(module, CLIPModel):
            # Each projection is scaled by the width of the tower it projects from.
            nn.init.normal_(module.text_projection.weight, std=module.text_embed_dim**-0.5 * scale)
            nn.init.normal_(module.visual_projection.weight, std=module.vision_embed_dim**-0.5 * scale)

        elif isinstance(module, CLIPVisionModelWithProjection):
            nn.init.normal_(module.visual_projection.weight, std=self.config.hidden_size**-0.5 * scale)

        elif isinstance(module, CLIPTextModelWithProjection):
            nn.init.normal_(module.text_projection.weight, std=self.config.hidden_size**-0.5 * scale)

        # These two checks are independent of the chain above and always run.
        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

        if isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.data.zero_()
# CLIP_START_DOCSTRING: raw-string documentation introducing the model and its `config` parameter.
CLIP_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`CLIPConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

# CLIP_TEXT_INPUTS_DOCSTRING: raw-string documentation of the text input arguments.
CLIP_TEXT_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""

# CLIP_VISION_INPUTS_DOCSTRING: raw-string documentation of the vision input arguments.
# Everything between the triple quotes is runtime text, so no `#` annotation lines may be placed inside it.
CLIP_VISION_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
            [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
    # CLIP_INPUTS_DOCSTRING: raw-string documentation of the combined text and image input arguments.
    # NOTE(review): this assignment is indented one level deeper than the other module-level docstring
    # constants above it — looks like an extraction artifact; confirm the intended column.
    CLIP_INPUTS_DOCSTRING = r"""
        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
                it.
    
                Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
                [`PreTrainedTokenizer.__call__`] for details.
    
                [What are input IDs?](../glossary#input-ids)
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
    
                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.
    
                [What are attention masks?](../glossary#attention-mask)
            position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
                config.max_position_embeddings - 1]`.
    
                [What are position IDs?](../glossary#position-ids)
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
                Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
                [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
            return_loss (`bool`, *optional*):
                Whether or not to return the contrastive loss.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
                tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
                more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
    """
    
    
    class CLIPEncoder(nn.Module):
        """
        Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
        [`CLIPEncoderLayer`].

        Args:
            config: CLIPConfig
        """

        def __init__(self, config: CLIPConfig):
            super().__init__()
            self.config = config
            # One CLIPEncoderLayer per configured hidden layer.
            self.layers = nn.ModuleList([CLIPEncoderLayer(config) for _ in range(config.num_hidden_layers)])
            # Off by default; only consulted in `forward` while the module is in training mode.
            self.gradient_checkpointing = False

        def forward(
            self,
            inputs_embeds,
            attention_mask: Optional[torch.Tensor] = None,
            causal_attention_mask: Optional[torch.Tensor] = None,
            output_attentions: Optional[bool] = None,
            output_hidden_states: Optional[bool] = None,
            return_dict: Optional[bool] = None,
        ):
            r"""
            Run `inputs_embeds` through every encoder layer, in order.

            Args:
                inputs_embeds: embedded inputs fed to the first layer.
                attention_mask: forwarded unchanged to every layer (masks padding positions).
                causal_attention_mask: forwarded unchanged to every layer (masks future positions).
                output_attentions: when truthy, collect each layer's attention weights. Defaults to
                    `config.output_attentions` when `None`.
                output_hidden_states: when truthy, collect the input of every layer plus the final output.
                    Defaults to `config.output_hidden_states` when `None`.
                return_dict: when truthy, return a `BaseModelOutput`; otherwise a plain tuple. Defaults to
                    `config.use_return_dict` when `None`.

            Returns:
                `BaseModelOutput` with `last_hidden_state`, `hidden_states` and `attentions`, or a tuple of the
                same values in that order with the entries that are `None` left out.
            """
            if output_attentions is None:
                output_attentions = self.config.output_attentions
            if output_hidden_states is None:
                output_hidden_states = self.config.output_hidden_states
            if return_dict is None:
                return_dict = self.config.use_return_dict

            # `None` means "not requested"; an empty tuple means "requested, nothing collected yet".
            collected_states = () if output_hidden_states else None
            collected_attentions = () if output_attentions else None

            hidden_states = inputs_embeds
            for encoder_layer in self.layers:
                if output_hidden_states:
                    collected_states = collected_states + (hidden_states,)

                if self.gradient_checkpointing and self.training:
                    # NOTE(review): `_gradient_checkpointing_func` is presumably installed by the
                    # PreTrainedModel checkpointing machinery — confirm for the transformers version in
                    # use. Plain torch checkpointing is the fallback when it is absent.
                    checkpoint_fn = getattr(self, "_gradient_checkpointing_func", torch.utils.checkpoint.checkpoint)
                    layer_outputs = checkpoint_fn(
                        encoder_layer.__call__,
                        hidden_states,
                        attention_mask,
                        causal_attention_mask,
                        output_attentions,
                    )
                else:
                    layer_outputs = encoder_layer(
                        hidden_states,
                        attention_mask,
                        causal_attention_mask,
                        output_attentions=output_attentions,
                    )

                # A layer returns (hidden_states,) or (hidden_states, attn_weights).
                hidden_states = layer_outputs[0]
                if output_attentions:
                    collected_attentions = collected_attentions + (layer_outputs[1],)

            if output_hidden_states:
                collected_states = collected_states + (hidden_states,)

            if not return_dict:
                return tuple(v for v in (hidden_states, collected_states, collected_attentions) if v is not None)
            return BaseModelOutput(
                last_hidden_state=hidden_states,
                hidden_states=collected_states,
                attentions=collected_attentions,
            )
    
    class CLIPTextTransformer(nn.Module):
    # 初始化方法，接受一个配置对象 config: CLIPTextConfig
    def __init__(self, config: CLIPTextConfig):
        """Assemble the text tower: embeddings, encoder stack and final layer norm.

        Args:
            config: read for `hidden_size`, `layer_norm_eps` and `eos_token_id`; also forwarded to
                `CLIPTextEmbeddings` and `CLIPEncoder`.
        """
        super().__init__()
        self.config = config

        self.embeddings = CLIPTextEmbeddings(config)
        self.encoder = CLIPEncoder(config)
        # Layer norm over the hidden size, using the configured epsilon.
        self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

        # For `pooled_output` computation
        self.eos_token_id = config.eos_token_id

    # 前向传播方法，使用装饰器将其文档字符串添加到模型的前向传播方法中
    # 使用 CLIP_TEXT_INPUTS_DOCSTRING 描述输入参数
    # 使用 replace_return_docstrings 装饰器，指定输出类型为 BaseModelOutputWithPooling，并使用 CLIPTextConfig 类描述配置
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
# The decorator attaches the class documentation: a bare CLIP text model without any head or projection on top.
@add_start_docstrings(
    """The text model from CLIP without any head or projection on top.""",
    CLIP_START_DOCSTRING,
)
class CLIPTextModel(CLIPPreTrainedModel):
    # This model is configured by the text-only configuration class.
    config_class = CLIPTextConfig

    # Names of module classes that are listed as not to be split.
    _no_split_modules = ["CLIPTextEmbeddings", "CLIPEncoderLayer"]

    def __init__(self, config: CLIPTextConfig):
        super().__init__(config)
        # The actual network lives in the wrapped text transformer.
        self.text_model = CLIPTextTransformer(config)
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        """Return the token embedding module of the wrapped text model."""
        text_embeddings = self.text_model.embeddings
        return text_embeddings.token_embedding

    def set_input_embeddings(self, value):
        """Replace the token embedding module of the wrapped text model with `value`."""
        text_embeddings = self.text_model.embeddings
        text_embeddings.token_embedding = value

    # The two decorators build the final documentation of `forward` from its docstring below.
    @add_start_docstrings_to_model_forward(CLIP_TEXT_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPTextConfig)
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        """
        Returns:

        Examples:

        ```
        >>> from transformers import AutoTokenizer, CLIPTextModel

        >>> model = CLIPTextModel.from_pretrained("openai/clip-vit-base-patch32")
        >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
        ```"""
        # Fall back to the configured default when the caller did not choose a return format.
        if return_dict is None:
            return_dict = self.config.use_return_dict

        # Everything else is delegated to the wrapped text transformer.
        return self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )


class CLIPVisionTransformer(nn.Module):
    """Vision backbone: embeddings, a pre layer norm, the encoder, and a post layer norm on the pooled token."""

    def __init__(self, config: CLIPVisionConfig):
        super().__init__()
        self.config = config
        hidden_dim = config.hidden_size

        # Sub-modules, created in the same order in which `forward` applies them.
        self.embeddings = CLIPVisionEmbeddings(config)
        # NOTE: the attribute name `pre_layrnorm` (sic) is kept as-is; `forward` reads it under this name.
        self.pre_layrnorm = nn.LayerNorm(hidden_dim, eps=config.layer_norm_eps)
        self.encoder = CLIPEncoder(config)
        self.post_layernorm = nn.LayerNorm(hidden_dim, eps=config.layer_norm_eps)

    @add_start_docstrings_to_model_forward(CLIP_VISION_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPVisionConfig)
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        r"""
        Embed `pixel_values`, run the encoder and pool the hidden state found at position 0.

        Returns:

        """
        # Any flag left as None falls back to the value stored in the config.
        if output_attentions is None:
            output_attentions = self.config.output_attentions
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states
        if return_dict is None:
            return_dict = self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        # Embeddings followed by the pre layer norm form the encoder input.
        embedded = self.embeddings(pixel_values)
        embedded = self.pre_layrnorm(embedded)

        encoder_outputs = self.encoder(
            inputs_embeds=embedded,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Element 0 of the encoder output is the last hidden state; the pooled output is its
        # slice at index 0 along dim 1, passed through the post layer norm.
        last_hidden_state = encoder_outputs[0]
        pooled_output = self.post_layernorm(last_hidden_state[:, 0, :])

        if return_dict:
            return BaseModelOutputWithPooling(
                last_hidden_state=last_hidden_state,
                pooler_output=pooled_output,
                hidden_states=encoder_outputs.hidden_states,
                attentions=encoder_outputs.attentions,
            )

        # Tuple form: the two tensors above followed by whatever else the encoder returned.
        return (last_hidden_state, pooled_output) + encoder_outputs[1:]
@add_start_docstrings(
    """The vision model from CLIP without any head or projection on top.""",
    CLIP_START_DOCSTRING,
)
class CLIPVisionModel(CLIPPreTrainedModel):
    # Configuration class associated with this model.
    config_class = CLIPVisionConfig
    # Name of the primary input argument of `forward`.
    main_input_name = "pixel_values"
    # Module class names listed as not to be split (presumably used for device placement — TODO confirm).
    _no_split_modules = ["CLIPEncoderLayer"]

    def __init__(self, config: CLIPVisionConfig):
        """Build the wrapped `CLIPVisionTransformer` from `config` and run `post_init()`."""
        super().__init__(config)
        self.vision_model = CLIPVisionTransformer(config)
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        """Return the `patch_embedding` module held by the vision model's embeddings."""
        vision_embeddings = self.vision_model.embeddings
        return vision_embeddings.patch_embedding

    @add_start_docstrings_to_model_forward(CLIP_VISION_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPVisionConfig)
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        """
        Returns:

        Examples:

        ```
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, CLIPVisionModel

        >>> model = CLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32")
        >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled CLS states
        ```"""
        # An explicit `return_dict` wins; otherwise defer to the model config.
        if return_dict is None:
            return_dict = self.config.use_return_dict

        # Everything else is delegated unchanged to the wrapped vision transformer.
        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        return vision_outputs


# 定义 CLIPModel 类，继承自 CLIPPreTrainedModel，带有 CLIP_START_DOCSTRING 的说明文档
@add_start_docstrings(CLIP_START_DOCSTRING)
class CLIPModel(CLIPPreTrainedModel):
    # Configuration class associated with this model.
    config_class = CLIPConfig
    # Module class names listed as not to be split (presumably used for device placement — TODO confirm).
    _no_split_modules = ["CLIPTextEmbeddings", "CLIPEncoderLayer"]

    def __init__(self, config: CLIPConfig):
        """
        Build the text and vision transformers, their projection layers and the `logit_scale` parameter.

        Raises:
            ValueError: if `config.text_config` is not a `CLIPTextConfig` or `config.vision_config` is not a
                `CLIPVisionConfig`.
        """
        super().__init__(config)

        # Both sub-configs are type-checked before any sub-module is built from them.
        text_config = config.text_config
        if not isinstance(text_config, CLIPTextConfig):
            raise ValueError(
                "config.text_config is expected to be of type CLIPTextConfig but is of type"
                f" {type(config.text_config)}."
            )

        vision_config = config.vision_config
        if not isinstance(vision_config, CLIPVisionConfig):
            raise ValueError(
                "config.vision_config is expected to be of type CLIPVisionConfig but is of type"
                f" {type(config.vision_config)}."
            )

        # Dimensions of the shared projection space and of each tower's hidden states.
        self.projection_dim = config.projection_dim
        self.text_embed_dim = text_config.hidden_size
        self.vision_embed_dim = vision_config.hidden_size

        # The two towers.
        self.text_model = CLIPTextTransformer(text_config)
        self.vision_model = CLIPVisionTransformer(vision_config)

        # Bias-free linear maps from each tower's hidden size to `projection_dim`.
        self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
        self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)

        # Learnable scalar initialised from the config value.
        self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(CLIP_TEXT_INPUTS_DOCSTRING)
    def get_text_features(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> torch.FloatTensor:
        r"""
        Returns:
            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The text embeddings obtained by
            applying the projection layer to the pooled output of [`CLIPTextModel`].

        Examples:

        ```
        >>> from transformers import AutoTokenizer, CLIPModel

        >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
        >>> text_features = model.get_text_features(**inputs)
        ```"""
        # Use CLIP model's config for some fields (if specified) instead of those of vision & text components.
        if output_attentions is None:
            output_attentions = self.config.output_attentions
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states
        if return_dict is None:
            return_dict = self.config.use_return_dict

        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Element 1 of the text model output is the pooled output; project it.
        return self.text_projection(text_outputs[1])

    @add_start_docstrings_to_model_forward(CLIP_VISION_INPUTS_DOCSTRING)
    def get_image_features(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> torch.FloatTensor:
        r"""
        Returns:
            image_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The image embeddings obtained by
            applying the projection layer to the pooled output of [`CLIPVisionModel`].

        Examples:

        ```
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, CLIPModel

        >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> image_features = model.get_image_features(**inputs)
        ```"""
        # Use CLIP model's config for some fields (if specified) instead of those of vision & text components.
        if output_attentions is None:
            output_attentions = self.config.output_attentions
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states
        if return_dict is None:
            return_dict = self.config.use_return_dict

        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Element 1 of the vision model output is the pooled output; project it.
        return self.visual_projection(vision_outputs[1])
"""
CLIP Vision Model with a projection layer on top (a linear layer on top of the pooled output).
"""
@add_start_docstrings(
    """
    CLIP Text Model with a projection layer on top (a linear layer on top of the pooled output).
    """,
    CLIP_START_DOCSTRING,
)
class CLIPTextModelWithProjection(CLIPPreTrainedModel):
    # Configuration class associated with this model.
    config_class = CLIPTextConfig

    # Module class names listed as not to be split (presumably used for device placement — TODO confirm).
    _no_split_modules = ["CLIPTextEmbeddings", "CLIPEncoderLayer"]

    def __init__(self, config: CLIPTextConfig):
        """Build the text transformer and the bias-free projection from `hidden_size` to `projection_dim`."""
        super().__init__(config)

        self.text_model = CLIPTextTransformer(config)

        self.text_projection = nn.Linear(config.hidden_size, config.projection_dim, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        """Return the `token_embedding` module held by the text model's embeddings."""
        return self.text_model.embeddings.token_embedding

    def set_input_embeddings(self, value):
        """Replace the `token_embedding` module of the text model's embeddings with `value`."""
        self.text_model.embeddings.token_embedding = value

    @add_start_docstrings_to_model_forward(CLIP_TEXT_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=CLIPTextModelOutput, config_class=CLIPTextConfig)
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, CLIPTextModelOutput]:
        r"""
        Returns:

        Examples:

        ```
        >>> from transformers import AutoTokenizer, CLIPTextModelWithProjection

        >>> model = CLIPTextModelWithProjection.from_pretrained("openai/clip-vit-base-patch32")
        >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> text_embeds = outputs.text_embeds
        ```"""
        # An explicit `return_dict` wins; otherwise defer to the model config.
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Element 1 of the text model output is the pooled output; project it.
        pooled_output = text_outputs[1]
        text_embeds = self.text_projection(pooled_output)

        if not return_dict:
            # Tuple form: projected embeddings first, then the last hidden state and any extras,
            # with None entries dropped.
            outputs = (text_embeds, text_outputs[0]) + text_outputs[2:]
            return tuple(output for output in outputs if output is not None)

        return CLIPTextModelOutput(
            text_embeds=text_embeds,
            last_hidden_state=text_outputs.last_hidden_state,
            hidden_states=text_outputs.hidden_states,
            attentions=text_outputs.attentions,
        )
# 定义一个继承自 CLIPPreTrainedModel 的类，用于视觉模型和投影
@add_start_docstrings(
    """
    CLIP Vision Model with a projection layer on top (a linear layer on top of the pooled output).
    """,
    CLIP_START_DOCSTRING,
)
class CLIPVisionModelWithProjection(CLIPPreTrainedModel):
    # Configuration class associated with this model.
    config_class = CLIPVisionConfig
    # Name of the primary input argument of `forward`.
    main_input_name = "pixel_values"

    def __init__(self, config: CLIPVisionConfig):
        """Build the vision transformer and the bias-free projection from `hidden_size` to `projection_dim`."""
        super().__init__(config)

        self.vision_model = CLIPVisionTransformer(config)

        self.visual_projection = nn.Linear(config.hidden_size, config.projection_dim, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        """Return the `patch_embedding` module held by the vision model's embeddings."""
        return self.vision_model.embeddings.patch_embedding

    @add_start_docstrings_to_model_forward(CLIP_VISION_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=CLIPVisionModelOutput, config_class=CLIPVisionConfig)
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, CLIPVisionModelOutput]:
        """
        Returns:

        Examples:

        ```
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, CLIPVisionModelWithProjection

        >>> model = CLIPVisionModelWithProjection.from_pretrained("openai/clip-vit-base-patch32")
        >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> image_embeds = outputs.image_embeds
        ```
        """
        # An explicit `return_dict` wins; otherwise defer to the model config.
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Element 1 of the vision model output is the pooled output; project it.
        pooled_output = vision_outputs[1]
        image_embeds = self.visual_projection(pooled_output)

        if not return_dict:
            # Tuple form: projected embeddings first, then the last hidden state and any extras,
            # with None entries dropped.
            outputs = (image_embeds, vision_outputs[0]) + vision_outputs[2:]
            return tuple(output for output in outputs if output is not None)

        return CLIPVisionModelOutput(
            image_embeds=image_embeds,
            last_hidden_state=vision_outputs.last_hidden_state,
            hidden_states=vision_outputs.hidden_states,
            attentions=vision_outputs.attentions,
        )


# 添加关于图像分类的描述性注释，继承自 CLIPPreTrainedModel 的类
@add_start_docstrings(
    """
    CLIP vision encoder with an image classification head on top (a linear layer on top of the pooled final hidden states of
    the patch tokens) e.g. for ImageNet.
    """,
    CLIP_START_DOCSTRING,
)
class CLIPForImageClassification(CLIPPreTrainedModel):
    # Name of the primary input argument of `forward`.
    main_input_name = "pixel_values"

    def __init__(self, config: CLIPConfig) -> None:
        """Build the vision transformer from `config.vision_config` and the classification head."""
        super().__init__(config)

        self.num_labels = config.num_labels

        self.vision_model = CLIPVisionTransformer(config.vision_config)

        # Classifier head: a linear layer when there are labels to predict, otherwise a pass-through.
        if config.num_labels > 0:
            head = nn.Linear(config.vision_config.hidden_size, config.num_labels)
        else:
            head = nn.Identity()
        self.classifier = head

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(CLIP_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_IMAGE_CLASS_CHECKPOINT,
        output_type=ImageClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
        expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
    )
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, ImageClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        # Any flag left as None falls back to the value stored in the config.
        if output_attentions is None:
            output_attentions = self.config.output_attentions
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states
        if return_dict is None:
            return_dict = self.config.use_return_dict

        outputs = self.vision_model(
            pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Average the last hidden state over dim 1, leaving out the entry at index 0.
        patch_tokens = outputs[0][:, 1:, :]
        pooled = torch.mean(patch_tokens, dim=1)

        logits = self.classifier(pooled)

        loss = None
        if labels is not None:
            # Move labels to the logits' device before computing the loss.
            labels = labels.to(logits.device)

            # When the problem type is not configured, infer it once and store it on the config.
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and labels.dtype in (torch.long, torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            problem_type = self.config.problem_type
            if problem_type == "regression":
                criterion = MSELoss()
                if self.num_labels == 1:
                    loss = criterion(logits.squeeze(), labels.squeeze())
                else:
                    loss = criterion(logits, labels)
            elif problem_type == "single_label_classification":
                criterion = CrossEntropyLoss()
                loss = criterion(logits.view(-1, self.num_labels), labels.view(-1))
            elif problem_type == "multi_label_classification":
                criterion = BCEWithLogitsLoss()
                loss = criterion(logits, labels)

        if return_dict:
            return ImageClassifierOutput(
                loss=loss,
                logits=logits,
                hidden_states=outputs.hidden_states,
                attentions=outputs.attentions,
            )

        # Tuple form: logits plus everything after the first two vision outputs, with the loss
        # prepended only when it was computed.
        output = (logits,) + outputs[2:]
        if loss is None:
            return output
        return (loss,) + output

`.\models\clip\modeling_flax_clip.py`

# 导入所需模块和类
from typing import Any, Optional, Tuple, Union

import flax
import flax.linen as nn
import jax
import jax.numpy as jnp
from flax.core.frozen_dict import FrozenDict, freeze, unfreeze
from flax.linen import combine_masks, make_causal_mask
from flax.linen.attention import dot_product_attention_weights
from flax.traverse_util import flatten_dict, unflatten_dict
from jax import lax

# 导入模型输出相关的类
from ...modeling_flax_outputs import FlaxBaseModelOutput, FlaxBaseModelOutputWithPooling
# 导入模型工具函数和类
from ...modeling_flax_utils import (
    ACT2FN,
    FlaxPreTrainedModel,
    append_replace_return_docstrings,
    overwrite_call_docstring,
)
# 导入通用工具函数和类
from ...utils import ModelOutput, add_start_docstrings, logging
# 导入相关配置类
from .configuration_clip import CLIPConfig, CLIPTextConfig, CLIPVisionConfig

# Module-level logger, named after this module.
logger = logging.get_logger(__name__)

# Docstring preamble for the Flax CLIP models: what the model inherits from, the JAX features it supports,
# and the constructor parameters (`config`, `dtype`).
CLIP_START_DOCSTRING = r"""

    This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading, saving and converting weights from PyTorch models)

    This model is also a
    [flax.linen.Module](https://flax.readthedocs.io/en/latest/api_reference/flax.linen/module.html) subclass. Use it as
    a regular Flax linen Module and refer to the Flax documentation for all matter related to general usage and
    behavior.

    Finally, this model supports inherent JAX features such as:

    - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
    - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
    - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
    - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)

    Parameters:
        config ([`CLIPConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights.
        dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
            The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and
            `jax.numpy.bfloat16` (on TPUs).

            This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If
            specified all the computation will be performed with the given `dtype`.

            **Note that this only specifies the dtype of the computation and does not influence the dtype of model
            parameters.**

            If you wish to change the dtype of the model parameters, see [`~FlaxPreTrainedModel.to_fp16`] and
            [`~FlaxPreTrainedModel.to_bf16`].
"""
# Argument documentation for text inputs: input_ids, attention_mask, position_ids and the output flags.
CLIP_TEXT_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""

# Argument documentation for vision inputs: pixel_values and the output flags.
CLIP_VISION_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`numpy.ndarray` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
            [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""

# Argument documentation for the combined model: the text arguments, pixel_values and the output flags.
# This must be a single string literal; closing it before the `Args:` section turns the rest into invalid code.
CLIP_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        pixel_values (`numpy.ndarray` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
            [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@flax.struct.dataclass
class FlaxCLIPTextModelOutput(ModelOutput):
    """
    Base class for text model's outputs that also contains a pooling of the last hidden states.

    Args:
        text_embeds (`jnp.ndarray` of shape `(batch_size, output_dim)`):
            The text embeddings obtained by applying the projection layer to the pooled output of
            [`FlaxCLIPTextModel`].
        last_hidden_state (`jnp.ndarray` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    # Projected text embeddings (see `text_embeds` in the docstring).
    text_embeds: jnp.ndarray = None
    # Hidden states at the output of the last layer.
    last_hidden_state: jnp.ndarray = None
    # Per-layer hidden states; only set when hidden states were requested.
    hidden_states: Optional[Tuple[jnp.ndarray, ...]] = None
    # Per-layer attention weights; only set when attentions were requested.
    attentions: Optional[Tuple[jnp.ndarray, ...]] = None


@flax.struct.dataclass
class FlaxCLIPOutput(ModelOutput):
    """
    Output container for the joint CLIP model: similarity logits, projected embeddings and both tower outputs.

    Args:
        logits_per_image: (`jnp.ndarray` of shape `(image_batch_size, text_batch_size)`):
            The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
            similarity scores.
        logits_per_text: (`jnp.ndarray` of shape `(text_batch_size, image_batch_size)`):
            The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
            similarity scores.
        text_embeds: (`jnp.ndarray` of shape `(batch_size, output_dim)`):
            The text embeddings obtained by applying the projection layer to the pooled output of
            [`FlaxCLIPTextModel`].
        image_embeds: (`jnp.ndarray` of shape `(batch_size, output_dim)`):
            The image embeddings obtained by applying the projection layer to the pooled output of
            [`FlaxCLIPVisionModel`].
        text_model_output: (`FlaxBaseModelOutputWithPooling`):
            The output of the [`FlaxCLIPTextModel`].
        vision_model_output: (`FlaxBaseModelOutputWithPooling`):
            The output of the [`FlaxCLIPVisionModel`].
    """

    logits_per_image: jnp.ndarray = None  # image-to-text similarity scores
    logits_per_text: jnp.ndarray = None  # text-to-image similarity scores
    text_embeds: jnp.ndarray = None  # projected text embeddings
    image_embeds: jnp.ndarray = None  # projected image embeddings
    # Raw outputs of the two towers; both are themselves output containers.
    text_model_output: FlaxBaseModelOutputWithPooling = None
    vision_model_output: FlaxBaseModelOutputWithPooling = None

    def to_tuple(self) -> Tuple[Any]:
        """Return the populated fields as a tuple, converting the two nested tower outputs to tuples as well."""
        nested_fields = ("text_model_output", "vision_model_output")
        values = []
        for key in self.keys():
            if key in nested_fields:
                # Nested containers are flattened through their own `to_tuple`.
                values.append(getattr(self, key).to_tuple())
            else:
                values.append(self[key])
        return tuple(values)
class FlaxCLIPVisionEmbeddings(nn.Module):
    """Embed images as a sequence of patch vectors preceded by a learned class token.

    Attributes:
        config: Vision configuration; `hidden_size`, `image_size` and `patch_size` are read in `setup`.
        dtype: Computation dtype passed to the patch convolution.
    """

    config: CLIPVisionConfig
    # The field must exist: `setup` reads `self.dtype`, and `FlaxCLIPVisionTransformer` constructs this
    # module with `dtype=...`.
    dtype: jnp.dtype = jnp.float32

    def setup(self):
        """Create the class token, the patch convolution and the position embedding table."""
        embed_dim = self.config.hidden_size
        image_size = self.config.image_size
        patch_size = self.config.patch_size

        # Learned vector prepended to every patch sequence.
        self.class_embedding = self.param("class_embedding", jax.nn.initializers.normal(stddev=0.02), (embed_dim,))

        # Kernel size equals stride, so the convolution sees non-overlapping patches.
        self.patch_embedding = nn.Conv(
            embed_dim,
            kernel_size=(patch_size, patch_size),
            strides=(patch_size, patch_size),
            padding="VALID",
            use_bias=False,
            dtype=self.dtype,
            kernel_init=jax.nn.initializers.normal(),
        )

        self.num_patches = (image_size // patch_size) ** 2
        # One extra position for the class token.
        num_positions = self.num_patches + 1
        self.position_embedding = nn.Embed(num_positions, embed_dim, embedding_init=jax.nn.initializers.normal())
        # Shape (1, num_positions); broadcasts over the batch when added in `__call__`.
        self.position_ids = jnp.expand_dims(jnp.arange(0, num_positions, dtype="i4"), axis=0)

    def __call__(self, pixel_values):
        """Return embeddings of shape `(batch, 1 + height * width, channels)` for `pixel_values`.

        `pixel_values` is consumed by `nn.Conv` and its output is unpacked as
        `(batch, height, width, channels)`, i.e. a channels-last layout is assumed.
        """
        patch_embeds = self.patch_embedding(pixel_values)
        batch_size, height, width, channels = patch_embeds.shape
        # Flatten the spatial grid into a single sequence axis.
        patch_embeds = jnp.reshape(patch_embeds, (batch_size, height * width, channels))

        # (embed_dim,) -> (1, 1, embed_dim) -> (batch, 1, embed_dim)
        class_embeds = jnp.expand_dims(self.class_embedding, axis=(0, 1))
        class_embeds = jnp.tile(class_embeds, (batch_size, 1, 1))
        embeddings = jnp.concatenate([class_embeds, patch_embeds], axis=1)
        embeddings = embeddings + self.position_embedding(self.position_ids)
        return embeddings


class FlaxCLIPTextEmbeddings(nn.Module):
    """Sum of token embeddings and learned position embeddings for the CLIP text tower.

    Attributes:
        config: Text configuration; `hidden_size`, `vocab_size` and `max_position_embeddings` are read in `setup`.
        dtype: Computation dtype. Not read inside this class, but required so the module can be
            constructed with `dtype=...` as `FlaxCLIPTextTransformer` does.
    """

    config: CLIPTextConfig
    dtype: jnp.dtype = jnp.float32

    def setup(self):
        """Create the token and position embedding tables."""
        embed_dim = self.config.hidden_size

        self.token_embedding = nn.Embed(self.config.vocab_size, embed_dim, embedding_init=jax.nn.initializers.normal())
        self.position_embedding = nn.Embed(
            self.config.max_position_embeddings, embed_dim, embedding_init=jax.nn.initializers.normal()
        )
        # Shape (1, 1, max_position_embeddings). `__call__` below takes position ids as an argument
        # and does not read this attribute.
        self.position_ids = jnp.expand_dims(
            jnp.arange(0, self.config.max_position_embeddings, dtype="i4"), axis=(0, 1)
        )

    def __call__(self, input_ids, position_ids):
        """Return `token_embedding(input_ids) + position_embedding(position_ids)`.

        Both inputs are cast to int32 before the lookup.
        """
        input_embeds = self.token_embedding(input_ids.astype("i4"))
        position_embeds = self.position_embedding(position_ids.astype("i4"))

        embeddings = input_embeds + position_embeds
        return embeddings


class FlaxCLIPAttention(nn.Module):
    """Multi-head self-attention used by both CLIP towers.

    The attention is causal when `config` is a `CLIPTextConfig` and unmasked otherwise.

    Attributes:
        config: Text or vision configuration; `hidden_size`, `num_attention_heads` and
            `attention_dropout` are read (plus `max_position_embeddings` in the causal case).
        dtype: Computation dtype for the projections, the attention bias and the attention weights.
    """

    config: Union[CLIPTextConfig, CLIPVisionConfig]
    # The field must exist: it is read throughout this class and callers construct the module with `dtype=...`.
    dtype: jnp.dtype = jnp.float32

    def setup(self):
        """Validate the head split and create the projections and, if causal, the causal mask.

        Raises:
            ValueError: If `hidden_size` is not divisible by `num_attention_heads`.
        """
        self.embed_dim = self.config.hidden_size
        self.num_heads = self.config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        # Stored for reference; not read elsewhere in this class.
        self.scale = self.head_dim**-0.5
        self.dropout = self.config.attention_dropout

        self.k_proj = nn.Dense(self.embed_dim, dtype=self.dtype, kernel_init=jax.nn.initializers.normal(0.01))
        self.v_proj = nn.Dense(self.embed_dim, dtype=self.dtype, kernel_init=jax.nn.initializers.normal(0.01))
        self.q_proj = nn.Dense(self.embed_dim, dtype=self.dtype, kernel_init=jax.nn.initializers.normal(0.01))
        self.out_proj = nn.Dense(self.embed_dim, dtype=self.dtype, kernel_init=jax.nn.initializers.normal(0.01))

        # Only the text tower attends causally.
        self.causal = isinstance(self.config, CLIPTextConfig)
        if self.causal:
            # Built once for the maximum length and sliced per call.
            self.causal_mask = make_causal_mask(jnp.ones((1, self.config.max_position_embeddings), dtype="i4"))

    def _split_heads(self, hidden_states):
        """Reshape `(..., embed_dim)` to `(..., num_heads, head_dim)`, keeping the first two axes."""
        return hidden_states.reshape(hidden_states.shape[:2] + (self.num_heads, self.head_dim))

    def _merge_heads(self, hidden_states):
        """Inverse of `_split_heads`: collapse the head axes back into `embed_dim`."""
        return hidden_states.reshape(hidden_states.shape[:2] + (self.embed_dim,))

    def __call__(
        self,
        hidden_states,
        attention_mask=None,
        deterministic: bool = True,
        output_attentions: bool = False,
    ):
        """Apply self-attention to `hidden_states`.

        Args:
            hidden_states: Input sequence; the first two axes are preserved by the head split/merge.
            attention_mask: Optional mask where values `> 0` mark positions that may be attended to.
                Two axes are inserted at positions -3 and -2 before use.
            deterministic: When `False` and the dropout rate is positive, attention dropout is applied
                using the `"dropout"` RNG stream.
            output_attentions: Whether to also return the attention weights.

        Returns:
            `(attn_output,)` or `(attn_output, attn_weights)` depending on `output_attentions`.
        """
        query = self.q_proj(hidden_states)
        key = self.k_proj(hidden_states)
        value = self.v_proj(hidden_states)

        query = self._split_heads(query)
        key = self._split_heads(key)
        value = self._split_heads(value)

        causal_attention_mask = None
        if self.causal:
            # Slice the precomputed mask down to the current query/key lengths.
            query_length, key_length = query.shape[1], key.shape[1]
            causal_attention_mask = self.causal_mask[:, :, key_length - query_length : key_length, :key_length]

        # Merge the caller's mask with the causal mask, whichever of the two are present.
        if attention_mask is not None and causal_attention_mask is not None:
            attention_mask = jnp.expand_dims(attention_mask, axis=(-3, -2))
            attention_mask = combine_masks(attention_mask, causal_attention_mask, dtype="i4")
        elif causal_attention_mask is not None:
            attention_mask = causal_attention_mask
        elif attention_mask is not None:
            attention_mask = jnp.expand_dims(attention_mask, axis=(-3, -2))

        # Convert the boolean-like mask into an additive bias: 0 where allowed, dtype-min where masked.
        if attention_mask is not None:
            attention_bias = lax.select(
                attention_mask > 0,
                jnp.full(attention_mask.shape, 0.0).astype(self.dtype),
                jnp.full(attention_mask.shape, jnp.finfo(self.dtype).min).astype(self.dtype),
            )
        else:
            attention_bias = None

        # Only request a dropout RNG when dropout will actually be applied.
        dropout_rng = None
        if not deterministic and self.dropout > 0.0:
            dropout_rng = self.make_rng("dropout")

        attn_weights = dot_product_attention_weights(
            query,
            key,
            bias=attention_bias,
            dropout_rng=dropout_rng,
            dropout_rate=self.dropout,
            deterministic=deterministic,
            dtype=self.dtype,
            precision=None,
        )

        # Weighted sum of values per head, then merge heads and project out.
        attn_output = jnp.einsum("...hqk,...khd->...qhd", attn_weights, value)
        attn_output = self._merge_heads(attn_output)
        attn_output = self.out_proj(attn_output)

        outputs = (attn_output, attn_weights) if output_attentions else (attn_output,)
        return outputs
class FlaxCLIPMLP(nn.Module):
    """Two-layer feed-forward block: `hidden -> intermediate_size -> activation -> hidden_size`.

    Attributes:
        config: Text or vision configuration; `hidden_act`, `intermediate_size` and `hidden_size` are read.
        dtype: Computation dtype for both dense layers.
    """

    config: Union[CLIPTextConfig, CLIPVisionConfig]
    dtype: jnp.dtype = jnp.float32

    def setup(self):
        """Look up the activation and create the two dense layers."""
        self.activation_fn = ACT2FN[self.config.hidden_act]
        # Expands to `intermediate_size` output features.
        self.fc1 = nn.Dense(
            features=self.config.intermediate_size,
            dtype=self.dtype,
            kernel_init=jax.nn.initializers.normal(0.01),
        )
        # Projects back to `hidden_size` output features.
        self.fc2 = nn.Dense(
            features=self.config.hidden_size,
            dtype=self.dtype,
            kernel_init=jax.nn.initializers.normal(0.01),
        )

    def __call__(self, hidden_states):
        """Return `fc2(activation_fn(fc1(hidden_states)))`."""
        expanded = self.fc1(hidden_states)
        activated = self.activation_fn(expanded)
        return self.fc2(activated)


class FlaxCLIPEncoderLayer(nn.Module):
    """One pre-norm transformer layer: self-attention and MLP, each wrapped in a residual connection.

    Attributes:
        config: Text or vision configuration; forwarded to the sub-modules, `layer_norm_eps` is read here.
        dtype: Computation dtype forwarded to every sub-module.
    """

    config: Union[CLIPTextConfig, CLIPVisionConfig]
    dtype: jnp.dtype = jnp.float32

    def setup(self):
        """Create the attention block, the MLP and their two layer norms."""
        self.self_attn = FlaxCLIPAttention(self.config, dtype=self.dtype)
        self.layer_norm1 = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype)
        self.mlp = FlaxCLIPMLP(self.config, dtype=self.dtype)
        self.layer_norm2 = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype)

    def __call__(
        self,
        hidden_states,
        attention_mask,
        deterministic: bool = True,
        output_attentions: bool = False,
    ):
        """Run the layer and return `(hidden_states,)`, plus the attention weights when requested."""
        # Attention sub-block: normalise, attend, add back the input.
        attn_outputs = self.self_attn(
            hidden_states=self.layer_norm1(hidden_states),
            attention_mask=attention_mask,
            deterministic=deterministic,
            output_attentions=output_attentions,
        )
        after_attention = hidden_states + attn_outputs[0]

        # Feed-forward sub-block: normalise, transform, add back the attention result.
        mlp_output = self.mlp(self.layer_norm2(after_attention))
        layer_output = after_attention + mlp_output

        if output_attentions:
            # Everything after the first element of the attention output is attention weights.
            return (layer_output,) + attn_outputs[1:]
        return (layer_output,)


class FlaxCLIPLayerCollection(nn.Module):
    """Stack of `config.num_hidden_layers` encoder layers applied in sequence.

    Attributes:
        config: Text or vision configuration; `num_hidden_layers` is read here.
        dtype: Computation dtype forwarded to every layer.
    """

    config: Union[CLIPTextConfig, CLIPVisionConfig]
    dtype: jnp.dtype = jnp.float32

    def setup(self):
        """Create the layers, named "0", "1", ... by their index."""
        self.layers = [
            FlaxCLIPEncoderLayer(self.config, name=str(i), dtype=self.dtype)
            for i in range(self.config.num_hidden_layers)
        ]

    def __call__(
        self,
        hidden_states,
        attention_mask=None,
        deterministic: bool = True,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ):
        """Run every layer in order.

        Args:
            hidden_states: Input to the first layer.
            attention_mask: Optional mask forwarded unchanged to every layer.
            deterministic: Forwarded to every layer (controls dropout there).
            output_attentions: Collect each layer's attention weights.
            output_hidden_states: Collect the input of every layer plus the final output.
            return_dict: Return a `FlaxBaseModelOutput` instead of a tuple.

        Returns:
            A `FlaxBaseModelOutput`, or a one-element tuple `(hidden_states,)` when `return_dict` is falsy.
        """
        # Accumulators are tuples when the corresponding output is requested, otherwise None.
        all_attentions = () if output_attentions else None
        all_hidden_states = () if output_hidden_states else None

        for layer in self.layers:
            if output_hidden_states:
                # Record the state *entering* this layer.
                all_hidden_states += (hidden_states,)

            layer_outputs = layer(
                hidden_states, attention_mask, deterministic=deterministic, output_attentions=output_attentions
            )
            # Each layer returns a tuple; its first element is the new hidden state.
            hidden_states = layer_outputs[0]

            if output_attentions:
                all_attentions += (layer_outputs[1],)

        if output_hidden_states:
            # Record the output of the last layer as well.
            all_hidden_states += (hidden_states,)

        outputs = (hidden_states,)

        if not return_dict:
            # NOTE(review): the tuple form carries only the final hidden state; collected hidden
            # states and attentions are not included. Confirm this is intended before changing it.
            return tuple(v for v in outputs if v is not None)

        return FlaxBaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions
        )
class FlaxCLIPEncoder(nn.Module):
    """Thin wrapper that owns the layer stack under the attribute name `layers`.

    Attributes:
        config: Text or vision configuration, forwarded to the layer collection.
        dtype: Computation dtype, forwarded to the layer collection.
    """

    config: Union[CLIPTextConfig, CLIPVisionConfig]
    dtype: jnp.dtype = jnp.float32

    def setup(self):
        """Create the wrapped `FlaxCLIPLayerCollection`."""
        self.layers = FlaxCLIPLayerCollection(self.config, dtype=self.dtype)

    def __call__(
        self,
        inputs_embeds,
        attention_mask=None,
        deterministic: bool = True,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ):
        """Pass `inputs_embeds` through the layer stack and return its result unchanged."""
        # `inputs_embeds` is the stack's `hidden_states`; every other argument keeps its name.
        encoded = self.layers(
            hidden_states=inputs_embeds,
            attention_mask=attention_mask,
            deterministic=deterministic,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        return encoded


class FlaxCLIPTextTransformer(nn.Module):
    """CLIP text tower: embeddings, encoder, final layer norm and end-of-sequence pooling.

    Attributes:
        config: Text configuration; `layer_norm_eps`, `eos_token_id` and the output flags are read here.
        dtype: Computation dtype forwarded to every sub-module.
    """

    config: CLIPTextConfig
    dtype: jnp.dtype = jnp.float32

    def setup(self):
        """Create the embeddings, encoder and final layer norm, and cache the EOS token id."""
        self.embeddings = FlaxCLIPTextEmbeddings(self.config, dtype=self.dtype)
        self.encoder = FlaxCLIPEncoder(self.config, dtype=self.dtype)
        self.final_layer_norm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype)

        # For `pooled_output` computation
        self.eos_token_id = self.config.eos_token_id

    def __call__(
        self,
        input_ids,
        attention_mask,
        position_ids,
        deterministic: bool = True,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ):
        """Encode `input_ids` and pool one hidden state per sequence.

        Args:
            input_ids: Token ids; also used to locate the pooling position.
            attention_mask: Mask forwarded to the encoder.
            position_ids: Position ids forwarded to the embeddings.
            deterministic: Forwarded to the encoder (controls dropout there).
            output_attentions: Falls back to `config.output_attentions` when `None`.
            output_hidden_states: Falls back to `config.output_hidden_states` when `None`.
            return_dict: Falls back to `config.use_return_dict` when `None`.

        Returns:
            A `FlaxBaseModelOutputWithPooling`, or the tuple
            `(last_hidden_state, pooled_output) + encoder_outputs[1:]` when `return_dict` is falsy.
        """
        # `None` means "use the configured default" for each of the three flags.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids)

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            attention_mask=attention_mask,
            deterministic=deterministic,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        last_hidden_state = encoder_outputs[0]
        last_hidden_state = self.final_layer_norm(last_hidden_state)

        if self.eos_token_id == 2:
            # Pool at the position of the largest token id in each sequence.
            # NOTE(review): presumably kept for configs where the EOS token is the highest id - confirm.
            pooled_output = last_hidden_state[jnp.arange(last_hidden_state.shape[0]), input_ids.argmax(axis=-1)]
        else:
            # Pool at the first position whose token equals `eos_token_id`
            # (argmax over a boolean array returns the first True).
            pooled_output = last_hidden_state[
                jnp.arange(last_hidden_state.shape[0]), (input_ids == self.eos_token_id).argmax(axis=-1)
            ]

        if not return_dict:
            return (last_hidden_state, pooled_output) + encoder_outputs[1:]

        return FlaxBaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )
class FlaxCLIPVisionTransformer(nn.Module):
    """CLIP vision tower: patch embeddings, pre-norm, encoder and a post-norm on the class-token state.

    Attributes:
        config: Vision configuration; `layer_norm_eps` and the output flags are read here.
        dtype: Computation dtype forwarded to every sub-module.
    """

    config: CLIPVisionConfig
    dtype: jnp.dtype = jnp.float32

    def setup(self):
        """Create the embeddings, both layer norms and the encoder."""
        self.embeddings = FlaxCLIPVisionEmbeddings(self.config, dtype=self.dtype)
        # The attribute name (including its spelling) determines the parameter name; keep it as is.
        self.pre_layrnorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype)
        self.encoder = FlaxCLIPEncoder(self.config, dtype=self.dtype)
        self.post_layernorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype)

    def __call__(
        self,
        pixel_values=None,
        deterministic: bool = True,
        output_attentions=None,
        output_hidden_states=None,
        return_dict: bool = True,
    ):
        """Encode `pixel_values` and pool the state at sequence position 0.

        The three output flags fall back to the configuration when they are `None`.
        Returns a `FlaxBaseModelOutputWithPooling`, or the tuple
        `(last_hidden_state, pooled_output) + encoder_outputs[1:]` when `return_dict` is falsy.
        """
        if output_attentions is None:
            output_attentions = self.config.output_attentions
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states
        if return_dict is None:
            return_dict = self.config.use_return_dict

        # Embed, then normalise before the encoder.
        normed_embeddings = self.pre_layrnorm(self.embeddings(pixel_values))

        encoder_outputs = self.encoder(
            inputs_embeds=normed_embeddings,
            deterministic=deterministic,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        last_hidden_state = encoder_outputs[0]
        # Position 0 holds the class token prepended by the embeddings module.
        pooled_output = self.post_layernorm(last_hidden_state[:, 0, :])

        if return_dict:
            return FlaxBaseModelOutputWithPooling(
                last_hidden_state=last_hidden_state,
                pooler_output=pooled_output,
                hidden_states=encoder_outputs.hidden_states,
                attentions=encoder_outputs.attentions,
            )
        return (last_hidden_state, pooled_output) + encoder_outputs[1:]


class FlaxCLIPTextPreTrainedModel(FlaxPreTrainedModel):
    """Pretrained-model wrapper for CLIP text modules.

    Subclasses set `module_class` to the Flax module to instantiate.
    """

    config_class = CLIPTextConfig
    module_class: nn.Module = None

    def __init__(
        self,
        config: CLIPTextConfig,
        input_shape=(1, 1),
        seed: int = 0,
        dtype: jnp.dtype = jnp.float32,
        _do_init: bool = True,
        **kwargs,
    ):
        """Instantiate `module_class` with `config`/`dtype` and hand it to the base-class initialiser."""
        module = self.module_class(config=config, dtype=dtype, **kwargs)
        super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init)

    def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple, params: FrozenDict = None) -> FrozenDict:
        """Initialise parameters by tracing the module on dummy inputs.

        Args:
            rng: Key split into the `"params"` and `"dropout"` streams.
            input_shape: Shape of the dummy `input_ids`.
            params: Optional existing parameters; only the keys in `self._missing_keys` are filled
                in from the freshly initialised ones.

        Returns:
            The fresh parameters, or `params` completed with the missing keys (frozen).
        """
        # Dummy inputs: all-zero ids, positions 0..seq_len-1, and an all-ones mask.
        input_ids = jnp.zeros(input_shape, dtype="i4")
        position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_shape)
        attention_mask = jnp.ones_like(input_ids)

        params_rng, dropout_rng = jax.random.split(rng)
        rngs = {"params": params_rng, "dropout": dropout_rng}

        random_params = self.module.init(rngs, input_ids, attention_mask, position_ids)["params"]

        if params is not None:
            random_params = flatten_dict(unfreeze(random_params))
            params = flatten_dict(unfreeze(params))
            for missing_key in self._missing_keys:
                params[missing_key] = random_params[missing_key]
            # Everything is filled in now.
            self._missing_keys = set()
            return freeze(unflatten_dict(params))
        else:
            return random_params

    def __call__(
        self,
        input_ids,
        attention_mask=None,
        position_ids=None,
        params: dict = None,
        dropout_rng: jax.random.PRNGKey = None,
        train: bool = False,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        """Run the wrapped module on `input_ids`.

        Args:
            input_ids: Token ids.
            attention_mask: Defaults to all ones with the shape of `input_ids`.
            position_ids: Defaults to `0..seq_len-1` broadcast to the shape of `input_ids`.
            params: Parameters to use instead of `self.params`.
            dropout_rng: Key for the `"dropout"` RNG stream, if any.
            train: The module is called with `deterministic = not train`.
            output_attentions: Falls back to `config.output_attentions` when `None`.
            output_hidden_states: Falls back to `config.output_hidden_states` when `None`.
            return_dict: Falls back to `config.return_dict` when `None`.

        Returns:
            Whatever the wrapped module returns.
        """
        # Resolve the tri-state flags here, as the vision wrapper does, so the module never
        # receives `None` (a module testing `if not return_dict` would treat it as tuple mode).
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.return_dict

        if position_ids is None:
            position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape)

        if attention_mask is None:
            attention_mask = jnp.ones_like(input_ids)

        # Handle any PRNG if needed
        rngs = {}
        if dropout_rng is not None:
            rngs["dropout"] = dropout_rng

        # Positional order must match the module's `__call__`:
        # input_ids, attention_mask, position_ids, deterministic, then the three output flags.
        return self.module.apply(
            {"params": params or self.params},
            jnp.array(input_ids, dtype="i4"),
            jnp.array(attention_mask, dtype="i4"),
            jnp.array(position_ids, dtype="i4"),
            not train,
            output_attentions,
            output_hidden_states,
            return_dict,
            rngs=rngs,
        )
class FlaxCLIPVisionPreTrainedModel(FlaxPreTrainedModel):
    """Pretrained-model wrapper for CLIP vision modules.

    Subclasses set `module_class` to the Flax module to instantiate.
    """

    config_class = CLIPVisionConfig
    main_input_name = "pixel_values"
    module_class: nn.Module = None

    def __init__(
        self,
        config: CLIPVisionConfig,
        input_shape: Optional[Tuple] = None,
        seed: int = 0,
        dtype: jnp.dtype = jnp.float32,
        _do_init: bool = True,
        **kwargs,
    ):
        """Instantiate `module_class` and hand it to the base-class initialiser.

        When `input_shape` is `None` it defaults to one channels-last RGB image of the configured size.
        """
        if input_shape is None:
            side = config.image_size
            input_shape = (1, side, side, 3)
        vision_module = self.module_class(config=config, dtype=dtype, **kwargs)
        super().__init__(config, vision_module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init)

    def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple, params: FrozenDict = None) -> FrozenDict:
        """Initialise parameters by tracing the module on random dummy pixels.

        Returns the fresh parameters, or `params` completed with the keys listed in
        `self._missing_keys` (frozen) when `params` is given.
        """
        # Dummy image batch drawn from the incoming key, before it is split.
        dummy_pixels = jax.random.normal(rng, input_shape)

        params_key, dropout_key = jax.random.split(rng)
        fresh_params = self.module.init({"params": params_key, "dropout": dropout_key}, dummy_pixels)["params"]

        if params is None:
            return fresh_params

        # Copy only the missing entries from the fresh parameters into the provided ones.
        flat_fresh = flatten_dict(unfreeze(fresh_params))
        flat_given = flatten_dict(unfreeze(params))
        for key in self._missing_keys:
            flat_given[key] = flat_fresh[key]
        self._missing_keys = set()
        return freeze(unflatten_dict(flat_given))

    def __call__(
        self,
        pixel_values,
        params: dict = None,
        dropout_rng: jax.random.PRNGKey = None,
        train: bool = False,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        """Run the wrapped module on `pixel_values`.

        `pixel_values` has its axes permuted with `(0, 2, 3, 1)` and is cast to float32 before the
        call. The three output flags fall back to the configuration when `None`; the module is
        called with `deterministic = not train`.
        """
        if output_attentions is None:
            output_attentions = self.config.output_attentions
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states
        if return_dict is None:
            return_dict = self.config.return_dict

        # Move axis 1 to the end (channels-first -> channels-last, assuming NCHW input).
        channels_last = jnp.transpose(pixel_values, (0, 2, 3, 1))

        # Only provide a dropout stream when the caller supplied a key.
        rngs = {} if dropout_rng is None else {"dropout": dropout_rng}

        variables = {"params": params or self.params}
        return self.module.apply(
            variables,
            jnp.array(channels_last, dtype=jnp.float32),
            not train,
            output_attentions,
            output_hidden_states,
            return_dict,
            rngs=rngs,
        )


# Base class for the Flax CLIP models that take both text and image inputs.
class FlaxCLIPPreTrainedModel(FlaxPreTrainedModel):
    # Configuration class associated with this model.
    config_class = CLIPConfig
    # Flax module class to instantiate; left unset here and assigned by subclasses.
    module_class: nn.Module = None

    def __init__(
        self,
        config: CLIPConfig,
        input_shape: Optional[Tuple] = None,
        seed: int = 0,
        dtype: jnp.dtype = jnp.float32,
        _do_init: bool = True,
        **kwargs,
    ):
        # Default input shapes: a (1, 1) text input and a single
        # (1, image_size, image_size, 3) image input.
        if input_shape is None:
            input_shape = ((1, 1), (1, config.vision_config.image_size, config.vision_config.image_size, 3))

        # Instantiate the wrapped Flax module; extra keyword arguments are forwarded to it.
        module = self.module_class(config=config, dtype=dtype, **kwargs)

        super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init)

    def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple, params: FrozenDict = None) -> FrozenDict:
        # Dummy text inputs: all-zero token ids, matching position ids and an all-ones attention mask.
        input_ids = jnp.zeros(input_shape[0], dtype="i4")
        position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_shape[0])
        attention_mask = jnp.ones_like(input_ids)

        # Dummy image input drawn from a normal distribution.
        pixel_values = jax.random.normal(rng, input_shape[1])

        # Separate PRNG streams for parameter initialization and dropout.
        params_rng, dropout_rng = jax.random.split(rng)
        rngs = {"params": params_rng, "dropout": dropout_rng}

        random_params = self.module.init(rngs, input_ids, pixel_values, attention_mask, position_ids)["params"]

        if params is None:
            return random_params

        # Params were supplied: only the keys recorded in `_missing_keys` are
        # filled in from the freshly initialized values.
        random_params = flatten_dict(unfreeze(random_params))
        params = flatten_dict(unfreeze(params))
        for missing_key in self._missing_keys:
            params[missing_key] = random_params[missing_key]
        self._missing_keys = set()
        return freeze(unflatten_dict(params))

    def __call__(
        self,
        input_ids,
        pixel_values,
        attention_mask=None,
        position_ids=None,
        params: dict = None,
        dropout_rng: jax.random.PRNGKey = None,
        train: bool = False,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        # Default position ids: 0..seq_len-1 broadcast to the shape of `input_ids`.
        if position_ids is None:
            position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape)

        # Default attention mask: all ones, same shape as `input_ids`.
        if attention_mask is None:
            attention_mask = jnp.ones_like(input_ids)

        # Move axis 1 of the image input to the last position.
        pixel_values = jnp.transpose(pixel_values, (0, 2, 3, 1))

        # A dropout PRNG key is only forwarded when the caller supplied one.
        rngs = {}
        if dropout_rng is not None:
            rngs["dropout"] = dropout_rng

        return self.module.apply(
            {"params": params or self.params},
            jnp.array(input_ids, dtype="i4"),
            jnp.array(pixel_values, dtype=jnp.float32),
            jnp.array(attention_mask, dtype="i4"),
            jnp.array(position_ids, dtype="i4"),
            not train,
            output_attentions,
            output_hidden_states,
            return_dict,
            rngs=rngs,
        )

    def get_text_features(
        self,
        input_ids,
        attention_mask=None,
        position_ids=None,
        params: dict = None,
        dropout_rng: jax.random.PRNGKey = None,
        train=False,
    ):
        r"""
        Args:
            input_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
                provide it.

                Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
                [`PreTrainedTokenizer.__call__`] for details.

                [What are input IDs?](../glossary#input-ids)

        Returns:
            text_features (`jnp.ndarray` of shape `(batch_size, output_dim)`): The text embeddings obtained by applying
            the projection layer to the pooled output of [`FlaxCLIPTextModel`].

        Examples:

        ```
        >>> from transformers import AutoTokenizer, FlaxCLIPModel

        >>> model = FlaxCLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="np")
        >>> text_features = model.get_text_features(**inputs)
        ```"""

        # Default position ids: 0..seq_len-1 broadcast to the shape of `input_ids`.
        if position_ids is None:
            position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape)

        # Default attention mask: all ones, same shape as `input_ids`.
        if attention_mask is None:
            attention_mask = jnp.ones_like(input_ids)

        # A dropout PRNG key is only forwarded when the caller supplied one.
        rngs = {}
        if dropout_rng is not None:
            rngs["dropout"] = dropout_rng

        def _get_features(module, input_ids, attention_mask, position_ids, deterministic):
            # Run only the text branch, then project element 1 of its output.
            text_outputs = module.text_model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                position_ids=position_ids,
                deterministic=deterministic,
            )
            pooled_output = text_outputs[1]
            text_features = module.text_projection(pooled_output)
            return text_features

        # `method=` makes `apply` run `_get_features` instead of the module's `__call__`.
        return self.module.apply(
            {"params": params or self.params},
            jnp.array(input_ids, dtype="i4"),
            jnp.array(attention_mask, dtype="i4"),
            jnp.array(position_ids, dtype="i4"),
            not train,
            method=_get_features,
            rngs=rngs,
        )

    def get_image_features(
        self, pixel_values, params: dict = None, dropout_rng: jax.random.PRNGKey = None, train=False
    ):
        r"""
        Args:
            pixel_values (`numpy.ndarray` of shape `(batch_size, num_channels, height, width)`):
                Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained
                using [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.

        Returns:
            image_features (`jnp.ndarray` of shape `(batch_size, output_dim)`): The image embeddings obtained by
            applying the projection layer to the pooled output of [`FlaxCLIPVisionModel`]

        Examples:

        ```
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, FlaxCLIPModel

        >>> model = FlaxCLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="np")

        >>> image_features = model.get_image_features(**inputs)
        ```"""
        # Reorder the axes to (batch_size, height, width, num_channels).
        pixel_values = jnp.transpose(pixel_values, (0, 2, 3, 1))

        # A dropout PRNG key is only forwarded when the caller supplied one.
        rngs = {}
        if dropout_rng is not None:
            rngs["dropout"] = dropout_rng

        def _get_features(module, pixel_values, deterministic):
            # Run only the vision branch, then project element 1 of its output.
            vision_outputs = module.vision_model(pixel_values=pixel_values, deterministic=deterministic)
            pooled_output = vision_outputs[1]  # pooled_output
            image_features = module.visual_projection(pooled_output)
            return image_features

        # `method=` makes `apply` run `_get_features` instead of the module's `__call__`.
        return self.module.apply(
            {"params": params or self.params},
            jnp.array(pixel_values, dtype=jnp.float32),
            not train,  # deterministic flag: True when not training
            method=_get_features,
            rngs=rngs,
        )
class FlaxCLIPTextModule(nn.Module):
    """Flax module holding a single `FlaxCLIPTextTransformer` and forwarding every call argument to it."""

    config: CLIPTextConfig
    dtype: jnp.dtype = jnp.float32

    def setup(self):
        # The text transformer is built from this module's own config and dtype.
        self.text_model = FlaxCLIPTextTransformer(self.config, dtype=self.dtype)

    def __call__(
        self,
        input_ids,
        attention_mask,
        position_ids,
        deterministic: bool = True,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ):
        # Pure delegation: the transformer's result is returned unchanged.
        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            deterministic=deterministic,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        return text_outputs


class FlaxCLIPTextModel(FlaxCLIPTextPreTrainedModel):
    # Module class used by this model: FlaxCLIPTextModule.
    module_class = FlaxCLIPTextModule


FLAX_CLIP_TEXT_MODEL_DOCSTRING = """
    Returns:

    Example:

    ```
    >>> from transformers import AutoTokenizer, FlaxCLIPTextModel

    >>> model = FlaxCLIPTextModel.from_pretrained("openai/clip-vit-base-patch32")
    >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")

    >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="np")

    >>> outputs = model(**inputs)
    >>> last_hidden_state = outputs.last_hidden_state
    >>> pooler_output = outputs.pooler_output  # pooled (EOS token) states
    ```
"""

# Set the `__call__` docstring of FlaxCLIPTextModel to the text-inputs docstring followed by the example above.
overwrite_call_docstring(FlaxCLIPTextModel, CLIP_TEXT_INPUTS_DOCSTRING + FLAX_CLIP_TEXT_MODEL_DOCSTRING)

# Register the output type and config class used for the return section of FlaxCLIPTextModel's docstring.
append_replace_return_docstrings(
    FlaxCLIPTextModel, output_type=FlaxBaseModelOutputWithPooling, config_class=CLIPTextConfig
)


class FlaxCLIPTextModelWithProjectionModule(nn.Module):
    """Flax module combining a `FlaxCLIPTextTransformer` with a bias-free dense projection of its pooled output."""

    config: CLIPTextConfig
    dtype: jnp.dtype = jnp.float32

    def setup(self):
        # Text transformer built from this module's config and dtype.
        self.text_model = FlaxCLIPTextTransformer(self.config, dtype=self.dtype)
        # Projection to `config.projection_dim` features, without a bias term.
        self.text_projection = nn.Dense(self.config.projection_dim, use_bias=False, dtype=self.dtype)

    def __call__(
        self,
        input_ids,
        attention_mask,
        position_ids,
        deterministic: bool = True,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ):
        # Run the text transformer with every argument forwarded unchanged.
        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            deterministic=deterministic,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Element 1 of the transformer output is the pooled output; project it.
        text_embeds = self.text_projection(text_outputs[1])

        if return_dict:
            return FlaxCLIPTextModelOutput(
                text_embeds=text_embeds,
                last_hidden_state=text_outputs.last_hidden_state,
                hidden_states=text_outputs.hidden_states,
                attentions=text_outputs.attentions,
            )

        # Tuple form: projected embeddings, element 0 of the transformer output, then elements 2 onwards.
        return (text_embeds, text_outputs[0]) + text_outputs[2:]
# Text model with a projection head. The class was previously declared twice in a row with
# identical bodies; a single definition is kept.
class FlaxCLIPTextModelWithProjection(FlaxCLIPTextPreTrainedModel):
    # Module class used by this model: FlaxCLIPTextModelWithProjectionModule.
    module_class = FlaxCLIPTextModelWithProjectionModule


FLAX_CLIP_TEXT_MODEL_WITH_PROJECTION_DOCSTRING = """
    Returns:

    Example:

    ```
    >>> from transformers import AutoTokenizer, FlaxCLIPTextModelWithProjection

    >>> model = FlaxCLIPTextModelWithProjection.from_pretrained("openai/clip-vit-base-patch32")
    >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")

    >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="np")

    >>> outputs = model(**inputs)
    >>> text_embeds = outputs.text_embeds
    ```
"""

# Set the `__call__` docstring to CLIP_TEXT_INPUTS_DOCSTRING followed by the example above.
overwrite_call_docstring(
    FlaxCLIPTextModelWithProjection, CLIP_TEXT_INPUTS_DOCSTRING + FLAX_CLIP_TEXT_MODEL_WITH_PROJECTION_DOCSTRING
)

# Register FlaxCLIPTextModelOutput / CLIPTextConfig for the return section of the docstring.
append_replace_return_docstrings(
    FlaxCLIPTextModelWithProjection, output_type=FlaxCLIPTextModelOutput, config_class=CLIPTextConfig
)


class FlaxCLIPVisionModule(nn.Module):
    """Flax module holding a single `FlaxCLIPVisionTransformer` and forwarding every call argument to it."""

    config: CLIPVisionConfig
    dtype: jnp.dtype = jnp.float32

    def setup(self):
        # The vision transformer is built from this module's own config and dtype.
        self.vision_model = FlaxCLIPVisionTransformer(self.config, dtype=self.dtype)

    def __call__(
        self,
        pixel_values,
        deterministic: bool = True,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ):
        # Pure delegation: the transformer's result is returned unchanged.
        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            deterministic=deterministic,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        return vision_outputs


class FlaxCLIPVisionModel(FlaxCLIPVisionPreTrainedModel):
    # Module class used by this model: FlaxCLIPVisionModule.
    module_class = FlaxCLIPVisionModule




FLAX_CLIP_VISION_MODEL_DOCSTRING = """
    Returns:

    Example:

    ```
    >>> from PIL import Image
    >>> import requests
    >>> from transformers import AutoProcessor, FlaxCLIPVisionModel

    >>> model = FlaxCLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32")
    >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

    >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    >>> image = Image.open(requests.get(url, stream=True).raw)

    >>> inputs = processor(images=image, return_tensors="np")

    >>> outputs = model(**inputs)
    >>> last_hidden_state = outputs.last_hidden_state
    >>> pooler_output = outputs.pooler_output  # pooled CLS states
    ```
"""

# Set the `__call__` docstring to CLIP_VISION_INPUTS_DOCSTRING followed by the example above.
overwrite_call_docstring(FlaxCLIPVisionModel, CLIP_VISION_INPUTS_DOCSTRING + FLAX_CLIP_VISION_MODEL_DOCSTRING)

# Register FlaxBaseModelOutputWithPooling / CLIPVisionConfig for the return section of the docstring.
append_replace_return_docstrings(
    FlaxCLIPVisionModel, output_type=FlaxBaseModelOutputWithPooling, config_class=CLIPVisionConfig
)


class FlaxCLIPModule(nn.Module):
    """Flax module with a text tower, a vision tower, one projection per tower and a learned logit scale."""

    config: CLIPConfig
    dtype: jnp.dtype = jnp.float32

    def setup(self):
        text_config = self.config.text_config
        vision_config = self.config.vision_config

        # Shared projection size and the hidden size of each tower.
        self.projection_dim = self.config.projection_dim
        self.text_embed_dim = text_config.hidden_size
        self.vision_embed_dim = vision_config.hidden_size

        # Text and vision transformers.
        self.text_model = FlaxCLIPTextTransformer(text_config, dtype=self.dtype)
        self.vision_model = FlaxCLIPVisionTransformer(vision_config, dtype=self.dtype)

        # Bias-free dense projections into the shared space, kernels drawn from normal(0.02).
        self.visual_projection = nn.Dense(
            self.projection_dim,
            dtype=self.dtype,
            kernel_init=jax.nn.initializers.normal(0.02),
            use_bias=False,
        )
        self.text_projection = nn.Dense(
            self.projection_dim,
            dtype=self.dtype,
            kernel_init=jax.nn.initializers.normal(0.02),
            use_bias=False,
        )

        # Scalar parameter initialized to `config.logit_scale_init_value`.
        self.logit_scale = self.param(
            "logit_scale", lambda _, shape: jnp.ones(shape) * self.config.logit_scale_init_value, []
        )

    def __call__(
        self,
        input_ids=None,
        pixel_values=None,
        attention_mask=None,
        position_ids=None,
        deterministic: bool = True,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        # The positional order above matches how the pretrained wrapper calls `module.apply`
        # (input_ids, pixel_values, attention_mask, position_ids, deterministic, ...).
        return_dict = return_dict if return_dict is not None else self.config.return_dict

        # Vision tower.
        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            deterministic=deterministic,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Text tower.
        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            deterministic=deterministic,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Element 1 of each tower output is projected into the shared space.
        image_embeds = vision_outputs[1]
        image_embeds = self.visual_projection(image_embeds)

        text_embeds = text_outputs[1]
        text_embeds = self.text_projection(text_embeds)

        # L2-normalize along the last axis so the matmul below yields cosine similarities.
        image_embeds = image_embeds / jnp.linalg.norm(image_embeds, axis=-1, keepdims=True)
        text_embeds = text_embeds / jnp.linalg.norm(text_embeds, axis=-1, keepdims=True)

        # Scaled cosine similarity; the stored parameter is exponentiated before use.
        logit_scale = jnp.exp(self.logit_scale)
        logits_per_text = jnp.matmul(text_embeds, image_embeds.T) * logit_scale
        logits_per_image = logits_per_text.T

        if not return_dict:
            return (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs)

        return FlaxCLIPOutput(
            logits_per_image=logits_per_image,
            logits_per_text=logits_per_text,
            text_embeds=text_embeds,
            image_embeds=image_embeds,
            text_model_output=text_outputs,
            vision_model_output=vision_outputs,
        )
# Attach CLIP_START_DOCSTRING to FlaxCLIPModel through the `add_start_docstrings` decorator.
@add_start_docstrings(CLIP_START_DOCSTRING)
# FlaxCLIPModel uses FlaxCLIPModule as its module class.
class FlaxCLIPModel(FlaxCLIPPreTrainedModel):
    module_class = FlaxCLIPModule

# Docstring fragment (return placeholder plus usage example) for FlaxCLIPModel.__call__.
# NOTE(review): the literal contains two non-English placeholder lines under "Returns:" and
# "Example:"; they are part of the runtime string and are left untouched here — confirm
# whether they should be removed.
FLAX_CLIP_MODEL_DOCSTRING = """
    Returns:
        描述函数返回的内容。

    Example:
        给出一个使用示例，展示模型如何使用。

    ```
    >>> import jax
    >>> from PIL import Image
    >>> import requests
    >>> from transformers import AutoProcessor, FlaxCLIPModel

    >>> model = FlaxCLIPModel.from_pretrained("openai/clip-vit-base-patch32")
    >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

    >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    >>> image = Image.open(requests.get(url, stream=True).raw)

    >>> inputs = processor(
    ...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="np", padding=True
    ... )

    >>> outputs = model(**inputs)
    >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
    >>> probs = jax.nn.softmax(logits_per_image, axis=1)  # we can take the softmax to get the label probabilities
    ```
"""

# Set the `__call__` docstring of FlaxCLIPModel to CLIP_INPUTS_DOCSTRING followed by the fragment above.
overwrite_call_docstring(FlaxCLIPModel, CLIP_INPUTS_DOCSTRING + FLAX_CLIP_MODEL_DOCSTRING)

# Register FlaxCLIPOutput / CLIPConfig for the return section of FlaxCLIPModel's docstring.
append_replace_return_docstrings(FlaxCLIPModel, output_type=FlaxCLIPOutput, config_class=CLIPConfig)

`.\models\clip\modeling_tf_clip.py`

# 设置编码为 UTF-8
# 版权声明和许可证信息
# 该代码遵循 Apache License, Version 2.0
# 可以在符合许可证条件的情况下使用和分发
"""
TF 2.0 CLIP 模型。
"""

# 导入必要的模块和库
from __future__ import annotations  # 使得类型注解支持延迟引用

import math  # 导入数学模块
from dataclasses import dataclass  # 导入 dataclass 用于定义数据类
from typing import Any, Optional, Tuple, Union  # 导入类型提示相关的模块

import numpy as np  # 导入 NumPy 库
import tensorflow as tf  # 导入 TensorFlow 库

# 导入所需的自定义模块和函数
from ...activations_tf import get_tf_activation  # 导入获取 TensorFlow 激活函数的函数
from ...modeling_tf_outputs import TFBaseModelOutput, TFBaseModelOutputWithPooling  # 导入 TF 模型输出相关类

# 公共 API
from ...modeling_tf_utils import (
    TFModelInputType,  # 导入 TF 模型输入类型
    TFPreTrainedModel,  # 导入 TF 预训练模型基类
    get_initializer,  # 导入获取初始化器函数
    keras,  # 导入 Keras 相关功能
    keras_serializable,  # 导入 Keras 序列化函数
    unpack_inputs,  # 导入解包输入函数
)
from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax  # 导入 TensorFlow 工具函数
from ...utils import (
    ModelOutput,  # 导入模型输出类
    add_start_docstrings,  # 导入添加文档字符串函数
    add_start_docstrings_to_model_forward,  # 导入添加前向模型文档字符串函数
    logging,  # 导入日志记录函数
    replace_return_docstrings,  # 导入替换返回文档字符串函数
)
from .configuration_clip import CLIPConfig, CLIPTextConfig, CLIPVisionConfig  # 导入 CLIP 模型配置类

logger = logging.get_logger(__name__)  # Module-level logger.

_CHECKPOINT_FOR_DOC = "openai/clip-vit-base-patch32"  # Checkpoint name used in documentation.

TF_CLIP_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "openai/clip-vit-base-patch32",  # Listed pretrained CLIP checkpoint.
    # See all CLIP models at https://huggingface.co/models?filter=clip
]

LARGE_NEGATIVE = -1e8  # Large negative constant; `_expand_mask` multiplies the inverted mask by it.

# Copied from transformers.models.bart.modeling_tf_bart._expand_mask
def _expand_mask(mask: tf.Tensor, tgt_len: Optional[int] = None):
    """
    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.

    Positions where the mask is 1 become 0; positions where it is 0 become `LARGE_NEGATIVE`.
    """
    source_length = shape_list(mask)[1]
    if tgt_len is None:
        # No explicit target length: use the source length.
        tgt_len = source_length

    one_cst = tf.constant(1.0)
    float_mask = tf.cast(mask, dtype=one_cst.dtype)

    # Insert two singleton axes, then repeat along the target-length axis.
    tiled_mask = tf.tile(float_mask[:, None, None, :], (1, 1, tgt_len, 1))

    inverted_mask = one_cst - tiled_mask
    return inverted_mask * LARGE_NEGATIVE


# Contrastive loss function, adapted from
# https://sachinruk.github.io/blog/pytorch/pytorch%20lightning/loss%20function/gpu/2021/03/07/CLIP.html
def contrastive_loss(logits: tf.Tensor) -> tf.Tensor:
    """
    Return the mean sparse categorical cross-entropy of `logits`, where the target for row `i` is class `i`.
    """
    row_count = shape_list(logits)[0]
    targets = tf.range(row_count)
    per_row_loss = keras.metrics.sparse_categorical_crossentropy(
        y_true=targets, y_pred=logits, from_logits=True
    )
    return tf.math.reduce_mean(per_row_loss)


def clip_loss(similarity: tf.Tensor) -> tf.Tensor:
    """
    Return the average of the contrastive loss of `similarity` and of its transpose.
    """
    # Loss over the matrix as given, then over its transpose.
    caption_loss = contrastive_loss(similarity)
    transposed_similarity = tf.transpose(similarity)
    image_loss = contrastive_loss(transposed_similarity)

    total_loss = caption_loss + image_loss
    return total_loss / 2.0


@dataclass
class TFCLIPOutput(ModelOutput):
    """
    Output of the TF CLIP model.

    Args:
        loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
            Contrastive loss for image-text similarity.
        logits_per_image:(`tf.Tensor` of shape `(image_batch_size, text_batch_size)`):
            The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
            similarity scores.
        logits_per_text:(`tf.Tensor` of shape `(text_batch_size, image_batch_size)`):
            The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
            similarity scores.
        text_embeds(`tf.Tensor` of shape `(batch_size, output_dim)`):
            The text embeddings obtained by applying the projection layer to the pooled output of [`TFCLIPTextModel`].
        image_embeds(`tf.Tensor` of shape `(batch_size, output_dim)`):
            The image embeddings obtained by applying the projection layer to the pooled output of
            [`TFCLIPVisionModel`].
        text_model_output([`~modeling_tf_utils.TFBaseModelOutputWithPooling`]):
            The output of the [`TFCLIPTextModel`].
        vision_model_output([`~modeling_tf_utils.TFBaseModelOutputWithPooling`]):
            The output of the [`TFCLIPVisionModel`].
    """

    # All fields default to None.
    loss: tf.Tensor | None = None
    logits_per_image: tf.Tensor = None
    logits_per_text: tf.Tensor = None
    text_embeds: tf.Tensor = None
    image_embeds: tf.Tensor = None
    text_model_output: TFBaseModelOutputWithPooling = None
    vision_model_output: TFBaseModelOutputWithPooling = None

    def to_tuple(self) -> Tuple[Any]:
        """Return the fields as a tuple, converting the two nested model outputs with their own `to_tuple`."""
        return tuple(
            # Nested outputs are flattened via their own `to_tuple`; every other key is taken as-is.
            self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
            for k in self.keys()
        )
class TFCLIPVisionEmbeddings(keras.layers.Layer):
    """Keras layer producing CLIP vision embeddings: patch embeddings, a class embedding and position embeddings."""

    def __init__(self, config: CLIPVisionConfig, **kwargs):
        super().__init__(**kwargs)

        self.embed_dim = config.hidden_size
        self.image_size = config.image_size
        self.patch_size = config.patch_size

        # Number of patches per image, plus one position for the class embedding.
        self.num_patches = (self.image_size // self.patch_size) ** 2
        self.num_positions = self.num_patches + 1

        self.config = config

        # Patch embedding: a bias-free 2D convolution whose kernel size and stride both equal the patch size.
        self.patch_embedding = keras.layers.Conv2D(
            filters=self.embed_dim,
            kernel_size=self.patch_size,
            strides=self.patch_size,
            padding="valid",
            data_format="channels_last",
            use_bias=False,
            kernel_initializer=get_initializer(self.config.initializer_range * self.config.initializer_factor),
            name="patch_embedding",
        )

    def build(self, input_shape: tf.TensorShape = None):
        # Guard first, so a repeated `build()` call cannot re-create the weights below.
        if self.built:
            return

        factor = self.config.initializer_factor

        # Class embedding vector.
        self.class_embedding = self.add_weight(
            shape=(self.embed_dim,),
            initializer=get_initializer(self.embed_dim**-0.5 * factor),
            trainable=True,
            name="class_embedding",
        )

        # Position embedding table, created under the "position_embedding" name scope.
        with tf.name_scope("position_embedding"):
            self.position_embedding = self.add_weight(
                shape=(self.num_positions, self.embed_dim),
                initializer=get_initializer(self.config.initializer_range * factor),
                trainable=True,
                name="embeddings",
            )

        self.built = True

        # Build the convolution under its own name scope.
        if getattr(self, "patch_embedding", None) is not None:
            with tf.name_scope(self.patch_embedding.name):
                self.patch_embedding.build([None, None, None, self.config.num_channels])

    def call(self, pixel_values: tf.Tensor) -> tf.Tensor:
        """`pixel_values` is expected to be of NCHW format."""

        batch_size, num_channels, height, width = shape_list(pixel_values)

        # When running on CPU, `tf.nn.conv2d` doesn't support `NCHW` format.
        # So change the input format from `NCHW` to `NHWC`.
        pixel_values = tf.transpose(pixel_values, perm=(0, 2, 3, 1))

        patch_embeds = self.patch_embedding(pixel_values)

        # Flatten the two spatial dimensions into a single sequence dimension of length `num_patches`.
        patch_embeds = tf.reshape(tensor=patch_embeds, shape=(batch_size, self.num_patches, -1))

        # Prepend the class embedding to the patch embeddings along the sequence axis.
        class_embeds = tf.broadcast_to(self.class_embedding, shape=(batch_size, 1, self.embed_dim))
        embeddings = tf.concat((class_embeds, patch_embeds), axis=1)

        # Add the position embeddings.
        embeddings = embeddings + self.position_embedding

        return embeddings
    def __init__(self, config: CLIPTextConfig, **kwargs):
        super().__init__(**kwargs)

        self.embed_dim = config.hidden_size  # 从配置中获取嵌入维度大小

        self.config = config  # 将配置对象存储在实例中以供后续使用

    def build(self, input_shape: tf.TensorShape = None):
        with tf.name_scope("token_embedding"):
            # 创建并添加嵌入权重变量，形状为 (词汇大小, 嵌入维度)
            self.weight = self.add_weight(
                shape=(self.config.vocab_size, self.embed_dim),
                initializer=get_initializer(self.config.initializer_factor * self.config.initializer_range),
                trainable=True,
                name="weight",
            )

        with tf.name_scope("position_embedding"):
            # 创建并添加位置嵌入权重变量，形状为 (最大位置嵌入数, 嵌入维度)
            self.position_embedding = self.add_weight(
                shape=(self.config.max_position_embeddings, self.embed_dim),
                initializer=get_initializer(self.config.initializer_factor * self.config.initializer_range),
                trainable=True,
                name="embeddings",
            )

        super().build(input_shape)

    def call(
        self,
        input_ids: tf.Tensor = None,
        position_ids: tf.Tensor = None,
        inputs_embeds: tf.Tensor = None,
    ) -> tf.Tensor:
        """
        Applies embedding based on inputs tensor.

        Returns:
            final_embeddings (`tf.Tensor`): output embedding tensor.
        """
        if input_ids is None and inputs_embeds is None:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if inputs_embeds is None:
            # 如果未提供预先计算好的嵌入向量，则根据输入的 ids 获取对应的嵌入向量
            check_embeddings_within_bounds(input_ids, self.config.vocab_size)
            inputs_embeds = tf.gather(params=self.weight, indices=input_ids)

        input_shape = shape_list(inputs_embeds)[:-1]

        if position_ids is None:
            # 如果未提供位置 ids，则生成默认位置 ids
            position_ids = tf.expand_dims(tf.range(start=0, limit=input_shape[-1]), axis=0)

        # 根据位置 ids 获取位置嵌入向量，并在批次维度上进行复制以匹配输入嵌入的形状
        position_embeds = tf.gather(params=self.position_embedding, indices=position_ids)
        position_embeds = tf.tile(input=position_embeds, multiples=(input_shape[0], 1, 1))
        
        # 最终的嵌入向量是输入嵌入向量和位置嵌入向量的和
        final_embeddings = inputs_embeds + position_embeds

        return final_embeddings
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config: CLIPConfig, **kwargs):
        """
        Create the query/key/value/output projections and the attention dropout.

        Args:
            config (`CLIPConfig`):
                Supplies `hidden_size`, `num_attention_heads`, `num_hidden_layers`, `initializer_factor` and
                `attention_dropout`.
            **kwargs: Forwarded to the parent layer constructor.

        Raises:
            ValueError: If `hidden_size` is not an exact multiple of `num_attention_heads`.
        """
        super().__init__(**kwargs)

        self.embed_dim = config.hidden_size
        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = self.embed_dim // self.num_attention_heads
        # The floor division above must not have dropped a remainder.
        if self.embed_dim != self.attention_head_size * self.num_attention_heads:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_attention_heads})."
            )

        # Standard deviations handed to `get_initializer` for the projection kernels.
        scale = config.initializer_factor
        qkv_std = (self.embed_dim**-0.5) * ((2 * config.num_hidden_layers) ** -0.5) * scale
        output_std = (self.embed_dim**-0.5) * scale

        # Divisor applied to the raw attention scores in `call`.
        self.sqrt_att_head_size = math.sqrt(self.attention_head_size)

        def make_projection(layer_name, std):
            # Every projection maps to `embed_dim` units; only the name and initializer std differ.
            return keras.layers.Dense(units=self.embed_dim, kernel_initializer=get_initializer(std), name=layer_name)

        self.q_proj = make_projection("q_proj", qkv_std)
        self.k_proj = make_projection("k_proj", qkv_std)
        self.v_proj = make_projection("v_proj", qkv_std)

        self.dropout = keras.layers.Dropout(rate=config.attention_dropout)

        self.out_proj = make_projection("out_proj", output_std)

    # Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention.transpose_for_scores
    def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor:
        """Split the last axis into attention heads and move the head axis ahead of the sequence axis."""
        # [batch_size, seq_length, all_head_size]
        #   -> [batch_size, seq_length, num_attention_heads, attention_head_size]
        split_shape = (batch_size, -1, self.num_attention_heads, self.attention_head_size)
        per_head = tf.reshape(tensor=tensor, shape=split_shape)

        #   -> [batch_size, num_attention_heads, seq_length, attention_head_size]
        return tf.transpose(per_head, perm=[0, 2, 1, 3])

    def call(
        self,
        hidden_states: tf.Tensor,
        attention_mask: tf.Tensor,
        causal_attention_mask: tf.Tensor,
        output_attentions: bool,
        training: bool = False,
    ) -> Tuple[tf.Tensor]:
        """
        Scaled dot-product self-attention. Input shape: Batch x Time x Channel

        Args:
            hidden_states (`tf.Tensor`): Input that is projected to queries, keys and values.
            attention_mask (`tf.Tensor`): Added to the scaled scores when not `None`.
            causal_attention_mask (`tf.Tensor`): Added to the scaled scores (before `attention_mask`) when not
                `None`.
            output_attentions (`bool`): Whether to also return the attention probabilities.
            training (`bool`): Forwarded to the dropout and output projection layers.

        Returns:
            `(attention_output,)`, or `(attention_output, attention_probs)` when `output_attentions` is set. The
            probabilities are the ones computed *before* dropout.
        """
        batch_size = shape_list(hidden_states)[0]

        # Project the input, then split each projection into heads.
        queries = self.transpose_for_scores(self.q_proj(inputs=hidden_states), batch_size)
        keys = self.transpose_for_scores(self.k_proj(inputs=hidden_states), batch_size)
        values = self.transpose_for_scores(self.v_proj(inputs=hidden_states), batch_size)

        # Raw query/key dot products: (batch size, num_heads, seq_len_q, seq_len_k)
        scores = tf.matmul(queries, keys, transpose_b=True)

        # Scale by sqrt(attention_head_size), cast to the dtype of the scores.
        scale = tf.cast(self.sqrt_att_head_size, dtype=scores.dtype)
        scores = tf.divide(scores, scale)

        # Additive masks: the causal mask first, then the regular attention mask.
        for additive_mask in (causal_attention_mask, attention_mask):
            if additive_mask is not None:
                scores = tf.add(scores, additive_mask)

        # Normalize the scores to probabilities, then apply dropout to them.
        probs_before_dropout = stable_softmax(logits=scores, axis=-1)
        probs = self.dropout(inputs=probs_before_dropout, training=training)

        # Weighted sum of the values, merged back to (batch_size, seq_len, embed_dim).
        context = tf.matmul(probs, values)
        context = tf.transpose(context, perm=[0, 2, 1, 3])
        context = tf.reshape(tensor=context, shape=(batch_size, -1, self.embed_dim))

        context = self.out_proj(context, training=training)

        # In TFBert, attention weights are returned after dropout.
        # However, in CLIP, they are returned before dropout.
        return (context, probs_before_dropout) if output_attentions else (context,)
    # Build the four projection layers once; later calls return immediately.
    def build(self, input_shape=None):
        """Build `q_proj`, `k_proj`, `v_proj` and `out_proj`, each under its own name scope."""
        if self.built:
            return
        self.built = True

        # All four projections are built for inputs of shape [None, None, self.embed_dim].
        for projection_name in ("q_proj", "k_proj", "v_proj", "out_proj"):
            projection = getattr(self, projection_name, None)
            if projection is None:
                continue
            with tf.name_scope(projection.name):
                projection.build([None, None, self.embed_dim])
# Feed-forward block: dense -> activation -> dense.
class TFCLIPMLP(keras.layers.Layer):
    """
    Two dense layers with the activation named by `config.hidden_act` in between.

    Args:
        config: CLIPConfig
    """

    def __init__(self, config: CLIPConfig, **kwargs):
        super().__init__(**kwargs)

        # Activation resolved from the name stored in the config.
        self.activation_fn = get_tf_activation(config.hidden_act)

        # Standard deviations handed to `get_initializer` for the two kernels.
        scale = config.initializer_factor
        in_proj_std = (config.hidden_size**-0.5) * ((2 * config.num_hidden_layers) ** -0.5) * scale
        fc_std = (2 * config.hidden_size) ** -0.5 * scale

        # hidden_size -> intermediate_size
        self.fc1 = keras.layers.Dense(
            units=config.intermediate_size,
            kernel_initializer=get_initializer(fc_std),
            name="fc1",
        )
        # intermediate_size -> hidden_size
        self.fc2 = keras.layers.Dense(
            units=config.hidden_size,
            kernel_initializer=get_initializer(in_proj_std),
            name="fc2",
        )
        # Kept so `build` can read the layer widths.
        self.config = config

    def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
        """Apply `fc1`, the activation and `fc2` in sequence and return the result."""
        expanded = self.fc1(inputs=hidden_states)
        activated = self.activation_fn(expanded)
        return self.fc2(inputs=activated)

    def build(self, input_shape=None):
        """Build `fc1` and `fc2` once, each under its own name scope."""
        if self.built:
            return
        self.built = True

        # Each dense layer is built for the width of the tensor it receives in `call`.
        for dense_name, width_attr in (("fc1", "hidden_size"), ("fc2", "intermediate_size")):
            dense = getattr(self, dense_name, None)
            if dense is None:
                continue
            with tf.name_scope(dense.name):
                dense.build([None, None, getattr(self.config, width_attr)])


# One encoder layer: layer norm + self-attention, then layer norm + MLP, each wrapped in a residual connection.
class TFCLIPEncoderLayer(keras.layers.Layer):
    """
    Encoder layer that normalizes before each sub-block and adds the sub-block output back to its input.

    Args:
        config: CLIPConfig
    """

    def __init__(self, config: CLIPConfig, **kwargs):
        super().__init__(**kwargs)

        self.embed_dim = config.hidden_size
        self.self_attn = TFCLIPAttention(config, name="self_attn")
        self.layer_norm1 = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm1")
        self.mlp = TFCLIPMLP(config, name="mlp")
        self.layer_norm2 = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm2")

    def call(
        self,
        hidden_states: tf.Tensor,
        attention_mask: tf.Tensor,
        causal_attention_mask: tf.Tensor,
        output_attentions: bool,
        training: bool = False,
    ) -> Tuple[tf.Tensor]:
        """
        Args:
            hidden_states (`tf.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`tf.Tensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            causal_attention_mask (`tf.Tensor`): causal attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            output_attentions (`bool`):
                Whether or not to return the attentions tensors of all attention layers. See `outputs` under returned
                tensors for more detail.
            training (`bool`): Forwarded to the self-attention block.

        Returns:
            A tuple whose first element is the new hidden states, followed by whatever the attention block returned
            after its first element.
        """
        # Attention sub-block: normalize, attend, add the result back onto the unnormalized input.
        attn_outputs = self.self_attn(
            hidden_states=self.layer_norm1(inputs=hidden_states),
            attention_mask=attention_mask,
            causal_attention_mask=causal_attention_mask,
            output_attentions=output_attentions,
            training=training,
        )
        hidden_states = hidden_states + attn_outputs[0]

        # MLP sub-block: same normalize / transform / add-back pattern.
        mlp_output = self.mlp(hidden_states=self.layer_norm2(inputs=hidden_states))
        hidden_states = hidden_states + mlp_output

        return (hidden_states,) + attn_outputs[1:]

    def build(self, input_shape=None):
        """Build the attention block, the MLP and both layer norms once, each under its own name scope."""
        if self.built:
            return
        self.built = True

        for child_name in ("self_attn", "layer_norm1", "mlp", "layer_norm2"):
            child = getattr(self, child_name, None)
            if child is None:
                continue
            with tf.name_scope(child.name):
                # The layer norms are built for [None, None, embed_dim]; the CLIP sub-layers are built with None.
                child.build([None, None, self.embed_dim] if child_name.startswith("layer_norm") else None)
class TFCLIPEncoder(keras.layers.Layer):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`TFCLIPEncoderLayer`].

    Args:
        config: CLIPConfig
    """

    def __init__(self, config: CLIPConfig, **kwargs):
        super().__init__(**kwargs)

        # One encoder layer per configured hidden layer, named by index.
        self.layers = [TFCLIPEncoderLayer(config, name=f"layers_._{i}") for i in range(config.num_hidden_layers)]

    def call(
        self,
        hidden_states: tf.Tensor,
        attention_mask: tf.Tensor,
        causal_attention_mask: tf.Tensor,
        output_attentions: bool,
        output_hidden_states: bool,
        return_dict: bool,
        training: bool = False,
    ) -> Union[TFBaseModelOutput, Tuple[tf.Tensor]]:
        """
        Run `hidden_states` through every encoder layer in order.

        Args:
            hidden_states (`tf.Tensor`): Input to the first layer.
            attention_mask (`tf.Tensor`): Passed unchanged to every layer.
            causal_attention_mask (`tf.Tensor`): Passed unchanged to every layer.
            output_attentions (`bool`): Collect the attention tensor returned by each layer.
            output_hidden_states (`bool`): Collect the input of every layer plus the final output.
            return_dict (`bool`): Return a [`TFBaseModelOutput`] instead of a plain tuple.
            training (`bool`): Passed unchanged to every layer.

        Returns:
            A [`TFBaseModelOutput`], or a tuple of the non-`None` values among the last hidden state, the collected
            hidden states and the collected attentions.
        """
        all_hidden_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        for layer_module in self.layers:
            if output_hidden_states:
                # Record the state that enters this layer.
                all_hidden_states += (hidden_states,)

            layer_outputs = layer_module(
                hidden_states=hidden_states,
                attention_mask=attention_mask,
                causal_attention_mask=causal_attention_mask,
                output_attentions=output_attentions,
                training=training,
            )
            hidden_states = layer_outputs[0]

            if output_attentions:
                all_attentions += (layer_outputs[1],)

        if output_hidden_states:
            # The output of the last layer closes the list.
            all_hidden_states += (hidden_states,)

        if return_dict:
            return TFBaseModelOutput(
                last_hidden_state=hidden_states,
                hidden_states=all_hidden_states,
                attentions=all_attentions,
            )

        candidates = (hidden_states, all_hidden_states, all_attentions)
        return tuple(item for item in candidates if item is not None)

    def build(self, input_shape=None):
        """Build every encoder layer once, each under its own name scope."""
        if self.built:
            return
        self.built = True

        encoder_layers = getattr(self, "layers", None)
        if encoder_layers is None:
            return
        for encoder_layer in encoder_layers:
            with tf.name_scope(encoder_layer.name):
                encoder_layer.build(None)


class TFCLIPTextTransformer(keras.layers.Layer):
    """
    Text transformer: text embeddings, a [`TFCLIPEncoder`] run with a causal mask, and a final layer norm.

    Args:
        config: CLIPTextConfig
    """

    def __init__(self, config: CLIPTextConfig, **kwargs):
        super().__init__(**kwargs)

        self.embeddings = TFCLIPTextEmbeddings(config, name="embeddings")
        self.encoder = TFCLIPEncoder(config, name="encoder")
        self.final_layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="final_layer_norm")

        # For `pooled_output` computation
        self.eos_token_id = config.eos_token_id
        self.embed_dim = config.hidden_size

    def call(
        self,
        input_ids: TFModelInputType,
        attention_mask: tf.Tensor,
        position_ids: tf.Tensor,
        output_attentions: bool,
        output_hidden_states: bool,
        return_dict: bool,
        training: bool = False,
    ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]:
        """
        Embed `input_ids`, encode them with a causal mask and pool the hidden state at the end-of-text position.

        Args:
            input_ids (`TFModelInputType`): Token ids; unpacked below as `(batch_size, seq_length)`.
            attention_mask (`tf.Tensor`): Mask that is expanded with `_expand_mask` before reaching the encoder.
            position_ids (`tf.Tensor`): Forwarded to the embedding layer.
            output_attentions (`bool`): Forwarded to the encoder.
            output_hidden_states (`bool`): Forwarded to the encoder.
            return_dict (`bool`): Return a [`TFBaseModelOutputWithPooling`] instead of a plain tuple.
            training (`bool`): Forwarded to the encoder.

        Returns:
            A [`TFBaseModelOutputWithPooling`], or the tuple `(sequence_output, pooled_output)` followed by the
            remaining encoder outputs when `return_dict` is false.
        """
        input_shape = shape_list(input_ids)

        embedding_output = self.embeddings(input_ids=input_ids, position_ids=position_ids)

        batch_size, seq_length = input_shape
        # CLIP's text model uses causal mask, prepare it here.
        # https://github.com/openai/CLIP/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/clip/model.py#L324
        causal_attention_mask = self._build_causal_attention_mask(batch_size, seq_length, dtype=embedding_output.dtype)

        # check attention mask and invert
        # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
        attention_mask = _expand_mask(attention_mask)

        encoder_outputs = self.encoder(
            hidden_states=embedding_output,
            attention_mask=attention_mask,
            causal_attention_mask=causal_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        sequence_output = encoder_outputs[0]
        sequence_output = self.final_layer_norm(inputs=sequence_output)

        if self.eos_token_id == 2:
            # The `eos_token_id` was incorrect before PR #24773: keep the previous behavior for such configs.
            # A CLIP model with such `eos_token_id` in the config can't work correctly with extra new tokens added.
            # ------------------------------------------------------------
            # text_embeds.shape = [batch_size, n_ctx, transformer.width]
            # take features from the eot embedding (eot_token is the highest number in each sequence)
            pooled_output = tf.gather_nd(
                params=sequence_output,
                indices=tf.stack(
                    values=(tf.range(input_shape[0], dtype=tf.int64), tf.math.argmax(input_ids, axis=-1)), axis=1
                ),
            )
        else:
            # The config gets updated `eos_token_id` from PR #24773 (so the use of extra new tokens is possible).
            # `argmax` over the 0/1 match mask selects a position where `input_ids == eos_token_id` in each row.
            pooled_output = tf.gather_nd(
                params=sequence_output,
                indices=tf.stack(
                    values=(
                        tf.range(input_shape[0], dtype=tf.int64),
                        tf.math.argmax(tf.cast(input_ids == self.eos_token_id, dtype=tf.int8), axis=-1),
                    ),
                    axis=1,
                ),
            )

        if not return_dict:
            return (sequence_output, pooled_output) + encoder_outputs[1:]

        return TFBaseModelOutputWithPooling(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )

    def _build_causal_attention_mask(self, batch_size, seq_length, dtype=tf.float32):
        """
        Build an additive causal mask of shape `(batch_size, 1, seq_length, seq_length)`.

        Entries on and below the diagonal are 0; entries above the diagonal are -10000.0.
        """
        # It is possible with an unspecified sequence length for seq_length to be
        # a runtime value, which is unsupported by tf.constant. Per the TensorFlow
        # docs, tf.fill can handle runtime dynamic shapes:
        # https://www.tensorflow.org/api_docs/python/tf/fill
        diag = tf.cast(tf.fill((seq_length,), 0.0), dtype)

        # Start from an additive 2D attention mask with all places being masked.
        to_mask = tf.cast(tf.fill((seq_length, seq_length), -10000.0), dtype)

        # Set the diagonal and lower triangular parts to 0 (i.e. the places not to be masked).
        # `band_part(x, 0, -1)` keeps the diagonal and everything above it and zeroes the rest; `set_diag` then
        # zeroes the diagonal as well. TIP: think of the 2D matrix as the space of (query_seq, key_seq).
        to_mask = tf.linalg.band_part(to_mask, 0, -1)
        to_mask = tf.linalg.set_diag(to_mask, diagonal=diag)

        return tf.broadcast_to(input=to_mask, shape=(batch_size, 1, seq_length, seq_length))

    def build(self, input_shape=None):
        """Build the embeddings, the encoder and the final layer norm once, each under its own name scope."""
        if self.built:
            return
        self.built = True

        if getattr(self, "embeddings", None) is not None:
            with tf.name_scope(self.embeddings.name):
                self.embeddings.build(None)

        if getattr(self, "encoder", None) is not None:
            with tf.name_scope(self.encoder.name):
                self.encoder.build(None)

        if getattr(self, "final_layer_norm", None) is not None:
            with tf.name_scope(self.final_layer_norm.name):
                # The final layer norm is built for inputs of shape [None, None, self.embed_dim].
                self.final_layer_norm.build([None, None, self.embed_dim])
@keras_serializable
class TFCLIPTextMainLayer(keras.layers.Layer):
    """Thin wrapper that owns a [`TFCLIPTextTransformer`] under the name `text_model`."""

    # Configuration class associated with this layer.
    config_class = CLIPTextConfig

    def __init__(self, config: CLIPTextConfig, **kwargs):
        super().__init__(**kwargs)
        self.config = config
        self.text_model = TFCLIPTextTransformer(config, name="text_model")

    def get_input_embeddings(self) -> keras.layers.Layer:
        """Return the embedding layer of the wrapped text model."""
        return self.text_model.embeddings

    def set_input_embeddings(self, value: tf.Variable):
        """Replace the embedding weight and set `vocab_size` to the first dimension of `value`."""
        embedding_layer = self.text_model.embeddings
        embedding_layer.weight = value
        embedding_layer.vocab_size = shape_list(value)[0]

    @unpack_inputs
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        position_ids: np.ndarray | tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: bool = False,
    ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]:
        """
        Validate the inputs, default the attention mask to all ones and delegate to the text model.

        Raises:
            ValueError: If `input_ids` is `None`.
        """
        if input_ids is None:
            raise ValueError("You have to specify input_ids")

        ids_shape = shape_list(input_ids)

        if attention_mask is None:
            # No mask given: use a tensor of ones with the same shape as `input_ids`.
            attention_mask = tf.fill(dims=ids_shape, value=1)

        return self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

    def build(self, input_shape=None):
        """Build the wrapped text model once, under its own name scope."""
        if self.built:
            return
        self.built = True

        text_model = getattr(self, "text_model", None)
        if text_model is None:
            return
        with tf.name_scope(text_model.name):
            text_model.build(None)


class TFCLIPVisionTransformer(keras.layers.Layer):
    """
    Vision transformer: vision embeddings, a pre layer norm, a [`TFCLIPEncoder`] and a post layer norm that is
    applied to the pooled output only.

    Args:
        config: CLIPVisionConfig
    """

    def __init__(self, config: CLIPVisionConfig, **kwargs):
        super().__init__(**kwargs)

        self.embeddings = TFCLIPVisionEmbeddings(config, name="embeddings")
        # NOTE: the layer name "pre_layrnorm" (sic) is a runtime string; it is kept exactly as it was.
        self.pre_layernorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="pre_layrnorm")
        self.encoder = TFCLIPEncoder(config, name="encoder")
        self.post_layernorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="post_layernorm")
        self.embed_dim = config.hidden_size

    def call(
        self,
        pixel_values: TFModelInputType,
        output_attentions: bool,
        output_hidden_states: bool,
        return_dict: bool,
        training: bool = False,
    ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]:
        """
        Embed `pixel_values`, normalize, encode, and pool the first position of the encoder output.

        Args:
            pixel_values (`TFModelInputType`): Input forwarded to the embedding layer.
            output_attentions (`bool`): Forwarded to the encoder.
            output_hidden_states (`bool`): Forwarded to the encoder.
            return_dict (`bool`): Return a [`TFBaseModelOutputWithPooling`] instead of a plain tuple.
            training (`bool`): Forwarded to the encoder.

        Returns:
            A [`TFBaseModelOutputWithPooling`], or the tuple `(sequence_output, pooled_output)` followed by the
            remaining encoder outputs when `return_dict` is false.
        """
        embedding_output = self.embeddings(pixel_values=pixel_values)
        embedding_output = self.pre_layernorm(inputs=embedding_output)

        # No attention mask and no causal mask are passed to the encoder here.
        encoder_outputs = self.encoder(
            hidden_states=embedding_output,
            attention_mask=None,
            causal_attention_mask=None,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        sequence_output = encoder_outputs[0]
        # Pool by taking index 0 along axis 1, then normalize only that pooled vector.
        pooled_output = sequence_output[:, 0, :]
        pooled_output = self.post_layernorm(inputs=pooled_output)

        if not return_dict:
            return (sequence_output, pooled_output) + encoder_outputs[1:]

        return TFBaseModelOutputWithPooling(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )

    def build(self, input_shape=None):
        """Build the embeddings, both layer norms and the encoder once, each under its own name scope."""
        if self.built:
            return
        self.built = True

        if getattr(self, "embeddings", None) is not None:
            with tf.name_scope(self.embeddings.name):
                self.embeddings.build(None)

        if getattr(self, "pre_layernorm", None) is not None:
            with tf.name_scope(self.pre_layernorm.name):
                self.pre_layernorm.build([None, None, self.embed_dim])

        if getattr(self, "encoder", None) is not None:
            with tf.name_scope(self.encoder.name):
                self.encoder.build(None)

        if getattr(self, "post_layernorm", None) is not None:
            with tf.name_scope(self.post_layernorm.name):
                # The post layer norm only ever sees the pooled output, which has no sequence axis.
                self.post_layernorm.build([None, self.embed_dim])
# Keras-serializable wrapper around the CLIP vision transformer.
@keras_serializable
class TFCLIPVisionMainLayer(keras.layers.Layer):
    """Thin wrapper that owns a [`TFCLIPVisionTransformer`] under the name `vision_model`."""

    # Configuration class associated with this layer.
    config_class = CLIPVisionConfig

    def __init__(self, config: CLIPVisionConfig, **kwargs):
        super().__init__(**kwargs)
        self.config = config
        self.vision_model = TFCLIPVisionTransformer(config, name="vision_model")

    def get_input_embeddings(self) -> keras.layers.Layer:
        """Return the embedding layer of the wrapped vision model."""
        return self.vision_model.embeddings

    @unpack_inputs
    def call(
        self,
        pixel_values: TFModelInputType | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: bool = False,
    ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]:
        """
        Check that `pixel_values` was given and delegate to the vision model.

        Raises:
            ValueError: If `pixel_values` is `None`.
        """
        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        return self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

    def build(self, input_shape=None):
        """Build the wrapped vision model once, under its own name scope."""
        if self.built:
            return
        self.built = True

        vision_model = getattr(self, "vision_model", None)
        if vision_model is None:
            return
        with tf.name_scope(vision_model.name):
            vision_model.build(None)


# Joint text + vision main layer; decorated with `keras_serializable` so that the layer is serializable.
@keras_serializable
class TFCLIPMainLayer(keras.layers.Layer):
    """
    Keras layer combining a `TFCLIPTextTransformer` and a `TFCLIPVisionTransformer`.

    The element at index 1 of each transformer's output (used here as the pooled output) is projected to
    `config.projection_dim` units by a bias-free dense layer. `call` normalizes both projections and returns
    their scaled pairwise similarities.
    """

    config_class = CLIPConfig

    def __init__(self, config: CLIPConfig, **kwargs):
        """
        Validate the sub-configurations and create the two transformers and the two projection layers.

        Args:
            config: Joint configuration; `config.text_config` must be a `CLIPTextConfig` and
                `config.vision_config` must be a `CLIPVisionConfig`.
            **kwargs: Forwarded unchanged to `keras.layers.Layer.__init__`.

        Raises:
            ValueError: If either sub-configuration has the wrong type.
        """
        super().__init__(**kwargs)

        if not isinstance(config.text_config, CLIPTextConfig):
            raise ValueError(
                "config.text_config is expected to be of type CLIPTextConfig but is of type"
                f" {type(config.text_config)}."
            )

        if not isinstance(config.vision_config, CLIPVisionConfig):
            raise ValueError(
                "config.vision_config is expected to be of type CLIPVisionConfig but is of type"
                f" {type(config.vision_config)}."
            )

        self.config = config

        text_config = config.text_config
        vision_config = config.vision_config

        self.projection_dim = config.projection_dim

        self.text_model = TFCLIPTextTransformer(text_config, name="text_model")
        self.vision_model = TFCLIPVisionTransformer(vision_config, name="vision_model")

        # Bias-free projections; the initializer argument is hidden_size**-0.5 scaled by `initializer_factor`.
        self.visual_projection = keras.layers.Dense(
            units=self.projection_dim,
            kernel_initializer=get_initializer(vision_config.hidden_size**-0.5 * self.config.initializer_factor),
            use_bias=False,
            name="visual_projection",
        )

        self.text_projection = keras.layers.Dense(
            units=self.projection_dim,
            kernel_initializer=get_initializer(text_config.hidden_size**-0.5 * self.config.initializer_factor),
            use_bias=False,
            name="text_projection",
        )

        # Input widths of the projection layers, needed later by `build`.
        self.text_embed_dim = text_config.hidden_size
        self.vision_embed_dim = vision_config.hidden_size

    def build(self, input_shape: tf.TensorShape = None):
        """Create the `logit_scale` weight and build every sub-layer inside its own name scope."""
        # Check the guard before adding any weight, so that a repeated `build` call cannot register a second
        # `logit_scale` variable.
        if self.built:
            return
        self.built = True

        # Trainable weight of shape (1,), initialized to `config.logit_scale_init_value`.
        self.logit_scale = self.add_weight(
            shape=(1,),
            initializer=keras.initializers.Constant(self.config.logit_scale_init_value),
            trainable=True,
            name="logit_scale",
        )

        if getattr(self, "text_model", None) is not None:
            with tf.name_scope(self.text_model.name):
                self.text_model.build(None)

        if getattr(self, "vision_model", None) is not None:
            with tf.name_scope(self.vision_model.name):
                self.vision_model.build(None)

        if getattr(self, "visual_projection", None) is not None:
            with tf.name_scope(self.visual_projection.name):
                self.visual_projection.build([None, None, self.vision_embed_dim])

        if getattr(self, "text_projection", None) is not None:
            with tf.name_scope(self.text_projection.name):
                self.text_projection.build([None, None, self.text_embed_dim])

    @unpack_inputs
    def get_text_features(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        position_ids: np.ndarray | tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: bool = False,
    ) -> tf.Tensor:
        """
        Run the text transformer and project its pooled output with `text_projection`.

        Returns:
            The projected text features.

        Raises:
            ValueError: If `input_ids` is `None`.
        """
        if input_ids is None:
            raise ValueError("You have to specify either input_ids")

        input_shape = shape_list(input_ids)

        # Without an explicit mask, attend to every position.
        if attention_mask is None:
            attention_mask = tf.fill(dims=input_shape, value=1)

        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        # Index 1 of the transformer output is used as the pooled output.
        pooled_output = text_outputs[1]
        text_features = self.text_projection(inputs=pooled_output)

        return text_features

    @unpack_inputs
    def get_image_features(
        self,
        pixel_values: TFModelInputType | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: bool = False,
    ) -> tf.Tensor:
        """
        Run the vision transformer and project its pooled output with `visual_projection`.

        Returns:
            The projected image features.

        Raises:
            ValueError: If `pixel_values` is `None`.
        """
        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        # Index 1 of the transformer output is used as the pooled output.
        pooled_output = vision_outputs[1]
        image_features = self.visual_projection(inputs=pooled_output)

        return image_features

    @unpack_inputs
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        pixel_values: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        position_ids: np.ndarray | tf.Tensor | None = None,
        return_loss: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: bool = False,
    ) -> Union[TFCLIPOutput, Tuple[tf.Tensor]]:
        """
        Encode text and images, and compute the scaled similarity logits between them.

        Returns:
            A `TFCLIPOutput` when `return_dict` is truthy. Otherwise a tuple
            `(logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs)`,
            preceded by the loss when `return_loss` is truthy.

        Raises:
            ValueError: If `input_ids` or `pixel_values` is `None`; both are required.
        """
        if input_ids is None:
            raise ValueError("You have to specify either input_ids")
        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        input_shape = shape_list(input_ids)

        # Without an explicit mask, attend to every position.
        if attention_mask is None:
            attention_mask = tf.fill(dims=input_shape, value=1)

        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        # Project the element at index 1 of each output (used as the pooled output).
        image_embeds = vision_outputs[1]
        image_embeds = self.visual_projection(inputs=image_embeds)

        text_embeds = text_outputs[1]
        text_embeds = self.text_projection(inputs=text_embeds)

        # Divide by the Euclidean norm along the last axis so the matmul below yields cosine similarities.
        image_embeds = image_embeds / tf.norm(tensor=image_embeds, ord="euclidean", axis=-1, keepdims=True)
        text_embeds = text_embeds / tf.norm(tensor=text_embeds, ord="euclidean", axis=-1, keepdims=True)

        # Cosine similarity scaled by exp(logit_scale); the per-image logits are the transpose.
        logit_scale = tf.math.exp(self.logit_scale)
        logits_per_text = tf.matmul(text_embeds, image_embeds, transpose_b=True) * logit_scale
        logits_per_image = tf.transpose(logits_per_text)

        loss = None
        if return_loss:
            loss = clip_loss(logits_per_text)
            # Reshape the loss to shape (1,).
            loss = tf.reshape(loss, (1,))

        if not return_dict:
            output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs)
            return (loss,) + output if loss is not None else output

        return TFCLIPOutput(
            loss=loss,
            logits_per_image=logits_per_image,
            logits_per_text=logits_per_text,
            text_embeds=text_embeds,
            image_embeds=image_embeds,
            text_model_output=text_outputs,
            vision_model_output=vision_outputs,
        )
# Docstring template describing the text inputs of the CLIP models. The `{0}` placeholders are filled in with
# the input shape through `str.format` where the template is used.
CLIP_TEXT_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]`, `Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and
            [`PreTrainedTokenizer.encode`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        position_ids (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
            config will be used instead.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
            used instead.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in
            eager mode, in graph mode the value will always be set to True.
        training (`bool`, *optional*, defaults to `False`):
            Whether or not to use the model in training mode (some modules like dropout modules have different
            behaviors between training and evaluation).
"""

# Notes on the arguments documented above:
# - input_ids: vocabulary indices of the input sequence tokens; may be an np.ndarray, tf.Tensor,
#   List[tf.Tensor], Dict[str, tf.Tensor] or Dict[str, np.ndarray]; each example must have the shape ({0}).
# - attention_mask: optional mask that avoids attending to padding token indices, with values in [0, 1]:
#   1 for tokens that are not masked, 0 for tokens that are masked.
# - position_ids: optional position index of each input token in the position embeddings, selected in the
#   range [0, config.max_position_embeddings - 1].
# - output_attentions: optional; whether to return the attention tensors of all attention layers. Only usable
#   in eager mode; in graph mode the value from the config is used instead.
# - output_hidden_states: optional; whether to return the hidden states of all layers. Only usable in eager
#   mode; in graph mode the value from the config is used instead.
# - return_dict: optional; whether to return a `~utils.ModelOutput` instead of a plain tuple. Usable in eager
#   mode; in graph mode it is always set to True.
# - training: optional, defaults to False; whether to run the model in training mode (some modules such as
#   dropout behave differently between training and evaluation).
# Raw-string docstring template describing the image inputs of the CLIP vision model.
CLIP_VISION_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]`, `Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
            [`CLIPImageProcessor.__call__`] for details.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
            config will be used instead.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
            used instead.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in
            eager mode, in graph mode the value will always be set to True.
        training (`bool`, *optional*, defaults to `False`):
            Whether or not to use the model in training mode (some modules like dropout modules have different
            behaviors between training and evaluation).
"""

# Raw-string docstring template describing the combined text and image inputs of the full CLIP model (as opposed
# to CLIP_VISION_INPUTS_DOCSTRING, which covers images only). It contains `{0}` placeholders for the text input
# shape, presumably filled in with `str.format` as is done for CLIP_TEXT_INPUTS_DOCSTRING.
CLIP_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]`, `Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and
            [`PreTrainedTokenizer.encode`] for details.

            [What are input IDs?](../glossary#input-ids)
        pixel_values (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]`, `Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
            [`CLIPImageProcessor.__call__`] for details.
        attention_mask (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        position_ids (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
            config will be used instead.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
            used instead.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in
            eager mode, in graph mode the value will always be set to True.
        training (`bool`, *optional*, defaults to `False`):
            Whether or not to use the model in training mode (some modules like dropout modules have different
            behaviors between training and evaluation).
"""


# Define TFCLIPTextModel class inheriting from TFCLIPPreTrainedModel.
class TFCLIPTextModel(TFCLIPPreTrainedModel):
    """Text-only CLIP model: a thin wrapper that delegates to a `TFCLIPTextMainLayer`."""

    # Configuration class associated with this model.
    config_class = CLIPTextConfig

    def __init__(self, config: CLIPTextConfig, *inputs, **kwargs):
        """
        Set up the model and its main text layer.

        Args:
            config (CLIPTextConfig): Configuration passed to both the base class and the main layer.
            *inputs: Extra positional arguments forwarded to the base class.
            **kwargs: Extra keyword arguments forwarded to the base class.
        """
        super().__init__(config, *inputs, **kwargs)

        # All of the computation lives in the main layer, stored under the name "clip".
        self.clip = TFCLIPTextMainLayer(config, name="clip")

    @unpack_inputs
    @add_start_docstrings_to_model_forward(CLIP_TEXT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @replace_return_docstrings(output_type=TFBaseModelOutputWithPooling, config_class=CLIPTextConfig)
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        position_ids: np.ndarray | tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: Optional[bool] = False,
    ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]:
        """
        Run the forward pass by delegating to the main text layer.

        Args:
            input_ids (TFModelInputType, optional): Token ids of the input text.
            attention_mask (np.ndarray or tf.Tensor, optional): Mask for padded tokens.
            position_ids (np.ndarray or tf.Tensor, optional): Position indices of the input tokens.
            output_attentions (bool, optional): Whether to output attentions.
            output_hidden_states (bool, optional): Whether to output hidden states.
            return_dict (bool, optional): Whether to return a structured output rather than a tuple.
            training (bool, optional): Whether the model runs in training mode.

        Returns:
            TFBaseModelOutputWithPooling or Tuple[tf.Tensor]: The output of the main layer, unchanged.

        Examples:
            Example usage of the model:
            ```
            >>> from transformers import AutoTokenizer, TFCLIPTextModel

            >>> model = TFCLIPTextModel.from_pretrained("openai/clip-vit-base-patch32")
            >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")

            >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="tf")

            >>> outputs = model(**inputs)
            >>> last_hidden_state = outputs.last_hidden_state
            >>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
            ```
        """
        # Every argument is passed straight through; this wrapper adds no processing of its own.
        return self.clip(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

    def build(self, input_shape=None):
        """
        Build the main layer once, inside a name scope matching its name.

        Args:
            input_shape: Accepted for interface compatibility; not used by this method.
        """
        if self.built:
            return
        self.built = True

        main_layer = getattr(self, "clip", None)
        if main_layer is not None:
            with tf.name_scope(main_layer.name):
                main_layer.build(None)


"""
Define TFCLIPVisionModel class inheriting from TFCLIPPreTrainedModel.
"""
class TFCLIPVisionModel(TFCLIPPreTrainedModel):
    """Vision-only CLIP model: a thin wrapper that delegates to a `TFCLIPVisionMainLayer`."""

    # Configuration class associated with this model.
    config_class = CLIPVisionConfig
    # Name of the main input of the vision model.
    main_input_name = "pixel_values"

    def __init__(self, config: CLIPVisionConfig, *inputs, **kwargs):
        """
        Initialize TFCLIPVisionModel.

        Args:
            config (CLIPVisionConfig): Configuration passed to both the base class and the main layer.
            *inputs: Extra positional arguments forwarded to the base class.
            **kwargs: Extra keyword arguments forwarded to the base class.
        """
        super().__init__(config, *inputs, **kwargs)

        # All of the computation lives in the main layer, stored under the name "clip".
        self.clip = TFCLIPVisionMainLayer(config, name="clip")

    @unpack_inputs
    @add_start_docstrings_to_model_forward(CLIP_VISION_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=TFBaseModelOutputWithPooling, config_class=CLIPVisionConfig)
    def call(
        self,
        pixel_values: TFModelInputType | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: Optional[bool] = False,
    ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]:
        """
        Run the forward pass by delegating to the main vision layer.

        Args:
            pixel_values (TFModelInputType, optional): Input tensor of pixel values.
            output_attentions (bool, optional): Whether to output attentions.
            output_hidden_states (bool, optional): Whether to output hidden states.
            return_dict (bool, optional): Whether to return a structured output rather than a tuple.
            training (bool, optional): Whether the model runs in training mode.

        Returns:
            TFBaseModelOutputWithPooling or Tuple[tf.Tensor]: The output of the main layer, unchanged.

        Examples:

        ```
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, TFCLIPVisionModel

        >>> model = TFCLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32")
        >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="tf")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled CLS states
        ```
        """
        # Every argument is passed straight through; this wrapper adds no processing of its own.
        outputs = self.clip(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        return outputs

    def build(self, input_shape=None):
        """
        Build the main layer once, inside a name scope matching its name.

        Args:
            input_shape: Accepted for interface compatibility; not used by this method.
        """
        if self.built:
            return
        self.built = True
        if getattr(self, "clip", None) is not None:
            with tf.name_scope(self.clip.name):
                self.clip.build(None)
# 使用装饰器为类添加文档字符串，使用CLIP_START_DOCSTRING作为模板
@add_start_docstrings(CLIP_START_DOCSTRING)
class TFCLIPModel(TFCLIPPreTrainedModel):
    # 设置配置类为CLIPConfig
    config_class = CLIPConfig

    # 初始化方法，接受配置对象config和任意额外输入
    def __init__(self, config: CLIPConfig, *inputs, **kwargs):
        """
        Initialize the model and create its `clip` main layer.

        Args:
            config (`CLIPConfig`): Configuration passed both to the parent constructor and to `TFCLIPMainLayer`.
            *inputs: Extra positional arguments forwarded unchanged to the parent constructor.
            **kwargs: Extra keyword arguments forwarded unchanged to the parent constructor.
        """
        # Run the parent class initialization first.
        super().__init__(config, *inputs, **kwargs)

        # Create the TFCLIPMainLayer instance (named "clip") and store it on self.clip.
        self.clip = TFCLIPMainLayer(config, name="clip")

    # 使用装饰器解包输入并添加文档字符串，使用CLIP_TEXT_INPUTS_DOCSTRING作为模板
    @unpack_inputs
    @add_start_docstrings_to_model_forward(CLIP_TEXT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    def get_text_features(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        position_ids: np.ndarray | tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: bool = False,
    ) -> tf.Tensor:
        r"""
        Returns:
            text_features (`tf.Tensor` of shape `(batch_size, output_dim)`): The text embeddings obtained by applying
            the projection layer to the pooled output of [`TFCLIPTextModel`].

        Examples:

        ```
        >>> from transformers import AutoTokenizer, TFCLIPModel

        >>> model = TFCLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="tf")
        >>> text_features = model.get_text_features(**inputs)
        ```"""
        # Pure delegation: hand the inputs to the main layer and return its result as-is.
        # NOTE(review): `training` is accepted in the signature but is not forwarded to
        # `self.clip.get_text_features` — confirm whether that is intentional.
        return self.clip.get_text_features(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

    # 使用装饰器解包输入并添加文档字符串，使用CLIP_VISION_INPUTS_DOCSTRING作为模板
    @unpack_inputs
    @add_start_docstrings_to_model_forward(CLIP_VISION_INPUTS_DOCSTRING)
    def get_image_features(
        self,
        pixel_values: TFModelInputType | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: bool = False,
    ) -> tf.Tensor:
        r"""
        Returns:
            image_features (`tf.Tensor` of shape `(batch_size, output_dim)`): The image embeddings obtained by applying
            the projection layer to the pooled output of [`TFCLIPVisionModel`].

        Examples:

        ```
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, TFCLIPModel

        >>> model = TFCLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="tf")

        >>> image_features = model.get_image_features(**inputs)
        ```"""
        # Pure delegation: hand the inputs to the main layer and return its result as-is.
        # NOTE(review): `training` is accepted in the signature but is not forwarded to
        # `self.clip.get_image_features` — confirm whether that is intentional.
        return self.clip.get_image_features(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

    @unpack_inputs
    @add_start_docstrings_to_model_forward(CLIP_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @replace_return_docstrings(output_type=TFCLIPOutput, config_class=CLIPConfig)
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        pixel_values: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        position_ids: np.ndarray | tf.Tensor | None = None,
        return_loss: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: bool = False,
    def serving_output(self, output: TFCLIPOutput) -> TFCLIPOutput:
        """
        Return the given `TFCLIPOutput` unchanged.

        Args:
            output (`TFCLIPOutput`): The output object handed to this method.

        Returns:
            `TFCLIPOutput`: The very same object that was passed in.
        """
        # TODO: As written, this runs into problems in saved_model=True mode because TensorFlow cannot trace nested
        # dataclass structures.
        # Reference: https://github.com/huggingface/transformers/pull/16886
        return output

    def build(self, input_shape=None):
        """
        Build the model once; later calls return immediately.

        Args:
            input_shape: Not read by this method. Defaults to `None`.
        """
        if self.built:
            return
        # The flag is set before the sub-layer is built.
        self.built = True

        clip_layer = getattr(self, "clip", None)
        if clip_layer is None:
            # No `clip` attribute (or it is None), so there is nothing to build.
            return
        # Build the `clip` layer inside a name scope named after it.
        with tf.name_scope(clip_layer.name):
            clip_layer.build(None)

Transformers-源码解析-二十五-

Transformers 源码解析（二十五）

.\models\clap\processing_clap.py

.\models\clap\__init__.py

.\models\clip\configuration_clip.py

.\models\clip\convert_clip_original_pytorch_to_hf.py

.\models\clip\feature_extraction_clip.py

.\models\clip\image_processing_clip.py

.\models\clip\modeling_clip.py

.\models\clip\modeling_flax_clip.py

.\models\clip\modeling_tf_clip.py

`.\models\clap\processing_clap.py`

`.\models\clap\__init__.py`

`.\models\clip\configuration_clip.py`

`.\models\clip\convert_clip_original_pytorch_to_hf.py`

`.\models\clip\feature_extraction_clip.py`

`.\models\clip\image_processing_clip.py`

`.\models\clip\modeling_clip.py`

`.\models\clip\modeling_flax_clip.py`

`.\models\clip\modeling_tf_clip.py`