Transformers 源码解析（五十九）

`.\models\idefics\perceiver.py`

    def __init__(
        self, config: IdeficsConfig, embed_dim: int, depth: int, n_heads: int, head_dim: int, n_latents: int
    ):
        """
        初始化函数，创建一个 IdeficsPerceiverResampler 对象。

        参数:
        - config: IdeficsConfig 对象，包含了模型的配置信息
        - embed_dim: 整数，嵌入维度，用于定义输入的特征维度
        - depth: 整数，表示模型的深度或层数
        - n_heads: 整数，注意力头的数量，用于多头注意力机制
        - head_dim: 整数，每个注意力头的维度
        - n_latents: 整数，指定要生成的潜变量的数量
        """
        # 调用父类 nn.Module 的初始化方法
        super().__init__()
        
        # 将传入的配置信息保存为类的属性
        self.config = config
        
        # 初始化一个线性层，用于将输入嵌入特征维度映射到 latent 潜变量的数量维度
        self.project_in = nn.Linear(embed_dim, n_latents * head_dim)
        
        # 保存配置中的参数作为类属性
        self.depth = depth
        self.n_heads = n_heads
        self.head_dim = head_dim
        self.n_latents = n_latents


这段代码定义了一个名为 `IdeficsPerceiverResampler` 的 PyTorch 模型类，用于实现 Perceiver Resampler 架构。
    ) -> None:
        """
        Instantiates a Perceiver Resampler that operates over a sequence of embeddings (say from a ResNet or ViT or
        MAE) of a given dimension, performs `depth` blocks of cross-attention with a fixed `n_latents` inputs, then
        returns a Tensor of shape [bsz, n_latents, embed_dim]. :param embed_dim: Dimensionality of embeddings being fed
        to the Perceiver Resampler (also dimensionality of latent embeddings *returned* by the Perceiver Resampler.
        Could be e.g., VIT embed_dim, ResNet pool dim, and so on.

        Args:
            config (`IdeficsConfig`): config object
            embed_dim (`int`): The size of each embedding vector
            depth (`int`): Depth of the Perceiver Resampler (Transformer w/ cross attention). Should be shallow (< 3).
            n_heads (`int`): Number of heads in each Transformer block (for multi-headed self-attention).
            head_dim (`int`): Dimensionality of each head projection in the Transformer block.
            n_latents (`int`):
                Number of latent embeddings to resample ("compress") the input sequence to (usually < 128).

        """
        super().__init__()
        # 设置类的属性 embed_dim, n_heads, head_dim, n_latents
        self.embed_dim, self.n_heads, self.head_dim, self.n_latents = embed_dim, n_heads, head_dim, n_latents
        # 获取配置文件中的 qk_layer_norms_perceiver 设置
        self.qk_layer_norms = config.perceiver_config.qk_layer_norms_perceiver

        # 创建 Perceiver 的潜变量（latent embeddings）
        self.latents = nn.Parameter(torch.randn(self.n_latents, self.embed_dim), requires_grad=True)

        # 确定中间层的维度，根据是否存在 vision_config.embed_dim 进行选择
        self.intermediate_dim = (
            self.embed_dim * 4
            if not hasattr(config.vision_config, "embed_dim")
            else config.vision_config.embed_dim * 4
        )
        # 创建包含 depth 个 Transformer 块的模块列表
        self.blocks = nn.ModuleList(
            [
                nn.ModuleList(
                    [
                        IdeficsPerceiverAttention(self.embed_dim, self.n_heads, self.head_dim, self.qk_layer_norms),
                        IdeficsMLP(self.intermediate_dim, config),
                    ]
                )
                for _ in range(depth)
            ]
        )
        # 创建用于归一化输出的 LayerNorm
        self.layer_norm = nn.LayerNorm(self.embed_dim)

    def forward(self, context: torch.Tensor) -> torch.Tensor:
        """Resample arbitrary length context & *compress* down to self.n_latents latent embeddings"""
        # 重复潜变量以匹配上下文张量的批次大小
        latents = self.latents.repeat(context.shape[0], 1, 1)

        # 通过每个 Transformer 块的注意力机制和 MLP 前馈网络
        for attn, ff in self.blocks:
            # 执行注意力机制
            latents = attn(context, latents) + latents
            # 执行前馈网络
            latents = ff(latents) + latents

        # 对最终的输出进行 LayerNorm 处理
        return self.layer_norm(latents)
class IdeficsPerceiverAttention(nn.Module):
    # 定义 Perceiver 注意力模块，用于处理跨注意力的计算
    def __init__(self, embed_dim: int, n_heads: int, head_dim: int, qk_layer_norms: bool) -> None:
        """Perceiver Cross-Attention Module --> let long-form inputs be `context`, resampled embeddings be `latents`"""
        # 初始化函数，设置模块的参数和层
        super().__init__()
        self.embed_dim, self.n_heads, self.head_dim = embed_dim, n_heads, head_dim
        self.qk_layer_norms = qk_layer_norms
        # 对上下文向量和潜在向量进行层标准化
        self.context_layer_norm = nn.LayerNorm(self.embed_dim)
        self.latents_layer_norm = nn.LayerNorm(self.embed_dim)
        if self.qk_layer_norms:
            # 如果需要，对查询和键进行单独的层标准化
            self.q_layer_norm = nn.LayerNorm(self.head_dim)
            self.k_layer_norm = nn.LayerNorm(self.head_dim)

        # 缩放因子，用于缩放 Q 和 K 的点积计算
        self.qk_scale = self.head_dim**-0.5

        # Q, K, V 投影层 (无偏置 -- 根据 Perceiver/Flamingo 论文中的详细说明)
        self.q_proj = nn.Linear(self.embed_dim, self.n_heads * self.head_dim, bias=False)
        self.k_proj = nn.Linear(self.embed_dim, self.n_heads * self.head_dim, bias=False)
        self.v_proj = nn.Linear(self.embed_dim, self.n_heads * self.head_dim, bias=False)

        # 输出投影层，将多头注意力结果投影到最终输出维度
        self.output_proj = nn.Linear(self.n_heads * self.head_dim, embed_dim, bias=False)
    def forward(self, context: torch.Tensor, latents: torch.Tensor) -> torch.Tensor:
        """
        Runs Perceiver Self-Attention, with special (context, latents) appended along the `seq` dimension!

        Args:
            context (`torch.Tensor`):
                Tensor of shape `[bsz, seq, embed_dim]` representing long-form context to resample.
            latents (`torch.Tensor`):
                Tensor of shape `[bsz, n_latents, embed_dim]` representing fixed length latents to compress to.

        Returns:
            `torch.Tensor`: Tensor of shape `[bsz, n_latents, embed_dim]` representing attention over latents w/ cross
            from context.
        """
        # 对上下文进行 layer normalization
        context = self.context_layer_norm(context)
        # 对潜变量进行 layer normalization
        latents = self.latents_layer_norm(latents)
        # 获取 batch_size, seq_length, embed_dim 的值
        batch_size, seq_length, embed_dim = context.shape[:3]

        # 查询、键、值的投影 --> 注意，在 Flamingo 中，潜变量会 *连接* 到上下文之前进行注意力操作！
        # 注意：这导致查询具有 `seq = n_latents`，键和值具有 `seq = len(context) + n_latents`
        q = self.q_proj(latents)
        k = self.k_proj(torch.cat([context, latents], dim=-2))
        v = self.v_proj(torch.cat([context, latents], dim=-2))

        # 多头自注意力机制，使用稳定的 softmax（在调用 softmax 前减去每行的最大值）
        #   =>> `attn` 应该是形状为 [n_latents x (context + n_latents)] 的二维矩阵
        q, k, v = [x.reshape(batch_size, x.shape[1], self.n_heads, self.head_dim).transpose(1, 2) for x in (q, k, v)]

        # 如果启用了 qk_layer_norms，对查询和键进行 layer normalization
        if self.qk_layer_norms:
            q = self.q_layer_norm(q)
            k = self.k_layer_norm(k)

        # 计算注意力分数
        scores = torch.einsum("... i d, ... j d -> ... i j", q * self.qk_scale, k)
        # 对分数进行稳定化处理（减去每行的最大值）
        stabilized_scores = scores - (scores.amax(dim=-1, keepdim=True).detach())
        # 应用 softmax 获取注意力权重
        attn = stabilized_scores.softmax(dim=-1)

        # 注意力加权平均并投影回输出空间...
        resampled = torch.einsum("... i j, ... j d -> ... i d", attn, v)
        # 对 resampled 进行重新排列，以便恢复形状为 "bsz seq (heads embed)"
        return self.output_proj(resampled.transpose(1, 2).flatten(-2))
class IdeficsMLP(nn.Module):
    def __init__(self, intermediate_size, config: IdeficsConfig):
        """Simple MLP block with intermediate_size and embedding size"""
        # 初始化函数，定义一个简单的MLP模块，使用给定的intermediate_size和配置的embedding大小
        super().__init__()
        # 从配置中获取视觉配置的嵌入维度
        self.embed_dim = config.vision_config.embed_dim
        # LayerNorm层，对输入进行归一化处理
        self.ln = nn.LayerNorm(self.embed_dim)
        # 全连接层，将输入的嵌入维度映射到intermediate_size维度，无偏置项
        self.fc = nn.Linear(self.embed_dim, intermediate_size, bias=False)
        # 激活函数ReLU
        self.act = nn.ReLU()
        # 进一步映射intermediate_size维度到嵌入维度，无偏置项
        self.c_proj = nn.Linear(intermediate_size, self.embed_dim, bias=False)

    def forward(self, hidden_states: Optional[Tuple[torch.FloatTensor]]) -> torch.FloatTensor:
        # 前向传播函数，接收隐藏状态作为输入，返回张量类型的隐藏状态
        hidden_states = self.ln(hidden_states)  # LayerNorm处理隐藏状态
        hidden_states = self.fc(hidden_states)  # 全连接层处理隐藏状态
        hidden_states = self.act(hidden_states)  # ReLU激活函数处理隐藏状态
        hidden_states = self.c_proj(hidden_states)  # 进一步映射处理隐藏状态

        return hidden_states

`.\models\idefics\processing_idefics.py`

# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Processor class for IDEFICS.
"""

from typing import Callable, List, Optional, Union
from urllib.parse import urlparse

from ...feature_extraction_utils import BatchFeature
from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, TextInput, TruncationStrategy
from ...utils import TensorType, is_torch_available


if is_torch_available():
    import torch


IMAGE_TOKEN = "<image>"


# copied from m4.training.packing
# 将增量注意力掩码转换为二进制注意力掩码
def incremental_to_binary_attention_mask(incremental_mask, num_classes=-1):
    # This function converts: [-1, 0, 1] => [[0, 0], [1, 0], [0, 1]]

    # 如果有任何图像索引超过 num_classes，将它们设置为 -1
    # 超过允许的最大图像数量的单词不参与任何注意力
    if num_classes != -1:
        incremental_mask[incremental_mask >= num_classes] = -1

    negatives = incremental_mask == -1
    incremental_mask[negatives] = 0
    attn_mask = torch.nn.functional.one_hot(incremental_mask, num_classes=num_classes)
    attn_mask[negatives, :] = 0
    return attn_mask


# copied from m4.training.packing
# 为打包的输入 ID 创建图像注意力掩码
def image_attention_mask_for_packed_input_ids(input_ids, tokenizer):
    image_attention_mask = torch.full_like(input_ids, fill_value=-1)
    next_image_attention_mask = torch.full_like(input_ids, fill_value=-1)
    image_token_id = tokenizer.convert_tokens_to_ids(IMAGE_TOKEN)
    eod_token_id = tokenizer.eos_token_id
    for batch_idx in range(input_ids.size(0)):
        count = -1
        seen_eod = False
        for idx, token_id in enumerate(input_ids[batch_idx]):
            if token_id == image_token_id:
                count += 1
                image_attention_mask[batch_idx][idx] = count
                seen_eod = False
            else:
                image_attention_mask[batch_idx][idx] = count

            if seen_eod:
                image_attention_mask[batch_idx][idx] = -1

            if token_id == eod_token_id:
                seen_eod = True
    # 遍历每个批次中的输入 ID
    for batch_idx in range(input_ids.size(0)):
        # 初始化计数器为-1，表示还未遇到图片标记
        count = -1
        # 标记是否已经遇到过结束符 (eod_token_id)
        seen_eod = False
        
        # 倒序遍历当前批次中的输入 ID
        for idx in range(input_ids[batch_idx].size(0) - 1, -1, -1):
            # 获取当前位置的 token ID
            token_id = input_ids[batch_idx][idx]
            
            # 如果当前 token 是图片标记 (image_token_id)
            if token_id == image_token_id:
                # 计数器加一，表示遇到了下一个图片标记
                count += 1
                # 在下一个图片标记的位置设置注意力掩码为当前计数值
                next_image_attention_mask[batch_idx][idx] = count
                # 重置结束符标记为未见过
                seen_eod = False
            else:
                # 在非图片标记位置设置注意力掩码为当前计数值
                next_image_attention_mask[batch_idx][idx] = count
            
            # 如果当前 token 是结束符 (eod_token_id)
            if token_id == eod_token_id:
                # 标记已经遇到过结束符
                seen_eod = True
            
            # 如果已经遇到过结束符
            if seen_eod:
                # 在结束符后的位置设置注意力掩码为-1
                next_image_attention_mask[batch_idx][idx] = -1
        
        # 找出非负索引位置
        non_negative_indices = next_image_attention_mask[batch_idx] != -1
        # 对非负索引位置的注意力掩码值进行调整
        next_image_attention_mask[batch_idx][non_negative_indices] -= count
        next_image_attention_mask[batch_idx][non_negative_indices] *= -1
    
    # 返回处理后的注意力掩码
    return image_attention_mask, next_image_attention_mask
def is_url(string):
    """Checks if the passed string contains a valid URL and nothing else. 
    If a space is included, the URL is immediately invalidated."""
    # 如果字符串中包含空格，则返回 False
    if " " in string:
        return False
    # 解析 URL，验证其结构是否符合 URL 标准
    result = urlparse(string)
    # 检查 URL 是否包含 scheme 和 netloc，若都包含则认为是有效的 URL
    return all([result.scheme, result.netloc])


class IdeficsProcessor(ProcessorMixin):
    r"""
    Constructs an IDEFICS processor which wraps a LLama tokenizer and IDEFICS image processor into a single processor.

    [`IdeficsProcessor`] offers all the functionalities of [`IdeficsImageProcessor`] and [`LlamaTokenizerFast`]. See
    the docstring of [`~IdeficsProcessor.__call__`] and [`~IdeficsProcessor.decode`] for more information.

    Args:
        image_processor (`IdeficsImageProcessor`):
            An instance of [`IdeficsImageProcessor`]. The image processor is a required input.
        tokenizer (`LlamaTokenizerFast`):
            An instance of [`LlamaTokenizerFast`]. The tokenizer is a required input.
        image_size (`int`, *optional*, defaults to 224): Image size (assuming a square image)
    """

    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "IdeficsImageProcessor"
    tokenizer_class = "LlamaTokenizerFast"

    def __init__(self, image_processor, tokenizer=None, image_size=224, add_end_of_utterance_token=None, **kwargs):
        # 检查 image_processor 是否为空，若为空则抛出 ValueError
        if image_processor is None:
            raise ValueError("You need to specify an `image_processor`.")
        # 检查 tokenizer 是否为空，若为空则抛出 ValueError
        if tokenizer is None:
            raise ValueError("You need to specify a `tokenizer`.")

        # 调用父类的初始化方法，传入 image_processor 和 tokenizer
        super().__init__(image_processor, tokenizer)
        # 将当前的处理器设置为 image_processor
        self.current_processor = self.image_processor
        # 将图片 token 的 ID 设置为 tokenizer 中 IMAGE_TOKEN 对应的 ID
        self.image_token_id = tokenizer.convert_tokens_to_ids(IMAGE_TOKEN)

        # 设置默认的图片维度，从 image_processor 中获取图像的通道数和尺寸
        self.default_image_dims = (
            self.image_processor.image_num_channels,
            self.image_processor.image_size,
            self.image_processor.image_size,
        )

        # 检查 tokenizer 是否训练过 "<end_of_utterance>" 这个特殊 token
        self.tokenizer_was_trained_with_end_of_utterance_token = (
            True
            if "<end_of_utterance>" in self.tokenizer.special_tokens_map.get("additional_special_tokens", [])
            else False
        )

    def __call__(
        self,
        prompts: Union[List[TextInput], List[List[TextInput]]],
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = None,
        max_length: Optional[int] = None,
        transform: Callable = None,
        add_eos_token=False,
        add_end_of_utterance_token=None,
        debug=False,
        return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
    ):
        """
        This method processes input prompts into tokenized and optionally transformed outputs.

        Args:
            prompts (Union[List[TextInput], List[List[TextInput]]]): Input prompts to process.
            padding (Union[bool, str, PaddingStrategy], optional): Padding strategy for tokenized outputs. Defaults to False.
            truncation (Union[bool, str, TruncationStrategy], optional): Truncation strategy for tokenized outputs. Defaults to None.
            max_length (Optional[int], optional): Maximum length of the tokenized outputs. Defaults to None.
            transform (Callable, optional): Transformation function applied after tokenization. Defaults to None.
            add_eos_token (bool, optional): Whether to add an end-of-sequence token. Defaults to False.
            add_end_of_utterance_token (None, optional): Placeholder for adding end-of-utterance token. Defaults to None.
            debug (bool, optional): Whether to enable debug mode. Defaults to False.
            return_tensors (Optional[Union[str, TensorType]], optional): Output tensor type. Defaults to TensorType.PYTORCH.

        Returns:
            Dict[str, Any]: Processed outputs based on input prompts.
        """
        # 在这里实现具体的处理逻辑，将输入 prompts 处理为 tokenized 和可能转换后的输出
        pass

    def batch_decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
        refer to the docstring of this method for more information.
        """
        # 调用 tokenizer 的 batch_decode 方法，将参数透传给它
        return self.tokenizer.batch_decode(*args, **kwargs)
    # 将所有参数转发到 LlamaTokenizerFast 的 `PreTrainedTokenizer.decode` 方法中，并返回结果
    def decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
        the docstring of this method for more information.
        """
        return self.tokenizer.decode(*args, **kwargs)

    # 获取模型输入的名称列表，合并并去重来自于 tokenizer 和 image_processor 的输入名称
    @property
    def model_input_names(self):
        tokenizer_input_names = self.tokenizer.model_input_names
        image_processor_input_names = self.image_processor.model_input_names
        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))

`.\models\idefics\vision.py`

# 设置文件编码为 UTF-8

# 版权声明，声明此代码版权归 OpenAI 团队和 HuggingFace 团队所有，保留所有权利
#
# 根据 Apache 许可证 2.0 版本授权使用本文件
# 除非符合许可证规定，否则不得使用本文件
# 您可以从以下网址获取许可证的副本：
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# 除非适用法律要求或书面同意，本软件是基于“按原样”提供的，没有任何形式的明示或暗示担保或条件
# 有关详细信息，请参阅许可证

""" PyTorch IdeficsVision model: a copy of CLIPVisionModel using a simpler config object"""

# 导入必要的库和模块
import math
from dataclasses import dataclass
from typing import Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn

# 导入模型输出相关的类和函数
from ...activations import ACT2FN
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
from ...utils import ModelOutput, logging

# 导入 IdeficsVisionConfig 配置类
from .configuration_idefics import IdeficsVisionConfig

# 获取日志记录器对象
logger = logging.get_logger(__name__)

# 数据类，用于表示 IdeficsVision 模型的输出
@dataclass
class IdeficsVisionModelOutput(ModelOutput):
    """
    Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.

    Args:
        image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
            The image embeddings obtained by applying the projection layer to the pooler_output.
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True` is set to `True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    image_embeds: Optional[torch.FloatTensor] = None  # 可选的图像嵌入，通过将投影层应用于 pooler_output 而获得
    last_hidden_state: torch.FloatTensor = None  # 最后一层模型输出的隐藏状态序列
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None  # 隐藏状态的元组，每层模型的输出和可选的初始嵌入输出
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None  # 注意力权重的元组，用于计算自注意头中的加权平均值
# Adapted from transformers.models.clip.modeling_clip.CLIPVisionEmbeddings
# 从 transformers.models.clip.modeling_clip.CLIPVisionEmbeddings 改编而来

class IdeficsVisionEmbeddings(nn.Module):
    # 定义 IdeficsVisionEmbeddings 类，继承自 nn.Module
    def __init__(self, config: IdeficsVisionConfig):
        super().__init__()
        # 调用父类构造函数初始化模块
        self.config = config
        # 存储传入的配置对象
        self.embed_dim = config.hidden_size
        # 设置嵌入维度为配置中的隐藏大小
        self.image_size = config.image_size
        # 图像尺寸为配置中的图像大小
        self.patch_size = config.patch_size
        # 补丁大小为配置中的补丁大小

        self.class_embedding = nn.Parameter(torch.randn(self.embed_dim))
        # 初始化类别嵌入为一个随机的可学习参数

        self.patch_embedding = nn.Conv2d(
            in_channels=config.num_channels,
            out_channels=self.embed_dim,
            kernel_size=self.patch_size,
            stride=self.patch_size,
            bias=False,
        )
        # 创建卷积层用于将图像补丁映射到嵌入维度空间，无偏置项

        self.num_patches = (self.image_size // self.patch_size) ** 2
        # 计算图像中的补丁数量

        self.num_positions = self.num_patches + 1
        # 计算位置嵌入的数量，为补丁数量加一

        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
        # 创建位置嵌入层，将每个位置映射到嵌入维度空间

        self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)
        # 注册位置 ID 缓冲区，用于存储位置索引张量，非持久化

    # Heavily inspired from https://github.com/huggingface/transformers/blob/v4.33.0/src/transformers/models/vit/modeling_vit.py#L82
    # 在很大程度上受到 https://github.com/huggingface/transformers/blob/v4.33.0/src/transformers/models/vit/modeling_vit.py#L82 的启发
    # 对嵌入向量进行插值，以便在更高分辨率的图像上使用预训练的位置编码
    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
        """
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher
        resolution images.

        Source:
        https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174
        """

        # 计算嵌入向量中的补丁数量
        num_patches = embeddings.shape[1] - 1
        # 获取位置编码
        pos_embed = self.position_embedding(self.position_ids)
        # 获取位置编码的数量
        num_positions = pos_embed.shape[1] - 1

        # 如果补丁数量与位置编码数量相等，并且图像高度与宽度相等，则直接返回位置编码
        if num_patches == num_positions and height == width:
            return pos_embed

        # 提取类别位置编码和补丁位置编码
        class_pos_embed = pos_embed[:, 0]
        patch_pos_embed = pos_embed[:, 1:]

        # 获取嵌入向量的维度
        embed_dim = embeddings.shape[-1]
        # 计算高度和宽度上的补丁数量
        num_h_patches = height // self.config.patch_size
        num_w_patches = width // self.config.patch_size

        # 添加一个小数以避免插值时的浮点错误
        num_h_patches, num_w_patches = num_h_patches + 0.1, num_w_patches + 0.1

        # 计算位置编码中位置的平方根
        sqrt_num_positions = math.sqrt(num_positions)

        # 重塑补丁位置编码的形状以便插值
        patch_pos_embed = patch_pos_embed.reshape(1, int(sqrt_num_positions), int(sqrt_num_positions), embed_dim)
        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)

        # 如果补丁位置编码的数据类型为 torch.bfloat16，则进行转换为 torch.float，因为 torch.bfloat16 不支持 bicubic 插值
        fp32_upcasting = patch_pos_embed.dtype == torch.bfloat16
        if fp32_upcasting:
            logger.warning_once(
                "Upcasting patch_pos_embed to fp32 for interpolation since `upsample_bicubic2d_out_frame` in nn.functional.interpolate "
                "is not implemented for 'torch.bfloat16' dtype. This will result in a slight overhead."
            )
            patch_pos_embed = patch_pos_embed.to(torch.float)

        # 使用双三次插值对补丁位置编码进行插值
        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed,
            scale_factor=(num_h_patches / sqrt_num_positions, num_w_patches / sqrt_num_positions),
            mode="bicubic",
            align_corners=False,
        )

        # 如果之前进行了类型转换，则将补丁位置编码还原为 torch.bfloat16 类型
        if fp32_upcasting:
            patch_pos_embed = patch_pos_embed.to(torch.bfloat16)

        # 检查插值后的补丁位置编码形状是否符合预期
        if int(num_h_patches) != patch_pos_embed.shape[-2] or int(num_w_patches) != patch_pos_embed.shape[-1]:
            raise ValueError(
                f"Number of patches for images ({int(num_h_patches), int(num_w_patches)}) don't match the "
                f"shape of position embedding ({patch_pos_embed.shape[-2], patch_pos_embed.shape[-1]})"
            )

        # 调整补丁位置编码的形状并返回
        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, embed_dim)
        return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1)
    # 定义前向传播函数，输入是像素值张量 pixel_values 和是否插值位置编码的标志 interpolate_pos_encoding
    def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding: bool = False) -> torch.Tensor:
        # 获取输入张量的批量大小、通道数、高度和宽度
        batch_size, num_channels, height, width = pixel_values.shape
        
        # 如果不进行位置编码的插值
        if not interpolate_pos_encoding:
            # 检查输入图像的尺寸是否与模型要求的 self.image_size 一致，否则引发值错误
            if height != self.image_size or width != self.image_size:
                raise ValueError(
                    f"Input image size ({height}*{width}) doesn't match model"
                    f" ({self.image_size}*{self.image_size}). You should try to set `interpolate_pos_encoding=True`"
                )

        # 确定目标数据类型为 self.patch_embedding 的权重数据类型
        target_dtype = self.patch_embedding.weight.dtype
        
        # 使用 patch_embedding 将像素值张量转换为补丁嵌入向量，形状为 [*, width, grid, grid]
        patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))

        # 将 patch_embeds 沿着第三个维度展平，然后交换第一维和第二维
        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)

        # 创建类别嵌入向量，扩展到与 batch_size 相同的大小
        class_embeds = self.class_embedding.expand(batch_size, 1, -1)
        
        # 将类别嵌入向量与补丁嵌入向量连接起来
        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)

        # 如果插值位置编码为真，则对每个令牌添加位置编码
        if interpolate_pos_encoding:
            embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
        else:
            # 否则，使用预定义的位置 id 来添加位置编码
            embeddings = embeddings + self.position_embedding(self.position_ids)

        # 返回最终的嵌入向量
        return embeddings
# Copied from transformers.models.clip.modeling_clip.CLIPAttention with CLIP->IdeficsVision
class IdeficsVisionAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size  # 设置嵌入维度为隐藏大小
        self.num_heads = config.num_attention_heads  # 获取注意力头的数量
        self.head_dim = self.embed_dim // self.num_heads  # 计算每个注意力头的维度
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        self.scale = self.head_dim**-0.5  # 缩放因子，根据头的维度计算
        self.dropout = config.attention_dropout  # 注意力部分的丢弃率

        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)  # 查询投影层
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)  # 值投影层
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)  # 键投影层
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)  # 输出投影层

    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        """重新形状张量以适应多头注意力的结构"""
        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        causal_attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,



# Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->IdeficsVision
class IdeficsVisionMLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.activation_fn = ACT2FN[config.hidden_act]  # 激活函数从配置中获取
        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)  # 第一个全连接层
        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)  # 第二个全连接层

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.fc1(hidden_states)  # 第一个全连接层计算
        hidden_states = self.activation_fn(hidden_states)  # 应用激活函数
        hidden_states = self.fc2(hidden_states)  # 第二个全连接层计算
        return hidden_states



# Copied from transformers.models.clip.modeling_clip.CLIPEncoderLayer with CLIP->IdeficsVision
class IdeficsVisionEncoderLayer(nn.Module):
    def __init__(self, config: IdeficsVisionConfig):
        super().__init__()
        self.embed_dim = config.hidden_size  # 嵌入维度等于隐藏大小
        self.self_attn = IdeficsVisionAttention(config)  # 自注意力机制
        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)  # 第一个层归一化
        self.mlp = IdeficsVisionMLP(config)  # 多层感知器网络
        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)  # 第二个层归一化

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        causal_attention_mask: torch.Tensor,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.FloatTensor]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
                `(config.encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
        residual = hidden_states  # 保存输入的原始 hidden_states 作为残差连接的一部分

        hidden_states = self.layer_norm1(hidden_states)  # 对输入进行 Layer Normalization

        # 调用 self-attention 层计算新的 hidden_states 和注意力权重
        hidden_states, attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            causal_attention_mask=causal_attention_mask,
            output_attentions=output_attentions,
        )

        hidden_states = residual + hidden_states  # 将残差与经过 self-attention 后的结果相加

        residual = hidden_states  # 更新残差连接的变量为当前的 hidden_states

        hidden_states = self.layer_norm2(hidden_states)  # 对新的 hidden_states 进行 Layer Normalization

        hidden_states = self.mlp(hidden_states)  # 通过 MLP 进行全连接网络的处理

        hidden_states = residual + hidden_states  # 将残差与经过 MLP 后的结果相加

        outputs = (hidden_states,)  # 将处理后的 hidden_states 包装成 tuple 输出

        if output_attentions:
            outputs += (attn_weights,)  # 如果需要输出注意力权重，将它们也加入输出的 tuple 中

        return outputs  # 返回输出的 tuple，其中包括处理后的 hidden_states 和可能的注意力权重
# 从transformers.models.clip.modeling_clip.CLIPEncoder复制，将CLIP更改为IdeficsVision
class IdeficsVisionEncoder(nn.Module):
    """
    IdeficsVision编码器，由`config.num_hidden_layers`个自注意力层组成。每一层都是一个[`IdeficsVisionEncoderLayer`]。

    Args:
        config: IdeficsVisionConfig
    """

    def __init__(self, config: IdeficsVisionConfig):
        super().__init__()
        self.config = config
        # 创建包含`config.num_hidden_layers`个IdeficsVisionEncoderLayer的模块列表
        self.layers = nn.ModuleList([IdeficsVisionEncoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        inputs_embeds,
        attention_mask: Optional[torch.Tensor] = None,
        causal_attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,



# 从transformers.models.clip.modeling_clip.CLIPVisionTransformer调整而来
class IdeficsVisionTransformer(nn.Module):
    def __init__(self, config: IdeficsVisionConfig):
        super().__init__()
        self.config = config
        embed_dim = config.hidden_size

        # 初始化嵌入层和LayerNorm
        self.embeddings = IdeficsVisionEmbeddings(config)
        self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
        # 初始化IdeficsVision编码器
        self.encoder = IdeficsVisionEncoder(config)
        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

    # 从transformers.models.clip.modeling_clip.CLIPVisionTransformer.forward适应而来
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: Optional[bool] = False,
        return_dict: Optional[bool] = None,
        ) -> Union[Tuple, BaseModelOutputWithPooling]:
        r"""
        返回模型的输出结果。

        """
        # 如果未指定output_attentions，则使用配置中的output_attentions参数
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        # 如果未指定output_hidden_states，则使用配置中的output_hidden_states参数
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        # 如果未指定return_dict，则使用配置中的use_return_dict参数
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # 如果pixel_values为None，则抛出数值错误异常
        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        # 将像素值嵌入到模型的嵌入层中，如果指定了interpolate_pos_encoding，则插值位置编码
        hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)
        # 在嵌入层之后应用预层归一化
        hidden_states = self.pre_layrnorm(hidden_states)

        # 使用编码器对嵌入的输入进行编码
        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # 获取编码器输出的最后隐藏状态
        last_hidden_state = encoder_outputs[0]
        # 提取汇聚输出，即取每个样本的第一个位置的隐藏状态
        pooled_output = last_hidden_state[:, 0, :]
        # 在汇聚输出后应用后层归一化
        pooled_output = self.post_layernorm(pooled_output)

        # 如果return_dict为False，则返回元组形式的输出
        if not return_dict:
            return (last_hidden_state, pooled_output) + encoder_outputs[1:]

        # 如果return_dict为True，则返回BaseModelOutputWithPooling对象
        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )

`.\models\idefics\init.py`

# 版权声明及许可证信息，声明代码版权归 HuggingFace 团队所有，基于 Apache License, Version 2.0 发布
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# 导入类型检查模块中的 TYPE_CHECKING
from typing import TYPE_CHECKING

# 从工具模块中导入必要的异常和工具函数
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available

# 定义模块导入结构，包含 IDEFiCS 配置和模型相关的结构
_import_structure = {"configuration_idefics": ["IDEFICS_PRETRAINED_CONFIG_ARCHIVE_MAP", "IdeficsConfig"]}

# 检查是否图像处理可用，如果不可用则抛出 OptionalDependencyNotAvailable 异常
try:
    if not is_vision_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # 如果可用，添加图像处理相关的导入结构
    _import_structure["image_processing_idefics"] = ["IdeficsImageProcessor"]

# 检查是否 Torch 可用，如果不可用则抛出 OptionalDependencyNotAvailable 异常
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # 如果可用，添加模型和处理相关的导入结构
    _import_structure["modeling_idefics"] = [
        "IDEFICS_PRETRAINED_MODEL_ARCHIVE_LIST",
        "IdeficsForVisionText2Text",
        "IdeficsModel",
        "IdeficsPreTrainedModel",
    ]
    _import_structure["processing_idefics"] = ["IdeficsProcessor"]

# 如果是类型检查阶段
if TYPE_CHECKING:
    # 从 configuration_idefics 模块导入必要的类和对象
    from .configuration_idefics import IDEFICS_PRETRAINED_CONFIG_ARCHIVE_MAP, IdeficsConfig

    # 检查是否图像处理可用，如果不可用则抛出 OptionalDependencyNotAvailable 异常
    try:
        if not is_vision_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # 如果可用，从 image_processing_idefics 模块导入必要的类和对象
        from .image_processing_idefics import IdeficsImageProcessor

    # 检查是否 Torch 可用，如果不可用则抛出 OptionalDependencyNotAvailable 异常
    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # 如果可用，从 modeling_idefics 和 processing_idefics 模块导入必要的类和对象
        from .modeling_idefics import (
            IDEFICS_PRETRAINED_MODEL_ARCHIVE_LIST,
            IdeficsForVisionText2Text,
            IdeficsModel,
            IdeficsPreTrainedModel,
        )
        from .processing_idefics import IdeficsProcessor

# 如果不是类型检查阶段
else:
    import sys

    # 将当前模块设置为延迟加载模块，使用 _LazyModule 进行懒加载
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure)

`.\models\imagegpt\configuration_imagegpt.py`

# coding=utf-8
# Copyright 2021 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" OpenAI ImageGPT configuration"""

# 导入必要的模块和类
from collections import OrderedDict
from typing import TYPE_CHECKING, Any, Mapping, Optional

# 导入配置工具和ONNX配置
from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfig
# 导入日志工具
from ...utils import logging

# 如果是类型检查，导入特定模块
if TYPE_CHECKING:
    from ... import FeatureExtractionMixin, TensorType

# 获取logger对象
logger = logging.get_logger(__name__)

# 预训练模型配置映射字典
IMAGEGPT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "openai/imagegpt-small": "",
    "openai/imagegpt-medium": "",
    "openai/imagegpt-large": "",
}


class ImageGPTConfig(PretrainedConfig):
    """
    This is the configuration class to store the configuration of a [`ImageGPTModel`] or a [`TFImageGPTModel`]. It is
    used to instantiate a GPT-2 model according to the specified arguments, defining the model architecture.
    Instantiating a configuration with the defaults will yield a similar configuration to that of the ImageGPT
    [openai/imagegpt-small](https://huggingface.co/openai/imagegpt-small) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.
    """
    # 定义 ImageGPT 模型的配置类 ImageGPTConfig，用于设置模型参数
    Args:
        vocab_size (`int`, *optional*, defaults to 512):
            GPT-2 模型的词汇表大小，定义了可以由 `inputs_ids` 表示的不同标记数量。
        n_positions (`int`, *optional*, defaults to 32*32):
            模型可能使用的最大序列长度。通常设置为一个较大的值（例如 512、1024 或 2048）。
        n_embd (`int`, *optional*, defaults to 512):
            嵌入和隐藏状态的维度。
        n_layer (`int`, *optional*, defaults to 24):
            Transformer 编码器中的隐藏层数。
        n_head (`int`, *optional*, defaults to 8):
            Transformer 编码器中每个注意力层的注意头数。
        n_inner (`int`, *optional*, defaults to None):
            内部前馈层的维度。如果为 `None`，将设置为 4 倍的 n_embd。
        activation_function (`str`, *optional*, defaults to `"quick_gelu"`):
            激活函数（可以是 src/transformers/activations.py 中定义的激活函数之一）。默认为 "quick_gelu"。
        resid_pdrop (`float`, *optional*, defaults to 0.1):
            嵌入层、编码器和池化器中所有全连接层的 dropout 概率。
        embd_pdrop (`int`, *optional*, defaults to 0.1):
            嵌入层的 dropout 比率。
        attn_pdrop (`float`, *optional*, defaults to 0.1):
            注意力机制的 dropout 比率。
        layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
            层归一化层使用的 epsilon。
        initializer_range (`float`, *optional*, defaults to 0.02):
            用于初始化所有权重矩阵的截断正态初始化器的标准差。
        scale_attn_weights (`bool`, *optional*, defaults to `True`):
            是否通过除以 sqrt(hidden_size) 缩放注意力权重。
        use_cache (`bool`, *optional*, defaults to `True`):
            模型是否应返回最后一次的键/值注意力（不是所有模型都使用）。
        scale_attn_by_inverse_layer_idx (`bool`, *optional*, defaults to `False`):
            是否额外按 `1 / layer_idx + 1` 缩放注意力权重。
        reorder_and_upcast_attn (`bool`, *optional*, defaults to `False`):
            是否在计算注意力（点积）之前缩放键（K）并在训练时将注意力点积/softmax 升级为 float()（用于混合精度）。

    Example:

    ```
    >>> from transformers import ImageGPTConfig, ImageGPTModel

    >>> # 初始化一个 ImageGPT 配置对象
    >>> configuration = ImageGPTConfig()

    >>> # 使用配置对象初始化一个模型（带有随机权重）
    >>> model = ImageGPTModel(configuration)
    ```
    # 定义模型类型为"imagegpt"
    model_type = "imagegpt"
    # 在推理阶段需要忽略的关键字列表
    keys_to_ignore_at_inference = ["past_key_values"]
    # 属性映射字典，将模型配置的名称映射到内部属性名称
    attribute_map = {
        "hidden_size": "n_embd",
        "max_position_embeddings": "n_positions",
        "num_attention_heads": "n_head",
        "num_hidden_layers": "n_layer",
    }

    # 初始化方法，设置模型各种配置参数
    def __init__(
        self,
        vocab_size=512 + 1,  # 词汇表大小，默认为512（加一是为了起始标记的token）
        n_positions=32 * 32,  # 最大位置编码，默认为32*32
        n_embd=512,  # 隐藏单元的维度，默认为512
        n_layer=24,  # 隐藏层的数量，默认为24
        n_head=8,  # 注意力头的数量，默认为8
        n_inner=None,  # 内部隐藏层的维度，默认为None
        activation_function="quick_gelu",  # 激活函数，默认为"quick_gelu"
        resid_pdrop=0.1,  # 残差连接中的dropout概率，默认为0.1
        embd_pdrop=0.1,  # 嵌入层的dropout概率，默认为0.1
        attn_pdrop=0.1,  # 注意力层的dropout概率，默认为0.1
        layer_norm_epsilon=1e-5,  # 层归一化中epsilon的值，默认为1e-5
        initializer_range=0.02,  # 初始化范围，默认为0.02
        scale_attn_weights=True,  # 是否缩放注意力权重，默认为True
        use_cache=True,  # 是否使用缓存，默认为True
        tie_word_embeddings=False,  # 是否绑定词嵌入，默认为False
        scale_attn_by_inverse_layer_idx=False,  # 是否通过逆层索引缩放注意力，默认为False
        reorder_and_upcast_attn=False,  # 是否重排序和上升注意力，默认为False
        **kwargs,  # 其他关键字参数
    ):
        # 初始化各个配置参数
        self.vocab_size = vocab_size
        self.n_positions = n_positions
        self.n_embd = n_embd
        self.n_layer = n_layer
        self.n_head = n_head
        self.n_inner = n_inner
        self.activation_function = activation_function
        self.resid_pdrop = resid_pdrop
        self.embd_pdrop = embd_pdrop
        self.attn_pdrop = attn_pdrop
        self.layer_norm_epsilon = layer_norm_epsilon
        self.initializer_range = initializer_range
        self.scale_attn_weights = scale_attn_weights
        self.use_cache = use_cache
        self.scale_attn_by_inverse_layer_idx = scale_attn_by_inverse_layer_idx
        self.reorder_and_upcast_attn = reorder_and_upcast_attn
        self.tie_word_embeddings = tie_word_embeddings

        # 调用父类初始化方法，传递绑定词嵌入的参数和其他关键字参数
        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
# 定义一个继承自OnnxConfig的ImageGPTOnnxConfig类，用于配置基于ONNX的图像生成模型
class ImageGPTOnnxConfig(OnnxConfig):

    # 定义一个属性方法inputs，返回一个有序字典，描述了模型的输入信息
    @property
    def inputs(self) -> Mapping[str, Mapping[int, str]]:
        return OrderedDict(
            [
                ("input_ids", {0: "batch", 1: "sequence"}),
            ]
        )

    # 定义一个生成虚拟输入数据的方法generate_dummy_inputs
    def generate_dummy_inputs(
        self,
        preprocessor: "FeatureExtractionMixin",
        batch_size: int = 1,
        seq_length: int = -1,
        is_pair: bool = False,
        framework: Optional["TensorType"] = None,
        num_channels: int = 3,
        image_width: int = 32,
        image_height: int = 32,
    ) -> Mapping[str, Any]:
        """
        Generate inputs to provide to the ONNX exporter for the specific framework

        Args:
            preprocessor ([`PreTrainedTokenizerBase`] or [`FeatureExtractionMixin`]):
                与此模型配置相关联的预处理器。
            batch_size (`int`, *optional*, defaults to -1):
                导出模型的批处理大小（-1表示动态轴）。
            num_choices (`int`, *optional*, defaults to -1):
                多选任务提供的候选答案数量（-1表示动态轴）。
            seq_length (`int`, *optional*, defaults to -1):
                导出模型的序列长度（-1表示动态轴）。
            is_pair (`bool`, *optional*, defaults to `False`):
                指示输入是否为一对（句子1，句子2）。
            framework (`TensorType`, *optional*, defaults to `None`):
                预处理器将为其生成张量的框架（PyTorch或TensorFlow）。
            num_channels (`int`, *optional*, defaults to 3):
                生成图像的通道数。
            image_width (`int`, *optional*, defaults to 40):
                生成图像的宽度。
            image_height (`int`, *optional*, defaults to 40):
                生成图像的高度。

        Returns:
            Mapping[str, Tensor]：包含提供给模型前向函数的关键字参数
        """

        # 使用内部方法_generate_dummy_images生成虚拟输入图像数据
        input_image = self._generate_dummy_images(batch_size, num_channels, image_height, image_width)
        
        # 调用预处理器preprocessor，传递生成的图像数据input_image，并根据framework返回张量
        inputs = dict(preprocessor(images=input_image, return_tensors=framework))

        # 返回输入参数字典
        return inputs

`.\models\imagegpt\convert_imagegpt_original_tf2_to_pytorch.py`

# coding=utf-8
# Copyright 2021 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convert OpenAI Image GPT checkpoints."""


import argparse  # 导入 argparse 模块，用于解析命令行参数

import torch  # 导入 PyTorch 库

from transformers import ImageGPTConfig, ImageGPTForCausalLM, load_tf_weights_in_imagegpt
from transformers.utils import CONFIG_NAME, WEIGHTS_NAME, logging  # 导入相关类和函数


logging.set_verbosity_info()  # 设置日志输出级别为信息


def convert_imagegpt_checkpoint_to_pytorch(imagegpt_checkpoint_path, model_size, pytorch_dump_folder_path):
    # Construct configuration depending on size
    MODELS = {"small": (512, 8, 24), "medium": (1024, 8, 36), "large": (1536, 16, 48)}
    n_embd, n_head, n_layer = MODELS[model_size]  # 根据给定的模型大小设置模型超参数
    config = ImageGPTConfig(n_embd=n_embd, n_layer=n_layer, n_head=n_head)  # 构建 ImageGPT 的配置对象
    model = ImageGPTForCausalLM(config)  # 根据配置创建 ImageGPT 模型对象

    # Load weights from numpy
    load_tf_weights_in_imagegpt(model, config, imagegpt_checkpoint_path)  # 加载 TensorFlow 的权重到 PyTorch 模型中

    # Save pytorch-model
    pytorch_weights_dump_path = pytorch_dump_folder_path + "/" + WEIGHTS_NAME  # 设置 PyTorch 模型权重保存路径
    pytorch_config_dump_path = pytorch_dump_folder_path + "/" + CONFIG_NAME  # 设置 PyTorch 模型配置保存路径
    print(f"Save PyTorch model to {pytorch_weights_dump_path}")  # 输出保存 PyTorch 模型权重的信息
    torch.save(model.state_dict(), pytorch_weights_dump_path)  # 保存 PyTorch 模型的权重
    print(f"Save configuration file to {pytorch_config_dump_path}")  # 输出保存配置文件的信息
    with open(pytorch_config_dump_path, "w", encoding="utf-8") as f:  # 打开配置文件路径，准备写入配置
        f.write(config.to_json_string())  # 将配置对象转换为 JSON 字符串并写入文件


if __name__ == "__main__":
    parser = argparse.ArgumentParser()  # 创建命令行参数解析器
    # Required parameters
    parser.add_argument(
        "--imagegpt_checkpoint_path",
        default=None,
        type=str,
        required=True,
        help="Path to the TensorFlow checkpoint path.",  # TensorFlow 检查点路径，必填参数
    )
    parser.add_argument(
        "--model_size",
        default=None,
        type=str,
        required=True,
        help="Size of the model (can be either 'small', 'medium' or 'large').",  # 模型大小，必填参数
    )
    parser.add_argument(
        "--pytorch_dump_folder_path",
        default=None,
        type=str,
        required=True,
        help="Path to the output PyTorch model.",  # 输出 PyTorch 模型路径，必填参数
    )
    args = parser.parse_args()  # 解析命令行参数
    convert_imagegpt_checkpoint_to_pytorch(
        args.imagegpt_checkpoint_path, args.model_size, args.pytorch_dump_folder_path
    )  # 调用函数将 TensorFlow 检查点转换为 PyTorch 模型并保存

`.\models\imagegpt\feature_extraction_imagegpt.py`

# 指定编码格式为 UTF-8，确保文件中的所有字符能够正确地被解析和处理
# 版权声明，指出 HuggingFace Inc. 团队保留所有权利
#
# 根据 Apache 许可证 2.0 版本的规定使用此文件
# 您只能在遵守许可证的前提下使用本文件
# 您可以从以下链接获取许可证的副本：
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# 除非适用法律要求或书面同意，本软件是基于"原样"提供的，不提供任何形式的担保或条件
# 无论是明示的还是暗示的，包括但不限于对特定用途的适销性和适用性的暗示担保。
# 有关许可证的详细信息，请参阅许可证文档。
"""ImageGPT 的特征提取器类。"""

# 引入警告模块，用于发出关于类过时的警告
import warnings

# 从 utils 模块中引入日志记录功能
from ...utils import logging

# 从 image_processing_imagegpt 模块导入 ImageGPTImageProcessor 类
from .image_processing_imagegpt import ImageGPTImageProcessor

# 获取当前模块的日志记录器
logger = logging.get_logger(__name__)


class ImageGPTFeatureExtractor(ImageGPTImageProcessor):
    def __init__(self, *args, **kwargs) -> None:
        # 发出警告：ImageGPTFeatureExtractor 类已弃用，将在 Transformers 版本 5 中移除，请使用 ImageGPTImageProcessor 替代。
        warnings.warn(
            "The class ImageGPTFeatureExtractor is deprecated and will be removed in version 5 of Transformers."
            " Please use ImageGPTImageProcessor instead.",
            FutureWarning,
        )
        # 调用父类 ImageGPTImageProcessor 的构造函数，传递所有位置参数和关键字参数
        super().__init__(*args, **kwargs)

`.\models\imagegpt\image_processing_imagegpt.py`

# 导入所需的模块和函数
from typing import Dict, List, Optional, Union  # 导入类型提示相关的模块和函数

import numpy as np  # 导入 NumPy 库，用于数值计算

# 导入所需的图像处理工具函数和类
from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
from ...image_transforms import rescale, resize, to_channel_dimension_format  # 导入图像变换相关函数
from ...image_utils import (
    ChannelDimension,
    ImageInput,
    PILImageResampling,
    infer_channel_dimension_format,
    is_scaled_image,
    make_list_of_images,
    to_numpy_array,
    valid_images,
    validate_kwargs,
    validate_preprocess_arguments,
)  # 导入图像处理和验证相关函数

from ...utils import TensorType, is_vision_available, logging  # 导入其他工具函数和变量

# 如果视觉相关功能可用，则导入 PIL 库
if is_vision_available():
    import PIL

# 获取 logger 实例，用于记录日志
logger = logging.get_logger(__name__)


def squared_euclidean_distance(a, b):
    """
    计算两组向量之间的平方欧几里得距离。

    参数：
    - a: 第一组向量，形状为 (m, d)
    - b: 第二组向量，形状为 (n, d)

    返回：
    - d: 形状为 (m, n) 的距离矩阵
    """
    b = b.T  # 将 b 转置为 (d, n)，以便计算点积
    a2 = np.sum(np.square(a), axis=1)  # 计算 a 中每个向量的平方和
    b2 = np.sum(np.square(b), axis=0)  # 计算 b 中每个向量的平方和
    ab = np.matmul(a, b)  # 计算 a 和 b 之间的点积
    d = a2[:, None] - 2 * ab + b2[None, :]  # 计算平方欧几里得距离矩阵
    return d


def color_quantize(x, clusters):
    """
    对输入的像素值进行颜色量化，将每个像素映射到最近的颜色簇。

    参数：
    - x: 输入的像素值数组，形状为 (h*w, 3)，其中 h 是高度，w 是宽度
    - clusters: 颜色簇的数组，形状为 (k, 3)，其中 k 是颜色簇的数量

    返回：
    - 颜色簇索引数组，形状为 (h*w,)，每个元素表示对应像素所属的颜色簇索引
    """
    x = x.reshape(-1, 3)  # 将输入的像素值重塑为 (h*w, 3) 的二维数组
    d = squared_euclidean_distance(x, clusters)  # 计算每个像素值与各个颜色簇的距离
    return np.argmin(d, axis=1)  # 返回每个像素值所属的最近颜色簇的索引


class ImageGPTImageProcessor(BaseImageProcessor):
    """
    ImageGPT 的图像处理器类，用于将图像调整大小到较小的分辨率（如 32x32 或 64x64），归一化并进行颜色量化，
    以获取像素值序列（颜色簇）。
    """
    def __init__(self):
        """
        初始化 ImageGPTImageProcessor 类。
        """
        super().__init__()  # 调用父类 BaseImageProcessor 的初始化方法

    def process(self, images: List[ImageInput], size: Optional[Union[int, Tuple[int, int]]] = None) -> List[BatchFeature]:
        """
        对输入的图像列表进行处理，包括调整大小、归一化和颜色量化。

        参数：
        - images: 输入的图像列表，每个元素是 ImageInput 类型的对象
        - size: 要调整的目标大小，可以是单个整数（将图像调整为正方形）或包含两个整数的元组（宽度，高度）

        返回：
        - 处理后的 BatchFeature 列表，每个 BatchFeature 包含处理后的特征和元数据
        """
        if size is not None:
            validate_preprocess_arguments(size)  # 验证预处理参数是否合法

        # 将图像转换为 NumPy 数组列表
        np_images = make_list_of_images(images)

        if size is not None:
            # 调整图像大小到指定尺寸
            np_images = resize(np_images, size, resampling=PILImageResampling.BICUBIC)

        # 将图像转换为适合通道维度格式
        np_images = to_channel_dimension_format(np_images, ChannelDimension.LAST)

        # 归一化图像像素值到 [0, 1] 范围内
        np_images = rescale(np_images)

        # 对归一化后的图像进行颜色量化，得到颜色簇序列
        clusters = get_size_dict(size)
        pixel_values = [color_quantize(im, clusters) for im in np_images]

        # 将处理后的特征和元数据封装为 BatchFeature 对象并返回
        batch_features = [
            BatchFeature(
                pixel_values=im_quantized.tolist(),  # 转换为列表形式的颜色簇序列
                metadata={"original_size": im.shape[:2]}  # 记录原始图像的尺寸
            )
            for im, im_quantized in zip(images, pixel_values)
        ]

        return batch_features  # 返回处理后的 BatchFeature 列表
    Args:
        clusters (`np.ndarray` or `List[List[int]]`, *optional*):
            The color clusters to use, of shape `(n_clusters, 3)` when color quantizing. Can be overriden by `clusters`
            in `preprocess`.
        do_resize (`bool`, *optional*, defaults to `True`):
            Whether to resize the image's dimensions to `(size["height"], size["width"])`. Can be overridden by
            `do_resize` in `preprocess`.
        size (`Dict[str, int]` *optional*, defaults to `{"height": 256, "width": 256}`):
            Size of the image after resizing. Can be overridden by `size` in `preprocess`.
        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`):
            Resampling filter to use if resizing the image. Can be overridden by `resample` in `preprocess`.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether to normalize the image pixel value to between [-1, 1]. Can be overridden by `do_normalize` in
            `preprocess`.
        do_color_quantize (`bool`, *optional*, defaults to `True`):
            Whether to color quantize the image. Can be overridden by `do_color_quantize` in `preprocess`.
    """

    # List of input names expected by the model
    model_input_names = ["pixel_values"]

    def __init__(
        self,
        # clusters is a first argument to maintain backwards compatibility with the old ImageGPTImageProcessor
        clusters: Optional[Union[List[List[int]], np.ndarray]] = None,
        do_resize: bool = True,
        size: Dict[str, int] = None,
        resample: PILImageResampling = PILImageResampling.BILINEAR,
        do_normalize: bool = True,
        do_color_quantize: bool = True,
        **kwargs,
    ) -> None:
        # Call the constructor of the superclass with additional keyword arguments
        super().__init__(**kwargs)
        # If size argument is None, set default size to {"height": 256, "width": 256}
        size = size if size is not None else {"height": 256, "width": 256}
        # Normalize size dictionary to ensure it contains both "height" and "width" keys
        size = get_size_dict(size)
        # Convert clusters to a numpy array if not None, else set it to None
        self.clusters = np.array(clusters) if clusters is not None else None
        # Initialize instance variables with provided or default values
        self.do_resize = do_resize
        self.size = size
        self.resample = resample
        self.do_normalize = do_normalize
        self.do_color_quantize = do_color_quantize
        # List of valid keys for the processor configuration
        self._valid_processor_keys = [
            "images",
            "do_resize",
            "size",
            "resample",
            "do_normalize",
            "do_color_quantize",
            "clusters",
            "return_tensors",
            "data_format",
            "input_data_format",
        ]

    # Copied from transformers.models.vit.image_processing_vit.ViTImageProcessor.resize
    def resize(
        self,
        image: np.ndarray,
        size: Dict[str, int],
        resample: PILImageResampling = PILImageResampling.BILINEAR,
        data_format: Optional[Union[str, ChannelDimension]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,
    ) -> np.ndarray:
        """
        Resize an image to `(size["height"], size["width"])`.

        Args:
            image (`np.ndarray`):
                Image to resize.
            size (`Dict[str, int]`):
                Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image.
            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
                `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BILINEAR`.
            data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the output image. If unset, the channel dimension format of the input
                image is used. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. If unset, the channel dimension format is inferred
                from the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.

        Returns:
            `np.ndarray`: The resized image.
        """
        # 获取规范化后的尺寸字典
        size = get_size_dict(size)
        # 检查尺寸字典中是否包含有效的"height"和"width"键
        if "height" not in size or "width" not in size:
            # 如果缺少任一键，抛出值错误异常
            raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. Got {size.keys()}")
        # 设置输出尺寸为(height, width)
        output_size = (size["height"], size["width"])
        # 调用resize函数进行图像调整大小操作，返回调整后的图像
        return resize(
            image,
            size=output_size,
            resample=resample,
            data_format=data_format,
            input_data_format=input_data_format,
            **kwargs,
        )
    def normalize_image(
        self,
        image: np.ndarray,
        data_format: Optional[Union[str, ChannelDimension]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None
    ) -> np.ndarray:
        """
        Normalizes an image's pixel values to between [-1, 1].

        Args:
            image (`np.ndarray`):
                Image to normalize.
            data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format of the image. If not provided, it will be the same as the input image.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format of the input image. If not provided, it will be inferred.
        
        Returns:
            np.ndarray:
                Normalized image with pixel values scaled to [-1, 1].
        """
        # Rescale the image pixel values to the range [-1, 1]
        image = rescale(image=image, scale=1 / 127.5, data_format=data_format, input_data_format=input_data_format)
        # Adjust the image values to fit the range [-1, 1] by subtracting 1
        image = image - 1
        # Return the normalized image
        return image

    def preprocess(
        self,
        images: ImageInput,
        do_resize: bool = None,
        size: Dict[str, int] = None,
        resample: PILImageResampling = None,
        do_normalize: bool = None,
        do_color_quantize: Optional[bool] = None,
        clusters: Optional[Union[List[List[int]], np.ndarray]] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        data_format: Optional[Union[str, ChannelDimension]] = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,
    ):
        """
        Preprocesses images based on specified operations and parameters.

        Args:
            images (`ImageInput`):
                Input images to be preprocessed.
            do_resize (`bool`, *optional*):
                Flag indicating whether to resize images.
            size (`Dict[str, int]`, *optional*):
                Dictionary specifying target sizes for resizing.
            resample (`PILImageResampling`, *optional*):
                Resampling method for resizing images.
            do_normalize (`bool`, *optional*):
                Flag indicating whether to normalize images.
            do_color_quantize (`Optional[bool]`, *optional*):
                Flag indicating whether to perform color quantization.
            clusters (`Optional[Union[List[List[int]], np.ndarray]]`, *optional*):
                Clusters for color quantization.
            return_tensors (`Optional[Union[str, TensorType]]`, *optional*):
                Desired format for output tensors.
            data_format (`Optional[Union[str, ChannelDimension]]`, *optional*):
                Format of the image data channels.
            input_data_format (`Optional[Union[str, ChannelDimension]]`, *optional*):
                Format of the input image data channels.

        Returns:
            Preprocessed images according to the specified operations and parameters.
        """

`.\models\imagegpt\modeling_imagegpt.py`

# coding=utf-8
# Copyright 2021 The OpenAI Team Authors and HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch OpenAI ImageGPT model."""

import math  # 导入数学函数库
import os  # 导入操作系统功能
import warnings  # 导入警告处理模块
from typing import Any, Optional, Tuple, Union  # 导入类型提示模块

import torch  # 导入PyTorch库
import torch.utils.checkpoint  # 导入PyTorch的checkpoint模块
from torch import nn  # 导入PyTorch的神经网络模块
from torch.cuda.amp import autocast  # 导入PyTorch的混合精度训练模块
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss  # 导入PyTorch的损失函数模块

from ...activations import ACT2FN  # 导入激活函数映射
from ...modeling_outputs import (  # 导入模型输出相关模块
    BaseModelOutputWithPastAndCrossAttentions,
    CausalLMOutputWithCrossAttentions,
    SequenceClassifierOutputWithPast,
)
from ...modeling_utils import PreTrainedModel  # 导入预训练模型基类
from ...pytorch_utils import Conv1D, find_pruneable_heads_and_indices, prune_conv1d_layer  # 导入PyTorch工具函数
from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings  # 导入工具函数和日志模块
from .configuration_imagegpt import ImageGPTConfig  # 导入ImageGPT模型的配置类


logger = logging.get_logger(__name__)  # 获取当前模块的日志记录器

_CHECKPOINT_FOR_DOC = "openai/imagegpt-small"  # ImageGPT模型的checkpoint位置，用于文档说明
_CONFIG_FOR_DOC = "ImageGPTConfig"  # ImageGPT模型的配置类，用于文档说明

IMAGEGPT_PRETRAINED_MODEL_ARCHIVE_LIST = [  # 预训练的ImageGPT模型列表
    "openai/imagegpt-small",
    "openai/imagegpt-medium",
    "openai/imagegpt-large",
    # See all Image GPT models at https://huggingface.co/models?filter=imagegpt
]


def load_tf_weights_in_imagegpt(model, config, imagegpt_checkpoint_path):
    """
    Load tf checkpoints in a pytorch model
    """
    try:
        import re  # 导入正则表达式模块
        import tensorflow as tf  # 导入TensorFlow库
    except ImportError:
        logger.error(
            "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
            "https://www.tensorflow.org/install/ for installation instructions."
        )
        raise
    tf_path = os.path.abspath(imagegpt_checkpoint_path)  # 获取TensorFlow模型的绝对路径
    logger.info("Converting TensorFlow checkpoint from {}".format(tf_path))  # 记录日志，指示正在转换TensorFlow的checkpoint
    # Load weights from TF model
    init_vars = tf.train.list_variables(tf_path)  # 获取TensorFlow模型中的变量列表
    names = []
    arrays = []

    for name, shape in init_vars:
        logger.info("Loading TF weight {} with shape {}".format(name, shape))  # 记录日志，指示正在加载TensorFlow模型的权重和形状
        array = tf.train.load_variable(tf_path, name)  # 加载TensorFlow模型中的变量
        names.append(name)
        arrays.append(array.squeeze())  # 将加载的变量压缩并添加到数组中

    return model  # 返回加载了TensorFlow权重的PyTorch模型


class ImageGPTLayerNorm(nn.Module):
    def __init__(self, hidden_size: Tuple[int], eps: float = 1e-5):
        super().__init__()
        self.eps = eps  # 设置层标准化的epsilon值
        self.weight = nn.Parameter(torch.Tensor(hidden_size))  # 初始化标准化的权重参数
    # 定义一个方法 `forward`，接受一个 torch.Tensor 类型的参数 `tensor`，返回一个元组
    def forward(self, tensor: torch.Tensor) -> tuple:
        # input is not mean centered
        # 返回值是输入张量 `tensor` 除以标准差，然后乘以权重数据 `self.weight.data[..., :]`，以实现标准化处理
        return (
            tensor
            / torch.sqrt(torch.mean(torch.square(tensor), axis=-1, keepdim=True) + self.eps)
            * self.weight.data[..., :]
        )
# 定义一个名为 ImageGPTAttention 的类，继承自 nn.Module
class ImageGPTAttention(nn.Module):
    # 初始化函数，接受配置参数 config 和两个可选参数 is_cross_attention 和 layer_idx
    def __init__(self, config, is_cross_attention: Optional[bool] = False, layer_idx: Optional[int] = None):
        # 调用父类的初始化函数
        super().__init__()

        # 获取最大位置嵌入数
        max_positions = config.max_position_embeddings
        # 注册一个缓冲区 "bias"，包含一个下三角形状的张量，用于自注意力机制
        self.register_buffer(
            "bias",
            torch.tril(torch.ones((max_positions, max_positions), dtype=torch.bool)).view(
                1, 1, max_positions, max_positions
            ),
            persistent=False,
        )
        # 注册一个缓冲区 "masked_bias"，包含一个很大的负数，用于掩码注意力
        self.register_buffer("masked_bias", torch.tensor(-1e4), persistent=False)

        # 获取嵌入维度
        self.embed_dim = config.hidden_size
        # 获取注意力头数和每个头的维度
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        self.split_size = self.embed_dim
        # 检查 embed_dim 必须能够被 num_heads 整除
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"`embed_dim` must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )

        # 是否对注意力权重进行缩放
        self.scale_attn_weights = config.scale_attn_weights
        # 是否是跨注意力机制
        self.is_cross_attention = is_cross_attention

        # 层级注意力缩放、重排序和上投
        self.scale_attn_by_inverse_layer_idx = config.scale_attn_by_inverse_layer_idx
        self.layer_idx = layer_idx
        self.reorder_and_upcast_attn = config.reorder_and_upcast_attn

        # 如果是跨注意力机制，定义两个卷积层
        if self.is_cross_attention:
            self.c_attn = Conv1D(2 * self.embed_dim, self.embed_dim)
            self.q_attn = Conv1D(self.embed_dim, self.embed_dim)
        else:
            # 如果不是跨注意力机制，定义一个卷积层
            self.c_attn = Conv1D(3 * self.embed_dim, self.embed_dim)
        # 定义一个卷积层 c_proj
        self.c_proj = Conv1D(self.embed_dim, self.embed_dim)

        # 注意力和残差的 Dropout 层
        self.attn_dropout = nn.Dropout(config.attn_pdrop)
        self.resid_dropout = nn.Dropout(config.resid_pdrop)

        # 初始化被修剪的注意力头集合
        self.pruned_heads = set()

    # 函数 prune_heads，用于修剪不需要的注意力头
    def prune_heads(self, heads):
        # 如果 heads 集合为空，则直接返回
        if len(heads) == 0:
            return
        # 调用 find_pruneable_heads_and_indices 函数找到可修剪的头部和索引
        heads, index = find_pruneable_heads_and_indices(heads, self.num_heads, self.head_dim, self.pruned_heads)
        # 构造索引张量
        index_attn = torch.cat([index, index + self.split_size, index + (2 * self.split_size)])

        # 在卷积层上进行修剪
        self.c_attn = prune_conv1d_layer(self.c_attn, index_attn, dim=1)
        self.c_proj = prune_conv1d_layer(self.c_proj, index, dim=0)

        # 更新超参数
        self.split_size = (self.split_size // self.num_heads) * (self.num_heads - len(heads))
        self.num_heads = self.num_heads - len(heads)
        self.pruned_heads = self.pruned_heads.union(heads)
    # 定义注意力函数，接受查询(query)、键(key)、值(value)以及注意力掩码和头掩码作为输入
    def _attn(self, query, key, value, attention_mask=None, head_mask=None):
        # 计算注意力权重，使用query与key的矩阵乘积
        attn_weights = torch.matmul(query, key.transpose(-1, -2))

        # 如果开启了注意力权重的缩放
        if self.scale_attn_weights:
            attn_weights = attn_weights / (float(value.size(-1)) ** 0.5)

        # 如果开启了逐层注意力权重的缩放
        if self.scale_attn_by_inverse_layer_idx:
            attn_weights = attn_weights / float(self.layer_idx + 1)

        # 如果不是交叉注意力模式
        if not self.is_cross_attention:
            # 如果只有“正常”注意力层实现了因果遮罩
            query_length, key_length = query.size(-2), key.size(-2)
            # 创建因果遮罩，遮罩值设置为极小值以确保被遮罩区域的权重为负无穷大
            causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length]
            mask_value = torch.finfo(attn_weights.dtype).min
            # 将遮罩值转换为与attn_weights相同的数据类型，并放置在相同的设备上
            mask_value = torch.tensor(mask_value, dtype=attn_weights.dtype).to(attn_weights.device)
            # 使用遮罩更新注意力权重，被遮罩区域用mask_value填充
            attn_weights = torch.where(causal_mask, attn_weights, mask_value)

        # 如果有注意力掩码，则应用该掩码
        if attention_mask is not None:
            attn_weights = attn_weights + attention_mask

        # 对注意力权重进行softmax归一化
        attn_weights = nn.Softmax(dim=-1)(attn_weights)

        # 将注意力权重降回到值(value)的数据类型（如果在混合精度下使用），否则无操作
        attn_weights = attn_weights.type(value.dtype)

        # 对注意力权重应用dropout操作
        attn_weights = self.attn_dropout(attn_weights)

        # 如果指定了头掩码，则应用头掩码
        if head_mask is not None:
            attn_weights = attn_weights * head_mask

        # 计算注意力输出，使用softmax归一化后的注意力权重与值(value)的矩阵乘积
        attn_output = torch.matmul(attn_weights, value)

        # 返回注意力输出和注意力权重
        return attn_output, attn_weights
    def _upcast_and_reordered_attn(self, query, key, value, attention_mask=None, head_mask=None):
        # Use `torch.baddbmm` (a bit more efficient w/ alpha param for scaling -- from Megatron-LM)

        # Extract batch size, number of heads, query sequence length, and dk (last dimension of query)
        bsz, num_heads, q_seq_len, dk = query.size()

        # Extract key sequence length
        _, _, k_seq_len, _ = key.size()

        # Preallocate attention weights tensor for `baddbmm`
        attn_weights = torch.empty(bsz * num_heads, q_seq_len, k_seq_len, dtype=torch.float32, device=query.device)

        # Compute scale factor based on configuration settings
        scale_factor = 1.0
        if self.scale_attn_weights:
            scale_factor /= float(value.size(-1)) ** 0.5

        if self.scale_attn_by_inverse_layer_idx:
            scale_factor /= float(self.layer_idx + 1)

        # Upcast (turn off autocast) and reorder (scale K by 1 / root(dk))
        with autocast(enabled=False):
            # Reshape query and key tensors for matrix multiplication
            q = query.reshape(-1, q_seq_len, dk)
            k = key.transpose(-1, -2).reshape(-1, dk, k_seq_len)
            
            # Perform batched matrix multiplication with `baddbmm`, scaling with alpha and adding to attn_weights
            attn_weights = torch.baddbmm(attn_weights, q.float(), k.float(), beta=0, alpha=scale_factor)
            
            # Reshape attn_weights back to original shape (batch size, num heads, query seq length, key seq length)
            attn_weights = attn_weights.reshape(bsz, num_heads, q_seq_len, k_seq_len)

        if not self.is_cross_attention:
            # Apply causal mask for "normal" attention layer
            query_length, key_length = query.size(-2), key.size(-2)
            
            # Generate causal mask tensor from self.bias for current query and key lengths
            causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length]
            
            # Set mask value to minimum float value of attn_weights' dtype
            mask_value = torch.finfo(attn_weights.dtype).min
            
            # Create mask_value tensor of attn_weights' dtype and device
            mask_value = torch.tensor(mask_value, dtype=attn_weights.dtype).to(attn_weights.device)
            
            # Apply causal_mask to attn_weights, replacing values where causal_mask is False with mask_value
            attn_weights = torch.where(causal_mask, attn_weights, mask_value)

        if attention_mask is not None:
            # Apply attention_mask to attn_weights
            attn_weights = attn_weights + attention_mask

        # Apply softmax along the last dimension of attn_weights
        attn_weights = nn.Softmax(dim=-1)(attn_weights)

        # Ensure attn_weights is of type torch.float32
        if attn_weights.dtype != torch.float32:
            raise RuntimeError("Error with upcasting, attn_weights does not have dtype torch.float32")
        
        # Convert attn_weights to type value.dtype
        attn_weights = attn_weights.type(value.dtype)
        
        # Apply attention dropout
        attn_weights = self.attn_dropout(attn_weights)

        # Apply head_mask if provided
        if head_mask is not None:
            attn_weights = attn_weights * head_mask

        # Compute attention output by matrix multiplication of attn_weights and value tensors
        attn_output = torch.matmul(attn_weights, value)

        return attn_output, attn_weights
    def _merge_heads(self, tensor, num_heads, attn_head_size):
        """
        Merges attn_head_size dim and num_attn_heads dim into hidden_size
        """
        # Permute dimensions to merge attn_head_size and num_heads dimensions
        tensor = tensor.permute(0, 2, 1, 3).contiguous()
        # Calculate new shape by merging num_heads and attn_head_size into the last dimension
        new_shape = tensor.size()[:-2] + (num_heads * attn_head_size,)
        return tensor.view(new_shape)

    def forward(
        self,
        hidden_states: torch.Tensor,
        layer_past: Optional[bool] = None,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = False,
        output_attentions: Optional[bool] = False,
    ) -> tuple:
        if encoder_hidden_states is not None:
            if not hasattr(self, "q_attn"):
                # Raise an error if `q_attn` weights are not defined for cross-attention usage
                raise ValueError(
                    "If class is used as cross attention, the weights `q_attn` have to be defined. "
                    "Please make sure to instantiate class with `ImageGPTAttention(..., is_cross_attention=True)`."
                )

            # Compute query using self.q_attn module
            query = self.q_attn(hidden_states)
            # Compute key and value from encoder_hidden_states using self.c_attn module and split them
            key, value = self.c_attn(encoder_hidden_states).split(self.split_size, dim=2)
            # Use encoder_attention_mask for attention masking
            attention_mask = encoder_attention_mask
        else:
            # Compute query, key, and value from hidden_states using self.c_attn module and split them
            query, key, value = self.c_attn(hidden_states).split(self.split_size, dim=2)

        # Split query, key, and value tensors into multiple heads
        query = self._split_heads(query, self.num_heads, self.head_dim)
        key = self._split_heads(key, self.num_heads, self.head_dim)
        value = self._split_heads(value, self.num_heads, self.head_dim)

        # Concatenate past_key with key and past_value with value if layer_past is not None
        if layer_past is not None:
            past_key, past_value = layer_past
            key = torch.cat((past_key, key), dim=-2)
            value = torch.cat((past_value, value), dim=-2)

        # Store key and value tensors in present if use_cache is True
        if use_cache is True:
            present = (key, value)
        else:
            present = None

        # Perform attention calculation based on self.reorder_and_upcast_attn flag
        if self.reorder_and_upcast_attn:
            attn_output, attn_weights = self._upcast_and_reordered_attn(query, key, value, attention_mask, head_mask)
        else:
            attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask)

        # Merge heads back into the hidden_size dimension
        attn_output = self._merge_heads(attn_output, self.num_heads, self.head_dim)
        # Project merged heads using self.c_proj module
        attn_output = self.c_proj(attn_output)
        # Apply residual dropout
        attn_output = self.resid_dropout(attn_output)

        # Prepare outputs tuple including attn_output and present
        outputs = (attn_output, present)
        # Include attention weights in outputs if output_attentions is True
        if output_attentions:
            outputs += (attn_weights,)

        return outputs  # a, present, (attentions)
# 定义一个基于图像的 GPT 模型的单个 MLP 层
class ImageGPTMLP(nn.Module):
    def __init__(self, intermediate_size, config):
        super().__init__()
        embed_dim = config.hidden_size
        # 定义一个一维卷积层，输入维度是 hidden_size，输出维度是 intermediate_size
        self.c_fc = Conv1D(intermediate_size, embed_dim)
        # 定义一个一维卷积层，输入维度是 intermediate_size，输出维度是 hidden_size
        self.c_proj = Conv1D(embed_dim, intermediate_size)
        # 激活函数根据配置选择，将激活函数映射到 ACT2FN 中对应的函数
        self.act = ACT2FN[config.activation_function]
        # Dropout 层，以 config 中配置的概率进行随机丢弃
        self.dropout = nn.Dropout(config.resid_pdrop)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # 先通过 c_fc 卷积层处理隐藏状态
        hidden_states = self.c_fc(hidden_states)
        # 应用激活函数
        hidden_states = self.act(hidden_states)
        # 再通过 c_proj 卷积层处理输出
        hidden_states = self.c_proj(hidden_states)
        # 应用 Dropout 进行随机丢弃
        hidden_states = self.dropout(hidden_states)
        # 返回处理后的隐藏状态
        return hidden_states


# 定义一个基于图像的 GPT 模型的单个 Block
class ImageGPTBlock(nn.Module):
    def __init__(self, config, layer_idx=None):
        super().__init__()
        hidden_size = config.hidden_size
        # 如果没有配置内部维度，则使用默认的 4 倍 hidden_size
        inner_dim = config.n_inner if config.n_inner is not None else 4 * hidden_size

        # 图像 GPT 特有的 LayerNorm 层，用于归一化隐藏状态
        self.ln_1 = ImageGPTLayerNorm(hidden_size, eps=config.layer_norm_epsilon)
        # 图像 GPT 特有的 Attention 层
        self.attn = ImageGPTAttention(config, layer_idx=layer_idx)
        # 再次应用 LayerNorm 层
        self.ln_2 = ImageGPTLayerNorm(hidden_size, eps=config.layer_norm_epsilon)

        # 如果配置中包含交叉注意力，添加交叉注意力层和 LayerNorm 层
        if config.add_cross_attention:
            self.crossattention = ImageGPTAttention(config, is_cross_attention=True, layer_idx=layer_idx)
            self.ln_cross_attn = ImageGPTLayerNorm(hidden_size, eps=config.layer_norm_epsilon)

        # MLP 层，用于处理内部维度
        self.mlp = ImageGPTMLP(inner_dim, config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        layer_past: Optional[bool] = None,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = False,
        output_attentions: Optional[bool] = False,
    ) -> tuple:
        residual = hidden_states  # 保存输入的隐藏状态作为残差连接的一部分
        hidden_states = self.ln_1(hidden_states)  # 应用 LayerNorm 到隐藏状态
        attn_outputs = self.attn(
            hidden_states,
            layer_past=layer_past,
            attention_mask=attention_mask,
            head_mask=head_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
        )
        attn_output = attn_outputs[0]  # 提取注意力机制的输出
        outputs = attn_outputs[1:]  # 提取其他输出，如 present, (attentions)
        # 残差连接
        hidden_states = attn_output + residual

        if encoder_hidden_states is not None:
            # 为交叉注意力添加一个自注意力块
            if not hasattr(self, "crossattention"):
                raise ValueError(
                    f"If `encoder_hidden_states` are passed, {self} has to be instantiated with "
                    "cross-attention layers by setting `config.add_cross_attention=True`"
                )
            residual = hidden_states  # 保存交叉注意力前的隐藏状态作为残差连接的一部分
            hidden_states = self.ln_cross_attn(hidden_states)  # 应用 LayerNorm 到隐藏状态
            cross_attn_outputs = self.crossattention(
                hidden_states,
                attention_mask=attention_mask,
                head_mask=head_mask,
                encoder_hidden_states=encoder_hidden_states,
                encoder_attention_mask=encoder_attention_mask,
                output_attentions=output_attentions,
            )
            attn_output = cross_attn_outputs[0]  # 提取交叉注意力机制的输出
            # 残差连接
            hidden_states = residual + attn_output
            outputs = outputs + cross_attn_outputs[2:]  # 如果输出注意力权重，则添加交叉注意力权重

        residual = hidden_states  # 保存输入的隐藏状态作为残差连接的一部分
        hidden_states = self.ln_2(hidden_states)  # 应用 LayerNorm 到隐藏状态
        feed_forward_hidden_states = self.mlp(hidden_states)  # 应用 MLP 层到隐藏状态
        # 残差连接
        hidden_states = residual + feed_forward_hidden_states

        outputs = (hidden_states,) + (outputs if use_cache else outputs[1:])  # 构建输出元组，包括隐藏状态和其他输出

        return outputs  # 返回隐藏状态、present、(attentions, cross_attentions)
class ImageGPTPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = ImageGPTConfig  # 指定配置类为 ImageGPTConfig，用于模型配置参数
    load_tf_weights = load_tf_weights_in_imagegpt  # 指定加载 TensorFlow 权重的函数
    base_model_prefix = "transformer"  # 基础模型前缀，用于命名模型的主要部分
    main_input_name = "input_ids"  # 主要输入名称，通常用于输入模型的主要特征
    supports_gradient_checkpointing = True  # 支持梯度检查点，用于加速训练和减少内存消耗

    def __init__(self, *inputs, **kwargs):
        super().__init__(*inputs, **kwargs)  # 调用父类 PreTrainedModel 的初始化方法

    def _init_weights(self, module):
        """Initialize the weights."""
        if isinstance(module, (nn.Linear, Conv1D)):
            # 初始化线性层和一维卷积层的权重
            # 与 TensorFlow 版本稍有不同，后者使用截断正态分布进行初始化
            # 参考 https://github.com/pytorch/pytorch/pull/5617
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            # 初始化嵌入层的权重
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, ImageGPTLayerNorm):
            # 初始化自定义的 ImageGPTLayerNorm 层的权重
            module.weight.data.fill_(1.0)

        # 重新初始化选定的权重，遵循 OpenAI GPT-2 论文的方案：
        #   > 通过修改的初始化方式，考虑到模型深度的残差路径累积。在初始化时通过因子 1/√N 缩放残差层的权重，其中 N 是残差层的数量。
        #   >   -- GPT-2 :: https://openai.com/blog/better-language-models/
        #
        # 参考（Megatron-LM）：https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py
        for name, p in module.named_parameters():
            if "c_proj" in name and "weight" in name:
                # 特殊的缩放初始化 --> 每个 Transformer 块有 2 个 Layer Norms
                p.data.normal_(mean=0.0, std=(self.config.initializer_range / math.sqrt(2 * self.config.n_layer)))


IMAGEGPT_START_DOCSTRING = r"""

    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`ImageGPTConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
# 定义一个文档字符串常量，用于描述ImageGPT模型的输入格式和功能
IMAGEGPT_INPUTS_DOCSTRING = r"""
"""


# 使用装饰器为类添加文档字符串，在ImageGPT模型上方输出原始隐藏状态，没有特定的输出头
@add_start_docstrings(
    "The bare ImageGPT Model transformer outputting raw hidden-states without any specific head on top.",
    IMAGEGPT_START_DOCSTRING,
)
class ImageGPTModel(ImageGPTPreTrainedModel):
    def __init__(self, config: ImageGPTConfig):
        super().__init__(config)

        self.embed_dim = config.hidden_size

        # 输入词嵌入和位置嵌入
        self.wte = nn.Embedding(config.vocab_size, self.embed_dim)
        self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim)

        self.drop = nn.Dropout(config.embd_pdrop)
        # 多层ImageGPT块的堆叠
        self.h = nn.ModuleList([ImageGPTBlock(config, layer_idx=i) for i in range(config.num_hidden_layers)])
        # 最后一层的LayerNorm
        self.ln_f = ImageGPTLayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)

        # 模型并行设置
        self.model_parallel = False
        self.device_map = None
        self.gradient_checkpointing = False
        # 初始化权重并应用最终处理
        self.post_init()

    def get_input_embeddings(self):
        return self.wte

    def set_input_embeddings(self, new_embeddings):
        self.wte = new_embeddings

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
        """
        for layer, heads in heads_to_prune.items():
            self.h[layer].attn.prune_heads(heads)

    # 使用装饰器添加文档字符串，在模型前向传播时输出ImageGPT模型的输入格式和功能
    @add_start_docstrings_to_model_forward(IMAGEGPT_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BaseModelOutputWithPastAndCrossAttentions, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs: Any,
@add_start_docstrings(
    """
    The ImageGPT Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    """,
    IMAGEGPT_START_DOCSTRING,
)
class ImageGPTForCausalImageModeling(ImageGPTPreTrainedModel):
    _tied_weights_keys = ["lm_head.weight"]
    def __init__(self, config: ImageGPTConfig):
        super().__init__(config)
        # 使用给定的配置初始化父类（ImageGPTConfig），调用其构造函数
        self.transformer = ImageGPTModel(config)
        # 根据配置创建图像GPT模型的转换器部分
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size - 1, bias=False)
        # 创建一个线性层作为语言模型头部，设置输入和输出维度

        # Model parallel
        self.model_parallel = False
        # 设定模型并行计算为假，表示未启用模型的并行计算
        self.device_map = None
        # 设定设备映射为None，表示设备映射未定义

        # Initialize weights and apply final processing
        self.post_init()
        # 调用自定义方法post_init()，用于初始化权重并进行最终处理

    def get_output_embeddings(self):
        return self.lm_head
        # 返回语言模型头部（lm_head）作为输出的嵌入层

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings
        # 设置新的嵌入层作为语言模型头部（lm_head）

    def prepare_inputs_for_generation(self, input_ids: torch.Tensor, past_key_values: Optional[bool] = None, **kwargs):
        token_type_ids = kwargs.get("token_type_ids", None)
        # 从kwargs中获取token_type_ids参数，如果不存在则设为None

        # Omit tokens covered by past_key_values
        if past_key_values:
            past_length = past_key_values[0][0].shape[2]
            # 获取past_key_values的长度信息

            # Some generation methods already pass only the last input ID
            if input_ids.shape[1] > past_length:
                remove_prefix_length = past_length
            else:
                # Default to old behavior: keep only final ID
                remove_prefix_length = input_ids.shape[1] - 1
            # 根据past_key_values来决定保留的输入ID的长度

            input_ids = input_ids[:, remove_prefix_length:]
            # 根据remove_prefix_length截取输入ID

            if token_type_ids is not None:
                token_type_ids = token_type_ids[:, -input_ids.shape[1] :]
            # 如果token_type_ids存在，则截取相应长度的token_type_ids

        attention_mask = kwargs.get("attention_mask", None)
        position_ids = kwargs.get("position_ids", None)

        if attention_mask is not None and position_ids is None:
            # create position_ids on the fly for batch generation
            position_ids = attention_mask.long().cumsum(-1) - 1
            # 根据注意力掩码在批处理生成时动态创建position_ids
            position_ids.masked_fill_(attention_mask == 0, 1)
            # 根据注意力掩码填充position_ids
            if past_key_values:
                position_ids = position_ids[:, -input_ids.shape[1] :]
            # 如果存在past_key_values，则截取相应长度的position_ids
        else:
            position_ids = None
        # 否则设置position_ids为None

        return {
            "input_ids": input_ids,
            "past_key_values": past_key_values,
            "use_cache": kwargs.get("use_cache"),
            "position_ids": position_ids,
            "attention_mask": attention_mask,
            "token_type_ids": token_type_ids,
        }
        # 返回准备好的输入字典，包含input_ids、past_key_values、use_cache、position_ids、attention_mask和token_type_ids
    # 定义模型的前向传播方法，接受多个可选的输入参数和配置选项
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,  # 输入的词索引序列
        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,  # 存储循环生成过程中的键值对
        attention_mask: Optional[torch.Tensor] = None,  # 注意力掩码，用于指定哪些位置需要进行注意力计算
        token_type_ids: Optional[torch.Tensor] = None,  # 区分不同句子或段落的类型信息
        position_ids: Optional[torch.Tensor] = None,  # 指定输入序列中每个词的位置信息
        head_mask: Optional[torch.Tensor] = None,  # 控制多头注意力中每个头的掩码
        inputs_embeds: Optional[torch.Tensor] = None,  # 直接提供的嵌入输入
        encoder_hidden_states: Optional[torch.Tensor] = None,  # 编码器的隐藏状态，用于注意力机制
        encoder_attention_mask: Optional[torch.Tensor] = None,  # 编码器的注意力掩码
        labels: Optional[torch.Tensor] = None,  # 用于监督学习的标签
        use_cache: Optional[bool] = None,  # 是否使用缓存机制
        output_attentions: Optional[bool] = None,  # 是否输出注意力权重
        output_hidden_states: Optional[bool] = None,  # 是否输出隐藏状态
        return_dict: Optional[bool] = None,  # 是否以字典形式返回结果
        **kwargs: Any,  # 其他未指定的关键字参数
    ):
        pass  # 此处为方法定义的占位符，实际实现未提供

    @staticmethod
    def _reorder_cache(
        past_key_values: Tuple[Tuple[torch.Tensor]],  # 存储着多层循环生成过程中的键值对
        beam_idx: torch.Tensor  # 当前束搜索的索引，用于重新排序缓存
    ) -> Tuple[Tuple[torch.Tensor]]:
        """
        用于在调用 [`~PreTrainedModel.beam_search`] 或 [`~PreTrainedModel.beam_sample`] 时重新排序
        `past_key_values` 缓存的函数。这是为了在每个生成步骤中将 `past_key_values` 与正确的 `beam_idx` 匹配。
        """
        return tuple(
            tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past)
            for layer_past in past_key_values
        )
# 给 ImageGPT 模型添加一个图片分类的头部，使用线性层进行分类
# 使用 average-pooling 对隐藏状态进行处理以进行分类
@add_start_docstrings(
    """
    The ImageGPT Model transformer with an image classification head on top (linear layer).
    [`ImageGPTForImageClassification`] average-pools the hidden states in order to do the classification.
    """,
    IMAGEGPT_START_DOCSTRING,
)
class ImageGPTForImageClassification(ImageGPTPreTrainedModel):
    def __init__(self, config: ImageGPTConfig):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.transformer = ImageGPTModel(config)
        # 创建一个线性层，输入维度为 config.n_embd，输出维度为 num_labels，无偏置项
        self.score = nn.Linear(config.n_embd, self.num_labels, bias=False)

        # 初始化权重并应用最终处理
        self.post_init()

    @add_start_docstrings_to_model_forward(IMAGEGPT_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=SequenceClassifierOutputWithPast, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs: Any,

`.\models\imagegpt\init.py`

# 版权声明和许可证信息，指明代码版权归HuggingFace团队所有，使用Apache License, Version 2.0许可证
#
# from typing import TYPE_CHECKING 导入类型检查相关模块

# 从 utils 模块中导入 OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available 函数
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available

# 定义模块导入结构的字典 _import_structure
_import_structure = {
    "configuration_imagegpt": ["IMAGEGPT_PRETRAINED_CONFIG_ARCHIVE_MAP", "ImageGPTConfig", "ImageGPTOnnxConfig"]
}

# 尝试检测视觉处理是否可用，若不可用则引发 OptionalDependencyNotAvailable 异常
try:
    if not is_vision_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # 若视觉处理可用，则向 _import_structure 添加 feature_extraction_imagegpt 和 image_processing_imagegpt 模块及其对应的函数列表
    _import_structure["feature_extraction_imagegpt"] = ["ImageGPTFeatureExtractor"]
    _import_structure["image_processing_imagegpt"] = ["ImageGPTImageProcessor"]

# 尝试检测是否可用 Torch 库，若不可用则引发 OptionalDependencyNotAvailable 异常
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # 若 Torch 可用，则向 _import_structure 添加 modeling_imagegpt 模块及其对应的函数列表
    _import_structure["modeling_imagegpt"] = [
        "IMAGEGPT_PRETRAINED_MODEL_ARCHIVE_LIST",
        "ImageGPTForCausalImageModeling",
        "ImageGPTForImageClassification",
        "ImageGPTModel",
        "ImageGPTPreTrainedModel",
        "load_tf_weights_in_imagegpt",
    ]

# 如果当前是类型检查模式
if TYPE_CHECKING:
    # 从 configuration_imagegpt 模块导入 IMAGEGPT_PRETRAINED_CONFIG_ARCHIVE_MAP, ImageGPTConfig, ImageGPTOnnxConfig 类和常量
    from .configuration_imagegpt import IMAGEGPT_PRETRAINED_CONFIG_ARCHIVE_MAP, ImageGPTConfig, ImageGPTOnnxConfig

    try:
        if not is_vision_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # 从 feature_extraction_imagegpt 模块导入 ImageGPTFeatureExtractor 类
        from .feature_extraction_imagegpt import ImageGPTFeatureExtractor
        # 从 image_processing_imagegpt 模块导入 ImageGPTImageProcessor 类

        from .image_processing_imagegpt import ImageGPTImageProcessor

    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # 从 modeling_imagegpt 模块导入相关类和函数
        from .modeling_imagegpt import (
            IMAGEGPT_PRETRAINED_MODEL_ARCHIVE_LIST,
            ImageGPTForCausalImageModeling,
            ImageGPTForImageClassification,
            ImageGPTModel,
            ImageGPTPreTrainedModel,
            load_tf_weights_in_imagegpt,
        )

# 如果不是类型检查模式，则动态设置当前模块为懒加载模块，使用 _LazyModule 包装，并指定模块导入结构 _import_structure
else:
    import sys
    # 将当前模块设置为 _LazyModule 类的实例，以支持延迟加载
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)

`.\models\informer\configuration_informer.py`

# coding=utf-8
# 定义模型配置的文件，声明版权信息和许可证信息

"""Informer model configuration"""
# 引入必要的库和模块
from typing import List, Optional, Union

from ...configuration_utils import PretrainedConfig  # 导入预训练配置基类
from ...utils import logging  # 导入日志记录工具


logger = logging.get_logger(__name__)  # 获取当前模块的日志记录器对象

# 预训练配置文件映射字典，包含预训练模型的名称和对应的配置文件URL
INFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "huggingface/informer-tourism-monthly": (
        "https://huggingface.co/huggingface/informer-tourism-monthly/resolve/main/config.json"
    ),
    # 查看所有 Infromer 模型的信息，地址：https://huggingface.co/models?filter=informer
}


class InformerConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of an [`InformerModel`]. It is used to instantiate an
    Informer model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the Informer
    [huggingface/informer-tourism-monthly](https://huggingface.co/huggingface/informer-tourism-monthly) architecture.

    Configuration objects inherit from [`PretrainedConfig`] can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Example:

    ```
    >>> from transformers import InformerConfig, InformerModel

    >>> # Initializing an Informer configuration with 12 time steps for prediction
    >>> configuration = InformerConfig(prediction_length=12)

    >>> # Randomly initializing a model (with random weights) from the configuration
    >>> model = InformerModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    """
    # 模型类型标识
    model_type = "informer"
    # 属性映射字典，将配置参数转换为模型需要的参数名
    attribute_map = {
        "hidden_size": "d_model",
        "num_attention_heads": "encoder_attention_heads",
        "num_hidden_layers": "encoder_layers",
    }
    # 定义一个初始化方法，用于初始化模型的各种参数和设置
    def __init__(
        self,
        # 预测长度，可选参数，用于指定模型的预测长度
        prediction_length: Optional[int] = None,
        # 上下文长度，可选参数，用于指定模型的上下文长度
        context_length: Optional[int] = None,
        # 分布输出类型，字符串参数，默认为学生 t 分布
        distribution_output: str = "student_t",
        # 损失函数类型，字符串参数，默认为负对数似然损失
        loss: str = "nll",
        # 输入尺寸，整数参数，默认为 1
        input_size: int = 1,
        # 滞后序列，整数列表参数，可选参数，用于指定滞后序列
        lags_sequence: List[int] = None,
        # 缩放方法，字符串或布尔类型参数，默认为均值
        scaling: Optional[Union[str, bool]] = "mean",
        # 动态实数特征数，整数参数，默认为 0
        num_dynamic_real_features: int = 0,
        # 静态实数特征数，整数参数，默认为 0
        num_static_real_features: int = 0,
        # 静态分类特征数，整数参数，默认为 0
        num_static_categorical_features: int = 0,
        # 时间特征数，整数参数，默认为 0
        num_time_features: int = 0,
        # 基数，整数列表参数，可选参数，用于指定分类特征的基数
        cardinality: Optional[List[int]] = None,
        # 嵌入维度，整数列表参数，可选参数，用于指定嵌入特征的维度
        embedding_dimension: Optional[List[int]] = None,
        # 编码器模型的维度，整数参数，默认为 64
        d_model: int = 64,
        # 编码器前馈神经网络的维度，整数参数，默认为 32
        encoder_ffn_dim: int = 32,
        # 解码器前馈神经网络的维度，整数参数，默认为 32
        decoder_ffn_dim: int = 32,
        # 编码器注意力头数，整数参数，默认为 2
        encoder_attention_heads: int = 2,
        # 解码器注意力头数，整数参数，默认为 2
        decoder_attention_heads: int = 2,
        # 编码器层数，整数参数，默认为 2
        encoder_layers: int = 2,
        # 解码器层数，整数参数，默认为 2
        decoder_layers: int = 2,
        # 是否是编码器-解码器结构，布尔类型参数，默认为 True
        is_encoder_decoder: bool = True,
        # 激活函数类型，字符串参数，默认为 GELU
        activation_function: str = "gelu",
        # 普通 dropout 概率，浮点数参数，默认为 0.05
        dropout: float = 0.05,
        # 编码器层 dropout 概率，浮点数参数，默认为 0.1
        encoder_layerdrop: float = 0.1,
        # 解码器层 dropout 概率，浮点数参数，默认为 0.1
        decoder_layerdrop: float = 0.1,
        # 注意力 dropout 概率，浮点数参数，默认为 0.1
        attention_dropout: float = 0.1,
        # 激活函数 dropout 概率，浮点数参数，默认为 0.1
        activation_dropout: float = 0.1,
        # 并行采样数，整数参数，默认为 100
        num_parallel_samples: int = 100,
        # 初始化标准差，浮点数参数，默认为 0.02
        init_std: float = 0.02,
        # 是否使用缓存，布尔类型参数，默认为 True
        use_cache=True,
        # Informer 模型特有参数
        # 注意类型，字符串参数，默认为 "prob"
        attention_type: str = "prob",
        # 采样因子，整数参数，默认为 5
        sampling_factor: int = 5,
        # 是否蒸馏，布尔类型参数，默认为 True
        distil: bool = True,
        # 其他参数，字典参数，用于接收额外的关键字参数
        **kwargs,
        # time series specific configuration
        self.prediction_length = prediction_length  # 设置预测长度
        self.context_length = context_length or prediction_length  # 设置上下文长度，如果未提供则默认与预测长度相同
        self.distribution_output = distribution_output  # 设置分布输出类型
        self.loss = loss  # 设置损失函数类型
        self.input_size = input_size  # 设置输入特征的大小
        self.num_time_features = num_time_features  # 设置时间特征的数量
        self.lags_sequence = lags_sequence if lags_sequence is not None else [1, 2, 3, 4, 5, 6, 7]  # 设置滞后序列，如果未提供则默认为指定的序列
        self.scaling = scaling  # 设置是否进行数据缩放
        self.num_dynamic_real_features = num_dynamic_real_features  # 设置动态实数特征的数量
        self.num_static_real_features = num_static_real_features  # 设置静态实数特征的数量
        self.num_static_categorical_features = num_static_categorical_features  # 设置静态分类特征的数量

        # set cardinality
        if cardinality and num_static_categorical_features > 0:
            if len(cardinality) != num_static_categorical_features:
                raise ValueError(
                    "The cardinality should be a list of the same length as `num_static_categorical_features`"
                )
            self.cardinality = cardinality  # 设置分类特征的基数（类别数量）
        else:
            self.cardinality = [0]  # 如果未提供分类特征或数量为零，则设置基数为零

        # set embedding_dimension
        if embedding_dimension and num_static_categorical_features > 0:
            if len(embedding_dimension) != num_static_categorical_features:
                raise ValueError(
                    "The embedding dimension should be a list of the same length as `num_static_categorical_features`"
                )
            self.embedding_dimension = embedding_dimension  # 设置嵌入维度
        else:
            # 计算默认嵌入维度，每个分类特征的嵌入维度最大为50，最小为其基数加1再除以2
            self.embedding_dimension = [min(50, (cat + 1) // 2) for cat in self.cardinality]

        self.num_parallel_samples = num_parallel_samples  # 设置并行采样的数量

        # Transformer architecture configuration
        self.feature_size = input_size * len(self.lags_sequence) + self._number_of_features  # 计算特征大小
        self.d_model = d_model  # 设置Transformer模型的维度大小
        self.encoder_attention_heads = encoder_attention_heads  # 设置编码器注意力头数
        self.decoder_attention_heads = decoder_attention_heads  # 设置解码器注意力头数
        self.encoder_ffn_dim = encoder_ffn_dim  # 设置编码器前馈神经网络的维度
        self.decoder_ffn_dim = decoder_ffn_dim  # 设置解码器前馈神经网络的维度
        self.encoder_layers = encoder_layers  # 设置编码器层数
        self.decoder_layers = decoder_layers  # 设置解码器层数

        self.dropout = dropout  # 设置全局的dropout比率
        self.attention_dropout = attention_dropout  # 设置注意力层的dropout比率
        self.activation_dropout = activation_dropout  # 设置激活函数的dropout比率
        self.encoder_layerdrop = encoder_layerdrop  # 设置编码器层的随机丢弃比率
        self.decoder_layerdrop = decoder_layerdrop  # 设置解码器层的随机丢弃比率

        self.activation_function = activation_function  # 设置激活函数类型
        self.init_std = init_std  # 设置初始化标准差

        self.use_cache = use_cache  # 设置是否使用缓存

        # Informer
        self.attention_type = attention_type  # 设置注意力机制的类型
        self.sampling_factor = sampling_factor  # 设置采样因子
        self.distil = distil  # 设置是否使用蒸馏技术

        super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs)  # 调用父类的初始化方法
    # 定义一个私有方法，用于计算特征的总数并返回整数类型的结果
    def _number_of_features(self) -> int:
        # 计算嵌入维度列表中所有元素的总和
        return (
            sum(self.embedding_dimension)  # 加上嵌入维度的总和
            + self.num_dynamic_real_features  # 加上动态实数特征的数量
            + self.num_time_features  # 加上时间特征的数量
            + self.num_static_real_features  # 加上静态实数特征的数量
            + self.input_size * 2  # 加上输入大小乘以2，代表 log1p(abs(loc)) 和 log(scale) 特征
        )

`.\models\informer\modeling_informer.py`

# coding=utf-8
# Copyright 2023 Amazon and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch Informer model."""

from typing import List, Optional, Tuple, Union

import numpy as np
import torch
from torch import nn

from ...activations import ACT2FN
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_causal_attention_mask
from ...modeling_outputs import (
    BaseModelOutput,
    BaseModelOutputWithPastAndCrossAttentions,
    SampleTSPredictionOutput,
    Seq2SeqTSModelOutput,
    Seq2SeqTSPredictionOutput,
)
from ...modeling_utils import PreTrainedModel
from ...time_series_utils import NegativeBinomialOutput, NormalOutput, StudentTOutput
from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
from .configuration_informer import InformerConfig

# 获取日志记录器对象
logger = logging.get_logger(__name__)

# 文档中使用的配置名
_CONFIG_FOR_DOC = "InformerConfig"

# 预训练模型的存档列表
INFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "huggingface/informer-tourism-monthly",
    # See all Informer models at https://huggingface.co/models?filter=informer
]


# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesFeatureEmbedder with TimeSeries->Informer
class InformerFeatureEmbedder(nn.Module):
    """
    Embed a sequence of categorical features.

    Args:
        cardinalities (`list[int]`):
            List of cardinalities of the categorical features.
        embedding_dims (`list[int]`):
            List of embedding dimensions of the categorical features.
    """

    def __init__(self, cardinalities: List[int], embedding_dims: List[int]) -> None:
        super().__init__()

        # 计算输入的分类特征数量
        self.num_features = len(cardinalities)
        # 使用 nn.ModuleList 创建嵌入层列表，每个分类特征对应一个 Embedding 层
        self.embedders = nn.ModuleList([nn.Embedding(c, d) for c, d in zip(cardinalities, embedding_dims)])

    def forward(self, features: torch.Tensor) -> torch.Tensor:
        if self.num_features > 1:
            # 如果有多个特征，按最后一个维度切片特征张量
            cat_feature_slices = torch.chunk(features, self.num_features, dim=-1)
        else:
            cat_feature_slices = [features]

        # 将每个分类特征片段经过对应的 Embedding 层嵌入，并在最后一个维度上连接它们
        return torch.cat(
            [
                embed(cat_feature_slice.squeeze(-1))
                for embed, cat_feature_slice in zip(self.embedders, cat_feature_slices)
            ],
            dim=-1,
        )
# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesStdScaler with TimeSeriesTransformer->Informer,TimeSeries->Informer
class InformerStdScaler(nn.Module):
    """
    Standardize features by calculating the mean and scaling along the first dimension, and then normalizes it by
    subtracting from the mean and dividing by the standard deviation.
    """

    def __init__(self, config: InformerConfig):
        super().__init__()
        # 初始化标准化器对象，从配置中获取标准化维度，默认为第一维
        self.dim = config.scaling_dim if hasattr(config, "scaling_dim") else 1
        # 确定是否保持维度大小不变，默认为True
        self.keepdim = config.keepdim if hasattr(config, "keepdim") else True
        # 最小缩放值，用于避免除以零或接近零的情况，默认为1e-5
        self.minimum_scale = config.minimum_scale if hasattr(config, "minimum_scale") else 1e-5

    def forward(
        self, data: torch.Tensor, observed_indicator: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Parameters:
            data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`):
                input for Batch norm calculation
            observed_indicator (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`):
                Calculating the scale on the observed indicator.
        Returns:
            tuple of `torch.Tensor` of shapes
                (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`,
                `(batch_size, 1, num_input_channels)`)
        """
        # 计算分母，即观察指示器的和，用于标准差计算，保持维度不变
        denominator = observed_indicator.sum(self.dim, keepdim=self.keepdim)
        # 确保分母不小于1.0，避免除以零或接近零的情况
        denominator = denominator.clamp_min(1.0)
        # 计算数据的均值，仅在观察指示器为真时考虑数据贡献
        loc = (data * observed_indicator).sum(self.dim, keepdim=self.keepdim) / denominator

        # 计算方差，仅在观察指示器为真时考虑数据贡献，用于计算标准差
        variance = (((data - loc) * observed_indicator) ** 2).sum(self.dim, keepdim=self.keepdim) / denominator
        # 计算标准差，并加上最小缩放值以确保数值稳定性
        scale = torch.sqrt(variance + self.minimum_scale)
        # 标准化数据并返回标准化后的数据、均值和标准差
        return (data - loc) / scale, loc, scale


# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesMeanScaler with TimeSeriesTransformer->Informer,TimeSeries->Informer
class InformerMeanScaler(nn.Module):
    """
    Computes a scaling factor as the weighted average absolute value along the first dimension, and scales the data
    accordingly.
    """

    def __init__(self, config: InformerConfig):
        super().__init__()
        # 初始化均值标准化器对象，从配置中获取标准化维度，默认为第一维
        self.dim = config.scaling_dim if hasattr(config, "scaling_dim") else 1
        # 确定是否保持维度大小不变，默认为True
        self.keepdim = config.keepdim if hasattr(config, "keepdim") else True
        # 最小缩放值，用于避免除以零或接近零的情况，默认为1e-10
        self.minimum_scale = config.minimum_scale if hasattr(config, "minimum_scale") else 1e-10
        # 默认的缩放值，如果配置中未指定，默认为None
        self.default_scale = config.default_scale if hasattr(config, "default_scale") else None

    def forward(
        self, data: torch.Tensor, observed_indicator: torch.Tensor
    ) -> torch.Tensor:
        """
        Parameters:
            data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`):
                input for Batch norm calculation
            observed_indicator (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`):
                Calculating the scale on the observed indicator.
        Returns:
            `torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`

        """
        # 返回数据经过均值标准化处理后的结果
        return data / (data.abs().mean(self.dim, keepdim=self.keepdim) + self.minimum_scale)
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Parameters:
            data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`):
                input for Batch norm calculation
            observed_indicator (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`):
                Calculating the scale on the observed indicator.
        Returns:
            tuple of `torch.Tensor` of shapes
                (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`,
                `(batch_size, 1, num_input_channels)`)
        """
        # Calculate the sum of absolute values of data multiplied by observed_indicator,
        # along the specified dimension `self.dim`, keeping the dimensionality.
        ts_sum = (data * observed_indicator).abs().sum(self.dim, keepdim=True)

        # Count the number of observed elements (True values) in observed_indicator
        # along dimension `self.dim`, keeping the dimensionality.
        num_observed = observed_indicator.sum(self.dim, keepdim=True)

        # Compute the scale factor by dividing ts_sum by num_observed, clamping
        # num_observed to a minimum value of 1 to avoid division by zero.
        scale = ts_sum / torch.clamp(num_observed, min=1)

        # If `default_scale` is not provided, calculate it using the batch sum of ts_sum
        # and batch_observations, otherwise use the provided `default_scale`.
        if self.default_scale is None:
            batch_sum = ts_sum.sum(dim=0)
            batch_observations = torch.clamp(num_observed.sum(0), min=1)
            default_scale = torch.squeeze(batch_sum / batch_observations)
        else:
            default_scale = self.default_scale * torch.ones_like(scale)

        # Apply default_scale where num_observed is greater than zero, otherwise use scale.
        scale = torch.where(num_observed > 0, scale, default_scale)

        # Ensure the scale values are at least `self.minimum_scale`.
        scale = torch.clamp(scale, min=self.minimum_scale)

        # Normalize data by dividing each element by its corresponding scale.
        scaled_data = data / scale

        # If keepdim is False, squeeze the scale tensor along dimension `self.dim`.
        if not self.keepdim:
            scale = scale.squeeze(dim=self.dim)

        # Return scaled_data, a tensor of zeros with the same shape as scale, and scale itself.
        return scaled_data, torch.zeros_like(scale), scale
# 从 transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesNOPScaler 复制而来，改名为 InformerNOPScaler，
# 仅对名字进行了变更，并未改动其功能或逻辑
class InformerNOPScaler(nn.Module):
    """
    Assigns a scaling factor equal to 1 along the first dimension, and therefore applies no scaling to the input data.
    """

    def __init__(self, config: InformerConfig):
        super().__init__()
        # 根据配置文件，确定缩放的维度，默认为第一维
        self.dim = config.scaling_dim if hasattr(config, "scaling_dim") else 1
        # 根据配置文件，确定是否保持维度，默认为 True
        self.keepdim = config.keepdim if hasattr(config, "keepdim") else True

    def forward(
        self, data: torch.Tensor, observed_indicator: torch.Tensor = None
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Parameters:
            data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`):
                input for Batch norm calculation
        Returns:
            tuple of `torch.Tensor` of shapes
                (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`,
                `(batch_size, 1, num_input_channels)`)
        """
        # 计算数据的缩放因子，沿着指定维度取均值，得到缩放因子
        scale = torch.ones_like(data, requires_grad=False).mean(dim=self.dim, keepdim=self.keepdim)
        # 计算数据的均值，沿着指定维度取均值，得到均值
        loc = torch.zeros_like(data, requires_grad=False).mean(dim=self.dim, keepdim=self.keepdim)
        # 返回原始数据、均值和缩放因子
        return data, loc, scale


# 从 transformers.models.time_series_transformer.modeling_time_series_transformer.weighted_average 复制而来
def weighted_average(input_tensor: torch.Tensor, weights: Optional[torch.Tensor] = None, dim=None) -> torch.Tensor:
    """
    Computes the weighted average of a given tensor across a given `dim`, masking values associated with weight zero,
    meaning instead of `nan * 0 = nan` you will get `0 * 0 = 0`.

    Args:
        input_tensor (`torch.FloatTensor`):
            Input tensor, of which the average must be computed.
        weights (`torch.FloatTensor`, *optional*):
            Weights tensor, of the same shape as `input_tensor`.
        dim (`int`, *optional*):
            The dim along which to average `input_tensor`.

    Returns:
        `torch.FloatTensor`: The tensor with values averaged along the specified `dim`.
    """
    if weights is not None:
        # 计算加权平均值，避免权重为零时的 NaN 问题，使用 torch.where 处理
        weighted_tensor = torch.where(weights != 0, input_tensor * weights, torch.zeros_like(input_tensor))
        # 计算非零权重的和，并进行最小值约束
        sum_weights = torch.clamp(weights.sum(dim=dim) if dim else weights.sum(), min=1.0)
        # 返回加权平均值
        return (weighted_tensor.sum(dim=dim) if dim else weighted_tensor.sum()) / sum_weights
    else:
        # 若没有提供权重，则计算普通的平均值
        return input_tensor.mean(dim=dim)


# 从 transformers.models.time_series_transformer.modeling_time_series_transformer.nll 复制而来
def nll(input: torch.distributions.Distribution, target: torch.Tensor) -> torch.Tensor:
    """
    Computes the negative log likelihood loss from input distribution with respect to target.
    """
    # 计算输入分布相对于目标的负对数似然损失
    return -input.log_prob(target)
# 从 transformers.models.marian.modeling_marian.MarianSinusoidalPositionalEmbedding 复制并修改为 InformerSinusoidalPositionalEmbedding
class InformerSinusoidalPositionalEmbedding(nn.Embedding):
    """This module produces sinusoidal positional embeddings of any length."""

    def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None) -> None:
        super().__init__(num_positions, embedding_dim)
        # 初始化权重
        self.weight = self._init_weight(self.weight)

    @staticmethod
    def _init_weight(out: nn.Parameter) -> nn.Parameter:
        """
        与 XLM 中 create_sinusoidal_embeddings 函数相同，但特征未交错。cos 特征在向量的第二半部分 [dim // 2:]。
        """
        n_pos, dim = out.shape
        # 创建位置编码矩阵
        position_enc = np.array(
            [[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for pos in range(n_pos)]
        )
        # 在早期设置为不需要梯度，以避免在 pytorch-1.8+ 中出现错误
        out.requires_grad = False
        # 确定分界线位置
        sentinel = dim // 2 if dim % 2 == 0 else (dim // 2) + 1
        # 填充正弦和余弦值
        out[:, 0:sentinel] = torch.FloatTensor(np.sin(position_enc[:, 0::2]))
        out[:, sentinel:] = torch.FloatTensor(np.cos(position_enc[:, 1::2]))
        out.detach_()
        return out

    @torch.no_grad()
    def forward(self, input_ids_shape: torch.Size, past_key_values_length: int = 0) -> torch.Tensor:
        """`input_ids_shape` 期望是 [bsz x seqlen]。"""
        bsz, seq_len = input_ids_shape[:2]
        # 生成位置索引
        positions = torch.arange(
            past_key_values_length, past_key_values_length + seq_len, dtype=torch.long, device=self.weight.device
        )
        return super().forward(positions)


# 从 transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesValueEmbedding 复制并修改为 InformerValueEmbedding
class InformerValueEmbedding(nn.Module):
    def __init__(self, feature_size, d_model):
        super().__init__()
        # 值投影层，将输入特征映射到模型维度
        self.value_projection = nn.Linear(in_features=feature_size, out_features=d_model, bias=False)

    def forward(self, x):
        return self.value_projection(x)


# 从 transformers.models.bart.modeling_bart.BartAttention 复制并修改为 InformerAttention
class InformerAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(
        self,
        embed_dim: int,
        num_heads: int,
        dropout: float = 0.0,
        is_decoder: bool = False,
        bias: bool = True,
        is_causal: bool = False,
        config: Optional[InformerConfig] = None,
    # 继承父类的初始化方法，初始化注意力机制的参数
    ):
        # 调用父类的初始化方法
        super().__init__()
        # 设置嵌入维度
        self.embed_dim = embed_dim
        # 设置注意力头的数量
        self.num_heads = num_heads
        # 设置dropout概率
        self.dropout = dropout
        # 计算每个注意力头的维度
        self.head_dim = embed_dim // num_heads
        # 设置配置参数
        self.config = config

        # 检查嵌入维度是否能够被注意力头的数量整除
        if (self.head_dim * num_heads) != self.embed_dim:
            # 如果不能整除，抛出数值错误异常
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
                f" and `num_heads`: {num_heads})."
            )
        # 设置缩放因子为头维度的倒数平方根
        self.scaling = self.head_dim**-0.5
        # 设置是否为解码器
        self.is_decoder = is_decoder
        # 设置是否为因果模型
        self.is_causal = is_causal

        # 初始化键、值、查询、输出的线性投影层
        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)

    # 定义形状变换函数，用于将张量变形成适合注意力计算的形状
    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()

    # 前向传播函数，实现注意力机制的计算过程
    def forward(
        self,
        hidden_states: torch.Tensor,  # 输入的隐藏状态张量
        key_value_states: Optional[torch.Tensor] = None,  # 键值状态张量（可选）
        past_key_value: Optional[Tuple[torch.Tensor]] = None,  # 过去的键值状态元组（可选）
        attention_mask: Optional[torch.Tensor] = None,  # 注意力掩码张量（可选）
        layer_head_mask: Optional[torch.Tensor] = None,  # 层级头掩码张量（可选）
        output_attentions: bool = False,  # 是否输出注意力权重（默认为否）
# 定义了一个名为InformerProbSparseAttention的PyTorch模型类，实现了概率注意力机制来选择“活跃”查询，而不是“懒惰”查询，并提供了稀疏Transformer以减少传统注意力机制的二次计算和内存需求。

class InformerProbSparseAttention(nn.Module):
    """Probabilistic Attention mechanism to select the "active"
    queries rather than the "lazy" queries and provides a sparse Transformer thus mitigating the quadratic compute and
    memory requirements of vanilla attention"""

    def __init__(
        self,
        embed_dim: int,
        num_heads: int,
        dropout: float = 0.0,
        is_decoder: bool = False,
        sampling_factor: int = 5,
        bias: bool = True,
    ):
        super().__init__()
        # 设置采样因子
        self.factor = sampling_factor
        # 嵌入维度
        self.embed_dim = embed_dim
        # 注意力头的数量
        self.num_heads = num_heads
        # dropout概率
        self.dropout = dropout
        # 每个头的维度
        self.head_dim = embed_dim // num_heads

        # 如果头维度乘以头的数量不等于嵌入维度，抛出数值错误
        if (self.head_dim * num_heads) != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
                f" and `num_heads`: {num_heads})."
            )
        # 缩放因子
        self.scaling = self.head_dim**-0.5
        # 是否为解码器
        self.is_decoder = is_decoder

        # 初始化线性变换层
        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)

    # 重新塑形张量的辅助函数，用于变换张量形状以适应注意力机制
    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()

    # 前向传播函数，定义了模型的计算流程和数据处理
    def forward(
        self,
        hidden_states: torch.Tensor,
        key_value_states: Optional[torch.Tensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ):
        # 此处实现了模型的具体计算流程，具体内容可以查看源代码链接
        pass

# InformerConvLayer类的定义，实现了一维卷积层、批归一化、ELU激活函数和最大池化操作，用于模型的卷积处理

class InformerConvLayer(nn.Module):
    def __init__(self, c_in):
        super().__init__()
        # 一维卷积层定义，包括输入通道数、输出通道数、卷积核大小和填充方式
        self.downConv = nn.Conv1d(
            in_channels=c_in,
            out_channels=c_in,
            kernel_size=3,
            padding=1,
            padding_mode="circular",
        )
        # 批归一化层
        self.norm = nn.BatchNorm1d(c_in)
        # ELU激活函数
        self.activation = nn.ELU()
        # 最大池化层
        self.maxPool = nn.MaxPool1d(kernel_size=3, stride=2, padding=1)

    # 前向传播函数，定义了数据在卷积层中的处理流程
    def forward(self, x):
        # 数据的维度变换，将数据的通道维度调整到卷积层期望的格式
        x = self.downConv(x.permute(0, 2, 1))
        # 数据的批归一化处理
        x = self.norm(x)
        # 数据的ELU激活函数处理
        x = self.activation(x)
        # 数据的最大池化处理
        x = self.maxPool(x)
        # 数据维度的还原，恢复到原来的维度格式
        x = x.transpose(1, 2)
        return x

# InformerEncoderLayer类的定义，可以继续添加注释
    # 初始化函数，接受一个 InformedConfig 类型的配置对象作为参数
    def __init__(self, config: InformerConfig):
        # 调用父类的初始化方法
        super().__init__()
        # 设置嵌入维度为配置对象中的 d_model 属性
        self.embed_dim = config.d_model
        # 根据配置对象中的 attention_type 属性选择不同类型的注意力机制
        if config.attention_type == "prob":
            # 如果 attention_type 是 "prob"，使用稀疏注意力机制
            self.self_attn = InformerProbSparseAttention(
                embed_dim=self.embed_dim,  # 设置嵌入维度
                num_heads=config.encoder_attention_heads,  # 设置注意力头数
                dropout=config.attention_dropout,  # 设置注意力机制的 dropout 率
                sampling_factor=config.sampling_factor,  # 设置采样因子
            )
        else:
            # 否则，使用标准的注意力机制
            self.self_attn = InformerAttention(
                embed_dim=self.embed_dim,  # 设置嵌入维度
                num_heads=config.encoder_attention_heads,  # 设置注意力头数
                dropout=config.attention_dropout,  # 设置注意力机制的 dropout 率
            )
        # 对自注意力机制的输出进行层归一化
        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
        # 设置 dropout 率
        self.dropout = config.dropout
        # 根据配置对象中的 activation_function 属性选择激活函数
        self.activation_fn = ACT2FN[config.activation_function]
        # 设置激活函数的 dropout 率
        self.activation_dropout = config.activation_dropout
        # 设置第一个全连接层，将嵌入维度映射到编码器前馈网络的维度
        self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
        # 设置第二个全连接层，将编码器前馈网络的维度映射回嵌入维度
        self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
        # 对最终输出进行层归一化
        self.final_layer_norm = nn.LayerNorm(self.embed_dim)
    ) -> Tuple[torch.FloatTensor, Optional[torch.FloatTensor]]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
                `(encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
        residual = hidden_states  # 保存输入 hidden_states 的原始值，用于残差连接
        hidden_states, attn_weights, _ = self.self_attn(  # 使用 self-attention 层处理输入
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            layer_head_mask=layer_head_mask,
            output_attentions=output_attentions,
        )
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)  # 对 self-attention 的输出进行 dropout
        hidden_states = residual + hidden_states  # 残差连接，将 dropout 后的结果与原始输入相加
        hidden_states = self.self_attn_layer_norm(hidden_states)  # 使用 LayerNorm 进行归一化处理

        residual = hidden_states  # 保存处理后的 hidden_states，用于残差连接
        hidden_states = self.activation_fn(self.fc1(hidden_states))  # 通过线性层和激活函数 fc1 处理 hidden_states
        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)  # 对 fc1 的输出进行 dropout
        hidden_states = self.fc2(hidden_states)  # 通过线性层 fc2 处理 hidden_states
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)  # 对 fc2 的输出进行 dropout
        hidden_states = residual + hidden_states  # 残差连接，将 dropout 后的结果与之前保存的 hidden_states 相加
        hidden_states = self.final_layer_norm(hidden_states)  # 使用 LayerNorm 进行归一化处理

        if hidden_states.dtype == torch.float16 and (  # 如果 hidden_states 的数据类型为 float16，且包含无穷大或 NaN 值
            torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any()
        ):
            clamp_value = torch.finfo(hidden_states.dtype).max - 1000  # 获取当前数据类型的最大值，用于 clamp 操作
            hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)  # 对 hidden_states 进行 clamp 操作

        outputs = (hidden_states,)  # 将处理后的 hidden_states 存入 outputs 中

        if output_attentions:  # 如果指定要返回 attention tensors
            outputs += (attn_weights,)  # 将 attention weights 也加入 outputs 中

        return outputs  # 返回最终的输出结果
class InformerDecoderLayer(nn.Module):
    # Informer 解码器层的定义，继承自 nn.Module
    def __init__(self, config: InformerConfig):
        super().__init__()
        self.embed_dim = config.d_model
        # 根据配置选择不同类型的注意力机制
        if config.attention_type == "prob":
            # 使用稀疏概率注意力机制
            self.self_attn = InformerProbSparseAttention(
                embed_dim=self.embed_dim,
                num_heads=config.decoder_attention_heads,
                dropout=config.attention_dropout,
                sampling_factor=config.sampling_factor,
                is_decoder=True,
            )
        else:
            # 使用普通的注意力机制
            self.self_attn = InformerAttention(
                embed_dim=self.embed_dim,
                num_heads=config.decoder_attention_heads,
                dropout=config.attention_dropout,
                is_decoder=True,
            )
        self.dropout = config.dropout
        self.activation_fn = ACT2FN[config.activation_function]
        self.activation_dropout = config.activation_dropout

        # 解码器自注意力层归一化
        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
        # 解码器与编码器之间的注意力
        self.encoder_attn = InformerAttention(
            self.embed_dim,
            config.decoder_attention_heads,
            dropout=config.attention_dropout,
            is_decoder=True,
        )
        # 编码器注意力层归一化
        self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)
        # 解码器的前馈神经网络的第一层
        self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
        # 解码器的前馈神经网络的第二层
        self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)
        # 最终层的归一化
        self.final_layer_norm = nn.LayerNorm(self.embed_dim)

    # 前向传播函数，接受多个参数和可选的参数
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        cross_attn_layer_head_mask: Optional[torch.Tensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = True,



class InformerPreTrainedModel(PreTrainedModel):
    # Informer 预训练模型，继承自 PreTrainedModel 类
    config_class = InformerConfig
    base_model_prefix = "model"
    main_input_name = "past_values"
    supports_gradient_checkpointing = True

    # 初始化权重函数，根据模块类型不同进行不同的初始化
    def _init_weights(self, module):
        std = self.config.init_std
        if isinstance(module, (nn.Linear, nn.Conv1d)):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()


INFORMER_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)
    # 这个模型也是一个 PyTorch 的 torch.nn.Module 子类。
    # 可以像使用普通的 PyTorch 模块一样使用它，关于一般使用和行为的所有问题，请参考 PyTorch 的文档。

    Parameters:
        # config 参数接受一个 TimeSeriesTransformerConfig 类的实例。
        # 这个模型的配置类包含所有的模型参数。使用配置文件初始化模型时，并不会加载与模型相关的权重，仅加载配置。
        # 若要加载模型的权重，请参考 PreTrainedModel.from_pretrained 方法。
"""
INFORMER_INPUTS_DOCSTRING = r"""
"""


class InformerEncoder(InformerPreTrainedModel):
    """
    Informer encoder consisting of *config.encoder_layers* self attention layers with distillation layers. Each
    attention layer is an [`InformerEncoderLayer`].

    Args:
        config: InformerConfig
    """

    def __init__(self, config: InformerConfig):
        super().__init__(config)

        self.dropout = config.dropout  # 从配置中获取 dropout 率
        self.layerdrop = config.encoder_layerdrop  # 从配置中获取 encoder 层的 dropout 率
        self.gradient_checkpointing = False  # 初始化梯度检查点标志为 False
        if config.prediction_length is None:
            raise ValueError("The `prediction_length` config needs to be specified.")

        self.value_embedding = InformerValueEmbedding(feature_size=config.feature_size, d_model=config.d_model)
        self.embed_positions = InformerSinusoidalPositionalEmbedding(
            config.context_length + config.prediction_length, config.d_model
        )  # 创建位置嵌入对象，用于处理输入序列位置信息
        self.layers = nn.ModuleList([InformerEncoderLayer(config) for _ in range(config.encoder_layers)])
        self.layernorm_embedding = nn.LayerNorm(config.d_model)  # 创建输入嵌入的 LayerNorm 层

        if config.distil:
            self.conv_layers = nn.ModuleList(
                [InformerConvLayer(config.d_model) for _ in range(config.encoder_layers - 1)]
            )
            self.conv_layers.append(None)
        else:
            self.conv_layers = [None] * config.encoder_layers  # 根据 distil 配置初始化卷积层列表

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        """
        Forward pass for the InformerEncoder.

        Args:
            attention_mask: Optional mask for attention layers.
            head_mask: Optional mask for attention heads.
            inputs_embeds: Optional input embeddings.
            output_attentions: Optional flag to output attentions.
            output_hidden_states: Optional flag to output hidden states.
            return_dict: Optional flag to return a dictionary as output.

        Returns:
            Depending on `return_dict`, either a tuple or a dictionary with different outputs.
        """
        # Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesTransformerDecoder with TimeSeriesTransformer->Informer,TimeSeriesTransformerConfig->InformerConfig,time-series-transformer->informer,Transformer->Informer,TimeSeries->Informer


class InformerDecoder(InformerPreTrainedModel):
    """
    Informer decoder consisting of *config.decoder_layers* layers. Each layer is a
    [`InformerDecoderLayer`]

    Args:
        config: InformerConfig
    """
    # 初始化函数，接受一个 InfromerConfig 类型的配置对象作为参数
    def __init__(self, config: InformerConfig):
        # 调用父类的初始化方法，传入配置对象
        super().__init__(config)
        # 设置类的dropout属性为配置对象中的dropout值
        self.dropout = config.dropout
        # 设置类的layerdrop属性为配置对象中的decoder_layerdrop值
        self.layerdrop = config.decoder_layerdrop
        # 如果配置对象中的prediction_length为None，则抛出值错误异常
        if config.prediction_length is None:
            raise ValueError("The `prediction_length` config needs to be specified.")

        # 创建一个 InfromerValueEmbedding 实例，传入配置对象中的feature_size和d_model作为参数
        self.value_embedding = InformerValueEmbedding(feature_size=config.feature_size, d_model=config.d_model)
        # 创建一个 InfromerSinusoidalPositionalEmbedding 实例，传入context_length + prediction_length和d_model作为参数
        self.embed_positions = InformerSinusoidalPositionalEmbedding(
            config.context_length + config.prediction_length, config.d_model
        )
        # 创建一个由多个 InfromerDecoderLayer 实例组成的 ModuleList，数量为配置对象中的decoder_layers
        self.layers = nn.ModuleList([InformerDecoderLayer(config) for _ in range(config.decoder_layers)])
        # 创建一个 LayerNorm 层，归一化维度为配置对象中的d_model
        self.layernorm_embedding = nn.LayerNorm(config.d_model)

        # 初始化梯度检查点为False
        self.gradient_checkpointing = False
        # 执行额外的初始化操作和最终处理
        self.post_init()

    # 前向传播函数
    def forward(
        self,
        attention_mask: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
@add_start_docstrings(
    "The bare Informer Model outputting raw hidden-states without any specific head on top.",
    INFORMER_START_DOCSTRING,
)
# 从transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesTransformerModel复制而来，将TimeSeriesTransformer->Informer，TIME_SERIES_TRANSFORMER->INFORMER，time-series-transformer->informer，TimeSeries->Informer
class InformerModel(InformerPreTrainedModel):
    def __init__(self, config: InformerConfig):
        super().__init__(config)

        # 根据配置选择合适的缩放器
        if config.scaling == "mean" or config.scaling is True:
            self.scaler = InformerMeanScaler(config)
        elif config.scaling == "std":
            self.scaler = InformerStdScaler(config)
        else:
            self.scaler = InformerNOPScaler(config)

        # 如果存在静态分类特征，创建特征嵌入器
        if config.num_static_categorical_features > 0:
            self.embedder = InformerFeatureEmbedder(
                cardinalities=config.cardinality,
                embedding_dims=config.embedding_dimension,
            )

        # 初始化变压器编码器和解码器，以及掩码初始化器
        self.encoder = InformerEncoder(config)
        self.decoder = InformerDecoder(config)

        # 初始化权重并应用最终处理
        self.post_init()

    @property
    def _past_length(self) -> int:
        # 返回过去历史长度，包括最大的滞后长度
        return self.config.context_length + max(self.config.lags_sequence)

    def get_lagged_subsequences(
        self, sequence: torch.Tensor, subsequences_length: int, shift: int = 0
    ) -> torch.Tensor:
        """
        返回给定序列的滞后子序列。返回的张量形状为(N, S, C, I)，其中S为子序列长度，I为indices的长度，包含滞后的子序列。
        具体而言，lagged[i, j, :, k] = sequence[i, -indices[k]-S+j, :].

        Args:
            sequence: Tensor
                要提取滞后子序列的序列。形状为(N, T, C).
            subsequences_length : int
                要提取的子序列长度。
            shift: int
                向后移动滞后的量。
        """
        sequence_length = sequence.shape[1]
        indices = [lag - shift for lag in self.config.lags_sequence]

        if max(indices) + subsequences_length > sequence_length:
            raise ValueError(
                f"lags不能超过历史长度，发现lag {max(indices)}，而历史长度仅为{sequence_length}"
            )

        lagged_values = []
        for lag_index in indices:
            begin_index = -lag_index - subsequences_length
            end_index = -lag_index if lag_index > 0 else None
            lagged_values.append(sequence[:, begin_index:end_index, ...])
        return torch.stack(lagged_values, dim=-1)
    # 创建神经网络模型的输入数据
    def create_network_inputs(
        self,
        # 过去的值，作为模型输入的时间序列数据
        past_values: torch.Tensor,
        # 过去时间特征，如日期、时间等
        past_time_features: torch.Tensor,
        # 静态分类特征，如类别信息（可选）
        static_categorical_features: Optional[torch.Tensor] = None,
        # 静态实数特征，如价格、数量等（可选）
        static_real_features: Optional[torch.Tensor] = None,
        # 过去观测的掩码，标记观测是否存在（可选）
        past_observed_mask: Optional[torch.Tensor] = None,
        # 未来的目标值（可选）
        future_values: Optional[torch.Tensor] = None,
        # 未来时间特征（可选）
        future_time_features: Optional[torch.Tensor] = None,
        ):
            # time feature
            time_feat = (
                torch.cat(
                    (
                        past_time_features[:, self._past_length - self.config.context_length :, ...],
                        future_time_features,
                    ),
                    dim=1,
                )
                if future_values is not None
                else past_time_features[:, self._past_length - self.config.context_length :, ...]
            )

            # target
            if past_observed_mask is None:
                past_observed_mask = torch.ones_like(past_values)

            # Extract context from past values
            context = past_values[:, -self.config.context_length :]
            observed_context = past_observed_mask[:, -self.config.context_length :]

            # Normalize context using the scaler
            _, loc, scale = self.scaler(context, observed_context)
            
            # Prepare inputs for the model, applying normalization
            inputs = (
                (torch.cat((past_values, future_values), dim=1) - loc) / scale
                if future_values is not None
                else (past_values - loc) / scale
            )

            # Calculate static features
            log_abs_loc = loc.abs().log1p() if self.config.input_size == 1 else loc.squeeze(1).abs().log1p()
            log_scale = scale.log() if self.config.input_size == 1 else scale.squeeze(1).log()
            static_feat = torch.cat((log_abs_loc, log_scale), dim=1)

            # Incorporate additional static real features if available
            if static_real_features is not None:
                static_feat = torch.cat((static_real_features, static_feat), dim=1)
            
            # Incorporate additional static categorical features if available
            if static_categorical_features is not None:
                embedded_cat = self.embedder(static_categorical_features)
                static_feat = torch.cat((embedded_cat, static_feat), dim=1)
            
            # Expand static features to align with time feature shape
            expanded_static_feat = static_feat.unsqueeze(1).expand(-1, time_feat.shape[1], -1)

            # Combine static and time features
            features = torch.cat((expanded_static_feat, time_feat), dim=-1)

            # Generate lagged sequences of inputs
            subsequences_length = (
                self.config.context_length + self.config.prediction_length
                if future_values is not None
                else self.config.context_length
            )
            lagged_sequence = self.get_lagged_subsequences(sequence=inputs, subsequences_length=subsequences_length)
            lags_shape = lagged_sequence.shape
            reshaped_lagged_sequence = lagged_sequence.reshape(lags_shape[0], lags_shape[1], -1)

            # Check if the length of reshaped lagged sequence matches time feature length
            if reshaped_lagged_sequence.shape[1] != time_feat.shape[1]:
                raise ValueError(
                    f"input length {reshaped_lagged_sequence.shape[1]} and time feature lengths {time_feat.shape[1]} does not match"
                )

            # Prepare inputs for transformer, combining reshaped lagged sequence and features
            transformer_inputs = torch.cat((reshaped_lagged_sequence, features), dim=-1)

            return transformer_inputs, loc, scale, static_feat

        def get_encoder(self):
            return self.encoder

        def get_decoder(self):
            return self.decoder

        @add_start_docstrings_to_model_forward(INFORMER_INPUTS_DOCSTRING)
        @replace_return_docstrings(output_type=Seq2SeqTSModelOutput, config_class=_CONFIG_FOR_DOC)
    # 定义神经网络模型中的前向传播方法，处理输入和输出
    def forward(
        self,
        # 过去的数值状态，形状为 [batch_size, seq_length, num_features]
        past_values: torch.Tensor,
        # 过去的时间特征，形状为 [batch_size, seq_length, num_time_features]
        past_time_features: torch.Tensor,
        # 过去观察到的掩码，形状为 [batch_size, seq_length]
        past_observed_mask: torch.Tensor,
        # 静态分类特征，形状为 [batch_size, num_static_cat_features]
        static_categorical_features: Optional[torch.Tensor] = None,
        # 静态实数特征，形状为 [batch_size, num_static_real_features]
        static_real_features: Optional[torch.Tensor] = None,
        # 未来的数值状态，形状为 [batch_size, seq_length, num_future_features]
        future_values: Optional[torch.Tensor] = None,
        # 未来的时间特征，形状为 [batch_size, seq_length, num_time_features]
        future_time_features: Optional[torch.Tensor] = None,
        # 解码器的注意力掩码，形状为 [batch_size, seq_length]
        decoder_attention_mask: Optional[torch.LongTensor] = None,
        # 多头注意力掩码，形状为 [num_heads, seq_length, seq_length]
        head_mask: Optional[torch.Tensor] = None,
        # 解码器头部的掩码，形状为 [num_layers, num_heads, seq_length, seq_length]
        decoder_head_mask: Optional[torch.Tensor] = None,
        # 交叉注意力头部的掩码，形状为 [num_layers, num_heads, seq_length, seq_length]
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        # 编码器输出列表，每个元素形状为 [batch_size, seq_length, hidden_size]
        encoder_outputs: Optional[List[torch.FloatTensor]] = None,
        # 过去的键值对列表，每个元素形状为 [batch_size, seq_length, hidden_size]
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        # 输出隐藏状态的标志
        output_hidden_states: Optional[bool] = None,
        # 输出注意力权重的标志
        output_attentions: Optional[bool] = None,
        # 是否使用缓存的标志
        use_cache: Optional[bool] = None,
        # 是否返回字典格式的结果
        return_dict: Optional[bool] = None,
        ):
# 为 InfornerForPrediction 类添加文档字符串，描述其作为基于时间序列预测的模型的分布式头部
# 参考自 transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesTransformerForPrediction，将 TimeSeriesTransformer->Informer，TIME_SERIES_TRANSFORMER->INFORMER，time-series-transformer->informer 进行替换
@add_start_docstrings(
    "The Informer Model with a distribution head on top for time-series forecasting.",
    INFORMER_START_DOCSTRING,
)
class InformerForPrediction(InformerPreTrainedModel):
    def __init__(self, config: InformerConfig):
        super().__init__(config)
        # 使用给定的配置初始化 InfornerModel
        self.model = InformerModel(config)
        # 根据配置选择分布输出类型，如 student_t, normal, negative_binomial
        if config.distribution_output == "student_t":
            self.distribution_output = StudentTOutput(dim=config.input_size)
        elif config.distribution_output == "normal":
            self.distribution_output = NormalOutput(dim=config.input_size)
        elif config.distribution_output == "negative_binomial":
            self.distribution_output = NegativeBinomialOutput(dim=config.input_size)
        else:
            raise ValueError(f"Unknown distribution output {config.distribution_output}")

        # 使用 distribution_output 的参数投影初始化 parameter_projection
        self.parameter_projection = self.distribution_output.get_parameter_projection(self.model.config.d_model)
        # 设置 target_shape 为 distribution_output 的事件形状
        self.target_shape = self.distribution_output.event_shape

        # 根据配置选择损失函数，如 nll
        if config.loss == "nll":
            self.loss = nll
        else:
            raise ValueError(f"Unknown loss function {config.loss}")

        # 初始化 distribution_output 的权重并应用最终处理
        self.post_init()

    # 输出参数投影的方法，接受解码器输出 dec_output
    def output_params(self, dec_output):
        return self.parameter_projection(dec_output)

    # 获取编码器的方法
    def get_encoder(self):
        return self.model.get_encoder()

    # 获取解码器的方法
    def get_decoder(self):
        return self.model.get_decoder()

    # 忽略此方法的 Torch JIT 编译
    @torch.jit.ignore
    def output_distribution(self, params, loc=None, scale=None, trailing_n=None) -> torch.distributions.Distribution:
        # 如果 trailing_n 不为 None，则对 params 进行切片操作
        sliced_params = params
        if trailing_n is not None:
            sliced_params = [p[:, -trailing_n:] for p in params]
        # 返回 distribution_output 的分布对象，传入切片后的参数、loc 和 scale
        return self.distribution_output.distribution(sliced_params, loc=loc, scale=scale)

    # 添加 INFORMER_INPUTS_DOCSTRING 到 model_forward 方法的文档字符串
    @add_start_docstrings_to_model_forward(INFORMER_INPUTS_DOCSTRING)
    # 替换输出类型文档字符串为 Seq2SeqTSModelOutput，使用 _CONFIG_FOR_DOC 作为配置类
    @replace_return_docstrings(output_type=Seq2SeqTSModelOutput, config_class=_CONFIG_FOR_DOC)
    # 定义一个方法用于向前传播，接收多个输入参数和可选的参数，返回模型预测结果
    def forward(
        self,
        past_values: torch.Tensor,  # 过去的数值数据，类型为 Torch 张量
        past_time_features: torch.Tensor,  # 过去的时间特征，类型为 Torch 张量
        past_observed_mask: torch.Tensor,  # 过去观察掩码，类型为 Torch 张量
        static_categorical_features: Optional[torch.Tensor] = None,  # 静态分类特征，可选的 Torch 张量
        static_real_features: Optional[torch.Tensor] = None,  # 静态实数特征，可选的 Torch 张量
        future_values: Optional[torch.Tensor] = None,  # 未来的数值数据，可选的 Torch 张量
        future_time_features: Optional[torch.Tensor] = None,  # 未来的时间特征，可选的 Torch 张量
        future_observed_mask: Optional[torch.Tensor] = None,  # 未来观察掩码，可选的 Torch 张量
        decoder_attention_mask: Optional[torch.LongTensor] = None,  # 解码器注意力掩码，可选的 LongTensor
        head_mask: Optional[torch.Tensor] = None,  # 头部掩码，可选的 Torch 张量
        decoder_head_mask: Optional[torch.Tensor] = None,  # 解码器头部掩码，可选的 Torch 张量
        cross_attn_head_mask: Optional[torch.Tensor] = None,  # 交叉注意力头部掩码，可选的 Torch 张量
        encoder_outputs: Optional[List[torch.FloatTensor]] = None,  # 编码器输出列表，每个元素为 Torch 浮点张量，可选的
        past_key_values: Optional[List[torch.FloatTensor]] = None,  # 过去键值列表，每个元素为 Torch 浮点张量，可选的
        output_hidden_states: Optional[bool] = None,  # 是否输出隐藏状态，可选的布尔值
        output_attentions: Optional[bool] = None,  # 是否输出注意力权重，可选的布尔值
        use_cache: Optional[bool] = None,  # 是否使用缓存，可选的布尔值
        return_dict: Optional[bool] = None,  # 是否返回字典格式的结果，可选的布尔值
    ):
        # 在不计算梯度的上下文中执行方法体
        @torch.no_grad()
        def generate(
            self,
            past_values: torch.Tensor,  # 过去的数值数据，类型为 Torch 张量
            past_time_features: torch.Tensor,  # 过去的时间特征，类型为 Torch 张量
            future_time_features: torch.Tensor,  # 未来的时间特征，类型为 Torch 张量
            past_observed_mask: Optional[torch.Tensor] = None,  # 过去观察掩码，可选的 Torch 张量
            static_categorical_features: Optional[torch.Tensor] = None,  # 静态分类特征，可选的 Torch 张量
            static_real_features: Optional[torch.Tensor] = None,  # 静态实数特征，可选的 Torch 张量
            output_attentions: Optional[bool] = None,  # 是否输出注意力权重，可选的布尔值
            output_hidden_states: Optional[bool] = None,  # 是否输出隐藏状态，可选的布尔值
        ):

`.\models\informer\init.py`

# 导入需要的模块和函数
from typing import TYPE_CHECKING
# 从当前包中导入自定义的异常和延迟加载模块
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available

# 定义模块的导入结构，包括配置和模型信息
_import_structure = {
    "configuration_informer": [
        "INFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP",
        "InformerConfig",
    ],
}

# 检查是否导入了 torch 库，如果没有则抛出 OptionalDependencyNotAvailable 异常
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # 如果导入了 torch，则添加额外的模型信息到导入结构中
    _import_structure["modeling_informer"] = [
        "INFORMER_PRETRAINED_MODEL_ARCHIVE_LIST",
        "InformerForPrediction",
        "InformerModel",
        "InformerPreTrainedModel",
    ]

# 如果是类型检查阶段，则导入配置和模型相关的类型信息
if TYPE_CHECKING:
    from .configuration_informer import INFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, InformerConfig

    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .modeling_informer import (
            INFORMER_PRETRAINED_MODEL_ARCHIVE_LIST,
            InformerForPrediction,
            InformerModel,
            InformerPreTrainedModel,
        )

# 如果不是类型检查阶段，则创建 LazyModule 对象，并将其作为当前模块的属性
else:
    import sys

    # 创建 LazyModule 对象，并设置当前模块的属性为该对象，实现延迟加载模块
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)

`.\models\instructblip\configuration_instructblip.py`

# coding=utf-8
# 定义脚本编码格式为 UTF-8

# 版权声明，此代码版权归 HuggingFace Inc. 团队所有，保留所有权利
# 根据 Apache License, Version 2.0 许可证使用此文件，除非符合许可证的规定，否则不得使用此文件
# 可以在以下网址获取许可证副本：http://www.apache.org/licenses/LICENSE-2.0
#
# 除非法律有明确规定或书面同意，否则依据此许可证分发的软件是基于“原样”提供的，无任何明示或暗示的担保或条件
# 请查看许可证以获取特定语言的详细信息

""" InstructBLIP model configuration"""

# 导入操作系统模块
import os
# 导入 Union 类型提示
from typing import Union

# 导入配置工具函数
from ...configuration_utils import PretrainedConfig
# 导入模型映射名称
from ...models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
# 导入日志工具
from ...utils import logging
# 导入自动配置映射
from ..auto import CONFIG_MAPPING

# 获取 logger 对象
logger = logging.get_logger(__name__)

# 定义预训练配置存档映射字典
INSTRUCTBLIP_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "Salesforce/instruct-blip-flan-t5": "https://huggingface.co/Salesforce/instruct-blip-flan-t5/resolve/main/config.json",
}

# 定义 InstructBlipVisionConfig 类，继承自 PretrainedConfig 类
class InstructBlipVisionConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`InstructBlipVisionModel`]. It is used to
    instantiate a InstructBLIP vision encoder according to the specified arguments, defining the model architecture.
    Instantiating a configuration defaults will yield a similar configuration to that of the InstructBLIP
    [Salesforce/instruct-blip-flan-t5](https://huggingface.co/Salesforce/instruct-blip-flan-t5) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.
    # 模型类型字符串，用于指示模型类型
    model_type = "instructblip_vision_model"
    
    # 初始化方法，用于创建一个新的InstructBlipVisionConfig对象
    def __init__(
        self,
        # 编码器层和池化层的维度大小，默认为1408
        hidden_size=1408,
        # Transformer编码器中“中间”（即前馈）层的维度大小，默认为6144
        intermediate_size=6144,
        # Transformer编码器中隐藏层的数量，默认为39
        num_hidden_layers=39,
        # Transformer编码器中每个注意力层的注意力头数量，默认为16
        num_attention_heads=16,
        # 每个图像的分辨率大小，默认为224
        image_size=224,
        # 每个图像分块的分辨率大小，默认为14
        patch_size=14,
        # 编码器和池化器中的非线性激活函数（函数或字符串），支持"gelu"、"relu"、"selu"和"gelu_new"，默认为"gelu"
        hidden_act="gelu",
        # 层归一化层使用的epsilon值，默认为1e-6
        layer_norm_eps=1e-6,
        # 注意力概率的dropout比率，默认为0.0
        attention_dropout=0.0,
        # 用于初始化所有权重矩阵的截断正态分布的标准差，默认为1e-10
        initializer_range=1e-10,
        # 是否在自注意力层中添加查询和值的偏置，默认为True
        qkv_bias=True,
        **kwargs,
    ):
        # 调用父类的初始化方法，传递所有的关键字参数
        super().__init__(**kwargs)

        # 设置隐藏层大小
        self.hidden_size = hidden_size
        # 设置中间层大小
        self.intermediate_size = intermediate_size
        # 设置隐藏层的数量
        self.num_hidden_layers = num_hidden_layers
        # 设置注意力头的数量
        self.num_attention_heads = num_attention_heads
        # 设置图像块的大小
        self.patch_size = patch_size
        # 设置图像的总体大小
        self.image_size = image_size
        # 设置初始化范围
        self.initializer_range = initializer_range
        # 设置注意力机制的dropout率
        self.attention_dropout = attention_dropout
        # 设置层归一化的epsilon值
        self.layer_norm_eps = layer_norm_eps
        # 设置隐藏层激活函数
        self.hidden_act = hidden_act
        # 设置查询/键/值矩阵的偏置
        self.qkv_bias = qkv_bias

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
        # 调用内部方法，将token信息设置到关键字参数中
        cls._set_token_in_kwargs(kwargs)

        # 从预训练模型名称或路径获取配置字典和剩余的关键字参数
        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)

        # 如果配置字典中的模型类型是"instructblip"，则使用其视觉配置字典
        if config_dict.get("model_type") == "instructblip":
            config_dict = config_dict["vision_config"]

        # 如果配置字典中指定了模型类型，并且该类有模型类型属性，并且两者不匹配，则发出警告
        if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
            logger.warning(
                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
                f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
            )

        # 使用配置字典和剩余的关键字参数构建一个新的实例
        return cls.from_dict(config_dict, **kwargs)
# 定义配置类 `InstructBlipQFormerConfig`，继承自 `PretrainedConfig`，用于存储 `InstructBlipQFormerModel` 的配置信息。
class InstructBlipQFormerConfig(PretrainedConfig):
    r"""
    这是一个配置类，用于存储 [`InstructBlipQFormerModel`] 的配置信息。它用于实例化一个 InstructBLIP Querying Transformer (Q-Former) 模型，
    根据指定的参数定义模型架构。使用默认参数实例化配置将产生与 InstructBLIP [Salesforce/instruct-blip-flan-t5](https://huggingface.co/Salesforce/instruct-blip-flan-t5)
    架构类似的配置。配置对象继承自 [`PretrainedConfig`]，可以用于控制模型的输出。阅读 [`PretrainedConfig`] 的文档获取更多信息。

    注意，[`InstructBlipQFormerModel`] 与 [`BertLMHeadModel`] 非常相似，具有交织的交叉注意力。
    """
    # 定义 Q-Former 模型配置的类，包含了模型的各种参数设置
    class InstructBlipQFormerConfig:
        # 词汇表大小，默认为 30522，定义了输入 `inputs_ids` 中可以表示的不同令牌数量
        def __init__(self,
                     vocab_size=30522,
                     # 编码器层和汇聚层的维度
                     hidden_size=768,
                     # Transformer 编码器中的隐藏层数量
                     num_hidden_layers=12,
                     # 每个注意力层中的注意力头数
                     num_attention_heads=12,
                     # Transformer 编码器中 "中间"（通常称为前馈）层的维度
                     intermediate_size=3072,
                     # 编码器和汇聚层中的非线性激活函数
                     hidden_act="gelu",
                     # 嵌入层、编码器和汇聚层中所有全连接层的 dropout 概率
                     hidden_dropout_prob=0.1,
                     # 注意力概率的 dropout 比例
                     attention_probs_dropout_prob=0.1,
                     # 可能用于模型的最大序列长度
                     max_position_embeddings=512,
                     # 初始化所有权重矩阵的截断正态初始化器的标准差
                     initializer_range=0.02,
                     # 层归一化层使用的 epsilon
                     layer_norm_eps=1e-12,
                     # 位置嵌入类型，可以选择 "absolute"、"relative_key" 或 "relative_key_query"
                     position_embedding_type="absolute",
                     # 在 Transformer 层中添加交叉注意力的频率
                     cross_attention_frequency=2,
                     # 交叉注意力中隐藏状态的隐藏大小
                     encoder_hidden_size=1408):
            # 将所有参数赋值给对应的实例变量
            self.vocab_size = vocab_size
            self.hidden_size = hidden_size
            self.num_hidden_layers = num_hidden_layers
            self.num_attention_heads = num_attention_heads
            self.intermediate_size = intermediate_size
            self.hidden_act = hidden_act
            self.hidden_dropout_prob = hidden_dropout_prob
            self.attention_probs_dropout_prob = attention_probs_dropout_prob
            self.max_position_embeddings = max_position_embeddings
            self.initializer_range = initializer_range
            self.layer_norm_eps = layer_norm_eps
            self.position_embedding_type = position_embedding_type
            self.cross_attention_frequency = cross_attention_frequency
            self.encoder_hidden_size = encoder_hidden_size
    # 设定模型类型为 "instructblip_qformer"
    model_type = "instructblip_qformer"

    # 初始化方法，定义模型配置的各个参数
    def __init__(
        self,
        vocab_size=30522,
        hidden_size=768,
        num_hidden_layers=12,
        num_attention_heads=12,
        intermediate_size=3072,
        hidden_act="gelu",
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        max_position_embeddings=512,
        initializer_range=0.02,
        layer_norm_eps=1e-12,
        pad_token_id=0,
        position_embedding_type="absolute",
        cross_attention_frequency=2,
        encoder_hidden_size=1408,
        **kwargs,
    ):
        # 调用父类的初始化方法，设置 pad_token_id 参数
        super().__init__(pad_token_id=pad_token_id, **kwargs)

        # 初始化对象的各个属性，以提供模型配置的详细信息
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.hidden_act = hidden_act
        self.intermediate_size = intermediate_size
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.max_position_embeddings = max_position_embeddings
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
        self.position_embedding_type = position_embedding_type
        self.cross_attention_frequency = cross_attention_frequency
        self.encoder_hidden_size = encoder_hidden_size

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
        # 设置关键字参数中的 token
        cls._set_token_in_kwargs(kwargs)

        # 获取预训练模型的配置字典和可能的额外参数
        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)

        # 如果配置字典中的模型类型为 "instructblip"，则从中提取 qformer_config
        if config_dict.get("model_type") == "instructblip":
            config_dict = config_dict["qformer_config"]

        # 如果配置字典中的模型类型与当前类的模型类型不匹配，发出警告
        if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
            logger.warning(
                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
                f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
            )

        # 使用配置字典和关键字参数创建类的实例
        return cls.from_dict(config_dict, **kwargs)
class InstructBlipConfig(PretrainedConfig):
    r"""
    [`InstructBlipConfig`] is the configuration class to store the configuration of a
    [`InstructBlipForConditionalGeneration`]. It is used to instantiate a InstructBLIP model according to the specified
    arguments, defining the vision model, Q-Former model and language model configs. Instantiating a configuration with
    the defaults will yield a similar configuration to that of the InstructBLIP
    [Salesforce/instruct-blip-flan-t5](https://huggingface.co/Salesforce/instruct-blip-flan-t5) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vision_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize [`InstructBlipVisionConfig`].
        qformer_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize [`InstructBlipQFormerConfig`].
        text_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize any [`PretrainedConfig`].
        num_query_tokens (`int`, *optional*, defaults to 32):
            The number of query tokens passed through the Transformer.

        kwargs (*optional*):
            Dictionary of keyword arguments.

    Example:

    ```
    >>> from transformers import (
    ...     InstructBlipVisionConfig,
    ...     InstructBlipQFormerConfig,
    ...     OPTConfig,
    ...     InstructBlipConfig,
    ...     InstructBlipForConditionalGeneration,
    ... )

    >>> # Initializing a InstructBlipConfig with Salesforce/instruct-blip-flan-t5 style configuration
    >>> configuration = InstructBlipConfig()

    >>> # Initializing a InstructBlipForConditionalGeneration (with random weights) from the Salesforce/instruct-blip-flan-t5 style configuration
    >>> model = InstructBlipForConditionalGeneration(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config

    >>> # We can also initialize a InstructBlipConfig from a InstructBlipVisionConfig, InstructBlipQFormerConfig and any PretrainedConfig

    >>> # Initializing InstructBLIP vision, InstructBLIP Q-Former and language model configurations
    >>> vision_config = InstructBlipVisionConfig()
    >>> qformer_config = InstructBlipQFormerConfig()
    >>> text_config = OPTConfig()

    >>> config = InstructBlipConfig.from_text_vision_configs(vision_config, qformer_config, text_config)
    ```"""

    # 设置模型类型为 "instructblip"
    model_type = "instructblip"
    # 定义类的初始化方法，接受多个配置参数和其他关键字参数
    def __init__(self, vision_config=None, qformer_config=None, text_config=None, num_query_tokens=32, **kwargs):
        # 调用父类的初始化方法
        super().__init__(**kwargs)

        # 如果视觉配置为空，则使用空字典，并记录日志信息
        if vision_config is None:
            vision_config = {}
            logger.info("vision_config is None. initializing the InstructBlipVisionConfig with default values.")

        # 如果Q-Former配置为空，则使用空字典，并记录日志信息
        if qformer_config is None:
            qformer_config = {}
            logger.info("qformer_config is None. Initializing the InstructBlipQFormerConfig with default values.")

        # 如果文本配置为空，则使用空字典，并记录日志信息
        if text_config is None:
            text_config = {}
            logger.info("text_config is None. Initializing the text config with default values (`OPTConfig`).")

        # 根据传入的配置参数实例化视觉配置对象
        self.vision_config = InstructBlipVisionConfig(**vision_config)
        # 根据传入的配置参数实例化Q-Former配置对象
        self.qformer_config = InstructBlipQFormerConfig(**qformer_config)
        
        # 获取文本模型的类型，若未指定则默认为"opt"
        text_model_type = text_config["model_type"] if "model_type" in text_config else "opt"
        # 根据文本模型类型选择对应的配置类实例化文本配置对象
        self.text_config = CONFIG_MAPPING[text_model_type](**text_config)

        # 设置词嵌入是否共享的标志
        self.tie_word_embeddings = self.text_config.tie_word_embeddings
        # 设置是否为编码器-解码器模型的标志
        self.is_encoder_decoder = self.text_config.is_encoder_decoder

        # 设置查询令牌的数量
        self.num_query_tokens = num_query_tokens
        # 将Q-Former的编码器隐藏层大小设置为视觉配置的隐藏层大小
        self.qformer_config.encoder_hidden_size = self.vision_config.hidden_size
        # 根据文本模型类型判断是否只使用解码器作为语言模型
        self.use_decoder_only_language_model = self.text_config.model_type in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
        # 初始化因子
        self.initializer_factor = 1.0
        # 初始化范围
        self.initializer_range = 0.02

    @classmethod
    # 类方法：从给定的视觉配置、Q-Former配置和文本配置参数实例化一个InstructBlipConfig（或其派生类）对象
    def from_vision_qformer_text_configs(
        cls,
        vision_config: InstructBlipVisionConfig,
        qformer_config: InstructBlipQFormerConfig,
        text_config: PretrainedConfig,
        **kwargs,
    ):
        r"""
        Instantiate a [`InstructBlipConfig`] (or a derived class) from a InstructBLIP vision model, Q-Former and
        language model configurations.

        Returns:
            [`InstructBlipConfig`]: An instance of a configuration object
        """

        # 调用类的构造函数，传入各配置对象的字典形式和其他关键字参数
        return cls(
            vision_config=vision_config.to_dict(),
            qformer_config=qformer_config.to_dict(),
            text_config=text_config.to_dict(),
            **kwargs,
        )

`.\models\instructblip\convert_instructblip_original_to_pytorch.py`

# coding=utf-8
# 设置文件编码为UTF-8，确保支持中文和其他非ASCII字符

# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
# 版权声明，保留所有权利

# Licensed under the Apache License, Version 2.0 (the "License");
# 根据Apache许可证2.0版授权，详细内容可访问指定URL查看

# you may not use this file except in compliance with the License.
# 您除非遵守许可证，否则不能使用本文件

# You may obtain a copy of the License at
# 您可以在上述URL获取许可证的副本

# http://www.apache.org/licenses/LICENSE-2.0
# 许可证URL地址

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# 除非适用法律要求或书面同意，否则根据许可证分发的软件是基于"AS IS"的基础分发，没有任何明示或暗示的担保或条件

# See the License for the specific language governing permissions and
# limitations under the License.
# 请查阅许可证，了解权限和限制

"""
Convert InstructBLIP checkpoints from the original repository.

URL: https://github.com/salesforce/LAVIS/tree/main/projects/instructblip
"""
# 脚本说明，用于从原始存储库转换InstructBLIP检查点，附带原始存储库URL链接

import argparse
# 导入命令行参数解析模块

import requests
# 导入处理HTTP请求的模块

import torch
# 导入PyTorch深度学习库

# pip3 install salesforce-lavis
# 安装salesforce-lavis库的说明注释

# I'm actually installing a slightly modified version: pip3 install git+https://github.com/nielsrogge/LAVIS.git@fix_lavis_float32 (there's also the fix_lavis branch)
# also note: to convert Vicuna checkpoints, we had to include /home/niels/python_projects/checkpoints/FastChat/vicuna-7b in lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml
# same for Vicuna-13b
from lavis.models import load_model_and_preprocess
# 从lavis.models模块导入load_model_and_preprocess函数

from PIL import Image
# 导入Python Imaging Library模块中的Image类

from transformers import (
    AutoTokenizer,
    BlipImageProcessor,
    InstructBlipConfig,
    InstructBlipForConditionalGeneration,
    InstructBlipProcessor,
    InstructBlipQFormerConfig,
    InstructBlipVisionConfig,
    LlamaConfig,
    LlamaTokenizerFast,
    T5Config,
    T5TokenizerFast,
)
# 从transformers库导入多个类和配置，用于自然语言处理和模型处理

from transformers.utils.constants import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
# 导入常量定义，用于处理OpenAI的相关内容

def load_demo_image():
    url = "https://raw.githubusercontent.com/salesforce/LAVIS/main/docs/_static/Confusing-Pictures.jpg"
    # 指定图片的URL地址

    image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
    # 通过HTTP请求获取图片流，并转换为RGB格式的图像对象

    return image
    # 返回加载的图像对象

# here we list all keys to be renamed (original name on the left, our name on the right)
# 此处列出需要重命名的所有键（原始名称在左侧，我们的名称在右侧）

def create_rename_keys(config):
    rename_keys = []
    # 创建一个空列表，用于存储重命名的键值对

    # fmt: off
    # 关闭代码格式化功能，用于防止对后续部分的自动格式化

    # vision encoder
    rename_keys.append(("visual_encoder.cls_token", "vision_model.embeddings.class_embedding"))
    # 将键 "visual_encoder.cls_token" 重命名为 "vision_model.embeddings.class_embedding"

    rename_keys.append(("visual_encoder.pos_embed", "vision_model.embeddings.position_embedding"))
    # 将键 "visual_encoder.pos_embed" 重命名为 "vision_model.embeddings.position_embedding"

    rename_keys.append(("visual_encoder.patch_embed.proj.weight", "vision_model.embeddings.patch_embedding.weight"))
    # 将键 "visual_encoder.patch_embed.proj.weight" 重命名为 "vision_model.embeddings.patch_embedding.weight"

    rename_keys.append(("visual_encoder.patch_embed.proj.bias", "vision_model.embeddings.patch_embedding.bias"))
    # 将键 "visual_encoder.patch_embed.proj.bias" 重命名为 "vision_model.embeddings.patch_embedding.bias"

    rename_keys.append(("ln_vision.weight", "vision_model.post_layernorm.weight"))
    # 将键 "ln_vision.weight" 重命名为 "vision_model.post_layernorm.weight"

    rename_keys.append(("ln_vision.bias", "vision_model.post_layernorm.bias"))
    # 将键 "ln_vision.bias" 重命名为 "vision_model.post_layernorm.bias"
    # 遍历 vision_config 中的隐藏层数量次数，生成需要重命名的键值对列表
    for i in range(config.vision_config.num_hidden_layers):
        # 重命名 visual_encoder.blocks 中第 i 层的 norm1.weight 到 vision_model.encoder.layers 中对应层的 layer_norm1.weight
        rename_keys.append((f"visual_encoder.blocks.{i}.norm1.weight", f"vision_model.encoder.layers.{i}.layer_norm1.weight"))
        # 重命名 visual_encoder.blocks 中第 i 层的 norm1.bias 到 vision_model.encoder.layers 中对应层的 layer_norm1.bias
        rename_keys.append((f"visual_encoder.blocks.{i}.norm1.bias", f"vision_model.encoder.layers.{i}.layer_norm1.bias"))
        # 重命名 visual_encoder.blocks 中第 i 层的 norm2.weight 到 vision_model.encoder.layers 中对应层的 layer_norm2.weight
        rename_keys.append((f"visual_encoder.blocks.{i}.norm2.weight", f"vision_model.encoder.layers.{i}.layer_norm2.weight"))
        # 重命名 visual_encoder.blocks 中第 i 层的 norm2.bias 到 vision_model.encoder.layers 中对应层的 layer_norm2.bias
        rename_keys.append((f"visual_encoder.blocks.{i}.norm2.bias", f"vision_model.encoder.layers.{i}.layer_norm2.bias"))
        # 重命名 visual_encoder.blocks 中第 i 层的 attn.qkv.weight 到 vision_model.encoder.layers 中对应层的 self_attn.qkv.weight
        rename_keys.append((f"visual_encoder.blocks.{i}.attn.qkv.weight", f"vision_model.encoder.layers.{i}.self_attn.qkv.weight"))
        # 重命名 visual_encoder.blocks 中第 i 层的 attn.proj.weight 到 vision_model.encoder.layers 中对应层的 self_attn.projection.weight
        rename_keys.append((f"visual_encoder.blocks.{i}.attn.proj.weight", f"vision_model.encoder.layers.{i}.self_attn.projection.weight",))
        # 重命名 visual_encoder.blocks 中第 i 层的 attn.proj.bias 到 vision_model.encoder.layers 中对应层的 self_attn.projection.bias
        rename_keys.append((f"visual_encoder.blocks.{i}.attn.proj.bias", f"vision_model.encoder.layers.{i}.self_attn.projection.bias"))
        # 重命名 visual_encoder.blocks 中第 i 层的 mlp.fc1.weight 到 vision_model.encoder.layers 中对应层的 mlp.fc1.weight
        rename_keys.append((f"visual_encoder.blocks.{i}.mlp.fc1.weight", f"vision_model.encoder.layers.{i}.mlp.fc1.weight"))
        # 重命名 visual_encoder.blocks 中第 i 层的 mlp.fc1.bias 到 vision_model.encoder.layers 中对应层的 mlp.fc1.bias
        rename_keys.append((f"visual_encoder.blocks.{i}.mlp.fc1.bias", f"vision_model.encoder.layers.{i}.mlp.fc1.bias"))
        # 重命名 visual_encoder.blocks 中第 i 层的 mlp.fc2.weight 到 vision_model.encoder.layers 中对应层的 mlp.fc2.weight
        rename_keys.append((f"visual_encoder.blocks.{i}.mlp.fc2.weight", f"vision_model.encoder.layers.{i}.mlp.fc2.weight"))
        # 重命名 visual_encoder.blocks 中第 i 层的 mlp.fc2.bias 到 vision_model.encoder.layers 中对应层的 mlp.fc2.bias
    
    # QFormer 的特定重命名
    rename_keys.append(("Qformer.bert.embeddings.LayerNorm.weight", "qformer.embeddings.layernorm.weight"))
    rename_keys.append(("Qformer.bert.embeddings.LayerNorm.bias", "qformer.embeddings.layernorm.bias"))
    
    # 返回所有生成的重命名键值对列表
    return rename_keys
# 从字典中删除旧键，并将其对应的值赋给变量 val
def rename_key(dct, old, new):
    val = dct.pop(old)
    # 将旧键的值添加到字典中作为新键的值
    dct[new] = val


# 从状态字典中读取 Q、V 偏置，并设置到新的位置
def read_in_q_v_bias(state_dict, config):
    # 遍历视觉模型的隐藏层次数
    for i in range(config.vision_config.num_hidden_layers):
        # 从状态字典中移除原始 Q 偏置和 V 偏置
        q_bias = state_dict.pop(f"visual_encoder.blocks.{i}.attn.q_bias")
        v_bias = state_dict.pop(f"visual_encoder.blocks.{i}.attn.v_bias")

        # 创建新的 QKV 偏置，按照特定顺序连接 Q 偏置、零张量、V 偏置
        qkv_bias = torch.cat((q_bias, torch.zeros_like(v_bias, requires_grad=False), v_bias))
        # 将新的 QKV 偏置设置到状态字典中的相应位置
        state_dict[f"vision_model.encoder.layers.{i}.self_attn.qkv.bias"] = qkv_bias


# 根据模型名称获取相应的配置信息和图像大小
def get_blip2_config(model_name):
    # 根据模型名称确定图像大小
    image_size = 364 if "coco" in model_name else 224
    # 获取视觉配置信息并转换为字典格式
    vision_config = InstructBlipVisionConfig(image_size=image_size).to_dict()

    # 根据模型名称选择文本配置信息
    if "t5-xl" in model_name:
        text_config = T5Config.from_pretrained("google/flan-t5-xl", dense_act_fn="gelu", bos_token_id=1).to_dict()
    elif "t5-xxl" in model_name:
        text_config = T5Config.from_pretrained("google/flan-t5-xxl", dense_act_fn="gelu", bos_token_id=1).to_dict()
    elif "vicuna-7b" in model_name:
        text_config = LlamaConfig.from_pretrained("decapoda-research/llama-7b-hf", vocab_size=32001).to_dict()
    elif "vicuna-13b" in model_name:
        text_config = LlamaConfig.from_pretrained("decapoda-research/llama-13b-hf", vocab_size=32001).to_dict()
    else:
        # 若模型名称不受支持，则引发值错误
        raise ValueError("Model name not supported")

    # Q-Former 模型配置信息，包含特殊的 "[DEC]" 标记，词汇大小为 30522 + 1
    qformer_config = InstructBlipQFormerConfig(vocab_size=30523).to_dict()
    # 构建并返回包含所有配置信息的 InstructBlipConfig 对象
    config = InstructBlipConfig(vision_config=vision_config, text_config=text_config, qformer_config=qformer_config)

    return config, image_size


# 使用 Torch 不计梯度上下文管理器，将 BLIP2 模型权重转换为 Transformers 设计
@torch.no_grad()
def convert_blip2_checkpoint(model_name, pytorch_dump_folder_path=None, push_to_hub=False):
    """
    Copy/paste/tweak model's weights to Transformers design.
    """
    # 使用 AutoTokenizer 从预训练模型中加载 Q-Former 令牌器，并添加特殊标记 "[DEC]"
    qformer_tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased", truncation_side="left")
    qformer_tokenizer.add_special_tokens({"bos_token": "[DEC]"})

    # 根据模型名称选择相应的分词器
    if "t5" in model_name:
        tokenizer = T5TokenizerFast.from_pretrained("google/flan-t5-xl", truncation_side="left")
    elif "vicuna" in model_name:
        # 如果模型名称包含"vicuna"
        
        # 使用快速加载LLAMA tokenizer，设定左截断、BOS和UNK特殊token
        tokenizer = LlamaTokenizerFast.from_pretrained(
            "huggyllama/llama-7b", truncation_side="left", bos_token="</s>", unk_token="</s>"
        )
        
        # 添加[PAD]特殊token到tokenizer
        tokenizer.add_special_tokens({"pad_token": "[PAD]"})

    # 获取BLIP2模型配置和图像尺寸
    config, image_size = get_blip2_config(model_name)
    
    # 加载BLIP2模型进行推理
    hf_model = InstructBlipForConditionalGeneration(config).eval()

    # 将模型名称映射到原始模型名称和类型
    model_name_to_original = {
        "instructblip-vicuna-7b": ("blip2_vicuna_instruct", "vicuna7b"),
        "instructblip-vicuna-13b": ("blip2_vicuna_instruct", "vicuna13b"),
        "instructblip-flan-t5-xl": ("blip2_t5_instruct", "flant5xl"),
        "instructblip-flan-t5-xxl": ("blip2_t5_instruct", "flant5xxl"),
    }

    # 根据模型名称获取原始模型名称和类型
    name, type = model_name_to_original[model_name]

    # 加载原始模型
    print("Loading original model...")
    # 检查GPU是否可用，选择合适的设备
    hf_model_device = "cuda:1" if torch.cuda.is_available() else "cpu"
    lavis_device = "cuda:2" if torch.cuda.is_available() else "cpu"
    # 加载模型及预处理器
    original_model, vis_processors, _ = load_model_and_preprocess(
        name=name, model_type=type, is_eval=True, device=lavis_device
    )
    original_model.eval()
    print("Done!")

    # 更新state dict中的键名
    state_dict = original_model.state_dict()
    rename_keys = create_rename_keys(config)
    for src, dest in rename_keys:
        rename_key(state_dict, src, dest)

    # 一些键名可以进行有效的重命名
    for key, val in state_dict.copy().items():
        val = state_dict.pop(key)
        if key.startswith("Qformer.bert"):
            key = key.replace("Qformer.bert", "qformer")
        if "attention.self" in key:
            key = key.replace("self", "attention")
        if "llm_proj" in key:
            key = key.replace("llm_proj", "language_projection")
        if "t5_proj" in key:
            key = key.replace("t5_proj", "language_projection")
        if key.startswith("llm_model"):
            key = key.replace("llm_model", "language_model")
        if key.startswith("t5"):
            key = key.replace("t5", "language")
        state_dict[key] = val

    # 读取qv biases
    read_in_q_v_bias(state_dict, config)

    # 注意: 默认情况下，权重以torch.float32加载
    # 使用state_dict加载模型权重
    hf_model.load_state_dict(state_dict, strict=True)

    # 加载演示图像
    image = load_demo_image()
    # 设定提示语句
    prompt = "What is unusual about this image?"

    # 创建图像处理器
    image_processor = BlipImageProcessor(
        size={"height": image_size, "width": image_size}, image_mean=OPENAI_CLIP_MEAN, image_std=OPENAI_CLIP_STD
    )
    # 创建一个 InstructBlipProcessor 实例，用于处理图像和文本数据
    processor = InstructBlipProcessor(
        image_processor=image_processor,  # 图像处理器对象
        tokenizer=tokenizer,  # 文本 tokenizer 对象
        qformer_tokenizer=qformer_tokenizer,  # qformer tokenizer 对象
    )
    
    # 使用 processor 处理图像和文本数据，将结果转移到指定的 hf_model_device 上
    inputs = processor(images=image, text=prompt, return_tensors="pt").to(hf_model_device)

    # 确保 processor 创建的像素值与原始像素值完全相同
    original_pixel_values = vis_processors["eval"](image).unsqueeze(0).to(lavis_device)
    pixel_values = inputs.pixel_values
    assert torch.allclose(original_pixel_values.to(pixel_values.device), pixel_values)

    # 将 original_model 和 hf_model 移动到指定的设备上
    original_model.to(lavis_device)
    hf_model.to(hf_model_device)
    
    # 使用 torch.no_grad() 上下文管理器，避免计算梯度
    with torch.no_grad():
        # 根据 model_name 的不同选择不同的计算方式
        if "vicuna" in model_name:
            # 使用 original_model 和 hf_model 分别计算 logits
            original_logits = original_model({"image": original_pixel_values, "text_input": [prompt]}).logits
            logits = hf_model(**inputs).logits
        else:
            # 使用 original_model 和 hf_model 分别计算 logits，并为 HF 模型提供额外的文本输出信息
            original_logits = original_model(
                {"image": original_pixel_values, "text_input": [prompt], "text_output": ["\n"]}
            ).logits
            # 生成用于 HF 模型的标签输入 ids
            label_input_ids = tokenizer("\n", return_tensors="pt").input_ids.to(hf_model_device)
            # 将标签中的 pad_token_id 替换为 -100
            labels = label_input_ids.masked_fill(label_input_ids == tokenizer.pad_token_id, -100)
            logits = hf_model(**inputs, labels=labels).logits

    # 打印 original_logits 和 logits 的前几个值
    print("First values of original logits:", original_logits[0, :3, :3])
    print("First values of HF logits:", logits[0, :3, :3])

    # 断言 original_logits 和 logits 的形状相同
    assert original_logits.shape == logits.shape
    # 根据 model_name 的不同设置允许的误差范围
    atol = 1e-4 if "vicuna" in model_name else 1e-5
    # 断言 original_logits 和 logits 的数值在指定的误差范围内相等
    assert torch.allclose(original_logits.to(logits.device), logits, atol=atol)
    print("Looks ok!")

    # 使用 original_model 生成文本输出
    print("Generating with original model...")
    original_outputs = original_model.generate({"image": original_pixel_values, "prompt": prompt}, num_beams=5)

    # 将 HF 模型生成的输出文本
    print("Generating with HF model...")
    outputs = hf_model.generate(
        **inputs,
        do_sample=False,
        num_beams=5,
        max_length=256,
        min_length=1,
        top_p=0.9,
        repetition_penalty=1.5,
        length_penalty=1.0,
        temperature=1,
    )
    # 如果 model_name 包含 "vicuna"，将输出 id 为 0 的位置替换为 2（eos_token_id）
    if "vicuna" in model_name:
        outputs[outputs == 0] = 2
    print("Original generation:", original_outputs)
    
    # 使用 processor 批量解码 HF 模型的输出，跳过特殊 token
    output_text = processor.batch_decode(outputs, skip_special_tokens=True)
    # 去除输出文本中的首尾空格
    output_text = [text.strip() for text in output_text]
    print("HF generation:", output_text)

    # 如果指定了 pytorch_dump_folder_path，保存 processor 和 hf_model 的预训练参数
    if pytorch_dump_folder_path is not None:
        processor.save_pretrained(pytorch_dump_folder_path)
        hf_model.save_pretrained(pytorch_dump_folder_path)

    # 如果指定了 push_to_hub，将 processor 和 hf_model 推送到指定的 Hub 路径
    if push_to_hub:
        processor.push_to_hub(f"Salesforce/{model_name}")
        hf_model.push_to_hub(f"Salesforce/{model_name}")
if __name__ == "__main__":
    # 如果脚本作为主程序运行，则执行以下代码块

    # 创建参数解析器对象
    parser = argparse.ArgumentParser()

    # 定义模型名称的选项列表
    choices = [
        "instructblip-vicuna-7b",
        "instructblip-vicuna-13b",
        "instructblip-flan-t5-xl",
        "instructblip-flan-t5-xxl",
    ]

    # 添加模型名称参数
    parser.add_argument(
        "--model_name",
        default="instructblip-flan-t5-xl",  # 默认模型名称为 instructblip-flan-t5-xl
        choices=choices,  # 可选的模型名称列表
        type=str,
        help="Path to hf config.json of model to convert",  # 参数帮助信息
    )

    # 添加 PyTorch 模型输出文件夹路径参数
    parser.add_argument(
        "--pytorch_dump_folder_path",
        default=None,  # 默认值为 None
        type=str,
        help="Path to the output PyTorch model.",  # 参数帮助信息
    )

    # 添加是否推送到 Hub 的标志参数
    parser.add_argument(
        "--push_to_hub",
        action="store_true",  # 如果设置，则为 True，否则为 False
        help="Whether to push the model and processor to the hub after converting",  # 参数帮助信息
    )

    # 解析命令行参数
    args = parser.parse_args()

    # 调用函数 convert_blip2_checkpoint，传递解析后的参数
    convert_blip2_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub)

`.\models\instructblip\modeling_instructblip.py`

# 设置文件编码为 UTF-8
# 版权声明，标明版权归 Salesforce 作者和 HuggingFace 团队所有，保留所有权利
#
# 根据 Apache 许可证 2.0 版本授权使用本文件；除非遵守许可证的要求，否则不得使用本文件
# 您可以在以下网址获取许可证的副本：
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# 除非适用法律要求或书面同意，否则本软件按“原样”提供，无任何明示或暗示的担保或条件
# 请参阅许可证以获取具体语言规定的权限以及限制
""" PyTorch InstructBLIP model."""

import math  # 导入数学库，用于数学运算
from dataclasses import dataclass  # 导入 dataclass 用于创建数据类
from typing import Any, Optional, Tuple, Union  # 导入类型提示

import torch  # 导入 PyTorch 库
import torch.utils.checkpoint  # 导入 PyTorch 的 checkpoint 模块
from torch import nn  # 从 PyTorch 导入神经网络模块
from torch.nn import CrossEntropyLoss  # 导入交叉熵损失函数

from ...activations import ACT2FN  # 导入激活函数
from ...modeling_outputs import (  # 导入模型输出相关类
    BaseModelOutput,
    BaseModelOutputWithPastAndCrossAttentions,
    BaseModelOutputWithPooling,
    BaseModelOutputWithPoolingAndCrossAttentions,
)
from ...modeling_utils import PreTrainedModel  # 导入预训练模型工具类
from ...pytorch_utils import (  # 导入 PyTorch 工具类
    apply_chunking_to_forward,
    find_pruneable_heads_and_indices,
    prune_linear_layer,
)
from ...utils import (  # 导入工具函数
    ModelOutput,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
from ..auto import AutoModelForCausalLM, AutoModelForSeq2SeqLM  # 导入自动模型类
from .configuration_instructblip import InstructBlipConfig, InstructBlipQFormerConfig, InstructBlipVisionConfig  # 导入配置类


logger = logging.get_logger(__name__)  # 获取 logger 对象用于日志记录

_CHECKPOINT_FOR_DOC = "Salesforce/instructblip-flan-t5-xl"  # 指定用于文档的检查点名称

INSTRUCTBLIP_PRETRAINED_MODEL_ARCHIVE_LIST = [  # 预训练模型存档列表
    "Salesforce/instructblip-flan-t5-xl",
    # 查看所有 InstructBLIP 模型：https://huggingface.co/models?filter=instructblip
]


@dataclass  # 使用 dataclass 装饰器创建数据类
# 从 transformers.models.blip_2.modeling_blip_2.Blip2ForConditionalGenerationModelOutput 复制并将 Blip2 改为 InstructBlip
class InstructBlipForConditionalGenerationModelOutput(ModelOutput):
    """
    Class defining the outputs of [`InstructBlipForConditionalGeneration`].

    Args:
        loss (`torch.FloatTensor`, *optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
            Language modeling loss from the language model.
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head of the language model.
        vision_outputs (`BaseModelOutputWithPooling`):
            Outputs of the vision encoder.
        qformer_outputs (`BaseModelOutputWithPoolingAndCrossAttentions`):
            Outputs of the Q-Former (Querying Transformer).
        language_model_outputs (`CausalLMOutputWithPast` or `Seq2SeqLMOutput`):
            Outputs of the language model.
    """

    loss: Optional[Tuple[torch.FloatTensor]] = None  # 语言模型损失，当提供标签时返回
    logits: Optional[Tuple[torch.FloatTensor]] = None  # 语言模型头部的预测分数
    vision_outputs: Optional[BaseModelOutputWithPooling] = None  # 视觉编码器的输出
    qformer_outputs: Optional[BaseModelOutputWithPoolingAndCrossAttentions] = None  # Q-Former 的输出（包含交叉注意力）
    language_model_outputs: Optional[Union[CausalLMOutputWithPast, Seq2SeqLMOutput]] = None  # 语言模型的输出
    # 声明并初始化可选的 torch.FloatTensor 类型的变量，分别用于视觉、问题转换器和语言模型的输出
    vision_outputs: Optional[torch.FloatTensor] = None
    qformer_outputs: Optional[Tuple[torch.FloatTensor]] = None
    language_model_outputs: Optional[Tuple[torch.FloatTensor]] = None

    # 定义一个方法，将对象转换为元组
    def to_tuple(self) -> Tuple[Any]:
        # 返回对象的元组形式，其中包括所有属性的值，但排除了特定的输出属性
        return tuple(
            self[k]  # 如果属性 k 不是特定的输出属性，则直接取其值
            if k not in ["vision_outputs", "qformer_outputs", "language_model_outputs"]
            else getattr(self, k).to_tuple()  # 如果属性 k 是特定的输出属性，则调用其 to_tuple() 方法
            for k in self.keys()  # 对于对象的所有属性进行处理
        )
# Copied from transformers.models.blip.modeling_blip.BlipVisionEmbeddings with Blip->InstructBlip
class InstructBlipVisionEmbeddings(nn.Module):
    def __init__(self, config: InstructBlipVisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size  # 设置嵌入维度为隐藏大小
        self.image_size = config.image_size  # 图像大小配置
        self.patch_size = config.patch_size  # 图像块大小配置

        self.class_embedding = nn.Parameter(torch.randn(1, 1, self.embed_dim))  # 类别嵌入参数，形状为[1, 1, embed_dim]

        # 图像块嵌入，使用2D卷积进行处理
        self.patch_embedding = nn.Conv2d(
            in_channels=3, out_channels=self.embed_dim, kernel_size=self.patch_size, stride=self.patch_size
        )

        self.num_patches = (self.image_size // self.patch_size) ** 2  # 计算图像中的块数
        self.num_positions = self.num_patches + 1  # 加上一个位置用于类别嵌入

        self.position_embedding = nn.Parameter(torch.randn(1, self.num_positions, self.embed_dim))  # 位置嵌入参数

    def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
        batch_size = pixel_values.shape[0]  # 获取批量大小
        target_dtype = self.patch_embedding.weight.dtype  # 获取目标数据类型
        patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))  # 对输入像素值进行块嵌入处理
        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)  # 展平并转置以适应后续的拼接操作

        class_embeds = self.class_embedding.expand(batch_size, 1, -1).to(target_dtype)  # 扩展类别嵌入以适应批量处理
        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)  # 拼接类别嵌入和块嵌入
        embeddings = embeddings + self.position_embedding[:, : embeddings.size(1), :].to(target_dtype)  # 添加位置嵌入
        return embeddings


# Copied from transformers.models.blip_2.modeling_blip_2.Blip2Attention with Blip2->InstructBlip
class InstructBlipAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size  # 设置嵌入维度为隐藏大小
        self.num_heads = config.num_attention_heads  # 注意力头的数量
        self.head_dim = self.embed_dim // self.num_heads  # 每个注意力头的维度
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: "
                f"{self.num_heads})."
            )
        self.scale = self.head_dim**-0.5  # 缩放因子
        self.dropout = nn.Dropout(config.attention_dropout)  # 注意力机制的dropout率设置

        # 无偏置的线性层用于计算Q, K, V
        self.qkv = nn.Linear(self.embed_dim, 3 * self.embed_dim, bias=False)

        if config.qkv_bias:
            q_bias = nn.Parameter(torch.zeros(self.embed_dim))  # Q偏置参数
            v_bias = nn.Parameter(torch.zeros(self.embed_dim))  # V偏置参数
        else:
            q_bias = None
            v_bias = None

        if q_bias is not None:
            qkv_bias = torch.cat((q_bias, torch.zeros_like(v_bias, requires_grad=False), v_bias))
            self.qkv.bias = nn.Parameter(qkv_bias)  # 设置QKV线性层的偏置参数

        self.projection = nn.Linear(self.embed_dim, self.embed_dim)  # 线性投影层用于最终输出
    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        # 将输入张量重新形状为 (bsz, seq_len, num_heads, head_dim)，并交换维度顺序
        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """Input shape: Batch x Time x Channel"""

        # 获取隐藏状态张量的维度信息
        bsz, tgt_len, embed_dim = hidden_states.size()

        # 使用 self.qkv 对隐藏状态张量进行线性变换，产生混合的查询、键、值张量
        mixed_qkv = self.qkv(hidden_states)

        # 将混合的查询、键、值张量重新形状为 (bsz, tgt_len, 3, num_heads, embed_dim // num_heads) 并置换维度
        mixed_qkv = mixed_qkv.reshape(bsz, tgt_len, 3, self.num_heads, embed_dim // self.num_heads).permute(
            2, 0, 3, 1, 4
        )
        query_states, key_states, value_states = mixed_qkv[0], mixed_qkv[1], mixed_qkv[2]

        # 计算注意力分数，采用查询张量和键张量的点积
        attention_scores = torch.matmul(query_states, key_states.transpose(-1, -2))

        # 缩放注意力分数
        attention_scores = attention_scores * self.scale

        # 对注意力分数进行 softmax 归一化，得到注意力概率
        attention_probs = nn.functional.softmax(attention_scores, dim=-1)

        # 对注意力概率进行 dropout 处理
        attention_probs = self.dropout(attention_probs)

        # 如果给定了 head_mask，则将注意力概率与 head_mask 相乘
        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        # 计算上下文层，采用注意力概率与值张量的乘积，然后置换维度
        context_layer = torch.matmul(attention_probs, value_states).permute(0, 2, 1, 3)

        # 将上下文层重新形状为与原始嵌入维度相匹配的形状
        new_context_layer_shape = context_layer.size()[:-2] + (self.embed_dim,)
        context_layer = context_layer.reshape(new_context_layer_shape)

        # 将上下文层传递给投影层进行线性变换，得到最终输出
        output = self.projection(context_layer)

        # 如果需要输出注意力分数，则将其包含在输出元组中
        outputs = (output, attention_probs) if output_attentions else (output, None)

        return outputs
# Copied from transformers.models.blip.modeling_blip.BlipMLP
class InstructBlipMLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        # 选择激活函数，从配置中获取并赋值给对象
        self.activation_fn = ACT2FN[config.hidden_act]
        # 创建第一个全连接层，输入维度为隐藏大小，输出维度为中间大小
        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
        # 创建第二个全连接层，输入维度为中间大小，输出维度为隐藏大小
        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # 将输入的隐藏状态传递给第一个全连接层
        hidden_states = self.fc1(hidden_states)
        # 应用预先选择的激活函数到第一个全连接层的输出
        hidden_states = self.activation_fn(hidden_states)
        # 将激活后的输出传递给第二个全连接层
        hidden_states = self.fc2(hidden_states)
        return hidden_states


# Copied from transformers.models.blip.modeling_blip.BlipEncoderLayer with Blip->InstructBlip
class InstructBlipEncoderLayer(nn.Module):
    def __init__(self, config: InstructBlipConfig):
        super().__init__()
        self.embed_dim = config.hidden_size
        # 初始化自注意力层，使用给定的配置对象
        self.self_attn = InstructBlipAttention(config)
        # 初始化第一个层归一化层，输入维度为嵌入维度，使用给定的层归一化参数
        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
        # 初始化MLP模块，使用给定的配置对象
        self.mlp = InstructBlipMLP(config)
        # 初始化第二个层归一化层，输入维度为嵌入维度，使用给定的层归一化参数
        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.FloatTensor]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers.
        """
        residual = hidden_states

        # 对输入进行第一个层归一化操作
        hidden_states = self.layer_norm1(hidden_states)
        # 将归一化后的输入传递给自注意力层，并返回注意力权重
        hidden_states, attn_weights = self.self_attn(
            hidden_states=hidden_states,
            head_mask=attention_mask,
            output_attentions=output_attentions,
        )
        # 将自注意力层的输出与残差连接
        hidden_states = hidden_states + residual
        residual = hidden_states
        # 对连接后的输出进行第二个层归一化操作
        hidden_states = self.layer_norm2(hidden_states)
        # 将第二层归一化后的输出传递给MLP模块
        hidden_states = self.mlp(hidden_states)

        # 将MLP的输出与残差连接
        hidden_states = hidden_states + residual

        outputs = (hidden_states,)

        # 如果需要输出注意力权重，将其添加到输出中
        if output_attentions:
            outputs += (attn_weights,)

        return outputs


class InstructBlipPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    # 指定用于配置的类
    config_class = InstructBlipConfig
    # 指定基础模型前缀名称
    base_model_prefix = "blip"
    # 支持梯度检查点
    supports_gradient_checkpointing = True
    _no_split_modules = [
        "InstructBlipQFormerEmbeddings",  # 列出不需要拆分的模块名称
        "InstructBlipAttention",          # 列出不需要拆分的模块名称
        "InstructBlipQFormerMultiHeadAttention",  # 列出不需要拆分的模块名称
        "InstructBlipQFormerSelfOutput",  # 列出不需要拆分的模块名称
    ]
    _keep_in_fp32_modules = []  # 留空列表，用于保留需要在FP32精度下操作的模块
    
    # 从 transformers.models.blip_2.modeling_blip_2.Blip2PreTrainedModel._init_weights 复制而来，将Blip2替换为InstructBlip
    def _init_weights(self, module):
        """初始化权重"""
        factor = self.config.initializer_range
        if isinstance(module, nn.Conv2d) or isinstance(module, nn.Embedding) or isinstance(module, nn.Linear):
            # 如果是卷积层、嵌入层或线性层，使用正态分布初始化权重，偏置置零
            module.weight.data.normal_(mean=0.0, std=factor)
            if hasattr(module, "bias") and module.bias is not None:
                module.bias.data.zero_()
    
        if isinstance(module, InstructBlipVisionEmbeddings):
            if hasattr(self.config, "vision_config"):
                factor = self.config.vision_config.initializer_range
            # 如果是InstructBlipVisionEmbeddings类型，使用截断正态分布初始化位置嵌入和类别嵌入
            nn.init.trunc_normal_(module.position_embedding, mean=0.0, std=factor)
            nn.init.trunc_normal_(module.class_embedding, mean=0.0, std=factor)
    
        elif isinstance(module, nn.LayerNorm):
            # 如果是LayerNorm层，偏置置零，权重置为1.0
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, nn.Linear) and module.bias is not None:
            # 如果是线性层且有偏置，偏置置零
            module.bias.data.zero_()
# 定义文档字符串，描述从 PreTrainedModel 继承的通用方法和本模型作为 PyTorch Module 的使用说明
INSTRUCTBLIP_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`InstructBlipConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

# 定义输入说明文档字符串，描述 InstructBlipEncoder 类中 forward 方法的参数及其用途
INSTRUCTBLIP_VISION_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`InstructBlipProcessor`]. See
            [`InstructBlipProcessor.__call__`] for details.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""

# 空的输入文档字符串，暂无特定内容描述，可能是为了后续扩展或未完全实现的功能
INSTRUCTBLIP_INPUTS_DOCSTRING = r"""
"""


# 从 transformers.models.blip.modeling_blip.BlipEncoder 复制并重命名为 InstructBlipEncoder
class InstructBlipEncoder(nn.Module):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`InstructBlipEncoderLayer`].

    Args:
        config (`InstructBlipConfig`):
            The corresponding vision configuration for the `InstructBlipEncoder`.
    """

    def __init__(self, config: InstructBlipConfig):
        super().__init__()
        # 设置当前实例的配置参数
        self.config = config
        # 创建包含 config.num_hidden_layers 个 InstructBlipEncoderLayer 实例的模块列表
        self.layers = nn.ModuleList([InstructBlipEncoderLayer(config) for _ in range(config.num_hidden_layers)])
        # 梯度检查点标志，默认为 False
        self.gradient_checkpointing = False

    def forward(
        self,
        inputs_embeds,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        # forward 方法的具体参数说明详见 INSTRUCTBLIP_VISION_INPUTS_DOCSTRING
        r"""
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Embedded representation of the inputs. Should be float, not int tokens.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """
        # 根据传入的参数或者模型配置，确定是否输出注意力权重
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        # 根据传入的参数或者模型配置，确定是否输出各层隐藏状态
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        # 根据传入的参数或者模型配置，确定是否返回一个字典格式的输出
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # 如果不需要输出隐藏状态，则设置为空元组
        encoder_states = () if output_hidden_states else None
        # 如果不需要输出注意力权重，则设置为空元组
        all_attentions = () if output_attentions else None

        # 将输入的嵌入向量作为初始隐藏状态
        hidden_states = inputs_embeds
        # 遍历所有编码器层
        for idx, encoder_layer in enumerate(self.layers):
            # 如果需要输出隐藏状态，则将当前隐藏状态加入到encoder_states中
            if output_hidden_states:
                encoder_states = encoder_states + (hidden_states,)
            # 如果启用渐变检查点且处于训练状态，则使用渐变检查点函数计算编码器层的输出
            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    encoder_layer.__call__,
                    hidden_states,
                    attention_mask,
                    output_attentions,
                )
            else:
                # 否则直接调用编码器层计算输出
                layer_outputs = encoder_layer(
                    hidden_states,
                    attention_mask,
                    output_attentions=output_attentions,
                )

            # 更新隐藏状态为编码器层的输出的第一个元素
            hidden_states = layer_outputs[0]

            # 如果需要输出注意力权重，则将当前层的注意力权重加入到all_attentions中
            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        # 如果需要输出隐藏状态，则将最终的隐藏状态加入到encoder_states中
        if output_hidden_states:
            encoder_states = encoder_states + (hidden_states,)

        # 如果不返回字典格式的输出，则返回一个包含非空值的元组
        if not return_dict:
            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
        # 否则返回一个BaseModelOutput对象，包含最终的隐藏状态、所有隐藏状态和注意力权重
        return BaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
        )
# 从transformers.models.blip.modeling_blip.BlipVisionModel复制而来，将Blip->InstructBlip，BLIP->INSTRUCTBLIP
class InstructBlipVisionModel(InstructBlipPreTrainedModel):
    # 主要输入名称为"pixel_values"
    main_input_name = "pixel_values"
    # 使用的配置类为InstructBlipVisionConfig
    config_class = InstructBlipVisionConfig

    def __init__(self, config: InstructBlipVisionConfig):
        super().__init__(config)
        self.config = config
        embed_dim = config.hidden_size

        # 初始化嵌入层
        self.embeddings = InstructBlipVisionEmbeddings(config)
        # 初始化编码器
        self.encoder = InstructBlipEncoder(config)
        # 初始化后层归一化层
        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

        # 执行初始化后操作
        self.post_init()

    @add_start_docstrings_to_model_forward(INSTRUCTBLIP_VISION_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=InstructBlipVisionConfig)
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        r"""
        前向传播函数:
        
        Args:
            pixel_values (Optional[torch.FloatTensor], optional): 像素值. Defaults to None.
            output_attentions (Optional[bool], optional): 是否输出注意力权重. Defaults to None.
            output_hidden_states (Optional[bool], optional): 是否输出隐藏状态. Defaults to None.
            return_dict (Optional[bool], optional): 是否返回字典格式结果. Defaults to None.

        Returns:
            Union[Tuple, BaseModelOutputWithPooling]: 返回值包含最后隐藏状态、池化输出以及可能的其他结果.
        """
        # 如果未提供像素值，则抛出数值错误
        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        # 将像素值转换为嵌入向量
        hidden_states = self.embeddings(pixel_values)

        # 使用编码器处理嵌入向量
        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # 获取编码器输出的最后隐藏状态，并进行后层归一化
        last_hidden_state = encoder_outputs[0]
        last_hidden_state = self.post_layernorm(last_hidden_state)

        # 对最后隐藏状态进行池化，获取池化输出，并再次进行后层归一化
        pooled_output = last_hidden_state[:, 0, :]
        pooled_output = self.post_layernorm(pooled_output)

        # 如果不返回字典格式的结果，则返回元组格式的结果
        if not return_dict:
            return (last_hidden_state, pooled_output) + encoder_outputs[1:]

        # 返回带有池化输出和其他信息的 BaseModelOutputWithPooling 对象
        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )

    def get_input_embeddings(self):
        return self.embeddings


class InstructBlipQFormerMultiHeadAttention(nn.Module):
    # 初始化函数，用于创建一个新的注意力模型层
    def __init__(self, config, is_cross_attention=False):
        super().__init__()
        # 将配置信息保存到实例中
        self.config = config
        # 检查隐藏层大小是否能够被注意力头的数量整除，或者是否有嵌入大小属性
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            # 如果不能整除且没有嵌入大小属性，则抛出异常
            raise ValueError(
                "The hidden size (%d) is not a multiple of the number of attention heads (%d)"
                % (config.hidden_size, config.num_attention_heads)
            )

        # 设置注意力头的数量
        self.num_attention_heads = config.num_attention_heads
        # 计算每个注意力头的大小
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        # 计算所有头的总大小
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        # 创建查询线性层，用于生成查询矩阵
        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        
        # 根据是否是交叉注意力模型，选择创建键和值的线性层
        if is_cross_attention:
            # 如果是交叉注意力，使用编码器隐藏层大小创建键和值的线性层
            self.key = nn.Linear(config.encoder_hidden_size, self.all_head_size)
            self.value = nn.Linear(config.encoder_hidden_size, self.all_head_size)
        else:
            # 如果不是交叉注意力，使用隐藏层大小创建键和值的线性层
            self.key = nn.Linear(config.hidden_size, self.all_head_size)
            self.value = nn.Linear(config.hidden_size, self.all_head_size)

        # 创建一个dropout层，用于注意力概率的dropout
        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
        
        # 设置位置嵌入类型，默认为绝对位置嵌入
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
        
        # 如果位置嵌入类型为相对键或者相对键查询，则创建距离嵌入层
        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
            # 设置最大位置嵌入数
            self.max_position_embeddings = config.max_position_embeddings
            # 创建距离嵌入的Embedding层
            self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)
        
        # 初始化保存注意力权重标志为False
        self.save_attention = False
# Copied from transformers.models.bert.modeling_bert.BertSelfOutput with Bert->InstructBlipQFormer
class InstructBlipQFormerSelfOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        # 初始化线性层，用于变换隐藏状态的维度
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        # 初始化层归一化，对输出进行标准化
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        # 初始化 dropout，以防止过拟合
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        # 对隐藏状态应用线性层变换
        hidden_states = self.dense(hidden_states)
        # 应用 dropout
        hidden_states = self.dropout(hidden_states)
        # 将层归一化应用到变换后的隐藏状态和输入张量之和上
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states


# Copied from transformers.models.blip_2.modeling_blip_2.Blip2QFormerAttention with Blip2->InstructBlip
class InstructBlipQFormerAttention(nn.Module):
    def __init__(self, config, is_cross_attention=False):
        super().__init__()
        # 初始化自注意力机制和输出层
        self.attention = InstructBlipQFormerMultiHeadAttention(config, is_cross_attention)
        self.output = InstructBlipQFormerSelfOutput(config)
        # 初始化一个空集合，用于存储需要剪枝的注意力头
        self.pruned_heads = set()

    def prune_heads(self, heads):
        if len(heads) == 0:
            return
        # 找到需要剪枝的注意力头并返回索引
        heads, index = find_pruneable_heads_and_indices(
            heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads
        )

        # 剪枝线性层
        self.attention.query = prune_linear_layer(self.attention.query, index)
        self.attention.key = prune_linear_layer(self.attention.key, index)
        self.attention.value = prune_linear_layer(self.attention.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # 更新超参数并存储已剪枝的注意力头
        self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads)
        self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        # 执行自注意力机制的前向传播
        self_outputs = self.attention(
            hidden_states,
            attention_mask,
            head_mask,
            encoder_hidden_states,
            encoder_attention_mask,
            past_key_value,
            output_attentions,
        )
        # 将自注意力输出应用到自输出层，并与原始隐藏状态相加
        attention_output = self.output(self_outputs[0], hidden_states)
        # 如果需要输出注意力权重，则将它们添加到输出中
        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
        return outputs
# 从 transformers.models.bert.modeling_bert.BertIntermediate 复制并修改为 InstructBlipQFormerIntermediate 类
class InstructBlipQFormerIntermediate(nn.Module):
    def __init__(self, config):
        super().__init__()
        # 创建一个全连接层，输入大小为 config.hidden_size，输出大小为 config.intermediate_size
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        # 根据配置选择激活函数，将字符串形式的激活函数映射到对应的函数，存储在 self.intermediate_act_fn 中
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # 使用全连接层进行前向传播，输入为 hidden_states，输出为经过全连接层处理后的 hidden_states
        hidden_states = self.dense(hidden_states)
        # 对经过全连接层处理后的 hidden_states 应用预先选择的激活函数
        hidden_states = self.intermediate_act_fn(hidden_states)
        return hidden_states


# 从 transformers.models.bert.modeling_bert.BertOutput 复制并修改为 InstructBlipQFormerOutput 类
class InstructBlipQFormerOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        # 创建一个全连接层，输入大小为 config.intermediate_size，输出大小为 config.hidden_size
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        # 创建 LayerNorm 层，对输入大小为 config.hidden_size 的数据进行归一化
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        # 创建 Dropout 层，以指定的概率 config.hidden_dropout_prob 随机丢弃输入
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        # 使用全连接层处理 hidden_states，将其转换为大小为 config.hidden_size 的张量
        hidden_states = self.dense(hidden_states)
        # 对全连接层输出应用 Dropout 操作，以减少过拟合风险
        hidden_states = self.dropout(hidden_states)
        # 将经过 Dropout 处理后的 hidden_states 与输入张量 input_tensor 相加，并对结果进行 LayerNorm 处理
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states


# InstructBlipQFormerLayer 类定义
class InstructBlipQFormerLayer(nn.Module):
    def __init__(self, config, layer_idx):
        super().__init__()
        # 设置前向传播中的分块大小和序列长度维度
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        # 创建注意力层对象 InstructBlipQFormerAttention
        self.attention = InstructBlipQFormerAttention(config)
        
        # 记录当前层的索引
        self.layer_idx = layer_idx

        # 如果当前层索引能够整除 config.cross_attention_frequency，说明需要进行跨层注意力操作
        if layer_idx % config.cross_attention_frequency == 0:
            # 创建跨层注意力对象 InstructBlipQFormerAttention
            self.crossattention = InstructBlipQFormerAttention(config, is_cross_attention=True)
            self.has_cross_attention = True  # 标记该层有跨层注意力
        else:
            self.has_cross_attention = False  # 标记该层没有跨层注意力

        # 创建中间层对象 InstructBlipQFormerIntermediate 和输出层对象 InstructBlipQFormerOutput
        self.intermediate = InstructBlipQFormerIntermediate(config)
        self.output = InstructBlipQFormerOutput(config)

        # 创建用于查询的中间层对象和输出层对象
        self.intermediate_query = InstructBlipQFormerIntermediate(config)
        self.output_query = InstructBlipQFormerOutput(config)

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_value=None,
        output_attentions=False,
        query_length=0,
        ):
        # 省略了 forward 方法的其余部分，需要根据具体情况补充完整
        pass  # 这里只是为了注释需要包含完整的代码块
        # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
        # 解码器单向自注意力的缓存键/值元组位于位置 1,2
        self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
        # 使用保存的过去键/值元组（如果存在）进行自注意力计算
        self_attention_outputs = self.attention(
            hidden_states,
            attention_mask,
            head_mask,
            output_attentions=output_attentions,
            past_key_value=self_attn_past_key_value,
        )
        # 获取自注意力计算的输出
        attention_output = self_attention_outputs[0]
        # 获取除了第一个和最后一个元素之外的所有输出（主要用于输出注意力权重）
        outputs = self_attention_outputs[1:-1]

        # 获取当前的键/值元组，以备将来的计算使用
        present_key_value = self_attention_outputs[-1]

        # 如果查询长度大于 0，则截取注意力输出的一部分
        if query_length > 0:
            query_attention_output = attention_output[:, :query_length, :]

            # 如果模型包含交叉注意力，执行交叉注意力的计算
            if self.has_cross_attention:
                # 如果缺少编码器的隐藏状态，则引发错误
                if encoder_hidden_states is None:
                    raise ValueError("encoder_hidden_states must be given for cross-attention layers")
                # 执行交叉注意力计算
                cross_attention_outputs = self.crossattention(
                    query_attention_output,
                    attention_mask,
                    head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    output_attentions=output_attentions,
                )
                # 获取交叉注意力计算的输出
                query_attention_output = cross_attention_outputs[0]
                # 如果需要输出注意力权重，则将其添加到已有的输出中
                outputs = outputs + cross_attention_outputs[1:-1]

            # 将查询注意力输出传递给前馈网络的函数，可能会对其进行分块处理
            layer_output = apply_chunking_to_forward(
                self.feed_forward_chunk_query,
                self.chunk_size_feed_forward,
                self.seq_len_dim,
                query_attention_output,
            )

            # 如果注意力输出的形状大于查询长度，则对其余部分进行前馈网络计算
            if attention_output.shape[1] > query_length:
                layer_output_text = apply_chunking_to_forward(
                    self.feed_forward_chunk,
                    self.chunk_size_feed_forward,
                    self.seq_len_dim,
                    attention_output[:, query_length:, :],
                )
                # 将计算得到的结果拼接到之前的输出结果中
                layer_output = torch.cat([layer_output, layer_output_text], dim=1)
        else:
            # 对注意力输出进行前馈网络计算，可能会对其进行分块处理
            layer_output = apply_chunking_to_forward(
                self.feed_forward_chunk,
                self.chunk_size_feed_forward,
                self.seq_len_dim,
                attention_output,
            )
        # 将最终的层输出添加到总体的输出中
        outputs = (layer_output,) + outputs

        # 将当前的键/值元组添加到总体的输出中
        outputs = outputs + (present_key_value,)

        # 返回最终的输出结果
        return outputs
# 从 transformers.models.blip_2.modeling_blip_2.Blip2QFormerEncoder 复制并修改为 InstructBlipQFormerEncoder
class InstructBlipQFormerEncoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        # 创建一个 nn.ModuleList，包含 config.num_hidden_layers 个 InstructBlipQFormerLayer 对象
        self.layer = nn.ModuleList(
            [InstructBlipQFormerLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_values=None,
        use_cache=None,
        output_attentions=False,
        output_hidden_states=False,
        return_dict=True,
        query_length=0,
        ):
            # 如果输出隐藏状态为真，则初始化一个空元组
            all_hidden_states = () if output_hidden_states else None
            # 如果输出注意力权重为真，则初始化一个空元组
            all_self_attentions = () if output_attentions else None
            # 如果输出交叉注意力权重为真，则初始化一个空元组
            all_cross_attentions = () if output_attentions else None

            # 如果使用缓存，则初始化一个空元组
            next_decoder_cache = () if use_cache else None

            # 遍历所有 Transformer 层
            for i in range(self.config.num_hidden_layers):
                # 获取当前层的模块
                layer_module = self.layer[i]
                # 如果输出隐藏状态为真，则将当前隐藏状态添加到所有隐藏状态元组中
                if output_hidden_states:
                    all_hidden_states = all_hidden_states + (hidden_states,)

                # 获取当前层的注意力头掩码
                layer_head_mask = head_mask[i] if head_mask is not None else None
                # 获取当前层的过去键值对，用于跨层信息传递
                past_key_value = past_key_values[i] if past_key_values is not None else None

                # 如果配置启用梯度检查点且当前处于训练状态
                if getattr(self.config, "gradient_checkpointing", False) and self.training:
                    # 如果使用缓存为真，警告并设置为假，因为与梯度检查点不兼容
                    if use_cache:
                        logger.warning(
                            "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                        )
                        use_cache = False
                    # 使用梯度检查点函数计算当前层的输出
                    layer_outputs = self._gradient_checkpointing_func(
                        layer_module.__call__,
                        hidden_states,
                        attention_mask,
                        layer_head_mask,
                        encoder_hidden_states,
                        encoder_attention_mask,
                    )
                else:
                    # 否则直接调用当前层模块计算当前层的输出
                    layer_outputs = layer_module(
                        hidden_states,
                        attention_mask,
                        layer_head_mask,
                        encoder_hidden_states,
                        encoder_attention_mask,
                        past_key_value,
                        output_attentions,
                        query_length,
                    )

                # 更新当前隐藏状态为当前层的输出的第一个元素
                hidden_states = layer_outputs[0]
                # 如果使用缓存为真，则将当前层的输出的最后一个元素添加到下一个解码器缓存中
                if use_cache:
                    next_decoder_cache += (layer_outputs[-1],)
                # 如果输出注意力权重为真，则将当前层的注意力权重添加到所有自注意力权重元组中
                if output_attentions:
                    all_self_attentions = all_self_attentions + (layer_outputs[1],)
                    # 如果当前层有交叉注意力，将当前层的交叉注意力权重添加到所有交叉注意力权重元组中
                    if layer_module.has_cross_attention:
                        all_cross_attentions = all_cross_attentions + (layer_outputs[2],)

            # 如果输出隐藏状态为真，则将最终隐藏状态添加到所有隐藏状态元组中
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            # 如果不返回字典，则返回包含非空元素的元组
            if not return_dict:
                return tuple(
                    v
                    for v in [
                        hidden_states,
                        next_decoder_cache,
                        all_hidden_states,
                        all_self_attentions,
                        all_cross_attentions,
                    ]
                    if v is not None
                )
            # 否则返回带有过去和交叉注意力权重的基础模型输出对象
            return BaseModelOutputWithPastAndCrossAttentions(
                last_hidden_state=hidden_states,
                past_key_values=next_decoder_cache,
                hidden_states=all_hidden_states,
                attentions=all_self_attentions,
                cross_attentions=all_cross_attentions,
            )
# 定义一个模块类 InstructBlipQFormerEmbeddings，用于构建来自单词和位置嵌入的嵌入向量。
class InstructBlipQFormerEmbeddings(nn.Module):
    """Construct the embeddings from word and position embeddings."""

    # 初始化函数，接受一个配置对象 config 作为参数
    def __init__(self, config):
        super().__init__()
        # 创建一个单词嵌入层，vocab_size 是词汇表大小，hidden_size 是隐藏层大小，padding_idx 是填充标记的索引
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        # 创建一个位置嵌入层，max_position_embeddings 是最大位置嵌入数，hidden_size 是隐藏层大小
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)

        # LayerNorm 层，用于归一化隐藏层的输出
        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        # Dropout 层，用于在训练过程中随机失活隐藏层的输出，以防止过拟合
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        # 注册一个缓冲区 position_ids，这个是位置标识符，是一个长度为 max_position_embeddings 的张量
        # 在序列化时它是内存连续的，并且当持久性设置为 False 时不会被导出
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )
        # 设置位置嵌入的类型，默认是绝对位置编码
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")

        self.config = config

    # 前向传播函数，接收输入参数并计算输出
    def forward(
        self,
        input_ids=None,
        position_ids=None,
        query_embeds=None,
        past_key_values_length=0,
    ):
        # 如果输入 input_ids 不为空，则获取序列长度 seq_length
        if input_ids is not None:
            seq_length = input_ids.size()[1]
        else:
            seq_length = 0

        # 如果位置标识符 position_ids 为空，则从预定义的 position_ids 中复制相应位置的标识符
        if position_ids is None:
            position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length].clone()

        # 如果输入 input_ids 不为空，则计算单词嵌入
        if input_ids is not None:
            embeddings = self.word_embeddings(input_ids)
            # 如果位置嵌入类型为绝对位置编码，则计算位置嵌入并将其加到单词嵌入中
            if self.position_embedding_type == "absolute":
                position_embeddings = self.position_embeddings(position_ids.to(embeddings.device))
                embeddings = embeddings + position_embeddings

            # 如果存在查询嵌入 query_embeds，则将其与计算得到的嵌入拼接起来
            if query_embeds is not None:
                embeddings = torch.cat((query_embeds, embeddings), dim=1)
        else:
            # 如果没有输入 input_ids，则使用查询嵌入作为嵌入向量
            embeddings = query_embeds

        # 将嵌入向量转换为与 layernorm 权重相同的数据类型
        embeddings = embeddings.to(self.layernorm.weight.dtype)
        # 应用 layernorm 对嵌入向量进行归一化
        embeddings = self.layernorm(embeddings)
        # 对归一化后的向量应用 dropout 操作
        embeddings = self.dropout(embeddings)
        # 返回最终的嵌入向量作为输出
        return embeddings


# 定义一个模型类 InstructBlipQFormerModel，继承自 InstructBlipPreTrainedModel 类
class InstructBlipQFormerModel(InstructBlipPreTrainedModel):
    """
    Querying Transformer (Q-Former), used in InstructBLIP. Slightly modified from BLIP-2 as it also takes the
    instruction as input.
    """

    # 初始化函数，接受一个配置对象 config 作为参数
    def __init__(self, config: InstructBlipQFormerConfig):
        super().__init__(config)
        self.config = config

        # 创建嵌入层对象
        self.embeddings = InstructBlipQFormerEmbeddings(config)

        # 创建编码器对象
        self.encoder = InstructBlipQFormerEncoder(config)

        # 调用初始化后的处理方法
        self.post_init()

    # 获取输入嵌入层的函数，返回嵌入层的单词嵌入部分
    def get_input_embeddings(self):
        return self.embeddings.word_embeddings

    # 设置输入嵌入层的函数，设置嵌入层的单词嵌入部分
    def set_input_embeddings(self, value):
        self.embeddings.word_embeddings = value

    # 剪枝模型头部的方法，接受一个 heads_to_prune 字典作为参数，用于指定要剪枝的头部
    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        # 遍历 heads_to_prune 字典的每一项，获取层号和要剪枝的头部列表
        for layer, heads in heads_to_prune.items():
            # 对指定层的注意力头部进行剪枝操作
            self.encoder.layer[layer].attention.prune_heads(heads)
    def get_extended_attention_mask(
        self,
        attention_mask: torch.Tensor,
        input_shape: Tuple[int],
        device: torch.device,
        has_query: bool = False,
    ) -> torch.Tensor:
        """
        Makes broadcastable attention and causal masks so that future and masked tokens are ignored.

        Arguments:
            attention_mask (`torch.Tensor`):
                Mask with ones indicating tokens to attend to, zeros for tokens to ignore.
            input_shape (`Tuple[int]`):
                The shape of the input to the model.
            device: (`torch.device`):
                The device of the input to the model.

        Returns:
            `torch.Tensor` The extended attention mask, with the same dtype as `attention_mask.dtype`.
        """
        # 如果 attention_mask 的维度是 [batch_size, from_seq_length, to_seq_length]
        # 则将其扩展为 [batch_size, 1, from_seq_length, to_seq_length]，以便广播到所有的注意力头上
        if attention_mask.dim() == 3:
            extended_attention_mask = attention_mask[:, None, :, :]
        elif attention_mask.dim() == 2:
            # 如果提供的是维度为 [batch_size, seq_length] 的填充遮罩
            # 模型是编码器，因此将遮罩扩展为 [batch_size, 1, 1, seq_length]，使其可广播到 [batch_size, num_heads, seq_length, seq_length]
            extended_attention_mask = attention_mask[:, None, None, :]
        else:
            raise ValueError(
                f"Wrong shape for input_ids (shape {input_shape}) or attention_mask (shape {attention_mask.shape})",
            )

        # 由于 attention_mask 中 1.0 表示我们要关注的位置，0.0 表示被屏蔽的位置，
        # 这个操作将创建一个张量，对于我们要关注的位置是 0.0，对于被屏蔽的位置是 -10000.0
        # 在 softmax 之前将其添加到原始分数中，实际上等同于完全删除这些位置的影响
        extended_attention_mask = extended_attention_mask.to(dtype=self.dtype)  # 用于 fp16 兼容性
        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
        return extended_attention_mask
"""
为生成文本的 InstructBLIP 模型提供说明，该模型基于图像和可选文本提示生成文本。模型包括视觉编码器、查询变压器（Q-Former）和语言模型。

可以选择向模型传递 `input_ids`，作为文本提示，以便让语言模型继续提示。否则，语言模型将从 [BOS]（序列开始）标记开始生成文本。
"""
@add_start_docstrings(
    """
    InstructBLIP Model for generating text given an image and an optional text prompt. The model consists of a vision
    encoder, Querying Transformer (Q-Former) and a language model.

    One can optionally pass `input_ids` to the model, which serve as a text prompt, to make the language model continue
    the prompt. Otherwise, the language model starts generating text from the [BOS] (beginning-of-sequence) token.
    """,
    INSTRUCTBLIP_START_DOCSTRING,
)
class InstructBlipForConditionalGeneration(InstructBlipPreTrainedModel):
    config_class = InstructBlipConfig
    main_input_name = "pixel_values"

    def __init__(self, config: InstructBlipConfig):
        super().__init__(config)

        # 初始化视觉模型
        self.vision_model = InstructBlipVisionModel(config.vision_config)

        # 初始化查询令牌
        self.query_tokens = nn.Parameter(torch.zeros(1, config.num_query_tokens, config.qformer_config.hidden_size))
        # 初始化查询变压器模型
        self.qformer = InstructBlipQFormerModel(config.qformer_config)

        # 配置语言投影层，将查询变压器的隐藏状态映射到文本配置的隐藏大小
        self.language_projection = nn.Linear(config.qformer_config.hidden_size, config.text_config.hidden_size)

        # 根据配置选择语言模型：CausalLM 或 Seq2SeqLM
        if config.use_decoder_only_language_model:
            language_model = AutoModelForCausalLM.from_config(config.text_config)
        else:
            language_model = AutoModelForSeq2SeqLM.from_config(config.text_config)

        # 如果语言模型有不需拆分的模块，则扩展这些模块
        if language_model._no_split_modules is not None:
            self._no_split_modules.extend(language_model._no_split_modules)

        # 如果语言模型有需保持在 FP32 的模块，则扩展这些模块
        if language_model._keep_in_fp32_modules is not None:
            self._keep_in_fp32_modules.extend(language_model._keep_in_fp32_modules)

        # 设置实例的语言模型
        self.language_model = language_model

        # 初始化权重并应用最终处理
        self.post_init()

    def get_input_embeddings(self):
        return self.language_model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.language_model.set_input_embeddings(value)

    def set_output_embeddings(self, new_embeddings):
        self.language_model.set_output_embeddings(new_embeddings)

    def get_output_embeddings(self) -> nn.Module:
        return self.language_model.get_output_embeddings()

    def get_encoder(self):
        return self.language_model.get_encoder()

    def get_decoder(self):
        return self.language_model.get_decoder()

    def _tie_weights(self):
        # 如果不是仅使用解码器语言模型，则绑定权重
        if not self.config.use_decoder_only_language_model:
            self.language_model.encoder.embed_tokens = self.language_model.shared
            self.language_model.decoder.embed_tokens = self.language_model.shared
    def _preprocess_accelerate(self):
        r"""
        Some pre-processing hacks to make the model `accelerate` compatible. Check
        https://github.com/huggingface/transformers/pull/21707 for more details.
        """
        # 将 self.hf_device_map 赋值给 hf_device_map 变量
        hf_device_map = self.hf_device_map

        # 如果 hf_device_map 的长度大于 1，且不包含 "language_model" 键，并且 CUDA 设备数量大于 1
        if len(hf_device_map) > 1 and "language_model" not in hf_device_map and torch.cuda.device_count() > 1:
            # 发出警告，提示用户在使用多 GPU + InstructBLIP + `accelerate` 时可能会遇到意外行为
            logger.warning(
                "The `language_model` is not in the `hf_device_map` dictionary and you are running your script"
                " in a multi-GPU environment. this may lead to unexpected behavior when using `accelerate`."
                " Please pass a `device_map` that contains `language_model` to remove this warning."
                " Please refer to https://github.com/huggingface/blog/blob/main/accelerate-large-models.md for"
                " more details on creating a `device_map` for large models.",
            )

        # 如果 self.language_model 具有属性 "_hf_hook"
        if hasattr(self.language_model, "_hf_hook"):
            # 设置 self.language_model._hf_hook.io_same_device 为 True，以便与 `generate` 兼容
            self.language_model._hf_hook.io_same_device = True  # For `generate` compatibility

    @add_start_docstrings_to_model_forward(INSTRUCTBLIP_INPUTS_DOCSTRING)
    @replace_return_docstrings(
        output_type=InstructBlipForConditionalGenerationModelOutput, config_class=InstructBlipVisionConfig
    )
    # 定义 forward 方法，接受多个参数并返回指定类型的输出
    def forward(
        self,
        pixel_values: torch.FloatTensor,
        qformer_input_ids: torch.FloatTensor,
        qformer_attention_mask: Optional[torch.LongTensor] = None,
        input_ids: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        labels: Optional[torch.LongTensor] = None,
        return_dict: Optional[bool] = None,
    ):
        # 方法级装饰器，标记 forward 方法为添加了特定文档字符串的模型前向方法

    @torch.no_grad()
    # 方法级装饰器，标记 generate 方法为不需要梯度的方法
    def generate(
        self,
        pixel_values: torch.FloatTensor,
        qformer_input_ids: Optional[torch.LongTensor] = None,
        qformer_attention_mask: Optional[torch.LongTensor] = None,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        **generate_kwargs,
    ):
        # 定义 generate 方法，接受多个参数，用于生成模型的输出

Transformers-源码解析-五十九-

Transformers 源码解析（五十九）

.\models\idefics\perceiver.py

.\models\idefics\processing_idefics.py

.\models\idefics\vision.py

.\models\idefics\__init__.py

.\models\imagegpt\configuration_imagegpt.py

.\models\imagegpt\convert_imagegpt_original_tf2_to_pytorch.py

.\models\imagegpt\feature_extraction_imagegpt.py

.\models\imagegpt\image_processing_imagegpt.py

.\models\imagegpt\modeling_imagegpt.py

.\models\imagegpt\__init__.py

.\models\informer\configuration_informer.py

.\models\informer\modeling_informer.py

.\models\informer\__init__.py

.\models\instructblip\configuration_instructblip.py

.\models\instructblip\convert_instructblip_original_to_pytorch.py

.\models\instructblip\modeling_instructblip.py

`.\models\idefics\perceiver.py`

`.\models\idefics\processing_idefics.py`

`.\models\idefics\vision.py`

`.\models\idefics\init.py`

`.\models\imagegpt\configuration_imagegpt.py`

`.\models\imagegpt\convert_imagegpt_original_tf2_to_pytorch.py`

`.\models\imagegpt\feature_extraction_imagegpt.py`

`.\models\imagegpt\image_processing_imagegpt.py`

`.\models\imagegpt\modeling_imagegpt.py`

`.\models\imagegpt\init.py`

`.\models\informer\configuration_informer.py`

`.\models\informer\modeling_informer.py`

`.\models\informer\init.py`

`.\models\instructblip\configuration_instructblip.py`

`.\models\instructblip\convert_instructblip_original_to_pytorch.py`

`.\models\instructblip\modeling_instructblip.py`