Transformers Source Code Walkthrough (21)
.\models\blip\modeling_tf_blip_text.py
from __future__ import annotations
import math
from typing import Optional, Tuple
import tensorflow as tf
from ...modeling_tf_outputs import (
TFBaseModelOutputWithPastAndCrossAttentions,
TFBaseModelOutputWithPoolingAndCrossAttentions,
TFCausalLMOutputWithCrossAttentions,
)
from ...modeling_tf_utils import (
TFModelInputType,
TFPreTrainedModel,
get_initializer,
get_tf_activation,
keras,
keras_serializable,
shape_list,
unpack_inputs,
)
from ...tf_utils import check_embeddings_within_bounds, invert_attention_mask, stable_softmax
from ...utils import add_start_docstrings_to_model_forward, logging
from .configuration_blip import BlipTextConfig
logger = logging.get_logger(__name__)
BLIP_TEXT_INPUTS_DOCSTRING = r"""
Args:
input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`):
# 输入序列的标记索引在词汇表中的位置。默认情况下,将忽略填充部分。
# 可以使用 [`AutoProcessor`] 获得这些索引。有关详情,请参见 [`BlipProcessor.__call__`]。
# [什么是输入 ID?](../glossary#input-ids)
attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
# 遮罩,用于在填充的标记索引上避免执行注意力操作。遮罩的取值范围为 `[0, 1]`:
# - 1 表示**未被遮罩**的标记,
# - 0 表示**被遮罩**的标记。
# [什么是注意力遮罩?](../glossary#attention-mask)
position_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
# 每个输入序列标记在位置嵌入中的位置索引。取值范围为 `[0, config.max_position_embeddings - 1]`。
# [什么是位置 ID?](../glossary#position-ids)
output_attentions (`bool`, *optional*):
# 是否返回所有注意力层的注意力张量。有关更多细节,请参见返回的张量中的 `attentions`。
output_hidden_states (`bool`, *optional*):
# 是否返回所有层的隐藏状态。有关更多细节,请参见返回的张量中的 `hidden_states`。
return_dict (`bool`, *optional*):
# 是否返回 [`~utils.ModelOutput`] 而不是普通的元组。
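# The tensors documented above can be produced with `BlipProcessor` (covered later in processing_blip.py).
# A hedged illustration; the checkpoint name is an assumption, substitute whichever BLIP checkpoint you use:
from transformers import BlipProcessor

processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
inputs = processor(text="a photo of a cat", return_tensors="tf")
print(inputs["input_ids"].shape, inputs["attention_mask"].shape)  # both of shape (1, sequence_length)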
# Code adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L52; the class below defines TFBlipTextEmbeddings
class TFBlipTextEmbeddings(keras.layers.Layer):
"""Construct the embeddings from word and position embeddings."""
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
# Word embedding layer, sized by the vocabulary size and hidden size from the config
self.word_embeddings = keras.layers.Embedding(
config.vocab_size,
config.hidden_size,
embeddings_initializer=get_initializer(config.initializer_range),
name="word_embeddings",
)
# Position embedding layer, sized by the maximum number of positions and hidden size from the config
self.position_embeddings = keras.layers.Embedding(
config.max_position_embeddings,
config.hidden_size,
embeddings_initializer=get_initializer(config.initializer_range),
name="position_embeddings",
)
# self.LayerNorm is not snake-cased to stick with the PyTorch model's variable names,
# so that any TensorFlow checkpoint file can be loaded
self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
# Dropout layer using the hidden dropout probability from the config
self.dropout = keras.layers.Dropout(config.hidden_dropout_prob, name="dropout")
# Position-id tensor used for absolute position embeddings
self.position_ids = tf.expand_dims(tf.range(config.max_position_embeddings), 0)
# Position embedding type from the config, "absolute" by default
self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
# Keep a reference to the config
self.config = config
def call(self, input_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0, training=None):
# If input_ids is provided, take its shape
if input_ids is not None:
input_shape = tf.shape(input_ids)
else:
# Otherwise take the shape of inputs_embeds without the last (hidden) dimension
input_shape = tf.shape(inputs_embeds)[:-1]
# Sequence length of the current chunk
seq_length = input_shape[1]
# If no position_ids are given, slice them from past_key_values_length to past_key_values_length + seq_length
if position_ids is None:
position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]
# If no inputs_embeds are given, check that the ids are within the vocabulary and look up the word embeddings
if inputs_embeds is None:
check_embeddings_within_bounds(input_ids, self.config.vocab_size)
inputs_embeds = self.word_embeddings(input_ids)
# Start from the token embeddings
embeddings = inputs_embeds
# For absolute position embeddings, add the position embeddings to the word embeddings
if self.position_embedding_type == "absolute":
position_embeddings = self.position_embeddings(position_ids)
embeddings += position_embeddings
# Apply LayerNorm
embeddings = self.LayerNorm(embeddings)
# Apply dropout (only active during training)
embeddings = self.dropout(embeddings, training=training)
# Return the final embeddings
return embeddings
def build(self, input_shape=None):
# If the layer has already been built, return early instead of building it again
if self.built:
return
# Mark the layer as built
self.built = True
# Build the word embedding layer, if present, inside its own name scope
if getattr(self, "word_embeddings", None) is not None:
with tf.name_scope(self.word_embeddings.name):
self.word_embeddings.build(None)
# Build the position embedding layer, if present, inside its own name scope
if getattr(self, "position_embeddings", None) is not None:
with tf.name_scope(self.position_embeddings.name):
self.position_embeddings.build(None)
# Build the LayerNorm layer, if present, inside its own name scope
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
# LayerNorm expects an input shape of [None, None, hidden_size]
self.LayerNorm.build([None, None, self.config.hidden_size])
# Build the dropout layer, if present, inside its own name scope
if getattr(self, "dropout", None) is not None:
with tf.name_scope(self.dropout.name):
self.dropout.build(None)
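# A standalone sketch (illustration only, not part of the model) of the position-id slicing used in `call`
# above: with max_position_embeddings=8, a cache of length 3 and a new chunk of 2 tokens, the layer looks
# up positions 3 and 4.
import tensorflow as tf

position_ids = tf.expand_dims(tf.range(8), 0)  # shape (1, 8), like self.position_ids
past_key_values_length, seq_length = 3, 2
print(position_ids[:, past_key_values_length : seq_length + past_key_values_length].numpy())  # [[3 4]]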
# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L97
class TFBlipTextSelfAttention(keras.layers.Layer):
def __init__(self, config, is_cross_attention, **kwargs):
super().__init__(**kwargs)
self.config = config
# The hidden size must be divisible by the number of attention heads (unless an `embedding_size` attribute exists)
if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
raise ValueError(
"The hidden size (%d) is not a multiple of the number of attention heads (%d)"
% (config.hidden_size, config.num_attention_heads)
)
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
# Dense layers producing the query, key and value projections, initialized with the configured range
self.query = keras.layers.Dense(
self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query"
)
self.key = keras.layers.Dense(
self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key"
)
self.value = keras.layers.Dense(
self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value"
)
# Dropout applied to the attention probabilities
self.dropout = keras.layers.Dropout(config.attention_probs_dropout_prob)
# Position embedding type from the config, "absolute" by default
self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
# For relative position embeddings, create a distance embedding table
if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
self.max_position_embeddings = config.max_position_embeddings
self.distance_embedding = keras.layers.Embedding(
2 * config.max_position_embeddings - 1, self.attention_head_size
)
self.is_cross_attention = is_cross_attention
# Reshape a projected tensor so that attention scores can be computed per head
def transpose_for_scores(self, x):
new_x_shape = tf.concat(
[tf.shape(x)[:-1], tf.constant([self.num_attention_heads, self.attention_head_size], dtype=tf.int32)],
axis=0,
)
x = tf.reshape(x, new_x_shape)
return tf.transpose(x, perm=(0, 2, 1, 3))
# Forward pass of the self-attention layer; only the signature is reproduced here, the attention
# computation itself is omitted in this walkthrough.
def call(
self,
hidden_states,
attention_mask=None,
head_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
past_key_value=None,
output_attentions=False,
training=None,
):
def build(self, input_shape=None):
# If the layer has already been built, return early instead of building it again
if self.built:
return
# Mark the layer as built
self.built = True
# Build the query projection, if present
if getattr(self, "query", None) is not None:
with tf.name_scope(self.query.name):
self.query.build([None, None, self.config.hidden_size])
# For cross-attention, the key and value projections read from the encoder hidden states
if self.is_cross_attention:
# Build the key projection, if present
if getattr(self, "key", None) is not None:
with tf.name_scope(self.key.name):
self.key.build([None, None, self.config.encoder_hidden_size])
# Build the value projection, if present
if getattr(self, "value", None) is not None:
with tf.name_scope(self.value.name):
self.value.build([None, None, self.config.encoder_hidden_size])
else:
# For self-attention, key and value read from the layer's own hidden states
if getattr(self, "key", None) is not None:
with tf.name_scope(self.key.name):
self.key.build([None, None, self.config.hidden_size])
# Build the value projection, if present
if getattr(self, "value", None) is not None:
with tf.name_scope(self.value.name):
self.value.build([None, None, self.config.hidden_size])
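# A quick shape check (illustrative only) of `transpose_for_scores` above: it turns a tensor of shape
# (batch, seq_len, all_head_size) into (batch, num_attention_heads, seq_len, attention_head_size).
import tensorflow as tf

batch, seq_len, num_heads, head_size = 2, 5, 4, 8
x = tf.random.normal((batch, seq_len, num_heads * head_size))
x = tf.transpose(tf.reshape(x, (batch, seq_len, num_heads, head_size)), perm=(0, 2, 1, 3))
print(x.shape)  # (2, 4, 5, 8)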
# TFBlipTextSelfOutput: output projection of the self-attention block
class TFBlipTextSelfOutput(keras.layers.Layer):
# Constructor taking a BlipTextConfig plus the usual keyword arguments
def __init__(self, config: BlipTextConfig, **kwargs):
super().__init__(**kwargs)
# Dense layer projecting back to hidden_size, initialized with the configured range
self.dense = keras.layers.Dense(
units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
# LayerNormalization with the configured epsilon
self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
# Dropout with the configured hidden dropout probability
self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
# Keep a reference to the config
self.config = config
# Forward pass of the layer
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: Optional[bool] = None) -> tf.Tensor:
# Project the hidden states
hidden_states = self.dense(inputs=hidden_states)
# Apply dropout (only active during training)
hidden_states = self.dropout(inputs=hidden_states, training=training)
# Residual connection followed by LayerNorm
hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor)
# Return the processed hidden states
return hidden_states
# build creates the layer's weights the first time it is called
def build(self, input_shape=None):
# Return early if already built
if self.built:
return
# Mark the layer as built
self.built = True
# Build the dense layer, if present, inside its own name scope
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
# Build the LayerNorm layer, if present, inside its own name scope
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#242
# TFBlipTextAttention combines self-attention and its output projection
class TFBlipTextAttention(keras.layers.Layer):
# Constructor taking the config and an is_cross_attention flag
def __init__(self, config, is_cross_attention=False, **kwargs):
super().__init__(**kwargs)
# TFBlipTextSelfAttention instance; is_cross_attention switches between self- and cross-attention
self.self = TFBlipTextSelfAttention(config, is_cross_attention, name="self")
# TFBlipTextSelfOutput instance that post-processes the attention output
self.self_output = TFBlipTextSelfOutput(config, name="output")
# Forward pass of the layer
def call(
self,
hidden_states: tf.Tensor,
attention_mask: tf.Tensor | None = None,
head_mask: tf.Tensor | None = None,
encoder_hidden_states: tf.Tensor | None = None,
encoder_attention_mask: tf.Tensor | None = None,
past_key_value: Tuple[Tuple[tf.Tensor]] | None = None,
output_attentions: Optional[bool] = False,
training: Optional[bool] = None,
):
# Run self-attention and collect its outputs
self_outputs = self.self(
hidden_states,
attention_mask,
head_mask,
encoder_hidden_states,
encoder_attention_mask,
past_key_value,
output_attentions,
training=training,
)
# Feed the attention output together with the original hidden_states (residual) into self_output
attention_output = self.self_output(self_outputs[0], hidden_states, training=training)
# Build the output tuple; attention weights are appended if they were requested
outputs = (attention_output,) + self_outputs[1:]
# Return the outputs
return outputs
def build(self, input_shape=None):
# Return early if already built
if self.built:
return
# Mark the layer as built
self.built = True
# Build the self-attention sub-layer, if present, inside its own name scope
if getattr(self, "self", None) is not None:
with tf.name_scope(self.self.name):
self.self.build(None)
# Build the output sub-layer, if present, inside its own name scope
if getattr(self, "self_output", None) is not None:
with tf.name_scope(self.self_output.name):
self.self_output.build(None)
# Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->BlipText
class TFBlipTextIntermediate(keras.layers.Layer):
def __init__(self, config: BlipTextConfig, **kwargs):
super().__init__(**kwargs)
# Dense layer projecting the hidden states to the intermediate size
self.dense = keras.layers.Dense(
units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
# Resolve the activation function; strings are mapped to the corresponding TensorFlow activation
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = get_tf_activation(config.hidden_act)
else:
self.intermediate_act_fn = config.hidden_act
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
# Project the hidden states with the dense layer
hidden_states = self.dense(inputs=hidden_states)
# Apply the intermediate activation function
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
# Build the dense layer, if present
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
class TFBlipTextOutput(keras.layers.Layer):
def __init__(self, config: BlipTextConfig, **kwargs):
super().__init__(**kwargs)
# Dense layer projecting the intermediate states back to the hidden size
self.dense = keras.layers.Dense(
units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
# LayerNormalization applied after the residual connection
self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
# Dropout used for regularization
self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
# Project the hidden states with the dense layer
hidden_states = self.dense(inputs=hidden_states)
# Apply dropout (only active during training)
hidden_states = self.dropout(inputs=hidden_states, training=training)
# Add the residual input_tensor and apply LayerNorm
hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
# Build the dense layer, if present; its last input dimension is intermediate_size
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.intermediate_size])
# Build the LayerNorm layer, if present
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
class TFBlipTextLayer(keras.layers.Layer):
# Constructor for a single Transformer layer
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.config = config
# Self-attention block, named "attention"
self.attention = TFBlipTextAttention(config, name="attention")
# In decoder mode, add a cross-attention block named "crossattention"
if self.config.is_decoder:
self.crossattention = TFBlipTextAttention(
config, is_cross_attention=self.config.is_decoder, name="crossattention"
)
# Feed-forward intermediate block, named "intermediate"
self.intermediate = TFBlipTextIntermediate(config, name="intermediate")
# Feed-forward output block, named "output"
self.self_output = TFBlipTextOutput(config, name="output")
# Forward pass of the layer
def call(
self,
hidden_states,
attention_mask=None,
head_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
past_key_value=None,
output_attentions=False,
training=None,
):
# The first two entries of past_key_value cache the decoder's uni-directional self-attention
self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
# Run self-attention
self_attention_outputs = self.attention(
hidden_states,
attention_mask,
head_mask,
output_attentions=output_attentions,
past_key_value=self_attn_past_key_value,
training=training,
)
# The attention output is the first element
attention_output = self_attention_outputs[0]
# Keep everything except the attention output and the present key/value
outputs = self_attention_outputs[1:-1]
# The present key/value is the last element
present_key_value = self_attention_outputs[-1]
# If encoder hidden states are given, run cross-attention on top of the self-attention output
if encoder_hidden_states is not None:
# Run cross-attention
cross_attention_outputs = self.crossattention(
attention_output,
attention_mask,
head_mask,
encoder_hidden_states,
encoder_attention_mask,
output_attentions=output_attentions,
training=training,
)
# The cross-attention output becomes the new attention output
attention_output = cross_attention_outputs[0]
# If attention weights are requested, append the cross-attention weights to the outputs
outputs = outputs + cross_attention_outputs[1:-1]
# Run the intermediate (feed-forward) block
intermediate_output = self.intermediate(attention_output)
# Run the output block to get the final layer output
layer_output = self.self_output(intermediate_output, attention_output, training=training)
# Prepend the layer output to the collected outputs
outputs = (layer_output,) + outputs
# Append the present key/value to the outputs
outputs = outputs + (present_key_value,)
# Return all outputs
return outputs
# build ensures every sub-layer is constructed
def build(self, input_shape=None):
# Return early if already built
if self.built:
return
# Mark the layer as built
self.built = True
# Build the attention block, if present
if getattr(self, "attention", None) is not None:
with tf.name_scope(self.attention.name):
self.attention.build(None)
# Build the intermediate block, if present
if getattr(self, "intermediate", None) is not None:
with tf.name_scope(self.intermediate.name):
self.intermediate.build(None)
# Build the output block, if present
if getattr(self, "self_output", None) is not None:
with tf.name_scope(self.self_output.name):
self.self_output.build(None)
# Build the cross-attention block, if present
if getattr(self, "crossattention", None) is not None:
with tf.name_scope(self.crossattention.name):
self.crossattention.build(None)
# The keras_serializable decorator makes this Keras layer serializable
@keras_serializable
class TFBlipTextEncoder(keras.layers.Layer):
# Associated configuration class
config_class = BlipTextConfig
# Constructor taking a config and an optional name
def __init__(self, config, name=None, **kwargs):
super().__init__(name=name, **kwargs)
# Keep a reference to the config
self.config = config
# Stack of TFBlipTextLayer instances, one per hidden layer
self.layer = [TFBlipTextLayer(config, name=f"layer_._{i}") for i in range(config.num_hidden_layers)]
# call is wrapped with unpack_inputs so keyword inputs are unpacked consistently
@unpack_inputs
def call(
self,
hidden_states,
attention_mask=None,
head_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
past_key_values=None,
use_cache=None,
output_attentions=False,
output_hidden_states=False,
return_dict=True,
training=None,
):
# Collect hidden states only when output_hidden_states is set
all_hidden_states = () if output_hidden_states else None
# Collect self-attention weights only when output_attentions is set
all_self_attentions = () if output_attentions else None
# Collect cross-attention weights only for a decoder with output_attentions set
all_cross_attentions = () if output_attentions and self.config.is_decoder else None
# Collect the per-layer key/value cache only when use_cache is set
next_decoder_cache = () if use_cache else None
# Iterate over the hidden layers
for i in range(self.config.num_hidden_layers):
# Current layer module
layer_module = self.layer[i]
# Record the hidden states entering this layer, if requested
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
# Head mask for the current layer
layer_head_mask = head_mask[i] if head_mask is not None else None
# Cached key/value for the current layer
past_key_value = past_key_values[i] if past_key_values is not None else None
# Run the layer's forward pass
layer_outputs = layer_module(
hidden_states,
attention_mask,
layer_head_mask,
encoder_hidden_states,
encoder_attention_mask,
past_key_value,
output_attentions,
training=training,
)
# The first element of the layer output is the new hidden states
hidden_states = layer_outputs[0]
# If caching, append the layer's present key/value (the last element)
if use_cache:
next_decoder_cache += (layer_outputs[-1],)
# If requested, append the layer's attention weights
if output_attentions:
all_self_attentions = all_self_attentions + (layer_outputs[1],)
all_cross_attentions = all_cross_attentions + (layer_outputs[2],)
# Record the final hidden states, if requested
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
# When return_dict is False, return a tuple of the non-None values
if not return_dict:
return tuple(
v
for v in [
hidden_states,
next_decoder_cache,
all_hidden_states,
all_self_attentions,
all_cross_attentions,
]
if v is not None
)
# When return_dict is True, return a TFBaseModelOutputWithPastAndCrossAttentions object
return TFBaseModelOutputWithPastAndCrossAttentions(
last_hidden_state=hidden_states,
past_key_values=next_decoder_cache,
hidden_states=all_hidden_states,
attentions=all_self_attentions,
cross_attentions=all_cross_attentions,
)
# build constructs every layer in the stack
def build(self, input_shape=None):
# Return early if already built
if self.built:
return
# Mark the layer as built
self.built = True
# Build each layer in the stack, if present
if getattr(self, "layer", None) is not None:
for layer in self.layer:
# A dedicated name scope keeps each layer's variable names unique
with tf.name_scope(layer.name):
# Build the layer with an unspecified input shape
layer.build(None)
# Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler with Bert->BlipText
class TFBlipTextPooler(keras.layers.Layer):
def __init__(self, config: BlipTextConfig, **kwargs):
super().__init__(**kwargs)
# Dense layer used for pooling, with tanh activation and hidden_size output units
self.dense = keras.layers.Dense(
units=config.hidden_size,
kernel_initializer=get_initializer(config.initializer_range),
activation="tanh",
name="dense",
)
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
# Pool by taking the hidden state of the first token
first_token_tensor = hidden_states[:, 0]
# Pass it through the tanh-activated dense layer
pooled_output = self.dense(inputs=first_token_tensor)
return pooled_output
def build(self, input_shape=None):
# Return early if already built
if self.built:
return
self.built = True
# Build the dense layer, if present, inside its own name scope
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
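# Illustrative sketch (not the library code) of the pooling rule implemented above: take the hidden state
# of the first token and run it through a tanh-activated Dense layer.
import tensorflow as tf

hidden_states = tf.random.normal((2, 7, 16))  # (batch, seq_len, hidden_size)
dense = tf.keras.layers.Dense(16, activation="tanh")
print(dense(hidden_states[:, 0]).shape)  # (2, 16)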
# Copied from transformers.models.bert.modeling_tf_bert.TFBertPredictionHeadTransform with Bert->BlipText
class TFBlipTextPredictionHeadTransform(keras.layers.Layer):
def __init__(self, config: BlipTextConfig, **kwargs):
super().__init__(**kwargs)
# Dense layer transforming the hidden states, with hidden_size output units
self.dense = keras.layers.Dense(
units=config.hidden_size,
kernel_initializer=get_initializer(config.initializer_range),
name="dense",
)
# Resolve the activation function from the config (string names are mapped to TF activations)
if isinstance(config.hidden_act, str):
self.transform_act_fn = get_tf_activation(config.hidden_act)
else:
self.transform_act_fn = config.hidden_act
# LayerNormalization applied to the transformed hidden states
self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
# Project the hidden states with the dense layer
hidden_states = self.dense(inputs=hidden_states)
# Apply the activation function
hidden_states = self.transform_act_fn(hidden_states)
# Normalize with LayerNorm
hidden_states = self.LayerNorm(inputs=hidden_states)
return hidden_states
def build(self, input_shape=None):
# Return early if already built
if self.built:
return
self.built = True
# Build the dense layer, if present, inside its own name scope
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
# Build the LayerNorm layer, if present, inside its own name scope
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
class TFBlipTextLMPredictionHead(keras.layers.Layer):
# Language-modeling prediction head: transforms the hidden states and projects them onto the vocabulary
# Constructor taking the config and keyword arguments
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
# Transformation block applied before the vocabulary projection
self.transform = TFBlipTextPredictionHeadTransform(config, name="transform")
# The output weights are the same as the input embeddings, but there is an output-only bias for each token.
# Dense layer projecting to vocab_size, initialized with the configured range and without its own bias
self.decoder = keras.layers.Dense(
config.vocab_size,
kernel_initializer=get_initializer(config.initializer_range),
name="decoder",
use_bias=False,
)
# Keep a reference to the config
self.config = config
# Build the head's weights
def build(self, input_shape=None):
# Add a trainable bias of shape (vocab_size,), initialized to zeros
self.bias = self.add_weight(name="bias", shape=(self.config.vocab_size,), initializer="zeros", trainable=True)
# Return early if already built
if self.built:
return
# Mark the layer as built
self.built = True
# Build the transform block, if present
if getattr(self, "transform", None) is not None:
with tf.name_scope(self.transform.name):
self.transform.build(None)
# Build the decoder projection, if present
if getattr(self, "decoder", None) is not None:
with tf.name_scope(self.decoder.name):
# The decoder expects inputs of shape [None, None, hidden_size]
self.decoder.build([None, None, self.config.hidden_size])
# Forward pass: transform the hidden states and project them onto the vocabulary
def call(self, hidden_states):
# Apply the transform block
hidden_states = self.transform(hidden_states)
# Project with the decoder and add the output bias
hidden_states = self.decoder(hidden_states) + self.bias
# Return the prediction scores
return hidden_states
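# Illustrative sketch (not the library code) of the prediction-head arithmetic above: a bias-free Dense
# projection onto the vocabulary plus a separately stored bias vector, so the bias can be serialized on
# its own.
import tensorflow as tf

vocab_size, hidden_size = 30522, 16
decoder = tf.keras.layers.Dense(vocab_size, use_bias=False)
bias = tf.Variable(tf.zeros((vocab_size,)), name="bias")
hidden = tf.random.normal((2, 5, hidden_size))
print((decoder(hidden) + bias).shape)  # (2, 5, 30522)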
class TFBlipTextOnlyMLMHead(keras.layers.Layer):
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
# Prediction head computing the language-modeling scores
self.predictions = TFBlipTextLMPredictionHead(config, name="predictions")
def call(self, sequence_output: tf.Tensor) -> tf.Tensor:
# Compute prediction scores from the sequence output
prediction_scores = self.predictions(sequence_output)
return prediction_scores
def build(self, input_shape=None):
if self.built:
return
self.built = True
# Build the prediction head, if present
if getattr(self, "predictions", None) is not None:
with tf.name_scope(self.predictions.name):
self.predictions.build(None)
# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L548
class TFBlipTextPreTrainedModel(TFPreTrainedModel):
"""
处理权重初始化和预训练模型下载加载的抽象类,提供简单的接口。
"""
config_class = BlipTextConfig
base_model_prefix = "bert"
_keys_to_ignore_on_load_missing = [r"position_ids"]
# Adapted from https://github.com/salesforce/BLIP/blob/3a29b7410476bf5f2ba0955827390eb6ea1f4f9d/models/med.py#L571
class TFBlipTextModel(TFBlipTextPreTrainedModel):
"""
该模型可以作为编码器(仅具有自注意力)或解码器行事,在后一种情况下,将在自注意力层之间添加交叉注意力层,遵循
[Attention is all you need](https://arxiv.org/abs/1706.03762) 的架构描述。
"""
def __init__(self, config, add_pooling_layer=True, name=None, **kwargs):
super().__init__(config, name=name, **kwargs)
self.config = config
# Create the BLIP text embeddings, the encoder, and (optionally) the pooler
self.embeddings = TFBlipTextEmbeddings(config, name="embeddings")
self.encoder = TFBlipTextEncoder(config, name="encoder")
self.pooler = TFBlipTextPooler(config, name="pooler") if add_pooling_layer else None
def get_input_embeddings(self):
# Return the word embedding layer
return self.embeddings.word_embeddings
def set_input_embeddings(self, value):
# Replace the word embedding layer
self.embeddings.word_embeddings = value
@tf.function
def get_extended_attention_mask(
self, attention_mask: tf.Tensor, input_shape: Tuple[int], is_decoder: bool
):
# Builds the extended (broadcastable, and causal for decoders) attention mask; body not shown here
pass
@add_start_docstrings_to_model_forward(BLIP_TEXT_INPUTS_DOCSTRING)
@unpack_inputs
# The decorators above attach the shared input docstring and unpack keyword inputs.
# The `call` method (whose body is omitted here) runs the model's forward pass with these parameters:
# input_ids: input token IDs
# attention_mask: attention mask tensor
# position_ids: position ID tensor
# head_mask: head mask tensor
# inputs_embeds: pre-computed input embeddings
# encoder_embeds: encoder embeddings
# encoder_hidden_states: hidden states of the (vision) encoder
# encoder_attention_mask: attention mask for the encoder states
# past_key_values: cached key/value tuples
# use_cache: whether to return a key/value cache
# output_attentions: whether to return attention weights
# output_hidden_states: whether to return hidden states
# return_dict: whether to return a ModelOutput instead of a tuple
# is_decoder: whether to run the model as a decoder
# training: whether the model is in training mode
def build(self, input_shape=None):
# Return early if already built
if self.built:
return
# Mark the model as built
self.built = True
# Build the embeddings layer, if present
if getattr(self, "embeddings", None) is not None:
with tf.name_scope(self.embeddings.name):
self.embeddings.build(None)
# Build the encoder, if present
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
# Build the pooler, if present
if getattr(self, "pooler", None) is not None:
with tf.name_scope(self.pooler.name):
self.pooler.build(None)
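# A hedged usage sketch (illustration only): instantiating the text model from a fresh config with random
# weights and running a dummy batch of token ids through it. Exact output classes may vary with the
# installed transformers version.
import tensorflow as tf
from transformers import BlipTextConfig
from transformers.models.blip.modeling_tf_blip_text import TFBlipTextModel

config = BlipTextConfig()
model = TFBlipTextModel(config)
outputs = model(input_ids=tf.constant([[101, 2023, 2003, 102]]), training=False)
print(outputs.last_hidden_state.shape)  # expected: (1, 4, config.hidden_size)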
# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L811
class TFBlipTextLMHeadModel(TFBlipTextPreTrainedModel):
# Unexpected keys to ignore when loading a checkpoint
_keys_to_ignore_on_load_unexpected = [r"pooler"]
# Missing keys to ignore when loading a checkpoint
_keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"]
def __init__(self, config, **kwargs):
super().__init__(config, **kwargs)
# BERT-style text model without the pooling layer
self.bert = TFBlipTextModel(config, add_pooling_layer=False, name="bert")
# MLM-only head on top of the text model
self.cls = TFBlipTextOnlyMLMHead(config, name="cls")
# Label-smoothing factor used for the LM loss
self.label_smoothing = config.label_smoothing
# Return the output embedding (vocabulary projection) layer
def get_output_embeddings(self):
return self.cls.predictions.decoder
# Replace the output embedding layer
def set_output_embeddings(self, new_embeddings):
self.cls.predictions.decoder = new_embeddings
# call attaches the shared input docstring and unpacks keyword inputs
@add_start_docstrings_to_model_forward(BLIP_TEXT_INPUTS_DOCSTRING)
@unpack_inputs
def call(
self,
input_ids=None,
attention_mask=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
labels=None,
past_key_values=None,
use_cache=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
return_logits=False,
is_decoder=True,
training=None,
):
# The body of `call` (which runs the decoder and computes the language-modeling loss) is omitted here.
def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, **model_kwargs):
input_shape = input_ids.shape
# If no attention mask is given, use an all-ones mask
if attention_mask is None:
attention_mask = input_ids.new_ones(input_shape)
# When a key/value cache is used, only the last input token is fed to the model
if past_key_values is not None:
input_ids = input_ids[:, -1:]
return {
"input_ids": input_ids,
"attention_mask": attention_mask,
"past_key_values": past_key_values,
"encoder_hidden_states": model_kwargs.get("encoder_hidden_states", None),
"encoder_attention_mask": model_kwargs.get("encoder_attention_mask", None),
"is_decoder": True,
}
# Reorder the key/value cache to match the beam order during beam search
def _reorder_cache(self, past_key_values, beam_idx):
reordered_past = ()
for layer_past in past_key_values:
reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),)
return reordered_past
# Build the model's sub-modules
def build(self, input_shape=None):
if self.built:
return
self.built = True
# Build the text model, if present
if getattr(self, "bert", None) is not None:
with tf.name_scope(self.bert.name):
self.bert.build(None)
# Build the MLM head, if present
if getattr(self, "cls", None) is not None:
with tf.name_scope(self.cls.name):
self.cls.build(None)
.\models\blip\processing_blip.py
"""
Processor class for Blip.
"""
from typing import List, Optional, Union
from ...image_utils import ImageInput
from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
from ...utils import TensorType
class BlipProcessor(ProcessorMixin):
r"""
Constructs a BLIP processor which wraps a BERT tokenizer and BLIP image processor into a single processor.
[`BlipProcessor`] offers all the functionalities of [`BlipImageProcessor`] and [`BertTokenizerFast`]. See the
docstring of [`~BlipProcessor.__call__`] and [`~BlipProcessor.decode`] for more information.
Args:
image_processor (`BlipImageProcessor`):
An instance of [`BlipImageProcessor`]. The image processor is a required input.
tokenizer (`BertTokenizerFast`):
An instance of [`BertTokenizerFast`]. The tokenizer is a required input.
"""
attributes = ["image_processor", "tokenizer"]
image_processor_class = "BlipImageProcessor"
tokenizer_class = ("BertTokenizer", "BertTokenizerFast")
def __init__(self, image_processor, tokenizer):
tokenizer.return_token_type_ids = False
super().__init__(image_processor, tokenizer)
self.current_processor = self.image_processor
def __call__(
self,
images: ImageInput = None,
text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = None,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_token_type_ids: bool = False,
return_length: bool = False,
verbose: bool = True,
return_tensors: Optional[Union[str, TensorType]] = None,
**kwargs,
) -> BatchEncoding:
"""
This method uses [`BlipImageProcessor.__call__`] to prepare image(s) for the model and
[`BertTokenizerFast.__call__`] to prepare text for the model. Please refer to the docstrings of those two
methods for more information.

Args:
images (ImageInput, optional): Input images to be processed.
text (Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]], optional):
Input text or pre-tokenized text inputs to be processed.
add_special_tokens (bool, optional): Whether to add special tokens to the inputs.
padding (Union[bool, str, PaddingStrategy], optional): Padding strategy for inputs.
truncation (Union[bool, str, TruncationStrategy], optional): Truncation strategy for inputs.
max_length (int, optional): Maximum length of the processed inputs.
stride (int, optional): Stride used for overflowing tokens.
pad_to_multiple_of (int, optional): Pad inputs to a multiple of this number.
return_attention_mask (bool, optional): Whether to return attention masks.
return_overflowing_tokens (bool, optional): Whether to return overflowing tokens.
return_special_tokens_mask (bool, optional): Whether to return special tokens mask.
return_offsets_mapping (bool, optional): Whether to return offsets mapping.
return_token_type_ids (bool, optional): Whether to return token type IDs.
return_length (bool, optional): Whether to return the length of the processed inputs.
verbose (bool, optional): Whether to print verbose information.
return_tensors (Union[str, TensorType], optional): Type of tensor to return.
Returns:
BatchEncoding: Processed inputs in batch encoding format.
"""
if images is None and text is None:
raise ValueError("You have to specify either images or text.")
if images is None:
self.current_processor = self.tokenizer
text_encoding = self.tokenizer(
text=text,
add_special_tokens=add_special_tokens,
padding=padding,
truncation=truncation,
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_token_type_ids=return_token_type_ids,
return_length=return_length,
verbose=verbose,
return_tensors=return_tensors,
**kwargs,
)
return text_encoding
encoding_image_processor = self.image_processor(images, return_tensors=return_tensors)
if text is not None:
text_encoding = self.tokenizer(
text=text,
add_special_tokens=add_special_tokens,
padding=padding,
truncation=truncation,
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_token_type_ids=return_token_type_ids,
return_length=return_length,
verbose=verbose,
return_tensors=return_tensors,
**kwargs,
)
else:
text_encoding = None
if text_encoding is not None:
encoding_image_processor.update(text_encoding)
return encoding_image_processor
def batch_decode(self, *args, **kwargs):
"""
将所有参数转发给 BertTokenizerFast 的 [`~PreTrainedTokenizer.batch_decode`] 方法。
请参考该方法的文档字符串获取更多信息。
"""
return self.tokenizer.batch_decode(*args, **kwargs)
def decode(self, *args, **kwargs):
"""
This method forwards all its arguments to BertTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
the docstring of this method for more information.
"""
return self.tokenizer.decode(*args, **kwargs)
@property
def model_input_names(self):
tokenizer_input_names = self.tokenizer.model_input_names
image_processor_input_names = self.image_processor.model_input_names
return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
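# A hedged usage sketch of `BlipProcessor` as wired above. The checkpoint name is an assumption; any BLIP
# checkpoint that ships a processor config should behave the same. The demo image URL is the one used by
# the BLIP-2 conversion script later in this post.
import requests
from PIL import Image
from transformers import BlipProcessor

processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
url = "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/assets/merlion.png"
image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
inputs = processor(images=image, text="a photography of", return_tensors="tf")
print(sorted(inputs.keys()))  # expected: ['attention_mask', 'input_ids', 'pixel_values']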
.\models\blip\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_tf_available,
is_torch_available,
is_vision_available,
)
_import_structure = {
"configuration_blip": [
"BLIP_PRETRAINED_CONFIG_ARCHIVE_MAP",
"BlipConfig",
"BlipTextConfig",
"BlipVisionConfig",
],
"processing_blip": ["BlipProcessor"],
}
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["image_processing_blip"] = ["BlipImageProcessor"]
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_blip"] = [
"BLIP_PRETRAINED_MODEL_ARCHIVE_LIST",
"BlipModel",
"BlipPreTrainedModel",
"BlipForConditionalGeneration",
"BlipForQuestionAnswering",
"BlipVisionModel",
"BlipTextModel",
"BlipForImageTextRetrieval",
]
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_tf_blip"] = [
"TF_BLIP_PRETRAINED_MODEL_ARCHIVE_LIST",
"TFBlipModel",
"TFBlipPreTrainedModel",
"TFBlipForConditionalGeneration",
"TFBlipForQuestionAnswering",
"TFBlipVisionModel",
"TFBlipTextModel",
"TFBlipForImageTextRetrieval",
]
if TYPE_CHECKING:
from .configuration_blip import BLIP_PRETRAINED_CONFIG_ARCHIVE_MAP, BlipConfig, BlipTextConfig, BlipVisionConfig
from .processing_blip import BlipProcessor
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .image_processing_blip import BlipImageProcessor
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_blip import (
BLIP_PRETRAINED_MODEL_ARCHIVE_LIST,
BlipForConditionalGeneration,
BlipForImageTextRetrieval,
BlipForQuestionAnswering,
BlipModel,
BlipPreTrainedModel,
BlipTextModel,
BlipVisionModel,
)
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_tf_blip import (
TF_BLIP_PRETRAINED_MODEL_ARCHIVE_LIST,
TFBlipForConditionalGeneration,
TFBlipForImageTextRetrieval,
TFBlipForQuestionAnswering,
TFBlipModel,
TFBlipPreTrainedModel,
TFBlipTextModel,
TFBlipVisionModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\blip_2\configuration_blip_2.py
""" BLIP-2 model configuration"""
import os
from typing import Union
from ...configuration_utils import PretrainedConfig
from ...models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
from ...utils import logging
from ..auto import CONFIG_MAPPING
logger = logging.get_logger(__name__)
BLIP_2_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"salesforce/blip2-opt-2.7b": "https://huggingface.co/salesforce/blip2-opt-2.7b/resolve/main/config.json",
}
class Blip2VisionConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`Blip2VisionModel`]. It is used to instantiate a
BLIP-2 vision encoder according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the BLIP-2
[Salesforce/blip2-opt-2.7b](https://huggingface.co/Salesforce/blip2-opt-2.7b) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
"""
Args:
hidden_size (`int`, *optional*, defaults to 1408):
intermediate_size (`int`, *optional*, defaults to 6144):
num_hidden_layers (`int`, *optional*, defaults to 39):
num_attention_heads (`int`, *optional*, defaults to 16):
image_size (`int`, *optional*, defaults to 224):
patch_size (`int`, *optional*, defaults to 14):
hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
attention_dropout (`float`, *optional*, defaults to 0.0):
initializer_range (`float`, *optional*, defaults to 0.02):
qkv_bias (`bool`, *optional*, defaults to `True`):
Example:
```
>>> from transformers import Blip2VisionConfig, Blip2VisionModel
>>>
>>> configuration = Blip2VisionConfig()
>>>
>>> model = Blip2VisionModel(configuration)
>>>
>>> configuration = model.config
```"""
model_type = "blip_2_vision_model"
def __init__(
self,
hidden_size=1408, # Dimensionality of the encoder layers and the pooler layer, 1408 by default
intermediate_size=6144, # Dimensionality of the feed-forward layer in the Transformer encoder, 6144 by default
num_hidden_layers=39, # Number of hidden layers in the Transformer encoder, 39 by default
num_attention_heads=16, # Number of attention heads per attention layer, 16 by default
image_size=224, # Size (resolution) of each image, 224 by default
patch_size=14, # Size (resolution) of each patch, 14 by default
hidden_act="gelu", # Non-linear activation function in the encoder and pooler, "gelu" by default
layer_norm_eps=1e-6, # Epsilon used by the layer normalization layers, 1e-6 by default
attention_dropout=0.0, # Dropout ratio for the attention probabilities, 0.0 by default
initializer_range=1e-10, # Standard deviation of the truncated_normal_initializer for weight matrices, 1e-10 by default
qkv_bias=True, # Whether to add a bias to the queries and values in the self-attention layers, True by default
**kwargs, # Any additional keyword arguments
):
# Pass any remaining keyword arguments to the parent constructor
super().__init__(**kwargs)
# Store the configuration values on the instance
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.patch_size = patch_size
self.image_size = image_size
self.initializer_range = initializer_range
self.attention_dropout = attention_dropout
self.layer_norm_eps = layer_norm_eps
self.hidden_act = hidden_act
self.qkv_bias = qkv_bias
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
# Set the token in the keyword arguments
cls._set_token_in_kwargs(kwargs)
# Fetch the config dict of the pretrained model along with the updated keyword arguments
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
# If the loaded config is a full "blip-2" config, use its vision_config sub-dict
if config_dict.get("model_type") == "blip-2":
config_dict = config_dict["vision_config"]
# Warn if the config's model_type differs from this class's model_type
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
logger.warning(
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
)
# Build the config instance from the dict and keyword arguments
return cls.from_dict(config_dict, **kwargs)
# Blip2QFormerConfig stores the configuration of the Q-Former model
class Blip2QFormerConfig(PretrainedConfig):
r"""
这是配置类,用于存储 [`Blip2QFormerModel`] 的配置信息。它被用来根据指定的参数实例化一个 BLIP-2 Querying Transformer (Q-Former) 模型,
定义模型的架构。使用默认参数实例化一个配置对象会产生与 BLIP-2 [Salesforce/blip2-opt-2.7b](https://huggingface.co/Salesforce/blip2-opt-2.7b) 架构
类似的配置。配置对象继承自 [`PretrainedConfig`],可以用来控制模型的输出。阅读 [`PretrainedConfig`] 的文档获取更多信息。
注意,[`Blip2QFormerModel`] 与 [`BertLMHeadModel`] 非常相似,具有交错的跨注意力机制。
```
Args:
vocab_size (`int`, *optional*, defaults to 30522):
Q-Former 模型的词汇表大小,定义了在调用模型时 `inputs_ids` 可以表示的不同标记数量。
hidden_size (`int`, *optional*, defaults to 768):
编码器层和池化层的维度大小。
num_hidden_layers (`int`, *optional*, defaults to 12):
Transformer 编码器中隐藏层的数量。
num_attention_heads (`int`, *optional*, defaults to 12):
Transformer 编码器中每个注意力层的注意力头数量。
intermediate_size (`int`, *optional*, defaults to 3072):
Transformer 编码器中“中间”(常称为前馈)层的维度。
hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
编码器和池化器中的非线性激活函数(函数或字符串)。支持的字符串有:"gelu"、"relu"、"silu" 和 "gelu_new"。
hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
嵌入层、编码器和池化器中所有全连接层的 dropout 概率。
attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
注意力概率的 dropout 比率。
max_position_embeddings (`int`, *optional*, defaults to 512):
模型可能使用的最大序列长度。通常设置为一个较大的值(例如 512、1024 或 2048)。
initializer_range (`float`, *optional*, defaults to 0.02):
初始化所有权重矩阵的截断正态分布的标准差。
layer_norm_eps (`float`, *optional*, defaults to 1e-12):
层归一化层使用的 epsilon 值。
position_embedding_type (`str`, *optional*, defaults to `"absolute"`):
位置嵌入的类型。选择 `"absolute"`、`"relative_key"` 或 `"relative_key_query"` 之一。关于 `"relative_key"` 的更多信息,请参考
[Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155)。
关于 `"relative_key_query"` 的更多信息,请参考
[Improve Transformer Models with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658) 中的 *Method 4*。
cross_attention_frequency (`int`, *optional*, defaults to 2):
在 Transformer 层中添加跨注意力的频率。
encoder_hidden_size (`int`, *optional*, defaults to 1408):
用于跨注意力的隐藏状态的隐藏大小。
Examples:
```
>>> from transformers import Blip2QFormerConfig, Blip2QFormerModel
>>>
model_type = "blip_2_qformer"
def __init__(
self,
vocab_size=30522,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
initializer_range=0.02,
layer_norm_eps=1e-12,
pad_token_id=0,
position_embedding_type="absolute",
cross_attention_frequency=2,
encoder_hidden_size=1408,
**kwargs,
):
super().__init__(pad_token_id=pad_token_id, **kwargs)
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.hidden_act = hidden_act
self.intermediate_size = intermediate_size
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.position_embedding_type = position_embedding_type
self.cross_attention_frequency = cross_attention_frequency
self.encoder_hidden_size = encoder_hidden_size
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
cls._set_token_in_kwargs(kwargs)
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
if config_dict.get("model_type") == "blip-2":
config_dict = config_dict["qformer_config"]
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
logger.warning(
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
)
return cls.from_dict(config_dict, **kwargs)
class Blip2Config(PretrainedConfig):
model_type = "blip-2"
def __init__(self, vision_config=None, qformer_config=None, text_config=None, num_query_tokens=32, **kwargs):
super().__init__(**kwargs)
if vision_config is None:
vision_config = {}
logger.info("vision_config is None. initializing the Blip2VisionConfig with default values.")
if qformer_config is None:
qformer_config = {}
logger.info("qformer_config is None. Initializing the Blip2QFormerConfig with default values.")
if text_config is None:
text_config = {}
logger.info("text_config is None. Initializing the text config with default values (`OPTConfig`).")
self.vision_config = Blip2VisionConfig(**vision_config)
self.qformer_config = Blip2QFormerConfig(**qformer_config)
text_model_type = text_config["model_type"] if "model_type" in text_config else "opt"
self.text_config = CONFIG_MAPPING[text_model_type](**text_config)
self.tie_word_embeddings = self.text_config.tie_word_embeddings
self.is_encoder_decoder = self.text_config.is_encoder_decoder
self.num_query_tokens = num_query_tokens
self.qformer_config.encoder_hidden_size = self.vision_config.hidden_size
self.use_decoder_only_language_model = self.text_config.model_type in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
self.initializer_factor = 1.0
self.initializer_range = 0.02
@classmethod
def from_vision_qformer_text_configs(
cls,
vision_config: Blip2VisionConfig,
qformer_config: Blip2QFormerConfig,
text_config: PretrainedConfig,
**kwargs,
):
r"""
Instantiate a [`Blip2Config`] (or a derived class) from a BLIP-2 vision model, Q-Former and language model
configurations.
Returns:
[`Blip2Config`]: An instance of a configuration object
"""
return cls(
vision_config=vision_config.to_dict(),
qformer_config=qformer_config.to_dict(),
text_config=text_config.to_dict(),
**kwargs,
)
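# A hedged usage sketch of the helper above, composing a Blip2Config from default sub-configs. An OPTConfig
# stands in for the language model, mirroring the default chosen in Blip2Config.__init__.
from transformers import Blip2Config, Blip2QFormerConfig, Blip2VisionConfig, OPTConfig

vision_config = Blip2VisionConfig()
qformer_config = Blip2QFormerConfig()
text_config = OPTConfig()
config = Blip2Config.from_vision_qformer_text_configs(vision_config, qformer_config, text_config)
print(config.num_query_tokens, config.qformer_config.encoder_hidden_size)  # 32 1408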
.\models\blip_2\convert_blip_2_original_to_pytorch.py
"""
Convert BLIP-2 checkpoints from the original repository.
URL: https://github.com/salesforce/LAVIS/tree/main/projects/blip2
"""
import argparse
import requests
import torch
from lavis.models import load_model_and_preprocess
from PIL import Image
from transformers import (
AutoTokenizer,
Blip2Config,
Blip2ForConditionalGeneration,
Blip2Processor,
Blip2VisionConfig,
BlipImageProcessor,
OPTConfig,
T5Config,
set_seed,
)
from transformers.utils.constants import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
def load_demo_image():
url = "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/assets/merlion.png"
image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
return image
def create_rename_keys(config):
rename_keys = []
rename_keys.append(("visual_encoder.cls_token", "vision_model.embeddings.class_embedding"))
rename_keys.append(("visual_encoder.pos_embed", "vision_model.embeddings.position_embedding"))
rename_keys.append(("visual_encoder.patch_embed.proj.weight", "vision_model.embeddings.patch_embedding.weight"))
rename_keys.append(("visual_encoder.patch_embed.proj.bias", "vision_model.embeddings.patch_embedding.bias"))
rename_keys.append(("ln_vision.weight", "vision_model.post_layernorm.weight"))
rename_keys.append(("ln_vision.bias", "vision_model.post_layernorm.bias"))
for i in range(config.vision_config.num_hidden_layers):
rename_keys.append((f"visual_encoder.blocks.{i}.norm1.weight", f"vision_model.encoder.layers.{i}.layer_norm1.weight"))
rename_keys.append((f"visual_encoder.blocks.{i}.norm1.bias", f"vision_model.encoder.layers.{i}.layer_norm1.bias"))
rename_keys.append((f"visual_encoder.blocks.{i}.norm2.weight", f"vision_model.encoder.layers.{i}.layer_norm2.weight"))
rename_keys.append((f"visual_encoder.blocks.{i}.norm2.bias", f"vision_model.encoder.layers.{i}.layer_norm2.bias"))
rename_keys.append((f"visual_encoder.blocks.{i}.attn.qkv.weight", f"vision_model.encoder.layers.{i}.self_attn.qkv.weight"))
rename_keys.append((f"visual_encoder.blocks.{i}.attn.proj.weight", f"vision_model.encoder.layers.{i}.self_attn.projection.weight",))
rename_keys.append((f"visual_encoder.blocks.{i}.attn.proj.bias", f"vision_model.encoder.layers.{i}.self_attn.projection.bias"))
rename_keys.append((f"visual_encoder.blocks.{i}.mlp.fc1.weight", f"vision_model.encoder.layers.{i}.mlp.fc1.weight"))
rename_keys.append((f"visual_encoder.blocks.{i}.mlp.fc1.bias", f"vision_model.encoder.layers.{i}.mlp.fc1.bias"))
rename_keys.append((f"visual_encoder.blocks.{i}.mlp.fc2.weight", f"vision_model.encoder.layers.{i}.mlp.fc2.weight"))
rename_keys.append((f"visual_encoder.blocks.{i}.mlp.fc2.bias", f"vision_model.encoder.layers.{i}.mlp.fc2.bias"))
rename_keys.append(("Qformer.bert.embeddings.LayerNorm.weight", "qformer.layernorm.weight"))
rename_keys.append(("Qformer.bert.embeddings.LayerNorm.bias", "qformer.layernorm.bias"))
return rename_keys
def rename_key(dct, old, new):
val = dct.pop(old)
dct[new] = val
def read_in_q_v_bias(state_dict, config):
for i in range(config.vision_config.num_hidden_layers):
q_bias = state_dict.pop(f"visual_encoder.blocks.{i}.attn.q_bias")
v_bias = state_dict.pop(f"visual_encoder.blocks.{i}.attn.v_bias")
qkv_bias = torch.cat((q_bias, torch.zeros_like(v_bias, requires_grad=False), v_bias))
state_dict[f"vision_model.encoder.layers.{i}.self_attn.qkv.bias"] = qkv_bias
def get_blip2_config(model_name, eos_token_id):
image_size = 364 if "coco" in model_name else 224
vision_config = Blip2VisionConfig(image_size=image_size).to_dict()
if "opt-2.7b" in model_name:
text_config = OPTConfig.from_pretrained("facebook/opt-2.7b", eos_token_id=eos_token_id).to_dict()
elif "opt-6.7b" in model_name:
text_config = OPTConfig.from_pretrained("facebook/opt-6.7b", eos_token_id=eos_token_id).to_dict()
elif "t5-xl" in model_name:
text_config = T5Config.from_pretrained("google/flan-t5-xl", dense_act_fn="gelu", bos_token_id=1).to_dict()
elif "t5-xxl" in model_name:
text_config = T5Config.from_pretrained("google/flan-t5-xxl", dense_act_fn="gelu", bos_token_id=1).to_dict()
config = Blip2Config(vision_config=vision_config, text_config=text_config)
return config, image_size
@torch.no_grad()
def convert_blip2_checkpoint(model_name, pytorch_dump_folder_path=None, push_to_hub=False):
"""
Copy/paste/tweak the model's weights to the Transformers design.
"""
tokenizer = (
AutoTokenizer.from_pretrained("facebook/opt-2.7b")
if "opt" in model_name
else AutoTokenizer.from_pretrained("google/flan-t5-xl")
)
eos_token_id = tokenizer("\n", add_special_tokens=False).input_ids[0]
config, image_size = get_blip2_config(model_name, eos_token_id=eos_token_id)
hf_model = Blip2ForConditionalGeneration(config).eval()
model_name_to_original = {
"blip2-opt-2.7b": ("blip2_opt", "pretrain_opt2.7b"),
"blip2-opt-6.7b": ("blip2_opt", "pretrain_opt6.7b"),
"blip2-opt-2.7b-coco": ("blip2_opt", "caption_coco_opt2.7b"),
"blip2-opt-6.7b-coco": ("blip2_opt", "caption_coco_opt6.7b"),
"blip2-flan-t5-xl": ("blip2_t5", "pretrain_flant5xl"),
"blip2-flan-t5-xl-coco": ("blip2_t5", "caption_coco_flant5xl"),
"blip2-flan-t5-xxl": ("blip2_t5", "pretrain_flant5xxl"),
}
hf_model_device = "cuda:0" if torch.cuda.is_available() else "cpu"
lavis_device = "cuda:1" if torch.cuda.is_available() else "cpu"
# Look up the LAVIS model name and type for the requested checkpoint
name, type = model_name_to_original[model_name]
print("Loading original model...")
original_model, vis_processors, _ = load_model_and_preprocess(
name=name, model_type=type, is_eval=True, device=lavis_device
)
original_model.eval()
print("Done!")
state_dict = original_model.state_dict()
rename_keys = create_rename_keys(config)
for src, dest in rename_keys:
rename_key(state_dict, src, dest)
for key, val in state_dict.copy().items():
val = state_dict.pop(key)
if key.startswith("Qformer.bert"):
key = key.replace("Qformer.bert", "qformer")
if "attention.self" in key:
key = key.replace("self", "attention")
if "opt_proj" in key:
key = key.replace("opt_proj", "language_projection")
if "t5_proj" in key:
key = key.replace("t5_proj", "language_projection")
if key.startswith("opt"):
key = key.replace("opt", "language")
if key.startswith("t5"):
key = key.replace("t5", "language")
state_dict[key] = val
read_in_q_v_bias(state_dict, config)
missing_keys, unexpected_keys = hf_model.load_state_dict(state_dict, strict=False)
assert len(missing_keys) == 0
assert unexpected_keys == ["qformer.embeddings.position_ids"]
image = load_demo_image()
original_pixel_values = vis_processors["eval"](image).unsqueeze(0).to(lavis_device)
input_ids = tokenizer(["\n"], return_tensors="pt").input_ids.to(hf_model_device)
image_processor = BlipImageProcessor(
size={"height": image_size, "width": image_size}, image_mean=OPENAI_CLIP_MEAN, image_std=OPENAI_CLIP_STD
)
processor = Blip2Processor(image_processor=image_processor, tokenizer=tokenizer)
pixel_values = processor(images=image, return_tensors="pt").pixel_values.to(hf_model_device)
assert torch.allclose(pixel_values, original_pixel_values.to(pixel_values.device))
original_model.to(lavis_device)
hf_model.to(hf_model_device)
with torch.no_grad():
if "opt" in model_name:
original_logits = original_model({"image": original_pixel_values, "text_input": [""]}).logits
logits = hf_model(pixel_values, input_ids).logits
else:
original_logits = original_model(
{"image": original_pixel_values, "text_input": ["\n"], "text_output": ["\n"]}
).logits
labels = input_ids.masked_fill(input_ids == tokenizer.pad_token_id, -100)
logits = hf_model(pixel_values, input_ids, labels=labels).logits
assert original_logits.shape == logits.shape
if __name__ == "__main__":
parser = argparse.ArgumentParser()
choices = [
"blip2-opt-2.7b",
"blip2-opt-6.7b",
"blip2-opt-2.7b-coco",
"blip2-opt-6.7b-coco",
"blip2-flan-t5-xl",
"blip2-flan-t5-xl-coco",
"blip2-flan-t5-xxl",
]
parser.add_argument(
"--model_name",
default="blip2-opt-2.7b",
choices=choices,
type=str,
help="Path to hf config.json of model to convert",
)
parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.")
parser.add_argument(
"--push_to_hub",
action="store_true",
help="Whether to push the model and processor to the hub after converting",
)
args = parser.parse_args()
convert_blip2_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub)
.\models\blip_2\modeling_blip_2.py
""" PyTorch BLIP-2 model."""
import math
from dataclasses import dataclass
from typing import Any, Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import CrossEntropyLoss
from ...activations import ACT2FN
from ...modeling_outputs import (
BaseModelOutput,
BaseModelOutputWithPastAndCrossAttentions,
BaseModelOutputWithPooling,
BaseModelOutputWithPoolingAndCrossAttentions,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import (
ModelOutput,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from ..auto import AutoModelForCausalLM, AutoModelForSeq2SeqLM
from .configuration_blip_2 import Blip2Config, Blip2QFormerConfig, Blip2VisionConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "Salesforce/blip2-opt-2.7b"
BLIP_2_PRETRAINED_MODEL_ARCHIVE_LIST = [
"Salesforce/blip2-opt-2.7b",
]
@dataclass
class Blip2ForConditionalGenerationModelOutput(ModelOutput):
"""
Class defining the outputs of [`Blip2ForConditionalGeneration`].
Args:
loss (`torch.FloatTensor`, *optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
Language modeling loss from the language model.
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head of the language model.
vision_outputs (`BaseModelOutputWithPooling`):
Outputs of the vision encoder.
qformer_outputs (`BaseModelOutputWithPoolingAndCrossAttentions`):
Outputs of the Q-Former (Querying Transformer).
language_model_outputs (`CausalLMOutputWithPast` or `Seq2SeqLMOutput`):
Outputs of the language model.
"""
loss: Optional[Tuple[torch.FloatTensor]] = None
logits: Optional[Tuple[torch.FloatTensor]] = None
vision_outputs: Optional[torch.FloatTensor] = None
qformer_outputs: Optional[Tuple[torch.FloatTensor]] = None
language_model_outputs: Optional[Tuple[torch.FloatTensor]] = None
def to_tuple(self) -> Tuple[Any]:
return tuple(
self[k]
if k not in ["vision_outputs", "qformer_outputs", "language_model_outputs"]
else getattr(self, k).to_tuple()
for k in self.keys()
)
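`to_tuple` flattens the nested sub-outputs recursively while passing plain tensors through; fields left as `None` are dropped. A small sketch with hypothetical tensors, assuming the classes defined in this file are in scope:
```python
import torch

out = Blip2ForConditionalGenerationModelOutput(
    logits=torch.randn(1, 5, 100),
    vision_outputs=BaseModelOutputWithPooling(last_hidden_state=torch.randn(1, 257, 8)),
)
flat = out.to_tuple()
# flat[0] is the logits tensor; flat[1] is vision_outputs flattened to a tuple,
# i.e. (last_hidden_state,); None fields (loss, qformer_outputs, ...) are skipped.
```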
class Blip2VisionEmbeddings(nn.Module):
def __init__(self, config: Blip2VisionConfig):
super().__init__()
self.config = config
self.embed_dim = config.hidden_size
self.image_size = config.image_size
self.patch_size = config.patch_size
self.class_embedding = nn.Parameter(torch.randn(1, 1, self.embed_dim))
self.patch_embedding = nn.Conv2d(
in_channels=3, out_channels=self.embed_dim, kernel_size=self.patch_size, stride=self.patch_size
)
self.num_patches = (self.image_size // self.patch_size) ** 2
self.num_positions = self.num_patches + 1
self.position_embedding = nn.Parameter(torch.randn(1, self.num_positions, self.embed_dim))
def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
batch_size = pixel_values.shape[0]
target_dtype = self.patch_embedding.weight.dtype
patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))
patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
class_embeds = self.class_embedding.expand(batch_size, 1, -1).to(target_dtype)
embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
embeddings = embeddings + self.position_embedding[:, : embeddings.size(1), :].to(target_dtype)
return embeddings
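A quick shape walk-through of the embedding step. The concrete numbers below (224-pixel images, 14-pixel patches, hidden size 1408) are illustrative assumptions:
```python
# Hypothetical sizes for illustration only.
image_size, patch_size, hidden_size = 224, 14, 1408
num_patches = (image_size // patch_size) ** 2   # 16 * 16 = 256 patch tokens
num_positions = num_patches + 1                 # +1 for the class embedding
# pixel_values (batch, 3, 224, 224) -> Conv2d(kernel=stride=14) -> (batch, 1408, 16, 16)
# flatten(2).transpose(1, 2)                     -> (batch, 256, 1408)
# cat with class embedding + position embedding  -> (batch, 257, 1408)
```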
class Blip2Attention(nn.Module):
"""来自'Attention Is All You Need'论文的多头注意力模块"""
def __init__(self, config):
super().__init__()
self.config = config
self.embed_dim = config.hidden_size
self.num_heads = config.num_attention_heads
self.head_dim = self.embed_dim // self.num_heads
if self.head_dim * self.num_heads != self.embed_dim:
raise ValueError(
f"embed_dim必须能够被num_heads整除(得到`embed_dim`:{self.embed_dim}和`num_heads`:{self.num_heads})."
)
self.scale = self.head_dim**-0.5
self.dropout = nn.Dropout(config.attention_dropout)
self.qkv = nn.Linear(self.embed_dim, 3 * self.embed_dim, bias=False)
if config.qkv_bias:
q_bias = nn.Parameter(torch.zeros(self.embed_dim))
v_bias = nn.Parameter(torch.zeros(self.embed_dim))
else:
q_bias = None
v_bias = None
if q_bias is not None:
qkv_bias = torch.cat((q_bias, torch.zeros_like(v_bias, requires_grad=False), v_bias))
self.qkv.bias = nn.Parameter(qkv_bias)
self.projection = nn.Linear(self.embed_dim, self.embed_dim)
def forward(
self,
hidden_states: torch.Tensor,
head_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
"""Input shape: Batch x Time x Channel"""
bsz, tgt_len, embed_dim = hidden_states.size()
mixed_qkv = self.qkv(hidden_states)
mixed_qkv = mixed_qkv.reshape(bsz, tgt_len, 3, self.num_heads, embed_dim // self.num_heads).permute(
2, 0, 3, 1, 4
)
query_states, key_states, value_states = mixed_qkv[0], mixed_qkv[1], mixed_qkv[2]
attention_scores = torch.matmul(query_states, key_states.transpose(-1, -2))
attention_scores = attention_scores * self.scale
attention_probs = nn.functional.softmax(attention_scores, dim=-1)
attention_probs = self.dropout(attention_probs)
if head_mask is not None:
attention_probs = attention_probs * head_mask
context_layer = torch.matmul(attention_probs, value_states).permute(0, 2, 1, 3)
new_context_layer_shape = context_layer.size()[:-2] + (self.embed_dim,)
context_layer = context_layer.reshape(new_context_layer_shape)
output = self.projection(context_layer)
outputs = (output, attention_probs) if output_attentions else (output, None)
return outputs
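The fused QKV projection and the subsequent reshapes can be hard to follow; the sketch below traces the tensor shapes with assumed sizes (batch 2, 257 tokens, 1408-dim embeddings, 16 heads):
```python
import torch

bsz, tgt_len, embed_dim, num_heads = 2, 257, 1408, 16   # assumed sizes
head_dim = embed_dim // num_heads                        # 88
mixed_qkv = torch.randn(bsz, tgt_len, 3 * embed_dim)     # output of self.qkv
mixed_qkv = mixed_qkv.reshape(bsz, tgt_len, 3, num_heads, head_dim).permute(2, 0, 3, 1, 4)
q, k, v = mixed_qkv[0], mixed_qkv[1], mixed_qkv[2]       # each (2, 16, 257, 88)
scores = q @ k.transpose(-1, -2)                         # (2, 16, 257, 257)
context = (scores.softmax(-1) @ v).permute(0, 2, 1, 3)   # (2, 257, 16, 88), then reshaped to (2, 257, 1408)
```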
class Blip2MLP(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
self.activation_fn = ACT2FN[config.hidden_act]
self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.fc1(hidden_states)
hidden_states = self.activation_fn(hidden_states)
hidden_states = self.fc2(hidden_states)
return hidden_states
class Blip2EncoderLayer(nn.Module):
def __init__(self, config: Blip2Config):
super().__init__()
self.embed_dim = config.hidden_size
self.self_attn = Blip2Attention(config)
self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
self.mlp = Blip2MLP(config)
self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: torch.Tensor,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.FloatTensor]:
"""
Args:
hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
attention_mask (`torch.FloatTensor`): attention mask of size
`(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers.
"""
residual = hidden_states
hidden_states = self.layer_norm1(hidden_states)
hidden_states, attn_weights = self.self_attn(
hidden_states=hidden_states,
head_mask=attention_mask,
output_attentions=output_attentions,
)
hidden_states = hidden_states + residual
residual = hidden_states
hidden_states = self.layer_norm2(hidden_states)
hidden_states = self.mlp(hidden_states)
hidden_states = hidden_states + residual
outputs = (hidden_states,)
if output_attentions:
outputs += (attn_weights,)
return outputs
class Blip2PreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = Blip2Config
base_model_prefix = "blip"
supports_gradient_checkpointing = True
_no_split_modules = ["Blip2Attention", "T5Block", "OPTDecoderLayer"]
_skip_keys_device_placement = "past_key_values"
_keep_in_fp32_modules = ["wo"]
def _init_weights(self, module):
"""Initialize the weights"""
factor = self.config.initializer_range
if isinstance(module, nn.Conv2d) or isinstance(module, nn.Embedding) or isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=factor)
if hasattr(module, "bias") and module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, Blip2VisionEmbeddings):
if hasattr(self.config, "vision_config"):
factor = self.config.vision_config.initializer_range
nn.init.trunc_normal_(module.position_embedding, mean=0.0, std=factor)
nn.init.trunc_normal_(module.class_embedding, mean=0.0, std=factor)
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
elif isinstance(module, nn.Linear) and module.bias is not None:
module.bias.data.zero_()
BLIP_2_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`Blip2Config`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
BLIP_2_VISION_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Pixel values can be obtained using [`Blip2Processor`]. See [`Blip2Processor.__call__`] for
details.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
BLIP_2_TEXT_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default. Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
Indices of decoder input sequence tokens in the vocabulary.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.
[What are decoder input IDs?](../glossary#decoder-input-ids)
T5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).
To learn more about how to prepare `decoder_input_ids` for pretraining, take a look at [T5 Training](./t5#training).
decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. A causal mask will also be used by default.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
BLIP_2_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Pixel values can be obtained using [`Blip2Processor`]. See [`Blip2Processor.__call__`] for
details.
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Indices of input sequence tokens in the vocabulary of the language model. Input tokens can optionally be
provided to serve as text prompt, which the language model can continue.
Indices can be obtained using [`Blip2Processor`]. See [`Blip2Processor.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
Indices of decoder input sequence tokens in the vocabulary of the language model. Only relevant in case an
encoder-decoder language model (like T5) is used.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details. [What are decoder input IDs?](../glossary#decoder-input-ids)
decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
be used by default.
Only relevant in case an encoder-decoder language model (like T5) is used.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
class Blip2Encoder(nn.Module):
"""
Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
[`Blip2EncoderLayer`].
Args:
config (`Blip2Config`):
The corresponding vision configuration for the `Blip2Encoder`.
"""
def __init__(self, config: Blip2Config):
super().__init__()
self.config = config
self.layers = nn.ModuleList([Blip2EncoderLayer(config) for _ in range(config.num_hidden_layers)])
self.gradient_checkpointing = False
def forward(
self,
inputs_embeds,
attention_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutput]:
r"""
Args:
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Embedded representation of the inputs. Should be float, not int tokens.
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
encoder_states = () if output_hidden_states else None
all_attentions = () if output_attentions else None
hidden_states = inputs_embeds
for idx, encoder_layer in enumerate(self.layers):
if output_hidden_states:
encoder_states = encoder_states + (hidden_states,)
if self.gradient_checkpointing and self.training:
layer_outputs = self._gradient_checkpointing_func(
encoder_layer.__call__,
hidden_states,
attention_mask,
output_attentions,
)
else:
layer_outputs = encoder_layer(
hidden_states,
attention_mask,
output_attentions=output_attentions,
)
hidden_states = layer_outputs[0]
if output_attentions:
all_attentions = all_attentions + (layer_outputs[1],)
if output_hidden_states:
encoder_states = encoder_states + (hidden_states,)
if not return_dict:
return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
return BaseModelOutput(
last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
)
class Blip2VisionModel(Blip2PreTrainedModel):
main_input_name = "pixel_values"
config_class = Blip2VisionConfig
def __init__(self, config: Blip2VisionConfig):
super().__init__(config)
self.config = config
embed_dim = config.hidden_size
self.embeddings = Blip2VisionEmbeddings(config)
self.encoder = Blip2Encoder(config)
self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
self.post_init()
@add_start_docstrings_to_model_forward(BLIP_2_VISION_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=Blip2VisionConfig)
def forward(
self,
pixel_values: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPooling]:
r"""
Returns:
The model outputs. If `return_dict=True`, a [`BaseModelOutputWithPooling`]; otherwise a plain tuple of tensors.
"""
if pixel_values is None:
raise ValueError("You have to specify pixel_values")
hidden_states = self.embeddings(pixel_values)
encoder_outputs = self.encoder(
inputs_embeds=hidden_states,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
last_hidden_state = encoder_outputs[0]
last_hidden_state = self.post_layernorm(last_hidden_state)
pooled_output = last_hidden_state[:, 0, :]
pooled_output = self.post_layernorm(pooled_output)
if not return_dict:
return (last_hidden_state, pooled_output) + encoder_outputs[1:]
return BaseModelOutputWithPooling(
last_hidden_state=last_hidden_state,
pooler_output=pooled_output,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
def get_input_embeddings(self):
return self.embeddings
class Blip2QFormerMultiHeadAttention(nn.Module):
def __init__(self, config, is_cross_attention=False):
super().__init__()
self.config = config
if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
raise ValueError(
"The hidden size (%d) is not a multiple of the number of attention heads (%d)"
% (config.hidden_size, config.num_attention_heads)
)
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.query = nn.Linear(config.hidden_size, self.all_head_size)
if is_cross_attention:
self.key = nn.Linear(config.encoder_hidden_size, self.all_head_size)
self.value = nn.Linear(config.encoder_hidden_size, self.all_head_size)
else:
self.key = nn.Linear(config.hidden_size, self.all_head_size)
self.value = nn.Linear(config.hidden_size, self.all_head_size)
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
self.max_position_embeddings = config.max_position_embeddings
self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)
self.save_attention = False
def save_attn_gradients(self, attn_gradients):
self.attn_gradients = attn_gradients
def get_attn_gradients(self):
return self.attn_gradients
def save_attention_map(self, attention_map):
self.attention_map = attention_map
def get_attention_map(self):
return self.attention_map
def transpose_for_scores(self, x):
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
x = x.view(*new_x_shape)
return x.permute(0, 2, 1, 3)
def forward(
self,
hidden_states,
attention_mask=None,
head_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
past_key_value=None,
output_attentions=False,
):
class Blip2QFormerSelfOutput(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
class Blip2QFormerAttention(nn.Module):
def __init__(self, config, is_cross_attention=False):
super().__init__()
self.attention = Blip2QFormerMultiHeadAttention(config, is_cross_attention)
self.output = Blip2QFormerSelfOutput(config)
self.pruned_heads = set()
def prune_heads(self, heads):
if len(heads) == 0:
return
heads, index = find_pruneable_heads_and_indices(
heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads
)
self.attention.query = prune_linear_layer(self.attention.query, index)
self.attention.key = prune_linear_layer(self.attention.key, index)
self.attention.value = prune_linear_layer(self.attention.value, index)
self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads)
self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads
self.pruned_heads = self.pruned_heads.union(heads)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor]:
self_outputs = self.attention(
hidden_states,
attention_mask,
head_mask,
encoder_hidden_states,
encoder_attention_mask,
past_key_value,
output_attentions,
)
attention_output = self.output(self_outputs[0], hidden_states)
outputs = (attention_output,) + self_outputs[1:]
return outputs
class Blip2QFormerIntermediate(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states
class Blip2QFormerOutput(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
class Blip2QFormerLayer(nn.Module):
def __init__(self, config, layer_idx):
super().__init__()
self.chunk_size_feed_forward = config.chunk_size_feed_forward
self.seq_len_dim = 1
self.attention = Blip2QFormerAttention(config)
self.layer_idx = layer_idx
if layer_idx % config.cross_attention_frequency == 0:
self.crossattention = Blip2QFormerAttention(config, is_cross_attention=True)
self.has_cross_attention = True
else:
self.has_cross_attention = False
self.intermediate_query = Blip2QFormerIntermediate(config)
self.output_query = Blip2QFormerOutput(config)
def forward(
self,
hidden_states,
attention_mask=None,
head_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
past_key_value=None,
output_attentions=False,
query_length=0,
):
self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
self_attention_outputs = self.attention(
hidden_states,
attention_mask,
head_mask,
output_attentions=output_attentions,
past_key_value=self_attn_past_key_value,
)
attention_output = self_attention_outputs[0]
outputs = self_attention_outputs[1:-1]
present_key_value = self_attention_outputs[-1]
if query_length > 0:
query_attention_output = attention_output[:, :query_length, :]
if self.has_cross_attention:
if encoder_hidden_states is None:
raise ValueError("encoder_hidden_states must be given for cross-attention layers")
cross_attention_outputs = self.crossattention(
query_attention_output,
attention_mask,
head_mask,
encoder_hidden_states,
encoder_attention_mask,
output_attentions=output_attentions,
)
query_attention_output = cross_attention_outputs[0]
outputs = outputs + cross_attention_outputs[1:-1]
layer_output = apply_chunking_to_forward(
self.feed_forward_chunk_query,
self.chunk_size_feed_forward,
self.seq_len_dim,
query_attention_output,
)
if attention_output.shape[1] > query_length:
layer_output_text = apply_chunking_to_forward(
self.feed_forward_chunk,
self.chunk_size_feed_forward,
self.seq_len_dim,
attention_output[:, query_length:, :],
)
layer_output = torch.cat([layer_output, layer_output_text], dim=1)
else:
layer_output = apply_chunking_to_forward(
self.feed_forward_chunk,
self.chunk_size_feed_forward,
self.seq_len_dim,
attention_output,
)
outputs = (layer_output,) + outputs
outputs = outputs + (present_key_value,)
return outputs
def feed_forward_chunk(self, attention_output):
intermediate_output = self.intermediate(attention_output)
layer_output = self.output(intermediate_output, attention_output)
return layer_output
def feed_forward_chunk_query(self, attention_output):
intermediate_output = self.intermediate_query(attention_output)
layer_output = self.output_query(intermediate_output, attention_output)
return layer_output
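Inside each Q-Former layer, the first `query_length` positions are the learned query tokens (which may also cross-attend to the image features) and the remaining positions are text tokens; each slice is routed through its own feed-forward branch before being concatenated again. A small slicing sketch with assumed sizes (32 queries, 10 text tokens, hidden size 768):
```python
import torch

query_length, text_length, hidden_size = 32, 10, 768     # assumed sizes
attention_output = torch.randn(1, query_length + text_length, hidden_size)
query_part = attention_output[:, :query_length, :]       # goes through cross-attention + query feed-forward
text_part = attention_output[:, query_length:, :]        # goes through the text feed-forward only
merged = torch.cat([query_part, text_part], dim=1)       # mirrors torch.cat([layer_output, layer_output_text], dim=1)
```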
class Blip2QFormerEncoder(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
self.layer = nn.ModuleList(
[Blip2QFormerLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
)
self.gradient_checkpointing = False
def forward(
self,
hidden_states,
attention_mask=None,
head_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
past_key_values=None,
use_cache=None,
output_attentions=False,
output_hidden_states=False,
return_dict=True,
query_length=0,
):
all_hidden_states = () if output_hidden_states else None
all_self_attentions = () if output_attentions else None
all_cross_attentions = () if output_attentions else None
next_decoder_cache = () if use_cache else None
for i in range(self.config.num_hidden_layers):
layer_module = self.layer[i]
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
layer_head_mask = head_mask[i] if head_mask is not None else None
past_key_value = past_key_values[i] if past_key_values is not None else None
if getattr(self.config, "gradient_checkpointing", False) and self.training:
if use_cache:
logger.warning(
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
)
use_cache = False
layer_outputs = self._gradient_checkpointing_func(
layer_module.__call__,
hidden_states,
attention_mask,
layer_head_mask,
encoder_hidden_states,
encoder_attention_mask,
)
else:
layer_outputs = layer_module(
hidden_states,
attention_mask,
layer_head_mask,
encoder_hidden_states,
encoder_attention_mask,
past_key_value,
output_attentions,
query_length,
)
hidden_states = layer_outputs[0]
if use_cache:
next_decoder_cache += (layer_outputs[-1],)
if output_attentions:
all_self_attentions = all_self_attentions + (layer_outputs[1],)
if layer_module.has_cross_attention:
all_cross_attentions = all_cross_attentions + (layer_outputs[2],)
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
return tuple(
v
for v in [
hidden_states,
next_decoder_cache,
all_hidden_states,
all_self_attentions,
all_cross_attentions,
]
if v is not None
)
return BaseModelOutputWithPastAndCrossAttentions(
last_hidden_state=hidden_states,
past_key_values=next_decoder_cache,
hidden_states=all_hidden_states,
attentions=all_self_attentions,
cross_attentions=all_cross_attentions,
)
"""
Querying Transformer (Q-Former), used in BLIP-2.
"""
def __init__(self, config: Blip2QFormerConfig):
super().__init__(config)
self.config = config
self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.encoder = Blip2QFormerEncoder(config)
self.post_init()
def get_input_embeddings(self):
return self.embeddings.word_embeddings
def set_input_embeddings(self, value):
self.embeddings.word_embeddings = value
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
def get_extended_attention_mask(
self,
attention_mask: torch.Tensor,
input_shape: Tuple[int],
device: torch.device,
has_query: bool = False,
) -> torch.Tensor:
"""
Makes broadcastable attention and causal masks so that future and masked tokens are ignored.
Arguments:
attention_mask (`torch.Tensor`):
Mask with ones indicating tokens to attend to, zeros for tokens to ignore.
input_shape (`Tuple[int]`):
The shape of the input to the model.
device (`torch.device`):
The device of the input to the model.
Returns:
`torch.Tensor` The extended attention mask, with the same dtype as `attention_mask.dtype`.
"""
if attention_mask.dim() == 3:
extended_attention_mask = attention_mask[:, None, :, :]
elif attention_mask.dim() == 2:
extended_attention_mask = attention_mask[:, None, None, :]
else:
raise ValueError(
"Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format(
input_shape, attention_mask.shape
)
)
extended_attention_mask = extended_attention_mask.to(dtype=self.dtype)
extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
return extended_attention_mask
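The effect of `get_extended_attention_mask` is easiest to see on a concrete padding mask; the sketch below reproduces the broadcasting and the additive `-10000.0` trick on a toy input:
```python
import torch

attention_mask = torch.tensor([[1, 1, 1, 0]])            # (batch=1, seq_len=4), last token is padding
extended = attention_mask[:, None, None, :].to(torch.float32)
extended = (1.0 - extended) * -10000.0                   # shape (1, 1, 1, 4)
# Kept positions become 0.0, padded positions become -10000.0; this mask is
# added to the raw attention scores before the softmax.
```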
@add_start_docstrings(
"""
BLIP-2 Model for generating text and image features. The model consists of a vision encoder, Querying Transformer
(Q-Former) and a language model.
""",
BLIP_2_START_DOCSTRING,
)
class Blip2Model(Blip2PreTrainedModel):
config_class = Blip2Config
main_input_name = "pixel_values"
def __init__(self, config: Blip2Config):
super().__init__(config)
self.vision_model = Blip2VisionModel(config.vision_config)
self.query_tokens = nn.Parameter(torch.zeros(1, config.num_query_tokens, config.qformer_config.hidden_size))
self.qformer = Blip2QFormerModel(config.qformer_config)
self.language_projection = nn.Linear(config.qformer_config.hidden_size, config.text_config.hidden_size)
if config.use_decoder_only_language_model:
language_model = AutoModelForCausalLM.from_config(config.text_config)
else:
language_model = AutoModelForSeq2SeqLM.from_config(config.text_config)
if language_model._tied_weights_keys is not None:
self._tied_weights_keys = [f"language_model.{k}" for k in language_model._tied_weights_keys]
self.language_model = language_model
self.post_init()
def get_input_embeddings(self):
return self.language_model.get_input_embeddings()
def set_input_embeddings(self, value):
self.language_model.set_input_embeddings(value)
def set_output_embeddings(self, new_embeddings):
self.language_model.set_output_embeddings(new_embeddings)
def get_output_embeddings(self) -> nn.Module:
return self.language_model.get_output_embeddings()
def get_encoder(self):
return self.language_model.get_encoder()
def get_decoder(self):
return self.language_model.get_decoder()
def _tie_weights(self):
if not self.config.use_decoder_only_language_model:
self.language_model.encoder.embed_tokens = self.language_model.shared
self.language_model.decoder.embed_tokens = self.language_model.shared
@add_start_docstrings_to_model_forward(BLIP_2_TEXT_INPUTS_DOCSTRING)
def get_text_features(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
decoder_input_ids: Optional[torch.Tensor] = None,
decoder_attention_mask: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
r"""
Returns:
text_outputs (`CausalLMOutputWithPast`, or `tuple(torch.FloatTensor)` if `return_dict=False`):
The language model outputs. If `return_dict=True`, the output is a [`CausalLMOutputWithPast`] that
contains the language model logits, the past key values and the hidden states if
`output_hidden_states=True`.
Examples:
```
>>> import torch
>>> from transformers import AutoTokenizer, Blip2Model
>>> model = Blip2Model.from_pretrained("Salesforce/blip2-opt-2.7b")
>>> tokenizer = AutoTokenizer.from_pretrained("Salesforce/blip2-opt-2.7b")
>>> inputs = tokenizer(["a photo of a cat"], padding=True, return_tensors="pt")
>>> text_features = model.get_text_features(**inputs)
```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if self.config.use_decoder_only_language_model:
text_outputs = self.language_model(
input_ids=input_ids,
attention_mask=attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
else:
inputs_embeds = self.language_model.get_input_embeddings()(input_ids)
text_outputs = self.language_model(
inputs_embeds=inputs_embeds,
attention_mask=attention_mask,
decoder_input_ids=decoder_input_ids,
decoder_attention_mask=decoder_attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
labels=labels,
)
return text_outputs
@add_start_docstrings_to_model_forward(BLIP_2_VISION_INPUTS_DOCSTRING)
def get_image_features(
self,
pixel_values: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
r"""
Returns:
vision_outputs (`BaseModelOutputWithPooling` or tuple of `torch.FloatTensor`):
The vision model outputs. If `return_dict=True`, the output is a [`BaseModelOutputWithPooling`] that
contains the image features, the pooled image features and the hidden states if
`output_hidden_states=True`.
Examples:
```
>>> import torch
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, Blip2Model
>>> model = Blip2Model.from_pretrained("Salesforce/blip2-opt-2.7b")
>>> processor = AutoProcessor.from_pretrained("Salesforce/blip2-opt-2.7b")
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> inputs = processor(images=image, return_tensors="pt")
>>> image_outputs = model.get_image_features(**inputs)
```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
vision_outputs = self.vision_model(
pixel_values=pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
return vision_outputs
@add_start_docstrings_to_model_forward(BLIP_2_INPUTS_DOCSTRING)
def get_qformer_features(
self,
pixel_values: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
r"""
Returns:
vision_outputs (`BaseModelOutputWithPooling` or tuple of `torch.FloatTensor`):
The vision model outputs. If `return_dict=True`, the output is a [`BaseModelOutputWithPooling`] that
contains the image features, the pooled image features and the hidden states if
`output_hidden_states=True`.
Examples:
```
>>> import torch
>>> from PIL import Image
>>> import requests
>>> from transformers import Blip2Processor, Blip2Model
>>> processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
>>> model = Blip2Model.from_pretrained("Salesforce/blip2-opt-2.7b")
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> inputs = processor(images=image, return_tensors="pt")
>>> qformer_outputs = model.get_qformer_features(**inputs)
```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
vision_outputs = self.vision_model(
pixel_values=pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
image_embeds = vision_outputs[0]
image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long, device=image_embeds.device)
query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1)
query_outputs = self.qformer(
query_embeds=query_tokens,
encoder_hidden_states=image_embeds,
encoder_attention_mask=image_attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
return query_outputs
def forward(
self,
pixel_values: torch.FloatTensor,
input_ids: torch.FloatTensor,
attention_mask: Optional[torch.LongTensor] = None,
decoder_input_ids: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
labels: Optional[torch.LongTensor] = None,
return_dict: Optional[bool] = None,
"""
BLIP-2 Model for generating text given an image and an optional text prompt. The model consists of a vision
encoder, Querying Transformer (Q-Former) and a language model.
One can optionally pass `input_ids` to the model, which serve as a text prompt, to make the language model continue
the prompt. Otherwise, the language model starts generating text from the [BOS] (beginning-of-sequence) token.
<Tip>
Note that Flan-T5 checkpoints cannot be cast to float16. They are pre-trained using bfloat16.
</Tip>
"""
@add_start_docstrings(
"""
BLIP-2 Model for generating text given an image and an optional text prompt. The model consists of a vision
encoder, Querying Transformer (Q-Former) and a language model.
One can optionally pass `input_ids` to the model, which serve as a text prompt, to make the language model continue
the prompt. Otherwise, the language model starts generating text from the [BOS] (beginning-of-sequence) token.
<Tip>
Note that Flan-T5 checkpoints cannot be cast to float16. They are pre-trained using bfloat16.
</Tip>
""",
BLIP_2_START_DOCSTRING,
)
class Blip2ForConditionalGeneration(Blip2PreTrainedModel):
config_class = Blip2Config
main_input_name = "pixel_values"
def __init__(self, config: Blip2Config):
super().__init__(config)
self.vision_model = Blip2VisionModel(config.vision_config)
self.query_tokens = nn.Parameter(torch.zeros(1, config.num_query_tokens, config.qformer_config.hidden_size))
self.qformer = Blip2QFormerModel(config.qformer_config)
self.language_projection = nn.Linear(config.qformer_config.hidden_size, config.text_config.hidden_size)
if config.use_decoder_only_language_model:
language_model = AutoModelForCausalLM.from_config(config.text_config)
else:
language_model = AutoModelForSeq2SeqLM.from_config(config.text_config)
if language_model._tied_weights_keys is not None:
self._tied_weights_keys = [f"language_model.{k}" for k in language_model._tied_weights_keys]
self.language_model = language_model
self.post_init()
def get_input_embeddings(self):
return self.language_model.get_input_embeddings()
def set_input_embeddings(self, value):
self.language_model.set_input_embeddings(value)
def set_output_embeddings(self, new_embeddings):
self.language_model.set_output_embeddings(new_embeddings)
def get_output_embeddings(self) -> nn.Module:
return self.language_model.get_output_embeddings()
def get_encoder(self):
return self.language_model.get_encoder()
def get_decoder(self):
return self.language_model.get_decoder()
def _tie_weights(self):
if not self.config.use_decoder_only_language_model:
self.language_model.encoder.embed_tokens = self.language_model.shared
self.language_model.decoder.embed_tokens = self.language_model.shared
def _preprocess_accelerate(self):
r"""
Some pre-processing hacks to make the model `accelerate` compatible. Check
https://github.com/huggingface/transformers/pull/21707 for more details.
"""
hf_device_map = self.hf_device_map
if len(hf_device_map) > 1 and "language_model" not in hf_device_map and torch.cuda.device_count() > 1:
logger.warning(
"The `language_model` is not in the `hf_device_map` dictionary and you are running your script"
" in a multi-GPU environment. this may lead to unexpected behavior when using `accelerate`."
" Please pass a `device_map` that contains `language_model` to remove this warning."
" Please refer to https://github.com/huggingface/blog/blob/main/accelerate-large-models.md for"
" more details on creating a `device_map` for large models.",
)
if hasattr(self.language_model, "_hf_hook"):
self.language_model._hf_hook.io_same_device = True
@add_start_docstrings_to_model_forward(BLIP_2_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=Blip2ForConditionalGenerationModelOutput, config_class=Blip2VisionConfig)
def forward(
self,
pixel_values: torch.FloatTensor,
input_ids: torch.FloatTensor,
attention_mask: Optional[torch.LongTensor] = None,
decoder_input_ids: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
labels: Optional[torch.LongTensor] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, Blip2ForConditionalGenerationModelOutput]:
@torch.no_grad()
def generate(
self,
pixel_values: torch.FloatTensor,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.LongTensor] = None,
**generate_kwargs,
) -> torch.LongTensor:
"""
Overrides `generate` function to be able to use the model as a conditional generator.
Args:
pixel_values (`torch.FloatTensor` of shape (batch_size, num_channels, height, width)):
Input images to be processed.
input_ids (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
The sequence used as a prompt for the generation.
attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
Mask to avoid performing attention on padding token indices
Returns:
captions (list): A list of strings of length batch_size * num_captions.
"""
if hasattr(self, "hf_device_map"):
self._preprocess_accelerate()
batch_size = pixel_values.shape[0]
image_embeds = self.vision_model(pixel_values, return_dict=True).last_hidden_state
image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long, device=image_embeds.device)
query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1)
query_outputs = self.qformer(
query_embeds=query_tokens,
encoder_hidden_states=image_embeds,
encoder_attention_mask=image_attention_mask,
return_dict=True,
)
query_output = query_outputs.last_hidden_state
language_model_inputs = self.language_projection(query_output)
language_attention_mask = torch.ones(
language_model_inputs.size()[:-1], dtype=torch.long, device=language_model_inputs.device
)
if input_ids is None:
input_ids = (
torch.LongTensor([[self.config.text_config.bos_token_id]])
.repeat(batch_size, 1)
.to(image_embeds.device)
)
if attention_mask is None:
attention_mask = torch.ones_like(input_ids)
attention_mask = torch.cat([language_attention_mask, attention_mask.to(language_attention_mask.device)], dim=1)
inputs_embeds = self.get_input_embeddings()(input_ids)
inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1)
if not self.language_model.config.is_encoder_decoder:
generate_kwargs["max_length"] = generate_kwargs.get("max_length", 20) + language_model_inputs.shape[1]
generate_kwargs["min_length"] = generate_kwargs.get("min_length", 0) + language_model_inputs.shape[1]
outputs = self.language_model.generate(
inputs_embeds=inputs_embeds,
attention_mask=attention_mask,
**generate_kwargs,
)
return outputs
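A typical image-captioning call ties the pieces together: the processor prepares `pixel_values`, `generate` prepends the projected query outputs, and the tokenizer decodes the result. A sketch, assuming the `Salesforce/blip2-opt-2.7b` checkpoint, a CUDA device, and a local `example.jpg` (the file name is hypothetical):
```python
import torch
from PIL import Image
from transformers import Blip2Processor, Blip2ForConditionalGeneration

processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
model = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-opt-2.7b", torch_dtype=torch.float16
).to("cuda")

image = Image.open("example.jpg")                          # hypothetical local image
inputs = processor(images=image, return_tensors="pt").to("cuda", torch.float16)
generated_ids = model.generate(**inputs, max_new_tokens=20)
caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
```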
.\models\blip_2\processing_blip_2.py
"""
Processor class for BLIP-2.
"""
from typing import List, Optional, Union
from ...image_utils import ImageInput
from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
from ...utils import TensorType
class Blip2Processor(ProcessorMixin):
r"""
Constructs a BLIP-2 processor which wraps a BLIP image processor and an OPT/T5 tokenizer into a single processor.
[`BlipProcessor`] offers all the functionalities of [`BlipImageProcessor`] and [`AutoTokenizer`]. See the docstring
of [`~BlipProcessor.__call__`] and [`~BlipProcessor.decode`] for more information.
Args:
image_processor (`BlipImageProcessor`):
An instance of [`BlipImageProcessor`]. The image processor is a required input.
tokenizer (`AutoTokenizer`):
An instance of [`PreTrainedTokenizer`]. The tokenizer is a required input.
"""
attributes = ["image_processor", "tokenizer"]
image_processor_class = "BlipImageProcessor"
tokenizer_class = "AutoTokenizer"
def __init__(self, image_processor, tokenizer):
tokenizer.return_token_type_ids = False
super().__init__(image_processor, tokenizer)
self.current_processor = self.image_processor
def __call__(
self,
images: ImageInput = None,
text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = None,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_token_type_ids: bool = False,
return_length: bool = False,
verbose: bool = True,
return_tensors: Optional[Union[str, TensorType]] = None,
**kwargs,
) -> BatchEncoding:
"""
This method uses [`BlipImageProcessor.__call__`] to prepare image input(s) for the model and
[`BertTokenizerFast.__call__`] to prepare text input for the model.
Please refer to the docstrings of those two methods for more information.
"""
if images is None and text is None:
raise ValueError("You have to specify either images or text.")
if images is None:
self.current_processor = self.tokenizer
text_encoding = self.tokenizer(
text=text,
add_special_tokens=add_special_tokens,
padding=padding,
truncation=truncation,
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_token_type_ids=return_token_type_ids,
return_length=return_length,
verbose=verbose,
return_tensors=return_tensors,
**kwargs,
)
return text_encoding
encoding_image_processor = self.image_processor(images, return_tensors=return_tensors)
if text is not None:
text_encoding = self.tokenizer(
text=text,
add_special_tokens=add_special_tokens,
padding=padding,
truncation=truncation,
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_token_type_ids=return_token_type_ids,
return_length=return_length,
verbose=verbose,
return_tensors=return_tensors,
**kwargs,
)
else:
text_encoding = None
if text_encoding is not None:
encoding_image_processor.update(text_encoding)
return encoding_image_processor
def batch_decode(self, *args, **kwargs):
"""
This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please refer to the docstring of this method for more information.
"""
return self.tokenizer.batch_decode(*args, **kwargs)
def decode(self, *args, **kwargs):
"""
This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to
the docstring of this method for more information.
"""
return self.tokenizer.decode(*args, **kwargs)
@property
def model_input_names(self):
tokenizer_input_names = self.tokenizer.model_input_names
image_processor_input_names = self.image_processor.model_input_names
return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
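A minimal sketch of how the processor behaves depending on which inputs are provided (the checkpoint name matches the one used in the docstrings above; `example.jpg` is a hypothetical local file):
```python
from PIL import Image
from transformers import Blip2Processor

processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
image = Image.open("example.jpg")

text_only = processor(text="a photo of a cat", return_tensors="pt")     # input_ids, attention_mask
image_only = processor(images=image, return_tensors="pt")               # pixel_values
both = processor(images=image, text="a photo of", return_tensors="pt")  # pixel_values + text fields merged
```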
.\models\blip_2\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
_import_structure = {
"configuration_blip_2": [
"BLIP_2_PRETRAINED_CONFIG_ARCHIVE_MAP",
"Blip2Config",
"Blip2QFormerConfig",
"Blip2VisionConfig",
],
"processing_blip_2": ["Blip2Processor"],
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_blip_2"] = [
"BLIP_2_PRETRAINED_MODEL_ARCHIVE_LIST",
"Blip2Model",
"Blip2QFormerModel",
"Blip2PreTrainedModel",
"Blip2ForConditionalGeneration",
"Blip2VisionModel",
]
if TYPE_CHECKING:
from .configuration_blip_2 import (
BLIP_2_PRETRAINED_CONFIG_ARCHIVE_MAP,
Blip2Config,
Blip2QFormerConfig,
Blip2VisionConfig,
)
from .processing_blip_2 import Blip2Processor
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_blip_2 import (
BLIP_2_PRETRAINED_MODEL_ARCHIVE_LIST,
Blip2ForConditionalGeneration,
Blip2Model,
Blip2PreTrainedModel,
Blip2QFormerModel,
Blip2VisionModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\bloom\configuration_bloom.py
""" Bloom configuration"""
from collections import OrderedDict
from typing import TYPE_CHECKING, Any, List, Mapping, Optional
from packaging import version
if TYPE_CHECKING:
from ... import PreTrainedTokenizer, TensorType
from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfigWithPast, PatchingSpec
from ...utils import is_torch_available, logging
logger = logging.get_logger(__name__)
BLOOM_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"bigscience/bloom": "https://huggingface.co/bigscience/bloom/resolve/main/config.json",
"bigscience/bloom-560m": "https://huggingface.co/bigscience/bloom-560m/blob/main/config.json",
"bigscience/bloom-1b1": "https://huggingface.co/bigscience/bloom-1b1/blob/main/config.json",
"bigscience/bloom-1b7": "https://huggingface.co/bigscience/bloom-1b7/blob/main/config.json",
"bigscience/bloom-3b": "https://huggingface.co/bigscience/bloom-3b/blob/main/config.json",
"bigscience/bloom-7b1": "https://huggingface.co/bigscience/bloom-7b1/blob/main/config.json",
}
class BloomConfig(PretrainedConfig):
"""
This is the configuration class to store the configuration of a [`BloomModel`]. It is used to instantiate a Bloom
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to the Bloom architecture
[bigscience/bloom](https://huggingface.co/bigscience/bloom).
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
"""
Args:
vocab_size (`int`, *optional*, defaults to 250880):
Vocabulary size of the Bloom model. Defines the maximum number of different tokens that can be represented by the `inputs_ids` passed when calling [`BloomModel`].
See [this discussion](https://huggingface.co/bigscience/bloom/discussions/120) on how the `vocab_size` was defined.
hidden_size (`int`, *optional*, defaults to 64):
Dimensionality of the embeddings and hidden states.
n_layer (`int`, *optional*, defaults to 2):
Number of hidden layers in the Transformer encoder.
n_head (`int`, *optional*, defaults to 8):
Number of attention heads for each attention layer in the Transformer encoder.
layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
The epsilon used by the layer normalization layers.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated normal initializer for initializing all weight matrices.
apply_residual_connection_post_layernorm (`bool`, *optional*, defaults to `False`):
If enabled, use the layer norm of the hidden states as the residual in the transformer blocks.
hidden_dropout (`float`, *optional*, defaults to 0.1):
Dropout rate applied in the bias dropout.
attention_dropout (`float`, *optional*, defaults to 0.1):
Dropout rate applied to the attention probabilities.
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/value attentions (not used by all models).
pretraining_tp (`int`, *optional*, defaults to `1`):
Experimental feature. Tensor parallelism rank used during pretraining with Megatron. Please refer to [this document](https://huggingface.co/docs/transformers/parallelism) to learn more about it.
This value is necessary to ensure exact reproducibility of the pretraining results. Please refer to [this issue](https://github.com/pytorch/pytorch/issues/76232).
Note that this is only enabled when `slow_but_exact=True`.
slow_but_exact (`bool`, *optional*, defaults to `False`):
Experimental feature. Whether to use the slow but exact implementation of the attention mechanism. While merging the TP rank tensors, the results may differ slightly between a model trained on Megatron and this implementation because of slicing operations.
Please refer to [this issue](https://github.com/pytorch/pytorch/issues/76232). Enabling this feature gives more accurate results but increases inference time.
This will likely be resolved in the future once the main model has been fine-tuned with TP_rank=1.
from transformers import BloomConfig, BloomModel
configuration = BloomConfig()
model = BloomModel(configuration)
configuration = model.config
model_type = "bloom"
keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {
"num_hidden_layers": "n_layer",
"num_attention_heads": "n_head",
}
def __init__(
self,
vocab_size=250880,
hidden_size=64,
n_layer=2,
n_head=8,
layer_norm_epsilon=1e-5,
initializer_range=0.02,
use_cache=True,
bos_token_id=1,
eos_token_id=2,
apply_residual_connection_post_layernorm=False,
hidden_dropout=0.0,
attention_dropout=0.0,
pretraining_tp=1,
slow_but_exact=False,
**kwargs,
):
self.vocab_size = vocab_size
n_embed = kwargs.pop("n_embed", None)
self.hidden_size = hidden_size if n_embed is None else n_embed
self.n_layer = n_layer
self.n_head = n_head
self.layer_norm_epsilon = layer_norm_epsilon
self.initializer_range = initializer_range
self.use_cache = use_cache
self.pretraining_tp = pretraining_tp
self.apply_residual_connection_post_layernorm = apply_residual_connection_post_layernorm
self.hidden_dropout = hidden_dropout
self.attention_dropout = attention_dropout
self.bos_token_id = bos_token_id
self.eos_token_id = eos_token_id
self.slow_but_exact = slow_but_exact
super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
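Because of `attribute_map`, the library-wide generic names resolve to Bloom's own field names; a quick check:
```python
from transformers import BloomConfig

config = BloomConfig(n_layer=4, n_head=8, hidden_size=64)
assert config.num_hidden_layers == config.n_layer == 4    # aliased through attribute_map
assert config.num_attention_heads == config.n_head == 8
```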
class BloomOnnxConfig(OnnxConfigWithPast):
torch_onnx_minimum_version = version.parse("1.12")
def __init__(
self,
config: PretrainedConfig,
task: str = "default",
patching_specs: List[PatchingSpec] = None,
use_past: bool = False,
):
super().__init__(config, task=task, patching_specs=patching_specs, use_past=use_past)
if not getattr(self._config, "pad_token_id", None):
self._config.pad_token_id = 0
@property
def inputs(self) -> Mapping[str, Mapping[int, str]]:
common_inputs = OrderedDict({"input_ids": {0: "batch", 1: "sequence"}})
if self.use_past:
self.fill_with_past_key_values_(common_inputs, direction="inputs", inverted_values_shape=True)
common_inputs["attention_mask"] = {0: "batch", 1: "past_sequence + sequence"}
else:
common_inputs["attention_mask"] = {0: "batch", 1: "sequence"}
return common_inputs
@property
def num_layers(self) -> int:
return self._config.n_layer
@property
def num_attention_heads(self) -> int:
return self._config.n_head
@property
def atol_for_validation(self) -> float:
return 1e-3
def generate_dummy_inputs(
self,
tokenizer: "PreTrainedTokenizer",
batch_size: int = -1,
seq_length: int = -1,
is_pair: bool = False,
framework: Optional["TensorType"] = None,
) -> Mapping[str, Any]:
common_inputs = super(OnnxConfigWithPast, self).generate_dummy_inputs(
tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair, framework=framework
)
ordered_inputs = OrderedDict({"input_ids": common_inputs["input_ids"]})
if self.use_past:
if not is_torch_available():
raise ValueError("Cannot generate dummy past_keys inputs without PyTorch installed.")
else:
import torch
batch, seqlen = common_inputs["input_ids"].shape
past_key_values_length = seqlen + 2
head_dim = self._config.hidden_size // self.num_attention_heads
past_key_shape = (
batch * self.num_attention_heads,
head_dim,
past_key_values_length,
)
past_value_shape = (
batch * self.num_attention_heads,
past_key_values_length,
head_dim,
)
ordered_inputs["past_key_values"] = [
(torch.zeros(past_key_shape), torch.zeros(past_value_shape)) for _ in range(self.num_layers)
]
ordered_inputs["attention_mask"] = common_inputs["attention_mask"]
if self.use_past:
mask_dtype = ordered_inputs["attention_mask"].dtype
ordered_inputs["attention_mask"] = torch.cat(
[ordered_inputs["attention_mask"], torch.ones(batch, past_key_values_length, dtype=mask_dtype)], dim=1
)
return ordered_inputs
@property
def default_onnx_opset(self) -> int:
return 13
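BLOOM's cache layout fuses the batch and head dimensions and transposes keys relative to values, which is why `generate_dummy_inputs` builds two different shapes. A numeric sketch with assumed sizes:
```python
# Assumed sizes for illustration.
batch, seqlen, n_head, hidden_size = 2, 5, 8, 64
head_dim = hidden_size // n_head                           # 8
past_key_values_length = seqlen + 2                        # 7, as in generate_dummy_inputs
past_key_shape = (batch * n_head, head_dim, past_key_values_length)    # (16, 8, 7)
past_value_shape = (batch * n_head, past_key_values_length, head_dim)  # (16, 7, 8)
```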
.\models\bloom\convert_bloom_original_checkpoint_to_pytorch.py
"""Convert BigScience BLOOM checkpoint."""
import argparse
import json
import os
import re
import torch
from transformers import BloomConfig, BloomModel
from transformers.file_utils import CONFIG_NAME, WEIGHTS_NAME
from transformers.utils import logging
logging.set_verbosity_info()
WEIGHTS_TO_AVERAGE_ENDSWITH = [
"word_embeddings_layernorm.weight",
"word_embeddings_layernorm.bias",
"input_layernorm.weight",
"input_layernorm.bias",
"post_attention_layernorm.weight",
"post_attention_layernorm.bias",
"self_attention.dense.bias",
"mlp.dense_4h_to_h.bias",
"ln_f.weight",
"ln_f.bias",
]
WEIGHTS_WITH_ROW_PARALLELISM_CONTAIN = [
"mlp.dense_4h_to_h.weight",
"self_attention.dense.weight",
]
def layer_name_mapping(key, file):
"""Convert Megatron-DeepSpeed TP/PP weights mapping in transformers PP only"""
layer_rename_map = {
"word_embeddings.weight": "word_embeddings.weight",
"word_embeddings.norm.weight": "word_embeddings_layernorm.weight",
"word_embeddings.norm.bias": "word_embeddings_layernorm.bias",
"weight": "ln_f.weight",
"bias": "ln_f.bias",
}
if key in layer_rename_map:
return layer_rename_map[key]
layer_number = int(re.match(r".*layer_(\d*).*", file)[1])
layer_number -= 3
return f"h.{layer_number}." + key
def get_dtype_size(dtype):
"""获取数据类型的字节大小"""
if dtype == torch.bool:
return 1 / 8
bit_search = re.search(r"[^\d](\d+)$", str(dtype))
if bit_search is None:
raise ValueError(f"`dtype` is not a valid dtype: {dtype}.")
bit_size = int(bit_search.groups()[0])
return bit_size // 8
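# Illustrative sketch (not part of the original source): get_dtype_size parses the per-element size in bytes
# from the trailing digits of the dtype's string form; torch.bool is special-cased as 1/8 byte.
print(get_dtype_size(torch.float16), get_dtype_size(torch.float32), get_dtype_size(torch.bool))  # 2 4 0.125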
def convert_bloom_checkpoint_to_pytorch(
bloom_checkpoint_path, bloom_config_file, pytorch_dump_folder_path, shard_model, pretraining_tp
):
"""将 BLOOM 模型的检查点文件转换为 PyTorch 模型"""
if bloom_config_file == "":
config = BloomConfig()
else:
config = BloomConfig.from_json_file(bloom_config_file)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--bloom_checkpoint_path",
default=None,
type=str,
required=True,
help="Path to the Megatron-LM checkpoint path.",
)
parser.add_argument(
"--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
)
parser.add_argument(
"--bloom_config_file",
default="",
type=str,
help=(
"An optional config json file corresponding to the pre-trained model. \n"
"This specifies the model architecture."
),
)
parser.add_argument(
"--shard_model",
action="store_true",
help="An optional setting to shard the output model \nThis enables sharding the converted checkpoint",
)
parser.add_argument(
"--pretraining_tp",
default=4,
type=int,
help="Pretraining TP rank that has been used when training the model in Megatron-LM \n",
)
args = parser.parse_args()
convert_bloom_checkpoint_to_pytorch(
args.bloom_checkpoint_path,
args.bloom_config_file,
args.pytorch_dump_folder_path,
args.shard_model,
args.pretraining_tp,
)
.\models\bloom\modeling_bloom.py
import math
import warnings
from typing import Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, LayerNorm, MSELoss
from torch.nn import functional as F
from ...file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward
from ...modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
from ...modeling_outputs import (
BaseModelOutputWithPastAndCrossAttentions,
CausalLMOutputWithCrossAttentions,
QuestionAnsweringModelOutput,
SequenceClassifierOutputWithPast,
TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...utils import logging
from .configuration_bloom import BloomConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "bigscience/bloom-560m"
_CONFIG_FOR_DOC = "BloomConfig"
BLOOM_PRETRAINED_MODEL_ARCHIVE_LIST = [
"bigscience/bigscience-small-testing",
"bigscience/bloom-560m",
"bigscience/bloom-1b1",
"bigscience/bloom-1b7",
"bigscience/bloom-3b",
"bigscience/bloom-7b1",
"bigscience/bloom",
]
def build_alibi_tensor(attention_mask: torch.Tensor, num_heads: int, dtype: torch.dtype) -> torch.Tensor:
"""
Build the ALiBi tensor, see: https://arxiv.org/abs/2108.12409. The ALiBi tensor is not causal; as the original
paper notes, it relies on the translation invariance of softmax for a fast implementation: for a tensor l and a
fixed value a, `softmax(l + a) = softmax(l)`.
Based on https://github.com/ofirpress/attention_with_linear_biases/blob/a35aaca144e0eb6b789dfcb46784c4b8e31b7983/fairseq/models/transformer.py#L742
TODO @thomasw21: this does not fully work with the masking strategy, hence the masking differs slightly.
Args:
    attention_mask (`torch.Tensor`):
        Token-level attention mask, expected to be of shape (batch_size, max_seq_len).
    num_heads (`int`, *required*):
        Number of attention heads.
    dtype (`torch.dtype`, *optional*, default=`torch.bfloat16`):
        Data type of the output tensor.
Returns:
    torch.Tensor:
        A tensor of shape (batch_size * num_heads, 1, max_seq_len).
"""
batch_size, seq_length = attention_mask.shape
closest_power_of_2 = 2 ** math.floor(math.log2(num_heads))
base = torch.tensor(
2 ** (-(2 ** -(math.log2(closest_power_of_2) - 3))), device=attention_mask.device, dtype=torch.float32
)
powers = torch.arange(1, 1 + closest_power_of_2, device=attention_mask.device, dtype=torch.int32)
slopes = torch.pow(base, powers)
if closest_power_of_2 != num_heads:
extra_base = torch.tensor(
2 ** (-(2 ** -(math.log2(2 * closest_power_of_2) - 3))), device=attention_mask.device, dtype=torch.float32
)
num_remaining_heads = min(closest_power_of_2, num_heads - closest_power_of_2)
extra_powers = torch.arange(1, 1 + 2 * num_remaining_heads, 2, device=attention_mask.device, dtype=torch.int32)
slopes = torch.cat([slopes, torch.pow(extra_base, extra_powers)], dim=0)
arange_tensor = ((attention_mask.cumsum(dim=-1) - 1) * attention_mask)[:, None, :]
alibi = slopes[..., None] * arange_tensor
return alibi.reshape(batch_size * num_heads, 1, seq_length).to(dtype)
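# Illustrative sketch (not part of the original source): calling build_alibi_tensor on a toy mask. Each head
# gets a fixed slope (a power of the geometric base computed above) multiplied by the 0-based position index
# of every non-padding token; the result is flattened to (batch_size * num_heads, 1, seq_length).
toy_mask = torch.tensor([[1, 1, 1, 1], [0, 1, 1, 1]])                  # batch of 2, second row left-padded
toy_alibi = build_alibi_tensor(toy_mask, num_heads=4, dtype=torch.float32)
print(toy_alibi.shape)    # torch.Size([8, 1, 4])
print(toy_alibi[0, 0])    # first head of the first sequence: slope_0 * tensor([0., 1., 2., 3.])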
def dropout_add(x: torch.Tensor, residual: torch.Tensor, prob: float, training: bool) -> torch.Tensor:
"""
Dropout add function
Args:
x (`torch.tensor`, *required*):
input tensor
residual (`torch.tensor`, *required*):
residual tensor
prob (`float`, *required*):
dropout probability
training (`bool`, *required*):
training mode
"""
out = F.dropout(x, p=prob, training=training)
out = residual + out
return out
def bloom_gelu_forward(x: torch.Tensor) -> torch.Tensor:
"""
Custom bias GELU function. Adapted from Megatron-DeepSpeed code. Here we use a simple implementation (inference) to
make the model jittable.
Args:
x (`torch.tensor`, *required*):
input hidden states
"""
return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)))
def bloom_gelu_back(g: torch.Tensor, x: torch.Tensor) -> torch.Tensor:
"""
gradient of tanh approximation of gelu gradient of actual gelu is: 0.5 * (1. + torch.erf(x * 0.70710678)) +
0.3989423 * x * torch.exp(-0.5 * x * x)
Args:
g (`torch.tensor`, *required*):
gradient output tensor
x (`torch.tensor`, *required*):
input tensor
"""
x = x[0]
tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))
ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out)
return ff * g
class GeLUFunction(torch.autograd.Function):
@staticmethod
def forward(ctx, input: torch.Tensor) -> torch.Tensor:
"""
Forward pass of the custom autograd function for GeLU.
Args:
ctx (`torch.autograd.function.Context`):
context object to save tensors for backward pass
input (`torch.tensor`, *required*):
input tensor
"""
ctx.save_for_backward(input)
return bloom_gelu_forward(input)
@staticmethod
def backward(ctx, grad_output: torch.Tensor) -> torch.Tensor:
"""
Backward pass of the custom autograd function for GeLU.
Args:
ctx (`torch.autograd.function.Context`):
context object holding saved tensors from forward pass
grad_output (`torch.tensor`, *required*):
gradient of the output tensor
"""
input = ctx.saved_tensors
tmp = bloom_gelu_back(grad_output, input)
return tmp
class BloomGelu(nn.Module):
"""
BloomBiasGelu wrapper function that make use of the simple function on inference mode to make the model
torchscriptable and use the autograd function in training mode to get the accurate results of the gradients Partly
copied from Megatron-DeepSpeed code and adapted for our needs
See here why autograd functions are not torchscriptable: https://github.com/pytorch/pytorch/issues/22329
"""
def __init__(self):
super().__init__()
def forward(self, x: torch.Tensor) -> torch.Tensor:
"""
Forward pass of the BloomGelu module.
Args:
x (`torch.tensor`, *required*):
input tensor
"""
if self.training:
return GeLUFunction.apply(x)
else:
return bloom_gelu_forward(x)
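# Illustrative sketch (not part of the original source): bloom_gelu_forward is the tanh approximation of GELU,
# so it should closely match PyTorch's built-in tanh-approximate GELU (the `approximate` argument needs
# PyTorch >= 1.12); GeLUFunction routes gradients through the hand-written bloom_gelu_back.
toy_x = torch.randn(8, requires_grad=True)
assert torch.allclose(bloom_gelu_forward(toy_x), F.gelu(toy_x, approximate="tanh"), atol=1e-4)
GeLUFunction.apply(toy_x).sum().backward()    # gradients come from the custom backward pass
print(toy_x.grad.shape)                       # torch.Size([8])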
class BloomAttention(nn.Module):
def __init__(self, config: BloomConfig):
super().__init__()
self.pretraining_tp = config.pretraining_tp
self.slow_but_exact = config.slow_but_exact
self.hidden_size = config.hidden_size
self.num_heads = config.n_head
self.head_dim = self.hidden_size // self.num_heads
self.split_size = self.hidden_size
self.hidden_dropout = config.hidden_dropout
if self.head_dim * self.num_heads != self.hidden_size:
raise ValueError(
f"`hidden_size` must be divisible by num_heads (got `hidden_size`: {self.hidden_size} and `num_heads`:"
f" {self.num_heads})."
)
self.inv_norm_factor = 1.0 / math.sqrt(self.head_dim)
self.beta = 1.0
self.query_key_value = nn.Linear(self.hidden_size, 3 * self.hidden_size, bias=True)
self.dense = nn.Linear(self.hidden_size, self.hidden_size)
self.attention_dropout = nn.Dropout(config.attention_dropout)
def _split_heads(self, fused_qkv: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
"""
Split the last dimension into (num_heads, head_dim) without making any copies, results share same memory
storage as `fused_qkv`
Args:
fused_qkv (`torch.tensor`, *required*): [batch_size, seq_length, num_heads * 3 * head_dim]
Returns:
query: [batch_size, seq_length, num_heads, head_dim] key: [batch_size, seq_length, num_heads, head_dim]
value: [batch_size, seq_length, num_heads, head_dim]
"""
batch_size, seq_length, three_times_hidden_size = fused_qkv.shape
fused_qkv = fused_qkv.view(batch_size, seq_length, self.num_heads, 3, self.head_dim)
return fused_qkv[..., 0, :], fused_qkv[..., 1, :], fused_qkv[..., 2, :]
def _merge_heads(self, x: torch.Tensor) -> torch.Tensor:
"""
Merge heads together over the last dimension
Args:
x (`torch.tensor`, *required*): [batch_size * num_heads, seq_length, head_dim]
Returns:
torch.tensor: [batch_size, seq_length, num_heads * head_dim]
"""
batch_size_and_num_heads, seq_length, _ = x.shape
batch_size = batch_size_and_num_heads // self.num_heads
x = x.view(batch_size, self.num_heads, seq_length, self.head_dim)
x = x.permute(0, 2, 1, 3)
x = x.reshape(batch_size, seq_length, self.num_heads * self.head_dim)
return x
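# Illustrative sketch (not part of the original source): round-tripping tensors through _split_heads and
# _merge_heads on a toy BloomAttention; the config values below are arbitrary (head_dim = 16 / 4 = 4).
toy_attn = BloomAttention(BloomConfig(hidden_size=16, n_head=4))
toy_fused_qkv = torch.randn(2, 5, 3 * 16)                     # [batch_size, seq_length, 3 * hidden_size]
q, k, v = toy_attn._split_heads(toy_fused_qkv)
print(q.shape)                                                # torch.Size([2, 5, 4, 4]) -> [batch, seq, heads, head_dim]
print(toy_attn._merge_heads(torch.randn(2 * 4, 5, 4)).shape)  # torch.Size([2, 5, 16])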
def forward(
self,
hidden_states: torch.Tensor,
residual: torch.Tensor,
alibi: torch.Tensor,
attention_mask: torch.Tensor,
layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
head_mask: Optional[torch.Tensor] = None,
use_cache: bool = False,
output_attentions: bool = False,
):
pass
class BloomMLP(nn.Module):
def __init__(self, config: BloomConfig):
super().__init__()
hidden_size = config.hidden_size
self.pretraining_tp = config.pretraining_tp
self.slow_but_exact = config.slow_but_exact
self.dense_h_to_4h = nn.Linear(hidden_size, 4 * hidden_size)
self.gelu_impl = BloomGelu()
self.dense_4h_to_h = nn.Linear(4 * hidden_size, hidden_size)
self.hidden_dropout = config.hidden_dropout
def forward(self, hidden_states: torch.Tensor, residual: torch.Tensor) -> torch.Tensor:
hidden_states = self.gelu_impl(self.dense_h_to_4h(hidden_states))
if self.pretraining_tp > 1 and self.slow_but_exact:
intermediate_output = torch.zeros_like(residual)
slices = self.dense_4h_to_h.weight.shape[-1] / self.pretraining_tp
for i in range(self.pretraining_tp):
intermediate_output = intermediate_output + F.linear(
hidden_states[:, :, int(i * slices) : int((i + 1) * slices)],
self.dense_4h_to_h.weight[:, int(i * slices) : int((i + 1) * slices)],
)
else:
intermediate_output = self.dense_4h_to_h(hidden_states)
output = dropout_add(intermediate_output, residual, self.hidden_dropout, self.training)
return output
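# Illustrative sketch (not part of the original source): the `slow_but_exact` branch above splits the 4h -> h
# projection into `pretraining_tp` column slices of the weight and sums the partial F.linear results, which is
# numerically equivalent (up to float summation order) to a single matmul with the full weight; toy sizes below.
toy_hidden, tp = torch.randn(2, 5, 8), 4
toy_weight = torch.randn(4, 8)                                # [out_features, in_features], no bias
slice_width = toy_weight.shape[-1] / tp
toy_out = torch.zeros(2, 5, 4)
for i in range(tp):
    toy_out = toy_out + F.linear(
        toy_hidden[:, :, int(i * slice_width) : int((i + 1) * slice_width)],
        toy_weight[:, int(i * slice_width) : int((i + 1) * slice_width)],
    )
assert torch.allclose(toy_out, F.linear(toy_hidden, toy_weight), atol=1e-4)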
class BloomBlock(nn.Module):
def __init__(self, config: BloomConfig):
super().__init__()
hidden_size = config.hidden_size
self.input_layernorm = LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
self.num_heads = config.n_head
self.self_attention = BloomAttention(config)
self.post_attention_layernorm = LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
self.mlp = BloomMLP(config)
self.apply_residual_connection_post_layernorm = config.apply_residual_connection_post_layernorm
self.hidden_dropout = config.hidden_dropout
def forward(
self,
hidden_states: torch.Tensor,
alibi: torch.Tensor,
attention_mask: torch.Tensor,
layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
head_mask: Optional[torch.Tensor] = None,
use_cache: bool = False,
output_attentions: bool = False,
):
layernorm_output = self.input_layernorm(hidden_states)
if self.apply_residual_connection_post_layernorm:
residual = layernorm_output
else:
residual = hidden_states
attn_outputs = self.self_attention(
layernorm_output,
residual,
layer_past=layer_past,
attention_mask=attention_mask,
alibi=alibi,
head_mask=head_mask,
use_cache=use_cache,
output_attentions=output_attentions
)
attention_output = attn_outputs[0]
outputs = attn_outputs[1:]
layernorm_output = self.post_attention_layernorm(attention_output)
if self.apply_residual_connection_post_layernorm:
residual = layernorm_output
else:
residual = attention_output
output = self.mlp(layernorm_output, residual)
if use_cache:
outputs = (output,) + outputs
else:
outputs = (output,) + outputs[1:]
return outputs
@staticmethod
def _convert_to_bloom_cache(
past_key_value: Tuple[Tuple[torch.Tensor, torch.Tensor]],
) -> Tuple[Tuple[torch.Tensor, torch.Tensor]]:
"""
Converts the cache to the format expected by Bloom, i.e. to tuple(tuple([batch_size * num_heads, ...]))
"""
batch_size, num_heads, head_dim, seq_length = past_key_value[0][0].shape
batch_size_times_num_heads = batch_size * num_heads
return tuple(
(
layer_past[0].view(batch_size_times_num_heads, head_dim, seq_length),
layer_past[1].view(batch_size_times_num_heads, seq_length, head_dim),
)
for layer_past in past_key_value
)
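# Illustrative sketch (not part of the original source): _convert_to_bloom_cache fuses the batch and head
# dimensions of a standard-layout cache, turning keys [batch, num_heads, head_dim, seq_len] into
# [batch * num_heads, head_dim, seq_len] and values [batch, num_heads, seq_len, head_dim] into
# [batch * num_heads, seq_len, head_dim]; the static method is inherited by the model classes defined below.
toy_standard_cache = ((torch.zeros(2, 4, 8, 5), torch.zeros(2, 4, 5, 8)),)   # one layer: batch=2, heads=4
toy_bloom_cache = BloomForCausalLM._convert_to_bloom_cache(toy_standard_cache)
print(toy_bloom_cache[0][0].shape, toy_bloom_cache[0][1].shape)  # torch.Size([8, 8, 5]) torch.Size([8, 5, 8])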
"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`BloomConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
BLOOM_INPUTS_DOCSTRING = r"""
Args:
input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary.
past_key_values (:obj:`Tuple[Tuple[torch.Tensor, torch.Tensor], ...]`, `optional`):
    Tuple of length `config.num_hidden_layers`, containing tuples (`key`, `value`) precomputed by the
    self-attention blocks, which can be used to speed up sequential decoding.
attention_mask (:obj:`torch.Tensor`, `optional`):
    Mask to avoid performing attention on padding token indices. It is a tensor with shape
    `(batch_size, sequence_length)`, where each value is `1` for tokens that are **not masked** and `0` for
    padding tokens that are **masked**.
head_mask (:obj:`torch.LongTensor`, `optional`):
    Mask to nullify selected heads of the self-attention modules. It is a tensor of shape
    `(num_heads,)`, where each value is either `0` or `1`. A `1` indicates the head is **not masked**, while a
    `0` indicates the head is masked.
inputs_embeds (:obj:`torch.FloatTensor`, `optional`):
    Embedded representation of the inputs. It is a tensor of shape `(batch_size, sequence_length,
    embedding_dim)`.
use_cache (:obj:`bool`, `optional`):
Whether or not to use the cached keys and values. If `False`, all intermediate keys and values are
discarded and recomputed on-the-fly.
output_attentions (:obj:`bool`, `optional`):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (:obj:`bool`, `optional`):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (:obj:`bool`, `optional`):
Whether or not to return a dictionary instead of a tuple of outputs.
Returns:
:class:`~transformers.modeling_outputs.BaseModelOutputWithPastAndCrossAttentions`: A BaseModelOutputWithPastAndCrossAttentions
object containing various elements depending on the configuration (e.g., hidden states, attentions, etc.).
"""
@add_start_docstrings(
"The bare Bloom Model transformer outputting raw hidden-states without any specific head on top.",
BLOOM_START_DOCSTRING,
)
class BloomModel(BloomPreTrainedModel):
def __init__(self, config: BloomConfig):
super().__init__(config)
self.embed_dim = config.hidden_size
self.num_heads = config.n_head
self.word_embeddings = nn.Embedding(config.vocab_size, self.embed_dim)
self.word_embeddings_layernorm = LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)
self.h = nn.ModuleList([BloomBlock(config) for _ in range(config.num_hidden_layers)])
self.ln_f = LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)
self.gradient_checkpointing = False
self.post_init()
def build_alibi_tensor(self, attention_mask: torch.Tensor, num_heads: int, dtype: torch.dtype) -> torch.Tensor:
"""
Build the ALiBi bias tensor by delegating to the module-level `build_alibi_tensor` function.
Args:
    attention_mask (:obj:`torch.Tensor`): Attention mask of shape `(batch_size, seq_length)`, with `1` for
        real tokens and `0` for padding.
    num_heads (:obj:`int`): Number of attention heads.
    dtype (:obj:`torch.dtype`): Data type of the output tensor.
Returns:
    :obj:`torch.Tensor`: The ALiBi tensor of shape `(batch_size * num_heads, 1, seq_length)`.
"""
return build_alibi_tensor(attention_mask, num_heads, dtype)
def get_input_embeddings(self):
"""
Retrieve the word embedding layer.
Returns:
:obj:`torch.nn.Embedding`: The word embedding layer.
"""
return self.word_embeddings
def set_input_embeddings(self, new_embeddings: torch.Tensor):
"""
Set new word embeddings for the model.
Args:
new_embeddings (:obj:`torch.Tensor`): New word embeddings to be set.
"""
self.word_embeddings = new_embeddings
@add_start_docstrings_to_model_forward(BLOOM_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPastAndCrossAttentions,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
attention_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.LongTensor] = None,
inputs_embeds: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
**deprecated_arguments,
):
"""
Perform a forward pass of the BloomModel.
Args:
input_ids (:obj:`torch.LongTensor`, `optional`):
Indices of input sequence tokens in the vocabulary.
past_key_values (:obj:`Tuple[Tuple[torch.Tensor, torch.Tensor], ...]`, `optional`):
Tuple of length `config.num_hidden_layers`, containing tuples (`key`, `value`) precomputed by the
self-attention blocks, which can be used to speed up sequential decoding.
attention_mask (:obj:`torch.Tensor`, `optional`):
Mask to avoid performing attention on padding token indices.
head_mask (:obj:`torch.LongTensor`, `optional`):
Mask to nullify selected heads of the self-attention modules.
inputs_embeds (:obj:`torch.FloatTensor`, `optional`):
Embedded representation of the inputs.
use_cache (:obj:`bool`, `optional`):
Whether or not to use the cached keys and values.
output_attentions (:obj:`bool`, `optional`):
Whether or not to return the attentions tensors of all attention layers.
output_hidden_states (:obj:`bool`, `optional`):
Whether or not to return the hidden states of all layers.
return_dict (:obj:`bool`, `optional`):
Whether or not to return a dictionary instead of a tuple of outputs.
Returns:
:class:`~transformers.modeling_outputs.BaseModelOutputWithPastAndCrossAttentions`: A BaseModelOutputWithPastAndCrossAttentions
object containing various elements depending on the configuration.
"""
pass
@add_start_docstrings(
"""
The Bloom Model transformer with a language modeling head on top (linear layer with weights tied to the input
embeddings).
""",
BLOOM_START_DOCSTRING,
)
class BloomForCausalLM(BloomPreTrainedModel):
_tied_weights_keys = ["lm_head.weight"]
def __init__(self, config: BloomConfig):
super().__init__(config)
self.transformer = BloomModel(config)
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
self.post_init()
def get_output_embeddings(self):
return self.lm_head
def set_output_embeddings(self, new_embeddings: torch.Tensor):
self.lm_head = new_embeddings
def prepare_inputs_for_generation(
self,
input_ids: torch.LongTensor,
past_key_values: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
**kwargs,
) -> dict:
if past_key_values is not None:
past_length = past_key_values[0][0].shape[2]
if input_ids.shape[1] > past_length:
remove_prefix_length = past_length
else:
remove_prefix_length = input_ids.shape[1] - 1
input_ids = input_ids[:, remove_prefix_length:]
if past_key_values[0][0].shape[0] == input_ids.shape[0]:
past_key_values = self._convert_to_bloom_cache(past_key_values)
if inputs_embeds is not None and past_key_values is None:
model_inputs = {"inputs_embeds": inputs_embeds}
else:
model_inputs = {"input_ids": input_ids}
model_inputs.update(
{
"past_key_values": past_key_values,
"use_cache": kwargs.get("use_cache"),
"attention_mask": attention_mask,
}
)
return model_inputs
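# Illustrative sketch (not part of the original source): following the implementation shown above, once a cache
# exists only the tokens not yet covered by `past_length` are kept (usually just the last one). The toy cache
# uses Bloom's fused key layout [batch * num_heads, head_dim, past_len], so shape[2] is the past length; the
# model sizes below are arbitrary.
toy_model = BloomForCausalLM(BloomConfig(vocab_size=100, hidden_size=16, n_layer=1, n_head=4))
toy_input_ids = torch.tensor([[5, 6, 7, 8]])
toy_past = ((torch.zeros(4, 4, 3), torch.zeros(4, 3, 4)),)            # past_length = 3
toy_inputs = toy_model.prepare_inputs_for_generation(toy_input_ids, past_key_values=toy_past)
print(toy_inputs["input_ids"])                                        # tensor([[8]])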
@add_start_docstrings_to_model_forward(BLOOM_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=CausalLMOutputWithCrossAttentions,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
attention_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
**deprecated_arguments,
) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
`labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
"""
if deprecated_arguments.pop("position_ids", False) is not False:
warnings.warn(
"`position_ids` have no functionality in BLOOM and will be removed in v5.0.0. You can safely ignore"
" passing `position_ids`.",
FutureWarning,
)
if len(deprecated_arguments) > 0:
raise ValueError(f"Got unexpected arguments: {deprecated_arguments}")
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
transformer_outputs = self.transformer(
input_ids,
past_key_values=past_key_values,
attention_mask=attention_mask,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
hidden_states = transformer_outputs[0]
lm_logits = self.lm_head(hidden_states)
loss = None
if labels is not None:
labels = labels.to(lm_logits.device)
shift_logits = lm_logits[..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous()
batch_size, seq_length, vocab_size = shift_logits.shape
loss_fct = CrossEntropyLoss()
loss = loss_fct(
shift_logits.view(batch_size * seq_length, vocab_size), shift_labels.view(batch_size * seq_length)
)
if not return_dict:
output = (lm_logits,) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output
return CausalLMOutputWithCrossAttentions(
loss=loss,
logits=lm_logits,
past_key_values=transformer_outputs.past_key_values,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
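# Illustrative sketch (not part of the original source): the loss above is computed on shifted tensors, so the
# logits at position t are scored against the label at position t + 1, which is why `labels = input_ids` works.
toy_logits = torch.randn(1, 4, 10)                        # [batch_size, seq_length, vocab_size]
toy_labels = torch.tensor([[1, 2, 3, 4]])
toy_shift_logits = toy_logits[..., :-1, :].contiguous()   # predictions for positions 0..2
toy_shift_labels = toy_labels[..., 1:].contiguous()       # targets are tokens 1..3
toy_loss = CrossEntropyLoss()(toy_shift_logits.view(-1, 10), toy_shift_labels.view(-1))
print(toy_loss.shape)                                     # torch.Size([]) -- a scalar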
def _reorder_cache(
self, past: Tuple[Tuple[torch.Tensor, torch.Tensor], ...], beam_idx: torch.LongTensor
) -> Tuple[Tuple[torch.Tensor, torch.Tensor], ...]:
"""
This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or
[`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct
beam_idx at every generation step.
Output shares the same memory storage as `past`.
"""
standardized_past = self._convert_to_standard_cache(past, batch_size=len(beam_idx))
device_to_beam_idx = {
past_state.device: beam_idx.to(past_state.device) for layer_past in past for past_state in layer_past
}
reordered_past = tuple(
(
layer_past[0].index_select(0, device_to_beam_idx[layer_past[0].device]),
layer_past[1].index_select(0, device_to_beam_idx[layer_past[0].device]),
)
for layer_past in standardized_past
)
return self._convert_to_bloom_cache(reordered_past)
@add_start_docstrings(
"""
Bloom Model with a token classification head on top (a linear layer on top of the hidden-states output), e.g. for
Named-Entity-Recognition (NER) tasks.
""",
BLOOM_START_DOCSTRING,
)
class BloomForTokenClassification(BloomPreTrainedModel):
def __init__(self, config: BloomConfig):
super().__init__(config)
self.num_labels = config.num_labels
self.transformer = BloomModel(config)
if hasattr(config, "classifier_dropout") and config.classifier_dropout is not None:
classifier_dropout = config.classifier_dropout
elif hasattr(config, "hidden_dropout") and config.hidden_dropout is not None:
classifier_dropout = config.hidden_dropout
else:
classifier_dropout = 0.1
self.dropout = nn.Dropout(classifier_dropout)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.post_init()
@add_start_docstrings_to_model_forward(BLOOM_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TokenClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
attention_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
**deprecated_arguments,
) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
if deprecated_arguments.pop("position_ids", False) is not False:
warnings.warn(
"`position_ids` have no functionality in BLOOM and will be removed in v5.0.0. You can safely ignore"
" passing `position_ids`.",
FutureWarning,
)
if len(deprecated_arguments) > 0:
raise ValueError(f"Got unexpected arguments: {deprecated_arguments}")
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
transformer_outputs = self.transformer(
input_ids,
past_key_values=past_key_values,
attention_mask=attention_mask,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
hidden_states = transformer_outputs[0]
hidden_states = self.dropout(hidden_states)
logits = self.classifier(hidden_states)
loss = None
if labels is not None:
labels = labels.to(logits.device)
batch_size, seq_length = labels.shape
loss_fct = CrossEntropyLoss()
loss = loss_fct(
logits.view(batch_size * seq_length, self.num_labels), labels.view(batch_size * seq_length)
)
if not return_dict:
output = (logits,) + transformer_outputs[2:]
return ((loss,) + output) if loss is not None else output
return TokenClassifierOutput(
loss=loss,
logits=logits,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
@add_start_docstrings(
"""
The BLOOM Model transformer with a span classification head on top for extractive question-answering tasks like
SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
BLOOM_START_DOCSTRING,
)
class BloomForQuestionAnswering(BloomPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.transformer = BloomModel(config)
self.qa_outputs = nn.Linear(config.hidden_size, 2)
self.post_init()
@add_start_docstrings_to_model_forward(BLOOM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
start_positions: Optional[torch.LongTensor] = None,
end_positions: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, QuestionAnsweringModelOutput]:
r"""
start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.transformer(
input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
logits = self.qa_outputs(sequence_output)
start_logits, end_logits = logits.split(1, dim=-1)
start_logits = start_logits.squeeze(-1).contiguous()
end_logits = end_logits.squeeze(-1).contiguous()
total_loss = None
if start_positions is not None and end_positions is not None:
if len(start_positions.size()) > 1:
start_positions = start_positions.squeeze(-1)
if len(end_positions.size()) > 1:
end_positions = end_positions.squeeze(-1)
ignored_index = start_logits.size(1)
start_positions = start_positions.clamp(0, ignored_index)
end_positions = end_positions.clamp(0, ignored_index)
loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
start_loss = loss_fct(start_logits, start_positions)
end_loss = loss_fct(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2
if not return_dict:
output = (start_logits, end_logits) + outputs[2:]
return ((total_loss,) + output) if total_loss is not None else output
return QuestionAnsweringModelOutput(
loss=total_loss,
start_logits=start_logits,
end_logits=end_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
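# Illustrative sketch (not part of the original source): turning start/end logits into a predicted answer span.
# Taking an argmax per side is a simplification; real decoding typically also enforces start <= end and a
# maximum span length.
toy_start_logits = torch.tensor([[0.1, 2.0, 0.3, 0.1]])
toy_end_logits = torch.tensor([[0.0, 0.2, 0.1, 1.5]])
print(int(toy_start_logits.argmax(dim=-1)), int(toy_end_logits.argmax(dim=-1)))  # 1 3 -> tokens 1..3 inclusive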