Transformers Source Code Walkthrough (21)
.\models\blip\modeling_tf_blip_text.py
from __future__ import annotations
import math
from typing import Optional, Tuple
import tensorflow as tf
from ...modeling_tf_outputs import (
TFBaseModelOutputWithPastAndCrossAttentions,
TFBaseModelOutputWithPoolingAndCrossAttentions,
TFCausalLMOutputWithCrossAttentions,
)
from ...modeling_tf_utils import (
TFModelInputType,
TFPreTrainedModel,
get_initializer,
get_tf_activation,
keras,
keras_serializable,
shape_list,
unpack_inputs,
)
from ...tf_utils import check_embeddings_within_bounds, invert_attention_mask, stable_softmax
from ...utils import add_start_docstrings_to_model_forward, logging
from .configuration_blip import BlipTextConfig
logger = logging.get_logger(__name__)
BLIP_TEXT_INPUTS_DOCSTRING = r"""
Args:
input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`):
# 输入序列的标记索引在词汇表中的位置。默认情况下,将忽略填充部分。
# 可以使用 [`AutoProcessor`] 获得这些索引。有关详情,请参见 [`BlipProcessor.__call__`]。
# [什么是输入 ID?](../glossary#input-ids)
attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
# 遮罩,用于在填充的标记索引上避免执行注意力操作。遮罩的取值范围为 `[0, 1]`:
# - 1 表示**未被遮罩**的标记,
# - 0 表示**被遮罩**的标记。
# [什么是注意力遮罩?](../glossary#attention-mask)
position_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
# 每个输入序列标记在位置嵌入中的位置索引。取值范围为 `[0, config.max_position_embeddings - 1]`。
# [什么是位置 ID?](../glossary#position-ids)
output_attentions (`bool`, *optional*):
# 是否返回所有注意力层的注意力张量。有关更多细节,请参见返回的张量中的 `attentions`。
output_hidden_states (`bool`, *optional*):
# 是否返回所有层的隐藏状态。有关更多细节,请参见返回的张量中的 `hidden_states`。
return_dict (`bool`, *optional*):
# 是否返回 [`~utils.ModelOutput`] 而不是普通的元组。
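# The tensors documented above can be produced with `BlipProcessor` (covered later in processing_blip.py).
# A hedged illustration; the checkpoint name is an assumption, substitute whichever BLIP checkpoint you use:
from transformers import BlipProcessor

processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
inputs = processor(text="a photo of a cat", return_tensors="tf")
print(inputs["input_ids"].shape, inputs["attention_mask"].shape)  # both of shape (1, sequence_length)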
# Code adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L52; the class below defines TFBlipTextEmbeddings
class TFBlipTextEmbeddings(keras.layers.Layer):
"""Construct the embeddings from word and position embeddings."""
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
# Word embedding layer, sized by the vocabulary size and hidden size from the config
self.word_embeddings = keras.layers.Embedding(
config.vocab_size,
config.hidden_size,
embeddings_initializer=get_initializer(config.initializer_range),
name="word_embeddings",
)
# Position embedding layer, sized by the maximum number of positions and hidden size from the config
self.position_embeddings = keras.layers.Embedding(
config.max_position_embeddings,
config.hidden_size,
embeddings_initializer=get_initializer(config.initializer_range),
name="position_embeddings",
)
# self.LayerNorm is not snake-cased to stick with the PyTorch model's variable names,
# so that any TensorFlow checkpoint file can be loaded
self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
# Dropout layer using the hidden dropout probability from the config
self.dropout = keras.layers.Dropout(config.hidden_dropout_prob, name="dropout")
# Position-id tensor used for absolute position embeddings
self.position_ids = tf.expand_dims(tf.range(config.max_position_embeddings), 0)
# Position embedding type from the config, "absolute" by default
self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
# Keep a reference to the config
self.config = config
def call(self, input_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0, training=None):
# If input_ids is provided, take its shape
if input_ids is not None:
input_shape = tf.shape(input_ids)
else:
# Otherwise take the shape of inputs_embeds without the last (hidden) dimension
input_shape = tf.shape(inputs_embeds)[:-1]
# Sequence length of the current chunk
seq_length = input_shape[1]
# If no position_ids are given, slice them from past_key_values_length to past_key_values_length + seq_length
if position_ids is None:
position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]
# If no inputs_embeds are given, check that the ids are within the vocabulary and look up the word embeddings
if inputs_embeds is None:
check_embeddings_within_bounds(input_ids, self.config.vocab_size)
inputs_embeds = self.word_embeddings(input_ids)
# Start from the token embeddings
embeddings = inputs_embeds
# For absolute position embeddings, add the position embeddings to the word embeddings
if self.position_embedding_type == "absolute":
position_embeddings = self.position_embeddings(position_ids)
embeddings += position_embeddings
# Apply LayerNorm
embeddings = self.LayerNorm(embeddings)
# Apply dropout (only active during training)
embeddings = self.dropout(embeddings, training=training)
# Return the final embeddings
return embeddings
def build(self, input_shape=None):
# If the layer has already been built, return early instead of building it again
if self.built:
return
# Mark the layer as built
self.built = True
# Build the word embedding layer, if present, inside its own name scope
if getattr(self, "word_embeddings", None) is not None:
with tf.name_scope(self.word_embeddings.name):
self.word_embeddings.build(None)
# Build the position embedding layer, if present, inside its own name scope
if getattr(self, "position_embeddings", None) is not None:
with tf.name_scope(self.position_embeddings.name):
self.position_embeddings.build(None)
# Build the LayerNorm layer, if present, inside its own name scope
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
# LayerNorm expects an input shape of [None, None, hidden_size]
self.LayerNorm.build([None, None, self.config.hidden_size])
# Build the dropout layer, if present, inside its own name scope
if getattr(self, "dropout", None) is not None:
with tf.name_scope(self.dropout.name):
self.dropout.build(None)
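# A standalone sketch (illustration only, not part of the model) of the position-id slicing used in `call`
# above: with max_position_embeddings=8, a cache of length 3 and a new chunk of 2 tokens, the layer looks
# up positions 3 and 4.
import tensorflow as tf

position_ids = tf.expand_dims(tf.range(8), 0)  # shape (1, 8), like self.position_ids
past_key_values_length, seq_length = 3, 2
print(position_ids[:, past_key_values_length : seq_length + past_key_values_length].numpy())  # [[3 4]]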
# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L97
class TFBlipTextSelfAttention(keras.layers.Layer):
def __init__(self, config, is_cross_attention, **kwargs):
super().__init__(**kwargs)
self.config = config
# The hidden size must be divisible by the number of attention heads (unless an `embedding_size` attribute exists)
if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
raise ValueError(
"The hidden size (%d) is not a multiple of the number of attention heads (%d)"
% (config.hidden_size, config.num_attention_heads)
)
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
# Dense layers producing the query, key and value projections, initialized with the configured range
self.query = keras.layers.Dense(
self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query"
)
self.key = keras.layers.Dense(
self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key"
)
self.value = keras.layers.Dense(
self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value"
)
# Dropout applied to the attention probabilities
self.dropout = keras.layers.Dropout(config.attention_probs_dropout_prob)
# Position embedding type from the config, "absolute" by default
self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
# For relative position embeddings, create a distance embedding table
if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
self.max_position_embeddings = config.max_position_embeddings
self.distance_embedding = keras.layers.Embedding(
2 * config.max_position_embeddings - 1, self.attention_head_size
)
self.is_cross_attention = is_cross_attention
# Reshape a projected tensor so that attention scores can be computed per head
def transpose_for_scores(self, x):
new_x_shape = tf.concat(
[tf.shape(x)[:-1], tf.constant([self.num_attention_heads, self.attention_head_size], dtype=tf.int32)],
axis=0,
)
x = tf.reshape(x, new_x_shape)
return tf.transpose(x, perm=(0, 2, 1, 3))
# Forward pass of the self-attention layer; only the signature is reproduced here, the attention
# computation itself is omitted in this walkthrough.
def call(
self,
hidden_states,
attention_mask=None,
head_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
past_key_value=None,
output_attentions=False,
training=None,
):
def build(self, input_shape=None):
# If the layer has already been built, return early instead of building it again
if self.built:
return
# Mark the layer as built
self.built = True
# Build the query projection, if present
if getattr(self, "query", None) is not None:
with tf.name_scope(self.query.name):
self.query.build([None, None, self.config.hidden_size])
# For cross-attention, the key and value projections read from the encoder hidden states
if self.is_cross_attention:
# Build the key projection, if present
if getattr(self, "key", None) is not None:
with tf.name_scope(self.key.name):
self.key.build([None, None, self.config.encoder_hidden_size])
# Build the value projection, if present
if getattr(self, "value", None) is not None:
with tf.name_scope(self.value.name):
self.value.build([None, None, self.config.encoder_hidden_size])
else:
# For self-attention, key and value read from the layer's own hidden states
if getattr(self, "key", None) is not None:
with tf.name_scope(self.key.name):
self.key.build([None, None, self.config.hidden_size])
# Build the value projection, if present
if getattr(self, "value", None) is not None:
with tf.name_scope(self.value.name):
self.value.build([None, None, self.config.hidden_size])
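# A quick shape check (illustrative only) of `transpose_for_scores` above: it turns a tensor of shape
# (batch, seq_len, all_head_size) into (batch, num_attention_heads, seq_len, attention_head_size).
import tensorflow as tf

batch, seq_len, num_heads, head_size = 2, 5, 4, 8
x = tf.random.normal((batch, seq_len, num_heads * head_size))
x = tf.transpose(tf.reshape(x, (batch, seq_len, num_heads, head_size)), perm=(0, 2, 1, 3))
print(x.shape)  # (2, 4, 5, 8)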
# TFBlipTextSelfOutput: output projection of the self-attention block
class TFBlipTextSelfOutput(keras.layers.Layer):
# Constructor taking a BlipTextConfig plus the usual keyword arguments
def __init__(self, config: BlipTextConfig, **kwargs):
super().__init__(**kwargs)
# Dense layer projecting back to hidden_size, initialized with the configured range
self.dense = keras.layers.Dense(
units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
# LayerNormalization with the configured epsilon
self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
# Dropout with the configured hidden dropout probability
self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
# Keep a reference to the config
self.config = config
# Forward pass of the layer
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: Optional[bool] = None) -> tf.Tensor:
# Project the hidden states
hidden_states = self.dense(inputs=hidden_states)
# Apply dropout (only active during training)
hidden_states = self.dropout(inputs=hidden_states, training=training)
# Residual connection followed by LayerNorm
hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor)
# Return the processed hidden states
return hidden_states
# build creates the layer's weights the first time it is called
def build(self, input_shape=None):
# Return early if already built
if self.built:
return
# Mark the layer as built
self.built = True
# Build the dense layer, if present, inside its own name scope
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
# Build the LayerNorm layer, if present, inside its own name scope
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#242
# TFBlipTextAttention combines self-attention and its output projection
class TFBlipTextAttention(keras.layers.Layer):
# Constructor taking the config and an is_cross_attention flag
def __init__(self, config, is_cross_attention=False, **kwargs):
super().__init__(**kwargs)
# TFBlipTextSelfAttention instance; is_cross_attention switches between self- and cross-attention
self.self = TFBlipTextSelfAttention(config, is_cross_attention, name="self")
# TFBlipTextSelfOutput instance that post-processes the attention output
self.self_output = TFBlipTextSelfOutput(config, name="output")
# Forward pass of the layer
def call(
self,
hidden_states: tf.Tensor,
attention_mask: tf.Tensor | None = None,
head_mask: tf.Tensor | None = None,
encoder_hidden_states: tf.Tensor | None = None,
encoder_attention_mask: tf.Tensor | None = None,
past_key_value: Tuple[Tuple[tf.Tensor]] | None = None,
output_attentions: Optional[bool] = False,
training: Optional[bool] = None,
):
# Run self-attention and collect its outputs
self_outputs = self.self(
hidden_states,
attention_mask,
head_mask,
encoder_hidden_states,
encoder_attention_mask,
past_key_value,
output_attentions,
training=training,
)
# Feed the attention output together with the original hidden_states (residual) into self_output
attention_output = self.self_output(self_outputs[0], hidden_states, training=training)
# Build the output tuple; attention weights are appended if they were requested
outputs = (attention_output,) + self_outputs[1:]
# Return the outputs
return outputs
def build(self, input_shape=None):
# Return early if already built
if self.built:
return
# Mark the layer as built
self.built = True
# Build the self-attention sub-layer, if present, inside its own name scope
if getattr(self, "self", None) is not None:
with tf.name_scope(self.self.name):
self.self.build(None)
# Build the output sub-layer, if present, inside its own name scope
if getattr(self, "self_output", None) is not None:
with tf.name_scope(self.self_output.name):
self.self_output.build(None)
# Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->BlipText
class TFBlipTextIntermediate(keras.layers.Layer):
def __init__(self, config: BlipTextConfig, **kwargs):
super().__init__(**kwargs)
# Dense layer projecting the hidden states to the intermediate size
self.dense = keras.layers.Dense(
units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
# Resolve the activation function; strings are mapped to the corresponding TensorFlow activation
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = get_tf_activation(config.hidden_act)
else:
self.intermediate_act_fn = config.hidden_act
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
# Project the hidden states with the dense layer
hidden_states = self.dense(inputs=hidden_states)
# Apply the intermediate activation function
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
# Build the dense layer, if present
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
class TFBlipTextOutput(keras.layers.Layer):
def __init__(self, config: BlipTextConfig, **kwargs):
super().__init__(**kwargs)
# Dense layer projecting the intermediate states back to the hidden size
self.dense = keras.layers.Dense(
units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
# LayerNormalization applied after the residual connection
self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
# Dropout used for regularization
self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
# Project the hidden states with the dense layer
hidden_states = self.dense(inputs=hidden_states)
# Apply dropout (only active during training)
hidden_states = self.dropout(inputs=hidden_states, training=training)
# Add the residual input_tensor and apply LayerNorm
hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
# Build the dense layer, if present; its last input dimension is intermediate_size
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.intermediate_size])
# Build the LayerNorm layer, if present
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
class TFBlipTextLayer(keras.layers.Layer):
# Constructor for a single Transformer layer
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.config = config
# Self-attention block, named "attention"
self.attention = TFBlipTextAttention(config, name="attention")
# In decoder mode, add a cross-attention block named "crossattention"
if self.config.is_decoder:
self.crossattention = TFBlipTextAttention(
config, is_cross_attention=self.config.is_decoder, name="crossattention"
)
# Feed-forward intermediate block, named "intermediate"
self.intermediate = TFBlipTextIntermediate(config, name="intermediate")
# Feed-forward output block, named "output"
self.self_output = TFBlipTextOutput(config, name="output")
# Forward pass of the layer
def call(
self,
hidden_states,
attention_mask=None,
head_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
past_key_value=None,
output_attentions=False,
training=None,
):
# The first two entries of past_key_value cache the decoder's uni-directional self-attention
self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
# Run self-attention
self_attention_outputs = self.attention(
hidden_states,
attention_mask,
head_mask,
output_attentions=output_attentions,
past_key_value=self_attn_past_key_value,
training=training,
)
# The attention output is the first element
attention_output = self_attention_outputs[0]
# Keep everything except the attention output and the present key/value
outputs = self_attention_outputs[1:-1]
# The present key/value is the last element
present_key_value = self_attention_outputs[-1]
# If encoder hidden states are given, run cross-attention on top of the self-attention output
if encoder_hidden_states is not None:
# Run cross-attention
cross_attention_outputs = self.crossattention(
attention_output,
attention_mask,
head_mask,
encoder_hidden_states,
encoder_attention_mask,
output_attentions=output_attentions,
training=training,
)
# The cross-attention output becomes the new attention output
attention_output = cross_attention_outputs[0]
# If attention weights are requested, append the cross-attention weights to the outputs
outputs = outputs + cross_attention_outputs[1:-1]
# Run the intermediate (feed-forward) block
intermediate_output = self.intermediate(attention_output)
# Run the output block to get the final layer output
layer_output = self.self_output(intermediate_output, attention_output, training=training)
# Prepend the layer output to the collected outputs
outputs = (layer_output,) + outputs
# Append the present key/value to the outputs
outputs = outputs + (present_key_value,)
# Return all outputs
return outputs
# build ensures every sub-layer is constructed
def build(self, input_shape=None):
# Return early if already built
if self.built:
return
# Mark the layer as built
self.built = True
# Build the attention block, if present
if getattr(self, "attention", None) is not None:
with tf.name_scope(self.attention.name):
self.attention.build(None)
# Build the intermediate block, if present
if getattr(self, "intermediate", None) is not None:
with tf.name_scope(self.intermediate.name):
self.intermediate.build(None)
# Build the output block, if present
if getattr(self, "self_output", None) is not None:
with tf.name_scope(self.self_output.name):
self.self_output.build(None)
# Build the cross-attention block, if present
if getattr(self, "crossattention", None) is not None:
with tf.name_scope(self.crossattention.name):
self.crossattention.build(None)
# The keras_serializable decorator makes this Keras layer serializable
@keras_serializable
class TFBlipTextEncoder(keras.layers.Layer):
# Associated configuration class
config_class = BlipTextConfig
# Constructor taking a config and an optional name
def __init__(self, config, name=None, **kwargs):
super().__init__(name=name, **kwargs)
# Keep a reference to the config
self.config = config
# Stack of TFBlipTextLayer instances, one per hidden layer
self.layer = [TFBlipTextLayer(config, name=f"layer_._{i}") for i in range(config.num_hidden_layers)]
# call is wrapped with unpack_inputs so keyword inputs are unpacked consistently
@unpack_inputs
def call(
self,
hidden_states,
attention_mask=None,
head_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
past_key_values=None,
use_cache=None,
output_attentions=False,
output_hidden_states=False,
return_dict=True,
training=None,
):
# Collect hidden states only when output_hidden_states is set
all_hidden_states = () if output_hidden_states else None
# Collect self-attention weights only when output_attentions is set
all_self_attentions = () if output_attentions else None
# Collect cross-attention weights only for a decoder with output_attentions set
all_cross_attentions = () if output_attentions and self.config.is_decoder else None
# Collect the per-layer key/value cache only when use_cache is set
next_decoder_cache = () if use_cache else None
# Iterate over the hidden layers
for i in range(self.config.num_hidden_layers):
# Current layer module
layer_module = self.layer[i]
# Record the hidden states entering this layer, if requested
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
# Head mask for the current layer
layer_head_mask = head_mask[i] if head_mask is not None else None
# Cached key/value for the current layer
past_key_value = past_key_values[i] if past_key_values is not None else None
# Run the layer's forward pass
layer_outputs = layer_module(
hidden_states,
attention_mask,
layer_head_mask,
encoder_hidden_states,
encoder_attention_mask,
past_key_value,
output_attentions,
training=training,
)
# The first element of the layer output is the new hidden states
hidden_states = layer_outputs[0]
# If caching, append the layer's present key/value (the last element)
if use_cache:
next_decoder_cache += (layer_outputs[-1],)
# If requested, append the layer's attention weights
if output_attentions:
all_self_attentions = all_self_attentions + (layer_outputs[1],)
all_cross_attentions = all_cross_attentions + (layer_outputs[2],)
# Record the final hidden states, if requested
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
# When return_dict is False, return a tuple of the non-None values
if not return_dict:
return tuple(
v
for v in [
hidden_states,
next_decoder_cache,
all_hidden_states,
all_self_attentions,
all_cross_attentions,
]
if v is not None
)
# When return_dict is True, return a TFBaseModelOutputWithPastAndCrossAttentions object
return TFBaseModelOutputWithPastAndCrossAttentions(
last_hidden_state=hidden_states,
past_key_values=next_decoder_cache,
hidden_states=all_hidden_states,
attentions=all_self_attentions,
cross_attentions=all_cross_attentions,
)
# build constructs every layer in the stack
def build(self, input_shape=None):
# Return early if already built
if self.built:
return
# Mark the layer as built
self.built = True
# Build each layer in the stack, if present
if getattr(self, "layer", None) is not None:
for layer in self.layer:
# A dedicated name scope keeps each layer's variable names unique
with tf.name_scope(layer.name):
# Build the layer with an unspecified input shape
layer.build(None)
# Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler with Bert->BlipText
class TFBlipTextPooler(keras.layers.Layer):
def __init__(self, config: BlipTextConfig, **kwargs):
super().__init__(**kwargs)
# Dense layer used for pooling, with tanh activation and hidden_size output units
self.dense = keras.layers.Dense(
units=config.hidden_size,
kernel_initializer=get_initializer(config.initializer_range),
activation="tanh",
name="dense",
)
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
# Pool by taking the hidden state of the first token
first_token_tensor = hidden_states[:, 0]
# Pass it through the tanh-activated dense layer
pooled_output = self.dense(inputs=first_token_tensor)
return pooled_output
def build(self, input_shape=None):
# Return early if already built
if self.built:
return
self.built = True
# Build the dense layer, if present, inside its own name scope
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
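# Illustrative sketch (not the library code) of the pooling rule implemented above: take the hidden state
# of the first token and run it through a tanh-activated Dense layer.
import tensorflow as tf

hidden_states = tf.random.normal((2, 7, 16))  # (batch, seq_len, hidden_size)
dense = tf.keras.layers.Dense(16, activation="tanh")
print(dense(hidden_states[:, 0]).shape)  # (2, 16)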
# Copied from transformers.models.bert.modeling_tf_bert.TFBertPredictionHeadTransform with Bert->BlipText
class TFBlipTextPredictionHeadTransform(keras.layers.Layer):
def __init__(self, config: BlipTextConfig, **kwargs):
super().__init__(**kwargs)
# Dense layer transforming the hidden states, with hidden_size output units
self.dense = keras.layers.Dense(
units=config.hidden_size,
kernel_initializer=get_initializer(config.initializer_range),
name="dense",
)
# Resolve the activation function from the config (string names are mapped to TF activations)
if isinstance(config.hidden_act, str):
self.transform_act_fn = get_tf_activation(config.hidden_act)
else:
self.transform_act_fn = config.hidden_act
# LayerNormalization applied to the transformed hidden states
self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
# Project the hidden states with the dense layer
hidden_states = self.dense(inputs=hidden_states)
# Apply the activation function
hidden_states = self.transform_act_fn(hidden_states)
# Normalize with LayerNorm
hidden_states = self.LayerNorm(inputs=hidden_states)
return hidden_states
def build(self, input_shape=None):
# Return early if already built
if self.built:
return
self.built = True
# Build the dense layer, if present, inside its own name scope
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
# Build the LayerNorm layer, if present, inside its own name scope
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
class TFBlipTextLMPredictionHead(keras.layers.Layer):
# Language-modeling prediction head: transforms the hidden states and projects them onto the vocabulary
# Constructor taking the config and keyword arguments
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
# Transformation block applied before the vocabulary projection
self.transform = TFBlipTextPredictionHeadTransform(config, name="transform")
# The output weights are the same as the input embeddings, but there is an output-only bias for each token.
# Dense layer projecting to vocab_size, initialized with the configured range and without its own bias
self.decoder = keras.layers.Dense(
config.vocab_size,
kernel_initializer=get_initializer(config.initializer_range),
name="decoder",
use_bias=False,
)
# Keep a reference to the config
self.config = config
# Build the head's weights
def build(self, input_shape=None):
# Add a trainable bias of shape (vocab_size,), initialized to zeros
self.bias = self.add_weight(name="bias", shape=(self.config.vocab_size,), initializer="zeros", trainable=True)
# Return early if already built
if self.built:
return
# Mark the layer as built
self.built = True
# Build the transform block, if present
if getattr(self, "transform", None) is not None:
with tf.name_scope(self.transform.name):
self.transform.build(None)
# Build the decoder projection, if present
if getattr(self, "decoder", None) is not None:
with tf.name_scope(self.decoder.name):
# The decoder expects inputs of shape [None, None, hidden_size]
self.decoder.build([None, None, self.config.hidden_size])
# Forward pass: transform the hidden states and project them onto the vocabulary
def call(self, hidden_states):
# Apply the transform block
hidden_states = self.transform(hidden_states)
# Project with the decoder and add the output bias
hidden_states = self.decoder(hidden_states) + self.bias
# Return the prediction scores
return hidden_states
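# Illustrative sketch (not the library code) of the prediction-head arithmetic above: a bias-free Dense
# projection onto the vocabulary plus a separately stored bias vector, so the bias can be serialized on
# its own.
import tensorflow as tf

vocab_size, hidden_size = 30522, 16
decoder = tf.keras.layers.Dense(vocab_size, use_bias=False)
bias = tf.Variable(tf.zeros((vocab_size,)), name="bias")
hidden = tf.random.normal((2, 5, hidden_size))
print((decoder(hidden) + bias).shape)  # (2, 5, 30522)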
class TFBlipTextOnlyMLMHead(keras.layers.Layer):
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
# Prediction head computing the language-modeling scores
self.predictions = TFBlipTextLMPredictionHead(config, name="predictions")
def call(self, sequence_output: tf.Tensor) -> tf.Tensor:
# Compute prediction scores from the sequence output
prediction_scores = self.predictions(sequence_output)
return prediction_scores
def build(self, input_shape=None):
if self.built:
return
self.built = True
# Build the prediction head, if present
if getattr(self, "predictions", None) is not None:
with tf.name_scope(self.predictions.name):
self.predictions.build(None)
# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L548
class TFBlipTextPreTrainedModel(TFPreTrainedModel):
"""
处理权重初始化和预训练模型下载加载的抽象类,提供简单的接口。
"""
config_class = BlipTextConfig
base_model_prefix = "bert"
_keys_to_ignore_on_load_missing = [r"position_ids"]
# Adapted from https://github.com/salesforce/BLIP/blob/3a29b7410476bf5f2ba0955827390eb6ea1f4f9d/models/med.py#L571
class TFBlipTextModel(TFBlipTextPreTrainedModel):
"""
该模型可以作为编码器(仅具有自注意力)或解码器行事,在后一种情况下,将在自注意力层之间添加交叉注意力层,遵循
[Attention is all you need](https://arxiv.org/abs/1706.03762) 的架构描述。
"""
def __init__(self, config, add_pooling_layer=True, name=None, **kwargs):
super().__init__(config, name=name, **kwargs)
self.config = config
# Create the BLIP text embeddings, the encoder, and (optionally) the pooler
self.embeddings = TFBlipTextEmbeddings(config, name="embeddings")
self.encoder = TFBlipTextEncoder(config, name="encoder")
self.pooler = TFBlipTextPooler(config, name="pooler") if add_pooling_layer else None
def get_input_embeddings(self):
# Return the word embedding layer
return self.embeddings.word_embeddings
def set_input_embeddings(self, value):
# Replace the word embedding layer
self.embeddings.word_embeddings = value
@tf.function
def get_extended_attention_mask(
self, attention_mask: tf.Tensor, input_shape: Tuple[int], is_decoder: bool
):
# Builds the extended (broadcastable, and causal for decoders) attention mask; body not shown here
pass
@add_start_docstrings_to_model_forward(BLIP_TEXT_INPUTS_DOCSTRING)
@unpack_inputs
# The decorators above attach the shared input docstring and unpack keyword inputs.
# The `call` method (whose body is omitted here) runs the model's forward pass with these parameters:
# input_ids: input token IDs
# attention_mask: attention mask tensor
# position_ids: position ID tensor
# head_mask: head mask tensor
# inputs_embeds: pre-computed input embeddings
# encoder_embeds: encoder embeddings
# encoder_hidden_states: hidden states of the (vision) encoder
# encoder_attention_mask: attention mask for the encoder states
# past_key_values: cached key/value tuples
# use_cache: whether to return a key/value cache
# output_attentions: whether to return attention weights
# output_hidden_states: whether to return hidden states
# return_dict: whether to return a ModelOutput instead of a tuple
# is_decoder: whether to run the model as a decoder
# training: whether the model is in training mode
def build(self, input_shape=None):
# Return early if already built
if self.built:
return
# Mark the model as built
self.built = True
# Build the embeddings layer, if present
if getattr(self, "embeddings", None) is not None:
with tf.name_scope(self.embeddings.name):
self.embeddings.build(None)
# Build the encoder, if present
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
# Build the pooler, if present
if getattr(self, "pooler", None) is not None:
with tf.name_scope(self.pooler.name):
self.pooler.build(None)
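# A hedged usage sketch (illustration only): instantiating the text model from a fresh config with random
# weights and running a dummy batch of token ids through it. Exact output classes may vary with the
# installed transformers version.
import tensorflow as tf
from transformers import BlipTextConfig
from transformers.models.blip.modeling_tf_blip_text import TFBlipTextModel

config = BlipTextConfig()
model = TFBlipTextModel(config)
outputs = model(input_ids=tf.constant([[101, 2023, 2003, 102]]), training=False)
print(outputs.last_hidden_state.shape)  # expected: (1, 4, config.hidden_size)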
# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L811
class TFBlipTextLMHeadModel(TFBlipTextPreTrainedModel):
# Unexpected keys to ignore when loading a checkpoint
_keys_to_ignore_on_load_unexpected = [r"pooler"]
# Missing keys to ignore when loading a checkpoint
_keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"]
def __init__(self, config, **kwargs):
super().__init__(config, **kwargs)
# BERT-style text model without the pooling layer
self.bert = TFBlipTextModel(config, add_pooling_layer=False, name="bert")
# MLM-only head on top of the text model
self.cls = TFBlipTextOnlyMLMHead(config, name="cls")
# Label-smoothing factor used for the LM loss
self.label_smoothing = config.label_smoothing
# Return the output embedding (vocabulary projection) layer
def get_output_embeddings(self):
return self.cls.predictions.decoder
# Replace the output embedding layer
def set_output_embeddings(self, new_embeddings):
self.cls.predictions.decoder = new_embeddings
# call attaches the shared input docstring and unpacks keyword inputs
@add_start_docstrings_to_model_forward(BLIP_TEXT_INPUTS_DOCSTRING)
@unpack_inputs
def call(
self,
input_ids=None,
attention_mask=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
labels=None,
past_key_values=None,
use_cache=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
return_logits=False,
is_decoder=True,
training=None,
):
# The body of `call` (which runs the decoder and computes the language-modeling loss) is omitted here.
def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, **model_kwargs):
input_shape = input_ids.shape
# If no attention mask is given, use an all-ones mask
if attention_mask is None:
attention_mask = input_ids.new_ones(input_shape)
# When a key/value cache is used, only the last input token is fed to the model
if past_key_values is not None:
input_ids = input_ids[:, -1:]
return {
"input_ids": input_ids,
"attention_mask": attention_mask,
"past_key_values": past_key_values,
"encoder_hidden_states": model_kwargs.get("encoder_hidden_states", None),
"encoder_attention_mask": model_kwargs.get("encoder_attention_mask", None),
"is_decoder": True,
}
# Reorder the key/value cache to match the beam order during beam search
def _reorder_cache(self, past_key_values, beam_idx):
reordered_past = ()
for layer_past in past_key_values:
reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),)
return reordered_past
# Build the model's sub-modules
def build(self, input_shape=None):
if self.built:
return
self.built = True
# Build the text model, if present
if getattr(self, "bert", None) is not None:
with tf.name_scope(self.bert.name):
self.bert.build(None)
# Build the MLM head, if present
if getattr(self, "cls", None) is not None:
with tf.name_scope(self.cls.name):
self.cls.build(None)
.\models\blip\processing_blip.py
"""
Processor class for Blip.
"""
from typing import List, Optional, Union
from ...image_utils import ImageInput
from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
from ...utils import TensorType
class BlipProcessor(ProcessorMixin):
r"""
Constructs a BLIP processor which wraps a BERT tokenizer and BLIP image processor into a single processor.
[`BlipProcessor`] offers all the functionalities of [`BlipImageProcessor`] and [`BertTokenizerFast`]. See the
docstring of [`~BlipProcessor.__call__`] and [`~BlipProcessor.decode`] for more information.
Args:
image_processor (`BlipImageProcessor`):
An instance of [`BlipImageProcessor`]. The image processor is a required input.
tokenizer (`BertTokenizerFast`):
An instance of [`BertTokenizerFast`]. The tokenizer is a required input.
"""
attributes = ["image_processor", "tokenizer"]
image_processor_class = "BlipImageProcessor"
tokenizer_class = ("BertTokenizer", "BertTokenizerFast")
def __init__(self, image_processor, tokenizer):
tokenizer.return_token_type_ids = False
super().__init__(image_processor, tokenizer)
self.current_processor = self.image_processor
def __call__(
self,
images: ImageInput = None,
text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = None,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_token_type_ids: bool = False,
return_length: bool = False,
verbose: bool = True,
return_tensors: Optional[Union[str, TensorType]] = None,
**kwargs,
) -> BatchEncoding:
"""
This method uses [`BlipImageProcessor.__call__`] to prepare image(s) for the model and
[`BertTokenizerFast.__call__`] to prepare text for the model. Please refer to the docstrings of those two
methods for more information.

Args:
images (ImageInput, optional): Input images to be processed.
text (Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]], optional):
Input text or pre-tokenized text inputs to be processed.
add_special_tokens (bool, optional): Whether to add special tokens to the inputs.
padding (Union[bool, str, PaddingStrategy], optional): Padding strategy for inputs.
truncation (Union[bool, str, TruncationStrategy], optional): Truncation strategy for inputs.
max_length (int, optional): Maximum length of the processed inputs.
stride (int, optional): Stride used for overflowing tokens.
pad_to_multiple_of (int, optional): Pad inputs to a multiple of this number.
return_attention_mask (bool, optional): Whether to return attention masks.
return_overflowing_tokens (bool, optional): Whether to return overflowing tokens.
return_special_tokens_mask (bool, optional): Whether to return special tokens mask.
return_offsets_mapping (bool, optional): Whether to return offsets mapping.
return_token_type_ids (bool, optional): Whether to return token type IDs.
return_length (bool, optional): Whether to return the length of the processed inputs.
verbose (bool, optional): Whether to print verbose information.
return_tensors (Union[str, TensorType], optional): Type of tensor to return.
Returns:
BatchEncoding: Processed inputs in batch encoding format.
"""
if images is None and text is None:
raise ValueError("You have to specify either images or text.")
if images is None:
self.current_processor = self.tokenizer
text_encoding = self.tokenizer(
text=text,
add_special_tokens=add_special_tokens,
padding=padding,
truncation=truncation,
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_token_type_ids=return_token_type_ids,
return_length=return_length,
verbose=verbose,
return_tensors=return_tensors,
**kwargs,
)
return text_encoding
encoding_image_processor = self.image_processor(images, return_tensors=return_tensors)
if text is not None:
text_encoding = self.tokenizer(
text=text,
add_special_tokens=add_special_tokens,
padding=padding,
truncation=truncation,
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_token_type_ids=return_token_type_ids,
return_length=return_length,
verbose=verbose,
return_tensors=return_tensors,
**kwargs,
)
else:
text_encoding = None
if text_encoding is not None:
encoding_image_processor.update(text_encoding)
return encoding_image_processor
def batch_decode(self, *args, **kwargs):
"""
将所有参数转发给 BertTokenizerFast 的 [`~PreTrainedTokenizer.batch_decode`] 方法。
请参考该方法的文档字符串获取更多信息。
"""
return self.tokenizer.batch_decode(*args, **kwargs)
def decode(self, *args, **kwargs):
"""
This method forwards all its arguments to BertTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
the docstring of this method for more information.
"""
return self.tokenizer.decode(*args, **kwargs)
@property
def model_input_names(self):
tokenizer_input_names = self.tokenizer.model_input_names
image_processor_input_names = self.image_processor.model_input_names
return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
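# A hedged usage sketch of `BlipProcessor` as wired above. The checkpoint name is an assumption; any BLIP
# checkpoint that ships a processor config should behave the same. The demo image URL is the one used by
# the BLIP-2 conversion script later in this post.
import requests
from PIL import Image
from transformers import BlipProcessor

processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
url = "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/assets/merlion.png"
image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
inputs = processor(images=image, text="a photography of", return_tensors="tf")
print(sorted(inputs.keys()))  # expected: ['attention_mask', 'input_ids', 'pixel_values']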
.\models\blip\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_tf_available,
is_torch_available,
is_vision_available,
)
_import_structure = {
"configuration_blip": [
"BLIP_PRETRAINED_CONFIG_ARCHIVE_MAP",
"BlipConfig",
"BlipTextConfig",
"BlipVisionConfig",
],
"processing_blip": ["BlipProcessor"],
}
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["image_processing_blip"] = ["BlipImageProcessor"]
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_blip"] = [
"BLIP_PRETRAINED_MODEL_ARCHIVE_LIST",
"BlipModel",
"BlipPreTrainedModel",
"BlipForConditionalGeneration",
"BlipForQuestionAnswering",
"BlipVisionModel",
"BlipTextModel",
"BlipForImageTextRetrieval",
]
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_tf_blip"] = [
"TF_BLIP_PRETRAINED_MODEL_ARCHIVE_LIST",
"TFBlipModel",
"TFBlipPreTrainedModel",
"TFBlipForConditionalGeneration",
"TFBlipForQuestionAnswering",
"TFBlipVisionModel",
"TFBlipTextModel",
"TFBlipForImageTextRetrieval",
]
if TYPE_CHECKING:
from .configuration_blip import BLIP_PRETRAINED_CONFIG_ARCHIVE_MAP, BlipConfig, BlipTextConfig, BlipVisionConfig
from .processing_blip import BlipProcessor
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .image_processing_blip import BlipImageProcessor
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_blip import (
BLIP_PRETRAINED_MODEL_ARCHIVE_LIST,
BlipForConditionalGeneration,
BlipForImageTextRetrieval,
BlipForQuestionAnswering,
BlipModel,
BlipPreTrainedModel,
BlipTextModel,
BlipVisionModel,
)
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_tf_blip import (
TF_BLIP_PRETRAINED_MODEL_ARCHIVE_LIST,
TFBlipForConditionalGeneration,
TFBlipForImageTextRetrieval,
TFBlipForQuestionAnswering,
TFBlipModel,
TFBlipPreTrainedModel,
TFBlipTextModel,
TFBlipVisionModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\blip_2\configuration_blip_2.py
""" BLIP-2 model configuration"""
import os
from typing import Union
from ...configuration_utils import PretrainedConfig
from ...models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
from ...utils import logging
from ..auto import CONFIG_MAPPING
logger = logging.get_logger(__name__)
BLIP_2_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"salesforce/blip2-opt-2.7b": "https://huggingface.co/salesforce/blip2-opt-2.7b/resolve/main/config.json",
}
class Blip2VisionConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`Blip2VisionModel`]. It is used to instantiate a
BLIP-2 vision encoder according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the BLIP-2
[Salesforce/blip2-opt-2.7b](https://huggingface.co/Salesforce/blip2-opt-2.7b) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
"""
Args:
hidden_size (`int`, *optional*, defaults to 1408):
intermediate_size (`int`, *optional*, defaults to 6144):
num_hidden_layers (`int`, *optional*, defaults to 39):
num_attention_heads (`int`, *optional*, defaults to 16):
image_size (`int`, *optional*, defaults to 224):
patch_size (`int`, *optional*, defaults to 14):
hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
attention_dropout (`float`, *optional*, defaults to 0.0):
initializer_range (`float`, *optional*, defaults to 0.02):
qkv_bias (`bool`, *optional*, defaults to `True`):
Example:
```
>>> from transformers import Blip2VisionConfig, Blip2VisionModel
>>>
>>> configuration = Blip2VisionConfig()
>>>
>>> model = Blip2VisionModel(configuration)
>>>
>>> configuration = model.config
```"""
model_type = "blip_2_vision_model"
def __init__(
self,
hidden_size=1408, # Dimensionality of the encoder layers and the pooler layer, 1408 by default
intermediate_size=6144, # Dimensionality of the feed-forward layer in the Transformer encoder, 6144 by default
num_hidden_layers=39, # Number of hidden layers in the Transformer encoder, 39 by default
num_attention_heads=16, # Number of attention heads per attention layer, 16 by default
image_size=224, # Size (resolution) of each image, 224 by default
patch_size=14, # Size (resolution) of each patch, 14 by default
hidden_act="gelu", # Non-linear activation function in the encoder and pooler, "gelu" by default
layer_norm_eps=1e-6, # Epsilon used by the layer normalization layers, 1e-6 by default
attention_dropout=0.0, # Dropout ratio for the attention probabilities, 0.0 by default
initializer_range=1e-10, # Standard deviation of the truncated_normal_initializer for weight matrices, 1e-10 by default
qkv_bias=True, # Whether to add a bias to the queries and values in the self-attention layers, True by default
**kwargs, # Any additional keyword arguments
):
# Pass any remaining keyword arguments to the parent constructor
super().__init__(**kwargs)
# Store the configuration values on the instance
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.patch_size = patch_size
self.image_size = image_size
self.initializer_range = initializer_range
self.attention_dropout = attention_dropout
self.layer_norm_eps = layer_norm_eps
self.hidden_act = hidden_act
self.qkv_bias = qkv_bias
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
# Set the token in the keyword arguments
cls._set_token_in_kwargs(kwargs)
# Fetch the config dict of the pretrained model along with the updated keyword arguments
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
# If the loaded config is a full "blip-2" config, use its vision_config sub-dict
if config_dict.get("model_type") == "blip-2":
config_dict = config_dict["vision_config"]
# Warn if the config's model_type differs from this class's model_type
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
logger.warning(
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
)
# Build the config instance from the dict and keyword arguments
return cls.from_dict(config_dict, **kwargs)
# Blip2QFormerConfig stores the configuration of the Q-Former model
class Blip2QFormerConfig(PretrainedConfig):
r"""
这是配置类,用于存储 [`Blip2QFormerModel`] 的配置信息。它被用来根据指定的参数实例化一个 BLIP-2 Querying Transformer (Q-Former) 模型,
定义模型的架构。使用默认参数实例化一个配置对象会产生与 BLIP-2 [Salesforce/blip2-opt-2.7b](https://huggingface.co/Salesforce/blip2-opt-2.7b) 架构
类似的配置。配置对象继承自 [`PretrainedConfig`],可以用来控制模型的输出。阅读 [`PretrainedConfig`] 的文档获取更多信息。
注意,[`Blip2QFormerModel`] 与 [`BertLMHeadModel`] 非常相似,具有交错的跨注意力机制。
```
Args:
vocab_size (`int`, *optional*, defaults to 30522):
Q-Former 模型的词汇表大小,定义了在调用模型时 `inputs_ids` 可以表示的不同标记数量。
hidden_size (`int`, *optional*, defaults to 768):
编码器层和池化层的维度大小。
num_hidden_layers (`int`, *optional*, defaults to 12):
Transformer 编码器中隐藏层的数量。
num_attention_heads (`int`, *optional*, defaults to 12):
Transformer 编码器中每个注意力层的注意力头数量。
intermediate_size (`int`, *optional*, defaults to 3072):
Transformer 编码器中“中间”(常称为前馈)层的维度。
hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
编码器和池化器中的非线性激活函数(函数或字符串)。支持的字符串有:"gelu"、"relu"、"silu" 和 "gelu_new"。
hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
嵌入层、编码器和池化器中所有全连接层的 dropout 概率。
attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
注意力概率的 dropout 比率。
max_position_embeddings (`int`, *optional*, defaults to 512):
模型可能使用的最大序列长度。通常设置为一个较大的值(例如 512、1024 或 2048)。
initializer_range (`float`, *optional*, defaults to 0.02):
初始化所有权重矩阵的截断正态分布的标准差。
layer_norm_eps (`float`, *optional*, defaults to 1e-12):
层归一化层使用的 epsilon 值。
position_embedding_type (`str`, *optional*, defaults to `"absolute"`):
位置嵌入的类型。选择 `"absolute"`、`"relative_key"` 或 `"relative_key_query"` 之一。关于 `"relative_key"` 的更多信息,请参考
[Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155)。
关于 `"relative_key_query"` 的更多信息,请参考
[Improve Transformer Models with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658) 中的 *Method 4*。
cross_attention_frequency (`int`, *optional*, defaults to 2):
在 Transformer 层中添加跨注意力的频率。
encoder_hidden_size (`int`, *optional*, defaults to 1408):
用于跨注意力的隐藏状态的隐藏大小。
Examples:
```
>>> from transformers import Blip2QFormerConfig, Blip2QFormerModel
>>>
model_type = "blip_2_qformer"
def __init__(
self,
vocab_size=30522,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
initializer_range=0.02,
layer_norm_eps=1e-12,
pad_token_id=0,
position_embedding_type="absolute",
cross_attention_frequency=2,
encoder_hidden_size=1408,
**kwargs,
):
super().__init__(pad_token_id=pad_token_id, **kwargs)
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.hidden_act = hidden_act
self.intermediate_size = intermediate_size
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.position_embedding_type = position_embedding_type
self.cross_attention_frequency = cross_attention_frequency
self.encoder_hidden_size = encoder_hidden_size
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
cls._set_token_in_kwargs(kwargs)
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
if config_dict.get("model_type") == "blip-2":
config_dict = config_dict["qformer_config"]
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
logger.warning(
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
)
return cls.from_dict(config_dict, **kwargs)
class Blip2Config(PretrainedConfig):
model_type = "blip-2"
def __init__(self, vision_config=None, qformer_config=None, text_config=None, num_query_tokens=32, **kwargs):
super().__init__(**kwargs)
if vision_config is None:
vision_config = {}
logger.info("vision_config is None. initializing the Blip2VisionConfig with default values.")
if qformer_config is None:
qformer_config = {}
logger.info("qformer_config is None. Initializing the Blip2QFormerConfig with default values.")
if text_config is None:
text_config = {}
logger.info("text_config is None. Initializing the text config with default values (`OPTConfig`).")
self.vision_config = Blip2VisionConfig(**vision_config)
self.qformer_config = Blip2QFormerConfig(**qformer_config)
text_model_type = text_config["model_type"] if "model_type" in text_config else "opt"
self.text_config = CONFIG_MAPPING[text_model_type](**text_config)
self.tie_word_embeddings = self.text_config.tie_word_embeddings
self.is_encoder_decoder = self.text_config.is_encoder_decoder
self.num_query_tokens = num_query_tokens
self.qformer_config.encoder_hidden_size = self.vision_config.hidden_size
self.use_decoder_only_language_model = self.text_config.model_type in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
self.initializer_factor = 1.0
self.initializer_range = 0.02
@classmethod
def from_vision_qformer_text_configs(
cls,
vision_config: Blip2VisionConfig,
qformer_config: Blip2QFormerConfig,
text_config: PretrainedConfig,
**kwargs,
):
r"""
Instantiate a [`Blip2Config`] (or a derived class) from a BLIP-2 vision model, Q-Former and language model
configurations.
Returns:
[`Blip2Config`]: An instance of a configuration object
"""
return cls(
vision_config=vision_config.to_dict(),
qformer_config=qformer_config.to_dict(),
text_config=text_config.to_dict(),
**kwargs,
)
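# A hedged usage sketch of the helper above, composing a Blip2Config from default sub-configs. An OPTConfig
# stands in for the language model, mirroring the default chosen in Blip2Config.__init__.
from transformers import Blip2Config, Blip2QFormerConfig, Blip2VisionConfig, OPTConfig

vision_config = Blip2VisionConfig()
qformer_config = Blip2QFormerConfig()
text_config = OPTConfig()
config = Blip2Config.from_vision_qformer_text_configs(vision_config, qformer_config, text_config)
print(config.num_query_tokens, config.qformer_config.encoder_hidden_size)  # 32 1408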
.\models\blip_2\convert_blip_2_original_to_pytorch.py
"""
Convert BLIP-2 checkpoints from the original repository.
URL: https://github.com/salesforce/LAVIS/tree/main/projects/blip2
"""
import argparse
import requests
import torch
from lavis.models import load_model_and_preprocess
from PIL import Image
from transformers import (
AutoTokenizer,
Blip2Config,
Blip2ForConditionalGeneration,
Blip2Processor,
Blip2VisionConfig,
BlipImageProcessor,
OPTConfig,
T5Config,
set_seed,
)
from transformers.utils.constants import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
def load_demo_image():
url = "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/assets/merlion.png"
image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
return image
def create_rename_keys(config):
rename_keys = []
rename_keys.append(("visual_encoder.cls_token", "vision_model.embeddings.class_embedding"))
rename_keys.append(("visual_encoder.pos_embed", "vision_model.embeddings.position_embedding"))
rename_keys.append(("visual_encoder.patch_embed.proj.weight", "vision_model.embeddings.patch_embedding.weight"))
rename_keys.append(("visual_encoder.patch_embed.proj.bias", "vision_model.embeddings.patch_embedding.bias"))
rename_keys.append(("ln_vision.weight", "vision_model.post_layernorm.weight"))
rename_keys.append(("ln_vision.bias", "vision_model.post_layernorm.bias"))
for i in range(config.vision_config.num_hidden_layers):
rename_keys.append((f"visual_encoder.blocks.{i}.norm1.weight", f"vision_model.encoder.layers.{i}.layer_norm1.weight"))
rename_keys.append((f"visual_encoder.blocks.{i}.norm1.bias", f"vision_model.encoder.layers.{i}.layer_norm1.bias"))
rename_keys.append((f"visual_encoder.blocks.{i}.norm2.weight", f"vision_model.encoder.layers.{i}.layer_norm2.weight"))
rename_keys.append((f"visual_encoder.blocks.{i}.norm2.bias", f"vision_model.encoder.layers.{i}.layer_norm2.bias"))
rename_keys.append((f"visual_encoder.blocks.{i}.attn.qkv.weight", f"vision_model.encoder.layers.{i}.self_attn.qkv.weight"))
rename_keys.append((f"visual_encoder.blocks.{i}.attn.proj.weight", f"vision_model.encoder.layers.{i}.self_attn.projection.weight",))
rename_keys.append((f"visual_encoder.blocks.{i}.attn.proj.bias", f"vision_model.encoder.layers.{i}.self_attn.projection.bias"))
rename_keys.append((f"visual_encoder.blocks.{i}.mlp.fc1.weight", f"vision_model.encoder.layers.{i}.mlp.fc1.weight"))
rename_keys.append((f"visual_encoder.blocks.{i}.mlp.fc1.bias", f"vision_model.encoder.layers.{i}.mlp.fc1.bias"))
rename_keys.append((f"visual_encoder.blocks.{i}.mlp.fc2.weight", f"vision_model.encoder.layers.{i}.mlp.fc2.weight"))
rename_keys.append((f"visual_encoder.blocks.{i}.mlp.fc2.bias", f"vision_model.encoder.layers.{i}.mlp.fc2.bias"))
rename_keys.append(("Qformer.bert.embeddings.LayerNorm.weight", "qformer.layernorm.weight"))
rename_keys.append(("Qformer.bert.embeddings.LayerNorm.bias", "qformer.layernorm.bias"))
return rename_keys
def rename_key(dct, old, new):
val = dct.pop(old)
dct[new] = val
def read_in_q_v_bias(state_dict, config):
for i in range(config.vision_config.num_hidden_layers):
q_bias = state_dict.pop(f"visual_encoder.blocks.{i}.attn.q_bias")
v_bias = state_dict.pop(f"visual_encoder.blocks.{i}.attn.v_bias")
qkv_bias = torch.cat((q_bias, torch.zeros_like(v_bias, requires_grad=False), v_bias))
state_dict[f"vision_model.encoder.layers.{i}.self_attn.qkv.bias"] = qkv_bias
def get_blip2_config(model_name, eos_token_id):
image_size = 364 if "coco" in model_name else 224
vision_config = Blip2VisionConfig(image_size=image_size).to_dict()
if "opt-2.7b" in model_name:
text_config = OPTConfig.from_pretrained("facebook/opt-2.7b", eos_token_id=eos_token_id).to_dict()
elif "opt-6.7b" in model_name:
text_config = OPTConfig.from_pretrained("facebook/opt-6.7b", eos_token_id=eos_token_id).to_dict()
elif "t5-xl" in model_name:
text_config = T5Config.from_pretrained("google/flan-t5-xl", dense_act_fn="gelu", bos_token_id=1).to_dict()
elif "t5-xxl" in model_name:
text_config = T5Config.from_pretrained("google/flan-t5-xxl", dense_act_fn="gelu", bos_token_id=1).to_dict()
config = Blip2Config(vision_config=vision_config, text_config=text_config)
return config, image_size
@torch.no_grad()
def convert_blip2_checkpoint(model_name, pytorch_dump_folder_path=None, push_to_hub=False):
"""
Copy/paste/tweak the model's weights to the Transformers design.
"""
tokenizer = (
AutoTokenizer.from_pretrained("facebook/opt-2.7b")
if "opt" in model_name
else AutoTokenizer.from_pretrained("google/flan-t5-xl")
)
eos_token_id = tokenizer("\n", add_special_tokens=False).input_ids[0]
config, image_size = get_blip2_config(model_name, eos_token_id=eos_token_id)
hf_model = Blip2ForConditionalGeneration(config).eval()
model_name_to_original = {
"blip2-opt-2.7b": ("blip2_opt", "pretrain_opt2.7b"),
"blip2-opt-6.7b": ("blip2_opt", "pretrain_opt6.7b"),
"blip2-opt-2.7b-coco": ("blip2_opt", "caption_coco_opt2.7b"),
"blip2-opt-6.7b-coco": ("blip2_opt", "caption_coco_opt6.7b"),
"blip2-flan-t5-xl": ("blip2_t5", "pretrain_flant5xl"),
"blip2-flan-t5-xl-coco": ("blip2_t5", "caption_coco_flant5xl"),
"blip2-flan-t5-xxl": ("blip2_t5", "pretrain_flant5xxl"),
}
hf_model_device = "cuda:0" if torch.cuda.is_available() else "cpu"
lavis_device = "cuda:1" if torch.cuda.is_available() else "cpu"
# Look up the LAVIS model name and type for the requested checkpoint
name, type = model_name_to_original[model_name]
print("Loading original model...")
original_model, vis_processors, _ = load_model_and_preprocess(
name=name, model_type=type, is_eval=True, device=lavis_device
)
original_model.eval()
print("Done!")
state_dict = original_model.state_dict()
rename_keys = create_rename_keys(config)
for src, dest in rename_keys:
rename_key(state_dict, src, dest)
for key, val in state_dict.copy().items():
val = state_dict.pop(key)
if key.startswith("Qformer.bert"):
key = key.replace("Qformer.bert", "qformer")
if "attention.self" in key:
key = key.replace("self", "attention")
if "opt_proj" in key:
key = key.replace("opt_proj", "language_projection")
if "t5_proj" in key:
key = key.replace("t5_proj", "language_projection")
if key.startswith("opt"):
key = key.replace("opt", "language")
if key.startswith("t5"):
key = key.replace("t5", "language")
state_dict[key] = val
read_in_q_v_bias(state_dict, config)
missing_keys, unexpected_keys = hf_model.load_state_dict(state_dict, strict=False)
assert len(missing_keys) == 0
assert unexpected_keys == ["qformer.embeddings.position_ids"]
image = load_demo_image()
original_pixel_values = vis_processors["eval"](image).unsqueeze(0).to(lavis_device)
input_ids = tokenizer(["\n"], return_tensors="pt").input_ids.to(hf_model_device)
image_processor = BlipImageProcessor(
size={"height": image_size, "width": image_size}, image_mean=OPENAI_CLIP_MEAN, image_std=OPENAI_CLIP_STD
)
processor = Blip2Processor(image_processor=image_processor, tokenizer=tokenizer)
pixel_values = processor(images=image, return_tensors="pt").pixel_values.to(hf_model_device)
assert torch.allclose(pixel_values, original_pixel_values.to(pixel_values.device))
original_model.to(lavis_device)
hf_model.to(hf_model_device)
with torch.no_grad():
if "opt" in model_name:
original_logits = original_model({"image": original_pixel_values, "text_input": [""]}).logits
logits = hf_model(pixel_values, input_ids).logits
else:
original_logits = original_model(
{"image": original_pixel_values, "text_input": ["\n"], "text_output": ["\n"]}
).logits
labels = input_ids.masked_fill(input_ids == tokenizer.pad_token_id, -100)
logits = hf_model(pixel_values, input_ids, labels=labels).logits
assert original_logits.shape == logits.shape
if __name__ == "__main__":
parser = argparse.ArgumentParser()
choices = [
"blip2-opt-2.7b",
"blip2-opt-6.7b",
"blip2-opt-2.7b-coco",
"blip2-opt-6.7b-coco",
"blip2-flan-t5-xl",
"blip2-flan-t5-xl-coco",
"blip2-flan-t5-xxl",
]
parser.add_argument(
"--model_name",
default="blip2-opt-2.7b",
choices=choices,
type=str,
help="Path to hf config.json of model to convert",
)
parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.")
parser.add_argument(
"--push_to_hub",
action="store_true",
help="Whether to push the model and processor to the hub after converting",
)
args = parser.parse_args()
convert_blip2_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub)
.\models\blip_2\modeling_blip_2.py
""" PyTorch BLIP-2 model."""
import math
from dataclasses import dataclass
from typing import Any, Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import CrossEntropyLoss
from ...activations import ACT2FN
from ...modeling_outputs import (
BaseModelOutput,
BaseModelOutputWithPastAndCrossAttentions,
BaseModelOutputWithPooling,
BaseModelOutputWithPoolingAndCrossAttentions,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import (
ModelOutput,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from ..auto import AutoModelForCausalLM, AutoModelForSeq2SeqLM
from .configuration_blip_2 import Blip2Config, Blip2QFormerConfig, Blip2VisionConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "Salesforce/blip2-opt-2.7b"
BLIP_2_PRETRAINED_MODEL_ARCHIVE_LIST = [
"Salesforce/blip2-opt-2.7b",
]
@dataclass
class Blip2ForConditionalGenerationModelOutput(ModelOutput):
"""
Class defining the outputs of [`Blip2ForConditionalGeneration`].
Args:
loss (`torch.FloatTensor`, *optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
Language modeling loss from the language model.
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head of the language model.
vision_outputs (`BaseModelOutputWithPooling`):
Outputs of the vision encoder.
qformer_outputs (`BaseModelOutputWithPoolingAndCrossAttentions`):
Outputs of the Q-Former (Querying Transformer).
language_model_outputs (`CausalLMOutputWithPast` or `Seq2SeqLMOutput`):
Outputs of the language model.
"""
loss: Optional[Tuple[torch.FloatTensor]] = None
logits: Optional[Tuple[torch.FloatTensor]] = None
vision_outputs: Optional[torch.FloatTensor] = None
qformer_outputs: Optional[Tuple[torch.FloatTensor]] = None
language_model_outputs: Optional[Tuple[torch.FloatTensor]] = None
def to_tuple(self) -> Tuple[Any]:
return tuple(
self[k]
if k not in ["vision_outputs", "qformer_outputs", "language_model_outputs"]
else getattr(self, k).to_tuple()
for k in self.keys()
)
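`to_tuple` flattens the nested sub-outputs recursively while passing plain tensors through; fields left as `None` are dropped. A small sketch with hypothetical tensors, assuming the classes defined in this file are in scope:
```python
import torch

out = Blip2ForConditionalGenerationModelOutput(
    logits=torch.randn(1, 5, 100),
    vision_outputs=BaseModelOutputWithPooling(last_hidden_state=torch.randn(1, 257, 8)),
)
flat = out.to_tuple()
# flat[0] is the logits tensor; flat[1] is vision_outputs flattened to a tuple,
# i.e. (last_hidden_state,); None fields (loss, qformer_outputs, ...) are skipped.
```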
class Blip2VisionEmbeddings(nn.Module):
def __init__(self, config: Blip2VisionConfig):
super().__init__()
self.config = config
self.embed_dim = config.hidden_size
self.image_size = config.image_size
self.patch_size = config.patch_size
self.class_embedding = nn.Parameter(torch.randn(1, 1, self.embed_dim))
self.patch_embedding = nn.Conv2d(
in_channels=3, out_channels=self.embed_dim, kernel_size=self.patch_size, stride=self.patch_size
)
self.num_patches = (self.image_size // self.patch_size) ** 2
self.num_positions = self.num_patches + 1
self.position_embedding = nn.Parameter(torch.randn(1, self.num_positions, self.embed_dim))
def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
batch_size = pixel_values.shape[0]
target_dtype = self.patch_embedding.weight.dtype
patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))
patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
class_embeds = self.class_embedding.expand(batch_size, 1, -1).to(target_dtype)
embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
embeddings = embeddings + self.position_embedding[:, : embeddings.size(1), :].to(target_dtype)
return embeddings
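A quick shape walk-through of the embedding step. The concrete numbers below (224-pixel images, 14-pixel patches, hidden size 1408) are illustrative assumptions:
```python
# Hypothetical sizes for illustration only.
image_size, patch_size, hidden_size = 224, 14, 1408
num_patches = (image_size // patch_size) ** 2   # 16 * 16 = 256 patch tokens
num_positions = num_patches + 1                 # +1 for the class embedding
# pixel_values (batch, 3, 224, 224) -> Conv2d(kernel=stride=14) -> (batch, 1408, 16, 16)
# flatten(2).transpose(1, 2)                     -> (batch, 256, 1408)
# cat with class embedding + position embedding  -> (batch, 257, 1408)
```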
class Blip2Attention(nn.Module):
"""来自'Attention Is All You Need'论文的多头注意力模块"""
def __init__(self, config):
super().__init__()
self.config = config
self.embed_dim = config.hidden_size
self.num_heads = config.num_attention_heads
self.head_dim = self.embed_dim // self.num_heads
if self.head_dim * self.num_heads != self.embed_dim:
raise ValueError(
f"embed_dim必须能够被num_heads整除(得到`embed_dim`:{self.embed_dim}和`num_heads`:{self.num_heads})."
)
self.scale = self.head_dim**-0.5
self.dropout = nn.Dropout(config.attention_dropout)
self.qkv = nn.Linear(self.embed_dim, 3 * self.embed_dim, bias=False)
if config.qkv_bias:
q_bias = nn.Parameter(torch.zeros(self.embed_dim))
v_bias = nn.Parameter(torch.zeros(self.embed_dim))
else:
q_bias = None
v_bias = None
if q_bias is not None:
qkv_bias = torch.cat((q_bias, torch.zeros_like(v_bias, requires_grad=False), v_bias))
self.qkv.bias = nn.Parameter(qkv_bias)
self.projection = nn.Linear(self.embed_dim, self.embed_dim)
def forward(
self,
hidden_states: torch.Tensor,
head_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
"""Input shape: Batch x Time x Channel"""
bsz, tgt_len, embed_dim = hidden_states.size()
mixed_qkv = self.qkv(hidden_states)
mixed_qkv = mixed_qkv.reshape(bsz, tgt_len, 3, self.num_heads, embed_dim // self.num_heads).permute(
2, 0, 3, 1, 4
)
query_states, key_states, value_states = mixed_qkv[0], mixed_qkv[1], mixed_qkv[2]
attention_scores = torch.matmul(query_states, key_states.transpose(-1, -2))
attention_scores = attention_scores * self.scale
attention_probs = nn.functional.softmax(attention_scores, dim=-1)
attention_probs = self.dropout(attention_probs)
if head_mask is not None:
attention_probs = attention_probs * head_mask
context_layer = torch.matmul(attention_probs, value_states).permute(0, 2, 1, 3)
new_context_layer_shape = context_layer.size()[:-2] + (self.embed_dim,)
context_layer = context_layer.reshape(new_context_layer_shape)
output = self.projection(context_layer)
outputs = (output, attention_probs) if output_attentions else (output, None)
return outputs
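The fused QKV projection and the subsequent reshapes can be hard to follow; the sketch below traces the tensor shapes with assumed sizes (batch 2, 257 tokens, 1408-dim embeddings, 16 heads):
```python
import torch

bsz, tgt_len, embed_dim, num_heads = 2, 257, 1408, 16   # assumed sizes
head_dim = embed_dim // num_heads                        # 88
mixed_qkv = torch.randn(bsz, tgt_len, 3 * embed_dim)     # output of self.qkv
mixed_qkv = mixed_qkv.reshape(bsz, tgt_len, 3, num_heads, head_dim).permute(2, 0, 3, 1, 4)
q, k, v = mixed_qkv[0], mixed_qkv[1], mixed_qkv[2]       # each (2, 16, 257, 88)
scores = q @ k.transpose(-1, -2)                         # (2, 16, 257, 257)
context = (scores.softmax(-1) @ v).permute(0, 2, 1, 3)   # (2, 257, 16, 88), then reshaped to (2, 257, 1408)
```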
class Blip2MLP(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
self.activation_fn = ACT2FN[config.hidden_act]
self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.fc1(hidden_states)
hidden_states = self.activation_fn(hidden_states)
hidden_states = self.fc2(hidden_states)
return hidden_states
class Blip2EncoderLayer(nn.Module):
def __init__(self, config: Blip2Config):
super().__init__()
self.embed_dim = config.hidden_size
self.self_attn = Blip2Attention(config)
self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
self.mlp = Blip2MLP(config)
self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: torch.Tensor,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.FloatTensor]:
"""
Args:
hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
attention_mask (`torch.FloatTensor`): attention mask of size
`(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers.
"""
residual = hidden_states
hidden_states = self.layer_norm1(hidden_states)
hidden_states, attn_weights = self.self_attn(
hidden_states=hidden_states,
head_mask=attention_mask,
output_attentions=output_attentions,
)
hidden_states = hidden_states + residual
residual = hidden_states
hidden_states = self.layer_norm2(hidden_states)
hidden_states = self.mlp(hidden_states)
hidden_states = hidden_states + residual
outputs = (hidden_states,)
if output_attentions:
outputs += (attn_weights,)
return outputs
class Blip2PreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = Blip2Config
base_model_prefix = "blip"
supports_gradient_checkpointing = True
_no_split_modules = ["Blip2Attention", "T5Block", "OPTDecoderLayer"]
_skip_keys_device_placement = "past_key_values"
_keep_in_fp32_modules = ["wo"]
def _init_weights(self, module):
"""Initialize the weights"""
factor = self.config.initializer_range
if isinstance(module, nn.Conv2d) or isinstance(module, nn.Embedding) or isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=factor)
if hasattr(module, "bias") and module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, Blip2VisionEmbeddings):
if hasattr(self.config, "vision_config"):
factor = self.config.vision_config.initializer_range
nn.init.trunc_normal_(module.position_embedding, mean=0.0, std=factor)
nn.init.trunc_normal_(module.class_embedding, mean=0.0, std=factor)
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
elif isinstance(module, nn.Linear) and module.bias is not None:
module.bias.data.zero_()
BLIP_2_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`Blip2Config`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
BLIP_2_VISION_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Pixel values can be obtained using [`Blip2Processor`]. See [`Blip2Processor.__call__`] for
details.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
BLIP_2_TEXT_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default. Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
Indices of decoder input sequence tokens in the vocabulary.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.
[What are decoder input IDs?](../glossary#decoder-input-ids)
T5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).
To learn more about how to prepare `decoder_input_ids` for pretraining, take a look at [T5 Training](./t5#training).
decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. A causal mask will also be used by default.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
BLIP_2_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Pixel values can be obtained using [`Blip2Processor`]. See [`Blip2Processor.__call__`] for
details.
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Indices of input sequence tokens in the vocabulary of the language model. Input tokens can optionally be
provided to serve as text prompt, which the language model can continue.
Indices can be obtained using [`Blip2Processor`]. See [`Blip2Processor.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
Indices of decoder input sequence tokens in the vocabulary of the language model. Only relevant in case an
encoder-decoder language model (like T5) is used.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details. [What are decoder input IDs?](../glossary#decoder-input-ids)
decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
be used by default.
Only relevant in case an encoder-decoder language model (like T5) is used.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
class Blip2Encoder(nn.Module):
"""
Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
[`Blip2EncoderLayer`].
Args:
config (`Blip2Config`):
The corresponding vision configuration for the `Blip2Encoder`.
"""
def __init__(self, config: Blip2Config):
super().__init__()
self.config = config
self.layers = nn.ModuleList([Blip2EncoderLayer(config) for _ in range(config.num_hidden_layers)])
self.gradient_checkpointing = False
def forward(
self,
inputs_embeds,
attention_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutput]:
r"""
Args:
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Embedded representation of the inputs. Should be float, not int tokens.
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
encoder_states = () if output_hidden_states else None
all_attentions = () if output_attentions else None
hidden_states = inputs_embeds
for idx, encoder_layer in enumerate(self.layers):
if output_hidden_states:
encoder_states = encoder_states + (hidden_states,)
if self.gradient_checkpointing and self.training:
layer_outputs = self._gradient_checkpointing_func(
encoder_layer.__call__,
hidden_states,
attention_mask,
output_attentions,
)
else:
layer_outputs = encoder_layer(
hidden_states,
attention_mask,
output_attentions=output_attentions,
)
hidden_states = layer_outputs[0]
if output_attentions:
all_attentions = all_attentions + (layer_outputs[1],)
if output_hidden_states:
encoder_states = encoder_states + (hidden_states,)
if not return_dict:
return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
return BaseModelOutput(
last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
)
class Blip2VisionModel(Blip2PreTrainedModel):
main_input_name = "pixel_values"
config_class = Blip2VisionConfig
def __init__(self, config: Blip2VisionConfig):
super().__init__(config)
self.config = config
embed_dim = config.hidden_size
self.embeddings = Blip2VisionEmbeddings(config)
self.encoder = Blip2Encoder(config)
self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
self.post_init()
@add_start_docstrings_to_model_forward(BLIP_2_VISION_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=Blip2VisionConfig)
def forward(
self,
pixel_values: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPooling]:
r"""
Returns:
The model outputs. If `return_dict=True`, a [`BaseModelOutputWithPooling`]; otherwise a plain tuple of tensors.
"""
if pixel_values is None:
raise ValueError("You have to specify pixel_values")
hidden_states = self.embeddings(pixel_values)
encoder_outputs = self.encoder(
inputs_embeds=hidden_states,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
last_hidden_state = encoder_outputs[0]
last_hidden_state = self.post_layernorm(last_hidden_state)
pooled_output = last_hidden_state[:, 0, :]
pooled_output = self.post_layernorm(pooled_output)
if not return_dict:
return (last_hidden_state, pooled_output) + encoder_outputs[1:]
return BaseModelOutputWithPooling(
last_hidden_state=last_hidden_state,
pooler_output=pooled_output,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
def get_input_embeddings(self):
return self.embeddings
class Blip2QFormerMultiHeadAttention(nn.Module):
def __init__(self, config, is_cross_attention=False):
super().__init__()
self.config = config
if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
raise ValueError(
"The hidden size (%d) is not a multiple of the number of attention heads (%d)"
% (config.hidden_size, config.num_attention_heads)
)
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.query = nn.Linear(config.hidden_size, self.all_head_size)
if is_cross_attention:
self.key = nn.Linear(config.encoder_hidden_size, self.all_head_size)
self.value = nn.Linear(config.encoder_hidden_size, self.all_head_size)
else:
self.key = nn.Linear(config.hidden_size, self.all_head_size)
self.value = nn.Linear(config.hidden_size, self.all_head_size)
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
self.max_position_embeddings = config.max_position_embeddings
self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)
self.save_attention = False
def save_attn_gradients(self, attn_gradients):
self.attn_gradients = attn_gradients
def get_attn_gradients(self):
return self.attn_gradients
def save_attention_map(self, attention_map):
self.attention_map = attention_map
def get_attention_map(self):
return self.attention_map
def transpose_for_scores(self, x):
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
x = x.view(*new_x_shape)
return x.permute(0, 2, 1, 3)
def forward(
self,
hidden_states,
attention_mask=None,
head_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
past_key_value=None,
output_attentions=False,
):
class Blip2QFormerSelfOutput(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
class Blip2QFormerAttention(nn.Module):
def __init__(self, config, is_cross_attention=False):
super().__init__()
self.attention = Blip2QFormerMultiHeadAttention(config, is_cross_attention)
self.output = Blip2QFormerSelfOutput(config)
self.pruned_heads = set()
def prune_heads(self, heads):
if len(heads) == 0:
return
heads, index = find_pruneable_heads_and_indices(
heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads
)
self.attention.query = prune_linear_layer(self.attention.query, index)
self.attention.key = prune_linear_layer(self.attention.key, index)
self.attention.value = prune_linear_layer(self.attention.value, index)
self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads)
self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads
self.pruned_heads = self.pruned_heads.union(heads)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor]:
self_outputs = self.attention(
hidden_states,
attention_mask,
head_mask,
encoder_hidden_states,
encoder_attention_mask,
past_key_value,
output_attentions,
)
attention_output = self.output(self_outputs[0], hidden_states)
outputs = (attention_output,) + self_outputs[1:]
return outputs
class Blip2QFormerIntermediate(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states
class Blip2QFormerOutput(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
class Blip2QFormerLayer(nn.Module):
def __init__(self, config, layer_idx):
super().__init__()
self.chunk_size_feed_forward = config.chunk_size_feed_forward
self.seq_len_dim = 1
self.attention = Blip2QFormerAttention(config)
self.layer_idx = layer_idx
if layer_idx % config.cross_attention_frequency == 0:
self.crossattention = Blip2QFormerAttention(config, is_cross_attention=True)
self.has_cross_attention = True
else:
self.has_cross_attention = False
self.intermediate_query = Blip2QFormerIntermediate(config)
self.output_query = Blip2QFormerOutput(config)
def forward(
self,
hidden_states,
attention_mask=None,
head_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
past_key_value=None,
output_attentions=False,
query_length=0,
):
self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
self_attention_outputs = self.attention(
hidden_states,
attention_mask,
head_mask,
output_attentions=output_attentions,
past_key_value=self_attn_past_key_value,
)
attention_output = self_attention_outputs[0]
outputs = self_attention_outputs[1:-1]
present_key_value = self_attention_outputs[-1]
if query_length > 0:
query_attention_output = attention_output[:, :query_length, :]
if self.has_cross_attention:
if encoder_hidden_states is None:
raise ValueError("encoder_hidden_states must be given for cross-attention layers")
cross_attention_outputs = self.crossattention(
query_attention_output,
attention_mask,
head_mask,
encoder_hidden_states,
encoder_attention_mask,
output_attentions=output_attentions,
)
query_attention_output = cross_attention_outputs[0]
outputs = outputs + cross_attention_outputs[1:-1]
layer_output = apply_chunking_to_forward(
self.feed_forward_chunk_query,
self.chunk_size_feed_forward,
self.seq_len_dim,
query_attention_output,
)
if attention_output.shape[1] > query_length:
layer_output_text = apply_chunking_to_forward(
self.feed_forward_chunk,
self.chunk_size_feed_forward,
self.seq_len_dim,
attention_output[:, query_length:, :],
)
layer_output = torch.cat([layer_output, layer_output_text], dim=1)
else:
layer_output = apply_chunking_to_forward(
self.feed_forward_chunk,
self.chunk_size_feed_forward,
self.seq_len_dim,
attention_output,
)
outputs = (layer_output,) + outputs
outputs = outputs + (present_key_value,)
return outputs
def feed_forward_chunk(self, attention_output):
intermediate_output = self.intermediate(attention_output)
layer_output = self.output(intermediate_output, attention_output)
return layer_output
def feed_forward_chunk_query(self, attention_output):
intermediate_output = self.intermediate_query(attention_output)
layer_output = self.output_query(intermediate_output, attention_output)
return layer_output
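Inside each Q-Former layer, the first `query_length` positions are the learned query tokens (which may also cross-attend to the image features) and the remaining positions are text tokens; each slice is routed through its own feed-forward branch before being concatenated again. A small slicing sketch with assumed sizes (32 queries, 10 text tokens, hidden size 768):
```python
import torch

query_length, text_length, hidden_size = 32, 10, 768     # assumed sizes
attention_output = torch.randn(1, query_length + text_length, hidden_size)
query_part = attention_output[:, :query_length, :]       # goes through cross-attention + query feed-forward
text_part = attention_output[:, query_length:, :]        # goes through the text feed-forward only
merged = torch.cat([query_part, text_part], dim=1)       # mirrors torch.cat([layer_output, layer_output_text], dim=1)
```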
class Blip2QFormerEncoder(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
self.layer = nn.ModuleList(
[Blip2QFormerLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
)
self.gradient_checkpointing = False
def forward(
self,
hidden_states,
attention_mask=None,
head_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
past_key_values=None,
use_cache=None,
output_attentions=False,
output_hidden_states=False,
return_dict=True,
query_length=0,
):
all_hidden_states = () if output_hidden_states else None
all_self_attentions = () if output_attentions else None
all_cross_attentions = () if output_attentions else None
next_decoder_cache = () if use_cache else None
for i in range(self.config.num_hidden_layers):
layer_module = self.layer[i]
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
layer_head_mask = head_mask[i] if head_mask is not None else None
past_key_value = past_key_values[i] if past_key_values is not None else None
if getattr(self.config, "gradient_checkpointing", False) and self.training:
if use_cache:
logger.warning(
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
)
use_cache = False
layer_outputs = self._gradient_checkpointing_func(
layer_module.__call__,
hidden_states,
attention_mask,
layer_head_mask,
encoder_hidden_states,
encoder_attention_mask,
)
else:
layer_outputs = layer_module(
hidden_states,
attention_mask,
layer_head_mask,
encoder_hidden_states,
encoder_attention_mask,
past_key_value,
output_attentions,
query_length,
)
hidden_states = layer_outputs[0]
if use_cache:
next_decoder_cache += (layer_outputs[-1],)
if output_attentions:
all_self_attentions = all_self_attentions + (layer_outputs[1],)
if layer_module.has_cross_attention:
all_cross_attentions = all_cross_attentions + (layer_outputs[2],)
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
return tuple(
v
for v in [
hidden_states,
next_decoder_cache,
all_hidden_states,
all_self_attentions,
all_cross_attentions,
]
if v is not None
)
return BaseModelOutputWithPastAndCrossAttentions(
last_hidden_state=hidden_states,
past_key_values=next_decoder_cache,
hidden_states=all_hidden_states,
attentions=all_self_attentions,
cross_attentions=all_cross_attentions,
)
"""
Querying Transformer (Q-Former), used in BLIP-2.
"""
def __init__(self, config: Blip2QFormerConfig):
super().__init__(config)
self.config = config
self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.encoder = Blip2QFormerEncoder(config)
self.post_init()
def get_input_embeddings(self):
return self.embeddings.word_embeddings
def set_input_embeddings(self, value):
self.embeddings.word_embeddings = value
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
def get_extended_attention_mask(
self,
attention_mask: torch.Tensor,
input_shape: Tuple[int],
device: torch.device,
has_query: bool = False,
) -> torch.Tensor:
"""
Makes broadcastable attention and causal masks so that future and masked tokens are ignored.
Arguments:
attention_mask (`torch.Tensor`):
Mask with ones indicating tokens to attend to, zeros for tokens to ignore.
input_shape (`Tuple[int]`):
The shape of the input to the model.
device (`torch.device`):
The device of the input to the model.
Returns:
`torch.Tensor` The extended attention mask, with the same dtype as `attention_mask.dtype`.
"""
if attention_mask.dim() == 3:
extended_attention_mask = attention_mask[:, None, :, :]
elif attention_mask.dim() == 2:
extended_attention_mask = attention_mask[:, None, None, :]
else:
raise ValueError(
"Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format(
input_shape, attention_mask.shape
)
)
extended_attention_mask = extended_attention_mask.to(dtype=self.dtype)
extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
return extended_attention_mask
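The effect of `get_extended_attention_mask` is easiest to see on a concrete padding mask; the sketch below reproduces the broadcasting and the additive `-10000.0` trick on a toy input:
```python
import torch

attention_mask = torch.tensor([[1, 1, 1, 0]])            # (batch=1, seq_len=4), last token is padding
extended = attention_mask[:, None, None, :].to(torch.float32)
extended = (1.0 - extended) * -10000.0                   # shape (1, 1, 1, 4)
# Kept positions become 0.0, padded positions become -10000.0; this mask is
# added to the raw attention scores before the softmax.
```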
@add_start_docstrings(
"""
BLIP-2 Model for generating text and image features. The model consists of a vision encoder, Querying Transformer
(Q-Former) and a language model.
""",
BLIP_2_START_DOCSTRING,
)
class Blip2Model(Blip2PreTrainedModel):
config_class = Blip2Config
main_input_name = "pixel_values"
def __init__(self, config: Blip2Config):
super().__init__(config)
self.vision_model = Blip2VisionModel(config.vision_config)
self.query_tokens = nn.Parameter(torch.zeros(1, config.num_query_tokens, config.qformer_config.hidden_size))
self.qformer = Blip2QFormerModel(config.qformer_config)
self.language_projection = nn.Linear(config.qformer_config.hidden_size, config.text_config.hidden_size)
if config.use_decoder_only_language_model:
language_model = AutoModelForCausalLM.from_config(config.text_config)
else:
language_model = AutoModelForSeq2SeqLM.from_config(config.text_config)
if language_model._tied_weights_keys is not None:
self._tied_weights_keys = [f"language_model.{k}" for k in language_model._tied_weights_keys]
self.language_model = language_model
self.post_init()
def get_input_embeddings(self):
return self.language_model.get_input_embeddings()
def set_input_embeddings(self, value):
self.language_model.set_input_embeddings(value)
def set_output_embeddings(self, new_embeddings):
self.language_model.set_output_embeddings(new_embeddings)
def get_output_embeddings(self) -> nn.Module:
return self.language_model.get_output_embeddings()
def get_encoder(self):
return self.language_model.get_encoder()
def get_decoder(self):
return self.language_model.get_decoder()
def _tie_weights(self):
if not self.config.use_decoder_only_language_model:
self.language_model.encoder.embed_tokens = self.language_model.shared
self.language_model.decoder.embed_tokens = self.language_model.shared
@add_start_docstrings_to_model_forward(BLIP_2_TEXT_INPUTS_DOCSTRING)
def get_text_features(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
decoder_input_ids: Optional[torch.Tensor] = None,
decoder_attention_mask: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
r"""
Returns:
text_outputs (`CausalLMOutputWithPast`, or `tuple(torch.FloatTensor)` if `return_dict=False`):
The language model outputs. If `return_dict=True`, the output is a [`CausalLMOutputWithPast`] that
contains the language model logits, the past key values and the hidden states if
`output_hidden_states=True`.
Examples:
```
>>> import torch
>>> from transformers import AutoTokenizer, Blip2Model
>>> model = Blip2Model.from_pretrained("Salesforce/blip2-opt-2.7b")
>>> tokenizer = AutoTokenizer.from_pretrained("Salesforce/blip2-opt-2.7b")
>>> inputs = tokenizer(["a photo of a cat"], padding=True, return_tensors="pt")
>>> text_features = model.get_text_features(**inputs)
```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if self.config.use_decoder_only_language_model:
text_outputs = self.language_model(
input_ids=input_ids,
attention_mask=attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
else:
inputs_embeds = self.language_model.get_input_embeddings()(input_ids)
text_outputs = self.language_model(
inputs_embeds=inputs_embeds,
attention_mask=attention_mask,
decoder_input_ids=decoder_input_ids,
decoder_attention_mask=decoder_attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
labels=labels,
)
return text_outputs
@add_start_docstrings_to_model_forward(BLIP_2_VISION_INPUTS_DOCSTRING)
def get_image_features(
self,
pixel_values: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
r"""
Returns:
vision_outputs (`BaseModelOutputWithPooling` or tuple of `torch.FloatTensor`):
The vision model outputs. If `return_dict=True`, the output is a [`BaseModelOutputWithPooling`] that
contains the image features, the pooled image features and the hidden states if
`output_hidden_states=True`.
Examples:
```
>>> import torch
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, Blip2Model
>>> model = Blip2Model.from_pretrained("Salesforce/blip2-opt-2.7b")
>>> processor = AutoProcessor.from_pretrained("Salesforce/blip2-opt-2.7b")
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> inputs = processor(images=image, return_tensors="pt")
>>> image_outputs = model.get_image_features(**inputs)
```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
vision_outputs = self.vision_model(
pixel_values=pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
return vision_outputs
@add_start_docstrings_to_model_forward(BLIP_2_INPUTS_DOCSTRING)
def get_qformer_features(
self,
pixel_values: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
r"""
Returns:
vision_outputs (`BaseModelOutputWithPooling` or tuple of `torch.FloatTensor`):
The vision model outputs. If `return_dict=True`, the output is a [`BaseModelOutputWithPooling`] that
contains the image features, the pooled image features and the hidden states if
`output_hidden_states=True`.
Examples:
```
>>> import torch
>>> from PIL import Image
>>> import requests
>>> from transformers import Blip2Processor, Blip2Model
>>> processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
>>> model = Blip2Model.from_pretrained("Salesforce/blip2-opt-2.7b")
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> inputs = processor(images=image, return_tensors="pt")
>>> qformer_outputs = model.get_qformer_features(**inputs)
```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
vision_outputs = self.vision_model(
pixel_values=pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
image_embeds = vision_outputs[0]
image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long, device=image_embeds.device)
query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1)
query_outputs = self.qformer(
query_embeds=query_tokens,
encoder_hidden_states=image_embeds,
encoder_attention_mask=image_attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
return query_outputs
def forward(
self,
pixel_values: torch.FloatTensor,
input_ids: torch.FloatTensor,
attention_mask: Optional[torch.LongTensor] = None,
decoder_input_ids: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
labels: Optional[torch.LongTensor] = None,
return_dict: Optional[bool] = None,
"""
BLIP-2 Model for generating text given an image and an optional text prompt. The model consists of a vision
encoder, Querying Transformer (Q-Former) and a language model.
One can optionally pass `input_ids` to the model, which serve as a text prompt, to make the language model continue
the prompt. Otherwise, the language model starts generating text from the [BOS] (beginning-of-sequence) token.
<Tip>
Note that Flan-T5 checkpoints cannot be cast to float16. They are pre-trained using bfloat16.
</Tip>
"""
@add_start_docstrings(
"""
BLIP-2 Model for generating text given an image and an optional text prompt. The model consists of a vision
encoder, Querying Transformer (Q-Former) and a language model.
One can optionally pass `input_ids` to the model, which serve as a text prompt, to make the language model continue
the prompt. Otherwise, the language model starts generating text from the [BOS] (beginning-of-sequence) token.
<Tip>
Note that Flan-T5 checkpoints cannot be cast to float16. They are pre-trained using bfloat16.
</Tip>
""",
BLIP_2_START_DOCSTRING,
)
class Blip2ForConditionalGeneration(Blip2PreTrainedModel):
config_class = Blip2Config
main_input_name = "pixel_values"
def __init__(self, config: Blip2Config):
super().__init__(config)
self.vision_model = Blip2VisionModel(config.vision_config)
self.query_tokens = nn.Parameter(torch.zeros(1, config.num_query_tokens, config.qformer_config.hidden_size))
self.qformer = Blip2QFormerModel(config.qformer_config)
self.language_projection = nn.Linear(config.qformer_config.hidden_size, config.text_config.hidden_size)
if config.use_decoder_only_language_model:
language_model = AutoModelForCausalLM.from_config(config.text_config)
else:
language_model = AutoModelForSeq2SeqLM.from_config(config.text_config)
if language_model._tied_weights_keys is not None:
self._tied_weights_keys = [f"language_model.{k}" for k in language_model._tied_weights_keys]
self.language_model = language_model
self.post_init()
def get_input_embeddings(self):
return self.language_model.get_input_embeddings()
def set_input_embeddings(self, value):
self.language_model.set_input_embeddings(value)
def set_output_embeddings(self, new_embeddings):
self.language_model.set_output_embeddings(new_embeddings)
def get_output_embeddings(self) -> nn.Module:
return self.language_model.get_output_embeddings()
def get_encoder(self):
return self.language_model.get_encoder()
def get_decoder(self):
return self.language_model.get_decoder()
def _tie_weights(self):
if not self.config.use_decoder_only_language_model:
self.language_model.encoder.embed_tokens = self.language_model.shared
self.language_model.decoder.embed_tokens = self.language_model.shared
def _preprocess_accelerate(self):
r"""
Some pre-processing hacks to make the model `accelerate` compatible. Check
https://github.com/huggingface/transformers/pull/21707 for more details.
"""
hf_device_map = self.hf_device_map
if len(hf_device_map) > 1 and "language_model" not in hf_device_map and torch.cuda.device_count() > 1:
logger.warning(
"The `language_model` is not in the `hf_device_map` dictionary and you are running your script"
" in a multi-GPU environment. this may lead to unexpected behavior when using `accelerate`."
" Please pass a `device_map` that contains `language_model` to remove this warning."
" Please refer to https://github.com/huggingface/blog/blob/main/accelerate-large-models.md for"
" more details on creating a `device_map` for large models.",
)
if hasattr(self.language_model, "_hf_hook"):
self.language_model._hf_hook.io_same_device = True
@add_start_docstrings_to_model_forward(BLIP_2_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=Blip2ForConditionalGenerationModelOutput, config_class=Blip2VisionConfig)
def forward(
self,
pixel_values: torch.FloatTensor,
input_ids: torch.FloatTensor,
attention_mask: Optional[torch.LongTensor] = None,
decoder_input_ids: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
labels: Optional[torch.LongTensor] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, Blip2ForConditionalGenerationModelOutput]:
@torch.no_grad()
def generate(
self,
pixel_values: torch.FloatTensor,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.LongTensor] = None,
**generate_kwargs,
) -> torch.LongTensor:
"""
Overrides `generate` function to be able to use the model as a conditional generator.
Args:
pixel_values (`torch.FloatTensor` of shape (batch_size, num_channels, height, width)):
Input images to be processed.
input_ids (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
The sequence used as a prompt for the generation.
attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
Mask to avoid performing attention on padding token indices
Returns:
captions (list): A list of strings of length batch_size * num_captions.
"""
if hasattr(self, "hf_device_map"):
self._preprocess_accelerate()
batch_size = pixel_values.shape[0]
image_embeds = self.vision_model(pixel_values, return_dict=True).last_hidden_state
image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long, device=image_embeds.device)
query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1)
query_outputs = self.qformer(
query_embeds=query_tokens,
encoder_hidden_states=image_embeds,
encoder_attention_mask=image_attention_mask,
return_dict=True,
)
query_output = query_outputs.last_hidden_state
language_model_inputs = self.language_projection(query_output)
language_attention_mask = torch.ones(
language_model_inputs.size()[:-1], dtype=torch.long, device=language_model_inputs.device
)
if input_ids is None:
input_ids = (
torch.LongTensor([[self.config.text_config.bos_token_id]])
.repeat(batch_size, 1)
.to(image_embeds.device)
)
if attention_mask is None:
attention_mask = torch.ones_like(input_ids)
attention_mask = torch.cat([language_attention_mask, attention_mask.to(language_attention_mask.device)], dim=1)
inputs_embeds = self.get_input_embeddings()(input_ids)
inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1)
if not self.language_model.config.is_encoder_decoder:
generate_kwargs["max_length"] = generate_kwargs.get("max_length", 20) + language_model_inputs.shape[1]
generate_kwargs["min_length"] = generate_kwargs.get("min_length", 0) + language_model_inputs.shape[1]
outputs = self.language_model.generate(
inputs_embeds=inputs_embeds,
attention_mask=attention_mask,
**generate_kwargs,
)
return outputs
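A typical image-captioning call ties the pieces together: the processor prepares `pixel_values`, `generate` prepends the projected query outputs, and the tokenizer decodes the result. A sketch, assuming the `Salesforce/blip2-opt-2.7b` checkpoint, a CUDA device, and a local `example.jpg` (the file name is hypothetical):
```python
import torch
from PIL import Image
from transformers import Blip2Processor, Blip2ForConditionalGeneration

processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
model = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-opt-2.7b", torch_dtype=torch.float16
).to("cuda")

image = Image.open("example.jpg")                          # hypothetical local image
inputs = processor(images=image, return_tensors="pt").to("cuda", torch.float16)
generated_ids = model.generate(**inputs, max_new_tokens=20)
caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
```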
.\models\blip_2\processing_blip_2.py
"""
Processor class for BLIP-2.
"""
from typing import List, Optional, Union
from ...image_utils import ImageInput
from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
from ...utils import TensorType
class Blip2Processor(ProcessorMixin):
r"""
Constructs a BLIP-2 processor which wraps a BLIP image processor and an OPT/T5 tokenizer into a single processor.
[`BlipProcessor`] offers all the functionalities of [`BlipImageProcessor`] and [`AutoTokenizer`]. See the docstring
of [`~BlipProcessor.__call__`] and [`~BlipProcessor.decode`] for more information.
Args:
image_processor (`BlipImageProcessor`):
An instance of [`BlipImageProcessor`]. The image processor is a required input.
tokenizer (`AutoTokenizer`):
An instance of [`PreTrainedTokenizer`]. The tokenizer is a required input.
"""
attributes = ["image_processor", "tokenizer"]
image_processor_class = "BlipImageProcessor"
tokenizer_class = "AutoTokenizer"
def __init__(self, image_processor, tokenizer):
tokenizer.return_token_type_ids = False
super().__init__(image_processor, tokenizer)
self.current_processor = self.image_processor
def __call__(
self,
images: ImageInput = None,
text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = None,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_token_type_ids: bool = False,
return_length: bool = False,
verbose: bool = True,
return_tensors: Optional[Union[str, TensorType]] = None,
**kwargs,
) -> BatchEncoding:
"""
This method uses [`BlipImageProcessor.__call__`] to prepare image input(s) for the model and
[`BertTokenizerFast.__call__`] to prepare text input for the model.
Please refer to the docstrings of those two methods for more information.
"""
if images is None and text is None:
raise ValueError("You have to specify either images or text.")
if images is None:
self.current_processor = self.tokenizer
text_encoding = self.tokenizer(
text=text,
add_special_tokens=add_special_tokens,
padding=padding,
truncation=truncation,
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_token_type_ids=return_token_type_ids,
return_length=return_length,
verbose=verbose,
return_tensors=return_tensors,
**kwargs,
)
return text_encoding
encoding_image_processor = self.image_processor(images, return_tensors=return_tensors)
if text is not None:
text_encoding = self.tokenizer(
text=text,
add_special_tokens=add_special_tokens,
padding=padding,
truncation=truncation,
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_token_type_ids=return_token_type_ids,
return_length=return_length,
verbose=verbose,
return_tensors=return_tensors,
**kwargs,
)
else:
text_encoding = None
if text_encoding is not None:
encoding_image_processor.update(text_encoding)
return encoding_image_processor
def batch_decode(self, *args, **kwargs):
"""
This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please refer to the docstring of this method for more information.
"""
return self.tokenizer.batch_decode(*args, **kwargs)
def decode(self, *args, **kwargs):
"""
This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to
the docstring of this method for more information.
"""
return self.tokenizer.decode(*args, **kwargs)
@property
def model_input_names(self):
tokenizer_input_names = self.tokenizer.model_input_names
image_processor_input_names = self.image_processor.model_input_names
return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
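A minimal sketch of how the processor behaves depending on which inputs are provided (the checkpoint name matches the one used in the docstrings above; `example.jpg` is a hypothetical local file):
```python
from PIL import Image
from transformers import Blip2Processor

processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
image = Image.open("example.jpg")

text_only = processor(text="a photo of a cat", return_tensors="pt")     # input_ids, attention_mask
image_only = processor(images=image, return_tensors="pt")               # pixel_values
both = processor(images=image, text="a photo of", return_tensors="pt")  # pixel_values + text fields merged
```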
.\models\blip_2\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
_import_structure = {
"configuration_blip_2": [
"BLIP_2_PRETRAINED_CONFIG_ARCHIVE_MAP",
"Blip2Config",
"Blip2QFormerConfig",
"Blip2VisionConfig",
],
"processing_blip_2": ["Blip2Processor"],
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_blip_2"] = [
"BLIP_2_PRETRAINED_MODEL_ARCHIVE_LIST",
"Blip2Model",
"Blip2QFormerModel",
"Blip2PreTrainedModel",
"Blip2ForConditionalGeneration",
"Blip2VisionModel",
]
if TYPE_CHECKING:
from .configuration_blip_2 import (
BLIP_2_PRETRAINED_CONFIG_ARCHIVE_MAP,
Blip2Config,
Blip2QFormerConfig,
Blip2VisionConfig,
)
from .processing_blip_2 import Blip2Processor
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_blip_2 import (
BLIP_2_PRETRAINED_MODEL_ARCHIVE_LIST,
Blip2ForConditionalGeneration,
Blip2Model,
Blip2PreTrainedModel,
Blip2QFormerModel,
Blip2VisionModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\bloom\configuration_bloom.py
""" Bloom configuration"""
from collections import OrderedDict
from typing import TYPE_CHECKING, Any, List, Mapping, Optional
from packaging import version
if TYPE_CHECKING:
from ... import PreTrainedTokenizer, TensorType
from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfigWithPast, PatchingSpec
from ...utils import is_torch_available, logging
logger = logging.get_logger(__name__)
BLOOM_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"bigscience/bloom": "https://huggingface.co/bigscience/bloom/resolve/main/config.json",
"bigscience/bloom-560m": "https://huggingface.co/bigscience/bloom-560m/blob/main/config.json",
"bigscience/bloom-1b1": "https://huggingface.co/bigscience/bloom-1b1/blob/main/config.json",
"bigscience/bloom-1b7": "https://huggingface.co/bigscience/bloom-1b7/blob/main/config.json",
"bigscience/bloom-3b": "https://huggingface.co/bigscience/bloom-3b/blob/main/config.json",
"bigscience/bloom-7b1": "https://huggingface.co/bigscience/bloom-7b1/blob/main/config.json",
}
class BloomConfig(PretrainedConfig):
"""
This is the configuration class to store the configuration of a [`BloomModel`]. It is used to instantiate a Bloom
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to the Bloom architecture
[bigscience/bloom](https://huggingface.co/bigscience/bloom).
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
"""
Args:
vocab_size (`int`, *optional*, defaults to 250880):
Vocabulary size of the Bloom model. Defines the maximum number of different tokens that can be represented by the `inputs_ids` passed when calling [`BloomModel`].
See [this discussion](https://huggingface.co/bigscience/bloom/discussions/120) on how the `vocab_size` was defined.
hidden_size (`int`, *optional*, defaults to 64):
Dimensionality of the embeddings and hidden states.
n_layer (`int`, *optional*, defaults to 2):
Number of hidden layers in the Transformer encoder.
n_head (`int`, *optional*, defaults to 8):
Number of attention heads for each attention layer in the Transformer encoder.
layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
The epsilon used by the layer normalization layers.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated normal initializer for initializing all weight matrices.
apply_residual_connection_post_layernorm (`bool`, *optional*, defaults to `False`):
If enabled, use the layer norm of the hidden states as the residual in the transformer blocks.
hidden_dropout (`float`, *optional*, defaults to 0.1):
Dropout rate applied in the bias dropout.
attention_dropout (`float`, *optional*, defaults to 0.1):
Dropout rate applied to the attention probabilities.
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/value attentions (not used by all models).
pretraining_tp (`int`, *optional*, defaults to `1`):
Experimental feature. Tensor parallelism rank used during pretraining with Megatron. Please refer to [this document](https://huggingface.co/docs/transformers/parallelism) to learn more about it.
This value is necessary to ensure exact reproducibility of the pretraining results. Please refer to [this issue](https://github.com/pytorch/pytorch/issues/76232).
Note that this is only enabled when `slow_but_exact=True`.
slow_but_exact (`bool`, *optional*, defaults to `False`):
Experimental feature. Whether to use the slow but exact implementation of the attention mechanism. While merging the TP rank tensors, the results may differ slightly between a model trained on Megatron and this implementation because of slicing operations.
Please refer to [this issue](https://github.com/pytorch/pytorch/issues/76232). Enabling this feature gives more accurate results but increases inference time.
This will likely be resolved in the future once the main model has been fine-tuned with TP_rank=1.
from transformers import BloomConfig, BloomModel
configuration = BloomConfig()
model = BloomModel(configuration)
configuration = model.config
model_type = "bloom"
keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {
"num_hidden_layers": "n_layer",
"num_attention_heads": "n_head",
}
def __init__(
self,
vocab_size=250880,
hidden_size=64,
n_layer=2,
n_head=8,
layer_norm_epsilon=1e-5,
initializer_range=0.02,
use_cache=True,
bos_token_id=1,
eos_token_id=2,
apply_residual_connection_post_layernorm=False,
hidden_dropout=0.0,
attention_dropout=0.0,
pretraining_tp=1,
slow_but_exact=False,
**kwargs,
):
self.vocab_size = vocab_size
n_embed = kwargs.pop("n_embed", None)
self.hidden_size = hidden_size if n_embed is None else n_embed
self.n_layer = n_layer
self.n_head = n_head
self.layer_norm_epsilon = layer_norm_epsilon
self.initializer_range = initializer_range
self.use_cache = use_cache
self.pretraining_tp = pretraining_tp
self.apply_residual_connection_post_layernorm = apply_residual_connection_post_layernorm
self.hidden_dropout = hidden_dropout
self.attention_dropout = attention_dropout
self.bos_token_id = bos_token_id
self.eos_token_id = eos_token_id
self.slow_but_exact = slow_but_exact
super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
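Because of `attribute_map`, the library-wide generic names resolve to Bloom's own field names; a quick check:
```python
from transformers import BloomConfig

config = BloomConfig(n_layer=4, n_head=8, hidden_size=64)
assert config.num_hidden_layers == config.n_layer == 4    # aliased through attribute_map
assert config.num_attention_heads == config.n_head == 8
```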
class BloomOnnxConfig(OnnxConfigWithPast):
torch_onnx_minimum_version = version.parse("1.12")
def __init__(
self,
config: PretrainedConfig,
task: str = "default",
patching_specs: List[PatchingSpec] = None,
use_past: bool = False,
):
super().__init__(config, task=task, patching_specs=patching_specs, use_past=use_past)
if not getattr(self._config, "pad_token_id", None):
self._config.pad_token_id = 0
@property
def inputs(self) -> Mapping[str, Mapping[int, str]]:
common_inputs = OrderedDict({"input_ids": {0: "batch", 1: "sequence"}})
if self.use_past:
self.fill_with_past_key_values_(common_inputs, direction="inputs", inverted_values_shape=True)
common_inputs["attention_mask"] = {0: "batch", 1: "past_sequence + sequence"}
else:
common_inputs["attention_mask"] = {0: "batch", 1: "sequence"}
return common_inputs
@property
def num_layers(self) -> int:
return self._config.n_layer
@property
def num_attention_heads(self) -> int:
return self._config.n_head
@property
def atol_for_validation(self) -> float:
return 1e-3
def generate_dummy_inputs(
self,
tokenizer: "PreTrainedTokenizer",
batch_size: int = -1,
seq_length: int = -1,
is_pair: bool = False,
framework: Optional["TensorType"] = None,
) -> Mapping[str, Any]:
common_inputs = super(OnnxConfigWithPast, self).generate_dummy_inputs(
tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair, framework=framework
)
ordered_inputs = OrderedDict({"input_ids": common_inputs["input_ids"]})
if self.use_past:
if not is_torch_available():
raise ValueError("Cannot generate dummy past_keys inputs without PyTorch installed.")
else:
import torch
batch, seqlen = common_inputs["input_ids"].shape
past_key_values_length = seqlen + 2
head_dim = self._config.hidden_size // self.num_attention_heads
past_key_shape = (
batch * self.num_attention_heads,
head_dim,
past_key_values_length,
)
past_value_shape = (
batch * self.num_attention_heads,
past_key_values_length,
head_dim,
)
ordered_inputs["past_key_values"] = [
(torch.zeros(past_key_shape), torch.zeros(past_value_shape)) for _ in range(self.num_layers)
]
ordered_inputs["attention_mask"] = common_inputs["attention_mask"]
if self.use_past:
mask_dtype = ordered_inputs["attention_mask"].dtype
ordered_inputs["attention_mask"] = torch.cat(
[ordered_inputs["attention_mask"], torch.ones(batch, past_key_values_length, dtype=mask_dtype)], dim=1
)
return ordered_inputs
@property
def default_onnx_opset(self) -> int:
return 13
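BLOOM's cache layout fuses the batch and head dimensions and transposes keys relative to values, which is why `generate_dummy_inputs` builds two different shapes. A numeric sketch with assumed sizes:
```python
# Assumed sizes for illustration.
batch, seqlen, n_head, hidden_size = 2, 5, 8, 64
head_dim = hidden_size // n_head                           # 8
past_key_values_length = seqlen + 2                        # 7, as in generate_dummy_inputs
past_key_shape = (batch * n_head, head_dim, past_key_values_length)    # (16, 8, 7)
past_value_shape = (batch * n_head, past_key_values_length, head_dim)  # (16, 7, 8)
```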
.\models\bloom\convert_bloom_original_checkpoint_to_pytorch.py
"""Convert BigScience BLOOM checkpoint."""
import argparse
import json
import os
import re
import torch
from transformers import BloomConfig, BloomModel
from transformers.file_utils import CONFIG_NAME, WEIGHTS_NAME
from transformers.utils import logging
logging.set_verbosity_info()
WEIGHTS_TO_AVERAGE_ENDSWITH = [
"word_embeddings_layernorm.weight",
"word_embeddings_layernorm.bias",
"input_layernorm.weight",
"input_layernorm.bias",
"post_attention_layernorm.weight",
"post_attention_layernorm.bias",
"self_attention.dense.bias",
"mlp.dense_4h_to_h.bias",
"ln_f.weight",
"ln_f.bias",
]
WEIGHTS_WITH_ROW_PARALLELISM_CONTAIN = [
"mlp.dense_4h_to_h.weight",
"self_attention.dense.weight",
]
def layer_name_mapping(key, file):
"""Convert Megatron-DeepSpeed TP/PP weights mapping in transformers PP only"""
layer_rename_map = {
"word_embeddings.weight": "word_embeddings.weight",
"word_embeddings.norm.weight": "word_embeddings_layernorm.weight",
"word_embeddings.norm.bias": "word_embeddings_layernorm.bias",
"weight": "ln_f.weight",
"bias": "ln_f.bias",
}
if key in layer_rename_map:
return layer_rename_map[key]
layer_number = int(re.match(r".*layer_(\d*).*", file)[1])
layer_number -= 3
return f"h.{layer_number}." + key
def get_dtype_size(dtype):
"""获取数据类型的字节大小"""
if dtype == torch.bool:
return 1 / 8
bit_search = re.search(r"[^\d](\d+)$", str(dtype))
if bit_search is None:
raise ValueError(f"`dtype` is not a valid dtype: {dtype}.")
bit_size = int(bit_search.groups()[0])
return bit_size // 8
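# Illustrative sketch (not part of the original source): get_dtype_size parses the per-element size in bytes
# from the trailing digits of the dtype's string form; torch.bool is special-cased as 1/8 byte.
print(get_dtype_size(torch.float16), get_dtype_size(torch.float32), get_dtype_size(torch.bool))  # 2 4 0.125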
def convert_bloom_checkpoint_to_pytorch(
bloom_checkpoint_path, bloom_config_file, pytorch_dump_folder_path, shard_model, pretraining_tp
):
"""将 BLOOM 模型的检查点文件转换为 PyTorch 模型"""
if bloom_config_file == "":
config = BloomConfig()
else:
config = BloomConfig.from_json_file(bloom_config_file)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--bloom_checkpoint_path",
default=None,
type=str,
required=True,
help="Path to the Megatron-LM checkpoint path.",
)
parser.add_argument(
"--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
)
parser.add_argument(
"--bloom_config_file",
default="",
type=str,
help=(
"An optional config json file corresponding to the pre-trained model. \n"
"This specifies the model architecture."
),
)
parser.add_argument(
"--shard_model",
action="store_true",
help="An optional setting to shard the output model \nThis enables sharding the converted checkpoint",
)
parser.add_argument(
"--pretraining_tp",
default=4,
type=int,
help="Pretraining TP rank that has been used when training the model in Megatron-LM \n",
)
args = parser.parse_args()
convert_bloom_checkpoint_to_pytorch(
args.bloom_checkpoint_path,
args.bloom_config_file,
args.pytorch_dump_folder_path,
args.shard_model,
args.pretraining_tp,
)
.\models\bloom\modeling_bloom.py
import math
import warnings
from typing import Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, LayerNorm, MSELoss
from torch.nn import functional as F
from ...file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward
from ...modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
from ...modeling_outputs import (
BaseModelOutputWithPastAndCrossAttentions,
CausalLMOutputWithCrossAttentions,
QuestionAnsweringModelOutput,
SequenceClassifierOutputWithPast,
TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...utils import logging
from .configuration_bloom import BloomConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "bigscience/bloom-560m"
_CONFIG_FOR_DOC = "BloomConfig"
BLOOM_PRETRAINED_MODEL_ARCHIVE_LIST = [
"bigscience/bigscience-small-testing",
"bigscience/bloom-560m",
"bigscience/bloom-1b1",
"bigscience/bloom-1b7",
"bigscience/bloom-3b",
"bigscience/bloom-7b1",
"bigscience/bloom",
]
def build_alibi_tensor(attention_mask: torch.Tensor, num_heads: int, dtype: torch.dtype) -> torch.Tensor:
"""
Build the ALiBi tensor, see: https://arxiv.org/abs/2108.12409. The ALiBi tensor is not causal; as the original
paper notes, it relies on the translation invariance of softmax for a fast implementation: for a tensor l and a
fixed value a, `softmax(l + a) = softmax(l)`.
Based on https://github.com/ofirpress/attention_with_linear_biases/blob/a35aaca144e0eb6b789dfcb46784c4b8e31b7983/fairseq/models/transformer.py#L742
TODO @thomasw21: this does not fully work with the masking strategy, hence the masking differs slightly.
Args:
    attention_mask (`torch.Tensor`):
        Token-level attention mask, expected to be of shape (batch_size, max_seq_len).
    num_heads (`int`, *required*):
        Number of attention heads.
    dtype (`torch.dtype`, *optional*, default=`torch.bfloat16`):
        Data type of the output tensor.
Returns:
    torch.Tensor:
        A tensor of shape (batch_size * num_heads, 1, max_seq_len).
"""
batch_size, seq_length = attention_mask.shape
closest_power_of_2 = 2 ** math.floor(math.log2(num_heads))
base = torch.tensor(
2 ** (-(2 ** -(math.log2(closest_power_of_2) - 3))), device=attention_mask.device, dtype=torch.float32
)
powers = torch.arange(1, 1 + closest_power_of_2, device=attention_mask.device, dtype=torch.int32)
slopes = torch.pow(base, powers)
if closest_power_of_2 != num_heads:
extra_base = torch.tensor(
2 ** (-(2 ** -(math.log2(2 * closest_power_of_2) - 3))), device=attention_mask.device, dtype=torch.float32
)
num_remaining_heads = min(closest_power_of_2, num_heads - closest_power_of_2)
extra_powers = torch.arange(1, 1 + 2 * num_remaining_heads, 2, device=attention_mask.device, dtype=torch.int32)
slopes = torch.cat([slopes, torch.pow(extra_base, extra_powers)], dim=0)
arange_tensor = ((attention_mask.cumsum(dim=-1) - 1) * attention_mask)[:, None, :]
alibi = slopes[..., None] * arange_tensor
return alibi.reshape(batch_size * num_heads, 1, seq_length).to(dtype)
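# Illustrative sketch (not part of the original source): calling build_alibi_tensor on a toy mask. Each head
# gets a fixed slope (a power of the geometric base computed above) multiplied by the 0-based position index
# of every non-padding token; the result is flattened to (batch_size * num_heads, 1, seq_length).
toy_mask = torch.tensor([[1, 1, 1, 1], [0, 1, 1, 1]])                  # batch of 2, second row left-padded
toy_alibi = build_alibi_tensor(toy_mask, num_heads=4, dtype=torch.float32)
print(toy_alibi.shape)    # torch.Size([8, 1, 4])
print(toy_alibi[0, 0])    # first head of the first sequence: slope_0 * tensor([0., 1., 2., 3.])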
def dropout_add(x: torch.Tensor, residual: torch.Tensor, prob: float, training: bool) -> torch.Tensor:
"""
Dropout add function
Args:
x (`torch.tensor`, *required*):
input tensor
residual (`torch.tensor`, *required*):
residual tensor
prob (`float`, *required*):
dropout probability
training (`bool`, *required*):
training mode
"""
out = F.dropout(x, p=prob, training=training)
out = residual + out
return out
def bloom_gelu_forward(x: torch.Tensor) -> torch.Tensor:
"""
Custom bias GELU function. Adapted from Megatron-DeepSpeed code. Here we use a simple implementation (inference) to
make the model jittable.
Args:
x (`torch.tensor`, *required*):
input hidden states
"""
return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)))
def bloom_gelu_back(g: torch.Tensor, x: torch.Tensor) -> torch.Tensor:
"""
gradient of tanh approximation of gelu gradient of actual gelu is: 0.5 * (1. + torch.erf(x * 0.70710678)) +
0.3989423 * x * torch.exp(-0.5 * x * x)
Args:
g (`torch.tensor`, *required*):
gradient output tensor
x (`torch.tensor`, *required*):
input tensor
"""
x = x[0]
tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))
ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out)
return ff * g
class GeLUFunction(torch.autograd.Function):
@staticmethod
def forward(ctx, input: torch.Tensor) -> torch.Tensor:
"""
Forward pass of the custom autograd function for GeLU.
Args:
ctx (`torch.autograd.function.Context`):
context object to save tensors for backward pass
input (`torch.tensor`, *required*):
input tensor
"""
ctx.save_for_backward(input)
return bloom_gelu_forward(input)
@staticmethod
def backward(ctx, grad_output: torch.Tensor) -> torch.Tensor:
"""
Backward pass of the custom autograd function for GeLU.
Args:
ctx (`torch.autograd.function.Context`):
context object holding saved tensors from forward pass
grad_output (`torch.tensor`, *required*):
gradient of the output tensor
"""
input = ctx.saved_tensors
tmp = bloom_gelu_back(grad_output, input)
return tmp
class BloomGelu(nn.Module):
"""
BloomBiasGelu wrapper function that make use of the simple function on inference mode to make the model
torchscriptable and use the autograd function in training mode to get the accurate results of the gradients Partly
copied from Megatron-DeepSpeed code and adapted for our needs
See here why autograd functions are not torchscriptable: https://github.com/pytorch/pytorch/issues/22329
"""
def __init__(self):
super().__init__()
def forward(self, x: torch.Tensor) -> torch.Tensor:
"""
Forward pass of the BloomGelu module.
Args:
x (`torch.tensor`, *required*):
input tensor
"""
if self.training:
return GeLUFunction.apply(x)
else:
return bloom_gelu_forward(x)
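# Illustrative sketch (not part of the original source): bloom_gelu_forward is the tanh approximation of GELU,
# so it should closely match PyTorch's built-in tanh-approximate GELU (the `approximate` argument needs
# PyTorch >= 1.12); GeLUFunction routes gradients through the hand-written bloom_gelu_back.
toy_x = torch.randn(8, requires_grad=True)
assert torch.allclose(bloom_gelu_forward(toy_x), F.gelu(toy_x, approximate="tanh"), atol=1e-4)
GeLUFunction.apply(toy_x).sum().backward()    # gradients come from the custom backward pass
print(toy_x.grad.shape)                       # torch.Size([8])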
class BloomAttention(nn.Module):
def __init__(self, config: BloomConfig):
super().__init__()
self.pretraining_tp = config.pretraining_tp
self.slow_but_exact = config.slow_but_exact
self.hidden_size = config.hidden_size
self.num_heads = config.n_head
self.head_dim = self.hidden_size // self.num_heads
self.split_size = self.hidden_size
self.hidden_dropout = config.hidden_dropout
if self.head_dim * self.num_heads != self.hidden_size:
raise ValueError(
f"`hidden_size` must be divisible by num_heads (got `hidden_size`: {self.hidden_size} and `num_heads`:"
f" {self.num_heads})."
)
self.inv_norm_factor = 1.0 / math.sqrt(self.head_dim)
self.beta = 1.0
self.query_key_value = nn.Linear(self.hidden_size, 3 * self.hidden_size, bias=True)
self.dense = nn.Linear(self.hidden_size, self.hidden_size)
self.attention_dropout = nn.Dropout(config.attention_dropout)
def _split_heads(self, fused_qkv: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
"""
Split the last dimension into (num_heads, head_dim) without making any copies, results share same memory
storage as `fused_qkv`
Args:
fused_qkv (`torch.tensor`, *required*): [batch_size, seq_length, num_heads * 3 * head_dim]
Returns:
query: [batch_size, seq_length, num_heads, head_dim] key: [batch_size, seq_length, num_heads, head_dim]
value: [batch_size, seq_length, num_heads, head_dim]
"""
batch_size, seq_length, three_times_hidden_size = fused_qkv.shape
fused_qkv = fused_qkv.view(batch_size, seq_length, self.num_heads, 3, self.head_dim)
return fused_qkv[..., 0, :], fused_qkv[..., 1, :], fused_qkv[..., 2, :]
def _merge_heads(self, x: torch.Tensor) -> torch.Tensor:
"""
Merge heads together over the last dimension
Args:
x (`torch.tensor`, *required*): [batch_size * num_heads, seq_length, head_dim]
Returns:
torch.tensor: [batch_size, seq_length, num_heads * head_dim]
"""
batch_size_and_num_heads, seq_length, _ = x.shape
batch_size = batch_size_and_num_heads // self.num_heads
x = x.view(batch_size, self.num_heads, seq_length, self.head_dim)
x = x.permute(0, 2, 1, 3)
x = x.reshape(batch_size, seq_length, self.num_heads * self.head_dim)
return x
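# Illustrative sketch (not part of the original source): round-tripping tensors through _split_heads and
# _merge_heads on a toy BloomAttention; the config values below are arbitrary (head_dim = 16 / 4 = 4).
toy_attn = BloomAttention(BloomConfig(hidden_size=16, n_head=4))
toy_fused_qkv = torch.randn(2, 5, 3 * 16)                     # [batch_size, seq_length, 3 * hidden_size]
q, k, v = toy_attn._split_heads(toy_fused_qkv)
print(q.shape)                                                # torch.Size([2, 5, 4, 4]) -> [batch, seq, heads, head_dim]
print(toy_attn._merge_heads(torch.randn(2 * 4, 5, 4)).shape)  # torch.Size([2, 5, 16])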
def forward(
self,
hidden_states: torch.Tensor,
residual: torch.Tensor,
alibi: torch.Tensor,
attention_mask: torch.Tensor,
layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
head_mask: Optional[torch.Tensor] = None,
use_cache: bool = False,
output_attentions: bool = False,
):
pass
class BloomMLP(nn.Module):
def __init__(self, config: BloomConfig):
super().__init__()
hidden_size = config.hidden_size
self.pretraining_tp = config.pretraining_tp
self.slow_but_exact = config.slow_but_exact
self.dense_h_to_4h = nn.Linear(hidden_size, 4 * hidden_size)
self.gelu_impl = BloomGelu()
self.dense_4h_to_h = nn.Linear(4 * hidden_size, hidden_size)
self.hidden_dropout = config.hidden_dropout
def forward(self, hidden_states: torch.Tensor, residual: torch.Tensor) -> torch.Tensor:
hidden_states = self.gelu_impl(self.dense_h_to_4h(hidden_states))
if self.pretraining_tp > 1 and self.slow_but_exact:
intermediate_output = torch.zeros_like(residual)
slices = self.dense_4h_to_h.weight.shape[-1] / self.pretraining_tp
for i in range(self.pretraining_tp):
intermediate_output = intermediate_output + F.linear(
hidden_states[:, :, int(i * slices) : int((i + 1) * slices)],
self.dense_4h_to_h.weight[:, int(i * slices) : int((i + 1) * slices)],
)
else:
intermediate_output = self.dense_4h_to_h(hidden_states)
output = dropout_add(intermediate_output, residual, self.hidden_dropout, self.training)
return output
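# Illustrative sketch (not part of the original source): the `slow_but_exact` branch above splits the 4h -> h
# projection into `pretraining_tp` column slices of the weight and sums the partial F.linear results, which is
# numerically equivalent (up to float summation order) to a single matmul with the full weight; toy sizes below.
toy_hidden, tp = torch.randn(2, 5, 8), 4
toy_weight = torch.randn(4, 8)                                # [out_features, in_features], no bias
slice_width = toy_weight.shape[-1] / tp
toy_out = torch.zeros(2, 5, 4)
for i in range(tp):
    toy_out = toy_out + F.linear(
        toy_hidden[:, :, int(i * slice_width) : int((i + 1) * slice_width)],
        toy_weight[:, int(i * slice_width) : int((i + 1) * slice_width)],
    )
assert torch.allclose(toy_out, F.linear(toy_hidden, toy_weight), atol=1e-4)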
class BloomBlock(nn.Module):
def __init__(self, config: BloomConfig):
super().__init__()
hidden_size = config.hidden_size
self.input_layernorm = LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
self.num_heads = config.n_head
self.self_attention = BloomAttention(config)
self.post_attention_layernorm = LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
self.mlp = BloomMLP(config)
self.apply_residual_connection_post_layernorm = config.apply_residual_connection_post_layernorm
self.hidden_dropout = config.hidden_dropout
def forward(
self,
hidden_states: torch.Tensor,
alibi: torch.Tensor,
attention_mask: torch.Tensor,
layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
head_mask: Optional[torch.Tensor] = None,
use_cache: bool = False,
output_attentions: bool = False,
):
layernorm_output = self.input_layernorm(hidden_states)
if self.apply_residual_connection_post_layernorm:
residual = layernorm_output
else:
residual = hidden_states
attn_outputs = self.self_attention(
layernorm_output,
residual,
layer_past=layer_past,
attention_mask=attention_mask,
alibi=alibi,
head_mask=head_mask,
use_cache=use_cache,
output_attentions=output_attentions
)
attention_output = attn_outputs[0]
outputs = attn_outputs[1:]
layernorm_output = self.post_attention_layernorm(attention_output)
if self.apply_residual_connection_post_layernorm:
residual = layernorm_output
else:
residual = attention_output
output = self.mlp(layernorm_output, residual)
if use_cache:
outputs = (output,) + outputs
else:
outputs = (output,) + outputs[1:]
return outputs
@staticmethod
def _convert_to_bloom_cache(
past_key_value: Tuple[Tuple[torch.Tensor, torch.Tensor]],
) -> Tuple[Tuple[torch.Tensor, torch.Tensor]]:
"""
Converts the cache to the format expected by Bloom, i.e. to tuple(tuple([batch_size * num_heads, ...]))
"""
batch_size, num_heads, head_dim, seq_length = past_key_value[0][0].shape
batch_size_times_num_heads = batch_size * num_heads
return tuple(
(
layer_past[0].view(batch_size_times_num_heads, head_dim, seq_length),
layer_past[1].view(batch_size_times_num_heads, seq_length, head_dim),
)
for layer_past in past_key_value
)
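# Illustrative sketch (not part of the original source): _convert_to_bloom_cache fuses the batch and head
# dimensions of a standard-layout cache, turning keys [batch, num_heads, head_dim, seq_len] into
# [batch * num_heads, head_dim, seq_len] and values [batch, num_heads, seq_len, head_dim] into
# [batch * num_heads, seq_len, head_dim]; the static method is inherited by the model classes defined below.
toy_standard_cache = ((torch.zeros(2, 4, 8, 5), torch.zeros(2, 4, 5, 8)),)   # one layer: batch=2, heads=4
toy_bloom_cache = BloomForCausalLM._convert_to_bloom_cache(toy_standard_cache)
print(toy_bloom_cache[0][0].shape, toy_bloom_cache[0][1].shape)  # torch.Size([8, 8, 5]) torch.Size([8, 5, 8])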
"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`BloomConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
BLOOM_INPUTS_DOCSTRING = r"""
Args:
input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary.
past_key_values (:obj:`Tuple[Tuple[torch.Tensor, torch.Tensor], ...]`, `optional`):
    Tuple of length `config.num_hidden_layers`, containing tuples (`key`, `value`) precomputed by the
    self-attention blocks, which can be used to speed up sequential decoding.
attention_mask (:obj:`torch.Tensor`, `optional`):
    Mask to avoid performing attention on padding token indices. It is a tensor with shape
    `(batch_size, sequence_length)`, where each value is `1` for tokens that are **not masked** and `0` for
    padding tokens that are **masked**.
head_mask (:obj:`torch.LongTensor`, `optional`):
    Mask to nullify selected heads of the self-attention modules. It is a tensor of shape
    `(num_heads,)`, where each value is either `0` or `1`. A `1` indicates the head is **not masked**, while a
    `0` indicates the head is masked.
inputs_embeds (:obj:`torch.FloatTensor`, `optional`):
    Embedded representation of the inputs. It is a tensor of shape `(batch_size, sequence_length,
    embedding_dim)`.
use_cache (:obj:`bool`, `optional`):
Whether or not to use the cached keys and values. If `False`, all intermediate keys and values are
discarded and recomputed on-the-fly.
output_attentions (:obj:`bool`, `optional`):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (:obj:`bool`, `optional`):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (:obj:`bool`, `optional`):
Whether or not to return a dictionary instead of a tuple of outputs.
Returns:
:class:`~transformers.modeling_outputs.BaseModelOutputWithPastAndCrossAttentions`: A BaseModelOutputWithPastAndCrossAttentions
object containing various elements depending on the configuration (e.g., hidden states, attentions, etc.).
"""
@add_start_docstrings(
"The bare Bloom Model transformer outputting raw hidden-states without any specific head on top.",
BLOOM_START_DOCSTRING,
)
class BloomModel(BloomPreTrainedModel):
def __init__(self, config: BloomConfig):
super().__init__(config)
self.embed_dim = config.hidden_size
self.num_heads = config.n_head
self.word_embeddings = nn.Embedding(config.vocab_size, self.embed_dim)
self.word_embeddings_layernorm = LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)
self.h = nn.ModuleList([BloomBlock(config) for _ in range(config.num_hidden_layers)])
self.ln_f = LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)
self.gradient_checkpointing = False
self.post_init()
def build_alibi_tensor(self, attention_mask: torch.Tensor, num_heads: int, dtype: torch.dtype) -> torch.Tensor:
"""
Build the ALiBi bias tensor by delegating to the module-level `build_alibi_tensor` function.
Args:
    attention_mask (:obj:`torch.Tensor`): Attention mask of shape `(batch_size, seq_length)`, with `1` for
        real tokens and `0` for padding.
    num_heads (:obj:`int`): Number of attention heads.
    dtype (:obj:`torch.dtype`): Data type of the output tensor.
Returns:
    :obj:`torch.Tensor`: The ALiBi tensor of shape `(batch_size * num_heads, 1, seq_length)`.
"""
return build_alibi_tensor(attention_mask, num_heads, dtype)
def get_input_embeddings(self):
"""
Retrieve the word embedding layer.
Returns:
:obj:`torch.nn.Embedding`: The word embedding layer.
"""
return self.word_embeddings
def set_input_embeddings(self, new_embeddings: torch.Tensor):
"""
Set new word embeddings for the model.
Args:
new_embeddings (:obj:`torch.Tensor`): New word embeddings to be set.
"""
self.word_embeddings = new_embeddings
@add_start_docstrings_to_model_forward(BLOOM_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPastAndCrossAttentions,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
attention_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.LongTensor] = None,
inputs_embeds: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
**deprecated_arguments,
):
"""
Perform a forward pass of the BloomModel.
Args:
input_ids (:obj:`torch.LongTensor`, `optional`):
Indices of input sequence tokens in the vocabulary.
past_key_values (:obj:`Tuple[Tuple[torch.Tensor, torch.Tensor], ...]`, `optional`):
Tuple of length `config.num_hidden_layers`, containing tuples (`key`, `value`) precomputed by the
self-attention blocks, which can be used to speed up sequential decoding.
attention_mask (:obj:`torch.Tensor`, `optional`):
Mask to avoid performing attention on padding token indices.
head_mask (:obj:`torch.LongTensor`, `optional`):
Mask to nullify selected heads of the self-attention modules.
inputs_embeds (:obj:`torch.FloatTensor`, `optional`):
Embedded representation of the inputs.
use_cache (:obj:`bool`, `optional`):
Whether or not to use the cached keys and values.
output_attentions (:obj:`bool`, `optional`):
Whether or not to return the attentions tensors of all attention layers.
output_hidden_states (:obj:`bool`, `optional`):
Whether or not to return the hidden states of all layers.
return_dict (:obj:`bool`, `optional`):
Whether or not to return a dictionary instead of a tuple of outputs.
Returns:
:class:`~transformers.modeling_outputs.BaseModelOutputWithPastAndCrossAttentions`: A BaseModelOutputWithPastAndCrossAttentions
object containing various elements depending on the configuration.
"""
pass
@add_start_docstrings(
"""
The Bloom Model transformer with a language modeling head on top (linear layer with weights tied to the input
embeddings).
""",
BLOOM_START_DOCSTRING,
)
class BloomForCausalLM(BloomPreTrainedModel):
_tied_weights_keys = ["lm_head.weight"]
def __init__(self, config: BloomConfig):
super().__init__(config)
self.transformer = BloomModel(config)
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
self.post_init()
def get_output_embeddings(self):
return self.lm_head
def set_output_embeddings(self, new_embeddings: torch.Tensor):
self.lm_head = new_embeddings
def prepare_inputs_for_generation(
self,
input_ids: torch.LongTensor,
past_key_values: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
**kwargs,
) -> dict:
if past_key_values is not None:
past_length = past_key_values[0][0].shape[2]
if input_ids.shape[1] > past_length:
remove_prefix_length = past_length
else:
remove_prefix_length = input_ids.shape[1] - 1
input_ids = input_ids[:, remove_prefix_length:]
if past_key_values[0][0].shape[0] == input_ids.shape[0]:
past_key_values = self._convert_to_bloom_cache(past_key_values)
if inputs_embeds is not None and past_key_values is None:
model_inputs = {"inputs_embeds": inputs_embeds}
else:
model_inputs = {"input_ids": input_ids}
model_inputs.update(
{
"past_key_values": past_key_values,
"use_cache": kwargs.get("use_cache"),
"attention_mask": attention_mask,
}
)
return model_inputs
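# Illustrative sketch (not part of the original source): following the implementation shown above, once a cache
# exists only the tokens not yet covered by `past_length` are kept (usually just the last one). The toy cache
# uses Bloom's fused key layout [batch * num_heads, head_dim, past_len], so shape[2] is the past length; the
# model sizes below are arbitrary.
toy_model = BloomForCausalLM(BloomConfig(vocab_size=100, hidden_size=16, n_layer=1, n_head=4))
toy_input_ids = torch.tensor([[5, 6, 7, 8]])
toy_past = ((torch.zeros(4, 4, 3), torch.zeros(4, 3, 4)),)            # past_length = 3
toy_inputs = toy_model.prepare_inputs_for_generation(toy_input_ids, past_key_values=toy_past)
print(toy_inputs["input_ids"])                                        # tensor([[8]])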
@add_start_docstrings_to_model_forward(BLOOM_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=CausalLMOutputWithCrossAttentions,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
attention_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
**deprecated_arguments,
) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
`labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
"""
if deprecated_arguments.pop("position_ids", False) is not False:
warnings.warn(
"`position_ids` have no functionality in BLOOM and will be removed in v5.0.0. You can safely ignore"
" passing `position_ids`.",
FutureWarning,
)
if len(deprecated_arguments) > 0:
raise ValueError(f"Got unexpected arguments: {deprecated_arguments}")
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
transformer_outputs = self.transformer(
input_ids,
past_key_values=past_key_values,
attention_mask=attention_mask,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
hidden_states = transformer_outputs[0]
lm_logits = self.lm_head(hidden_states)
loss = None
if labels is not None:
labels = labels.to(lm_logits.device)
shift_logits = lm_logits[..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous()
batch_size, seq_length, vocab_size = shift_logits.shape
loss_fct = CrossEntropyLoss()
loss = loss_fct(
shift_logits.view(batch_size * seq_length, vocab_size), shift_labels.view(batch_size * seq_length)
)
if not return_dict:
output = (lm_logits,) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output
return CausalLMOutputWithCrossAttentions(
loss=loss,
logits=lm_logits,
past_key_values=transformer_outputs.past_key_values,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
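# Illustrative sketch (not part of the original source): the loss above is computed on shifted tensors, so the
# logits at position t are scored against the label at position t + 1, which is why `labels = input_ids` works.
toy_logits = torch.randn(1, 4, 10)                        # [batch_size, seq_length, vocab_size]
toy_labels = torch.tensor([[1, 2, 3, 4]])
toy_shift_logits = toy_logits[..., :-1, :].contiguous()   # predictions for positions 0..2
toy_shift_labels = toy_labels[..., 1:].contiguous()       # targets are tokens 1..3
toy_loss = CrossEntropyLoss()(toy_shift_logits.view(-1, 10), toy_shift_labels.view(-1))
print(toy_loss.shape)                                     # torch.Size([]) -- a scalar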
def _reorder_cache(
self, past: Tuple[Tuple[torch.Tensor, torch.Tensor], ...], beam_idx: torch.LongTensor
) -> Tuple[Tuple[torch.Tensor, torch.Tensor], ...]:
"""
This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or
[`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct
beam_idx at every generation step.
Output shares the same memory storage as `past`.
"""
standardized_past = self._convert_to_standard_cache(past, batch_size=len(beam_idx))
device_to_beam_idx = {
past_state.device: beam_idx.to(past_state.device) for layer_past in past for past_state in layer_past
}
reordered_past = tuple(
(
layer_past[0].index_select(0, device_to_beam_idx[layer_past[0].device]),
layer_past[1].index_select(0, device_to_beam_idx[layer_past[0].device]),
)
for layer_past in standardized_past
)
return self._convert_to_bloom_cache(reordered_past)
@add_start_docstrings(
"""
Bloom Model with a token classification head on top (a linear layer on top of the hidden-states output), e.g. for
Named-Entity-Recognition (NER) tasks.
""",
BLOOM_START_DOCSTRING,
)
class BloomForTokenClassification(BloomPreTrainedModel):
def __init__(self, config: BloomConfig):
super().__init__(config)
self.num_labels = config.num_labels
self.transformer = BloomModel(config)
if hasattr(config, "classifier_dropout") and config.classifier_dropout is not None:
classifier_dropout = config.classifier_dropout
elif hasattr(config, "hidden_dropout") and config.hidden_dropout is not None:
classifier_dropout = config.hidden_dropout
else:
classifier_dropout = 0.1
self.dropout = nn.Dropout(classifier_dropout)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.post_init()
@add_start_docstrings_to_model_forward(BLOOM_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TokenClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
attention_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
**deprecated_arguments,
) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
if deprecated_arguments.pop("position_ids", False) is not False:
warnings.warn(
"`position_ids` have no functionality in BLOOM and will be removed in v5.0.0. You can safely ignore"
" passing `position_ids`.",
FutureWarning,
)
if len(deprecated_arguments) > 0:
raise ValueError(f"Got unexpected arguments: {deprecated_arguments}")
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
transformer_outputs = self.transformer(
input_ids,
past_key_values=past_key_values,
attention_mask=attention_mask,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
hidden_states = transformer_outputs[0]
hidden_states = self.dropout(hidden_states)
logits = self.classifier(hidden_states)
loss = None
if labels is not None:
labels = labels.to(logits.device)
batch_size, seq_length = labels.shape
loss_fct = CrossEntropyLoss()
loss = loss_fct(
logits.view(batch_size * seq_length, self.num_labels), labels.view(batch_size * seq_length)
)
if not return_dict:
output = (logits,) + transformer_outputs[2:]
return ((loss,) + output) if loss is not None else output
return TokenClassifierOutput(
loss=loss,
logits=logits,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
@add_start_docstrings(
"""
The BLOOM Model transformer with a span classification head on top for extractive question-answering tasks like
SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
BLOOM_START_DOCSTRING,
)
class BloomForQuestionAnswering(BloomPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.transformer = BloomModel(config)
self.qa_outputs = nn.Linear(config.hidden_size, 2)
self.post_init()
@add_start_docstrings_to_model_forward(BLOOM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
start_positions: Optional[torch.LongTensor] = None,
end_positions: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, QuestionAnsweringModelOutput]:
r"""
start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.transformer(
input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
logits = self.qa_outputs(sequence_output)
start_logits, end_logits = logits.split(1, dim=-1)
start_logits = start_logits.squeeze(-1).contiguous()
end_logits = end_logits.squeeze(-1).contiguous()
total_loss = None
if start_positions is not None and end_positions is not None:
if len(start_positions.size()) > 1:
start_positions = start_positions.squeeze(-1)
if len(end_positions.size()) > 1:
end_positions = end_positions.squeeze(-1)
ignored_index = start_logits.size(1)
start_positions = start_positions.clamp(0, ignored_index)
end_positions = end_positions.clamp(0, ignored_index)
loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
start_loss = loss_fct(start_logits, start_positions)
end_loss = loss_fct(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2
if not return_dict:
output = (start_logits, end_logits) + outputs[2:]
return ((total_loss,) + output) if total_loss is not None else output
return QuestionAnsweringModelOutput(
loss=total_loss,
start_logits=start_logits,
end_logits=end_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
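# Illustrative sketch (not part of the original source): turning start/end logits into a predicted answer span.
# Taking an argmax per side is a simplification; real decoding typically also enforces start <= end and a
# maximum span length.
toy_start_logits = torch.tensor([[0.1, 2.0, 0.3, 0.1]])
toy_end_logits = torch.tensor([[0.0, 0.2, 0.1, 1.5]])
print(int(toy_start_logits.argmax(dim=-1)), int(toy_end_logits.argmax(dim=-1)))  # 1 3 -> tokens 1..3 inclusive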