# Transformers source code walkthrough (part 19)
# File: .\models\blenderbot\modeling_tf_blenderbot.py
"""TF 2.0 Blenderbot model."""
from __future__ import annotations
import os
import random
import warnings
from typing import List, Optional, Tuple, Union
import tensorflow as tf
from ...activations_tf import get_tf_activation
from ...modeling_tf_outputs import (
TFBaseModelOutput,
TFBaseModelOutputWithPastAndCrossAttentions,
TFSeq2SeqLMOutput,
TFSeq2SeqModelOutput,
)
from ...modeling_tf_utils import (
TFCausalLanguageModelingLoss,
TFPreTrainedModel,
keras,
keras_serializable,
unpack_inputs,
)
from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax
from ...utils import (
add_code_sample_docstrings,
add_end_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_blenderbot import BlenderbotConfig
# Module-level logger, named after this module.
logger = logging.get_logger(__name__)
# Checkpoint and config names handed to the docstring decorators applied to the model `call` methods below.
_CHECKPOINT_FOR_DOC = "facebook/blenderbot-400M-distill"
_CONFIG_FOR_DOC = "BlenderbotConfig"
# Additive value written into masked attention positions by `_make_causal_mask` and `_expand_mask`.
LARGE_NEGATIVE = -1e8
def shift_tokens_right(input_ids: tf.Tensor, pad_token_id: int, decoder_start_token_id: int):
    """
    Shift `input_ids` one position to the right.

    The first column of the result is filled with `decoder_start_token_id`, the last token of every row is
    dropped, and any `-100` left in the shifted ids is replaced by `pad_token_id`. A runtime assertion checks
    that the returned ids are all `>= 0`.

    Args:
        input_ids: 2-D tensor of token ids (indexed as `[:, :-1]` below).
        pad_token_id: Id used to replace `-100` entries.
        decoder_start_token_id: Id placed in the first column.

    Returns:
        The shifted ids, with the same dtype as `input_ids`.
    """
    ids_dtype = input_ids.dtype
    pad_token_id = tf.cast(pad_token_id, ids_dtype)
    decoder_start_token_id = tf.cast(decoder_start_token_id, ids_dtype)

    batch_size = shape_list(input_ids)[0]
    first_column = tf.fill((batch_size, 1), tf.convert_to_tensor(decoder_start_token_id, ids_dtype))
    shifted = tf.concat([first_column, input_ids[:, :-1]], -1)

    # -100 is not a valid token id, so every occurrence is swapped for the pad token.
    pad_filler = tf.fill(shape_list(shifted), tf.convert_to_tensor(pad_token_id, ids_dtype))
    shifted = tf.where(shifted == -100, pad_filler, shifted)

    # Tie the non-negativity check to the returned tensor so it runs whenever the result is evaluated.
    non_negative = tf.debugging.assert_greater_equal(shifted, tf.constant(0, dtype=ids_dtype))
    with tf.control_dependencies([non_negative]):
        shifted = tf.identity(shifted)
    return shifted
def _make_causal_mask(input_ids_shape: tf.TensorShape, past_key_values_length: int = 0):
    """
    Build the additive causal mask.

    Entry `(i, j)` of the square mask is `0.0` when `j <= i` and `LARGE_NEGATIVE` otherwise. When
    `past_key_values_length > 0`, that many all-zero columns are prepended. The mask is then tiled over the
    batch, giving shape `[bsz, 1, tgt_len, tgt_len + past_key_values_length]`.
    """
    batch_size, target_len = input_ids_shape[0], input_ids_shape[1]
    causal = tf.ones((target_len, target_len)) * LARGE_NEGATIVE
    width = shape_list(causal)[-1]
    positions = tf.range(width)
    # Row i gets the limit i + 1, so columns 0..i compare as "visible" and are zeroed out.
    row_limits = tf.reshape(positions + 1, (width, 1))
    causal = tf.where(positions < row_limits, 0.0, causal)

    if past_key_values_length > 0:
        cached_columns = tf.zeros((target_len, past_key_values_length))
        causal = tf.concat([cached_columns, causal], axis=-1)

    return tf.tile(causal[None, None, :, :], (batch_size, 1, 1, 1))
def _expand_mask(mask: tf.Tensor, tgt_len: Optional[int] = None):
    """
    Expand an attention mask from `[bsz, seq_len]` to an additive `[bsz, 1, tgt_seq_len, src_seq_len]` mask.

    The mask is cast to float and repeated `tgt_len` times along a new target axis (`tgt_len` defaults to the
    source length). Positions where the input is 1 become 0, positions where it is 0 become `LARGE_NEGATIVE`.
    """
    source_len = shape_list(mask)[1]
    if tgt_len is None:
        tgt_len = source_len

    unit = tf.constant(1.0)
    float_mask = tf.cast(mask, dtype=unit.dtype)
    tiled = tf.tile(float_mask[:, None, None, :], (1, 1, tgt_len, 1))
    return (unit - tiled) * LARGE_NEGATIVE
class TFBlenderbotLearnedPositionalEmbedding(keras.layers.Embedding):
    """
    Learned positional embeddings, stored in an embedding table with `num_embeddings` rows.
    """

    def __init__(self, num_embeddings: int, embedding_dim: int, **kwargs):
        super().__init__(num_embeddings, embedding_dim, **kwargs)

    def call(
        self, input_shape: tf.TensorShape, past_key_values_length: int = 0, position_ids: tf.Tensor | None = None
    ):
        """
        Look up position embeddings.

        `input_shape` is expected to describe a `[bsz x seqlen]` input. When `position_ids` is not given, the
        positions `past_key_values_length .. past_key_values_length + seqlen - 1` are used; otherwise the
        supplied `position_ids` are looked up as-is.
        """
        if position_ids is None:
            current_positions = tf.range(input_shape[1], delta=1, name="range")
            position_ids = current_positions + past_key_values_length
        return super().call(tf.cast(position_ids, dtype=tf.int32))
class TFBlenderbotAttention(keras.layers.Layer):
"""Multi-headed attention from "Attention Is All You Need"
多头注意力机制,源自于《Attention Is All You Need》"""
def __init__(
    self,
    embed_dim: int,
    num_heads: int,
    dropout: float = 0.0,
    is_decoder: bool = False,
    bias: bool = True,
    **kwargs,
):
    """
    Set up the attention layer.

    Args:
        embed_dim: Model width; must be divisible by `num_heads`.
        num_heads: Number of attention heads.
        dropout: Rate of the dropout layer stored on `self.dropout`.
        is_decoder: Stored on `self.is_decoder`.
        bias: Whether the four projection layers use a bias term.

    Raises:
        ValueError: If `embed_dim` is not a multiple of `num_heads`.
    """
    super().__init__(**kwargs)
    self.embed_dim = embed_dim
    self.num_heads = num_heads
    self.dropout = keras.layers.Dropout(dropout)

    head_dim, leftover = divmod(embed_dim, num_heads)
    self.head_dim = head_dim
    if leftover != 0:
        raise ValueError(
            f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
            f" and `num_heads`: {num_heads})."
        )
    self.scaling = self.head_dim**-0.5
    self.is_decoder = is_decoder

    # Key, query, value and output projections all map to `embed_dim` and are named after their attribute.
    for projection_name in ("k_proj", "q_proj", "v_proj", "out_proj"):
        setattr(self, projection_name, keras.layers.Dense(embed_dim, use_bias=bias, name=projection_name))
def _shape(self, tensor: tf.Tensor, seq_len: int, bsz: int):
    """Reshape `tensor` to `(bsz, seq_len, num_heads, head_dim)` and swap axes 1 and 2."""
    per_head = tf.reshape(tensor, (bsz, seq_len, self.num_heads, self.head_dim))
    return tf.transpose(per_head, (0, 2, 1, 3))
# Forward pass of the attention layer.
# NOTE(review): only the signature is present in this excerpt; the method body is missing, so the handling of
# these arguments cannot be confirmed from here.
def call(
self,
hidden_states: tf.Tensor,
key_value_states: tf.Tensor | None = None,
past_key_value: Tuple[Tuple[tf.Tensor]] | None = None,
attention_mask: tf.Tensor | None = None,
layer_head_mask: tf.Tensor | None = None,
training: Optional[bool] = False,
):
def build(self, input_shape=None):
    """
    Build the four projection layers.

    Does nothing when the layer is already built. Each projection that exists is built for inputs whose last
    dimension is `embed_dim`, inside a name scope carrying the projection's own name.
    """
    if self.built:
        return
    self.built = True

    for attr_name in ("k_proj", "q_proj", "v_proj", "out_proj"):
        projection = getattr(self, attr_name, None)
        if projection is None:
            continue
        with tf.name_scope(projection.name):
            projection.build([None, None, self.embed_dim])
class TFBlenderbotEncoderLayer(keras.layers.Layer):
    # One encoder block: layer norm -> self-attention -> residual, then layer norm -> feed-forward -> residual.

    def __init__(self, config: BlenderbotConfig, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = config.d_model
        self.self_attn = TFBlenderbotAttention(
            self.embed_dim,
            config.encoder_attention_heads,
            dropout=config.attention_dropout,
            name="self_attn",
        )
        self.self_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm")
        self.dropout = keras.layers.Dropout(config.dropout)
        self.activation_fn = get_tf_activation(config.activation_function)
        self.activation_dropout = keras.layers.Dropout(config.activation_dropout)
        self.fc1 = keras.layers.Dense(config.encoder_ffn_dim, name="fc1")
        self.fc2 = keras.layers.Dense(self.embed_dim, name="fc2")
        self.final_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")
        self.config = config

    def call(
        self,
        hidden_states: tf.Tensor,
        attention_mask: tf.Tensor,
        layer_head_mask: tf.Tensor,
        training: Optional[bool] = False,
    ):
        """
        Args:
            hidden_states (`tf.Tensor`): input to the layer of shape *(batch, seq_len, embed_dim)*
            attention_mask (`tf.Tensor`): attention mask of size *(batch, 1, tgt_len, src_len)* where padding
                elements are indicated by very large negative values.
            layer_head_mask (`tf.Tensor`): mask for attention heads in a given layer of size
                *(encoder_attention_heads,)*

        Returns:
            A tuple `(hidden_states, self_attn_weights)`.
        """
        # Self-attention sub-block; the layer norm is applied before attention.
        residual = hidden_states
        hidden_states, self_attn_weights, _ = self.self_attn(
            hidden_states=self.self_attn_layer_norm(hidden_states),
            attention_mask=attention_mask,
            layer_head_mask=layer_head_mask,
        )
        tf.debugging.assert_equal(
            shape_list(hidden_states),
            shape_list(residual),
            message=f"Self attn modified the shape of query {shape_list(residual)} to {shape_list(hidden_states)}",
        )
        hidden_states = residual + self.dropout(hidden_states, training=training)

        # Feed-forward sub-block, again with the layer norm first.
        residual = hidden_states
        ffn_states = self.fc1(self.final_layer_norm(hidden_states))
        ffn_states = self.activation_dropout(self.activation_fn(ffn_states), training=training)
        ffn_states = self.dropout(self.fc2(ffn_states), training=training)
        return residual + ffn_states, self_attn_weights

    def build(self, input_shape=None):
        # Build every sub-layer that exists, each inside a name scope carrying its own name.
        if self.built:
            return
        self.built = True

        build_plan = (
            ("self_attn", None),
            ("self_attn_layer_norm", [None, None, self.embed_dim]),
            ("fc1", [None, None, self.embed_dim]),
            # fc2 consumes the output of fc1, whose width is the feed-forward dimension.
            ("fc2", [None, None, self.config.encoder_ffn_dim]),
            ("final_layer_norm", [None, None, self.embed_dim]),
        )
        for attr_name, build_shape in build_plan:
            sublayer = getattr(self, attr_name, None)
            if sublayer is None:
                continue
            with tf.name_scope(sublayer.name):
                sublayer.build(build_shape)
class TFBlenderbotDecoderLayer(keras.layers.Layer):
def __init__(self, config: BlenderbotConfig, **kwargs):
    """Create the self-attention, cross-attention and feed-forward sub-layers from `config`."""
    super().__init__(**kwargs)
    self.embed_dim = config.d_model

    self.self_attn = TFBlenderbotAttention(
        embed_dim=self.embed_dim,
        num_heads=config.decoder_attention_heads,
        dropout=config.attention_dropout,
        is_decoder=True,
        name="self_attn",
    )
    self.dropout = keras.layers.Dropout(config.dropout)
    self.activation_fn = get_tf_activation(config.activation_function)
    self.activation_dropout = keras.layers.Dropout(config.activation_dropout)
    self.self_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm")

    self.encoder_attn = TFBlenderbotAttention(
        embed_dim=self.embed_dim,
        num_heads=config.decoder_attention_heads,
        dropout=config.attention_dropout,
        is_decoder=True,
        name="encoder_attn",
    )
    self.encoder_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="encoder_attn_layer_norm")

    self.fc1 = keras.layers.Dense(config.decoder_ffn_dim, name="fc1")
    self.fc2 = keras.layers.Dense(self.embed_dim, name="fc2")
    self.final_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")
    self.config = config
# Forward pass of the decoder layer.
# NOTE(review): this excerpt shows only part of the signature; the closing `):` and the whole method body are
# missing, so the handling of these arguments cannot be confirmed from here.
def call(
self,
hidden_states: tf.Tensor,
attention_mask: tf.Tensor | None = None,
encoder_hidden_states: tf.Tensor | None = None,
encoder_attention_mask: tf.Tensor | None = None,
layer_head_mask: tf.Tensor | None = None,
cross_attn_layer_head_mask: tf.Tensor | None = None,
past_key_value: Tuple[tf.Tensor] | None = None,
training: Optional[bool] = False,
def build(self, input_shape=None):
    """
    Build every sub-layer of the decoder layer.

    Does nothing when the layer is already built. Each sub-layer that exists is built inside a name scope
    carrying its own name.
    """
    if self.built:
        return
    self.built = True

    build_plan = (
        ("self_attn", None),
        ("self_attn_layer_norm", [None, None, self.embed_dim]),
        ("encoder_attn", None),
        ("encoder_attn_layer_norm", [None, None, self.embed_dim]),
        ("fc1", [None, None, self.embed_dim]),
        # fc2 consumes the output of fc1, whose width is the feed-forward dimension.
        ("fc2", [None, None, self.config.decoder_ffn_dim]),
        ("final_layer_norm", [None, None, self.embed_dim]),
    )
    for attr_name, build_shape in build_plan:
        sublayer = getattr(self, attr_name, None)
        if sublayer is None:
            continue
        with tf.name_scope(sublayer.name):
            sublayer.build(build_shape)
class TFBlenderbotPreTrainedModel(TFPreTrainedModel):
    # Base class of the Blenderbot TF models in this file.
    # Prefix of the attribute that holds the main layer (the subclasses below store it as `self.model`).
    base_model_prefix = "model"
    # Configuration class associated with these models.
    config_class = BlenderbotConfig
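# NOTE(review): the opening of this constant was missing from the excerpt (only the tail of the example and the
# closing quotes were present). The assignment and the model/tokenizer setup lines are reconstructed so that the
# name used by `add_end_docstrings` further down exists; confirm against the upstream file.
# NOTE(review): `BLENDERBOT_START_DOCSTRING` is referenced further down but is not defined in this excerpt.
BLENDERBOT_GENERATION_EXAMPLE = r"""
Conversation example:
>>> from transformers import AutoTokenizer, TFBlenderbotForConditionalGeneration
>>> mname = "facebook/blenderbot-400M-distill"
>>> model = TFBlenderbotForConditionalGeneration.from_pretrained(mname)
>>> tokenizer = AutoTokenizer.from_pretrained(mname)
>>> UTTERANCE = "My friends are cool but they eat too many carbs."
>>> print("Human: ", UTTERANCE)
>>> inputs = tokenizer([UTTERANCE], return_tensors="tf")
>>> reply_ids = model.generate(**inputs)
>>> print("Bot: ", tokenizer.batch_decode(reply_ids, skip_special_tokens=True)[0])
>>> REPLY = "I'm not sure"
>>> print("Human: ", REPLY)
>>> NEXT_UTTERANCE = (
...     "My friends are cool but they eat too many carbs.</s> <s>That's unfortunate. "
...     "Are they trying to lose weight or are they just trying to be healthier?</s> "
...     "<s> I'm not sure."
... )
>>> inputs = tokenizer([NEXT_UTTERANCE], return_tensors="tf")
>>> next_reply_ids = model.generate(**inputs)
>>> print("Bot: ", tokenizer.batch_decode(next_reply_ids, skip_special_tokens=True)[0])
"""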
# Docstring template for the `call` arguments, attached through `add_start_docstrings_to_model_forward`.
# It is empty in this excerpt; `TFBlenderbotModel.call` applies `.format(...)` to it before use.
BLENDERBOT_INPUTS_DOCSTRING = r"""
"""
@keras_serializable
class TFBlenderbotEncoder(keras.layers.Layer):
    """
    Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
    [`TFBlenderbotEncoderLayer`].

    Args:
        config: BlenderbotConfig
    """

    # The docstring above used to sit after this attribute, which made it a plain string expression
    # rather than the class docstring.
    config_class = BlenderbotConfig

    def __init__(self, config: BlenderbotConfig, embed_tokens: Optional[keras.layers.Embedding] = None, **kwargs):
        super().__init__(**kwargs)
        self.config = config
        # Dropout layer using the rate from the config.
        self.dropout = keras.layers.Dropout(config.dropout)
        # Layer-drop rate for the encoder layers, read from the config.
        self.layerdrop = config.encoder_layerdrop
        # Id of the padding token.
        self.padding_idx = config.pad_token_id
        # Maximum number of source positions.
        self.max_source_positions = config.max_position_embeddings
        # Embedding scale: sqrt(d_model) when `scale_embedding` is enabled, otherwise 1.0.
        self.embed_scale = tf.math.sqrt(float(config.d_model)) if config.scale_embedding else 1.0
        # Token embedding layer (may be shared with the decoder by the caller).
        self.embed_tokens = embed_tokens
        # Learned positional embeddings.
        self.embed_positions = TFBlenderbotLearnedPositionalEmbedding(
            config.max_position_embeddings,
            config.d_model,
            name="embed_positions",
        )
        # Stack of encoder layers, named `layers.0`, `layers.1`, ...
        self.layers = [TFBlenderbotEncoderLayer(config, name=f"layers.{i}") for i in range(config.encoder_layers)]
        # Layer normalization stored as `layer_norm`.
        self.layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="layer_norm")

    def get_embed_tokens(self):
        """Return the current token embedding layer."""
        return self.embed_tokens

    def set_embed_tokens(self, embed_tokens):
        """Replace the token embedding layer."""
        self.embed_tokens = embed_tokens

    @unpack_inputs
    def call(
        self,
        input_ids=None,
        inputs_embeds=None,
        attention_mask=None,
        head_mask=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        training=False,
    ):
        """
        Forward pass of the Transformer encoder.

        Args:
            input_ids: Input token ids.
            inputs_embeds: Embedded inputs supplied instead of `input_ids`.
            attention_mask: Attention mask indicating which positions should and should not be attended to.
            head_mask: Mask for the attention heads.
            output_attentions: Whether to output attention weights.
            output_hidden_states: Whether to output all hidden states.
            return_dict: Whether to return a dict-style output.
            training: Whether the layer runs in training mode.

        Returns:
            Output whose format depends on the configuration.
        """
        # NOTE(review): the forward-pass implementation is not included in this excerpt.

    def build(self, input_shape=None):
        """Build the positional embeddings, the layer norm and every encoder layer (no-op if already built)."""
        if self.built:
            return
        self.built = True
        if getattr(self, "embed_positions", None) is not None:
            with tf.name_scope(self.embed_positions.name):
                self.embed_positions.build(None)
        if getattr(self, "layer_norm", None) is not None:
            with tf.name_scope(self.layer_norm.name):
                self.layer_norm.build([None, None, self.config.d_model])
        if getattr(self, "layers", None) is not None:
            for layer in self.layers:
                with tf.name_scope(layer.name):
                    layer.build(None)
@keras_serializable
class TFBlenderbotDecoder(keras.layers.Layer):
config_class = BlenderbotConfig
"""
Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`TFBlenderbotDecoderLayer`]
Args:
config: BlenderbotConfig
embed_tokens: output embedding
"""
# 初始化方法,用于创建一个新的TFBlenderbotDecoder对象
def __init__(self, config: BlenderbotConfig, embed_tokens: Optional[keras.layers.Embedding] = None, **kwargs):
    """
    Create the decoder sub-layers from `config`.

    Args:
        config: Model configuration.
        embed_tokens: Token embedding layer stored on `self.embed_tokens`.
    """
    super().__init__(**kwargs)
    self.config = config
    self.padding_idx = config.pad_token_id
    self.embed_tokens = embed_tokens
    self.layerdrop = config.decoder_layerdrop

    model_dim = config.d_model
    self.embed_positions = TFBlenderbotLearnedPositionalEmbedding(
        config.max_position_embeddings, model_dim, name="embed_positions"
    )
    # sqrt(d_model) when `scale_embedding` is enabled, otherwise 1.0.
    self.embed_scale = tf.math.sqrt(float(model_dim)) if config.scale_embedding else 1.0

    # One decoder layer per configured layer, named `layers.0`, `layers.1`, ...
    self.layers = [
        TFBlenderbotDecoderLayer(config, name=f"layers.{layer_idx}") for layer_idx in range(config.decoder_layers)
    ]
    self.layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="layer_norm")
    self.dropout = keras.layers.Dropout(config.dropout)
# 获取嵌入层对象的方法
def get_embed_tokens(self):
    """Return the token embedding layer currently held by the decoder."""
    current_embeddings = self.embed_tokens
    return current_embeddings
# 设置嵌入层对象的方法
def set_embed_tokens(self, embed_tokens):
    """Replace the token embedding layer held by the decoder."""
    setattr(self, "embed_tokens", embed_tokens)
# 使用@unpack_inputs装饰器标记的调用方法,定义了Blenderbot解码器的前向传播逻辑
# Forward pass of the Blenderbot decoder; `unpack_inputs` wraps the method.
@unpack_inputs
def call(
self,
input_ids=None,
inputs_embeds=None,
attention_mask=None,
position_ids=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
head_mask=None,
cross_attn_head_mask=None,
past_key_values=None,
use_cache=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
training=False,
):
# The concrete forward-pass implementation is omitted from this excerpt; it implements the decoder logic
# according to the arguments above.
# 构建方法,在第一次调用call方法时被调用,用于构建模型的层次结构
def build(self, input_shape=None):
    """
    Build the positional embeddings, the layer norm and every decoder layer.

    Does nothing when the layer is already built. Each sub-layer is built inside a name scope carrying its
    own name.
    """
    if self.built:
        return
    self.built = True

    position_embeddings = getattr(self, "embed_positions", None)
    if position_embeddings is not None:
        with tf.name_scope(position_embeddings.name):
            position_embeddings.build(None)

    final_norm = getattr(self, "layer_norm", None)
    if final_norm is not None:
        with tf.name_scope(final_norm.name):
            final_norm.build([None, None, self.config.d_model])

    decoder_layers = getattr(self, "layers", None)
    if decoder_layers is not None:
        for decoder_layer in decoder_layers:
            with tf.name_scope(decoder_layer.name):
                decoder_layer.build(None)
# 使用装饰器将类标记为可序列化,适用于Keras
@keras_serializable
class TFBlenderbotMainLayer(keras.layers.Layer):
    # Encoder-decoder core: one embedding layer shared by a TFBlenderbotEncoder and a TFBlenderbotDecoder.
    config_class = BlenderbotConfig

    def __init__(self, config: BlenderbotConfig, **kwargs):
        super().__init__(**kwargs)
        self.config = config

        shared_initializer = keras.initializers.TruncatedNormal(stddev=self.config.init_std)
        self.shared = keras.layers.Embedding(
            input_dim=config.vocab_size,
            output_dim=config.d_model,
            embeddings_initializer=shared_initializer,
            name="model.shared",
        )
        # Extra attribute giving the name scope expected for this layer's weights (read again in `build`).
        self.shared.load_weight_prefix = "model.shared"

        # Encoder and decoder receive the same embedding layer object.
        self.encoder = TFBlenderbotEncoder(config, self.shared, name="encoder")
        self.decoder = TFBlenderbotDecoder(config, self.shared, name="decoder")

    def get_input_embeddings(self):
        # The shared embedding layer doubles as the input embeddings.
        return self.shared

    def set_input_embeddings(self, new_embeddings):
        # Swap the shared layer and keep encoder and decoder pointing at the new one.
        self.shared = new_embeddings
        self.encoder.embed_tokens = self.shared
        self.decoder.embed_tokens = self.shared

    @unpack_inputs
    def call(
        self,
        input_ids=None,
        attention_mask=None,
        decoder_input_ids=None,
        decoder_attention_mask=None,
        decoder_position_ids=None,
        head_mask=None,
        decoder_head_mask=None,
        cross_attn_head_mask=None,
        encoder_outputs: Optional[Union[Tuple, TFBaseModelOutput]] = None,
        past_key_values=None,
        inputs_embeds=None,
        decoder_inputs_embeds=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        training=False,
        **kwargs,
    ):
        # Fall back to the config default when the caller did not choose.
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states

        if encoder_outputs is None:
            # No pre-computed encoder outputs: run the encoder.
            encoder_outputs = self.encoder(
                input_ids=input_ids,
                attention_mask=attention_mask,
                head_mask=head_mask,
                inputs_embeds=inputs_embeds,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
                training=training,
            )
        elif return_dict:
            # A plain sequence was passed but a dict-style result is wanted: wrap it.
            if not isinstance(encoder_outputs, TFBaseModelOutput):
                n_items = len(encoder_outputs)
                encoder_outputs = TFBaseModelOutput(
                    last_hidden_state=encoder_outputs[0],
                    hidden_states=encoder_outputs[1] if n_items > 1 else None,
                    attentions=encoder_outputs[2] if n_items > 2 else None,
                )
        elif not isinstance(encoder_outputs, tuple):
            # A non-tuple output was passed but a tuple result is wanted: unwrap it.
            encoder_outputs = encoder_outputs.to_tuple()

        decoder_outputs = self.decoder(
            decoder_input_ids,
            attention_mask=decoder_attention_mask,
            position_ids=decoder_position_ids,
            encoder_hidden_states=encoder_outputs[0],
            encoder_attention_mask=attention_mask,
            head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            past_key_values=past_key_values,
            inputs_embeds=decoder_inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        if return_dict:
            return TFSeq2SeqModelOutput(
                last_hidden_state=decoder_outputs.last_hidden_state,
                past_key_values=decoder_outputs.past_key_values,
                decoder_hidden_states=decoder_outputs.hidden_states,
                decoder_attentions=decoder_outputs.attentions,
                cross_attentions=decoder_outputs.cross_attentions,
                encoder_last_hidden_state=encoder_outputs.last_hidden_state,
                encoder_hidden_states=encoder_outputs.hidden_states,
                encoder_attentions=encoder_outputs.attentions,
            )
        # Tuple mode: decoder outputs followed by encoder outputs.
        return decoder_outputs + encoder_outputs

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True

        # The shared/tied weights are expected in the model base namespace. Ending the name scope with "/"
        # (but not starting it with one!) puts it in the root namespace rather than the current one.
        shared_scope = f"{self.shared.load_weight_prefix}/{self.shared.name}/"
        with tf.name_scope(shared_scope):
            self.shared.build(None)

        for attr_name in ("encoder", "decoder"):
            sublayer = getattr(self, attr_name, None)
            if sublayer is None:
                continue
            with tf.name_scope(sublayer.name):
                sublayer.build(None)
# 添加模型的文档字符串,说明这是一个输出原始隐藏状态的 BLENDERBOT 模型,没有特定的输出头部分
@add_start_docstrings(
    "The bare BLENDERBOT Model outputting raw hidden-states without any specific head on top.",
    BLENDERBOT_START_DOCSTRING,
)
class TFBlenderbotModel(TFBlenderbotPreTrainedModel):
    def __init__(self, config: BlenderbotConfig, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        # The main layer holds the shared embeddings, the encoder and the decoder.
        self.model = TFBlenderbotMainLayer(config, name="model")

    def get_encoder(self):
        # Encoder of the wrapped main layer.
        return self.model.encoder

    def get_decoder(self):
        # Decoder of the wrapped main layer.
        return self.model.decoder

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], *model_args, **kwargs):
        # The deprecated checkpoint name is served by the BlenderbotSmall implementation. Only the exact
        # string "facebook/blenderbot-90M" takes this branch.
        if pretrained_model_name_or_path == "facebook/blenderbot-90M":
            from ..blenderbot_small import TFBlenderbotSmallModel

            warnings.warn(
                "The checkpoint `facebook/blenderbot-90M` is deprecated. In the future, please use the identical"
                " checkpoint `facebook/small_blenderbot-90M` with"
                " `TFBlenderbotSmallForConditionalGeneration.from_pretrained('facebook/small_blenderbot-90M')`"
                " instead.",
                FutureWarning,
            )
            # Forward the caller's extra arguments; they used to be dropped silently on this branch.
            return TFBlenderbotSmallModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)

        return super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)

    @unpack_inputs
    @add_start_docstrings_to_model_forward(BLENDERBOT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TFSeq2SeqModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def call(
        self,
        input_ids: tf.Tensor | None = None,
        attention_mask: tf.Tensor | None = None,
        decoder_input_ids: tf.Tensor | None = None,
        decoder_attention_mask: tf.Tensor | None = None,
        decoder_position_ids: tf.Tensor | None = None,
        head_mask: tf.Tensor | None = None,
        decoder_head_mask: tf.Tensor | None = None,
        cross_attn_head_mask: tf.Tensor | None = None,
        encoder_outputs: Optional[Union[Tuple, TFBaseModelOutput]] = None,
        past_key_values: List[tf.Tensor] | None = None,
        inputs_embeds: tf.Tensor | None = None,
        decoder_inputs_embeds: tf.Tensor | None = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: Optional[bool] = False,
        **kwargs,
    ) -> Union[Tuple[tf.Tensor], TFSeq2SeqModelOutput]:
        # No docstring here on purpose: the decorators above generate it.
        # Pure delegation to the main layer; every argument is passed through by keyword.
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            decoder_position_ids=decoder_position_ids,
            head_mask=head_mask,
            decoder_head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            encoder_outputs=encoder_outputs,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            decoder_inputs_embeds=decoder_inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
        return outputs

    # Copied from transformers.models.bart.modeling_tf_bart.TFBartModel.serving_output
    def serving_output(self, output):
        # Each optional field is converted only when the matching config flag is set; otherwise it is None.
        pkv = tf.tuple(output.past_key_values)[1] if self.config.use_cache else None
        dec_hs = tf.convert_to_tensor(output.decoder_hidden_states) if self.config.output_hidden_states else None
        dec_attns = tf.convert_to_tensor(output.decoder_attentions) if self.config.output_attentions else None
        cross_attns = tf.convert_to_tensor(output.cross_attentions) if self.config.output_attentions else None
        enc_hs = tf.convert_to_tensor(output.encoder_hidden_states) if self.config.output_hidden_states else None
        enc_attns = tf.convert_to_tensor(output.encoder_attentions) if self.config.output_attentions else None

        return TFSeq2SeqModelOutput(
            last_hidden_state=output.last_hidden_state,
            past_key_values=pkv,
            decoder_hidden_states=dec_hs,
            decoder_attentions=dec_attns,
            cross_attentions=cross_attns,
            encoder_last_hidden_state=output.encoder_last_hidden_state,
            encoder_hidden_states=enc_hs,
            encoder_attentions=enc_attns,
        )

    def build(self, input_shape=None):
        # Build the main layer once, inside a name scope carrying its own name.
        if self.built:
            return
        self.built = True
        if getattr(self, "model", None) is not None:
            with tf.name_scope(self.model.name):
                self.model.build(None)
# Copied from transformers.models.bart.modeling_tf_bart.BiasLayer
class BiasLayer(keras.layers.Layer):
    """
    Bias as a layer. It is used for serialization purposes: `keras.Model.save_weights` stores on a per-layer basis,
    so all weights have to be registered in a layer.
    """

    def __init__(self, shape, initializer, trainable, name, **kwargs):
        super().__init__(name=name, **kwargs)
        # Note: the name of this variable will NOT be scoped when serialized, i.e. it will not be in the format of
        # "outer_layer/inner_layer/.../name:0". Instead, it will be "name:0". For further details, see:
        # https://github.com/huggingface/transformers/pull/18833#issuecomment-1233090214
        bias_weight = self.add_weight(
            name=name,
            shape=shape,
            initializer=initializer,
            trainable=trainable,
        )
        # Registering the bias as a layer weight makes it part of what the layer saves and loads.
        self.bias = bias_weight

    def call(self, x):
        """Return `x` with the bias added."""
        biased = x + self.bias
        return biased
@add_start_docstrings(
"The BLENDERBOT Model with a language modeling head. Can be used for summarization.",
BLENDERBOT_START_DOCSTRING,
)
class TFBlenderbotForConditionalGeneration(TFBlenderbotPreTrainedModel, TFCausalLanguageModelingLoss):
_keys_to_ignore_on_load_unexpected = [
r"model.encoder.embed_tokens.weight",
r"model.decoder.embed_tokens.weight",
]
def __init__(self, config, *inputs, **kwargs):
    """Create the main layer and the final logits bias from `config`."""
    super().__init__(config, *inputs, **kwargs)
    # Core encoder-decoder layer, registered under the name "model".
    self.model = TFBlenderbotMainLayer(config, name="model")
    self.use_cache = config.use_cache
    # final_logits_bias is registered as a buffer in pytorch, so not trainable for the sake of consistency.
    bias_shape = [1, config.vocab_size]
    self.bias_layer = BiasLayer(
        name="final_logits_bias",
        shape=bias_shape,
        initializer="zeros",
        trainable=False,
    )
def get_decoder(self):
    """Return the decoder of the wrapped main layer."""
    main_layer = self.model
    return main_layer.decoder
def get_encoder(self):
    """Return the encoder of the wrapped main layer."""
    main_layer = self.model
    return main_layer.encoder
def get_output_embeddings(self):
    """Return the output embeddings, which are whatever `get_input_embeddings()` returns."""
    output_embeddings = self.get_input_embeddings()
    return output_embeddings
def set_output_embeddings(self, value):
    """Set the output embeddings by delegating to `set_input_embeddings`."""
    assign_embeddings = self.set_input_embeddings
    assign_embeddings(value)
def get_bias(self):
    """Return the current bias as a dict with the single key "final_logits_bias"."""
    current_bias = self.bias_layer.bias
    return {"final_logits_bias": current_bias}
def set_bias(self, value):
    """
    Replace the bias layer with one sized for `value["final_logits_bias"]` and load that value into it.

    Args:
        value: Mapping holding the new bias under the key "final_logits_bias"; its last dimension gives the
            vocabulary size of the new layer.
    """
    new_bias = value["final_logits_bias"]
    vocab_size = new_bias.shape[-1]
    # Replaces the existing layer containing bias for correct (de)serialization.
    replacement = BiasLayer(
        name="final_logits_bias",
        shape=[1, vocab_size],
        initializer="zeros",
        trainable=False,
    )
    self.bias_layer = replacement
    self.bias_layer.bias.assign(new_bias)
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], *model_args, **kwargs):
    """
    Load a pretrained model, redirecting the deprecated `facebook/blenderbot-90M` checkpoint.

    When `pretrained_model_name_or_path` is exactly `"facebook/blenderbot-90M"`, a `FutureWarning` is emitted
    and the checkpoint is loaded with `TFBlenderbotSmallForConditionalGeneration`. Every other value goes to
    the parent implementation. `*model_args` and `**kwargs` are forwarded in both cases.
    """
    if pretrained_model_name_or_path == "facebook/blenderbot-90M":
        from ..blenderbot_small import TFBlenderbotSmallForConditionalGeneration

        warnings.warn(
            "The checkpoint `facebook/blenderbot-90M` is deprecated. In the future, please use the identical"
            " checkpoint `facebook/small_blenderbot-90M` with"
            " `TFBlenderbotSmallForConditionalGeneration.from_pretrained('facebook/small_blenderbot-90M')`"
            " instead.",
            FutureWarning,
        )
        # Forward the caller's extra arguments; they used to be dropped silently on this branch.
        return TFBlenderbotSmallForConditionalGeneration.from_pretrained(
            pretrained_model_name_or_path, *model_args, **kwargs
        )

    return super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
# 将装饰器应用于 call 方法,以添加模型输入和输出的文档字符串
@unpack_inputs
@add_start_docstrings_to_model_forward(BLENDERBOT_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=TFSeq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
@add_end_docstrings(BLENDERBOT_GENERATION_EXAMPLE)
def call(
    self,
    input_ids: tf.Tensor | None = None,
    attention_mask: tf.Tensor | None = None,
    decoder_input_ids: tf.Tensor | None = None,
    decoder_attention_mask: tf.Tensor | None = None,
    decoder_position_ids: tf.Tensor | None = None,
    head_mask: tf.Tensor | None = None,
    decoder_head_mask: tf.Tensor | None = None,
    cross_attn_head_mask: tf.Tensor | None = None,
    encoder_outputs: Optional[Union[Tuple, TFBaseModelOutput]] = None,
    past_key_values: List[tf.Tensor] | None = None,
    inputs_embeds: tf.Tensor | None = None,
    decoder_inputs_embeds: tf.Tensor | None = None,
    use_cache: Optional[bool] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
    labels: tf.Tensor | None = None,
    training: Optional[bool] = False,
) -> Union[Tuple[tf.Tensor], TFSeq2SeqLMOutput]:
    r"""
    labels (`tf.tensor` of shape `(batch_size, sequence_length)`, *optional*):
        Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
        config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
        (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
    Returns:
    """
    if labels is not None:
        # Pad positions in the labels are turned into -100; all other labels are kept.
        ignore_fill = tf.cast(tf.fill(shape_list(labels), -100), labels.dtype)
        labels = tf.where(labels == self.config.pad_token_id, ignore_fill, labels)
        use_cache = False
        # Without explicit decoder inputs, derive them from the labels.
        if decoder_input_ids is None and decoder_inputs_embeds is None:
            decoder_input_ids = shift_tokens_right(
                labels, self.config.pad_token_id, self.config.decoder_start_token_id
            )

    outputs = self.model(
        input_ids,
        attention_mask=attention_mask,
        decoder_input_ids=decoder_input_ids,
        encoder_outputs=encoder_outputs,
        decoder_attention_mask=decoder_attention_mask,
        decoder_position_ids=decoder_position_ids,
        head_mask=head_mask,
        decoder_head_mask=decoder_head_mask,
        cross_attn_head_mask=cross_attn_head_mask,
        past_key_values=past_key_values,
        inputs_embeds=inputs_embeds,
        decoder_inputs_embeds=decoder_inputs_embeds,
        use_cache=use_cache,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
        training=training,
    )

    # Project the decoder output with the shared embedding weights, then add the final logits bias.
    lm_logits = tf.matmul(outputs[0], self.model.shared.weights, transpose_b=True)
    lm_logits = self.bias_layer(lm_logits)

    if labels is None:
        masked_lm_loss = None
    else:
        masked_lm_loss = self.hf_compute_loss(labels, lm_logits)

    if not return_dict:
        # Tuple mode: logits replace the first model output; the loss is prepended when there is one.
        output = (lm_logits,) + outputs[1:]
        if masked_lm_loss is not None:
            return (masked_lm_loss,) + output
        return output

    return TFSeq2SeqLMOutput(
        loss=masked_lm_loss,
        logits=lm_logits,
        past_key_values=outputs.past_key_values,  # index 1 of d outputs
        decoder_hidden_states=outputs.decoder_hidden_states,  # index 2 of d outputs
        decoder_attentions=outputs.decoder_attentions,  # index 3 of d outputs
        cross_attentions=outputs.cross_attentions,  # index 4 of d outputs
        encoder_last_hidden_state=outputs.encoder_last_hidden_state,  # index 0 of encoder outputs
        encoder_hidden_states=outputs.encoder_hidden_states,  # 1 of e out
        encoder_attentions=outputs.encoder_attentions,  # 2 of e out
    )
# 从 transformers.models.bart.modeling_tf_bart.TFBartForConditionalGeneration.serving_output 复制而来
# 定义一个方法用于处理模型的输出,根据配置选择性地包含不同的输出信息
def serving_output(self, output):
    """
    Build the output returned when serving the model.

    Optional fields are kept only when the matching flag on `self.config` is set; otherwise they are `None`.
    `logits` and `encoder_last_hidden_state` are always passed through unchanged.
    """
    config = self.config
    with_states = config.output_hidden_states
    with_attentions = config.output_attentions

    # Cached key/values are only exposed when caching is enabled in the config.
    if config.use_cache:
        pkv = tf.tuple(output.past_key_values)[1]
    else:
        pkv = None

    # Decoder side.
    dec_hs = tf.convert_to_tensor(output.decoder_hidden_states) if with_states else None
    dec_attns = tf.convert_to_tensor(output.decoder_attentions) if with_attentions else None
    cross_attns = tf.convert_to_tensor(output.cross_attentions) if with_attentions else None

    # Encoder side.
    enc_hs = tf.convert_to_tensor(output.encoder_hidden_states) if with_states else None
    enc_attns = tf.convert_to_tensor(output.encoder_attentions) if with_attentions else None

    return TFSeq2SeqLMOutput(
        logits=output.logits,
        past_key_values=pkv,
        decoder_hidden_states=dec_hs,
        decoder_attentions=dec_attns,
        cross_attentions=cross_attns,
        encoder_last_hidden_state=output.encoder_last_hidden_state,
        encoder_hidden_states=enc_hs,
        encoder_attentions=enc_attns,
    )
# 从 transformers 库中的 TFBartForConditionalGeneration 类的方法 prepare_inputs_for_generation 复制而来
def prepare_inputs_for_generation(
    self,
    decoder_input_ids,
    past_key_values=None,
    attention_mask=None,
    decoder_attention_mask=None,
    head_mask=None,
    decoder_head_mask=None,
    cross_attn_head_mask=None,
    use_cache=None,
    encoder_outputs=None,
    **kwargs,
):
    """
    Assemble the keyword arguments for one generation step.

    When `past_key_values` is given, only the last decoder token is kept. The decoder position ids are derived
    from `decoder_attention_mask` when it is provided, otherwise from the cache length, otherwise from the
    length of `decoder_input_ids`.

    Returns:
        `dict`: the inputs for the forward pass; `input_ids` is always `None`.
    """
    has_cache = past_key_values is not None

    # With a cache, only the newest decoder token has to be fed to the model.
    if has_cache:
        decoder_input_ids = decoder_input_ids[:, -1:]

    if decoder_attention_mask is not None:  # xla
        # Exclusive cumulative sum of the mask, keeping only the last position.
        decoder_position_ids = tf.math.cumsum(decoder_attention_mask, axis=-1, exclusive=True)[:, -1:]
    elif has_cache:  # no xla + past_key_values
        # Axis 2 of the first cached tensor is used as the position of the new token.
        decoder_position_ids = past_key_values[0][0].shape[2]
    else:  # no xla + no past_key_values
        decoder_position_ids = tf.range(decoder_input_ids.shape[1])

    model_inputs = {}
    model_inputs["input_ids"] = None  # encoder_outputs is defined, input_ids not needed
    model_inputs["encoder_outputs"] = encoder_outputs
    model_inputs["past_key_values"] = past_key_values
    model_inputs["decoder_input_ids"] = decoder_input_ids
    model_inputs["attention_mask"] = attention_mask
    model_inputs["decoder_attention_mask"] = decoder_attention_mask
    model_inputs["decoder_position_ids"] = decoder_position_ids
    model_inputs["head_mask"] = head_mask
    model_inputs["decoder_head_mask"] = decoder_head_mask
    model_inputs["cross_attn_head_mask"] = cross_attn_head_mask
    model_inputs["use_cache"] = use_cache  # change this to avoid caching (presumably for debugging)
    return model_inputs
# 定义一个方法用于构建网络层,支持接收输入形状参数,如果已经构建过则直接返回
def build(self, input_shape=None):
    """
    Build the sub-layers of this model once.

    Does nothing when `self.built` is already set. Otherwise marks the layer as built and builds `self.model`
    and `self.bias_layer` (when present), each inside a name scope carrying the sub-layer's own name.
    `input_shape` is not used; the sub-layers are built with `None`.
    """
    if self.built:
        return
    self.built = True

    for attribute in ("model", "bias_layer"):
        if getattr(self, attribute, None) is None:
            continue
        sublayer = getattr(self, attribute)
        with tf.name_scope(sublayer.name):
            sublayer.build(None)
.\models\blenderbot\tokenization_blenderbot.py
"""Tokenization class for Blenderbot."""
import json
import os
from functools import lru_cache
from typing import List, Optional, Tuple
import regex as re
from ...tokenization_utils import AddedToken, PreTrainedTokenizer
from ...utils import logging
logger = logging.get_logger(__name__)
# File name used for each tokenizer resource, keyed by resource name.
VOCAB_FILES_NAMES = {
    "vocab_file": "vocab.json",
    "merges_file": "merges.txt",
    "tokenizer_config_file": "tokenizer_config.json",
}
# For each resource name, a mapping from checkpoint name to the URL of that resource.
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {"facebook/blenderbot-3B": "https://huggingface.co/facebook/blenderbot-3B/resolve/main/vocab.json"},
    "merges_file": {"facebook/blenderbot-3B": "https://huggingface.co/facebook/blenderbot-3B/resolve/main/merges.txt"},
    "tokenizer_config_file": {
        "facebook/blenderbot-3B": "https://huggingface.co/facebook/blenderbot-3B/resolve/main/tokenizer_config.json"
    },
}
# Per-checkpoint size, exposed by the tokenizer class as `max_model_input_sizes`.
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"facebook/blenderbot-3B": 128}
@lru_cache()
def bytes_to_unicode():
    """
    Return a dict mapping each of the 256 byte values to a unicode character.

    Bytes in the ranges `!`-`~`, `¡`-`¬` and `®`-`ÿ` map to the character with the same code point. Every other
    byte is mapped, in increasing byte order, to the code points 256, 257, ... so that no byte maps to a
    whitespace or control character. The result is cached by `lru_cache`.
    """
    self_mapped = [
        *range(ord("!"), ord("~") + 1),
        *range(ord("¡"), ord("¬") + 1),
        *range(ord("®"), ord("ÿ") + 1),
    ]
    byte_values = list(self_mapped)
    code_points = list(self_mapped)

    shifted = 0
    for byte in range(2**8):
        if byte in self_mapped:
            continue
        # Remaining bytes get fresh code points just above the single-byte range.
        byte_values.append(byte)
        code_points.append(2**8 + shifted)
        shifted += 1

    return dict(zip(byte_values, map(chr, code_points)))
def get_pairs(word):
    """
    Return the set of adjacent symbol pairs in a word.

    `word` is a sequence of symbols (symbols being variable-length strings). Indexing the first symbol means an
    empty `word` raises `IndexError`.
    """
    word[0]  # an empty word is rejected with IndexError
    return set(zip(word, word[1:]))
class BlenderbotTokenizer(PreTrainedTokenizer):
"""
Constructs a Blenderbot tokenizer, derived from the GPT-2 tokenizer, using byte-level Byte-Pair-Encoding.
This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
be encoded differently whether it is at the beginning of the sentence (without space) or not:
```
>>> from transformers import BlenderbotTokenizer
>>> tokenizer = BlenderbotTokenizer.from_pretrained("facebook/blenderbot-3B")
>>> tokenizer.add_prefix_space = False
>>> tokenizer("Hello world")["input_ids"]
[47, 921, 86, 1085, 2]
>>> tokenizer(" Hello world")["input_ids"]
[6950, 1085, 2]
```
You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer or when you
call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance.
<Tip>
When used with `is_split_into_words=True`, this tokenizer will add a space before each word (even the first one).
</Tip>
This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
this superclass for more information regarding those methods.

Args:
vocab_file (`str`):
Path to the vocabulary file.
merges_file (`str`):
Path to the merges file.
errors (`str`, *optional*, defaults to `"replace"`):
Specifies the error handling scheme to use for decoding bytes to UTF-8.
See [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
bos_token (`str`, *optional*, defaults to `"<s>"`):
Beginning of sequence token used during pretraining. Often employed as a sequence classifier token.
<Tip>
This token is not typically used as the beginning of sequence when special tokens are employed.
Instead, the `cls_token` is used.
</Tip>
eos_token (`str`, *optional*, defaults to `"</s>"`):
End of sequence token.
<Tip>
When constructing sequences with special tokens, this is not used as the end of sequence.
The `sep_token` is used instead.
</Tip>
sep_token (`str`, *optional*, defaults to `"</s>"`):
Separator token used for constructing sequences from multiple sources,
such as for sequence classification or question answering.
cls_token (`str`, *optional*, defaults to `"<s>"`):
Classifier token used in sequence classification tasks. It is the first token in the sequence when using special tokens.
unk_token (`str`, *optional*, defaults to `"<unk>"`):
Token representing unknown words or tokens not in the vocabulary.
pad_token (`str`, *optional*, defaults to `"<pad>"`):
Token used for padding sequences to equal lengths during batching.
mask_token (`str`, *optional*, defaults to `"<mask>"`):
Token used during masked language modeling, indicating positions where the model will predict.
add_prefix_space (`bool`, *optional*, defaults to `False`):
Indicates whether to add an initial space to the input, treating the leading word like any other word.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["input_ids", "attention_mask"]
# 从transformers.models.roberta.tokenization_roberta.RobertaTokenizer.__init__中复制而来,用于初始化Blenderbot的Tokenizer类
def __init__(
    self,
    vocab_file,
    merges_file,
    errors="replace",
    bos_token="<s>",
    eos_token="</s>",
    sep_token="</s>",
    cls_token="<s>",
    unk_token="<unk>",
    pad_token="<pad>",
    mask_token="<mask>",
    add_prefix_space=False,
    **kwargs,
):
    """
    Load the vocabulary and the BPE merges, prepare the byte-level lookup tables and initialise the base
    tokenizer.

    Special tokens given as plain strings are wrapped in `AddedToken`; values of any other type are passed
    through unchanged.
    """

    def _wrap(token):
        # Plain strings become AddedToken objects that keep surrounding whitespace.
        if isinstance(token, str):
            return AddedToken(token, lstrip=False, rstrip=False)
        return token

    bos_token = _wrap(bos_token)
    pad_token = _wrap(pad_token)
    eos_token = _wrap(eos_token)
    unk_token = _wrap(unk_token)
    sep_token = _wrap(sep_token)
    cls_token = _wrap(cls_token)

    # Mask token behaves like a normal word, i.e. it includes the space before it.
    if isinstance(mask_token, str):
        mask_token = AddedToken(mask_token, lstrip=True, rstrip=False, normalized=False)

    # Token -> id table read from the JSON vocabulary, plus its inverse.
    with open(vocab_file, encoding="utf-8") as vocab_handle:
        self.encoder = json.load(vocab_handle)
    self.decoder = {index: token for token, index in self.encoder.items()}

    # Error handling scheme used when decoding bytes back to text.
    self.errors = errors

    # Byte <-> printable unicode character tables.
    self.byte_encoder = bytes_to_unicode()
    self.byte_decoder = {char: byte for byte, char in self.byte_encoder.items()}

    # The first line and the last (empty) entry of the merges file are dropped.
    with open(merges_file, encoding="utf-8") as merges_handle:
        merge_lines = merges_handle.read().split("\n")[1:-1]
    merge_pairs = [tuple(line.split()) for line in merge_lines]
    # A merge's rank is its position in the merges file.
    self.bpe_ranks = {pair: rank for rank, pair in enumerate(merge_pairs)}

    self.cache = {}
    self.add_prefix_space = add_prefix_space

    # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
    self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")

    # The base class is initialised last, after the attributes above are in place.
    super().__init__(
        errors=errors,
        bos_token=bos_token,
        eos_token=eos_token,
        unk_token=unk_token,
        sep_token=sep_token,
        cls_token=cls_token,
        pad_token=pad_token,
        mask_token=mask_token,
        add_prefix_space=add_prefix_space,
        **kwargs,
    )
# 返回当前词汇表的大小,即编码器的长度
@property
def vocab_size(self):
    """`int`: Number of entries in the base vocabulary `self.encoder` (added tokens are not counted)."""
    # Exposed as a property, matching how the base tokenizer API declares `vocab_size`.
    return len(self.encoder)
# 从Blenderbot的词汇表中获取完整的词汇表,包括添加的特殊标记
def get_vocab(self):
    """
    Return a new dict holding the base vocabulary updated with the added tokens.

    `self.encoder` itself is not modified.
    """
    merged = dict(self.encoder)
    merged.update(self.added_tokens_encoder)
    return merged
# 根据Blenderbot的BPE算法处理给定的token,返回处理后的字符串
def bpe(self, token):
    """
    Apply the BPE merges in `self.bpe_ranks` to `token`.

    Returns the resulting symbols joined by single spaces. Results are memoised in `self.cache`; a token that
    yields no symbol pairs is returned as-is without being cached.
    """
    try:
        return self.cache[token]
    except KeyError:
        pass

    symbols = tuple(token)
    candidates = get_pairs(symbols)
    if not candidates:
        return token

    while True:
        # Pick the pair with the smallest rank; unknown pairs rank as infinity.
        best = min(candidates, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
        if best not in self.bpe_ranks:
            break
        left, right = best

        # Rebuild the symbol sequence, fusing every non-overlapping (left, right) occurrence from left to right.
        merged = []
        position = 0
        total = len(symbols)
        while position < total:
            if position < total - 1 and symbols[position] == left and symbols[position + 1] == right:
                merged.append(left + right)
                position += 2
            else:
                merged.append(symbols[position])
                position += 1

        symbols = tuple(merged)
        if len(symbols) == 1:
            break
        candidates = get_pairs(symbols)

    result = " ".join(symbols)
    self.cache[token] = result
    return result
# 使用Blenderbot的BPE算法对给定的文本进行分词,返回分词后的结果
def _tokenize(self, text):
"""Tokenize a string."""
bpe_tokens = []
# 使用正则表达式找到所有匹配的token,并逐个处理
for token in re.findall(self.pat, text):
# 将token编码成字节,并通过Blenderbot的字节编码器映射成unicode字符串
token = "".join(
self.byte_encoder[b] for b in token.encode("utf-8")
) # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
# 使用Blenderbot的BPE算法对编码后的token进行分词,将分词结果添加到bpe_tokens列表中
bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
return bpe_tokens
# 将给定的token转换为其在Blenderbot词汇表中的ID,如果token不存在,则使用未知标记的ID
def _convert_token_to_id(self, token):
    """Converts a token (str) in an id using the vocab."""
    # The fallback id is looked up first, so `self.unk_token` is read on every call.
    fallback_id = self.encoder.get(self.unk_token)
    return self.encoder.get(token, fallback_id)
# Convert an ID to its token in the Blenderbot vocabulary; returns None when the ID is not in the vocabulary
def _convert_id_to_token(self, index):
    """Converts an index (integer) in a token (str) using the vocab."""
    # `dict.get` yields None for an index that is not in the vocabulary.
    token = self.decoder.get(index)
    return token
# 从transformers.models.roberta.tokenization_roberta.RobertaTokenizer.convert_tokens_to_string复制而来,将Roberta->Blenderbot,RoBERTa->Blenderbot
def convert_tokens_to_string(self, tokens):
    """Converts a sequence of tokens (string) in a single string."""
    joined = "".join(tokens)
    # Map every character back to its byte, then decode the bytes as UTF-8.
    raw = bytes(self.byte_decoder[char] for char in joined)
    return raw.decode("utf-8", errors=self.errors)
# 从transformers.models.roberta.tokenization_roberta.RobertaTokenizer.save_vocabulary复制而来,将Roberta->Blenderbot,RoBERTa->Blenderbot
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
    """
    Write the vocabulary (JSON) and the BPE merges (text) into `save_directory`.

    Args:
        save_directory (`str`):
            Existing directory to write into. If it is not a directory, an error is logged and `None` is
            returned.
        filename_prefix (`str`, *optional*):
            When given, prepended to both file names followed by a dash.

    Returns:
        `Tuple[str]`: Paths of the vocabulary file and of the merges file.
    """
    if not os.path.isdir(save_directory):
        logger.error(f"词汇表路径 ({save_directory}) 应为一个目录")
        return

    prefix = filename_prefix + "-" if filename_prefix else ""
    vocab_file = os.path.join(save_directory, prefix + VOCAB_FILES_NAMES["vocab_file"])
    merge_file = os.path.join(save_directory, prefix + VOCAB_FILES_NAMES["merges_file"])

    with open(vocab_file, "w", encoding="utf-8") as vocab_out:
        vocab_out.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")

    expected_rank = 0
    with open(merge_file, "w", encoding="utf-8") as merges_out:
        merges_out.write("#version: 0.2\n")
        # Merges are written in rank order; a gap in the ranks only triggers a warning.
        for merge, rank in sorted(self.bpe_ranks.items(), key=lambda item: item[1]):
            if expected_rank != rank:
                logger.warning(
                    f"保存词汇到 {merge_file}: BPE合并索引不是连续的。请确保分词器未损坏!"
                )
                expected_rank = rank
            merges_out.write(" ".join(merge) + "\n")
            expected_rank += 1

    return vocab_file, merge_file
# 从transformers.models.roberta.tokenization_roberta.RobertaTokenizer.get_special_tokens_mask复制而来,将Roberta->Blenderbot,RoBERTa->Blenderbot
def get_special_tokens_mask(
    self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
    """
    Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
    special tokens using the tokenizer `prepare_for_model` method.

    Args:
        token_ids_0 (`List[int]`):
            List of IDs.
        token_ids_1 (`List[int]`, *optional*):
            Optional second list of IDs for sequence pairs.
        already_has_special_tokens (`bool`, *optional*, defaults to `False`):
            Whether or not the token list is already formatted with special tokens for the model.

    Returns:
        `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
    """
    # The ids already contain special tokens: let the superclass compute the mask.
    if already_has_special_tokens:
        return super().get_special_tokens_mask(
            token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
        )

    # NOTE(review): the masks below mark a special token at both ends, while
    # `build_inputs_with_special_tokens` in this class only appends one trailing token - confirm this is intended.
    # Single sequence: one special token on each side.
    if token_ids_1 is None:
        return [1] + ([0] * len(token_ids_0)) + [1]
    # Sequence pair: special tokens at both ends and two between the sequences.
    return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
# Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.create_token_type_ids_from_sequences with Roberta->Blenderbot, RoBERTa->Blenderbot
def create_token_type_ids_from_sequences(
    self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
    """
    Create a mask from the two sequences passed to be used in a sequence-pair classification task. Blenderbot does
    not make use of token type ids, therefore a list of zeros is returned.

    Args:
        token_ids_0 (`List[int]`):
            List of IDs.
        token_ids_1 (`List[int]`, *optional*):
            Optional second list of IDs for sequence pairs.

    Returns:
        `List[int]`: List of zeros.
    """
    separator = [self.sep_token_id]
    classifier = [self.cls_token_id]

    # The layout is only assembled to obtain its length.
    layout = classifier + token_ids_0 + separator
    if token_ids_1 is not None:
        layout = layout + separator + token_ids_1 + separator
    return [0] * len(layout)
def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
    """
    Optionally prepend a space to `text` before tokenization.

    A space is added when `is_split_into_words` is set or prefix spaces are requested (through the
    `add_prefix_space` keyword, falling back to `self.add_prefix_space`), provided `text` is non-empty and does
    not already start with whitespace.

    Returns:
        `tuple`: The possibly modified text and the remaining keyword arguments (without `add_prefix_space`).
    """
    wants_prefix = kwargs.pop("add_prefix_space", self.add_prefix_space)
    if not (is_split_into_words or wants_prefix):
        return (text, kwargs)
    if len(text) > 0 and not text[0].isspace():
        return (" " + text, kwargs)
    return (text, kwargs)
def build_inputs_with_special_tokens(self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None):
    """
    Build model inputs from a sequence by appending the end-of-sequence token. A Blenderbot sequence has the
    following format:

    - single sequence: ` X </s>`

    Args:
        token_ids_0 (`List[int]`):
            List of IDs to which the special tokens will be added
        token_ids_1 (`List[int]`, *optional*):
            Will be ignored

    Returns:
        `List[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens.
    """
    end_of_sequence = [self.eos_token_id]
    return token_ids_0 + end_of_sequence
@property
def default_chat_template(self):
    """
    A very simple chat template that just adds whitespace between messages.

    A one-time warning is logged on access to say that the class-level default template is in use.
    """
    logger.warning_once(
        "\nNo chat template is defined for this tokenizer - using the default template "
        f"for the {self.__class__.__name__} class. If the default is not appropriate for "
        "your model, please set `tokenizer.chat_template` to an appropriate template. "
        "See https://huggingface.co/docs/transformers/main/chat_templating for more information.\n"
    )
    template_parts = (
        "{% for message in messages %}",
        "{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}",
        "{{ message['content'] }}",
        "{% if not loop.last %}{{ ' ' }}{% endif %}",
        "{% endfor %}",
        "{{ eos_token }}",
    )
    return "".join(template_parts)
.\models\blenderbot\tokenization_blenderbot_fast.py
import json
from typing import List, Optional, Tuple
from tokenizers import pre_tokenizers, processors
from ...tokenization_utils_base import AddedToken, BatchEncoding
from ...tokenization_utils_fast import PreTrainedTokenizerFast
from ...utils import logging
from .tokenization_blenderbot import BlenderbotTokenizer
logger = logging.get_logger(__name__)
# File name used for each tokenizer resource, keyed by resource name.
VOCAB_FILES_NAMES = {
    "vocab_file": "vocab.json",
    "merges_file": "merges.txt",
    "tokenizer_config_file": "tokenizer_config.json",
}
# For each resource name, a mapping from checkpoint name to the URL of that resource.
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {"facebook/blenderbot-3B": "https://huggingface.co/facebook/blenderbot-3B/resolve/main/vocab.json"},
    "merges_file": {"facebook/blenderbot-3B": "https://huggingface.co/facebook/blenderbot-3B/resolve/main/merges.txt"},
    "tokenizer_config_file": {
        "facebook/blenderbot-3B": "https://huggingface.co/facebook/blenderbot-3B/resolve/main/tokenizer_config.json"
    },
}
# Per-checkpoint size, exposed by the tokenizer class as `max_model_input_sizes`.
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"facebook/blenderbot-3B": 128}
class BlenderbotTokenizerFast(PreTrainedTokenizerFast):
"""
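Construct a "fast" Blenderbot tokenizer (backed by HuggingFace's *tokenizers* library), derived from the GPT-2
tokenizer, using byte-level Byte-Pair-Encoding.
This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
be encoded differently whether it is at the beginning of the sentence (without space) or not:
```
>>> from transformers import BlenderbotTokenizerFast
>>> tokenizer = BlenderbotTokenizerFast.from_pretrained("facebook/blenderbot-3B")
>>> tokenizer("Hello world")["input_ids"]
[6950, 1085, 2]
>>> tokenizer(" Hello world")["input_ids"]
[6950, 1085, 2]
```
You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer or when you
call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance.
<Tip>
When used with `is_split_into_words=True`, this tokenizer needs to be instantiated with `add_prefix_space=True`.
</Tip>
This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
refer to this superclass for more information regarding those methods.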
"""
pass
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["input_ids", "attention_mask"]
slow_tokenizer_class = BlenderbotTokenizer
def __init__(
    self,
    vocab_file=None,
    merges_file=None,
    tokenizer_file=None,
    errors="replace",
    bos_token="<s>",
    eos_token="</s>",
    sep_token="</s>",
    cls_token="<s>",
    unk_token="<unk>",
    pad_token="<pad>",
    mask_token="<mask>",
    add_prefix_space=False,
    trim_offsets=True,
    **kwargs,
):
    # Initialise the fast tokenizer, then make the backend pre-tokenizer and post-processor agree with the
    # requested `add_prefix_space` / `trim_offsets` values.

    # A mask token given as a plain string is wrapped so that it strips whitespace on its left.
    mask_token = (
        AddedToken(mask_token, lstrip=True, rstrip=False, normalized=False)
        if isinstance(mask_token, str)
        else mask_token
    )
    super().__init__(
        vocab_file,
        merges_file,
        tokenizer_file=tokenizer_file,
        errors=errors,
        bos_token=bos_token,
        eos_token=eos_token,
        sep_token=sep_token,
        cls_token=cls_token,
        unk_token=unk_token,
        pad_token=pad_token,
        mask_token=mask_token,
        add_prefix_space=add_prefix_space,
        trim_offsets=trim_offsets,
        **kwargs,
    )
    # Read the backend pre-tokenizer's serialized state; rebuild it only when its `add_prefix_space`
    # differs from the requested value.
    pre_tok_state = json.loads(self.backend_tokenizer.pre_tokenizer.__getstate__())
    if pre_tok_state.get("add_prefix_space", add_prefix_space) != add_prefix_space:
        # The "type" entry names the class in `pre_tokenizers`; the rest are its constructor arguments.
        pre_tok_class = getattr(pre_tokenizers, pre_tok_state.pop("type"))
        pre_tok_state["add_prefix_space"] = add_prefix_space
        self.backend_tokenizer.pre_tokenizer = pre_tok_class(**pre_tok_state)
    self.add_prefix_space = add_prefix_space
    # Same treatment for the post-processor, when the backend tokenizer has one.
    tokenizer_component = "post_processor"
    tokenizer_component_instance = getattr(self.backend_tokenizer, tokenizer_component, None)
    if tokenizer_component_instance:
        state = json.loads(tokenizer_component_instance.__getstate__())
        # JSON yields lists for "sep" and "cls"; they are converted to tuples before the state is reused
        # (presumably what the processor constructor expects - confirm against the tokenizers API).
        if "sep" in state:
            state["sep"] = tuple(state["sep"])
        if "cls" in state:
            state["cls"] = tuple(state["cls"])
        # Track whether any setting differs so the component is only rebuilt when needed.
        changes_to_apply = False
        if state.get("add_prefix_space", add_prefix_space) != add_prefix_space:
            state["add_prefix_space"] = add_prefix_space
            changes_to_apply = True
        if state.get("trim_offsets", trim_offsets) != trim_offsets:
            state["trim_offsets"] = trim_offsets
            changes_to_apply = True
        if changes_to_apply:
            component_class = getattr(processors, state.pop("type"))
            new_value = component_class(**state)
            setattr(self.backend_tokenizer, tokenizer_component, new_value)
@property
def mask_token(self) -> str:
    """
    `str`: Mask token, to use when training a model with masked-language modeling. Returns `None` when the
    token has not been set, logging an error first if `self.verbose` is true.

    Blenderbot tokenizer has a special mask token to be usable in the fill-mask pipeline. The mask token will
    greedily comprise the space before the *<mask>*.
    """
    if self._mask_token is None:
        if self.verbose:
            logger.error("Using mask_token, but it is not set yet.")
        return None
    return str(self._mask_token)


@mask_token.setter
def mask_token(self, value):
    """
    Overriding the default behavior of the mask token to have it eat the space before it.

    This is needed to preserve backward compatibility with all the previously used models based on Roberta.
    """
    # Mask token behaves like a normal word, i.e. it includes the space before it.
    # A plain string is wrapped in AddedToken; any other value is stored as given.
    value = AddedToken(value, lstrip=True, rstrip=False) if isinstance(value, str) else value
    self._mask_token = value
def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding:
    """
    Check that pretokenized input is only used when `add_prefix_space` is enabled, then delegate to the
    superclass implementation with the same arguments.
    """
    pretokenized = kwargs.get("is_split_into_words", False)
    assert self.add_prefix_space or not pretokenized, (
        f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True "
        "to use it with pretokenized inputs."
    )
    encoding = super()._batch_encode_plus(*args, **kwargs)
    return encoding
def _encode_plus(self, *args, **kwargs) -> BatchEncoding:
    """
    Check that pretokenized input is only used when `add_prefix_space` is enabled, then delegate to the
    superclass implementation with the same arguments.
    """
    pretokenized = kwargs.get("is_split_into_words", False)
    assert self.add_prefix_space or not pretokenized, (
        f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True "
        "to use it with pretokenized inputs."
    )
    encoding = super()._encode_plus(*args, **kwargs)
    return encoding
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
    """
    Save the backend tokenizer's model into `save_directory`, passing `filename_prefix` as the `name` argument.

    Returns:
        `Tuple[str]`: The values returned by the backend model's `save`, as a tuple.
    """
    saved = self._tokenizer.model.save(save_directory, name=filename_prefix)
    return tuple(saved)
def create_token_type_ids_from_sequences(
    self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
    """
    Create a mask from the two sequences passed to be used in a sequence-pair classification task. Blenderbot does
    not make use of token type ids, therefore a list of zeros is returned.

    Args:
        token_ids_0 (`List[int]`):
            List of IDs.
        token_ids_1 (`List[int]`, *optional*):
            Optional second list of IDs for sequence pairs.

    Returns:
        `List[int]`: List of zeros.
    """
    sep = [self.sep_token_id]
    cls = [self.cls_token_id]

    # The sequences are only concatenated to obtain the length of the result.
    if token_ids_1 is None:
        return len(cls + token_ids_0 + sep) * [0]
    return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
def build_inputs_with_special_tokens(self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None):
    """
    Build model inputs from a sequence by appending the end-of-sequence token. A Blenderbot sequence has the
    following format:

    - single sequence: ` X </s>`

    Args:
        token_ids_0 (`List[int]`):
            List of IDs to which the special tokens will be added
        token_ids_1 (`List[int]`, *optional*):
            Will be ignored

    Returns:
        `List[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens.
    """
    end_of_sequence = [self.eos_token_id]
    return token_ids_0 + end_of_sequence
@property
def default_chat_template(self):
    """
    A very simple chat template that just adds whitespace between messages.

    A one-time warning is logged on access to say that the class-level default template is in use.
    """
    logger.warning_once(
        "\nNo chat template is defined for this tokenizer - using the default template "
        f"for the {self.__class__.__name__} class. If the default is not appropriate for "
        "your model, please set `tokenizer.chat_template` to an appropriate template. "
        "See https://huggingface.co/docs/transformers/main/chat_templating for more information.\n"
    )
    template_parts = (
        "{% for message in messages %}",
        "{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}",
        "{{ message['content'] }}",
        "{% if not loop.last %}{{ ' ' }}{% endif %}",
        "{% endfor %}",
        "{{ eos_token }}",
    )
    return "".join(template_parts)
.\models\blenderbot\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_flax_available,
is_tf_available,
is_tokenizers_available,
is_torch_available,
)
# Maps each submodule name to the public names it provides; handed to `_LazyModule` at the end of this file.
# The configuration and the slow tokenizer are always registered.
_import_structure = {
    "configuration_blenderbot": [
        "BLENDERBOT_PRETRAINED_CONFIG_ARCHIVE_MAP",
        "BlenderbotConfig",
        "BlenderbotOnnxConfig",
    ],
    "tokenization_blenderbot": ["BlenderbotTokenizer"],
}
# Each optional backend below is registered only when its availability check passes; otherwise the raised
# OptionalDependencyNotAvailable is swallowed and the entry is skipped.
# Fast tokenizer: requires `tokenizers`.
try:
    if not is_tokenizers_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    _import_structure["tokenization_blenderbot_fast"] = ["BlenderbotTokenizerFast"]
# PyTorch models.
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    _import_structure["modeling_blenderbot"] = [
        "BLENDERBOT_PRETRAINED_MODEL_ARCHIVE_LIST",
        "BlenderbotForCausalLM",
        "BlenderbotForConditionalGeneration",
        "BlenderbotModel",
        "BlenderbotPreTrainedModel",
    ]
# TensorFlow models.
try:
    if not is_tf_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    _import_structure["modeling_tf_blenderbot"] = [
        "TFBlenderbotForConditionalGeneration",
        "TFBlenderbotModel",
        "TFBlenderbotPreTrainedModel",
    ]
# Flax models.
try:
    if not is_flax_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    _import_structure["modeling_flax_blenderbot"] = [
        "FlaxBlenderbotForConditionalGeneration",
        "FlaxBlenderbotModel",
        "FlaxBlenderbotPreTrainedModel",
    ]
# Under static type checking the names are imported eagerly, guarded by the same availability checks as
# `_import_structure`; at runtime the module is replaced by a `_LazyModule` built from `_import_structure`.
if TYPE_CHECKING:
    from .configuration_blenderbot import (
        BLENDERBOT_PRETRAINED_CONFIG_ARCHIVE_MAP,
        BlenderbotConfig,
        BlenderbotOnnxConfig,
    )
    from .tokenization_blenderbot import BlenderbotTokenizer

    # Fast tokenizer: requires `tokenizers`.
    try:
        if not is_tokenizers_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .tokenization_blenderbot_fast import BlenderbotTokenizerFast

    # PyTorch models.
    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .modeling_blenderbot import (
            BLENDERBOT_PRETRAINED_MODEL_ARCHIVE_LIST,
            BlenderbotForCausalLM,
            BlenderbotForConditionalGeneration,
            BlenderbotModel,
            BlenderbotPreTrainedModel,
        )

    # TensorFlow models.
    try:
        if not is_tf_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .modeling_tf_blenderbot import (
            TFBlenderbotForConditionalGeneration,
            TFBlenderbotModel,
            TFBlenderbotPreTrainedModel,
        )

    # Flax models.
    try:
        if not is_flax_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .modeling_flax_blenderbot import (
            FlaxBlenderbotForConditionalGeneration,
            FlaxBlenderbotModel,
            FlaxBlenderbotPreTrainedModel,
        )
else:
    import sys

    # Swap this module's entry in `sys.modules` for the lazy module.
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\blenderbot_small\configuration_blenderbot_small.py
"""
BlenderbotSmall model configuration
This module defines the configuration class `BlenderbotSmallConfig` for the BlenderbotSmall model.
It specifies how the model should be instantiated and configured. It inherits from `PretrainedConfig`
and provides defaults similar to the `facebook/blenderbot_small-90M` architecture.
Example:
>>> from transformers import BlenderbotSmallConfig, BlenderbotSmallModel
>>> # Initializing a BlenderbotSmall facebook/blenderbot_small-90M style configuration
>>> configuration = BlenderbotSmallConfig()
>>> # Initializing a model (with random weights) from the facebook/blenderbot_small-90M style configuration
>>> model = BlenderbotSmallModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
"""
from collections import OrderedDict
from typing import Any, Mapping, Optional
from ... import PreTrainedTokenizer
from ...configuration_utils import PretrainedConfig
from ...file_utils import TensorType, is_torch_available
from ...onnx import OnnxConfig, OnnxConfigWithPast, OnnxSeq2SeqConfigWithPast
from ...onnx.utils import compute_effective_axis_dimension
from ...utils import logging
logger = logging.get_logger(__name__)
# Maps a checkpoint name to the URL of its `config.json`.
BLENDERBOT_SMALL_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "facebook/blenderbot_small-90M": "https://huggingface.co/facebook/blenderbot_small-90M/resolve/main/config.json",
}
class BlenderbotSmallConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`BlenderbotSmallModel`]. It is used to
    instantiate a BlenderbotSmall model according to the specified arguments, defining the model architecture.
    Instantiating a configuration with the defaults will yield a similar configuration to that of the
    BlenderbotSmall [facebook/blenderbot_small-90M](https://huggingface.co/facebook/blenderbot_small-90M)
    architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read
    the documentation from [`PretrainedConfig`] for more information.

    Example:

    ```
    >>> from transformers import BlenderbotSmallConfig, BlenderbotSmallModel

    >>> # Initializing a BlenderbotSmall facebook/blenderbot_small-90M style configuration
    >>> configuration = BlenderbotSmallConfig()

    >>> # Initializing a model (with random weights) from the facebook/blenderbot_small-90M style configuration
    >>> model = BlenderbotSmallModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    """

    model_type = "blenderbot-small"
    keys_to_ignore_at_inference = ["past_key_values"]
    # Generic attribute names resolved to the names used by this configuration.
    attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}

    def __init__(
        self,
        vocab_size=50265,
        max_position_embeddings=512,
        encoder_layers=8,
        encoder_ffn_dim=2048,
        encoder_attention_heads=16,
        decoder_layers=8,
        decoder_ffn_dim=2048,
        decoder_attention_heads=16,
        encoder_layerdrop=0.0,
        decoder_layerdrop=0.0,
        use_cache=True,
        is_encoder_decoder=True,
        activation_function="gelu",
        d_model=512,
        dropout=0.1,
        attention_dropout=0.0,
        activation_dropout=0.0,
        init_std=0.02,
        decoder_start_token_id=1,
        scale_embedding=False,
        pad_token_id=0,
        bos_token_id=1,
        eos_token_id=2,
        forced_eos_token_id=2,
        **kwargs,
    ):
        """Store the architecture settings, then forward the token ids and remaining kwargs to the base class."""
        # Sizes shared by encoder and decoder.
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.d_model = d_model

        # Encoder stack.
        self.encoder_layers = encoder_layers
        self.encoder_ffn_dim = encoder_ffn_dim
        self.encoder_attention_heads = encoder_attention_heads
        self.encoder_layerdrop = encoder_layerdrop

        # Decoder stack.
        self.decoder_layers = decoder_layers
        self.decoder_ffn_dim = decoder_ffn_dim
        self.decoder_attention_heads = decoder_attention_heads
        self.decoder_layerdrop = decoder_layerdrop

        # Regularisation, activation and initialisation.
        self.dropout = dropout
        self.attention_dropout = attention_dropout
        self.activation_dropout = activation_dropout
        self.activation_function = activation_function
        self.init_std = init_std

        # Miscellaneous.
        self.use_cache = use_cache
        self.num_hidden_layers = encoder_layers  # mirrors the encoder depth
        self.scale_embedding = scale_embedding

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            is_encoder_decoder=is_encoder_decoder,
            decoder_start_token_id=decoder_start_token_id,
            forced_eos_token_id=forced_eos_token_id,
            **kwargs,
        )
class BlenderbotSmallOnnxConfig(OnnxSeq2SeqConfigWithPast):
    """
    ONNX export description for BlenderbotSmall: input/output axes and dummy inputs.

    Behaviour is selected by `self.task` (`"default"` / `"seq2seq-lm"`, `"causal-lm"`, or any other value) and by
    `self.use_past`.
    """

    @property
    def inputs(self) -> Mapping[str, Mapping[int, str]]:
        """Return an ordered mapping from input name to its dynamic axes (`{axis index: axis name}`)."""
        # Every task starts from the same two encoder-side entries.
        common_inputs = OrderedDict(
            [
                ("input_ids", {0: "batch", 1: "encoder_sequence"}),
                ("attention_mask", {0: "batch", 1: "encoder_sequence"}),
            ]
        )

        if self.task in ["default", "seq2seq-lm"]:
            if self.use_past:
                common_inputs["decoder_input_ids"] = {0: "batch"}
                common_inputs["decoder_attention_mask"] = {0: "batch", 1: "past_decoder_sequence + sequence"}
                self.fill_with_past_key_values_(common_inputs, direction="inputs")
            else:
                common_inputs["decoder_input_ids"] = {0: "batch", 1: "decoder_sequence"}
                common_inputs["decoder_attention_mask"] = {0: "batch", 1: "decoder_sequence"}
        elif self.task == "causal-lm":
            if self.use_past:
                num_encoder_layers, _ = self.num_layers
                for i in range(num_encoder_layers):
                    common_inputs[f"past_key_values.{i}.key"] = {0: "batch", 2: "past_sequence + sequence"}
                    common_inputs[f"past_key_values.{i}.value"] = {0: "batch", 2: "past_sequence + sequence"}
        else:
            common_inputs["decoder_input_ids"] = {0: "batch", 1: "decoder_sequence"}
            common_inputs["decoder_attention_mask"] = {0: "batch", 1: "decoder_sequence"}

        return common_inputs

    @property
    def outputs(self) -> Mapping[str, Mapping[int, str]]:
        """Return the mapping from output name to its dynamic axes."""
        if self.task in ["default", "seq2seq-lm"]:
            return super().outputs

        # Start the attribute lookup after `OnnxConfigWithPast` in the MRO, then add the `present.*` entries here.
        common_outputs = super(OnnxConfigWithPast, self).outputs
        if not self.use_past:
            return common_outputs

        num_encoder_layers, _ = self.num_layers
        for i in range(num_encoder_layers):
            common_outputs[f"present.{i}.key"] = {0: "batch", 2: "past_sequence + sequence"}
            common_outputs[f"present.{i}.value"] = {0: "batch", 2: "past_sequence + sequence"}
        return common_outputs

    def _generate_dummy_inputs_for_default_and_seq2seq_lm(
        self,
        tokenizer: PreTrainedTokenizer,
        batch_size: int = -1,
        seq_length: int = -1,
        is_pair: bool = False,
        framework: Optional[TensorType] = None,
    ) -> Mapping[str, Any]:
        """
        Build dummy encoder and decoder inputs, plus zero-filled `past_key_values` when `self.use_past` is set.

        Raises:
            ValueError: if past key values are requested and PyTorch is not available.
        """
        encoder_inputs = self._generate_dummy_inputs_for_sequence_classification_and_question_answering(
            tokenizer, batch_size, seq_length, is_pair, framework
        )

        # When past key values are used the decoder dummy input is requested with length 1.
        decoder_seq_length = 1 if self.use_past else seq_length
        decoder_inputs = self._generate_dummy_inputs_for_sequence_classification_and_question_answering(
            tokenizer, batch_size, decoder_seq_length, is_pair, framework
        )
        prefixed_decoder_inputs = {f"decoder_{name}": tensor for name, tensor in decoder_inputs.items()}
        common_inputs = dict(**encoder_inputs, **prefixed_decoder_inputs)

        if not self.use_past:
            return common_inputs

        if not is_torch_available():
            raise ValueError("Cannot generate dummy past_keys inputs without PyTorch installed.")
        import torch

        batch, encoder_seq_length = common_inputs["input_ids"].shape
        decoder_seq_length = common_inputs["decoder_input_ids"].shape[1]
        num_encoder_attention_heads, num_decoder_attention_heads = self.num_attention_heads

        hidden_size = self._config.hidden_size
        encoder_shape = (
            batch,
            num_encoder_attention_heads,
            encoder_seq_length,
            hidden_size // num_encoder_attention_heads,
        )
        # The cached decoder length is the current decoder length plus 3.
        decoder_past_length = decoder_seq_length + 3
        decoder_shape = (
            batch,
            num_decoder_attention_heads,
            decoder_past_length,
            hidden_size // num_decoder_attention_heads,
        )

        # Extend the decoder mask so that it also spans the cached positions.
        # NOTE(review): unlike the causal-lm helper, the appended ones carry no explicit dtype - confirm that the
        # resulting mask dtype is intended.
        past_mask = torch.ones(batch, decoder_past_length)
        common_inputs["decoder_attention_mask"] = torch.cat(
            [common_inputs["decoder_attention_mask"], past_mask], dim=1
        )

        num_encoder_layers, num_decoder_layers = self.num_layers
        min_num_layers = min(num_encoder_layers, num_decoder_layers)
        max_num_layers = max(num_encoder_layers, num_decoder_layers) - min_num_layers
        remaining_side_name = "encoder" if num_encoder_layers > num_decoder_layers else "decoder"

        # Layers present on both sides get four tensors: two with the decoder shape, two with the encoder shape.
        past_key_values = [
            (
                torch.zeros(decoder_shape),
                torch.zeros(decoder_shape),
                torch.zeros(encoder_shape),
                torch.zeros(encoder_shape),
            )
            for _ in range(min_num_layers)
        ]

        # NOTE(review): `max_num_layers` already has `min_num_layers` subtracted, so this range may cover fewer
        # layers than the difference between the two stacks - confirm against the parent class before changing.
        shape = encoder_shape if remaining_side_name == "encoder" else decoder_shape
        past_key_values.extend(
            (torch.zeros(shape), torch.zeros(shape)) for _ in range(min_num_layers, max_num_layers)
        )
        common_inputs["past_key_values"] = past_key_values
        return common_inputs

    def _generate_dummy_inputs_for_causal_lm(
        self,
        tokenizer: PreTrainedTokenizer,
        batch_size: int = -1,
        seq_length: int = -1,
        is_pair: bool = False,
        framework: Optional[TensorType] = None,
    ) -> Mapping[str, Any]:
        """
        Build dummy inputs for the `"causal-lm"` task, plus zero-filled `past_key_values` when `self.use_past` is set.

        Raises:
            ValueError: if past key values are requested and PyTorch is not available.
        """
        common_inputs = self._generate_dummy_inputs_for_sequence_classification_and_question_answering(
            tokenizer, batch_size, seq_length, is_pair, framework
        )
        if not self.use_past:
            return common_inputs

        if not is_torch_available():
            raise ValueError("Cannot generate dummy past_keys inputs without PyTorch installed.")
        import torch

        batch, seqlen = common_inputs["input_ids"].shape
        # The cached length is the current sequence length plus 2.
        past_key_values_length = seqlen + 2
        num_encoder_layers, _ = self.num_layers
        num_encoder_attention_heads, _ = self.num_attention_heads
        past_shape = (
            batch,
            num_encoder_attention_heads,
            past_key_values_length,
            self._config.hidden_size // num_encoder_attention_heads,
        )

        # The appended ones reuse the dtype of the existing attention mask.
        mask_dtype = common_inputs["attention_mask"].dtype
        past_mask = torch.ones(batch, past_key_values_length, dtype=mask_dtype)
        common_inputs["attention_mask"] = torch.cat([common_inputs["attention_mask"], past_mask], dim=1)

        common_inputs["past_key_values"] = [
            (torch.zeros(past_shape), torch.zeros(past_shape)) for _ in range(num_encoder_layers)
        ]
        return common_inputs

    def _generate_dummy_inputs_for_sequence_classification_and_question_answering(
        self,
        tokenizer: PreTrainedTokenizer,
        batch_size: int = -1,
        seq_length: int = -1,
        is_pair: bool = False,
        framework: Optional[TensorType] = None,
    ) -> Mapping[str, Any]:
        """Tokenize `batch_size` copies of a dummy string made of `seq_length` repetitions of the unknown token."""
        batch_size = compute_effective_axis_dimension(
            batch_size, fixed_dimension=OnnxConfig.default_fixed_batch, num_token_to_add=0
        )

        token_to_add = tokenizer.num_special_tokens_to_add(is_pair)
        seq_length = compute_effective_axis_dimension(
            seq_length, fixed_dimension=OnnxConfig.default_fixed_sequence, num_token_to_add=token_to_add
        )

        # NOTE(review): joining a single-element list inserts no separator, so the repeated unknown tokens are
        # concatenated without spaces - confirm that this is intended.
        dummy_sentence = " ".join([tokenizer.unk_token]) * seq_length
        dummy_input = [dummy_sentence] * batch_size
        return dict(tokenizer(dummy_input, return_tensors=framework))

    def generate_dummy_inputs(
        self,
        tokenizer: PreTrainedTokenizer,
        batch_size: int = -1,
        seq_length: int = -1,
        is_pair: bool = False,
        framework: Optional[TensorType] = None,
    ) -> Mapping[str, Any]:
        """Dispatch to the dummy-input generator that matches `self.task`."""
        if self.task in ["default", "seq2seq-lm"]:
            generate = self._generate_dummy_inputs_for_default_and_seq2seq_lm
        elif self.task == "causal-lm":
            generate = self._generate_dummy_inputs_for_causal_lm
        else:
            generate = self._generate_dummy_inputs_for_sequence_classification_and_question_answering

        return generate(tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair, framework=framework)

    def _flatten_past_key_values_(self, flattened_output, name, idx, t):
        """Delegate flattening of one `past_key_values` entry to a parent implementation chosen by `self.task`.

        Nothing is returned by this method.
        """
        if self.task in ["default", "seq2seq-lm"]:
            super()._flatten_past_key_values_(flattened_output, name, idx, t)
        else:
            # Start the lookup after `OnnxSeq2SeqConfigWithPast` in the MRO.
            super(OnnxSeq2SeqConfigWithPast, self)._flatten_past_key_values_(flattened_output, name, idx, t)
.\models\blenderbot_small\modeling_blenderbot_small.py
""" PyTorch BlenderbotSmall model."""
import copy
import math
from typing import List, Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import CrossEntropyLoss
from ...activations import ACT2FN
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_causal_attention_mask
from ...modeling_outputs import (
BaseModelOutput,
BaseModelOutputWithPastAndCrossAttentions,
CausalLMOutputWithCrossAttentions,
Seq2SeqLMOutput,
Seq2SeqModelOutput,
)
from ...modeling_utils import PreTrainedModel
from ...utils import (
add_end_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_blenderbot_small import BlenderbotSmallConfig
# Module-level logger obtained from the library's logging utilities.
logger = logging.get_logger(__name__)
# Config class name handed to the `replace_return_docstrings` decorators further down.
_CONFIG_FOR_DOC = "BlenderbotSmallConfig"
# Names of the pretrained BlenderbotSmall checkpoints listed for this module.
BLENDERBOT_SMALL_PRETRAINED_MODEL_ARCHIVE_LIST = [
"facebook/blenderbot_small-90M",
]
def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
    """
    Shift input ids one token to the right.

    Args:
        input_ids: token ids indexed as `[row, position]`.
        pad_token_id: id written wherever the shifted result holds `-100`.
        decoder_start_token_id: id placed in the first position of every row.

    Returns:
        A new tensor with the shape and dtype of `input_ids`; the last token of every row is dropped and
        `input_ids` itself is left untouched.

    Raises:
        ValueError: if `pad_token_id` is `None`.
    """
    shifted = input_ids.new_zeros(input_ids.shape)
    shifted[:, 0] = decoder_start_token_id
    shifted[:, 1:] = input_ids[:, :-1].clone()

    if pad_token_id is None:
        raise ValueError("self.model.config.pad_token_id has to be defined.")

    # Positions still holding -100 are overwritten in place with the padding id.
    ignored_positions = shifted.eq(-100)
    shifted.masked_fill_(ignored_positions, pad_token_id)
    return shifted
class BlenderbotSmallLearnedPositionalEmbedding(nn.Embedding):
    """
    Learned positional embeddings, available up to a fixed maximum number of positions.
    """

    def __init__(self, num_embeddings: int, embedding_dim: int):
        super().__init__(num_embeddings, embedding_dim)

    def forward(self, input_ids_shape: torch.Size, past_key_values_length: int = 0):
        """
        Look up the embeddings of the positions covered by the current inputs.

        `input_ids_shape` is expected to be [bsz x seqlen]; only the sequence length is used. Positions run from
        `past_key_values_length` up to (but excluding) `past_key_values_length + seqlen`.
        """
        _, seq_len = input_ids_shape[:2]
        first_position = past_key_values_length
        last_position = past_key_values_length + seq_len
        position_ids = torch.arange(first_position, last_position, dtype=torch.long, device=self.weight.device)
        return super().forward(position_ids)
class BlenderbotSmallAttention(nn.Module):
    """Multi-headed attention from the 'Attention Is All You Need' paper."""

    def __init__(
        self,
        embed_dim: int,
        num_heads: int,
        dropout: float = 0.0,
        is_decoder: bool = False,
        bias: bool = True,
        is_causal: bool = False,
        config: Optional[BlenderbotSmallConfig] = None,
    ):
        """
        Create the query/key/value/output projections.

        Raises:
            ValueError: if `embed_dim` is not divisible by `num_heads`.
        """
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads
        self.config = config

        if (self.head_dim * num_heads) != self.embed_dim:
            raise ValueError(
                f"embed_dim必须能被num_heads整除 (当前 `embed_dim`: {self.embed_dim}"
                f" 和 `num_heads`: {num_heads})."
            )
        # Scaling factor 1 / sqrt(head_dim).
        self.scaling = self.head_dim**-0.5
        self.is_decoder = is_decoder
        self.is_causal = is_causal

        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)

    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        """View `tensor` as `(bsz, seq_len, num_heads, head_dim)` and swap axes 1 and 2 into a contiguous tensor."""
        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()

    def forward(
        self,
        hidden_states: torch.Tensor,
        key_value_states: Optional[torch.Tensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ):
        """
        Placeholder: the attention computation is not included in this listing.

        The encoder layer unpacks three values from this call, so a silent `None` return would only surface later
        as an unrelated unpacking error; fail explicitly instead.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError("BlenderbotSmallAttention.forward is not included in this listing.")
class BlenderbotSmallEncoderLayer(nn.Module):
    """One encoder layer: self-attention followed by a two-layer feed-forward block, each with a residual
    connection and a layer norm applied after the addition."""

    def __init__(self, config: BlenderbotSmallConfig):
        super().__init__()
        self.embed_dim = config.d_model

        # The attention class is looked up by the attention implementation named in the config.
        attention_cls = BLENDERBOT_SMALL_ATTENTION_CLASSES[config._attn_implementation]
        self.self_attn = attention_cls(
            embed_dim=self.embed_dim,
            num_heads=config.encoder_attention_heads,
            dropout=config.attention_dropout,
            config=config,
        )
        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)

        self.dropout = config.dropout
        self.activation_fn = ACT2FN[config.activation_function]
        self.activation_dropout = config.activation_dropout

        self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
        self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
        self.final_layer_norm = nn.LayerNorm(self.embed_dim)

    def forward(
        self,
        hidden_states: torch.FloatTensor,
        attention_mask: torch.FloatTensor,
        layer_head_mask: torch.FloatTensor,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.FloatTensor, Optional[torch.FloatTensor]]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
                `(encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.

        Returns:
            `(hidden_states,)`, with the attention weights appended when `output_attentions` is truthy.
        """
        # Self-attention sub-block: attention -> dropout -> residual add -> layer norm.
        attn_output, attn_weights, _ = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            layer_head_mask=layer_head_mask,
            output_attentions=output_attentions,
        )
        attn_output = nn.functional.dropout(attn_output, p=self.dropout, training=self.training)
        hidden_states = self.self_attn_layer_norm(hidden_states + attn_output)

        # Feed-forward sub-block: fc1 -> activation -> dropout -> fc2 -> dropout -> residual add -> layer norm.
        ffn_output = self.activation_fn(self.fc1(hidden_states))
        ffn_output = nn.functional.dropout(ffn_output, p=self.activation_dropout, training=self.training)
        ffn_output = self.fc2(ffn_output)
        ffn_output = nn.functional.dropout(ffn_output, p=self.dropout, training=self.training)
        hidden_states = self.final_layer_norm(hidden_states + ffn_output)

        # In float16, clamp to just inside the representable range when inf or nan values are detected.
        if hidden_states.dtype == torch.float16:
            has_bad_values = torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any()
            if has_bad_values:
                clamp_value = torch.finfo(hidden_states.dtype).max - 1000
                hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)

        if output_attentions:
            return (hidden_states, attn_weights)
        return (hidden_states,)
# Lookup table from attention-implementation name to attention class; the encoder and decoder layers index it
# with `config._attn_implementation`. Only the "eager" implementation is registered here.
BLENDERBOT_SMALL_ATTENTION_CLASSES = {
"eager": BlenderbotSmallAttention,
}
class BlenderbotSmallDecoderLayer(nn.Module):
    """One decoder layer: a causal self-attention module, a second attention module (`encoder_attn`) and a
    two-layer feed-forward block, each paired with its own layer norm."""

    def __init__(self, config: BlenderbotSmallConfig):
        super().__init__()
        self.embed_dim = config.d_model

        # Both attention modules use the class selected by the attention implementation named in the config.
        self.self_attn = BLENDERBOT_SMALL_ATTENTION_CLASSES[config._attn_implementation](
            embed_dim=self.embed_dim,
            num_heads=config.decoder_attention_heads,
            dropout=config.attention_dropout,
            is_decoder=True,
            is_causal=True,
            config=config,
        )
        self.dropout = config.dropout
        self.activation_fn = ACT2FN[config.activation_function]
        self.activation_dropout = config.activation_dropout

        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
        # presumably attends over `encoder_hidden_states` - the forward body is not shown, confirm there.
        self.encoder_attn = BLENDERBOT_SMALL_ATTENTION_CLASSES[config._attn_implementation](
            self.embed_dim,
            config.decoder_attention_heads,
            dropout=config.attention_dropout,
            is_decoder=True,
            config=config,
        )
        self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)
        self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
        self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)
        self.final_layer_norm = nn.LayerNorm(self.embed_dim)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        cross_attn_layer_head_mask: Optional[torch.Tensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = True,
    ):
        """
        Placeholder: the listing breaks off inside this signature, so the layer computation is not available.

        The signature is closed here so that the class parses; calling the method fails explicitly.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError("BlenderbotSmallDecoderLayer.forward is not included in this listing.")
def dummy_inputs(self):
    """
    Return a small fixed batch of dummy inputs built around the configured pad token.

    The returned dict holds `attention_mask` (true wherever the id differs from the pad token), `input_ids`, and
    `decoder_input_ids` (the same tensor object as `input_ids`).
    """
    # NOTE(review): the enclosing class header is not visible in this listing.
    pad_token = self.config.pad_token_id
    token_rows = [[0, 6, 10, 4, 2], [0, 8, 12, 2, pad_token]]
    input_ids = torch.tensor(token_rows, device=self.device)
    not_padding = input_ids.ne(pad_token)
    return {
        "attention_mask": not_padding,
        "input_ids": input_ids,
        "decoder_input_ids": input_ids,
    }
# Shared class-level introduction; passed to `add_start_docstrings` on the conditional-generation model below.
BLENDERBOT_SMALL_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`BlenderbotSmallConfig`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
# Two-turn conversation example; passed to `add_end_docstrings` on the conditional-generation `forward`.
BLENDERBOT_SMALL_GENERATION_EXAMPLE = r"""
Conversation example:
```
>>> from transformers import AutoTokenizer, BlenderbotSmallForConditionalGeneration
>>> mname = "facebook/blenderbot_small-90M"
>>> model = BlenderbotSmallForConditionalGeneration.from_pretrained(mname)
>>> tokenizer = AutoTokenizer.from_pretrained(mname)
>>> UTTERANCE = "My friends are cool but they eat too many carbs."
>>> print("Human: ", UTTERANCE)
Human: My friends are cool but they eat too many carbs.
>>> inputs = tokenizer([UTTERANCE], return_tensors="pt")
>>> reply_ids = model.generate(**inputs)
>>> print("Bot: ", tokenizer.batch_decode(reply_ids, skip_special_tokens=True)[0])
Bot: what kind of carbs do they eat? i don't know much about carbs.
>>> REPLY = "I'm not sure"
>>> print("Human: ", REPLY)
Human: I'm not sure
>>> NEXT_UTTERANCE = (
... "My friends are cool but they eat too many carbs.__end__ __start__what kind of carbs do they eat? "
... "i don't know much about carbs__end__ "
... "__start__ I'm not sure."
... )
>>> inputs = tokenizer([NEXT_UTTERANCE], return_tensors="pt")
>>> next_reply_ids = model.generate(**inputs)
>>> print("Bot: ", tokenizer.batch_decode(next_reply_ids, skip_special_tokens=True)[0])
Bot: they eat a lot of carbs. carbs are high in fat, protein, and fats.
```
"""
# Argument documentation passed to `add_start_docstrings_to_model_forward`; its text is empty in this listing.
BLENDERBOT_SMALL_INPUTS_DOCSTRING = r"""
"""
class BlenderbotSmallEncoder(BlenderbotSmallPreTrainedModel):
    """
    Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
    [`BlenderbotSmallEncoderLayer`].

    Args:
        config: BlenderbotSmallConfig
        embed_tokens (nn.Embedding): output embedding
    """

    def __init__(self, config: BlenderbotSmallConfig, embed_tokens: Optional[nn.Embedding] = None):
        super().__init__(config)

        self.dropout = config.dropout
        self.layerdrop = config.encoder_layerdrop

        embed_dim = config.d_model
        self.padding_idx = config.pad_token_id
        self.max_source_positions = config.max_position_embeddings
        # Embedding scale is sqrt(d_model) when `scale_embedding` is enabled, otherwise 1.
        self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0

        # Reuse the embedding passed in by the caller, or create a fresh one.
        if embed_tokens is not None:
            self.embed_tokens = embed_tokens
        else:
            self.embed_tokens = nn.Embedding(config.vocab_size, embed_dim, self.padding_idx)

        self.embed_positions = BlenderbotSmallLearnedPositionalEmbedding(
            config.max_position_embeddings,
            embed_dim,
        )
        self.layers = nn.ModuleList([BlenderbotSmallEncoderLayer(config) for _ in range(config.encoder_layers)])
        self.layernorm_embedding = nn.LayerNorm(embed_dim)

        self.gradient_checkpointing = False
        self.post_init()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        head_mask=None,
        inputs_embeds=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        """
        Placeholder: the listing breaks off inside this signature, so the encoder computation is not available.

        The signature is closed here so that the class parses; calling the method fails explicitly.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError("BlenderbotSmallEncoder.forward is not included in this listing.")
class BlenderbotSmallDecoder(BlenderbotSmallPreTrainedModel):
    """
    Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a
    [`BlenderbotSmallDecoderLayer`].

    Args:
        config: BlenderbotSmallConfig instance holding the model configuration
        embed_tokens (nn.Embedding): output embedding
    """

    def __init__(self, config: BlenderbotSmallConfig, embed_tokens: Optional[nn.Embedding] = None):
        super().__init__(config)

        self.dropout = config.dropout
        self.layerdrop = config.decoder_layerdrop
        self.padding_idx = config.pad_token_id
        self.max_target_positions = config.max_position_embeddings
        # Embedding scale is sqrt(d_model) when `scale_embedding` is enabled, otherwise 1.
        self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0

        # Reuse the embedding passed in by the caller, or create a fresh one.
        if embed_tokens is not None:
            self.embed_tokens = embed_tokens
        else:
            self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model, self.padding_idx)

        self.embed_positions = BlenderbotSmallLearnedPositionalEmbedding(
            config.max_position_embeddings,
            config.d_model,
        )
        self.layers = nn.ModuleList([BlenderbotSmallDecoderLayer(config) for _ in range(config.decoder_layers)])
        self.layernorm_embedding = nn.LayerNorm(config.d_model)

        self.gradient_checkpointing = False
        self.post_init()

    def get_input_embeddings(self):
        """Return the token embedding module."""
        return self.embed_tokens

    def set_input_embeddings(self, value):
        """Replace the token embedding module with `value`."""
        self.embed_tokens = value

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        head_mask=None,
        cross_attn_head_mask=None,
        past_key_values=None,
        inputs_embeds=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        """
        Placeholder: the decoder computation is not included in this listing.

        A silent `None` return would only surface later as an unrelated error in callers that index the result, so
        fail explicitly instead.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError("BlenderbotSmallDecoder.forward is not included in this listing.")
def get_decoder(self):
# Return the decoder sub-module stored on `self.decoder`.
# NOTE(review): the enclosing class header is not visible in this listing.
return self.decoder
@add_start_docstrings_to_model_forward(BLENDERBOT_SMALL_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=Seq2SeqModelOutput, config_class=_CONFIG_FOR_DOC)
def forward(
    self,
    input_ids: Optional[torch.LongTensor] = None,
    attention_mask: Optional[torch.Tensor] = None,
    decoder_input_ids: Optional[torch.LongTensor] = None,
    decoder_attention_mask: Optional[torch.LongTensor] = None,
    head_mask: Optional[torch.Tensor] = None,
    decoder_head_mask: Optional[torch.Tensor] = None,
    cross_attn_head_mask: Optional[torch.Tensor] = None,
    encoder_outputs: Optional[Union[Tuple, BaseModelOutput]] = None,
    past_key_values: Optional[List[torch.FloatTensor]] = None,
    inputs_embeds: Optional[torch.Tensor] = None,
    decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
    use_cache: Optional[bool] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
):
    r"""
    Placeholder: the listing breaks off inside this signature, so the forward computation is not available. The
    signature is closed here so that the definition parses; calling the method raises `NotImplementedError`.

    Returns:

    """
    # NOTE(review): the bare `Returns:` line above is kept as the placeholder that `replace_return_docstrings`
    # presumably expects (the other decorated forward in this file has one) - confirm before removing it.
    raise NotImplementedError("This forward pass is not included in this listing.")
@add_start_docstrings(
    "The BlenderbotSmall Model with a language modeling head. Can be used for summarization.",
    BLENDERBOT_SMALL_START_DOCSTRING,
)
class BlenderbotSmallForConditionalGeneration(BlenderbotSmallPreTrainedModel):
    # Wraps a `BlenderbotSmallModel` and adds a bias-free linear head plus a `final_logits_bias` buffer.
    base_model_prefix = "model"
    _keys_to_ignore_on_load_missing = ["final_logits_bias"]
    _tied_weights_keys = ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight", "lm_head.weight"]

    def __init__(self, config: BlenderbotSmallConfig):
        super().__init__(config)
        self.model = BlenderbotSmallModel(config)

        # The bias buffer and the head are both sized from the shared embedding table.
        num_embeddings = self.model.shared.num_embeddings
        self.register_buffer("final_logits_bias", torch.zeros((1, num_embeddings)))
        self.lm_head = nn.Linear(config.d_model, num_embeddings, bias=False)

        self.post_init()

    def get_encoder(self):
        """Return the encoder of the wrapped model."""
        return self.model.get_encoder()

    def get_decoder(self):
        """Return the decoder of the wrapped model."""
        return self.model.get_decoder()

    def resize_token_embeddings(self, new_num_tokens: int, pad_to_multiple_of: Optional[int] = None) -> nn.Embedding:
        """Resize the token embeddings through the base class, then resize `final_logits_bias` to match."""
        resized_embeddings = super().resize_token_embeddings(new_num_tokens, pad_to_multiple_of)
        # Use the row count of the resized embedding weight as the new bias width.
        self._resize_final_logits_bias(resized_embeddings.weight.shape[0])
        return resized_embeddings

    def _resize_final_logits_bias(self, new_num_tokens: int) -> None:
        """Truncate `final_logits_bias` or pad it with zeros so that its last dimension equals `new_num_tokens`."""
        current_bias = self.final_logits_bias
        old_num_tokens = current_bias.shape[-1]
        if new_num_tokens > old_num_tokens:
            padding = torch.zeros((1, new_num_tokens - old_num_tokens), device=current_bias.device)
            resized_bias = torch.cat([current_bias, padding], dim=1)
        else:
            resized_bias = current_bias[:, :new_num_tokens]
        self.register_buffer("final_logits_bias", resized_bias)

    def get_output_embeddings(self):
        """Return the language modeling head."""
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        """Replace the language modeling head with `new_embeddings`."""
        self.lm_head = new_embeddings

    @add_start_docstrings_to_model_forward(BLENDERBOT_SMALL_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
    @add_end_docstrings(BLENDERBOT_SMALL_GENERATION_EXAMPLE)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        decoder_head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[Union[Tuple, BaseModelOutput]] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.FloatTensor], Seq2SeqLMOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Returns:
            Tuple of masked language modeling loss and model outputs if not in `return_dict` mode,
            otherwise a `Seq2SeqLMOutput` containing various model outputs.
        """
        if return_dict is None:
            return_dict = self.config.use_return_dict

        if labels is not None:
            # Caching is switched off whenever labels are supplied.
            if use_cache:
                logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.")
            use_cache = False
            # Without explicit decoder inputs, derive them from the labels shifted one position to the right.
            if decoder_input_ids is None and decoder_inputs_embeds is None:
                decoder_input_ids = shift_tokens_right(
                    labels, self.config.pad_token_id, self.config.decoder_start_token_id
                )

        outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            encoder_outputs=encoder_outputs,
            decoder_attention_mask=decoder_attention_mask,
            head_mask=head_mask,
            decoder_head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            decoder_inputs_embeds=decoder_inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        lm_logits = self.lm_head(outputs[0]) + self.final_logits_bias

        masked_lm_loss = None
        if labels is not None:
            # Flatten logits to (N, vocab_size) and labels to (N,) for the cross-entropy loss.
            flat_logits = lm_logits.view(-1, self.config.vocab_size)
            masked_lm_loss = CrossEntropyLoss()(flat_logits, labels.view(-1))

        if return_dict:
            return Seq2SeqLMOutput(
                loss=masked_lm_loss,
                logits=lm_logits,
                past_key_values=outputs.past_key_values,
                decoder_hidden_states=outputs.decoder_hidden_states,
                decoder_attentions=outputs.decoder_attentions,
                cross_attentions=outputs.cross_attentions,
                encoder_last_hidden_state=outputs.encoder_last_hidden_state,
                encoder_hidden_states=outputs.encoder_hidden_states,
                encoder_attentions=outputs.encoder_attentions,
            )

        # Tuple mode: logits replace the first model output, and the loss is prepended when available.
        output = (lm_logits,) + outputs[1:]
        if masked_lm_loss is None:
            return output
        return (masked_lm_loss,) + output

    def prepare_inputs_for_generation(
        self,
        decoder_input_ids,
        past_key_values=None,
        attention_mask=None,
        head_mask=None,
        decoder_head_mask=None,
        cross_attn_head_mask=None,
        use_cache=None,
        encoder_outputs=None,
        **kwargs,
    ):
        """Assemble the keyword arguments for one generation step, trimming decoder ids that are already cached."""
        if past_key_values is not None:
            past_length = past_key_values[0][0].shape[2]
            current_length = decoder_input_ids.shape[1]

            # Drop the cached prefix when the ids are longer than the cache; otherwise keep only the last id.
            remove_prefix_length = past_length if current_length > past_length else current_length - 1
            decoder_input_ids = decoder_input_ids[:, remove_prefix_length:]

        return {
            "input_ids": None,
            "encoder_outputs": encoder_outputs,
            "past_key_values": past_key_values,
            "decoder_input_ids": decoder_input_ids,
            "attention_mask": attention_mask,
            "head_mask": head_mask,
            "decoder_head_mask": decoder_head_mask,
            "cross_attn_head_mask": cross_attn_head_mask,
            "use_cache": use_cache,
        }

    @staticmethod
    def _reorder_cache(past_key_values, beam_idx):
        """Reorder the first two cached tensors of every layer along dim 0 by `beam_idx`; keep the rest as is."""
        reordered_layers = []
        for layer_past in past_key_values:
            reordered_states = tuple(
                past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past[:2]
            )
            reordered_layers.append(reordered_states + layer_past[2:])
        return tuple(reordered_layers)
class BlenderbotSmallDecoderWrapper(BlenderbotSmallPreTrainedModel):
    """
    This wrapper class is a helper class to correctly load pretrained checkpoints when the causal language model is
    used in combination with the EncoderDecoderModel framework.
    """

    def __init__(self, config):
        super().__init__(config)
        self.decoder = BlenderbotSmallDecoder(config)

    def forward(self, *args, **kwargs):
        """Pass every positional and keyword argument to the wrapped decoder and return its result."""
        wrapped_decoder = self.decoder
        return wrapped_decoder(*args, **kwargs)
class BlenderbotSmallForCausalLM(BlenderbotSmallPreTrainedModel):
    # Decoder-only language model: a wrapped `BlenderbotSmallDecoder` followed by a bias-free linear head.
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config):
        # Work on a copy so that the caller's config is not switched to decoder-only mode.
        config = copy.deepcopy(config)
        config.is_decoder = True
        config.is_encoder_decoder = False
        super().__init__(config)
        self.model = BlenderbotSmallDecoderWrapper(config)

        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        self.post_init()

    def get_input_embeddings(self):
        """Return the decoder's token embedding module."""
        return self.model.decoder.embed_tokens

    def set_input_embeddings(self, value):
        """Replace the decoder's token embedding module with `value`."""
        self.model.decoder.embed_tokens = value

    def get_output_embeddings(self):
        """Return the language modeling head."""
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        """Replace the language modeling head with `new_embeddings`."""
        self.lm_head = new_embeddings

    def set_decoder(self, decoder):
        """Replace the wrapped decoder with `decoder`."""
        self.model.decoder = decoder

    def get_decoder(self):
        """Return the wrapped decoder."""
        return self.model.decoder

    @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        r"""
        Run the wrapped decoder, project its first output through the language modeling head and, when `labels`
        is given, compute the cross-entropy loss between the flattened logits and the flattened labels.

        `labels` is consumed here and is not passed to the decoder, whose `forward` has no such parameter.

        Returns:

        """
        # NOTE(review): the bare `Returns:` line above is kept as the placeholder that `replace_return_docstrings`
        # presumably expects (the other decorated forward in this file has one) - confirm before removing it.
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            head_mask=head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        logits = self.lm_head(outputs[0])

        loss = None
        if labels is not None:
            # Flatten logits to (N, vocab_size) and labels to (N,) for the cross-entropy loss.
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.config.vocab_size), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        # assumes the decoder returns a `BaseModelOutputWithPastAndCrossAttentions`-style object when
        # `return_dict` is true - its body is not shown in this listing, TODO confirm.
        return CausalLMOutputWithCrossAttentions(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            cross_attentions=outputs.cross_attentions,
        )

    def prepare_inputs_for_generation(
        self, input_ids, past_key_values=None, attention_mask=None, use_cache=None, **kwargs
    ):
        """
        Assemble the keyword arguments for one generation step.

        An all-ones attention mask is created when none is supplied, and input ids that are already covered by the
        cache are trimmed.
        """
        if attention_mask is None:
            attention_mask = input_ids.new_ones(input_ids.shape)

        if past_key_values:
            past_length = past_key_values[0][0].shape[2]

            # Drop the cached prefix when the ids are longer than the cache; otherwise keep only the last id.
            if input_ids.shape[1] > past_length:
                remove_prefix_length = past_length
            else:
                remove_prefix_length = input_ids.shape[1] - 1

            input_ids = input_ids[:, remove_prefix_length:]

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "past_key_values": past_key_values,
            "use_cache": use_cache,
        }

    @staticmethod
    def _reorder_cache(past_key_values, beam_idx):
        """Reorder every cached tensor of every layer along dim 0 by `beam_idx`."""
        reordered_past = ()
        for layer_past in past_key_values:
            reordered_past += (
                tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
            )
        return reordered_past
.\models\blenderbot_small\modeling_flax_blenderbot_small.py
""" Flax BlenderbotSmall 模型。"""
import math
import random
from functools import partial
from typing import Callable, Optional, Tuple
import flax.linen as nn
import jax
import jax.numpy as jnp
from flax.core.frozen_dict import FrozenDict, freeze, unfreeze
from flax.linen import combine_masks, make_causal_mask
from flax.linen.attention import dot_product_attention_weights
from flax.traverse_util import flatten_dict, unflatten_dict
from jax import lax
from jax.random import PRNGKey
from ...modeling_flax_outputs import (
FlaxBaseModelOutput,
FlaxBaseModelOutputWithPastAndCrossAttentions,
FlaxCausalLMOutputWithCrossAttentions,
FlaxSeq2SeqLMOutput,
FlaxSeq2SeqModelOutput,
)
from ...modeling_flax_utils import (
ACT2FN,
FlaxPreTrainedModel,
append_call_sample_docstring,
append_replace_return_docstrings,
overwrite_call_docstring,
)
from ...utils import add_start_docstrings, logging, replace_return_docstrings
from .configuration_blenderbot_small import BlenderbotSmallConfig
# Module-level logger obtained from the library's logging helper.
logger = logging.get_logger(__name__)
# Checkpoint identifier and configuration class name; presumably consumed by the
# docstring helpers of the model classes -- verify against their usage.
_CHECKPOINT_FOR_DOC = "facebook/blenderbot_small-90M"
_CONFIG_FOR_DOC = "BlenderbotSmallConfig"
# Shared introductory docstring text describing the Flax/JAX nature of the models in this module.
BLENDERBOT_SMALL_START_DOCSTRING = r"""
This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a Flax Linen
[flax.nn.Module](https://flax.readthedocs.io/en/latest/_autosummary/flax.nn.module.html) subclass. Use it as a
regular Flax Module and refer to the Flax documentation for all matter related to general usage and behavior.
Finally, this model supports inherent JAX features such as:
- [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
- [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
- [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
- [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)
"""
# Argument documentation for the full forward pass. The literal holds only a newline here; a stray
# triple-quote delimiter that used to precede this assignment unbalanced every following string
# literal and has been removed.
BLENDERBOT_SMALL_INPUTS_DOCSTRING = r"""
"""
# Argument documentation attached to `encode` through `add_start_docstrings`.
# The two glossary links carry their `#...` anchors and closing parentheses.
BLENDERBOT_SMALL_ENCODE_INPUTS_DOCSTRING = r"""
Args:
input_ids (`jnp.ndarray` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
config.max_position_embeddings - 1]`.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
# Argument documentation attached to `decode` through `add_start_docstrings`.
# As written, the literal contains nothing but a newline.
BLENDERBOT_SMALL_DECODE_INPUTS_DOCSTRING = r"""
"""
# Copied from transformers.models.bart.modeling_flax_bart.shift_tokens_right
def shift_tokens_right(input_ids: jnp.ndarray, pad_token_id: int, decoder_start_token_id: int) -> jnp.ndarray:
    """
    Shift input ids one token to the right.

    The last column of `input_ids` is dropped, `decoder_start_token_id` is placed in the first column, and any
    `-100` left in the shifted result is replaced by `pad_token_id`.
    """
    shifted = (
        jnp.zeros_like(input_ids)
        .at[:, 1:]
        .set(input_ids[:, :-1])  # move every token one position to the right
        .at[:, 0]
        .set(decoder_start_token_id)  # the freed first column holds the start token
    )
    return jnp.where(shifted == -100, pad_token_id, shifted)
# Copied from transformers.models.bart.modeling_flax_bart.FlaxBartAttention with Bart->BlenderbotSmall
class FlaxBlenderbotSmallAttention(nn.Module):
    """
    Multi-head attention submodule: holds the query/key/value/output projections, an optional causal mask and the
    key/value cache update used when decoding step by step.

    NOTE(review): no `__call__` is defined in this class as shown here, although the encoder and decoder layers in
    this file invoke the module directly -- confirm against the upstream `FlaxBartAttention` implementation.
    """

    config: BlenderbotSmallConfig  # model configuration
    embed_dim: int  # total embedding width
    num_heads: int  # number of attention heads
    dropout: float = 0.0  # dropout rate
    causal: bool = False  # when True, a causal mask is built in `setup`
    bias: bool = True  # whether the projection layers carry a bias term
    dtype: jnp.dtype = jnp.float32  # the dtype of the computation

    def setup(self) -> None:
        """
        Create the projection layers, the dropout layer and, for causal attention, the causal mask.

        Raises:
            ValueError: If `embed_dim` is not divisible by `num_heads`.
        """
        self.head_dim = self.embed_dim // self.num_heads
        # The integer division above must be exact, otherwise the heads cannot tile the embedding.
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
                f" and `num_heads`: {self.num_heads})."
            )

        # All four projections share the same construction arguments.
        dense = partial(
            nn.Dense,
            self.embed_dim,
            use_bias=self.bias,
            dtype=self.dtype,
            kernel_init=jax.nn.initializers.normal(self.config.init_std),
        )

        self.q_proj, self.k_proj, self.v_proj = dense(), dense(), dense()
        self.out_proj = dense()

        self.dropout_layer = nn.Dropout(rate=self.dropout)

        if self.causal:
            # Built once for `config.max_position_embeddings` positions.
            self.causal_mask = make_causal_mask(
                jnp.ones((1, self.config.max_position_embeddings), dtype="bool"), dtype="bool"
            )

    def _split_heads(self, hidden_states):
        """Reshape the last axis into `(num_heads, head_dim)`, keeping the first two axes."""
        return hidden_states.reshape(hidden_states.shape[:2] + (self.num_heads, self.head_dim))

    def _merge_heads(self, hidden_states):
        """Inverse of `_split_heads`: collapse the trailing axes back into `embed_dim`."""
        return hidden_states.reshape(hidden_states.shape[:2] + (self.embed_dim,))

    @nn.compact
    def _concatenate_to_cache(self, key, value, query, attention_mask):
        """
        This function takes projected key, value states from a single input token and concatenates the states to cached
        states from previous steps. This function is slightly adapted from the official Flax repository:
        https://github.com/google/flax/blob/491ce18759622506588784b4fca0e4bf05f8c8cd/flax/linen/attention.py

        Returns:
            The tuple `(key, value, attention_mask)`. Once the cache exists, `key` and `value` are the full cached
            tensors and `attention_mask` is combined with a mask hiding cache slots that are not filled yet.
        """
        # detect if we're initializing by absence of existing cache data.
        is_initialized = self.has_variable("cache", "cached_key")
        # On the first call these create zero-filled variables; afterwards they return the existing ones.
        cached_key = self.variable("cache", "cached_key", jnp.zeros, key.shape, key.dtype)
        cached_value = self.variable("cache", "cached_value", jnp.zeros, value.shape, value.dtype)
        cache_index = self.variable("cache", "cache_index", lambda: jnp.array(0, dtype=jnp.int32))

        if is_initialized:
            *batch_dims, max_length, num_heads, depth_per_head = cached_key.value.shape
            # update key, value caches with our new 1d spatial slices
            cur_index = cache_index.value
            indices = (0,) * len(batch_dims) + (cur_index, 0, 0)
            key = lax.dynamic_update_slice(cached_key.value, key, indices)
            value = lax.dynamic_update_slice(cached_value.value, value, indices)
            cached_key.value = key
            cached_value.value = value
            # The query length tells how many cache slots were just written.
            num_updated_cache_vectors = query.shape[1]
            cache_index.value = cache_index.value + num_updated_cache_vectors
            # Only slots that have been written so far (including this step) may be attended to; the remaining
            # slots still hold their initial zeros.
            pad_mask = jnp.broadcast_to(
                jnp.arange(max_length) < cur_index + num_updated_cache_vectors,
                tuple(batch_dims) + (1, num_updated_cache_vectors, max_length),
            )
            attention_mask = combine_masks(pad_mask, attention_mask)
        return key, value, attention_mask
# Copied from transformers.models.bart.modeling_flax_bart.FlaxBartEncoderLayer with Bart->BlenderbotSmall
class FlaxBlenderbotSmallEncoderLayer(nn.Module):
    """
    Single encoder block: self-attention followed by a two-layer feed-forward network. Each sub-block adds its
    input back (residual connection) and is then layer-normalised.
    """

    config: BlenderbotSmallConfig
    dtype: jnp.dtype = jnp.float32

    def setup(self) -> None:
        """Instantiate the attention, normalisation, dropout and feed-forward submodules."""
        cfg = self.config
        self.embed_dim = cfg.d_model
        self.self_attn = FlaxBlenderbotSmallAttention(
            config=cfg,
            embed_dim=self.embed_dim,
            num_heads=cfg.encoder_attention_heads,
            dropout=cfg.attention_dropout,
            dtype=self.dtype,
        )
        self.self_attn_layer_norm = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05)
        self.dropout_layer = nn.Dropout(rate=cfg.dropout)
        self.activation_fn = ACT2FN[cfg.activation_function]
        self.activation_dropout_layer = nn.Dropout(rate=cfg.activation_dropout)
        # Feed-forward network: d_model -> encoder_ffn_dim -> d_model.
        self.fc1 = nn.Dense(
            cfg.encoder_ffn_dim,
            dtype=self.dtype,
            kernel_init=jax.nn.initializers.normal(cfg.init_std),
        )
        self.fc2 = nn.Dense(
            self.embed_dim,
            dtype=self.dtype,
            kernel_init=jax.nn.initializers.normal(cfg.init_std),
        )
        self.final_layer_norm = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05)

    def __call__(
        self,
        hidden_states: jnp.ndarray,
        attention_mask: jnp.ndarray,
        output_attentions: bool = True,
        deterministic: bool = True,
    ) -> Tuple[jnp.ndarray]:
        """
        Run the encoder block.

        Args:
            hidden_states: Input activations.
            attention_mask: Mask forwarded to the self-attention module.
            output_attentions: When True, the attention weights are appended to the returned tuple.
            deterministic: Forwarded to the dropout layers of this block.

        Returns:
            `(hidden_states,)`, or `(hidden_states, attn_weights)` when `output_attentions` is set.
        """
        # Self-attention sub-block.
        attn_output, attn_weights = self.self_attn(hidden_states=hidden_states, attention_mask=attention_mask)
        attn_output = self.dropout_layer(attn_output, deterministic=deterministic)
        hidden_states = self.self_attn_layer_norm(hidden_states + attn_output)

        # Feed-forward sub-block.
        ffn_output = self.activation_fn(self.fc1(hidden_states))
        ffn_output = self.activation_dropout_layer(ffn_output, deterministic=deterministic)
        ffn_output = self.fc2(ffn_output)
        ffn_output = self.dropout_layer(ffn_output, deterministic=deterministic)
        hidden_states = self.final_layer_norm(hidden_states + ffn_output)

        if output_attentions:
            return (hidden_states, attn_weights)
        return (hidden_states,)
# Copied from transformers.models.bart.modeling_flax_bart.FlaxBartEncoderLayerCollection with Bart->BlenderbotSmall
class FlaxBlenderbotSmallEncoderLayerCollection(nn.Module):
    """Stack of `config.encoder_layers` encoder layers applied in sequence, with optional LayerDrop."""

    config: BlenderbotSmallConfig
    dtype: jnp.dtype = jnp.float32  # the dtype of the computation

    def setup(self):
        """Create the encoder layers (named by their index) and read the LayerDrop rate."""
        self.layers = [
            FlaxBlenderbotSmallEncoderLayer(self.config, name=str(i), dtype=self.dtype)
            for i in range(self.config.encoder_layers)
        ]
        self.layerdrop = self.config.encoder_layerdrop

    def __call__(
        self,
        hidden_states,
        attention_mask,
        deterministic: bool = True,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ):
        """
        Apply every encoder layer in order.

        Args:
            hidden_states: Input activations for the first layer.
            attention_mask: Mask forwarded to each layer.
            deterministic: When False, dropout is active and layers may be skipped (LayerDrop).
            output_attentions: Collect the per-layer attention weights.
            output_hidden_states: Collect the hidden states before each layer and after the last one.
            return_dict: Return a `FlaxBaseModelOutput` instead of a tuple.

        Returns:
            A `FlaxBaseModelOutput`, or a tuple of the non-`None` entries of
            `(hidden_states, all_hidden_states, all_attentions)` when `return_dict` is False.
        """
        all_attentions = () if output_attentions else None
        all_hidden_states = () if output_hidden_states else None

        for encoder_layer in self.layers:
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)
            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
            dropout_probability = random.uniform(0, 1)
            if not deterministic and (dropout_probability < self.layerdrop):
                # A skipped layer acts as the identity: carry the current hidden states forward so the next
                # layer (and the final output) never receives `None`. No attention weights exist for it.
                layer_outputs = (hidden_states, None)
            else:
                layer_outputs = encoder_layer(
                    hidden_states,
                    attention_mask,
                    output_attentions,
                    deterministic,
                )
            hidden_states = layer_outputs[0]
            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        outputs = (hidden_states, all_hidden_states, all_attentions)

        if not return_dict:
            return tuple(v for v in outputs if v is not None)

        return FlaxBaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions
        )
# Copied from transformers.models.bart.modeling_flax_bart.FlaxBartDecoderLayer with Bart->BlenderbotSmall
class FlaxBlenderbotSmallDecoderLayer(nn.Module):
    """
    Single decoder block: causal self-attention, optional cross-attention over the encoder output, and a
    two-layer feed-forward network. Each sub-block adds its input back and is then layer-normalised.
    """

    config: BlenderbotSmallConfig
    dtype: jnp.dtype = jnp.float32

    def setup(self) -> None:
        """Instantiate the attention, normalisation, dropout and feed-forward submodules."""
        cfg = self.config
        self.embed_dim = cfg.d_model
        # Self-attention is causal.
        self.self_attn = FlaxBlenderbotSmallAttention(
            config=cfg,
            embed_dim=self.embed_dim,
            num_heads=cfg.decoder_attention_heads,
            dropout=cfg.attention_dropout,
            causal=True,
            dtype=self.dtype,
        )
        self.dropout_layer = nn.Dropout(rate=cfg.dropout)
        self.activation_fn = ACT2FN[cfg.activation_function]
        self.activation_dropout_layer = nn.Dropout(rate=cfg.activation_dropout)

        self.self_attn_layer_norm = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05)
        # Cross-attention over the encoder output is built without the causal flag.
        self.encoder_attn = FlaxBlenderbotSmallAttention(
            config=cfg,
            embed_dim=self.embed_dim,
            num_heads=cfg.decoder_attention_heads,
            dropout=cfg.attention_dropout,
            dtype=self.dtype,
        )
        self.encoder_attn_layer_norm = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05)
        # Feed-forward network: d_model -> decoder_ffn_dim -> d_model.
        self.fc1 = nn.Dense(
            cfg.decoder_ffn_dim,
            dtype=self.dtype,
            kernel_init=jax.nn.initializers.normal(cfg.init_std),
        )
        self.fc2 = nn.Dense(
            self.embed_dim, dtype=self.dtype, kernel_init=jax.nn.initializers.normal(cfg.init_std)
        )
        self.final_layer_norm = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05)

    def __call__(
        self,
        hidden_states: jnp.ndarray,
        attention_mask: jnp.ndarray,
        encoder_hidden_states: Optional[jnp.ndarray] = None,
        encoder_attention_mask: Optional[jnp.ndarray] = None,
        init_cache: bool = False,
        output_attentions: bool = True,
        deterministic: bool = True,
    ) -> Tuple[jnp.ndarray]:
        """
        Run the decoder block.

        Args:
            hidden_states: Input activations.
            attention_mask: Mask forwarded to the self-attention module.
            encoder_hidden_states: Encoder output; cross-attention is skipped when this is `None`.
            encoder_attention_mask: Mask forwarded to the cross-attention module.
            init_cache: Forwarded to the self-attention module.
            output_attentions: When True, self- and cross-attention weights are appended to the result.
            deterministic: Forwarded to the dropout layers of this block.

        Returns:
            `(hidden_states,)`, or `(hidden_states, self_attn_weights, cross_attn_weights)` when
            `output_attentions` is set. `cross_attn_weights` is `None` if cross-attention was skipped.
        """
        # Self-attention sub-block.
        attn_output, self_attn_weights = self.self_attn(
            hidden_states=hidden_states, attention_mask=attention_mask, init_cache=init_cache
        )
        attn_output = self.dropout_layer(attn_output, deterministic=deterministic)
        hidden_states = self.self_attn_layer_norm(hidden_states + attn_output)

        # Cross-attention sub-block, only when an encoder output is available.
        cross_attn_weights = None
        if encoder_hidden_states is not None:
            cross_output, cross_attn_weights = self.encoder_attn(
                hidden_states=hidden_states,
                key_value_states=encoder_hidden_states,
                attention_mask=encoder_attention_mask,
            )
            cross_output = self.dropout_layer(cross_output, deterministic=deterministic)
            hidden_states = self.encoder_attn_layer_norm(hidden_states + cross_output)

        # Feed-forward sub-block.
        ffn_output = self.activation_fn(self.fc1(hidden_states))
        ffn_output = self.activation_dropout_layer(ffn_output, deterministic=deterministic)
        ffn_output = self.fc2(ffn_output)
        ffn_output = self.dropout_layer(ffn_output, deterministic=deterministic)
        hidden_states = self.final_layer_norm(hidden_states + ffn_output)

        if output_attentions:
            return (hidden_states, self_attn_weights, cross_attn_weights)
        return (hidden_states,)
# Copied from transformers.models.bart.modeling_flax_bart.FlaxBartDecoderLayerCollection with Bart->BlenderbotSmall
class FlaxBlenderbotSmallDecoderLayerCollection(nn.Module):
    """Stack of `config.decoder_layers` decoder layers applied in sequence, with optional LayerDrop."""

    config: BlenderbotSmallConfig
    dtype: jnp.dtype = jnp.float32  # the dtype of the computation

    def setup(self):
        """Create the decoder layers (named by their index) and read the LayerDrop rate."""
        self.layers = [
            FlaxBlenderbotSmallDecoderLayer(self.config, name=str(i), dtype=self.dtype)
            for i in range(self.config.decoder_layers)
        ]
        self.layerdrop = self.config.decoder_layerdrop

    def __call__(
        self,
        hidden_states,
        attention_mask,
        encoder_hidden_states: Optional[jnp.ndarray] = None,
        encoder_attention_mask: Optional[jnp.ndarray] = None,
        deterministic: bool = True,
        init_cache: bool = False,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ):
        """
        Apply every decoder layer in order.

        Args:
            hidden_states: Input activations for the first layer.
            attention_mask: Mask forwarded to each layer's self-attention.
            encoder_hidden_states: Encoder output forwarded to each layer's cross-attention.
            encoder_attention_mask: Mask forwarded to each layer's cross-attention.
            deterministic: When False, dropout is active and layers may be skipped (LayerDrop).
            init_cache: Forwarded to each layer.
            output_attentions: Collect per-layer self-attention (and, with an encoder output, cross-attention)
                weights.
            output_hidden_states: Collect the hidden states before each layer and after the last one.
            return_dict: Return a `FlaxBaseModelOutputWithPastAndCrossAttentions` instead of a tuple.

        Returns:
            A `FlaxBaseModelOutputWithPastAndCrossAttentions`, or a tuple of the non-`None` entries of
            `(hidden_states, all_hidden_states, all_self_attns, all_cross_attentions)` when `return_dict` is False.
        """
        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None

        for decoder_layer in self.layers:
            if output_hidden_states:
                all_hidden_states += (hidden_states,)
            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
            dropout_probability = random.uniform(0, 1)
            if not deterministic and (dropout_probability < self.layerdrop):
                # A skipped layer acts as the identity: carry the current hidden states forward so the next
                # layer (and the final output) never receives `None`. No attention weights exist for it.
                layer_outputs = (hidden_states, None, None)
            else:
                layer_outputs = decoder_layer(
                    hidden_states,
                    attention_mask=attention_mask,
                    encoder_hidden_states=encoder_hidden_states,
                    encoder_attention_mask=encoder_attention_mask,
                    init_cache=init_cache,
                    output_attentions=output_attentions,
                    deterministic=deterministic,
                )

            hidden_states = layer_outputs[0]
            if output_attentions:
                all_self_attns += (layer_outputs[1],)

                if encoder_hidden_states is not None:
                    all_cross_attentions += (layer_outputs[2],)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        outputs = [hidden_states, all_hidden_states, all_self_attns, all_cross_attentions]

        if not return_dict:
            return tuple(v for v in outputs if v is not None)

        return FlaxBaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
            cross_attentions=all_cross_attentions,
        )
class FlaxBlenderbotSmallEncoder(nn.Module):
    """Encoder: token plus position embeddings, LayerNorm, dropout, then the stack of encoder layers."""

    config: BlenderbotSmallConfig
    embed_tokens: nn.Embed  # token embedding module supplied by the caller
    dtype: jnp.dtype = jnp.float32  # the dtype of the computation

    def setup(self):
        """Instantiate dropout, position embeddings, the layer stack and the embedding LayerNorm."""
        cfg = self.config
        self.dropout_layer = nn.Dropout(rate=cfg.dropout)

        embed_dim = cfg.d_model
        self.padding_idx = cfg.pad_token_id
        self.max_source_positions = cfg.max_position_embeddings
        # Token embeddings are multiplied by sqrt(d_model) when `scale_embedding` is enabled.
        self.embed_scale = math.sqrt(embed_dim) if cfg.scale_embedding else 1.0

        self.embed_positions = nn.Embed(
            cfg.max_position_embeddings,
            embed_dim,
            embedding_init=jax.nn.initializers.normal(cfg.init_std),
        )
        self.layers = FlaxBlenderbotSmallEncoderLayerCollection(cfg, self.dtype)
        self.layernorm_embedding = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05)

    def __call__(
        self,
        input_ids,
        attention_mask,
        position_ids,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
        deterministic: bool = True,
    ):
        """
        Embed `input_ids` and run the encoder layers.

        Returns:
            Whatever the layer stack returns when `return_dict` is False, otherwise a `FlaxBaseModelOutput`
            rebuilt from the stack's `last_hidden_state`, `hidden_states` and `attentions`.
        """
        # Collapse any leading axes so the ids have shape (-1, last_axis_length).
        flat_ids = input_ids.reshape(-1, input_ids.shape[-1])

        token_embeds = self.embed_tokens(flat_ids) * self.embed_scale
        position_embeds = self.embed_positions(position_ids)

        # Sum the two embeddings, normalise, then apply dropout.
        hidden_states = self.layernorm_embedding(token_embeds + position_embeds)
        hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic)

        outputs = self.layers(
            hidden_states,
            attention_mask,
            deterministic=deterministic,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        if return_dict:
            return FlaxBaseModelOutput(
                last_hidden_state=outputs.last_hidden_state,
                hidden_states=outputs.hidden_states,
                attentions=outputs.attentions,
            )
        return outputs
class FlaxBlenderbotSmallDecoder(nn.Module):
    """Decoder: normalised token embeddings plus position embeddings, dropout, then the stack of decoder layers."""

    config: BlenderbotSmallConfig
    embed_tokens: nn.Embed  # token embedding module supplied by the caller
    # Declared explicitly: `setup` reads `self.dtype` and the parent module passes `dtype=` on construction.
    dtype: jnp.dtype = jnp.float32  # the dtype of the computation

    def setup(self):
        """Instantiate dropout, position embeddings, the layer stack and the embedding LayerNorm."""
        self.dropout_layer = nn.Dropout(rate=self.config.dropout)

        embed_dim = self.config.d_model
        self.padding_idx = self.config.pad_token_id
        self.max_target_positions = self.config.max_position_embeddings
        # Token embeddings are multiplied by sqrt(d_model) when `scale_embedding` is enabled.
        self.embed_scale = math.sqrt(self.config.d_model) if self.config.scale_embedding else 1.0

        self.embed_positions = nn.Embed(
            self.config.max_position_embeddings,
            embed_dim,
            embedding_init=jax.nn.initializers.normal(self.config.init_std),
        )

        self.layers = FlaxBlenderbotSmallDecoderLayerCollection(self.config, self.dtype)
        self.layernorm_embedding = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05)

    def __call__(
        self,
        input_ids,
        attention_mask,
        position_ids,
        encoder_hidden_states: Optional[jnp.ndarray] = None,
        encoder_attention_mask: Optional[jnp.ndarray] = None,
        init_cache: bool = False,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
        deterministic: bool = True,
    ):
        """
        Embed `input_ids` and run the decoder layers.

        Args:
            input_ids: Decoder token ids.
            attention_mask: Mask forwarded to the layer stack.
            position_ids: Indices looked up in the position embedding table.
            encoder_hidden_states: Encoder output forwarded to the layer stack.
            encoder_attention_mask: Encoder mask forwarded to the layer stack.
            init_cache: Forwarded to the layer stack.
            output_attentions: Forwarded to the layer stack.
            output_hidden_states: Forwarded to the layer stack.
            return_dict: Return a `FlaxBaseModelOutputWithPastAndCrossAttentions` instead of a tuple.
            deterministic: Forwarded to dropout and to the layer stack.

        Returns:
            Whatever the layer stack returns when `return_dict` is False, otherwise a
            `FlaxBaseModelOutputWithPastAndCrossAttentions` rebuilt from the stack's output fields.
        """
        input_shape = input_ids.shape
        # Collapse any leading axes so the ids have shape (-1, sequence_length).
        input_ids = input_ids.reshape(-1, input_shape[-1])

        inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale

        # embed positions
        positions = self.embed_positions(position_ids)

        # Unlike the encoder, LayerNorm is applied to the token embeddings *before* positions are added.
        inputs_embeds = self.layernorm_embedding(inputs_embeds)
        hidden_states = inputs_embeds + positions

        hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic)

        outputs = self.layers(
            hidden_states,
            attention_mask,
            encoder_hidden_states,
            encoder_attention_mask,
            deterministic=deterministic,
            init_cache=init_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        if not return_dict:
            return outputs

        return FlaxBaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=outputs.last_hidden_state,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            cross_attentions=outputs.cross_attentions,
        )
# Copied from transformers.models.bart.modeling_flax_bart.FlaxBartModule with Bart->BlenderbotSmall
class FlaxBlenderbotSmallModule(nn.Module):
    """Encoder-decoder module in which both halves use the same token embedding table."""

    config: BlenderbotSmallConfig
    dtype: jnp.dtype = jnp.float32  # the dtype of the computation

    def setup(self):
        """Create the shared embedding and hand it to both the encoder and the decoder."""
        cfg = self.config
        self.shared = nn.Embed(
            cfg.vocab_size,
            cfg.d_model,
            embedding_init=jax.nn.initializers.normal(cfg.init_std),
            dtype=self.dtype,
        )

        self.encoder = FlaxBlenderbotSmallEncoder(cfg, dtype=self.dtype, embed_tokens=self.shared)
        self.decoder = FlaxBlenderbotSmallDecoder(cfg, dtype=self.dtype, embed_tokens=self.shared)

    def _get_encoder_module(self):
        """Return the encoder submodule."""
        return self.encoder

    def _get_decoder_module(self):
        """Return the decoder submodule."""
        return self.decoder

    def __call__(
        self,
        input_ids,
        attention_mask,
        decoder_input_ids,
        decoder_attention_mask,
        position_ids,
        decoder_position_ids,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
        deterministic: bool = True,
    ):
        """
        Run the encoder, then the decoder with cross-attention over the encoder output.

        Returns:
            The decoder outputs followed by the encoder outputs (added with `+`) when `return_dict` is
            False, otherwise a `FlaxSeq2SeqModelOutput` assembled from both.
        """
        # Flags passed identically to both halves.
        common_flags = {
            "output_attentions": output_attentions,
            "output_hidden_states": output_hidden_states,
            "return_dict": return_dict,
            "deterministic": deterministic,
        }

        encoder_outputs = self.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            **common_flags,
        )

        # The first encoder output and the encoder mask are passed to the decoder for cross-attention.
        decoder_outputs = self.decoder(
            input_ids=decoder_input_ids,
            attention_mask=decoder_attention_mask,
            position_ids=decoder_position_ids,
            encoder_hidden_states=encoder_outputs[0],
            encoder_attention_mask=attention_mask,
            **common_flags,
        )

        if return_dict:
            return FlaxSeq2SeqModelOutput(
                last_hidden_state=decoder_outputs.last_hidden_state,
                decoder_hidden_states=decoder_outputs.hidden_states,
                decoder_attentions=decoder_outputs.attentions,
                cross_attentions=decoder_outputs.cross_attentions,
                encoder_last_hidden_state=encoder_outputs.last_hidden_state,
                encoder_hidden_states=encoder_outputs.hidden_states,
                encoder_attentions=encoder_outputs.attentions,
            )
        return decoder_outputs + encoder_outputs
# Base class for the Flax BlenderbotSmall models; it extends FlaxPreTrainedModel.
class FlaxBlenderbotSmallPreTrainedModel(FlaxPreTrainedModel):
# Configuration class associated with this model family.
config_class = BlenderbotSmallConfig
# Prefix under which the base model is stored.
base_model_prefix: str = "model"
# Flax module class instantiated in `__init__`; `None` here, presumably overridden by concrete subclasses.
module_class: nn.Module = None
def __init__(
    self,
    config: BlenderbotSmallConfig,
    input_shape: Tuple[int] = (1, 1),
    seed: int = 0,
    dtype: jnp.dtype = jnp.float32,
    _do_init: bool = True,
    **kwargs,
):
    """
    Instantiate `self.module_class` and hand it to the parent initialiser.

    Args:
        config: Model configuration, also passed to the module.
        input_shape: Forwarded to the parent initialiser.
        seed: Forwarded to the parent initialiser.
        dtype: Passed to the module and to the parent initialiser.
        _do_init: Forwarded to the parent initialiser.
        **kwargs: Extra keyword arguments passed to the module constructor.
    """
    super().__init__(
        config,
        self.module_class(config=config, dtype=dtype, **kwargs),
        input_shape=input_shape,
        seed=seed,
        dtype=dtype,
        _do_init=_do_init,
    )
def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple, params: FrozenDict = None) -> FrozenDict:
    """
    Initialise the module parameters from dummy inputs.

    Args:
        rng: PRNG key, split into a parameter key and a dropout key.
        input_shape: Shape of the dummy `input_ids`; it is unpacked as `(batch_size, sequence_length)`.
        params: Optional existing parameters. When given, only the keys listed in `self._missing_keys` are
            filled in from the freshly initialised values.

    Returns:
        The freshly initialised parameters, or `params` completed with the missing keys (frozen).
    """
    # Dummy ids: all zeros, with the last position set to EOS so that the initialization pass will also work
    # for FlaxBlenderbotSmallForSequenceClassificationModule.
    input_ids = jnp.zeros(input_shape, dtype="i4").at[(..., -1)].set(self.config.eos_token_id)
    attention_mask = jnp.ones_like(input_ids)
    decoder_input_ids = input_ids
    decoder_attention_mask = jnp.ones_like(input_ids)

    batch_size, sequence_length = input_ids.shape
    target_shape = (batch_size, sequence_length)
    position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], target_shape)
    decoder_position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], target_shape)

    params_key, dropout_key = jax.random.split(rng)
    random_params = self.module.init(
        {"params": params_key, "dropout": dropout_key},
        input_ids,
        attention_mask,
        decoder_input_ids,
        decoder_attention_mask,
        position_ids,
        decoder_position_ids,
    )["params"]

    if params is None:
        return random_params

    # Copy the missing entries from the random initialisation into the supplied parameters.
    flat_random = flatten_dict(unfreeze(random_params))
    flat_params = flatten_dict(unfreeze(params))
    for missing_key in self._missing_keys:
        flat_params[missing_key] = flat_random[missing_key]
    self._missing_keys = set()
    return freeze(unflatten_dict(flat_params))
def init_cache(self, batch_size, max_length, encoder_outputs):
    r"""
    Args:
        batch_size (`int`):
            batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache.
        max_length (`int`):
            maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized
            cache.
        encoder_outputs (`Union[FlaxBaseModelOutput, tuple(tuple(jnp.ndarray)]`):
            `encoder_outputs` consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*:
            `attentions`). `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*)
            is a sequence of hidden-states at the output of the last layer of the encoder. Used in the
            cross-attention of the decoder.
    """
    # Dummy decoder inputs covering the full cache length; only their shapes matter for building the cache.
    decoder_input_ids = jnp.ones((batch_size, max_length), dtype="i4")
    decoder_attention_mask = jnp.ones_like(decoder_input_ids)
    cache_length = jnp.atleast_2d(decoder_input_ids).shape[-1]
    decoder_position_ids = jnp.broadcast_to(jnp.arange(cache_length), decoder_input_ids.shape)

    def _decoder_forward(module, decoder_input_ids, decoder_attention_mask, decoder_position_ids, **kwargs):
        # Route the call to the decoder submodule only.
        return module._get_decoder_module()(
            decoder_input_ids,
            decoder_attention_mask,
            decoder_position_ids,
            **kwargs,
        )

    # Only the decoder needs to run to create the cache variables.
    init_variables = self.module.init(
        jax.random.PRNGKey(0),
        decoder_input_ids=decoder_input_ids,
        decoder_attention_mask=decoder_attention_mask,
        decoder_position_ids=decoder_position_ids,
        encoder_hidden_states=encoder_outputs[0],
        init_cache=True,
        method=_decoder_forward,
    )
    return unfreeze(init_variables["cache"])
@add_start_docstrings(BLENDERBOT_SMALL_ENCODE_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=FlaxBaseModelOutput, config_class=BlenderbotSmallConfig)
def encode(
    self,
    input_ids: jnp.ndarray,
    attention_mask: Optional[jnp.ndarray] = None,
    position_ids: Optional[jnp.ndarray] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
    train: bool = False,
    params: dict = None,
    dropout_rng: PRNGKey = None,
):
    r"""
    Returns:
    Example:
    ```
    >>> from transformers import AutoTokenizer, FlaxBlenderbotSmallForConditionalGeneration
    >>> model = FlaxBlenderbotSmallForConditionalGeneration.from_pretrained("facebook/blenderbot_small-90M")
    >>> tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot_small-90M")
    >>> text = "My friends are cool but they eat too many carbs."
    >>> inputs = tokenizer(text, max_length=1024, return_tensors="np")
    >>> encoder_outputs = model.encode(**inputs)
    ```"""
    # Any flag left as None falls back to the value stored in the configuration.
    if output_attentions is None:
        output_attentions = self.config.output_attentions
    if output_hidden_states is None:
        output_hidden_states = self.config.output_hidden_states
    if return_dict is None:
        return_dict = self.config.return_dict

    # Default mask: attend to every position.
    if attention_mask is None:
        attention_mask = jnp.ones_like(input_ids)
    # Default positions: 0..sequence_length-1 for every row.
    if position_ids is None:
        batch_size, sequence_length = input_ids.shape
        position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length))

    # Handle any PRNG if needed
    rngs = {} if dropout_rng is None else {"dropout": dropout_rng}

    def _encoder_forward(module, input_ids, attention_mask, position_ids, **kwargs):
        # Route the call to the encoder submodule only.
        return module._get_encoder_module()(input_ids, attention_mask, position_ids, **kwargs)

    return self.module.apply(
        {"params": params or self.params},
        input_ids=jnp.array(input_ids, dtype="i4"),
        attention_mask=jnp.array(attention_mask, dtype="i4"),
        position_ids=jnp.array(position_ids, dtype="i4"),
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
        deterministic=not train,  # dropout is only active while training
        rngs=rngs,
        method=_encoder_forward,
    )
@add_start_docstrings(BLENDERBOT_SMALL_DECODE_INPUTS_DOCSTRING)
@replace_return_docstrings(
output_type=FlaxBaseModelOutputWithPastAndCrossAttentions, config_class=BlenderbotSmallConfig
)
# Decoding entry point: takes the decoder inputs together with already-computed encoder outputs,
# plus optional masks, position ids, cached key/values and output flags.
# NOTE(review): only the beginning of the signature is present in this listing -- the closing
# parenthesis and the method body are missing, so nothing about its behavior can be stated here.
def decode(
self,
decoder_input_ids,
encoder_outputs,
encoder_attention_mask: Optional[jnp.ndarray] = None,
decoder_attention_mask: Optional[jnp.ndarray] = None,
decoder_position_ids: Optional[jnp.ndarray] = None,
past_key_values: dict = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
train: bool = False,
params: dict = None,
dropout_rng: PRNGKey = None,
def __call__(
    self,
    input_ids: jnp.ndarray,
    attention_mask: Optional[jnp.ndarray] = None,
    decoder_input_ids: Optional[jnp.ndarray] = None,
    decoder_attention_mask: Optional[jnp.ndarray] = None,
    position_ids: Optional[jnp.ndarray] = None,
    decoder_position_ids: Optional[jnp.ndarray] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
    train: bool = False,
    params: dict = None,
    dropout_rng: PRNGKey = None,
):
    """
    Run the full encoder-decoder forward pass through `self.module.apply`.

    Every optional input that is left as `None` is filled with a default: output flags come from
    `self.config`, masks default to all ones, position ids default to `0..seq_len-1`, and
    `decoder_input_ids` defaults to `input_ids` shifted one position to the right.
    """
    # Output flags: fall back to the model config when not given.
    if output_attentions is None:
        output_attentions = self.config.output_attentions
    if output_hidden_states is None:
        output_hidden_states = self.config.output_hidden_states
    if return_dict is None:
        return_dict = self.config.return_dict

    # --- encoder inputs ---
    if attention_mask is None:
        attention_mask = jnp.ones_like(input_ids)
    if position_ids is None:
        enc_rows, enc_len = input_ids.shape
        position_ids = jnp.broadcast_to(jnp.arange(enc_len)[None, :], (enc_rows, enc_len))

    # --- decoder inputs ---
    if decoder_input_ids is None:
        decoder_input_ids = shift_tokens_right(
            input_ids, self.config.pad_token_id, decoder_start_token_id=self.config.decoder_start_token_id
        )
    if decoder_attention_mask is None:
        decoder_attention_mask = jnp.ones_like(decoder_input_ids)
    if decoder_position_ids is None:
        dec_rows, dec_len = decoder_input_ids.shape
        decoder_position_ids = jnp.broadcast_to(jnp.arange(dec_len)[None, :], (dec_rows, dec_len))

    # Only hand a "dropout" RNG stream to Flax when the caller supplied one.
    rngs = {}
    if dropout_rng is not None:
        rngs["dropout"] = dropout_rng

    variables = {"params": params or self.params}
    return self.module.apply(
        variables,
        # Integer inputs are cast to int32 ("i4") before being passed to the module.
        input_ids=jnp.array(input_ids, dtype="i4"),
        attention_mask=jnp.array(attention_mask, dtype="i4"),
        position_ids=jnp.array(position_ids, dtype="i4"),
        decoder_input_ids=jnp.array(decoder_input_ids, dtype="i4"),
        decoder_attention_mask=jnp.array(decoder_attention_mask, dtype="i4"),
        decoder_position_ids=jnp.array(decoder_position_ids, dtype="i4"),
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
        # `deterministic` is the inverse of the `train` flag.
        deterministic=not train,
        rngs=rngs,
    )
# Bare BlenderbotSmall model wrapper; the decorator prepends the shared model documentation.
@add_start_docstrings(
    "The bare BlenderbotSmall Model transformer outputting raw hidden-states without any specific head on top.",
    BLENDERBOT_SMALL_START_DOCSTRING,
)
class FlaxBlenderbotSmallModel(FlaxBlenderbotSmallPreTrainedModel):
    # Model configuration object.
    config: BlenderbotSmallConfig
    # dtype of the computation (float32 by default).
    dtype: jnp.dtype = jnp.float32
    # Flax module class wrapped by this model.
    module_class = FlaxBlenderbotSmallModule


# Attach a sample-call docstring (checkpoint, output type and config names) to the model class.
append_call_sample_docstring(FlaxBlenderbotSmallModel, _CHECKPOINT_FOR_DOC, FlaxSeq2SeqModelOutput, _CONFIG_FOR_DOC)
# Copied from transformers.models.bart.modeling_flax_bart.FlaxBartForConditionalGenerationModule with Bart->BlenderbotSmall
class FlaxBlenderbotSmallForConditionalGenerationModule(nn.Module):
    """
    Encoder-decoder module with a language-modeling head on top.

    The head is a bias-free dense projection to the vocabulary size; a separate `final_logits_bias`
    parameter is added to the logits with its gradient stopped.
    """

    config: BlenderbotSmallConfig
    dtype: jnp.dtype = jnp.float32
    bias_init: Callable[..., jnp.ndarray] = jax.nn.initializers.zeros

    def setup(self):
        """Create the backbone, the LM head and the final logits bias."""
        self.model = FlaxBlenderbotSmallModule(config=self.config, dtype=self.dtype)
        # The head projects to as many classes as the shared embedding has rows.
        vocab_size = self.model.shared.num_embeddings
        self.lm_head = nn.Dense(
            vocab_size,
            use_bias=False,
            dtype=self.dtype,
            kernel_init=jax.nn.initializers.normal(self.config.init_std),
        )
        self.final_logits_bias = self.param("final_logits_bias", self.bias_init, (1, vocab_size))

    def _get_encoder_module(self):
        """Return the encoder sub-module of the backbone."""
        return self.model.encoder

    def _get_decoder_module(self):
        """Return the decoder sub-module of the backbone."""
        return self.model.decoder

    def __call__(
        self,
        input_ids,
        attention_mask,
        decoder_input_ids,
        decoder_attention_mask,
        position_ids,
        decoder_position_ids,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
        deterministic: bool = True,
    ):
        """
        Run the backbone and project its first output to vocabulary logits.

        Returns a `FlaxSeq2SeqLMOutput` when `return_dict` is true, otherwise a tuple whose first
        element is the logits followed by the remaining backbone outputs.
        """
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            position_ids=position_ids,
            decoder_position_ids=decoder_position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            deterministic=deterministic,
        )
        hidden_states = outputs[0]

        if not self.config.tie_word_embeddings:
            lm_logits = self.lm_head(hidden_states)
        else:
            # Tied weights: reuse the transposed shared embedding matrix as the head's kernel.
            shared_embedding = self.model.variables["params"]["shared"]["embedding"]
            lm_logits = self.lm_head.apply({"params": {"kernel": shared_embedding.T}}, hidden_states)

        # The bias is added with stop_gradient, so no gradient flows into it from here.
        lm_logits = lm_logits + jax.lax.stop_gradient(self.final_logits_bias.astype(self.dtype))

        if return_dict:
            return FlaxSeq2SeqLMOutput(
                logits=lm_logits,
                decoder_hidden_states=outputs.decoder_hidden_states,
                decoder_attentions=outputs.decoder_attentions,
                cross_attentions=outputs.cross_attentions,
                encoder_last_hidden_state=outputs.encoder_last_hidden_state,
                encoder_hidden_states=outputs.encoder_hidden_states,
                encoder_attentions=outputs.encoder_attentions,
            )
        return (lm_logits,) + outputs[1:]
@add_start_docstrings(
    "The BLENDERBOT_SMALL Model with a language modeling head. Can be used for summarization.",
    BLENDERBOT_SMALL_START_DOCSTRING,
)
class FlaxBlenderbotSmallForConditionalGeneration(FlaxBlenderbotSmallPreTrainedModel):
    module_class = FlaxBlenderbotSmallForConditionalGenerationModule
    dtype: jnp.dtype = jnp.float32

    @add_start_docstrings(BLENDERBOT_SMALL_DECODE_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=FlaxCausalLMOutputWithCrossAttentions, config_class=BlenderbotSmallConfig)
    def decode(
        self,
        decoder_input_ids,
        encoder_outputs,
        encoder_attention_mask: Optional[jnp.ndarray] = None,
        decoder_attention_mask: Optional[jnp.ndarray] = None,
        decoder_position_ids: Optional[jnp.ndarray] = None,
        past_key_values: dict = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        deterministic: bool = True,
        params: dict = None,
        dropout_rng: PRNGKey = None,
    ):
        """
        Decode step of the conditional-generation model.

        Args:
            decoder_input_ids: Decoder input token ids.
            encoder_outputs: Outputs previously produced by the encoder.
            encoder_attention_mask: Optional mask over encoder positions.
            decoder_attention_mask: Optional mask over decoder positions.
            decoder_position_ids: Optional position ids for the decoder inputs.
            past_key_values: Optional cache of key/value states for fast decoding.
            output_attentions: Whether to return attention weights.
            output_hidden_states: Whether to return hidden states.
            return_dict: Whether to return a structured output instead of a tuple.
            deterministic: Whether to run deterministically.
            params: Optional parameters to use for the model.
            dropout_rng: PRNG key for dropout.
        Returns:
            FlaxCausalLMOutputWithCrossAttentions: logits, past key values and, optionally, attentions and hidden states.
        """
        # NOTE(review): the implementation is elided in this listing; as written the method
        # does nothing and returns None.
        pass

    def prepare_inputs_for_generation(
        self,
        decoder_input_ids,
        max_length,
        attention_mask: Optional[jax.Array] = None,
        decoder_attention_mask: Optional[jax.Array] = None,
        encoder_outputs=None,
        **kwargs,
    ):
        """
        Build the keyword arguments used for the first generation step.

        Args:
            decoder_input_ids: Decoder input token ids of shape `(batch_size, seq_length)`.
            max_length: Length the cache and the decoder attention mask are allocated for.
            attention_mask: Optional encoder attention mask, forwarded as `encoder_attention_mask`.
            decoder_attention_mask: Optional decoder attention mask for the tokens seen so far.
            encoder_outputs: Optional encoder outputs, forwarded unchanged.
            **kwargs: Accepted but not used.
        Returns:
            dict: `past_key_values`, `encoder_outputs`, `encoder_attention_mask`,
            `decoder_attention_mask` and `decoder_position_ids`.
        """
        batch_size, seq_length = decoder_input_ids.shape

        # Allocate the decoding cache.
        past_key_values = self.init_cache(batch_size, max_length, encoder_outputs)

        # Usually one would have to put 0's in the attention mask for positions beyond the current
        # input and inside the cache length. Since the decoder uses a causal mask, those positions
        # are masked anyway, so a single static all-ones mask is used, which is more efficient
        # for compilation.
        extended_attention_mask = jnp.ones((batch_size, max_length), dtype="i4")

        if decoder_attention_mask is None:
            # No mask given: positions are simply 0..seq_length-1 for every row.
            position_ids = jnp.broadcast_to(jnp.arange(seq_length, dtype="i4")[None, :], (batch_size, seq_length))
        else:
            # Positions are the running count of attended tokens, minus one.
            position_ids = decoder_attention_mask.cumsum(axis=-1) - 1
            # Copy the provided mask into the start of the static full-length mask.
            extended_attention_mask = lax.dynamic_update_slice(
                extended_attention_mask, decoder_attention_mask, (0, 0)
            )

        return {
            "past_key_values": past_key_values,
            "encoder_outputs": encoder_outputs,
            "encoder_attention_mask": attention_mask,
            "decoder_attention_mask": extended_attention_mask,
            "decoder_position_ids": position_ids,
        }

    def update_inputs_for_generation(self, model_outputs, model_kwargs):
        """
        Refresh `model_kwargs` in place after a generation step and return it.

        The cache is replaced by the one in `model_outputs`, and the decoder position ids become
        the last position of the previous step plus one.
        """
        model_kwargs["past_key_values"] = model_outputs.past_key_values
        previous_positions = model_kwargs["decoder_position_ids"]
        model_kwargs["decoder_position_ids"] = previous_positions[:, -1:] + 1
        return model_kwargs
# 导入所需的库和模型
>>> from transformers import AutoTokenizer, FlaxBlenderbotSmallForConditionalGeneration
# 使用预训练的 Blenderbot 模型初始化生成模型
>>> model = FlaxBlenderbotSmallForConditionalGeneration.from_pretrained("facebook/blenderbot_small-90M")
# 使用预训练的 tokenizer 初始化分词器
>>> tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot_small-90M")
# 待总结的文章内容
>>> ARTICLE_TO_SUMMARIZE = "My friends are cool but they eat too many carbs."
# 使用 tokenizer 处理文章,限定最大长度为 1024,并转换为 NumPy 数据结构
>>> inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors="np")
# 生成摘要
>>> summary_ids = model.generate(inputs["input_ids"]).sequences
# 解码生成的摘要内容,去除特殊标记并保留原始分词方式
>>> print(tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False))
# 掩码填充示例:
>>> from transformers import AutoTokenizer, FlaxBlenderbotSmallForConditionalGeneration
# 使用预训练的 tokenizer 初始化分词器
>>> tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot_small-90M")
# 待处理的文本带有掩码标记
>>> TXT = "My friends are <mask> but they eat too many carbs."
# 使用预训练的 Blenderbot 模型初始化生成模型
>>> model = FlaxBlenderbotSmallForConditionalGeneration.from_pretrained("facebook/blenderbot_small-90M")
# 将文本转换为输入的 token IDs,并转换为 NumPy 数据结构
>>> input_ids = tokenizer([TXT], return_tensors="np")["input_ids"]
# 获取模型的 logits
>>> logits = model(input_ids).logits
# 确定掩码位置的索引
>>> masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero()[0].item()
# 对 logits 应用 softmax 函数,沿着指定的轴计算概率
>>> probs = jax.nn.softmax(logits[0, masked_index], axis=0)
# 获取概率最高的前 k 个预测结果和它们的值
>>> values, predictions = jax.lax.top_k(probs, k=1)
# 解码预测结果并按空格分割成单词列表
>>> tokenizer.decode(predictions).split()
# Overwrite the `__call__` docstring of FlaxBlenderbotSmallForConditionalGeneration with the shared
# inputs docstring followed by the conditional-generation examples.
overwrite_call_docstring(
    FlaxBlenderbotSmallForConditionalGeneration,
    BLENDERBOT_SMALL_INPUTS_DOCSTRING + FLAX_BLENDERBOT_SMALL_CONDITIONAL_GENERATION_DOCSTRING,
)

# Append/replace the return section of that docstring, using FlaxSeq2SeqLMOutput as the output type
# and _CONFIG_FOR_DOC as the config class.
append_replace_return_docstrings(
    FlaxBlenderbotSmallForConditionalGeneration, output_type=FlaxSeq2SeqLMOutput, config_class=_CONFIG_FOR_DOC
)
.\models\blenderbot_small\modeling_tf_blenderbot_small.py
""" TF 2.0 BlenderbotSmall 模型。"""
from __future__ import annotations
import random
from typing import List, Optional, Tuple, Union
import numpy as np
import tensorflow as tf
from ...activations_tf import get_tf_activation
from ...modeling_tf_outputs import (
TFBaseModelOutput,
TFBaseModelOutputWithPastAndCrossAttentions,
TFSeq2SeqLMOutput,
TFSeq2SeqModelOutput,
)
from ...modeling_tf_utils import (
TFCausalLanguageModelingLoss,
TFPreTrainedModel,
keras,
keras_serializable,
unpack_inputs,
)
from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax
from ...utils import (
add_code_sample_docstrings,
add_end_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_blenderbot_small import BlenderbotSmallConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "facebook/blenderbot_small-90M"
_CONFIG_FOR_DOC = "BlenderbotSmallConfig"
LARGE_NEGATIVE = -1e8
def shift_tokens_right(input_ids: tf.Tensor, pad_token_id: int, decoder_start_token_id: int):
    """
    Shift `input_ids` one position to the right along the last axis.

    The first column becomes `decoder_start_token_id`, the last input column is dropped, and any
    `-100` value in the shifted result is replaced by `pad_token_id`. A runtime assertion checks
    that the returned ids are all >= 0.
    """
    ids_dtype = input_ids.dtype
    pad_token_id = tf.cast(pad_token_id, ids_dtype)
    decoder_start_token_id = tf.cast(decoder_start_token_id, ids_dtype)

    # One start token per batch row, prepended to everything but the last input column.
    batch_size = shape_list(input_ids)[0]
    start_column = tf.fill((batch_size, 1), tf.convert_to_tensor(decoder_start_token_id, ids_dtype))
    shifted = tf.concat([start_column, input_ids[:, :-1]], -1)

    # Replace the -100 placeholder values with the padding id.
    is_placeholder = shifted == -100
    pad_fill = tf.fill(shape_list(shifted), tf.convert_to_tensor(pad_token_id, ids_dtype))
    shifted = tf.where(is_placeholder, pad_fill, shifted)

    # Tie the non-negativity assertion to the returned tensor so it runs whenever the result is used.
    non_negative = tf.debugging.assert_greater_equal(shifted, tf.constant(0, dtype=ids_dtype))
    with tf.control_dependencies([non_negative]):
        shifted = tf.identity(shifted)
    return shifted
def _make_causal_mask(input_ids_shape: tf.TensorShape, past_key_values_length: int = 0):
    """
    Build the additive causal mask used for uni-directional self-attention.

    Entry `[i, j]` is 0.0 when `j <= i` and `LARGE_NEGATIVE` otherwise. When
    `past_key_values_length > 0`, that many all-zero columns are prepended. The mask is then tiled
    over the batch, giving shape `(bsz, 1, tgt_len, tgt_len + past_key_values_length)`.
    """
    bsz, tgt_len = input_ids_shape[0], input_ids_shape[1]

    # Start fully blocked, then open up the lower triangle (diagonal included).
    blocked = tf.ones((tgt_len, tgt_len)) * LARGE_NEGATIVE
    side = shape_list(blocked)[-1]
    column_index = tf.range(side)
    row_limit = tf.reshape(column_index + 1, (side, 1))
    causal = tf.where(column_index < row_limit, 0.0, blocked)

    if past_key_values_length > 0:
        # Columns for cached positions are never blocked.
        causal = tf.concat([tf.zeros((tgt_len, past_key_values_length)), causal], axis=-1)

    return tf.tile(causal[None, None, :, :], (bsz, 1, 1, 1))
def _expand_mask(mask: tf.Tensor, tgt_len: Optional[int] = None):
    """
    Expand an attention mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.

    The mask is cast to float, repeated `tgt_len` times along a new target axis (defaulting to the
    source length), and inverted: entries equal to 1 become 0.0 and entries equal to 0 become
    `LARGE_NEGATIVE`.
    """
    src_len = shape_list(mask)[1]
    if tgt_len is None:
        tgt_len = src_len

    one_cst = tf.constant(1.0)
    float_mask = tf.cast(mask, dtype=one_cst.dtype)
    tiled = tf.tile(float_mask[:, None, None, :], (1, 1, tgt_len, 1))

    inverted = one_cst - tiled
    return inverted * LARGE_NEGATIVE
class TFBlenderbotSmallLearnedPositionalEmbedding(keras.layers.Embedding):
    """
    Embedding layer that learns one vector per position, up to a fixed maximum size.
    """

    def __init__(self, num_embeddings: int, embedding_dim: int, **kwargs):
        super().__init__(num_embeddings, embedding_dim, **kwargs)

    def call(
        self, input_shape: tf.TensorShape, past_key_values_length: int = 0, position_ids: tf.Tensor | None = None
    ):
        """
        Look up position embeddings; `input_shape` is expected to be `[bsz x seqlen]`.

        When `position_ids` is not given, positions `0..seqlen-1` offset by
        `past_key_values_length` are used. Explicit `position_ids` are used as passed.
        """
        if position_ids is None:
            from_zero = tf.range(input_shape[1], delta=1, name="range")
            position_ids = from_zero + past_key_values_length

        lookup_ids = tf.cast(position_ids, dtype=tf.int32)
        return super().call(lookup_ids)
# Attention layer holding four Dense projections (k_proj, q_proj, v_proj, out_proj) and a dropout layer.
class TFBlenderbotSmallAttention(keras.layers.Layer):
"""Multi-headed attention from "Attention Is All You Need"""
# Store the dimensions, validate that embed_dim splits evenly across heads, and create the sublayers.
def __init__(
self,
embed_dim: int,
num_heads: int,
dropout: float = 0.0,
is_decoder: bool = False,
bias: bool = True,
**kwargs,
):
super().__init__(**kwargs)
self.embed_dim = embed_dim
self.num_heads = num_heads
self.dropout = keras.layers.Dropout(dropout)
# Per-head width; integer division, so the check below catches a non-divisible embed_dim.
self.head_dim = embed_dim // num_heads
if (self.head_dim * num_heads) != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
f" and `num_heads`: {num_heads})."
)
# 1 / sqrt(head_dim)
self.scaling = self.head_dim**-0.5
self.is_decoder = is_decoder
# `bias` controls whether each projection has a bias term.
self.k_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj")
self.q_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="q_proj")
self.v_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="v_proj")
self.out_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="out_proj")
# Reshape to (bsz, seq_len, num_heads, head_dim), then swap axes 1 and 2 to get
# (bsz, num_heads, seq_len, head_dim).
def _shape(self, tensor: tf.Tensor, seq_len: int, bsz: int):
return tf.transpose(tf.reshape(tensor, (bsz, seq_len, self.num_heads, self.head_dim)), (0, 2, 1, 3))
# NOTE(review): only the signature of `call` is present in this listing; its body is missing.
def call(
self,
hidden_states: tf.Tensor,
key_value_states: tf.Tensor | None = None,
past_key_value: Tuple[Tuple[tf.Tensor]] | None = None,
attention_mask: tf.Tensor | None = None,
layer_head_mask: tf.Tensor | None = None,
training: Optional[bool] = False,
):
# Build each projection under its own name scope with input shape [None, None, embed_dim].
def build(self, input_shape=None):
# Already built: nothing to do.
if self.built:
return
self.built = True
if getattr(self, "k_proj", None) is not None:
with tf.name_scope(self.k_proj.name):
self.k_proj.build([None, None, self.embed_dim])
if getattr(self, "q_proj", None) is not None:
with tf.name_scope(self.q_proj.name):
self.q_proj.build([None, None, self.embed_dim])
if getattr(self, "v_proj", None) is not None:
with tf.name_scope(self.v_proj.name):
self.v_proj.build([None, None, self.embed_dim])
if getattr(self, "out_proj", None) is not None:
with tf.name_scope(self.out_proj.name):
self.out_proj.build([None, None, self.embed_dim])
class TFBlenderbotSmallEncoderLayer(keras.layers.Layer):
    """
    One encoder layer: self-attention followed by a two-layer feed-forward block.

    Each sub-block is followed by dropout, a residual connection and layer normalization.
    """

    def __init__(self, config: BlenderbotSmallConfig, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = config.d_model
        self.self_attn = TFBlenderbotSmallAttention(
            self.embed_dim, config.encoder_attention_heads, dropout=config.attention_dropout, name="self_attn"
        )
        self.self_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm")
        self.dropout = keras.layers.Dropout(config.dropout)
        self.activation_fn = get_tf_activation(config.activation_function)
        self.activation_dropout = keras.layers.Dropout(config.activation_dropout)
        self.fc1 = keras.layers.Dense(config.encoder_ffn_dim, name="fc1")
        self.fc2 = keras.layers.Dense(self.embed_dim, name="fc2")
        self.final_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")
        self.config = config

    def call(
        self,
        hidden_states: tf.Tensor,
        attention_mask: np.ndarray | tf.Tensor | None,
        layer_head_mask: tf.Tensor | None,
        training: Optional[bool] = False,
    ) -> Tuple[tf.Tensor, tf.Tensor]:
        """
        Args:
            hidden_states (`tf.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`tf.Tensor`): attention mask of size `(batch, 1, tgt_len, src_len)` where padding
                elements are indicated by very large negative values.
            layer_head_mask (`tf.Tensor`): mask for attention heads in a given layer of size
                `(encoder_attention_heads,)`
            training (`bool`, *optional*): forwarded to the dropout layers.

        Returns:
            A tuple `(hidden_states, self_attn_weights)`.
        """
        # --- self-attention block ---
        residual = hidden_states
        hidden_states, self_attn_weights, _ = self.self_attn(
            hidden_states=hidden_states, attention_mask=attention_mask, layer_head_mask=layer_head_mask
        )

        # The residual addition below requires the attention output to keep the input shape.
        tf.debugging.assert_equal(
            shape_list(hidden_states),
            shape_list(residual),
            message=f"Self attn modified the shape of query {shape_list(residual)} to {shape_list(hidden_states)}",
        )

        hidden_states = self.dropout(hidden_states, training=training)
        hidden_states = residual + hidden_states
        hidden_states = self.self_attn_layer_norm(hidden_states)

        # --- feed-forward block ---
        residual = hidden_states
        hidden_states = self.activation_fn(self.fc1(hidden_states))
        hidden_states = self.activation_dropout(hidden_states, training=training)
        hidden_states = self.fc2(hidden_states)
        hidden_states = self.dropout(hidden_states, training=training)
        hidden_states = residual + hidden_states
        hidden_states = self.final_layer_norm(hidden_states)

        return hidden_states, self_attn_weights

    def build(self, input_shape=None):
        """Build every sublayer under its own name scope; does nothing if already built."""
        if self.built:
            return
        self.built = True

        # (attribute name, shape passed to that sublayer's build), in the order they are built.
        build_plan = (
            ("self_attn", None),
            ("self_attn_layer_norm", [None, None, self.embed_dim]),
            ("fc1", [None, None, self.embed_dim]),
            ("fc2", [None, None, self.config.encoder_ffn_dim]),
            ("final_layer_norm", [None, None, self.embed_dim]),
        )
        for attr_name, build_shape in build_plan:
            sublayer = getattr(self, attr_name, None)
            if sublayer is None:
                continue
            with tf.name_scope(sublayer.name):
                sublayer.build(build_shape)
# Decoder layer holding a self-attention block, an encoder-attention block and a feed-forward block,
# each with its own layer normalization.
class TFBlenderbotSmallDecoderLayer(keras.layers.Layer):
# Create all sublayers from the values in `config`.
def __init__(self, config: BlenderbotSmallConfig, **kwargs):
super().__init__(**kwargs)
self.embed_dim = config.d_model
# Attention over the decoder's own hidden states.
self.self_attn = TFBlenderbotSmallAttention(
embed_dim=self.embed_dim,
num_heads=config.decoder_attention_heads,
dropout=config.attention_dropout,
name="self_attn",
is_decoder=True,
)
self.dropout = keras.layers.Dropout(config.dropout)
self.activation_fn = get_tf_activation(config.activation_function)
self.activation_dropout = keras.layers.Dropout(config.activation_dropout)
self.self_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm")
# Second attention module, named "encoder_attn".
self.encoder_attn = TFBlenderbotSmallAttention(
self.embed_dim,
config.decoder_attention_heads,
dropout=config.attention_dropout,
name="encoder_attn",
is_decoder=True,
)
self.encoder_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="encoder_attn_layer_norm")
# Feed-forward block: d_model -> decoder_ffn_dim -> d_model.
self.fc1 = keras.layers.Dense(config.decoder_ffn_dim, name="fc1")
self.fc2 = keras.layers.Dense(self.embed_dim, name="fc2")
self.final_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")
self.config = config
# NOTE(review): only part of the `call` signature is present in this listing -- the closing
# parenthesis and the body are missing.
def call(
self,
hidden_states: tf.Tensor,
attention_mask: np.ndarray | tf.Tensor | None = None,
encoder_hidden_states: np.ndarray | tf.Tensor | None = None,
encoder_attention_mask: np.ndarray | tf.Tensor | None = None,
layer_head_mask: tf.Tensor | None = None,
cross_attn_layer_head_mask: tf.Tensor | None = None,
past_key_value: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None,
training: Optional[bool] = False,
# Build every sublayer under its own name scope.
def build(self, input_shape=None):
# Already built: nothing to do.
if self.built:
return
self.built = True
if getattr(self, "self_attn", None) is not None:
with tf.name_scope(self.self_attn.name):
self.self_attn.build(None)
if getattr(self, "self_attn_layer_norm", None) is not None:
with tf.name_scope(self.self_attn_layer_norm.name):
self.self_attn_layer_norm.build([None, None, self.embed_dim])
if getattr(self, "encoder_attn", None) is not None:
with tf.name_scope(self.encoder_attn.name):
self.encoder_attn.build(None)
if getattr(self, "encoder_attn_layer_norm", None) is not None:
with tf.name_scope(self.encoder_attn_layer_norm.name):
self.encoder_attn_layer_norm.build([None, None, self.embed_dim])
if getattr(self, "fc1", None) is not None:
with tf.name_scope(self.fc1.name):
self.fc1.build([None, None, self.embed_dim])
# fc2 consumes the output of fc1, hence the decoder_ffn_dim input width.
if getattr(self, "fc2", None) is not None:
with tf.name_scope(self.fc2.name):
self.fc2.build([None, None, self.config.decoder_ffn_dim])
if getattr(self, "final_layer_norm", None) is not None:
with tf.name_scope(self.final_layer_norm.name):
self.final_layer_norm.build([None, None, self.embed_dim])
# Base class for the TF BlenderbotSmall models.
# NOTE(review): only these two class attributes are visible in this listing.
class TFBlenderbotSmallPreTrainedModel(TFPreTrainedModel):
# Configuration class associated with this model family.
config_class = BlenderbotSmallConfig
# Prefix string for the base model ("model").
base_model_prefix = "model"
>>> UTTERANCE = "My friends are cool but they eat too many carbs."
>>> print("Human: ", UTTERANCE)
打印出人类的发言
>>> inputs = tokenizer([UTTERANCE], return_tensors="tf")
使用分词器对发言进行处理,返回模型输入的张量表示
>>> reply_ids = model.generate(**inputs)
使用模型生成回复
>>> print("Bot: ", tokenizer.batch_decode(reply_ids, skip_special_tokens=True)[0])
打印出生成的机器人回复,跳过特殊标记后的解码结果
>>> REPLY = "I'm not sure"
>>> print("Human: ", REPLY)
打印出人类的回复
>>> NEXT_UTTERANCE = (
... "My friends are cool but they eat too many carbs.</s> "
... "<s>what kind of carbs do they eat? i don't know much about carbs.</s> "
... "<s>I'm not sure."
... )
设置下一轮对话的文本
>>> inputs = tokenizer([NEXT_UTTERANCE], return_tensors="tf")
使用分词器处理下一轮对话文本,返回模型输入的张量表示
>>> inputs.pop("token_type_ids")
移除张量表示中的token_type_ids(标记类型标识符)
>>> next_reply_ids = model.generate(**inputs)
使用模型生成下一轮对话的回复
>>> print("Bot: ", tokenizer.batch_decode(next_reply_ids, skip_special_tokens=True)[0])
打印出生成的机器人回复,跳过特殊标记后的解码结果
"""
BLENDERBOT_SMALL_INPUTS_DOCSTRING = r"""
"""
@keras_serializable
class TFBlenderbotSmallEncoder(keras.layers.Layer):
    config_class = BlenderbotSmallConfig
    """
    Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
    [`TFBlenderbotSmallEncoderLayer`].
    Args:
        config: BlenderbotSmallConfig
    """

    def __init__(self, config: BlenderbotSmallConfig, embed_tokens: Optional[keras.layers.Embedding] = None, **kwargs):
        super().__init__(**kwargs)
        self.config = config
        # Dropout layer using the rate from the config.
        self.dropout = keras.layers.Dropout(config.dropout)
        # Values copied from the config for later use.
        self.layerdrop = config.encoder_layerdrop
        self.padding_idx = config.pad_token_id
        self.max_source_positions = config.max_position_embeddings
        # Embeddings are scaled by sqrt(d_model) only when the config asks for it.
        if config.scale_embedding:
            self.embed_scale = tf.math.sqrt(float(config.d_model))
        else:
            self.embed_scale = 1.0
        # Token embedding layer supplied by the caller (may be None).
        self.embed_tokens = embed_tokens
        # Learned position embeddings with one row per position up to max_position_embeddings.
        self.embed_positions = TFBlenderbotSmallLearnedPositionalEmbedding(
            config.max_position_embeddings,
            config.d_model,
            name="embed_positions",
        )
        # One encoder layer per configured layer, named "layers.<index>".
        self.layers = [TFBlenderbotSmallEncoderLayer(config, name=f"layers.{i}") for i in range(config.encoder_layers)]
        self.layernorm_embedding = keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding")
        self.embed_dim = config.d_model

    def get_embed_tokens(self):
        """Return the token embedding layer."""
        return self.embed_tokens

    def set_embed_tokens(self, embed_tokens):
        """Replace the token embedding layer."""
        self.embed_tokens = embed_tokens

    @unpack_inputs
    def call(
        self,
        input_ids=None,
        inputs_embeds=None,
        attention_mask=None,
        head_mask=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        training=False,
    ):
        """
        Forward pass of the encoder.

        Args:
            input_ids: input token ids
            inputs_embeds: embedded representation of the inputs
            attention_mask: attention mask
            head_mask: mask for the attention heads
            output_attentions: whether to return attention weights
            output_hidden_states: whether to return hidden states
            return_dict: whether to return a dict-style result
            training: whether the layer runs in training mode
        """
        # NOTE(review): the forward-pass implementation is elided in this listing; as written the
        # method returns None.

    def build(self, input_shape=None):
        """Build the position embeddings, the embedding layer norm and every encoder layer."""
        if self.built:
            return
        self.built = True

        position_embeddings = getattr(self, "embed_positions", None)
        if position_embeddings is not None:
            with tf.name_scope(position_embeddings.name):
                position_embeddings.build(None)

        embedding_norm = getattr(self, "layernorm_embedding", None)
        if embedding_norm is not None:
            with tf.name_scope(embedding_norm.name):
                embedding_norm.build([None, None, self.embed_dim])

        encoder_layers = getattr(self, "layers", None)
        if encoder_layers is not None:
            for layer in encoder_layers:
                with tf.name_scope(layer.name):
                    layer.build(None)
@keras_serializable
class TFBlenderbotSmallDecoder(keras.layers.Layer):
config_class = BlenderbotSmallConfig
"""
Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`TFBlenderbotSmallDecoderLayer`]
Args:
config: BlenderbotSmallConfig
embed_tokens: output embedding
"""
# Initialize the decoder from the config and an optional token embedding layer.
def __init__(self, config: BlenderbotSmallConfig, embed_tokens: Optional[keras.layers.Embedding] = None, **kwargs):
super().__init__(**kwargs)
# Keep the config on the instance.
self.config = config
# Padding index is the pad token id from the config.
self.padding_idx = config.pad_token_id
# Token embedding layer supplied by the caller (may be None).
self.embed_tokens = embed_tokens
# Value of decoder_layerdrop from the config.
self.layerdrop = config.decoder_layerdrop
# Learned position embeddings sized by max_position_embeddings and d_model.
self.embed_positions = TFBlenderbotSmallLearnedPositionalEmbedding(
config.max_position_embeddings,
config.d_model,
name="embed_positions",
)
# Scale factor is sqrt(d_model) when config.scale_embedding is set, otherwise 1.0.
self.embed_scale = tf.math.sqrt(float(config.d_model)) if config.scale_embedding else 1.0
# One decoder layer per configured layer, named "layers.<index>".
self.layers = [TFBlenderbotSmallDecoderLayer(config, name=f"layers.{i}") for i in range(config.decoder_layers)]
# Layer normalization named "layernorm_embedding".
self.layernorm_embedding = keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding")
# Dropout layer using the rate from the config.
self.dropout = keras.layers.Dropout(config.dropout)
# Return the current token embedding layer.
def get_embed_tokens(self):
return self.embed_tokens
# Replace the token embedding layer.
def set_embed_tokens(self, embed_tokens):
self.embed_tokens = embed_tokens
# `unpack_inputs` decorates `call` (imported from modeling_tf_utils).
@unpack_inputs
# Forward pass of the decoder.
def call(
self,
input_ids=None,
inputs_embeds=None,
attention_mask=None,
position_ids=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
head_mask=None,
cross_attn_head_mask=None,
past_key_values=None,
use_cache=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
training=False,
):
# NOTE(review): the body of `call` is not present in this listing.
注释:
@keras_serializable
class TFBlenderbotSmallMainLayer(keras.layers.Layer):
    """Core encoder-decoder layer: one shared token embedding, an encoder and a decoder.

    The same `keras.layers.Embedding` instance (`self.shared`) is handed to both
    `TFBlenderbotSmallEncoder` and `TFBlenderbotSmallDecoder`.
    """

    # Configuration class associated with this layer.
    config_class = BlenderbotSmallConfig

    def __init__(self, config: BlenderbotSmallConfig, **kwargs):
        """Create the shared embedding, the encoder and the decoder from `config`."""
        super().__init__(**kwargs)
        self.config = config

        # Single embedding table (vocab_size x d_model) shared by encoder and decoder.
        self.shared = keras.layers.Embedding(
            input_dim=config.vocab_size,
            output_dim=config.d_model,
            embeddings_initializer=keras.initializers.TruncatedNormal(stddev=self.config.init_std),
            name="model.shared",
        )
        # Extra attribute naming the scope this layer is expected to live in (used when loading/storing weights).
        self.shared.load_weight_prefix = "model.shared"

        self.encoder = TFBlenderbotSmallEncoder(config, self.shared, name="encoder")
        self.decoder = TFBlenderbotSmallDecoder(config, self.shared, name="decoder")

    def get_input_embeddings(self):
        """Return the shared embedding layer."""
        return self.shared

    def set_input_embeddings(self, new_embeddings):
        """Replace the shared embedding and re-point the encoder's and decoder's `embed_tokens` at it."""
        self.shared = new_embeddings
        self.encoder.embed_tokens = self.shared
        self.decoder.embed_tokens = self.shared

    # `@unpack_inputs` was announced by the original comment on this method but was missing; it is restored so
    # that this `call` is wrapped the same way as the other `call` methods in this file.
    @unpack_inputs
    def call(
        self,
        input_ids=None,
        attention_mask=None,
        decoder_input_ids=None,
        decoder_attention_mask=None,
        decoder_position_ids=None,
        head_mask=None,
        decoder_head_mask=None,
        cross_attn_head_mask=None,
        encoder_outputs: Optional[Union[Tuple, TFBaseModelOutput]] = None,
        past_key_values=None,
        inputs_embeds=None,
        decoder_inputs_embeds=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        training=False,
        **kwargs,
    ):
        """Run the encoder (unless `encoder_outputs` is supplied) and then the decoder.

        Returns:
            When `return_dict` is falsy, the decoder outputs concatenated with the encoder outputs (`+`);
            otherwise a `TFSeq2SeqModelOutput` assembled from both.
        """
        # Fall back to the configuration default when the flag was not given.
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        if encoder_outputs is None:
            # No pre-computed encoder outputs: encode the inputs now.
            encoder_outputs = self.encoder(
                input_ids=input_ids,
                attention_mask=attention_mask,
                head_mask=head_mask,
                inputs_embeds=inputs_embeds,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
                training=training,
            )
        # The caller passed a tuple for encoder_outputs but wants return_dict=True: wrap it in a TFBaseModelOutput.
        elif return_dict and not isinstance(encoder_outputs, TFBaseModelOutput):
            encoder_outputs = TFBaseModelOutput(
                last_hidden_state=encoder_outputs[0],
                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
            )
        # The caller passed a non-tuple for encoder_outputs but wants return_dict=False: convert it to a tuple.
        elif not return_dict and not isinstance(encoder_outputs, tuple):
            encoder_outputs = encoder_outputs.to_tuple()

        # Decode, attending over the encoder's first output; the encoder attention mask is reused for
        # cross-attention.
        decoder_outputs = self.decoder(
            decoder_input_ids,
            attention_mask=decoder_attention_mask,
            position_ids=decoder_position_ids,
            encoder_hidden_states=encoder_outputs[0],
            encoder_attention_mask=attention_mask,
            head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            past_key_values=past_key_values,
            inputs_embeds=decoder_inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        if not return_dict:
            return decoder_outputs + encoder_outputs

        return TFSeq2SeqModelOutput(
            last_hidden_state=decoder_outputs.last_hidden_state,
            past_key_values=decoder_outputs.past_key_values,
            decoder_hidden_states=decoder_outputs.hidden_states,
            decoder_attentions=decoder_outputs.attentions,
            cross_attentions=decoder_outputs.cross_attentions,
            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
            encoder_hidden_states=encoder_outputs.hidden_states,
            encoder_attentions=encoder_outputs.attentions,
        )

    def build(self, input_shape=None):
        """Build the shared embedding, encoder and decoder once; later calls return immediately."""
        if self.built:
            return
        self.built = True
        # The shared/tied weights are expected to be in the model base namespace.
        # Adding "/" to the end (not the start!) of a tf.name_scope puts it in the root namespace rather than
        # the current one.
        with tf.name_scope(self.shared.load_weight_prefix + "/" + self.shared.name + "/"):
            self.shared.build(None)
        if getattr(self, "encoder", None) is not None:
            with tf.name_scope(self.encoder.name):
                self.encoder.build(None)
        if getattr(self, "decoder", None) is not None:
            with tf.name_scope(self.decoder.name):
                self.decoder.build(None)
# 为 TFBlenderbotSmallModel 类添加文档字符串,说明这是一个不带特定顶部头的原始隐藏状态输出的 BLENDERBOT_SMALL 模型。
@add_start_docstrings(
    "The bare BLENDERBOT_SMALL Model outputting raw hidden-states without any specific head on top.",
    BLENDERBOT_SMALL_START_DOCSTRING,
)
class TFBlenderbotSmallModel(TFBlenderbotSmallPreTrainedModel):
    # Thin wrapper that delegates all computation to a TFBlenderbotSmallMainLayer stored in `self.model`.

    def __init__(self, config: BlenderbotSmallConfig, *inputs, **kwargs):
        # Let the pretrained-model base class consume the config and any extra arguments first.
        super().__init__(config, *inputs, **kwargs)
        # The main layer holds the actual encoder/decoder stack.
        self.model = TFBlenderbotSmallMainLayer(config, name="model")

    def get_encoder(self):
        """Return the encoder of the wrapped main layer."""
        main_layer = self.model
        return main_layer.encoder

    def get_decoder(self):
        """Return the decoder of the wrapped main layer."""
        main_layer = self.model
        return main_layer.decoder

    # Forward pass: every argument is handed through unchanged to the main layer.
    @unpack_inputs
    @add_start_docstrings_to_model_forward(BLENDERBOT_SMALL_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TFSeq2SeqModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def call(
        self,
        input_ids: tf.Tensor | None = None,
        attention_mask: tf.Tensor | None = None,
        decoder_input_ids: tf.Tensor | None = None,
        decoder_attention_mask: tf.Tensor | None = None,
        decoder_position_ids: tf.Tensor | None = None,
        head_mask: tf.Tensor | None = None,
        decoder_head_mask: tf.Tensor | None = None,
        cross_attn_head_mask: tf.Tensor | None = None,
        encoder_outputs: Optional[Union[Tuple, TFBaseModelOutput]] = None,
        past_key_values: List[tf.Tensor] | None = None,
        inputs_embeds: tf.Tensor | None = None,
        decoder_inputs_embeds: tf.Tensor | None = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: Optional[bool] = False,
        **kwargs,
    ) -> Union[Tuple[tf.Tensor], TFSeq2SeqModelOutput]:
        # The main layer's result is returned as-is.
        return self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            decoder_position_ids=decoder_position_ids,
            head_mask=head_mask,
            decoder_head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            encoder_outputs=encoder_outputs,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            decoder_inputs_embeds=decoder_inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

    # Copied from transformers.models.bart.modeling_tf_bart.TFBartModel.serving_output
    def serving_output(self, output):
        """Rebuild `output` as a `TFSeq2SeqModelOutput`, keeping optional fields only if the config enables them.

        Each optional field is `None` unless its config flag (`use_cache`, `output_hidden_states`,
        `output_attentions`) is truthy; enabled hidden-state/attention fields go through
        `tf.convert_to_tensor`.
        """
        settings = self.config

        past = None
        if settings.use_cache:
            # Index 1 of the tuple produced by tf.tuple is kept.
            past = tf.tuple(output.past_key_values)[1]

        decoder_hidden = None
        if settings.output_hidden_states:
            decoder_hidden = tf.convert_to_tensor(output.decoder_hidden_states)

        decoder_attn = None
        if settings.output_attentions:
            decoder_attn = tf.convert_to_tensor(output.decoder_attentions)

        cross_attn = None
        if settings.output_attentions:
            cross_attn = tf.convert_to_tensor(output.cross_attentions)

        encoder_hidden = None
        if settings.output_hidden_states:
            encoder_hidden = tf.convert_to_tensor(output.encoder_hidden_states)

        encoder_attn = None
        if settings.output_attentions:
            encoder_attn = tf.convert_to_tensor(output.encoder_attentions)

        return TFSeq2SeqModelOutput(
            last_hidden_state=output.last_hidden_state,
            past_key_values=past,
            decoder_hidden_states=decoder_hidden,
            decoder_attentions=decoder_attn,
            cross_attentions=cross_attn,
            encoder_last_hidden_state=output.encoder_last_hidden_state,
            encoder_hidden_states=encoder_hidden,
            encoder_attentions=encoder_attn,
        )

    def build(self, input_shape=None):
        """Build the wrapped main layer under its own name scope; do nothing if already built."""
        if self.built:
            return
        self.built = True

        main_layer = getattr(self, "model", None)
        if main_layer is None:
            return
        with tf.name_scope(main_layer.name):
            main_layer.build(None)
# Copied from transformers.models.bart.modeling_tf_bart.BiasLayer
class BiasLayer(keras.layers.Layer):
    """
    A bias vector wrapped in its own layer.

    This exists for serialization: `keras.Model.save_weights` stores weights layer by layer, so every weight
    has to be owned by some layer.
    """

    def __init__(self, shape, initializer, trainable, name, **kwargs):
        super().__init__(name=name, **kwargs)
        # The variable name is NOT scoped when serialized: instead of
        # "outer_layer/inner_layer/.../name:0" it is stored simply as "name:0". Details:
        # https://github.com/huggingface/transformers/pull/18833#issuecomment-1233090214
        self.bias = self.add_weight(
            name=name,
            shape=shape,
            initializer=initializer,
            trainable=trainable,
        )

    def call(self, x):
        """Return `x` with the bias added."""
        biased = x + self.bias
        return biased
@add_start_docstrings(
    "The BLENDERBOT_SMALL Model with a language modeling head. Can be used for summarization.",
    BLENDERBOT_SMALL_START_DOCSTRING,
)
class TFBlenderbotSmallForConditionalGeneration(TFBlenderbotSmallPreTrainedModel, TFCausalLanguageModelingLoss):
    # Checkpoint keys that are ignored when they show up unexpectedly while loading.
    _keys_to_ignore_on_load_unexpected = [
        r"model.encoder.embed_tokens.weight",
        r"model.decoder.embed_tokens.weight",
    ]

    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        # Encoder-decoder body of the model.
        self.model = TFBlenderbotSmallMainLayer(config, name="model")
        self.use_cache = config.use_cache
        # Bias added to the final logits; created as a non-trainable, zero-initialised [1, vocab_size] weight.
        self.bias_layer = BiasLayer(
            name="final_logits_bias", shape=[1, config.vocab_size], initializer="zeros", trainable=False
        )

    def get_decoder(self):
        """Return the decoder of the wrapped main layer."""
        return self.model.decoder

    def get_encoder(self):
        """Return the encoder of the wrapped main layer."""
        return self.model.encoder

    def get_output_embeddings(self):
        """Return the output embeddings, which are the input embeddings."""
        return self.get_input_embeddings()

    def set_output_embeddings(self, value):
        """Set the output embeddings by setting the input embeddings."""
        self.set_input_embeddings(value)

    def get_bias(self):
        """Return a dict mapping "final_logits_bias" to the bias variable."""
        return {"final_logits_bias": self.bias_layer.bias}

    def set_bias(self, value):
        """Replace the bias layer with a new one sized from, and assigned, `value["final_logits_bias"]`."""
        # Replace the existing layer holding the bias so that it serializes/deserializes correctly.
        vocab_size = value["final_logits_bias"].shape[-1]
        self.bias_layer = BiasLayer(
            name="final_logits_bias", shape=[1, vocab_size], initializer="zeros", trainable=False
        )
        self.bias_layer.bias.assign(value["final_logits_bias"])

    # Forward pass. The `def call(self,` header was missing from this excerpt (the parameter list was dangling
    # directly under the decorators) and has been restored.
    @unpack_inputs
    @add_start_docstrings_to_model_forward(BLENDERBOT_SMALL_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=TFSeq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
    @add_end_docstrings(BLENDERBOT_SMALL_GENERATION_EXAMPLE)
    def call(
        self,
        # Token ids of the encoder input sequence.
        input_ids: tf.Tensor | None = None,
        # Mask telling the attention layers which encoder positions to ignore.
        attention_mask: tf.Tensor | None = None,
        # Token ids of the decoder input sequence.
        decoder_input_ids: tf.Tensor | None = None,
        # Mask telling the attention layers which decoder positions to ignore.
        decoder_attention_mask: tf.Tensor | None = None,
        # Position ids for the decoder.
        decoder_position_ids: tf.Tensor | None = None,
        # Mask selecting which attention heads to disable.
        head_mask: tf.Tensor | None = None,
        # Head mask for the decoder's attention.
        decoder_head_mask: tf.Tensor | None = None,
        # Head mask for the cross-attention.
        cross_attn_head_mask: tf.Tensor | None = None,
        # Optional pre-computed encoder outputs.
        encoder_outputs: Optional[TFBaseModelOutput] = None,
        # Past key/value states of the decoder.
        past_key_values: List[tf.Tensor] | None = None,
        # Embedded encoder inputs.
        inputs_embeds: tf.Tensor | None = None,
        # Embedded decoder inputs.
        decoder_inputs_embeds: tf.Tensor | None = None,
        # Whether to use the cache.
        use_cache: Optional[bool] = None,
        # Whether to return attention weights.
        output_attentions: Optional[bool] = None,
        # Whether to return hidden states.
        output_hidden_states: Optional[bool] = None,
        # Whether to return a structured output object instead of a tuple.
        return_dict: Optional[bool] = None,
        # Labels used to compute the loss.
        labels: tf.Tensor | None = None,
        # Whether the model runs in training mode (defaults to False).
        training: Optional[bool] = False,
    ) -> Union[Tuple[tf.Tensor], TFSeq2SeqLMOutput]:
        r"""
        labels (`tf.tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Returns:
            Returns a tuple containing either masked_lm_loss and model outputs or a TFSeq2SeqLMOutput object.
        """
        if labels is not None:
            # Replace pad_token_id with -100 in the labels, preserving their dtype.
            labels = tf.where(
                labels == self.config.pad_token_id,
                tf.cast(tf.fill(shape_list(labels), -100), labels.dtype),
                labels,
            )
            # Caching is switched off whenever labels are supplied. This (and the fallback below) only applies
            # in the labels branch: shift_tokens_right needs an actual labels tensor.
            use_cache = False
            if decoder_input_ids is None and decoder_inputs_embeds is None:
                # No decoder inputs given: derive them by shifting the labels right and prepending
                # decoder_start_token_id.
                decoder_input_ids = shift_tokens_right(
                    labels, self.config.pad_token_id, self.config.decoder_start_token_id
                )

        outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            decoder_position_ids=decoder_position_ids,
            head_mask=head_mask,
            decoder_head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            encoder_outputs=encoder_outputs,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            decoder_inputs_embeds=decoder_inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
        # Project the first model output with the (transposed) shared embedding weights, then add the bias.
        lm_logits = tf.matmul(outputs[0], self.model.shared.weights, transpose_b=True)
        lm_logits = self.bias_layer(lm_logits)
        # The loss is only computed when labels were provided.
        masked_lm_loss = None if labels is None else self.hf_compute_loss(labels, lm_logits)

        if not return_dict:
            # Tuple output: logits followed by the remaining model outputs, with the loss first if present.
            output = (lm_logits,) + outputs[1:]
            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

        return TFSeq2SeqLMOutput(
            loss=masked_lm_loss,
            logits=lm_logits,
            past_key_values=outputs.past_key_values,  # index 1 of d outputs
            decoder_hidden_states=outputs.decoder_hidden_states,  # index 2 of d outputs
            decoder_attentions=outputs.decoder_attentions,  # index 3 of d outputs
            cross_attentions=outputs.cross_attentions,  # index 4 of d outputs
            encoder_last_hidden_state=outputs.encoder_last_hidden_state,  # index 0 of encoder outputs
            encoder_hidden_states=outputs.encoder_hidden_states,  # index 1 of encoder outputs
            encoder_attentions=outputs.encoder_attentions,  # index 2 of encoder outputs
        )

    # Copied from transformers.models.bart.modeling_tf_bart.TFBartForConditionalGeneration.serving_output
    def serving_output(self, output):
        """Rebuild `output` as a `TFSeq2SeqLMOutput`, keeping optional fields only if the config enables them.

        Each optional field is `None` unless its config flag (`use_cache`, `output_hidden_states`,
        `output_attentions`) is truthy.
        """
        # Note: index [1] of the tuple produced by tf.tuple is kept (not the first element).
        pkv = tf.tuple(output.past_key_values)[1] if self.config.use_cache else None
        dec_hs = tf.convert_to_tensor(output.decoder_hidden_states) if self.config.output_hidden_states else None
        dec_attns = tf.convert_to_tensor(output.decoder_attentions) if self.config.output_attentions else None
        cross_attns = tf.convert_to_tensor(output.cross_attentions) if self.config.output_attentions else None
        enc_hs = tf.convert_to_tensor(output.encoder_hidden_states) if self.config.output_hidden_states else None
        enc_attns = tf.convert_to_tensor(output.encoder_attentions) if self.config.output_attentions else None

        return TFSeq2SeqLMOutput(
            logits=output.logits,
            past_key_values=pkv,
            decoder_hidden_states=dec_hs,
            decoder_attentions=dec_attns,
            cross_attentions=cross_attns,
            encoder_last_hidden_state=output.encoder_last_hidden_state,
            encoder_hidden_states=enc_hs,
            encoder_attentions=enc_attns,
        )

    def prepare_inputs_for_generation(
        self,
        decoder_input_ids,
        past_key_values=None,
        attention_mask=None,
        decoder_attention_mask=None,
        head_mask=None,
        decoder_head_mask=None,
        cross_attn_head_mask=None,
        use_cache=None,
        encoder_outputs=None,
        **kwargs,
    ):
        """Assemble the keyword arguments for one generation step and return them as a dict."""
        # With cached key/values, only the last decoder token is fed in.
        if past_key_values is not None:
            decoder_input_ids = decoder_input_ids[:, -1:]

        if decoder_attention_mask is not None:  # xla
            # Last column of the exclusive cumulative sum of the decoder attention mask.
            decoder_position_ids = tf.math.cumsum(decoder_attention_mask, axis=-1, exclusive=True)[:, -1:]
        elif past_key_values is not None:  # no xla + past_key_values
            # Size of axis 2 of the first cached tensor.
            decoder_position_ids = past_key_values[0][0].shape[2]
        else:  # no xla + no past_key_values
            decoder_position_ids = tf.range(decoder_input_ids.shape[1])

        return {
            "input_ids": None,  # encoder_outputs is defined. input_ids not needed
            "encoder_outputs": encoder_outputs,
            "past_key_values": past_key_values,
            "decoder_input_ids": decoder_input_ids,
            "attention_mask": attention_mask,
            "decoder_attention_mask": decoder_attention_mask,
            "decoder_position_ids": decoder_position_ids,
            "head_mask": head_mask,
            "decoder_head_mask": decoder_head_mask,
            "cross_attn_head_mask": cross_attn_head_mask,
            "use_cache": use_cache,  # change this to avoid caching (presumably for debugging)
        }

    def build(self, input_shape=None):
        """Build the main layer and the bias layer, each under its own name scope; no-op if already built."""
        if self.built:
            return
        self.built = True
        if getattr(self, "model", None) is not None:
            with tf.name_scope(self.model.name):
                self.model.build(None)
        if getattr(self, "bias_layer", None) is not None:
            with tf.name_scope(self.bias_layer.name):
                self.bias_layer.build(None)