Transformers 源码解析(六十八)
.\models\lxmert\modeling_tf_lxmert.py
""" TF 2.0 LXMERT model."""
from __future__ import annotations
import warnings
from dataclasses import dataclass
from typing import Dict, Optional, Tuple, Union
import numpy as np
import tensorflow as tf
from ...activations_tf import get_tf_activation
from ...modeling_tf_utils import (
TFModelInputType,
TFPreTrainedModel,
get_initializer,
keras,
keras_serializable,
shape_list,
unpack_inputs,
)
from ...tf_utils import check_embeddings_within_bounds, stable_softmax
from ...utils import (
ModelOutput,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_lxmert import LxmertConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "unc-nlp/lxmert-base-uncased"
_CONFIG_FOR_DOC = "LxmertConfig"
TF_LXMERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"unc-nlp/lxmert-base-uncased",
]
@dataclass
class TFLxmertModelOutput(ModelOutput):
"""
Lxmert's outputs that contain the last hidden states, pooled outputs, and attention probabilities for the language,
visual, and, cross-modality encoders. (note: the visual encoder in Lxmert is referred to as the "relation-ship"
encoder)
"""
Args:
language_output (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
语言编码器最后一层的隐藏状态序列。
vision_output (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
视觉编码器最后一层的隐藏状态序列。
pooled_output (`tf.Tensor` of shape `(batch_size, hidden_size)`):
序列第一个令牌(CLS令牌)的最后一层隐藏状态,经过线性层和Tanh激活函数进一步处理后的结果。
language_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
语言编码器每个交叉模态层的输入特征和输出的元组,形状为 `(batch_size, sequence_length, hidden_size)`。
vision_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
视觉编码器每个交叉模态层的输入特征和输出的元组,形状为 `(batch_size, sequence_length, hidden_size)`。
language_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
自注意力头中注意力softmax后的权重张量元组,形状为 `(batch_size, num_heads, sequence_length, sequence_length)`。
用于计算自注意力头中加权平均值。
vision_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
自注意力头中注意力softmax后的权重张量元组,形状为 `(batch_size, num_heads, sequence_length, sequence_length)`。
用于计算自注意力头中加权平均值。
cross_encoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
自注意力头中注意力softmax后的权重张量元组,形状为 `(batch_size, num_heads, sequence_length, sequence_length)`。
用于计算自注意力头中加权平均值。
@dataclass
class TFLxmertForPreTrainingOutput(ModelOutput):
"""
Output type of [`LxmertForPreTraining`].
Args:
loss (*optional*, returned when `labels` is provided, `tf.Tensor` of shape `(1,)`):
Total loss as the sum of the masked language modeling loss and the next sequence prediction
(classification) loss.
prediction_logits (`tf.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
cross_relationship_score (`tf.Tensor` of shape `(batch_size, 2)`):
Prediction scores of the textual matching objective (classification) head (scores of True/False
continuation before SoftMax).
question_answering_score (`tf.Tensor` of shape `(batch_size, n_qa_answers)`):
Prediction scores of question answering objective (classification).
language_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `tf.Tensor` (one for input features + one for the output of each cross-modality layer) of shape
`(batch_size, sequence_length, hidden_size)`.
vision_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `tf.Tensor` (one for input features + one for the output of each cross-modality layer) of shape
`(batch_size, sequence_length, hidden_size)`.
language_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
the self-attention heads.
vision_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
the self-attention heads.
cross_encoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
the self-attention heads.
"""
loss: tf.Tensor | None = None
prediction_logits: tf.Tensor | None = None
cross_relationship_score: tf.Tensor | None = None
question_answering_score: tf.Tensor | None = None
language_hidden_states: tuple[tf.Tensor] | None = None
vision_hidden_states: tuple[tf.Tensor] | None = None
language_attentions: tuple[tf.Tensor] | None = None
vision_attentions: tuple[tf.Tensor] | None = None
cross_encoder_attentions: tuple[tf.Tensor] | None = None
cross_relationship_score: tf.Tensor | None = None
question_answering_score: tf.Tensor | None = None
language_hidden_states: Tuple[tf.Tensor] | None = None
vision_hidden_states: Tuple[tf.Tensor] | None = None
language_attentions: Tuple[tf.Tensor] | None = None
vision_attentions: Tuple[tf.Tensor] | None = None
cross_encoder_attentions: Tuple[tf.Tensor] | None = None
class TFLxmertVisualFeatureEncoder(keras.layers.Layer):
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.visn_fc = keras.layers.Dense(
config.hidden_size,
kernel_initializer=get_initializer(config.initializer_range),
name="visn_fc",
)
self.visn_layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="visn_layer_norm")
self.box_fc = keras.layers.Dense(
config.hidden_size,
kernel_initializer=get_initializer(config.initializer_range),
name="box_fc",
)
self.box_layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="box_layer_norm")
self.dropout = keras.layers.Dropout(config.hidden_dropout_prob)
self.feat_dim = config.visual_feat_dim
self.pos_dim = config.visual_pos_dim
self.config = config
def call(self, visn_input, training=False):
feats, boxes = visn_input
x = self.visn_fc(feats)
x = self.visn_layer_norm(x)
y = self.box_fc(boxes)
y = self.box_layer_norm(y)
output = (x + y) / 2
output = self.dropout(output, training=training)
return output
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "visn_fc", None) is not None:
with tf.name_scope(self.visn_fc.name):
self.visn_fc.build([None, None, self.feat_dim])
if getattr(self, "visn_layer_norm", None) is not None:
with tf.name_scope(self.visn_layer_norm.name):
self.visn_layer_norm.build([None, None, self.config.hidden_size])
if getattr(self, "box_fc", None) is not None:
with tf.name_scope(self.box_fc.name):
self.box_fc.build([None, None, self.pos_dim])
if getattr(self, "box_layer_norm", None) is not None:
with tf.name_scope(self.box_layer_norm.name):
self.box_layer_norm.build([None, None, self.config.hidden_size])
class TFLxmertEmbeddings(keras.layers.Layer):
"""Construct the embeddings from word, position and token_type embeddings."""
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.config = config
self.hidden_size = config.hidden_size
self.max_position_embeddings = config.max_position_embeddings
self.initializer_range = config.initializer_range
self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
def build(self, input_shape=None):
self.weight = self.add_weight(
name="weight",
shape=[self.config.vocab_size, self.hidden_size],
initializer=get_initializer(initializer_range=self.initializer_range),
)
self.token_type_embeddings = self.add_weight(
name="embeddings",
shape=[self.config.type_vocab_size, self.hidden_size],
initializer=get_initializer(initializer_range=self.initializer_range),
)
self.position_embeddings = self.add_weight(
name="embeddings",
shape=[self.max_position_embeddings, self.hidden_size],
initializer=get_initializer(initializer_range=self.initializer_range),
)
if self.built:
return
self.built = True
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
def call(self, input_ids=None, token_type_ids=None, inputs_embeds=None, training=False):
"""
Applies embedding based on inputs tensor.
Returns:
final_embeddings (`tf.Tensor`): output embedding tensor.
"""
assert not (input_ids is None and inputs_embeds is None)
if input_ids is not None:
check_embeddings_within_bounds(input_ids, self.config.vocab_size)
inputs_embeds = tf.gather(params=self.weight, indices=input_ids)
input_shape = shape_list(inputs_embeds)[:-1]
if token_type_ids is None:
token_type_ids = tf.fill(dims=input_shape, value=0)
position_ids = tf.expand_dims(tf.range(start=0, limit=input_shape[-1]), axis=0)
position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids)
token_type_embeds = tf.gather(params=self.token_type_embeddings, indices=token_type_ids)
final_embeddings = inputs_embeds + position_embeds + token_type_embeds
final_embeddings = self.LayerNorm(inputs=final_embeddings)
final_embeddings = self.dropout(inputs=final_embeddings, training=training)
return final_embeddings
class TFLxmertAttention(keras.layers.Layer):
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
if config.hidden_size % config.num_attention_heads != 0:
raise ValueError(
f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
f"heads ({config.num_attention_heads}"
)
self.num_attention_heads = config.num_attention_heads
assert config.hidden_size % config.num_attention_heads == 0
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.query = keras.layers.Dense(
self.all_head_size,
kernel_initializer=get_initializer(config.initializer_range),
name="query",
)
self.key = keras.layers.Dense(
self.all_head_size,
kernel_initializer=get_initializer(config.initializer_range),
name="key",
)
self.value = keras.layers.Dense(
self.all_head_size,
kernel_initializer=get_initializer(config.initializer_range),
name="value",
)
self.dropout = keras.layers.Dropout(config.attention_probs_dropout_prob)
self.ctx_dim = config.hidden_size
self.config = config
def transpose_for_scores(self, x, batch_size):
x = tf.reshape(x, (batch_size, -1, self.num_attention_heads, self.attention_head_size))
return tf.transpose(x, perm=[0, 2, 1, 3])
def call(self, hidden_states, context, attention_mask, output_attentions, training=False):
batch_size = shape_list(hidden_states)[0]
mixed_query_layer = self.query(hidden_states)
mixed_key_layer = self.key(context)
mixed_value_layer = self.value(context)
query_layer = self.transpose_for_scores(mixed_query_layer, batch_size)
key_layer = self.transpose_for_scores(mixed_key_layer, batch_size)
value_layer = self.transpose_for_scores(mixed_value_layer, batch_size)
attention_scores = tf.matmul(
query_layer, key_layer, transpose_b=True
)
dk = tf.cast(shape_list(key_layer)[-1], dtype=attention_scores.dtype)
attention_scores = attention_scores / tf.math.sqrt(dk)
if attention_mask is not None:
attention_mask = tf.cast(attention_mask, dtype=attention_scores.dtype)
attention_scores = attention_scores + attention_mask
attention_probs = stable_softmax(attention_scores, axis=-1)
attention_probs = self.dropout(attention_probs, training=training)
context_layer = tf.matmul(attention_probs, value_layer)
context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3])
context_layer = tf.reshape(
context_layer, (batch_size, -1, self.all_head_size)
)
outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "query", None) is not None:
with tf.name_scope(self.query.name):
self.query.build([None, None, self.config.hidden_size])
if getattr(self, "key", None) is not None:
with tf.name_scope(self.key.name):
self.key.build([None, None, self.ctx_dim])
if getattr(self, "value", None) is not None:
with tf.name_scope(self.value.name):
self.value.build([None, None, self.ctx_dim])
class TFLxmertIntermediate(keras.layers.Layer):
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.dense = keras.layers.Dense(
config.intermediate_size,
kernel_initializer=get_initializer(config.initializer_range),
name="dense",
)
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = get_tf_activation(config.hidden_act)
else:
self.intermediate_act_fn = config.hidden_act
self.config = config
def call(self, hidden_states):
hidden_states = self.dense(hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
class TFLxmertOutput(keras.layers.Layer):
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.dense = keras.layers.Dense(
config.hidden_size,
kernel_initializer=get_initializer(config.initializer_range),
name="dense",
)
self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = keras.layers.Dropout(config.hidden_dropout_prob)
self.config = config
def call(self, hidden_states, input_tensor, training=False):
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states, training)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.intermediate_size])
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
class TFLxmertAttentionOutput(keras.layers.Layer):
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.dense = keras.layers.Dense(
config.hidden_size,
kernel_initializer=get_initializer(config.initializer_range),
name="dense",
)
self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = keras.layers.Dropout(config.hidden_dropout_prob)
self.config = config
def call(self, hidden_states, input_tensor, training=False):
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states, training=training)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
class TFLxmertSelfAttentionLayer(keras.layers.Layer):
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.self = TFLxmertAttention(config, name="self")
self.attention_output = TFLxmertAttentionOutput(config, name="output")
def call(self, input_tensor, attention_mask, output_attentions, training=False):
self_output = self.self(input_tensor, input_tensor, attention_mask, output_attentions)
if output_attentions:
attention_probs = self_output[1]
attention_output = self.attention_output(self_output[0], input_tensor)
return (attention_output, attention_probs) if output_attentions else (attention_output,)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "self", None) is not None:
with tf.name_scope(self.self.name):
self.self.build(None)
if getattr(self, "attention_output", None) is not None:
with tf.name_scope(self.attention_output.name):
self.attention_output.build(None)
class TFLxmertCrossAttentionLayer(keras.layers.Layer):
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.att = TFLxmertAttention(config, name="att")
self.attention_output = TFLxmertAttentionOutput(config, name="output")
def call(
self,
input_tensor,
ctx_tensor,
ctx_att_mask,
output_attentions=False,
training=False,
):
output = self.att(input_tensor, ctx_tensor, ctx_att_mask, output_attentions, training=training)
if output_attentions:
attention_probs = output[1]
attention_output = self.attention_output(output[0], input_tensor, training=training)
outputs = (attention_output, attention_probs) if output_attentions else (attention_output,)
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "att", None) is not None:
with tf.name_scope(self.att.name):
self.att.build(None)
if getattr(self, "attention_output", None) is not None:
with tf.name_scope(self.attention_output.name):
self.attention_output.build(None)
class TFLxmertLayer(keras.layers.Layer):
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.attention = TFLxmertSelfAttentionLayer(config, name="attention")
self.intermediate = TFLxmertIntermediate(config, name="intermediate")
self.transformer_output = TFLxmertOutput(config, name="output")
def call(self, hidden_states, attention_mask, output_attentions, training=False):
attention_outputs = self.attention(hidden_states, attention_mask, output_attentions, training=training)
attention_output = attention_outputs[0]
intermediate_output = self.intermediate(attention_output)
layer_output = self.transformer_output(intermediate_output, attention_output, training=training)
outputs = (layer_output,) + attention_outputs[1:]
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "attention", None) is not None:
with tf.name_scope(self.attention.name):
self.attention.build(None)
if getattr(self, "intermediate", None) is not None:
with tf.name_scope(self.intermediate.name):
self.intermediate.build(None)
if getattr(self, "transformer_output", None) is not None:
with tf.name_scope(self.transformer_output.name):
self.transformer_output.build(None)
class TFLxmertXLayer(keras.layers.Layer):
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.visual_attention = TFLxmertCrossAttentionLayer(config, name="visual_attention")
self.lang_self_att = TFLxmertSelfAttentionLayer(config, name="lang_self_att")
self.visn_self_att = TFLxmertSelfAttentionLayer(config, name="visn_self_att")
self.lang_inter = TFLxmertIntermediate(config, name="lang_inter")
self.lang_output = TFLxmertOutput(config, name="lang_output")
self.visn_inter = TFLxmertIntermediate(config, name="visn_inter")
self.visn_output = TFLxmertOutput(config, name="visn_output")
def cross_att(
self,
lang_input,
lang_attention_mask,
visn_input,
visn_attention_mask,
output_attentions,
training=False,
):
lang_attention_lang_input = tf.identity(lang_input)
visn_attention_lang_input = tf.identity(lang_input)
lang_attention_visn_input = tf.identity(visn_input)
visn_attention_visn_input = tf.identity(visn_input)
lang_att_output = self.visual_attention(
lang_attention_lang_input,
lang_attention_visn_input,
visn_attention_mask,
output_attentions=output_attentions,
training=training,
)
visn_att_output = self.visual_attention(
visn_attention_visn_input,
visn_attention_lang_input,
lang_attention_mask,
output_attentions=output_attentions,
training=training,
)
return lang_att_output, visn_att_output
def self_att(
self,
lang_input,
lang_attention_mask,
visn_input,
visn_attention_mask,
training=False,
):
output_attentions = False
lang_att_output = self.lang_self_att(lang_input, lang_attention_mask, output_attentions, training=training)
visn_att_output = self.visn_self_att(visn_input, visn_attention_mask, output_attentions, training=training)
return lang_att_output[0], visn_att_output[0]
def output_fc(self, lang_input, visn_input, training=False):
lang_inter_output = self.lang_inter(lang_input)
visn_inter_output = self.visn_inter(visn_input)
lang_output = self.lang_output(lang_inter_output, lang_input, training)
visn_output = self.visn_output(visn_inter_output, visn_input, training)
return lang_output, visn_output
def call(
self,
lang_feats,
lang_attention_mask,
visn_feats,
visn_attention_mask,
output_attentions,
training=False,
):
return self.cross_att(
lang_feats,
lang_attention_mask,
visn_feats,
visn_attention_mask,
output_attentions,
training=training,
)
):
lang_att_output = lang_feats
visn_att_output = visn_feats
lang_att_output, visn_att_output = self.cross_att(
lang_att_output,
lang_attention_mask,
visn_att_output,
visn_attention_mask,
output_attentions,
training=training,
)
attention_probs = lang_att_output[1:]
lang_att_output, visn_att_output = self.self_att(
lang_att_output[0],
lang_attention_mask,
visn_att_output[0],
visn_attention_mask,
training=training,
)
lang_output, visn_output = self.output_fc(lang_att_output, visn_att_output, training=training)
return (lang_output, visn_output, attention_probs[0]) if output_attentions else (lang_output, visn_output)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "visual_attention", None) is not None:
with tf.name_scope(self.visual_attention.name):
self.visual_attention.build(None)
if getattr(self, "lang_self_att", None) is not None:
with tf.name_scope(self.lang_self_att.name):
self.lang_self_att.build(None)
if getattr(self, "visn_self_att", None) is not None:
with tf.name_scope(self.visn_self_att.name):
self.visn_self_att.build(None)
if getattr(self, "lang_inter", None) is not None:
with tf.name_scope(self.lang_inter.name):
self.lang_inter.build(None)
if getattr(self, "lang_output", None) is not None:
with tf.name_scope(self.lang_output.name):
self.lang_output.build(None)
if getattr(self, "visn_inter", None) is not None:
with tf.name_scope(self.visn_inter.name):
self.visn_inter.build(None)
if getattr(self, "visn_output", None) is not None:
with tf.name_scope(self.visn_output.name):
self.visn_output.build(None)
class TFLxmertEncoder(keras.layers.Layer):
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.visn_fc = TFLxmertVisualFeatureEncoder(config, name="visn_fc")
self.num_l_layers = config.l_layers
self.num_x_layers = config.x_layers
self.num_r_layers = config.r_layers
self.layer = [TFLxmertLayer(config, name=f"layer_._{i}") for i in range(self.num_l_layers)]
self.x_layers = [TFLxmertXLayer(config, name=f"x_layers_._{i}") for i in range(self.num_x_layers)]
self.r_layers = [TFLxmertLayer(config, name=f"r_layers_._{i}") for i in range(self.num_r_layers)]
self.config = config
def call(
self,
lang_feats=None,
lang_attention_mask=None,
visual_feats=None,
visual_pos=None,
visual_attention_mask=None,
output_attentions=None,
training=False,
**kwargs
):
):
vision_hidden_states = ()
language_hidden_states = ()
vision_attentions = () if output_attentions or self.config.output_attentions else None
language_attentions = () if output_attentions or self.config.output_attentions else None
cross_encoder_attentions = () if output_attentions or self.config.output_attentions else None
visual_feats = self.visn_fc([visual_feats, visual_pos], training=training)
for layer_module in self.layer:
l_outputs = layer_module(lang_feats, lang_attention_mask, output_attentions, training=training)
lang_feats = l_outputs[0]
language_hidden_states = language_hidden_states + (lang_feats,)
if language_attentions is not None:
language_attentions = language_attentions + (l_outputs[1],)
for layer_module in self.r_layers:
v_outputs = layer_module(
visual_feats,
visual_attention_mask,
output_attentions,
training=training,
)
visual_feats = v_outputs[0]
vision_hidden_states = vision_hidden_states + (visual_feats,)
if vision_attentions is not None:
vision_attentions = vision_attentions + (v_outputs[1],)
for layer_module in self.x_layers:
x_outputs = layer_module(
lang_feats,
lang_attention_mask,
visual_feats,
visual_attention_mask,
output_attentions,
training=training,
)
lang_feats, visual_feats = x_outputs[:2]
vision_hidden_states = vision_hidden_states + (visual_feats,)
language_hidden_states = language_hidden_states + (lang_feats,)
if cross_encoder_attentions is not None:
cross_encoder_attentions = cross_encoder_attentions + (x_outputs[2],)
visual_encoder_outputs = (
vision_hidden_states,
vision_attentions if output_attentions else None,
)
lang_encoder_outputs = (
language_hidden_states,
language_attentions if output_attentions else None,
)
return (
visual_encoder_outputs,
lang_encoder_outputs,
cross_encoder_attentions if output_attentions else None,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "visn_fc", None) is not None:
with tf.name_scope(self.visn_fc.name):
self.visn_fc.build(None)
if getattr(self, "layer", None) is not None:
for layer in self.layer:
with tf.name_scope(layer.name):
layer.build(None)
if getattr(self, "x_layers", None) is not None:
for layer in self.x_layers:
with tf.name_scope(layer.name):
layer.build(None)
if getattr(self, "r_layers", None) is not None:
for layer in self.r_layers:
with tf.name_scope(layer.name):
layer.build(None)
@keras_serializable
class TFLxmertMainLayer(keras.layers.Layer):
config_class = LxmertConfig
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.config = config
self.num_l_layers = config.l_layers
self.num_x_layers = config.x_layers
self.num_r_layers = config.r_layers
self.initializer_range = config.initializer_range
self.output_attentions = config.output_attentions
self.output_hidden_states = config.output_hidden_states
self.return_dict = config.use_return_dict
self.embeddings = TFLxmertEmbeddings(config, name="embeddings")
self.encoder = TFLxmertEncoder(config, name="encoder")
self.pooler = TFLxmertPooler(config, name="pooler")
self.config = config
def get_input_embeddings(self):
return self.embeddings
def set_input_embeddings(self, value):
self.embeddings.weight = value
self.embeddings.vocab_size = shape_list(value)[0]
def _prune_heads(self, heads_to_prune):
raise NotImplementedError
@unpack_inputs
def call(
self,
input_ids=None,
visual_feats=None,
visual_pos=None,
attention_mask=None,
visual_attention_mask=None,
token_type_ids=None,
inputs_embeds=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
training=False,
):
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embeddings", None) is not None:
with tf.name_scope(self.embeddings.name):
self.embeddings.build(None)
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
if getattr(self, "pooler", None) is not None:
with tf.name_scope(self.pooler.name):
self.pooler.build(None)
class TFLxmertPreTrainedModel(TFPreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = LxmertConfig
base_model_prefix = "lxmert"
@property
def dummy_inputs(self):
"""
Dummy inputs to build the network.
Returns:
tf.Tensor with dummy inputs
"""
batch_size = 2
num_visual_features = 10
input_ids = tf.constant([[3, 5, 6], [2, 3, 4]], dtype=tf.int32)
visual_feats = tf.random.uniform((batch_size, num_visual_features, self.config.visual_feat_dim))
visual_pos = tf.random.uniform((batch_size, num_visual_features, 4))
return {
"input_ids": input_ids,
"visual_feats": visual_feats,
"visual_pos": visual_pos,
}
@property
这段代码定义了一个自定义的 Keras 层 `TFLxmertMainLayer` 和一个抽象类 `TFLxmertPreTrainedModel`,分别用于处理 LXMERT 模型的主要层和预训练模型的初始化和虚拟输入数据。
def input_signature(self):
return {
"input_ids": tf.TensorSpec((None, None), tf.int32, name="input_ids"),
"attention_mask": tf.TensorSpec((None, None), tf.int32, name="attention_mask"),
"visual_feats": tf.TensorSpec((None, None, self.config.visual_feat_dim), tf.float32, name="visual_feats"),
"visual_pos": tf.TensorSpec((None, None, 4), tf.float32, name="visual_pos"),
"visual_attention_mask": tf.TensorSpec((None, None), tf.int32, name="visual_attention_mask"),
"token_type_ids": tf.TensorSpec((None, None), tf.int32, name="token_type_ids"),
}
LXMERT_START_DOCSTRING = r"""
The LXMERT model was proposed in [LXMERT: Learning Cross-Modality Encoder Representations from
Transformers](https://arxiv.org/abs/1908.07490) by Hao Tan and Mohit Bansal. It's a vision and language transformer
model, pre-trained on a variety of multi-modal datasets comprising of GQA, VQAv2.0, MCSCOCO captions, and Visual
genome, using a combination of masked language modeling, region of interest feature regression, cross entropy loss
for question answering attribute prediction, and object tag prediction.
This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
behavior.
<Tip>
TensorFlow models and layers in `transformers` accept two formats as input:
- having all inputs as keyword arguments (like PyTorch models), or
- having all inputs as a list, tuple or dict in the first positional argument.
The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
positional argument:
- a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
- a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
`model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
- a dictionary with one or several input Tensors associated to the input names given in the docstring:
`model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
Note that when creating models and layers with
[subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
about any of this, as you can just pass inputs like you would to any other Python function!
</Tip>
Parameters:
config ([`LxmertConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
LXMERT_INPUTS_DOCSTRING = r"""
"""
@add_start_docstrings(
"The bare Lxmert Model transformer outputting raw hidden-states without any specific head on top.",
LXMERT_START_DOCSTRING,
)
class TFLxmertModel(TFLxmertPreTrainedModel):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.lxmert = TFLxmertMainLayer(config, name="lxmert")
def call(
self,
input_ids: TFModelInputType | None = None,
visual_feats: tf.Tensor | None = None,
visual_pos: tf.Tensor | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
visual_attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: bool = False,
) -> Union[Tuple, TFLxmertModelOutput]:
outputs = self.lxmert(
input_ids,
visual_feats,
visual_pos,
attention_mask,
visual_attention_mask,
token_type_ids,
inputs_embeds,
output_attentions,
output_hidden_states,
return_dict,
training,
)
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "lxmert", None) is not None:
with tf.name_scope(self.lxmert.name):
self.lxmert.build(None)
class TFLxmertPooler(keras.layers.Layer):
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.dense = keras.layers.Dense(
config.hidden_size,
kernel_initializer=get_initializer(config.initializer_range),
activation="tanh",
name="dense",
)
self.config = config
def call(self, hidden_states):
first_token_tensor = hidden_states[:, 0]
pooled_output = self.dense(first_token_tensor)
return pooled_output
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
class TFLxmertPredictionHeadTransform(keras.layers.Layer):
def __init__(self, config: LxmertConfig, **kwargs):
super().__init__(**kwargs)
self.dense = keras.layers.Dense(
units=config.hidden_size,
kernel_initializer=get_initializer(config.initializer_range),
name="dense",
)
if isinstance(config.hidden_act, str):
self.transform_act_fn = get_tf_activation(config.hidden_act)
else:
self.transform_act_fn = config.hidden_act
self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
hidden_states = self.transform_act_fn(hidden_states)
hidden_states = self.LayerNorm(inputs=hidden_states)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
class TFLxmertLMPredictionHead(keras.layers.Layer):
def __init__(self, config: LxmertConfig, input_embeddings: keras.layers.Layer, **kwargs):
super().__init__(**kwargs)
self.config = config
self.hidden_size = config.hidden_size
self.transform = TFLxmertPredictionHeadTransform(config, name="transform")
self.input_embeddings = input_embeddings
def build(self, input_shape=None):
self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias")
if self.built:
return
self.built = True
if getattr(self, "transform", None) is not None:
with tf.name_scope(self.transform.name):
self.transform.build(None)
def get_output_embeddings(self) -> keras.layers.Layer:
return self.input_embeddings
def set_output_embeddings(self, value: tf.Variable):
self.input_embeddings.weight = value
self.input_embeddings.vocab_size = shape_list(value)[0]
def get_bias(self) -> Dict[str, tf.Variable]:
return {"bias": self.bias}
def set_bias(self, value: tf.Variable):
self.bias = value["bias"]
self.config.vocab_size = shape_list(value["bias"])[0]
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
hidden_states = self.transform(hidden_states=hidden_states)
seq_length = shape_list(hidden_states)[1]
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.hidden_size])
hidden_states = tf.matmul(a=hidden_states, b=self.input_embeddings.weight, transpose_b=True)
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.config.vocab_size])
hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias)
return hidden_states
class TFLxmertMLMHead(keras.layers.Layer):
def __init__(self, config: LxmertConfig, input_embeddings: keras.layers.Layer, **kwargs):
super().__init__(**kwargs)
self.predictions = TFLxmertLMPredictionHead(config, input_embeddings, name="predictions")
def call(self, sequence_output: tf.Tensor) -> tf.Tensor:
prediction_scores = self.predictions(hidden_states=sequence_output)
return prediction_scores
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "predictions", None) is not None:
with tf.name_scope(self.predictions.name):
self.predictions.build(None)
class TFLxmertPreTrainingHeads(keras.layers.Layer):
def __init__(self, config, input_embeddings, **kwargs):
super().__init__(**kwargs)
self.predictions = TFLxmertLMPredictionHead(config, input_embeddings, name="predictions")
self.seq_relationship = keras.layers.Dense(
2,
kernel_initializer=get_initializer(config.initializer_range),
name="seq_relationship",
)
self.config = config
def call(self, sequence_output, pooled_output):
prediction_scores = self.predictions(sequence_output)
seq_relationship_score = self.seq_relationship(pooled_output)
return prediction_scores, seq_relationship_score
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "predictions", None) is not None:
with tf.name_scope(self.predictions.name):
self.predictions.build(None)
if getattr(self, "seq_relationship", None) is not None:
with tf.name_scope(self.seq_relationship.name):
self.seq_relationship.build([None, None, self.config.hidden_size])
class TFLxmertVisualAnswerHead(keras.layers.Layer):
def __init__(self, config, num_labels, **kwargs):
super().__init__(**kwargs)
hid_dim = config.hidden_size
self.dense = keras.layers.Dense(
hid_dim * 2,
kernel_initializer=get_initializer(config.initializer_range),
name="logit_fc_._0",
)
self.activation = get_tf_activation("gelu")
self.layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="logit_fc_._2")
self.dense_1 = keras.layers.Dense(
num_labels,
kernel_initializer=get_initializer(config.initializer_range),
name="logit_fc_._3",
)
self.hid_dim = hid_dim
def call(self, hidden_states):
hidden_states = self.dense(hidden_states)
hidden_states = self.activation(hidden_states)
hidden_states = self.layer_norm(hidden_states)
hidden_states = self.dense_1(hidden_states)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.hid_dim])
if getattr(self, "layer_norm", None) is not None:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build([None, self.hid_dim * 2])
if getattr(self, "dense_1", None) is not None:
with tf.name_scope(self.dense_1.name):
self.dense_1.build([None, None, self.hid_dim * 2])
class TFLxmertVisualObjHead(keras.layers.Layer):
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.transform = TFLxmertPredictionHeadTransform(config, name="transform")
visual_losses = {}
if config.visual_obj_loss:
visual_losses["obj"] = {"shape": (-1,), "num": config.num_object_labels}
if config.visual_attr_loss:
visual_losses["attr"] = {"shape": (-1,), "num": config.num_attr_labels}
if config.visual_feat_loss:
visual_losses["feat"] = {"shape": (-1, 2048), "num": config.visual_feat_dim}
self.visual_losses = visual_losses
self.decoder_dict = {
key: keras.layers.Dense(
self.visual_losses[key]["num"],
kernel_initializer=get_initializer(config.initializer_range),
name=f"decoder_dict.{key}",
)
for key in self.visual_losses
}
self.config = config
def call(self, hidden_states):
hidden_states = self.transform(hidden_states)
output = {}
for key in self.visual_losses:
output[key] = self.decoder_dict[key](hidden_states)
return output
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "transform", None) is not None:
with tf.name_scope(self.transform.name):
self.transform.build(None)
if getattr(self, "decoder_dict", None) is not None:
for layer in self.decoder_dict.values():
with tf.name_scope(layer.name):
layer.build([None, None, self.config.hidden_size])
@add_start_docstrings("""Lxmert Model with a `language modeling` head on top.""", LXMERT_START_DOCSTRING)
class TFLxmertForPreTraining(TFLxmertPreTrainedModel):
pass
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.config = config
self.num_qa_labels = config.num_qa_labels
self.visual_loss_normalizer = config.visual_loss_normalizer
self.task_mask_lm = config.task_mask_lm
self.task_obj_predict = config.task_obj_predict
self.task_matched = config.task_matched
self.task_qa = config.task_qa
self.lxmert = TFLxmertMainLayer(config, name="lxmert")
self.cls = TFLxmertPreTrainingHeads(config, self.lxmert.embeddings, name="cls")
if self.task_obj_predict:
self.obj_predict_head = TFLxmertVisualObjHead(config, name="obj_predict_head")
if self.task_qa:
self.answer_head = TFLxmertVisualAnswerHead(config, self.num_qa_labels, name="answer_head")
self.loss_fcts = {
"l2": keras.losses.Huber(delta=1.0, name="huber_loss"),
"visn_ce": keras.losses.SparseCategoricalCrossentropy(from_logits=True),
"ce": keras.losses.SparseCategoricalCrossentropy(from_logits=True),
}
visual_losses = {}
if config.visual_obj_loss:
visual_losses["obj"] = {
"shape": (-1,),
"num": config.num_object_labels,
"loss": "visn_ce",
}
if config.visual_attr_loss:
visual_losses["attr"] = {
"shape": (-1,),
"num": config.num_attr_labels,
"loss": "visn_ce",
}
if config.visual_feat_loss:
visual_losses["feat"] = {
"shape": (-1, config.visual_feat_dim),
"num": config.visual_feat_dim,
"loss": "l2",
}
self.visual_losses = visual_losses
@unpack_inputs
@add_start_docstrings_to_model_forward(LXMERT_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=TFLxmertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
def call(
self,
input_ids: TFModelInputType | None = None,
visual_feats: tf.Tensor | None = None,
visual_pos: tf.Tensor | None = None,
attention_mask: tf.Tensor | None = None,
visual_attention_mask: tf.Tensor | None = None,
token_type_ids: tf.Tensor | None = None,
inputs_embeds: tf.Tensor | None = None,
masked_lm_labels: tf.Tensor | None = None,
obj_labels: Dict[str, Tuple[tf.Tensor, tf.Tensor]] | None = None,
matched_label: tf.Tensor | None = None,
ans: tf.Tensor | None = None,
output_attentions: bool | None = None,
output_hidden_states: bool | None = None,
return_dict: bool | None = None,
training: bool = False,
这段代码定义了一个 `call` 方法,用于执行模型的前向传播操作。方法中使用了装饰器来增强其功能和修改返回文档。
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "lxmert", None) is not None:
with tf.name_scope(self.lxmert.name):
self.lxmert.build(None)
if getattr(self, "cls", None) is not None:
with tf.name_scope(self.cls.name):
self.cls.build(None)
if getattr(self, "obj_predict_head", None) is not None:
with tf.name_scope(self.obj_predict_head.name):
self.obj_predict_head.build(None)
if getattr(self, "answer_head", None) is not None:
with tf.name_scope(self.answer_head.name):
self.answer_head.build(None)
.\models\lxmert\tokenization_lxmert.py
import collections
import os
import unicodedata
from typing import List, Optional, Tuple
from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
from ...utils import logging
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"unc-nlp/lxmert-base-uncased": "https://huggingface.co/unc-nlp/lxmert-base-uncased/resolve/main/vocab.txt",
}
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"unc-nlp/lxmert-base-uncased": 512,
}
PRETRAINED_INIT_CONFIGURATION = {
"unc-nlp/lxmert-base-uncased": {"do_lower_case": True},
}
def load_vocab(vocab_file):
"""Loads a vocabulary file into a dictionary."""
vocab = collections.OrderedDict()
with open(vocab_file, "r", encoding="utf-8") as reader:
tokens = reader.readlines()
for index, token in enumerate(tokens):
token = token.rstrip("\n")
vocab[token] = index
return vocab
def whitespace_tokenize(text):
"""Runs basic whitespace cleaning and splitting on a piece of text."""
text = text.strip()
if not text:
return []
tokens = text.split()
return tokens
class LxmertTokenizer(PreTrainedTokenizer):
r"""
Construct a Lxmert tokenizer. Based on WordPiece.
This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
this superclass for more information regarding those methods.
"""
pass
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
def __init__(
self,
vocab_file,
do_lower_case=True,
do_basic_tokenize=True,
never_split=None,
unk_token="[UNK]",
sep_token="[SEP]",
pad_token="[PAD]",
cls_token="[CLS]",
mask_token="[MASK]",
tokenize_chinese_chars=True,
strip_accents=None,
**kwargs,
):
if not os.path.isfile(vocab_file):
raise ValueError(
f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
" model use `tokenizer = LxmertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
)
self.vocab = load_vocab(vocab_file)
self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
self.do_basic_tokenize = do_basic_tokenize
if do_basic_tokenize:
self.basic_tokenizer = BasicTokenizer(
do_lower_case=do_lower_case,
never_split=never_split,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
)
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))
super().__init__(
do_lower_case=do_lower_case,
do_basic_tokenize=do_basic_tokenize,
never_split=never_split,
unk_token=unk_token,
sep_token=sep_token,
pad_token=pad_token,
cls_token=cls_token,
mask_token=mask_token,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
**kwargs,
)
@property
def do_lower_case(self):
return self.basic_tokenizer.do_lower_case
@property
def vocab_size(self):
return len(self.vocab)
def get_vocab(self):
return dict(self.vocab, **self.added_tokens_encoder)
def _tokenize(self, text, split_special_tokens=False):
split_tokens = []
if self.do_basic_tokenize:
for token in self.basic_tokenizer.tokenize(
text, never_split=self.all_special_tokens if not split_special_tokens else None
):
if token in self.basic_tokenizer.never_split:
split_tokens.append(token)
else:
split_tokens += self.wordpiece_tokenizer.tokenize(token)
else:
split_tokens = self.wordpiece_tokenizer.tokenize(text)
return split_tokens
def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
return self.vocab.get(token, self.vocab.get(self.unk_token))
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
return self.ids_to_tokens.get(index, self.unk_token)
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (string) in a single string."""
out_string = " ".join(tokens).replace(" ##", "").strip()
return out_string
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. A Lxmert sequence has the following format:
- single sequence: `[CLS] X [SEP]`
- pair of sequences: `[CLS] A [SEP] B [SEP]`
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
if token_ids_1 is None:
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
cls = [self.cls_token_id]
sep = [self.sep_token_id]
return cls + token_ids_0 + sep + token_ids_1 + sep
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` method.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
return super().get_special_tokens_mask(
token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
)
if token_ids_1 is not None:
return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
return [1] + ([0] * len(token_ids_0)) + [1]
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create token type IDs from token lists representing sequences.
Args:
token_ids_0 (`List[int]`):
List of IDs representing the first sequence.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: A list of token type IDs where each ID corresponds to the segment ID of a token.
"""
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. A Lxmert sequence
pair mask has the following format:
```
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | second sequence |
```
If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
index = 0
if os.path.isdir(save_directory):
vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
else:
vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory
with open(vocab_file, "w", encoding="utf-8") as writer:
for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
if index != token_index:
logger.warning(
f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
" Please check that the vocabulary is not corrupted!"
)
index = token_index
writer.write(token + "\n")
index += 1
return (vocab_file,)
class BasicTokenizer(object):
"""
Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.).
Args:
do_lower_case (`bool`, *optional*, defaults to `True`):
Whether or not to lowercase the input when tokenizing.
never_split (`Iterable`, *optional*):
Collection of tokens which will never be split during tokenization. Only has an effect when
`do_basic_tokenize=True`
tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
Whether or not to tokenize Chinese characters.
This should likely be deactivated for Japanese (see this
[issue](https://github.com/huggingface/transformers/issues/328)).
strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original BERT).
do_split_on_punc (`bool`, *optional*, defaults to `True`):
In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
the full context of the words, such as contractions.
"""
def __init__(
self,
do_lower_case=True,
never_split=None,
tokenize_chinese_chars=True,
strip_accents=None,
do_split_on_punc=True,
):
"""
初始化函数,设置基本分词器的参数。
Args:
do_lower_case (bool, optional): 是否在分词时将输入转换为小写,默认为 True。
never_split (Iterable, optional): 在分词过程中永远不会被分割的标记集合,默认为 None。
tokenize_chinese_chars (bool, optional): 是否分词中文字符,默认为 True。
对于日语可能需要禁用此选项(参见相关问题)。
strip_accents (bool, optional): 是否去除所有重音符号。如果未指定,则由 lowercase 的值决定。
do_split_on_punc (bool, optional): 是否进行基本的标点符号分割,默认为 True。
在某些情况下,我们希望跳过基本的标点符号分割,以便后续的分词可以捕捉词语的完整上下文,例如缩略词。
"""
if never_split is None:
never_split = []
self.do_lower_case = do_lower_case
self.never_split = set(never_split)
self.tokenize_chinese_chars = tokenize_chinese_chars
self.strip_accents = strip_accents
self.do_split_on_punc = do_split_on_punc
def tokenize(self, text, never_split=None):
"""
Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
Args:
never_split (`List[str]`, *optional*)
Kept for backward compatibility purposes. Now implemented directly at the base class level (see
[`PreTrainedTokenizer.tokenize`]) List of token not to split.
"""
never_split = self.never_split.union(set(never_split)) if never_split else self.never_split
text = self._clean_text(text)
if self.tokenize_chinese_chars:
text = self._tokenize_chinese_chars(text)
unicode_normalized_text = unicodedata.normalize("NFC", text)
orig_tokens = whitespace_tokenize(unicode_normalized_text)
split_tokens = []
for token in orig_tokens:
if token not in never_split:
if self.do_lower_case:
token = token.lower()
if self.strip_accents is not False:
token = self._run_strip_accents(token)
elif self.strip_accents:
token = self._run_strip_accents(token)
split_tokens.extend(self._run_split_on_punc(token, never_split))
output_tokens = whitespace_tokenize(" ".join(split_tokens))
return output_tokens
def _run_strip_accents(self, text):
"""Strips accents from a piece of text."""
text = unicodedata.normalize("NFD", text)
output = []
for char in text:
cat = unicodedata.category(char)
if cat == "Mn":
continue
output.append(char)
return "".join(output)
def _run_split_on_punc(self, text, never_split=None):
"""Splits punctuation on a piece of text."""
if not self.do_split_on_punc or (never_split is not None and text in never_split):
return [text]
chars = list(text)
i = 0
start_new_word = True
output = []
while i < len(chars):
char = chars[i]
if _is_punctuation(char):
output.append([char])
start_new_word = True
else:
if start_new_word:
output.append([])
start_new_word = False
output[-1].append(char)
i += 1
return ["".join(x) for x in output]
def _tokenize_chinese_chars(self, text):
"""Adds whitespace around any CJK character."""
output = []
for char in text:
cp = ord(char)
if self._is_chinese_char(cp):
output.append(" ")
output.append(char)
output.append(" ")
else:
output.append(char)
return "".join(output)
def _is_chinese_char(self, cp):
"""Checks whether CP is the codepoint of a CJK character."""
if (
(cp >= 0x4E00 and cp <= 0x9FFF)
or (cp >= 0x3400 and cp <= 0x4DBF)
or (cp >= 0x20000 and cp <= 0x2A6DF)
or (cp >= 0x2A700 and cp <= 0x2B73F)
or (cp >= 0x2B740 and cp <= 0x2B81F)
or (cp >= 0x2B820 and cp <= 0x2CEAF)
or (cp >= 0xF900 and cp <= 0xFAFF)
or (cp >= 0x2F800 and cp <= 0x2FA1F)
):
return True
return False
def _clean_text(self, text):
"""Performs invalid character removal and whitespace cleanup on text."""
output = []
for char in text:
cp = ord(char)
if cp == 0 or cp == 0xFFFD or _is_control(char):
continue
if _is_whitespace(char):
output.append(" ")
else:
output.append(char)
return "".join(output)
class WordpieceTokenizer(object):
"""Runs WordPiece tokenization."""
def __init__(self, vocab, unk_token, max_input_chars_per_word=100):
self.vocab = vocab
self.unk_token = unk_token
self.max_input_chars_per_word = max_input_chars_per_word
def tokenize(self, text):
"""
Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
tokenization using the given vocabulary.
For example, `input = "unaffable"` wil return as output `["un", "##aff", "##able"]`.
Args:
text: A single token or whitespace separated tokens. This should have
already been passed through *BasicTokenizer*.
Returns:
A list of wordpiece tokens.
"""
output_tokens = []
for token in whitespace_tokenize(text):
chars = list(token)
if len(chars) > self.max_input_chars_per_word:
output_tokens.append(self.unk_token)
continue
is_bad = False
start = 0
sub_tokens = []
while start < len(chars):
end = len(chars)
cur_substr = None
while start < end:
substr = "".join(chars[start:end])
if start > 0:
substr = "##" + substr
if substr in self.vocab:
cur_substr = substr
break
end -= 1
if cur_substr is None:
is_bad = True
break
sub_tokens.append(cur_substr)
start = end
if is_bad:
output_tokens.append(self.unk_token)
else:
output_tokens.extend(sub_tokens)
return output_tokens
.\models\lxmert\tokenization_lxmert_fast.py
import json
from typing import List, Optional, Tuple
from tokenizers import normalizers
from ...tokenization_utils_fast import PreTrainedTokenizerFast
from .tokenization_lxmert import LxmertTokenizer
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"unc-nlp/lxmert-base-uncased": "https://huggingface.co/unc-nlp/lxmert-base-uncased/resolve/main/vocab.txt",
},
"tokenizer_file": {
"unc-nlp/lxmert-base-uncased": (
"https://huggingface.co/unc-nlp/lxmert-base-uncased/resolve/main/tokenizer.json"
),
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"unc-nlp/lxmert-base-uncased": 512,
}
PRETRAINED_INIT_CONFIGURATION = {
"unc-nlp/lxmert-base-uncased": {"do_lower_case": True},
}
class LxmertTokenizerFast(PreTrainedTokenizerFast):
r"""
Construct a "fast" Lxmert tokenizer (backed by HuggingFace's *tokenizers* library). Based on WordPiece.
This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
refer to this superclass for more information regarding those methods.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
slow_tokenizer_class = LxmertTokenizer
def __init__(
self,
vocab_file=None,
tokenizer_file=None,
do_lower_case=True,
unk_token="[UNK]",
sep_token="[SEP]",
pad_token="[PAD]",
cls_token="[CLS]",
mask_token="[MASK]",
tokenize_chinese_chars=True,
strip_accents=None,
**kwargs,
):
super().__init__(
vocab_file,
tokenizer_file=tokenizer_file,
do_lower_case=do_lower_case,
unk_token=unk_token,
sep_token=sep_token,
pad_token=pad_token,
cls_token=cls_token,
mask_token=mask_token,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
**kwargs,
)
normalizer_state = json.loads(self.backend_tokenizer.normalizer.__getstate__())
if (
normalizer_state.get("lowercase", do_lower_case) != do_lower_case
or normalizer_state.get("strip_accents", strip_accents) != strip_accents
or normalizer_state.get("handle_chinese_chars", tokenize_chinese_chars) != tokenize_chinese_chars
):
normalizer_class = getattr(normalizers, normalizer_state.pop("type"))
normalizer_state["lowercase"] = do_lower_case
normalizer_state["strip_accents"] = strip_accents
normalizer_state["handle_chinese_chars"] = tokenize_chinese_chars
self.backend_tokenizer.normalizer = normalizer_class(**normalizer_state)
self.do_lower_case = do_lower_case
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. A Lxmert sequence has the following format:
- single sequence: `[CLS] X [SEP]`
- pair of sequences: `[CLS] A [SEP] B [SEP]`
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
if token_ids_1 is not None:
output += token_ids_1 + [self.sep_token_id]
return output
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
):
super().__init__(
vocab_file,
tokenizer_file=tokenizer_file,
do_lower_case=do_lower_case,
unk_token=unk_token,
sep_token=sep_token,
pad_token=pad_token,
cls_token=cls_token,
mask_token=mask_token,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
**kwargs,
)
normalizer_state = json.loads(self.backend_tokenizer.normalizer.__getstate__())
if (
normalizer_state.get("lowercase", do_lower_case) != do_lower_case
or normalizer_state.get("strip_accents", strip_accents) != strip_accents
or normalizer_state.get("handle_chinese_chars", tokenize_chinese_chars) != tokenize_chinese_chars
):
normalizer_class = getattr(normalizers, normalizer_state.pop("type"))
normalizer_state["lowercase"] = do_lower_case
normalizer_state["strip_accents"] = strip_accents
normalizer_state["handle_chinese_chars"] = tokenize_chinese_chars
self.backend_tokenizer.normalizer = normalizer_class(**normalizer_state)
self.do_lower_case = do_lower_case
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. A Lxmert sequence has the following format:
- single sequence: `[CLS] X [SEP]`
- pair of sequences: `[CLS] A [SEP] B [SEP]`
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
if token_ids_1 is not None:
output += token_ids_1 + [self.sep_token_id]
return output
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. A Lxmert sequence
pair mask has the following format:
```
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | second sequence |
```
If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
files = self._tokenizer.model.save(save_directory, name=filename_prefix)
return tuple(files)
.\models\lxmert\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_tf_available,
is_tokenizers_available,
is_torch_available,
)
_import_structure = {
"configuration_lxmert": ["LXMERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "LxmertConfig"],
"tokenization_lxmert": ["LxmertTokenizer"],
}
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_lxmert_fast"] = ["LxmertTokenizerFast"]
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_lxmert"] = [
"LxmertEncoder",
"LxmertForPreTraining",
"LxmertForQuestionAnswering",
"LxmertModel",
"LxmertPreTrainedModel",
"LxmertVisualFeatureEncoder",
"LxmertXLayer",
]
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_tf_lxmert"] = [
"TF_LXMERT_PRETRAINED_MODEL_ARCHIVE_LIST",
"TFLxmertForPreTraining",
"TFLxmertMainLayer",
"TFLxmertModel",
"TFLxmertPreTrainedModel",
"TFLxmertVisualFeatureEncoder",
]
if TYPE_CHECKING:
from .configuration_lxmert import LXMERT_PRETRAINED_CONFIG_ARCHIVE_MAP, LxmertConfig
from .tokenization_lxmert import LxmertTokenizer
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_lxmert_fast import LxmertTokenizerFast
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_lxmert import (
LxmertEncoder,
LxmertForPreTraining,
LxmertForQuestionAnswering,
LxmertModel,
LxmertPreTrainedModel,
LxmertVisualFeatureEncoder,
LxmertXLayer,
)
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_tf_lxmert import (
TF_LXMERT_PRETRAINED_MODEL_ARCHIVE_LIST,
TFLxmertForPreTraining,
TFLxmertMainLayer,
TFLxmertModel,
TFLxmertPreTrainedModel,
TFLxmertVisualFeatureEncoder,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\m2m_100\configuration_m2m_100.py
""" M2M100 model configuration"""
from collections import OrderedDict
from typing import Any, Mapping, Optional
from ... import PreTrainedTokenizer
from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfig, OnnxSeq2SeqConfigWithPast
from ...onnx.utils import compute_effective_axis_dimension
from ...utils import TensorType, is_torch_available, logging
logger = logging.get_logger(__name__)
M2M_100_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"facebook/m2m100_418M": "https://huggingface.co/facebook/m2m100_418M/resolve/main/config.json",
}
class M2M100Config(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`M2M100Model`]. It is used to instantiate an
M2M100 model according to the specified arguments, defining the model architecture. Instantiating a configuration
with the defaults will yield a similar configuration to that of the M2M100
[facebook/m2m100_418M](https://huggingface.co/facebook/m2m100_418M) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Example:
```
>>> from transformers import M2M100Config, M2M100Model
>>> # Initializing a M2M100 facebook/m2m100_418M style configuration
>>> configuration = M2M100Config()
>>> # Initializing a model (with random weights) from the facebook/m2m100_418M style configuration
>>> model = M2M100Model(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
model_type = "m2m_100"
keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}
def __init__(
self,
vocab_size=128112,
max_position_embeddings=1024,
encoder_layers=12,
encoder_ffn_dim=4096,
encoder_attention_heads=16,
decoder_layers=12,
decoder_ffn_dim=4096,
decoder_attention_heads=16,
encoder_layerdrop=0.05,
decoder_layerdrop=0.05,
use_cache=True,
is_encoder_decoder=True,
activation_function="relu",
d_model=1024,
dropout=0.1,
attention_dropout=0.1,
activation_dropout=0.0,
init_std=0.02,
decoder_start_token_id=2,
scale_embedding=True,
pad_token_id=1,
bos_token_id=0,
eos_token_id=2,
**kwargs,
):
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.d_model = d_model
self.encoder_ffn_dim = encoder_ffn_dim
self.encoder_layers = encoder_layers
self.encoder_attention_heads = encoder_attention_heads
self.decoder_ffn_dim = decoder_ffn_dim
self.decoder_layers = decoder_layers
self.decoder_attention_heads = decoder_attention_heads
self.dropout = dropout
self.attention_dropout = attention_dropout
self.activation_dropout = activation_dropout
self.activation_function = activation_function
self.init_std = init_std
self.encoder_layerdrop = encoder_layerdrop
self.decoder_layerdrop = decoder_layerdrop
self.use_cache = use_cache
self.num_hidden_layers = encoder_layers
self.scale_embedding = scale_embedding
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
is_encoder_decoder=is_encoder_decoder,
decoder_start_token_id=decoder_start_token_id,
**kwargs,
)
class M2M100OnnxConfig(OnnxSeq2SeqConfigWithPast):
@property
def inputs(self) -> Mapping[str, Mapping[int, str]]:
common_inputs = OrderedDict(
[
("input_ids", {0: "batch", 1: "encoder_sequence"}),
("attention_mask", {0: "batch", 1: "encoder_sequence"}),
]
)
if self.use_past:
common_inputs["decoder_input_ids"] = {0: "batch"}
common_inputs["decoder_attention_mask"] = {0: "batch", 1: "past_decoder_sequence + sequence"}
else:
common_inputs["decoder_input_ids"] = {0: "batch", 1: "decoder_sequence"}
common_inputs["decoder_attention_mask"] = {0: "batch", 1: "decoder_sequence"}
if self.use_past:
self.fill_with_past_key_values_(common_inputs, direction="inputs")
return common_inputs
def _generate_dummy_inputs_for_sequence_classification_and_question_answering(
self,
tokenizer: PreTrainedTokenizer,
batch_size: int = -1,
seq_length: int = -1,
is_pair: bool = False,
framework: Optional[TensorType] = None,
) -> Mapping[str, Any]:
batch_size = compute_effective_axis_dimension(
batch_size, fixed_dimension=OnnxConfig.default_fixed_batch, num_token_to_add=0
)
token_to_add = tokenizer.num_special_tokens_to_add(is_pair)
seq_length = compute_effective_axis_dimension(
seq_length, fixed_dimension=OnnxConfig.default_fixed_sequence, num_token_to_add=token_to_add
)
dummy_input = [" ".join([tokenizer.unk_token]) * seq_length] * batch_size
common_inputs = dict(tokenizer(dummy_input, return_tensors=framework))
return common_inputs
def _generate_dummy_inputs_for_default_and_seq2seq_lm(
self,
tokenizer: PreTrainedTokenizer,
batch_size: int = -1,
seq_length: int = -1,
is_pair: bool = False,
framework: Optional[TensorType] = None,
) -> Mapping[str, Any]:
encoder_inputs = self._generate_dummy_inputs_for_sequence_classification_and_question_answering(
tokenizer, batch_size, seq_length, is_pair, framework
)
decoder_seq_length = seq_length if not self.use_past else 1
decoder_inputs = self._generate_dummy_inputs_for_sequence_classification_and_question_answering(
tokenizer, batch_size, decoder_seq_length, is_pair, framework
)
decoder_inputs = {f"decoder_{name}": tensor for name, tensor in decoder_inputs.items()}
common_inputs = dict(**encoder_inputs, **decoder_inputs)
if self.use_past:
if not is_torch_available():
raise ValueError("Cannot generate dummy past_keys inputs without PyTorch installed.")
else:
import torch
batch, encoder_seq_length = common_inputs["input_ids"].shape
decoder_seq_length = common_inputs["decoder_input_ids"].shape[1]
num_encoder_attention_heads, num_decoder_attention_heads = self.num_attention_heads
encoder_shape = (
batch,
num_encoder_attention_heads,
encoder_seq_length,
self._config.hidden_size // num_encoder_attention_heads,
)
decoder_past_length = decoder_seq_length + 3
decoder_shape = (
batch,
num_decoder_attention_heads,
decoder_past_length,
self._config.hidden_size // num_decoder_attention_heads,
)
common_inputs["decoder_attention_mask"] = torch.cat(
[common_inputs["decoder_attention_mask"], torch.ones(batch, decoder_past_length)], dim=1
)
common_inputs["past_key_values"] = []
num_encoder_layers, num_decoder_layers = self.num_layers
min_num_layers = min(num_encoder_layers, num_decoder_layers)
max_num_layers = max(num_encoder_layers, num_decoder_layers) - min_num_layers
remaining_side_name = "encoder" if num_encoder_layers > num_decoder_layers else "decoder"
for _ in range(min_num_layers):
common_inputs["past_key_values"].append(
(
torch.zeros(decoder_shape),
torch.zeros(decoder_shape),
torch.zeros(encoder_shape),
torch.zeros(encoder_shape),
)
)
shape = encoder_shape if remaining_side_name == "encoder" else decoder_shape
for _ in range(min_num_layers, max_num_layers):
common_inputs["past_key_values"].append((torch.zeros(shape), torch.zeros(shape)))
return common_inputs
generate_dummy_inputs = _generate_dummy_inputs_for_default_and_seq2seq_lm
.\models\m2m_100\convert_m2m100_original_checkpoint_to_pytorch.py
import argparse
import torch
from torch import nn
from transformers import M2M100Config, M2M100ForConditionalGeneration
def remove_ignore_keys_(state_dict):
ignore_keys = [
"encoder.version",
"decoder.version",
"model.encoder.version",
"model.decoder.version",
"decoder.output_projection.weight",
"_float_tensor",
"encoder.embed_positions._float_tensor",
"decoder.embed_positions._float_tensor",
]
for k in ignore_keys:
state_dict.pop(k, None)
def make_linear_from_emb(emb):
vocab_size, emb_size = emb.weight.shape
lin_layer = nn.Linear(vocab_size, emb_size, bias=False)
lin_layer.weight.data = emb.weight.data
return lin_layer
def convert_fairseq_m2m100_checkpoint_from_disk(checkpoint_path):
m2m_100 = torch.load(checkpoint_path, map_location="cpu")
args = m2m_100["args"] or m2m_100["cfg"]["model"]
state_dict = m2m_100["model"]
remove_ignore_keys_(state_dict)
vocab_size = state_dict["encoder.embed_tokens.weight"].shape[0]
config = M2M100Config(
vocab_size=vocab_size,
max_position_embeddings=1024,
encoder_layers=args.encoder_layers,
decoder_layers=args.decoder_layers,
encoder_attention_heads=args.encoder_attention_heads,
decoder_attention_heads=args.decoder_attention_heads,
encoder_ffn_dim=args.encoder_ffn_embed_dim,
decoder_ffn_dim=args.decoder_ffn_embed_dim,
d_model=args.encoder_embed_dim,
encoder_layerdrop=args.encoder_layerdrop,
decoder_layerdrop=args.decoder_layerdrop,
dropout=args.dropout,
attention_dropout=args.attention_dropout,
activation_dropout=args.activation_dropout,
activation_function="relu",
)
state_dict["shared.weight"] = state_dict["decoder.embed_tokens.weight"]
model = M2M100ForConditionalGeneration(config)
model.model.load_state_dict(state_dict, strict=False)
model.lm_head = make_linear_from_emb(model.model.shared)
return model
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("fairseq_path", type=str, help="path to a model.pt on local filesystem.")
parser.add_argument("pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.")
args = parser.parse_args()
model = convert_fairseq_m2m100_checkpoint_from_disk(args.fairseq_path)
model.save_pretrained(args.pytorch_dump_folder_path)
.\models\m2m_100\modeling_m2m_100.py
""" PyTorch M2M100 model."""
import math
from typing import List, Optional, Tuple, Union
import torch
from torch import nn
from torch.nn import CrossEntropyLoss
from ...activations import ACT2FN
from ...integrations.deepspeed import is_deepspeed_zero3_enabled
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_causal_attention_mask
from ...modeling_outputs import (
BaseModelOutput,
BaseModelOutputWithPastAndCrossAttentions,
Seq2SeqLMOutput,
Seq2SeqModelOutput,
)
from ...modeling_utils import PreTrainedModel
from ...utils import (
add_code_sample_docstrings,
add_end_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_m2m_100 import M2M100Config
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "M2M100Config"
_CHECKPOINT_FOR_DOC = "facebook/m2m100_418M"
M2M_100_PRETRAINED_MODEL_ARCHIVE_LIST = [
"facebook/m2m100_418M",
]
def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
"""
将输入的token向右移动一位。
"""
shifted_input_ids = input_ids.new_zeros(input_ids.shape)
shifted_input_ids[:, 1:] = input_ids[:, :-1].clone()
shifted_input_ids[:, 0] = decoder_start_token_id
if pad_token_id is None:
raise ValueError("self.model.config.pad_token_id has to be defined.")
shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)
return shifted_input_ids
def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
"""
根据输入的input_ids生成位置id,非padding符号用它们的位置数字表示。位置数字从padding_idx+1开始,padding符号被忽略。
这是从fairseq的`utils.make_positions`修改而来的。
"""
mask = input_ids.ne(padding_idx).int()
incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
return incremental_indices.long() + padding_idx
class M2M100SinusoidalPositionalEmbedding(nn.Module):
"""This module produces sinusoidal positional embeddings of any length."""
def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None):
super().__init__()
self.offset = 2
self.embedding_dim = embedding_dim
self.padding_idx = padding_idx
self.make_weights(num_positions + self.offset, embedding_dim, padding_idx)
def make_weights(self, num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None):
emb_weights = self.get_embedding(num_embeddings, embedding_dim, padding_idx)
if hasattr(self, "weights"):
emb_weights = emb_weights.to(dtype=self.weights.dtype, device=self.weights.device)
self.register_buffer("weights", emb_weights, persistent=False)
@staticmethod
def get_embedding(num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None):
"""
Build sinusoidal embeddings.
This matches the implementation in tensor2tensor, but differs slightly from the description in Section 3.5 of
"Attention Is All You Need".
"""
half_dim = embedding_dim // 2
emb = math.log(10000) / (half_dim - 1)
emb = torch.exp(torch.arange(half_dim, dtype=torch.int64).float() * -emb)
emb = torch.arange(num_embeddings, dtype=torch.int64).float().unsqueeze(1) * emb.unsqueeze(0)
emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1).view(num_embeddings, -1)
if embedding_dim % 2 == 1:
emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1)
if padding_idx is not None:
emb[padding_idx, :] = 0
return emb.to(torch.get_default_dtype())
@torch.no_grad()
def forward(
self, input_ids: torch.Tensor = None, inputs_embeds: torch.Tensor = None, past_key_values_length: int = 0
):
if input_ids is not None:
bsz, seq_len = input_ids.size()
position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length).to(
input_ids.device
)
else:
bsz, seq_len = inputs_embeds.size()[:-1]
position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds, past_key_values_length)
max_pos = self.padding_idx + 1 + seq_len + past_key_values_length
if max_pos > self.weights.size(0):
self.make_weights(max_pos + self.offset, self.embedding_dim, self.padding_idx)
return self.weights.index_select(0, position_ids.view(-1)).view(bsz, seq_len, self.weights.shape[-1]).detach()
def create_position_ids_from_inputs_embeds(self, inputs_embeds, past_key_values_length):
"""
We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.
Args:
inputs_embeds: torch.Tensor # 输入的嵌入向量,形状为 [batch_size, sequence_length, embedding_size]
Returns: torch.Tensor # 返回形状与输入相同的位置编码标识符张量
"""
input_shape = inputs_embeds.size()[:-1]
sequence_length = input_shape[1]
position_ids = torch.arange(
self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
)
return position_ids.unsqueeze(0).expand(input_shape).contiguous() + past_key_values_length
class M2M100Attention(nn.Module):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
def __init__(
self,
embed_dim: int,
num_heads: int,
dropout: float = 0.0,
is_decoder: bool = False,
bias: bool = True,
is_causal: bool = False,
config: Optional[M2M100Config] = None,
):
super().__init__()
self.embed_dim = embed_dim
self.num_heads = num_heads
self.dropout = dropout
self.head_dim = embed_dim // num_heads
self.config = config
if (self.head_dim * num_heads) != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
f" and `num_heads`: {num_heads})."
)
self.scaling = self.head_dim**-0.5
self.is_decoder = is_decoder
self.is_causal = is_causal
self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
def forward(
self,
hidden_states: torch.Tensor,
key_value_states: Optional[torch.Tensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
attention_mask: Optional[torch.Tensor] = None,
layer_head_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
):
class M2M100EncoderLayer(nn.Module):
def __init__(self, config: M2M100Config):
super().__init__()
self.embed_dim = config.d_model
self.self_attn = M2M100_ATTENTION_CLASSES[config._attn_implementation](
embed_dim=self.embed_dim,
num_heads=config.encoder_attention_heads,
dropout=config.attention_dropout,
config=config,
)
self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
self.dropout = config.dropout
self.activation_fn = ACT2FN[config.activation_function]
self.activation_dropout = config.activation_dropout
self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
self.final_layer_norm = nn.LayerNorm(self.embed_dim)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: torch.Tensor,
layer_head_mask: torch.Tensor,
output_attentions: bool = False,
):
) -> torch.Tensor:
"""
Args:
hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
attention_mask (`torch.FloatTensor`): attention mask of size
`(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
`(encoder_attention_heads,)`.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
"""
residual = hidden_states
hidden_states = self.self_attn_layer_norm(hidden_states)
hidden_states, attn_weights, _ = self.self_attn(
hidden_states=hidden_states,
attention_mask=attention_mask,
layer_head_mask=layer_head_mask,
output_attentions=output_attentions,
)
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
hidden_states = residual + hidden_states
residual = hidden_states
hidden_states = self.final_layer_norm(hidden_states)
hidden_states = self.activation_fn(self.fc1(hidden_states))
hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
hidden_states = self.fc2(hidden_states)
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
hidden_states = residual + hidden_states
if hidden_states.dtype == torch.float16 and (
torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any()
):
clamp_value = torch.finfo(hidden_states.dtype).max - 1000
hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
outputs = (hidden_states,)
if output_attentions:
outputs += (attn_weights,)
return outputs
M2M100_ATTENTION_CLASSES = {"eager": M2M100Attention}
class M2M100DecoderLayer(nn.Module):
def __init__(self, config: M2M100Config):
super().__init__()
self.embed_dim = config.d_model
self.self_attn = M2M100_ATTENTION_CLASSES[config._attn_implementation](
embed_dim=self.embed_dim,
num_heads=config.decoder_attention_heads,
dropout=config.attention_dropout,
is_decoder=True,
is_causal=True,
config=config,
)
self.dropout = config.dropout
self.activation_fn = ACT2FN[config.activation_function]
self.activation_dropout = config.activation_dropout
self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
self.encoder_attn = M2M100_ATTENTION_CLASSES[config._attn_implementation](
self.embed_dim,
config.decoder_attention_heads,
dropout=config.attention_dropout,
is_decoder=True,
config=config,
)
self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)
self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)
self.final_layer_norm = nn.LayerNorm(self.embed_dim)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
layer_head_mask: Optional[torch.Tensor] = None,
cross_attn_layer_head_mask: Optional[torch.Tensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
output_attentions: Optional[bool] = False,
use_cache: Optional[bool] = True,
):
pass
class M2M100PreTrainedModel(PreTrainedModel):
config_class = M2M100Config
base_model_prefix = "model"
supports_gradient_checkpointing = True
_no_split_modules = ["M2M100Attention"]
def _init_weights(self, module):
std = self.config.init_std
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=std)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=std)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
M2M_100_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
"""
Parameters:
config ([`M2M100Config`]):
模型配置类,包含模型的所有参数。使用配置文件初始化时不会加载与模型相关的权重,只加载配置信息。
可以查看 [`~PreTrainedModel.from_pretrained`] 方法来加载模型的权重。
"""
M2M_100_GENERATION_EXAMPLE = r"""
Translation example:
```
>>> from transformers import AutoTokenizer, M2M100ForConditionalGeneration
>>> model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
>>> tokenizer = AutoTokenizer.from_pretrained("facebook/m2m100_418M")
>>> text_to_translate = "Life is like a box of chocolates"
>>> model_inputs = tokenizer(text_to_translate, return_tensors="pt")
>>>
>>> gen_tokens = model.generate(**model_inputs, forced_bos_token_id=tokenizer.get_lang_id("fr"))
>>> print(tokenizer.batch_decode(gen_tokens, skip_special_tokens=True))
```
"""
M2M_100_INPUTS_DOCSTRING = r"""
"""
class M2M100Encoder(M2M100PreTrainedModel):
"""
Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
[`M2M100EncoderLayer`].
Args:
config: M2M100Config
embed_tokens (nn.Embedding): output embedding
"""
def __init__(self, config: M2M100Config, embed_tokens: Optional[nn.Embedding] = None):
super().__init__(config)
self.dropout = config.dropout
self.layerdrop = config.encoder_layerdrop
embed_dim = config.d_model
self.padding_idx = config.pad_token_id
self.max_source_positions = config.max_position_embeddings
self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0
# Embedding layer for tokens
self.embed_tokens = nn.Embedding(config.vocab_size, embed_dim, self.padding_idx)
if embed_tokens is not None:
self.embed_tokens.weight = embed_tokens.weight
# Positional embedding for token positions
self.embed_positions = M2M100SinusoidalPositionalEmbedding(
config.max_position_embeddings,
embed_dim,
self.padding_idx,
)
# List of encoder layers
self.layers = nn.ModuleList([M2M100EncoderLayer(config) for _ in range(config.encoder_layers)])
# Layer normalization
self.layer_norm = nn.LayerNorm(config.d_model)
# Gradient checkpointing disabled by default
self.gradient_checkpointing = False
# Initialize weights and apply final processing
self.post_init()
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
# Implementation of forward pass for encoder
pass
class M2M100Decoder(M2M100PreTrainedModel):
"""
Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`M2M100DecoderLayer`]
Args:
config: M2M100Config
embed_tokens (nn.Embedding): output embedding
"""
# 初始化方法,接收配置参数和可选的嵌入层
def __init__(self, config: M2M100Config, embed_tokens: Optional[nn.Embedding] = None):
# 调用父类的初始化方法,传递配置参数
super().__init__(config)
# 设置对象的属性,从配置中获取各种参数
self.dropout = config.dropout
self.layerdrop = config.decoder_layerdrop
self.padding_idx = config.pad_token_id
self.max_target_positions = config.max_position_embeddings
self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0
# 创建嵌入层对象,vocab_size表示词汇表大小,d_model表示嵌入维度,padding_idx表示填充标识
self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model, self.padding_idx)
# 如果提供了外部的嵌入层,将其权重复制到当前的嵌入层
if embed_tokens is not None:
self.embed_tokens.weight = embed_tokens.weight
# 创建位置编码对象,使用正弦函数生成位置编码
self.embed_positions = M2M100SinusoidalPositionalEmbedding(
config.max_position_embeddings,
config.d_model,
self.padding_idx,
)
# 创建解码器层的列表,每个解码器层具有相同的配置参数
self.layers = nn.ModuleList([M2M100DecoderLayer(config) for _ in range(config.decoder_layers)])
# 创建层归一化对象,对隐藏层进行归一化处理
self.layer_norm = nn.LayerNorm(config.d_model)
# 是否启用梯度检查点,初始化为False
self.gradient_checkpointing = False
# 执行初始化后的处理操作,可能包括权重初始化和其他的后续处理
self.post_init()
# 前向传播方法,接收多个输入参数,实现模型的数据流向
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.Tensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
# 使用装饰器添加文档字符串到模型类,描述该类的基本信息和用途
@add_start_docstrings(
"The bare M2M100 Model outputting raw hidden-states without any specific head on top.",
M2M_100_START_DOCSTRING,
)
# 定义 M2M100Model 类,继承自 M2M100PreTrainedModel 类
class M2M100Model(M2M100PreTrainedModel):
# 定义用于共享权重的键列表
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
# 初始化函数,接受一个 M2M100Config 对象作为参数
def __init__(self, config: M2M100Config):
# 调用父类的初始化方法
super().__init__(config)
# 从配置中获取填充索引和词汇表大小
padding_idx, vocab_size = config.pad_token_id, config.vocab_size
# 创建一个共享的嵌入层,用于编码器和解码器
self.shared = nn.Embedding(vocab_size, config.d_model, padding_idx)
# 初始化编码器和解码器
self.encoder = M2M100Encoder(config, self.shared)
self.decoder = M2M100Decoder(config, self.shared)
# 初始化权重并应用最终处理
self.post_init()
# 获取输入嵌入层的方法
def get_input_embeddings(self):
return self.shared
# 设置输入嵌入层的方法
def set_input_embeddings(self, value):
self.shared = value
# 更新编码器和解码器的嵌入层
self.encoder.embed_tokens = self.shared
self.decoder.embed_tokens = self.shared
# 实现权重绑定的方法
def _tie_weights(self):
# 如果配置中指定了词嵌入层共享
if self.config.tie_word_embeddings:
# 将编码器和解码器的词嵌入层绑定或克隆为共享的嵌入层
self._tie_or_clone_weights(self.encoder.embed_tokens, self.shared)
self._tie_or_clone_weights(self.decoder.embed_tokens, self.shared)
# 获取编码器的方法
def get_encoder(self):
return self.encoder
# 获取解码器的方法
def get_decoder(self):
return self.decoder
# 前向传播方法,接受多个输入参数,并返回模型的输出
@add_start_docstrings_to_model_forward(M2M_100_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=Seq2SeqModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.Tensor] = None,
decoder_input_ids: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.Tensor] = None,
decoder_head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
# 函数参数的详细描述在文档字符串中给出
):
# 函数主体实现模型的前向传播逻辑,具体细节可以参考函数内部实现
) -> Union[Tuple[torch.Tensor], Seq2SeqModelOutput]:
# 设置输出注意力权重,如果未指定则使用配置中的默认设置
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
# 设置输出隐藏状态,如果未指定则使用配置中的默认设置
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
# 设置是否使用缓存,如果未指定则使用配置中的默认设置
use_cache = use_cache if use_cache is not None else self.config.use_cache
# 设置是否返回字典形式的输出,如果未指定则使用配置中的默认设置
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# 如果没有提供编码器的输出,则调用编码器来生成编码器的输出
if encoder_outputs is None:
encoder_outputs = self.encoder(
input_ids=input_ids,
attention_mask=attention_mask,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# 如果用户传入的是一个元组形式的编码器输出,在 return_dict=True 时将其包装为 BaseModelOutput 类型
elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
encoder_outputs = BaseModelOutput(
last_hidden_state=encoder_outputs[0],
hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
)
# 解码器的输出包括 (dec_features, past_key_value, dec_hidden, dec_attn)
decoder_outputs = self.decoder(
input_ids=decoder_input_ids,
attention_mask=decoder_attention_mask,
encoder_hidden_states=encoder_outputs[0],
encoder_attention_mask=attention_mask,
head_mask=decoder_head_mask,
cross_attn_head_mask=cross_attn_head_mask,
past_key_values=past_key_values,
inputs_embeds=decoder_inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# 如果不是以字典形式返回结果,则将解码器和编码器输出组合起来返回
if not return_dict:
return decoder_outputs + encoder_outputs
# 以 Seq2SeqModelOutput 类型返回结果,包括解码器和编码器的相关隐藏状态和注意力权重
return Seq2SeqModelOutput(
last_hidden_state=decoder_outputs.last_hidden_state,
past_key_values=decoder_outputs.past_key_values,
decoder_hidden_states=decoder_outputs.hidden_states,
decoder_attentions=decoder_outputs.attentions,
cross_attentions=decoder_outputs.cross_attentions,
encoder_last_hidden_state=encoder_outputs.last_hidden_state,
encoder_hidden_states=encoder_outputs.hidden_states,
encoder_attentions=encoder_outputs.attentions,
)
# 为 M2M100ForConditionalGeneration 类添加文档字符串,描述其作为带有语言建模头的 M2M100 模型,用于摘要生成
@add_start_docstrings(
"The M2M100 Model with a language modeling head. Can be used for summarization.", M2M_100_START_DOCSTRING
)
class M2M100ForConditionalGeneration(M2M100PreTrainedModel):
# 指定模型中用于连接的前缀
base_model_prefix = "model"
# 定义共享权重的键列表,这些权重被绑定在一起
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"]
# 初始化函数,接受 M2M100Config 类型的配置对象作为参数
def __init__(self, config: M2M100Config):
# 调用父类的初始化方法,传入配置对象
super().__init__(config)
# 创建一个 M2M100Model 模型实例,并赋值给 self.model
self.model = M2M100Model(config)
# 创建一个线性层 lm_head,用于语言建模任务的最终处理,将输入维度设为 config.d_model,输出维度设为 self.model.shared.num_embeddings,不带偏置
self.lm_head = nn.Linear(config.d_model, self.model.shared.num_embeddings, bias=False)
# 执行额外的初始化操作和最终处理
self.post_init()
# 返回模型中的编码器部分
def get_encoder(self):
return self.model.get_encoder()
# 返回模型中的解码器部分
def get_decoder(self):
return self.model.get_decoder()
# 返回 lm_head 层,用于输出嵌入
def get_output_embeddings(self):
return self.lm_head
# 设置 lm_head 层的新嵌入
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings
# 前向传播函数,接受多个输入参数,包括输入的 ID、注意力掩码、解码器输入的 ID 等
# 使用装饰器添加文档字符串,描述了输入参数和输出类型
# 使用装饰器替换返回值的文档字符串为 Seq2SeqLMOutput 类型,并指定相关配置类
# 添加末尾的文档字符串,展示了 M2M-100 生成任务的示例
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.Tensor] = None,
decoder_input_ids: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.Tensor] = None,
decoder_head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], Seq2SeqLMOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Returns:
Returns a tuple containing either torch.Tensor or Seq2SeqLMOutput.
"""
# Determine whether to use the provided return_dict or the default from configuration
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if labels is not None:
# If decoder_input_ids is not provided, shift labels to the right for decoder input
if decoder_input_ids is None:
decoder_input_ids = shift_tokens_right(
labels, self.config.pad_token_id, self.config.decoder_start_token_id
)
# Forward pass through the model with specified inputs and optional arguments
outputs = self.model(
input_ids,
attention_mask=attention_mask,
decoder_input_ids=decoder_input_ids,
encoder_outputs=encoder_outputs,
decoder_attention_mask=decoder_attention_mask,
head_mask=head_mask,
decoder_head_mask=decoder_head_mask,
cross_attn_head_mask=cross_attn_head_mask,
past_key_values=past_key_values,
inputs_embeds=inputs_embeds,
decoder_inputs_embeds=decoder_inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# Generate logits from the language model head
lm_logits = self.lm_head(outputs[0])
masked_lm_loss = None
if labels is not None:
# Move labels tensor to the same device as lm_logits for proper loss computation
labels = labels.to(lm_logits.device)
loss_fct = CrossEntropyLoss()
# Compute masked language modeling loss using CrossEntropyLoss
masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1))
if not return_dict:
# Return output as tuple if return_dict is False
output = (lm_logits,) + outputs[1:]
return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
# Return Seq2SeqLMOutput object containing relevant outputs if return_dict is True
return Seq2SeqLMOutput(
loss=masked_lm_loss,
logits=lm_logits,
past_key_values=outputs.past_key_values,
decoder_hidden_states=outputs.decoder_hidden_states,
decoder_attentions=outputs.decoder_attentions,
cross_attentions=outputs.cross_attentions,
encoder_last_hidden_state=outputs.encoder_last_hidden_state,
encoder_hidden_states=outputs.encoder_hidden_states,
encoder_attentions=outputs.encoder_attentions,
)
def prepare_inputs_for_generation(
self,
decoder_input_ids,
past_key_values=None,
attention_mask=None,
head_mask=None,
decoder_head_mask=None,
cross_attn_head_mask=None,
use_cache=None,
encoder_outputs=None,
**kwargs,
# 如果使用了过去的键值(past_key_values),则计算过去的长度
if past_key_values is not None:
past_length = past_key_values[0][0].shape[2]
# 一些生成方法可能已经仅传递了最后一个输入 ID
if decoder_input_ids.shape[1] > past_length:
remove_prefix_length = past_length
else:
# 默认行为:仅保留最后一个输入 ID
remove_prefix_length = decoder_input_ids.shape[1] - 1
# 修剪 decoder_input_ids,去除前面不需要的部分
decoder_input_ids = decoder_input_ids[:, remove_prefix_length:]
# 返回一个字典,包含不同的模型输入和掩码
return {
"input_ids": None, # encoder_outputs 已定义,input_ids 不再需要
"encoder_outputs": encoder_outputs,
"past_key_values": past_key_values,
"decoder_input_ids": decoder_input_ids,
"attention_mask": attention_mask,
"head_mask": head_mask,
"decoder_head_mask": decoder_head_mask,
"cross_attn_head_mask": cross_attn_head_mask,
"use_cache": use_cache, # 更改此项以避免缓存(可能是为了调试目的)
}
@staticmethod
def _reorder_cache(past_key_values, beam_idx):
reordered_past = ()
# 重新排序过去的键值,根据 beam_idx 进行重新排列
for layer_past in past_key_values:
reordered_past += (
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
)
return reordered_past
.\models\m2m_100\tokenization_m2m_100.py
"""Tokenization classes for M2M100."""
import json
import os
from pathlib import Path
from shutil import copyfile
from typing import Any, Dict, List, Optional, Tuple, Union
import sentencepiece
from ...tokenization_utils import BatchEncoding, PreTrainedTokenizer
from ...utils import logging
logger = logging.get_logger(__name__)
SPIECE_UNDERLINE = "▁"
VOCAB_FILES_NAMES = {
"vocab_file": "vocab.json",
"spm_file": "sentencepiece.bpe.model",
"tokenizer_config_file": "tokenizer_config.json",
}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"facebook/m2m100_418M": "https://huggingface.co/facebook/m2m100_418M/resolve/main/vocab.json",
"facebook/m2m100_1.2B": "https://huggingface.co/facebook/m2m100_1.2B/resolve/main/vocab.json",
},
"spm_file": {
"facebook/m2m100_418M": "https://huggingface.co/facebook/m2m100_418M/resolve/main/sentencepiece.bpe.model",
"facebook/m2m100_1.2B": "https://huggingface.co/facebook/m2m100_1.2B/resolve/main/sentencepiece.bpe.model",
},
"tokenizer_config_file": {
"facebook/m2m100_418M": "https://huggingface.co/facebook/m2m100_418M/resolve/main/tokenizer_config.json",
"facebook/m2m100_1.2B": "https://huggingface.co/facebook/m2m100_1.2B/resolve/main/tokenizer_config.json",
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"facebook/m2m100_418M": 1024,
}
FAIRSEQ_LANGUAGE_CODES = {
"m2m100": ["af", "am", "ar", "ast", "az", "ba", "be", "bg", "bn", "br", "bs", "ca", "ceb", "cs", "cy", "da", "de", "el", "en", "es", "et", "fa", "ff", "fi", "fr", "fy", "ga", "gd", "gl", "gu", "ha", "he", "hi", "hr", "ht", "hu", "hy", "id", "ig", "ilo", "is", "it", "ja", "jv", "ka", "kk", "km", "kn", "ko", "lb", "lg", "ln", "lo", "lt", "lv", "mg", "mk", "ml", "mn", "mr", "ms", "my", "ne", "nl", "no", "ns", "oc", "or", "pa", "pl", "ps", "pt", "ro", "ru", "sd", "si", "sk", "sl", "so", "sq", "sr", "ss", "su", "sv", "sw", "ta", "th", "tl", "tn", "tr", "uk", "ur", "uz", "vi", "wo", "xh", "yi", "yo", "zh", "zu"],
"wmt21": ['en', 'ha', 'is', 'ja', 'cs', 'ru', 'zh', 'de']
}
class M2M100Tokenizer(PreTrainedTokenizer):
"""
构造一个M2M100分词器。基于SentencePiece实现。
"""
This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
this superclass for more information regarding those methods.
Args:
vocab_file (`str`):
Path to the vocabulary file.
spm_file (`str`):
Path to [SentencePiece](https://github.com/google/sentencepiece) file (generally has a .spm extension) that
contains the vocabulary.
src_lang (`str`, *optional*):
A string representing the source language.
tgt_lang (`str`, *optional*):
A string representing the target language.
eos_token (`str`, *optional*, defaults to `"</s>"`):
The end of sequence token.
sep_token (`str`, *optional*, defaults to `"</s>"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens.
unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
pad_token (`str`, *optional*, defaults to `"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
language_codes (`str`, *optional*, defaults to `"m2m100"`):
What language codes to use. Should be one of `"m2m100"` or `"wmt21"`.
sp_model_kwargs (`dict`, *optional*):
Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
to set:
- `enable_sampling`: Enable subword regularization.
- `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
- `nbest_size = {0,1}`: No sampling is performed.
- `nbest_size > 1`: samples from the nbest_size results.
- `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
using forward-filtering-and-backward-sampling algorithm.
- `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
BPE-dropout.
Examples:
```
>>> from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
>>> model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
>>> tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M", src_lang="en", tgt_lang="ro")
>>> src_text = " UN Chief Says There Is No Military Solution in Syria"
>>> tgt_text = "Şeful ONU declară că nu există o soluţie militară în Siria"
model_inputs = tokenizer(src_text, text_target=tgt_text, return_tensors="pt")
outputs = model(**model_inputs)
vocab_files_names = VOCAB_FILES_NAMES
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
model_input_names = ["input_ids", "attention_mask"]
prefix_tokens: List[int] = []
suffix_tokens: List[int] = []
def __init__(
self,
vocab_file,
spm_file,
src_lang=None,
tgt_lang=None,
bos_token="<s>",
eos_token="</s>",
sep_token="</s>",
pad_token="<pad>",
unk_token="<unk>",
language_codes="m2m100",
sp_model_kwargs: Optional[Dict[str, Any]] = None,
num_madeup_words=8,
**kwargs,
) -> None:
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
self.language_codes = language_codes
fairseq_language_code = FAIRSEQ_LANGUAGE_CODES[language_codes]
self.lang_code_to_token = {lang_code: f"__{lang_code}__" for lang_code in fairseq_language_code}
additional_special_tokens = kwargs.pop("additional_special_tokens", [])
for lang_code in fairseq_language_code:
token = self.get_lang_token(lang_code)
if token not in additional_special_tokens and lang_code not in str(token) not in self.added_tokens_encoder:
additional_special_tokens.append(token)
self.vocab_file = vocab_file
self.encoder = load_json(vocab_file)
self.decoder = {v: k for k, v in self.encoder.items()}
self.spm_file = spm_file
self.sp_model = load_spm(spm_file, self.sp_model_kwargs)
self.encoder_size = len(self.encoder)
self.lang_token_to_id = {
self.get_lang_token(lang_code): self.encoder_size + i for i, lang_code in enumerate(fairseq_language_code)
}
self.lang_code_to_id = {lang_code: self.encoder_size + i for i, lang_code in enumerate(fairseq_language_code)}
self.id_to_lang_token = {v: k for k, v in self.lang_token_to_id.items()}
self._src_lang = src_lang if src_lang is not None else "en"
self.tgt_lang = tgt_lang
self.cur_lang_id = self.get_lang_id(self._src_lang)
self.num_madeup_words = num_madeup_words
super().__init__(
src_lang=src_lang,
tgt_lang=tgt_lang,
bos_token=bos_token,
eos_token=eos_token,
sep_token=sep_token,
unk_token=unk_token,
pad_token=pad_token,
language_codes=language_codes,
sp_model_kwargs=self.sp_model_kwargs,
additional_special_tokens=additional_special_tokens,
num_madeup_words=num_madeup_words,
**kwargs,
)
self.set_src_lang_special_tokens(self._src_lang)
@property
def vocab_size(self) -> int:
return len(self.encoder)
def get_vocab(self) -> Dict:
vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
vocab.update(self.added_tokens_encoder)
return vocab
@property
def src_lang(self) -> str:
return self._src_lang
@src_lang.setter
def src_lang(self, new_src_lang: str) -> None:
self._src_lang = new_src_lang
self.set_src_lang_special_tokens(self._src_lang)
def _tokenize(self, text: str) -> List[str]:
return self.sp_model.encode(text, out_type=str)
def _convert_token_to_id(self, token):
if token in self.lang_token_to_id:
return self.lang_token_to_id[token]
return self.encoder.get(token, self.encoder[self.unk_token])
def _convert_id_to_token(self, index: int) -> str:
"""Converts an index (integer) in a token (str) using the decoder."""
if index in self.id_to_lang_token:
return self.id_to_lang_token[index]
return self.decoder.get(index, self.unk_token)
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (string) in a single string."""
current_sub_tokens = []
out_string = ""
for token in tokens:
if token in self.all_special_tokens:
out_string += self.sp_model.decode(current_sub_tokens) + token
current_sub_tokens = []
else:
current_sub_tokens.append(token)
out_string += self.sp_model.decode(current_sub_tokens)
return out_string.strip()
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
):
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` method.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
return super().get_special_tokens_mask(
token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
)
prefix_ones = [1] * len(self.prefix_tokens)
suffix_ones = [1] * len(self.suffix_tokens)
if token_ids_1 is None:
return prefix_ones + ([0] * len(token_ids_0)) + suffix_ones
return prefix_ones + ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. An MBART sequence has the following format, where `X` represents the sequence:
- `input_ids` (for encoder) `X [eos, src_lang_code]`
- `decoder_input_ids`: (for decoder) `X [eos, tgt_lang_code]`
BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a
separator.
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
if token_ids_1 is None:
return self.prefix_tokens + token_ids_0 + self.suffix_tokens
return self.prefix_tokens + token_ids_0 + token_ids_1 + self.suffix_tokens
def __getstate__(self) -> Dict:
state = self.__dict__.copy()
state["sp_model"] = None
return state
def __setstate__(self, d: Dict) -> None:
self.__dict__ = d
if not hasattr(self, "sp_model_kwargs"):
self.sp_model_kwargs = {}
self.sp_model = load_spm(self.spm_file, self.sp_model_kwargs)
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
save_dir = Path(save_directory)
if not save_dir.is_dir():
raise OSError(f"{save_directory} should be a directory")
vocab_save_path = save_dir / (
(filename_prefix + "-" if filename_prefix else "") + self.vocab_files_names["vocab_file"]
)
spm_save_path = save_dir / (
(filename_prefix + "-" if filename_prefix else "") + self.vocab_files_names["spm_file"]
)
save_json(self.encoder, vocab_save_path)
if os.path.abspath(self.spm_file) != os.path.abspath(spm_save_path) and os.path.isfile(self.spm_file):
copyfile(self.spm_file, spm_save_path)
elif not os.path.isfile(self.spm_file):
with open(spm_save_path, "wb") as fi:
content_spiece_model = self.sp_model.serialized_model_proto()
fi.write(content_spiece_model)
return (str(vocab_save_path), str(spm_save_path))
def prepare_seq2seq_batch(
self,
src_texts: List[str],
src_lang: str = "en",
tgt_texts: Optional[List[str]] = None,
tgt_lang: str = "ro",
**kwargs,
) -> BatchEncoding:
self.src_lang = src_lang
self.tgt_lang = tgt_lang
self.set_src_lang_special_tokens(self.src_lang)
return super().prepare_seq2seq_batch(src_texts, tgt_texts, **kwargs)
def _build_translation_inputs(self, raw_inputs, src_lang: Optional[str], tgt_lang: Optional[str], **extra_kwargs):
"""Used by translation pipeline, to prepare inputs for the generate function"""
if src_lang is None or tgt_lang is None:
raise ValueError("Translation requires a `src_lang` and a `tgt_lang` for this model")
self.src_lang = src_lang
inputs = self(raw_inputs, add_special_tokens=True, **extra_kwargs)
tgt_lang_id = self.get_lang_id(tgt_lang)
inputs["forced_bos_token_id"] = tgt_lang_id
return inputs
def _switch_to_input_mode(self):
self.set_src_lang_special_tokens(self.src_lang)
def _switch_to_target_mode(self):
self.set_tgt_lang_special_tokens(self.tgt_lang)
def set_src_lang_special_tokens(self, src_lang: str) -> None:
"""Reset the special tokens to the source lang setting. No prefix and suffix=[eos, src_lang_code]."""
lang_token = self.get_lang_token(src_lang)
self.cur_lang_id = self.lang_token_to_id[lang_token]
self.prefix_tokens = [self.cur_lang_id]
self.suffix_tokens = [self.eos_token_id]
def set_tgt_lang_special_tokens(self, tgt_lang: str) -> None:
lang_token = self.get_lang_token(tgt_lang)
self.cur_lang_id = self.lang_token_to_id[lang_token]
self.prefix_tokens = [self.cur_lang_id]
self.suffix_tokens = [self.eos_token_id]
def get_lang_token(self, lang: str) -> str:
return self.lang_code_to_token[lang]
def get_lang_id(self, lang: str) -> int:
lang_token = self.get_lang_token(lang)
return self.lang_token_to_id[lang_token]
def load_spm(path: str, sp_model_kwargs: Dict[str, Any]) -> sentencepiece.SentencePieceProcessor:
spm = sentencepiece.SentencePieceProcessor(**sp_model_kwargs)
spm.Load(str(path))
return spm
def load_json(path: str) -> Union[Dict, List]:
with open(path, "r") as f:
return json.load(f)
def save_json(data, path: str) -> None:
with open(path, "w") as f:
json.dump(data, f, indent=2)
.\models\m2m_100\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_tokenizers_available, is_torch_available
_import_structure = {
"configuration_m2m_100": ["M2M_100_PRETRAINED_CONFIG_ARCHIVE_MAP", "M2M100Config", "M2M100OnnxConfig"],
"tokenization_m2m_100": ["M2M100Tokenizer"],
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_m2m_100"] = [
"M2M_100_PRETRAINED_MODEL_ARCHIVE_LIST",
"M2M100ForConditionalGeneration",
"M2M100Model",
"M2M100PreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_m2m_100 import M2M_100_PRETRAINED_CONFIG_ARCHIVE_MAP, M2M100Config, M2M100OnnxConfig
from .tokenization_m2m_100 import M2M100Tokenizer
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_m2m_100 import (
M2M_100_PRETRAINED_MODEL_ARCHIVE_LIST,
M2M100ForConditionalGeneration,
M2M100Model,
M2M100PreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\mamba\configuration_mamba.py
"""MAMBA configuration"""
import math
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
MAMBA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"state-spaces/mamba-2.8b": "https://huggingface.co/state-spaces/mamba-2.8b/resolve/main/config.json",
}
class MambaConfig(PretrainedConfig):
"""
This is the configuration class to store the configuration of a [`MambaModel`]. It is used to instantiate a MAMBA
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the MAMBA
[state-spaces/mamba-2.8b](https://huggingface.co/state-spaces/mamba-2.8b) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Example:
```
>>> from transformers import MambaConfig, MambaModel
>>> # Initializing a Mamba configuration
>>> configuration = MambaConfig()
>>> # Initializing a model (with random weights) from the configuration
>>> model = MambaModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
model_type = "mamba"
def __init__(
self,
vocab_size=50280,
hidden_size=768,
state_size=16,
num_hidden_layers=32,
layer_norm_epsilon=1e-5,
pad_token_id=0,
bos_token_id=0,
eos_token_id=0,
expand=2,
conv_kernel=4,
use_bias=False,
use_conv_bias=True,
hidden_act="silu",
initializer_range=0.1,
residual_in_fp32=True,
time_step_rank="auto",
time_step_scale=1.0,
time_step_min=0.001,
time_step_max=0.1,
time_step_init_scheme="random",
time_step_floor=1e-4,
rescale_prenorm_residual=False,
use_cache=True,
**kwargs,
):
"""
初始化 MambaConfig 类,设置 MAMBA 模型的配置参数。
参数:
vocab_size (int): 词汇表大小,默认为 50280
hidden_size (int): 隐藏层大小,默认为 768
state_size (int): 状态大小,默认为 16
num_hidden_layers (int): 隐藏层层数,默认为 32
layer_norm_epsilon (float): 层归一化的 epsilon 值,默认为 1e-5
pad_token_id (int): 填充标记的 ID,默认为 0
bos_token_id (int): 起始标记的 ID,默认为 0
eos_token_id (int): 结束标记的 ID,默认为 0
expand (int): 扩展参数,默认为 2
conv_kernel (int): 卷积核大小,默认为 4
use_bias (bool): 是否使用偏置,默认为 False
use_conv_bias (bool): 是否使用卷积偏置,默认为 True
hidden_act (str): 隐藏层激活函数,默认为 "silu"
initializer_range (float): 初始化范围,默认为 0.1
residual_in_fp32 (bool): 是否在 fp32 下进行残差连接,默认为 True
time_step_rank (str): 时间步长等级,默认为 "auto"
time_step_scale (float): 时间步长缩放,默认为 1.0
time_step_min (float): 最小时间步长,默认为 0.001
time_step_max (float): 最大时间步长,默认为 0.1
time_step_init_scheme (str): 时间步长初始化方案,默认为 "random"
time_step_floor (float): 时间步长下限,默认为 1e-4
rescale_prenorm_residual (bool): 是否对预归一化残差进行重新缩放,默认为 False
use_cache (bool): 是否使用缓存,默认为 True
**kwargs: 其他关键字参数
"""
super().__init__(**kwargs)
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.state_size = state_size
self.num_hidden_layers = num_hidden_layers
self.layer_norm_epsilon = layer_norm_epsilon
self.conv_kernel = conv_kernel
self.expand = expand
self.intermediate_size = int(expand * self.hidden_size)
self.bos_token_id = bos_token_id
self.eos_token_id = eos_token_id
self.pad_token_id = pad_token_id
self.use_bias = use_bias
self.use_conv_bias = use_conv_bias
self.hidden_act = hidden_act
self.initializer_range = initializer_range
self.time_step_rank = math.ceil(self.hidden_size / 16) if time_step_rank == "auto" else time_step_rank
self.time_step_scale = time_step_scale
self.time_step_min = time_step_min
self.time_step_max = time_step_max
self.time_step_init_scheme = time_step_init_scheme
self.time_step_floor = time_step_floor
self.rescale_prenorm_residual = rescale_prenorm_residual
self.residual_in_fp32 = residual_in_fp32
self.use_cache = use_cache
super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, pad_token_id=pad_token_id, **kwargs)
.\models\mamba\modeling_mamba.py
"""PyTorch MAMBA model."""
import math
from dataclasses import dataclass
from typing import Any, Dict, Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import CrossEntropyLoss
from ...activations import ACT2FN
from ...modeling_utils import PreTrainedModel
from ...utils import (
ModelOutput,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
)
from ...utils.import_utils import is_causal_conv1d_available, is_mamba_ssm_available
from .configuration_mamba import MambaConfig
logger = logging.get_logger(__name__)
if is_mamba_ssm_available():
from mamba_ssm.ops.selective_scan_interface import mamba_inner_fn, selective_scan_fn
from mamba_ssm.ops.triton.selective_state_update import selective_state_update
else:
selective_state_update, selective_scan_fn, mamba_inner_fn = None, None, None
if is_causal_conv1d_available():
from causal_conv1d import causal_conv1d_fn, causal_conv1d_update
else:
causal_conv1d_update, causal_conv1d_fn = None, None
is_fast_path_available = all(
(selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)
)
_CHECKPOINT_FOR_DOC = "state-spaces/mamba-130m-hf"
_CONFIG_FOR_DOC = "MambaConfig"
MAMBA_PRETRAINED_MODEL_ARCHIVE_LIST = []
class MambaCache:
def __init__(self, config, batch_size, dtype=torch.float16, device=None):
"""
Initialize MambaCache object.
Args:
config (MambaConfig): The configuration object for MAMBA model.
batch_size (int): Batch size for the cache.
dtype (torch.dtype, optional): Data type for tensors in cache (default: torch.float16).
device (torch.device, optional): Device for tensors in cache (default: None).
"""
self.seqlen_offset = 0
self.dtype = dtype
intermediate_size = config.intermediate_size
ssm_state_size = config.state_size
conv_kernel_size = config.conv_kernel
self.conv_states = {
i: torch.zeros(batch_size, intermediate_size, conv_kernel_size, device=device, dtype=dtype)
for i in range(config.num_hidden_layers)
}
self.ssm_states = {
i: torch.zeros(batch_size, intermediate_size, ssm_state_size, device=device, dtype=dtype)
for i in range(config.num_hidden_layers)
}
class MambaMixer(nn.Module):
"""
Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
"""
"""
∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
and is why Mamba is called **selective** state spaces)
"""
def __init__(self, config, layer_idx):
super().__init__()
self.hidden_size = config.hidden_size
self.ssm_state_size = config.state_size
self.conv_kernel_size = config.conv_kernel
self.intermediate_size = config.intermediate_size
self.time_step_rank = config.time_step_rank
self.layer_idx = layer_idx
self.use_conv_bias = config.use_conv_bias
self.conv1d = nn.Conv1d(
in_channels=self.intermediate_size,
out_channels=self.intermediate_size,
bias=config.use_conv_bias,
kernel_size=config.conv_kernel,
groups=self.intermediate_size,
padding=config.conv_kernel - 1,
)
self.activation = config.hidden_act
self.act = ACT2FN[config.hidden_act]
self.in_proj = nn.Linear(self.hidden_size, self.intermediate_size * 2, bias=config.use_bias)
self.x_proj = nn.Linear(self.intermediate_size, self.time_step_rank + self.ssm_state_size * 2, bias=False)
self.dt_proj = nn.Linear(self.time_step_rank, self.intermediate_size, bias=True)
A = torch.arange(1, self.ssm_state_size + 1, dtype=torch.float32)[None, :]
A = A.expand(self.intermediate_size, -1).contiguous()
self.A_log = nn.Parameter(torch.log(A))
self.D = nn.Parameter(torch.ones(self.intermediate_size))
self.out_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.use_bias)
self.use_bias = config.use_bias
if not is_fast_path_available:
logger.warning_once(
"The fast path is not available because on of `(selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)`"
" is None. Falling back to the naive implementation. To install follow https://github.com/state-spaces/mamba/#installation and"
" https://github.com/Dao-AILab/causal-conv1d"
)
def forward(self, hidden_states, cache_params: Optional[MambaCache] = None):
if is_fast_path_available and "cuda" in self.x_proj.weight.device.type:
return self.cuda_kernels_forward(hidden_states, cache_params)
return self.slow_forward(hidden_states, cache_params)
class MambaRMSNorm(nn.Module):
def __init__(self, hidden_size, eps=1e-6):
"""
MambaRMSNorm is equivalent to T5LayerNorm and LlamaRMSNorm
"""
super().__init__()
self.weight = nn.Parameter(torch.ones(hidden_size))
self.variance_epsilon = eps
def forward(self, hidden_states):
input_dtype = hidden_states.dtype
hidden_states = hidden_states.to(torch.float32)
variance = hidden_states.pow(2).mean(-1, keepdim=True)
hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
return self.weight * hidden_states.to(input_dtype)
class MambaBlock(nn.Module):
def __init__(self, config, layer_idx):
super().__init__()
self.config = config
self.layer_idx = layer_idx
self.residual_in_fp32 = config.residual_in_fp32
self.norm = MambaRMSNorm(config.hidden_size, eps=config.layer_norm_epsilon)
self.mixer = MambaMixer(config, layer_idx=layer_idx)
def forward(self, hidden_states, cache_params: Optional[MambaCache] = None):
residual = hidden_states
hidden_states = self.norm(hidden_states.to(dtype=self.norm.weight.dtype))
if self.residual_in_fp32:
residual = residual.to(torch.float32)
hidden_states = self.mixer(hidden_states, cache_params=cache_params)
hidden_states = residual + hidden_states
return hidden_states
class MambaPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = MambaConfig
base_model_prefix = "backbone"
_no_split_modules = ["MambaBlock"]
supports_gradient_checkpointing = True
def _init_weights(self, module):
"""Initialize the weights."""
if isinstance(module, MambaMixer):
module.A_log._no_weight_decay = True
module.D._no_weight_decay = True
dt_init_std = self.config.time_step_rank**-0.5 * self.config.time_step_scale
if self.config.time_step_init_scheme == "constant":
nn.init.constant_(module.dt_proj.weight, dt_init_std)
elif self.config.time_step_init_scheme == "random":
nn.init.uniform_(module.dt_proj.weight, -dt_init_std, dt_init_std)
dt = torch.exp(
torch.rand(self.config.intermediate_size)
* (math.log(self.config.time_step_max) - math.log(self.config.time_step_min))
+ math.log(self.config.time_step_min)
).clamp(min=self.config.time_step_floor)
inv_dt = dt + torch.log(-torch.expm1(-dt))
with torch.no_grad():
module.dt_proj.bias.copy_(inv_dt)
module.dt_proj.bias._no_reinit = True
if isinstance(module, nn.Linear):
if module.bias is not None:
if not getattr(module.bias, "_no_reinit", False):
nn.init.zeros_(module.bias)
elif isinstance(module, nn.Embedding):
nn.init.normal_(module.weight, std=self.config.initializer_range)
if self.config.rescale_prenorm_residual:
for name, p in module.named_parameters():
if name in ["out_proj.weight"]:
nn.init.kaiming_uniform_(p, a=math.sqrt(5))
with torch.no_grad():
p /= math.sqrt(self.config.num_layers)
"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
"""
MambaOutput:
"""
Class for the MAMBA model outputs.
Args:
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
cache_params (`MambaCache`):
The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
avoid providing the old `input_ids`.
Includes both the State space model state matrices after the selective scan, and the Convolutional states
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
"""
last_hidden_state: Optional[torch.FloatTensor] = None
cache_params: Optional[MambaCache] = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
MambaCausalLMOutput:
"""
Base class for causal language model (or autoregressive) outputs.
Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
cache_params (`MambaCache`):
The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
avoid providing the old `input_ids`.
Includes both the State space model state matrices after the selective scan, and the Convolutional states
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
"""
loss: Optional[torch.FloatTensor] = None
logits: Optional[torch.FloatTensor] = None
cache_params: Optional[MambaCache] = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`MambaConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
MAMBA_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
Indices of input sequence tokens in the vocabulary.
If `cache_params.seqlen_offset>0`, only `input_ids` that do not have their past calculated should be passed as
`input_ids`.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
model's internal embedding lookup matrix.
cache_params (`MambaCache`, *optional*):
If passed along, the model uses the previous state in all the blocks (which will give the output for the
`input_ids` provided as if the model add `state_input_ids + input_ids` as context).
use_cache (`bool`, *optional*):
If set to `True`, the `cache_params` is returned and can be used to quickly generate the next logits.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@add_start_docstrings(
"The bare MAMBA Model transformer outputting raw hidden-states without any specific head on top.",
MAMBA_START_DOCSTRING,
)
class MambaModel(MambaPreTrainedModel):
"""
MAMBA 模型的核心类,输出未经特定头部处理的原始隐藏状态。
"""
def __init__(self, config):
"""
初始化 MambaModel 类。
Args:
config (MambaConfig): 包含模型配置信息的实例。
Attributes:
embeddings (nn.Embedding): 输入 token 的嵌入表示。
layers (nn.ModuleList): MambaBlock 层的列表,构成模型的主体。
gradient_checkpointing (bool): 是否使用梯度检查点。
norm_f (MambaRMSNorm): 应用于隐藏状态的 RMS 标准化器。
"""
super().__init__(config)
self.embeddings = nn.Embedding(config.vocab_size, config.hidden_size)
self.layers = nn.ModuleList([MambaBlock(config, layer_idx=idx) for idx in range(config.num_hidden_layers)])
self.gradient_checkpointing = False
self.norm_f = MambaRMSNorm(config.hidden_size, eps=config.layer_norm_epsilon)
# 初始化权重并应用最终处理
self.post_init()
def get_input_embeddings(self):
"""
获取输入嵌入层。
Returns:
nn.Embedding: 输入嵌入层对象。
"""
return self.embeddings
def set_input_embeddings(self, new_embeddings):
"""
设置输入嵌入层。
Args:
new_embeddings (nn.Embedding): 新的输入嵌入层对象。
"""
self.embeddings = new_embeddings
@add_start_docstrings_to_model_forward(MAMBA_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=MambaOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(self, input_ids=None, inputs_embeds=None, cache_params=None, use_cache=False,
output_hidden_states=False, return_dict=True):
"""
模型的前向传播。
Args:
input_ids (torch.LongTensor, optional): 输入 token 的索引序列。
inputs_embeds (torch.FloatTensor, optional): 输入 token 的嵌入表示。
cache_params (MambaCache, optional): 缓存参数,用于模型的历史状态。
use_cache (bool, optional): 如果为 True,则返回缓存参数以便快速生成下一个 logit。
output_hidden_states (bool, optional): 是否返回所有层的隐藏状态。
return_dict (bool, optional): 是否返回 ModelOutput 对象而不是普通元组。
Returns:
ModelOutput or tuple: 模型输出对象或普通元组,具体取决于 return_dict 参数的设置。
"""
pass
# 定义模型的前向传播方法,接受多个输入参数,并返回一个元组或MambaOutput对象
def forward(
self,
input_ids: Optional[torch.LongTensor] = None, # 输入的token ids序列,可选
inputs_embeds: Optional[torch.LongTensor] = None, # 输入的嵌入表示,可选
cache_params: Optional[MambaCache] = None, # 缓存参数对象,可选
use_cache: Optional[bool] = None, # 是否使用缓存,可选
output_hidden_states: Optional[bool] = None, # 是否输出所有隐藏状态,可选
return_dict: Optional[bool] = None, # 是否返回字典形式的输出,可选
**kwargs, # 其他关键字参数,例如attention_mask由分词器传递,不需要处理
) -> Union[Tuple, MambaOutput]: # 返回值可以是元组或MambaOutput对象
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
) # 如果没有显式指定输出隐藏状态,则使用配置中的默认设置
use_cache = use_cache if use_cache is not None else (self.config.use_cache if not self.training else False)
# 如果没有显式指定是否使用缓存,则根据训练状态和模型配置进行设定
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# 如果没有显式指定是否返回字典形式的输出,则根据模型配置进行设定
if (input_ids is None) ^ (inputs_embeds is not None): # 异或运算符判断输入参数的合法性
raise ValueError(
"You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
)
if inputs_embeds is None:
inputs_embeds = self.embeddings(input_ids) # 如果没有提供嵌入表示,则根据输入的token ids生成嵌入表示
if self.gradient_checkpointing and self.training and use_cache:
use_cache = False # 如果启用了梯度检查点且处于训练模式并且使用缓存,则禁用缓存
if cache_params is None and use_cache:
cache_params = MambaCache(
self.config, inputs_embeds.size(0), device=inputs_embeds.device, dtype=inputs_embeds.dtype
) # 如果没有提供缓存参数且需要使用缓存,则创建新的MambaCache对象
hidden_states = inputs_embeds # 将嵌入表示作为初始隐藏状态
all_hidden_states = () if output_hidden_states else None # 如果需要输出所有隐藏状态,则初始化空元组
for mixer_block in self.layers:
if self.gradient_checkpointing and self.training:
hidden_states = self._gradient_checkpointing_func(mixer_block.__call__, hidden_states, cache_params)
# 如果启用了梯度检查点并且处于训练模式,则使用梯度检查点函数计算mixer_block的输出
else:
hidden_states = mixer_block(hidden_states, cache_params=cache_params)
# 否则直接调用mixer_block计算隐藏状态
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,) # 如果需要输出所有隐藏状态,则保存当前隐藏状态
if use_cache:
cache_params.seqlen_offset += inputs_embeds.shape[1] # 更新缓存参数的序列长度偏移量
hidden_states = self.norm_f(hidden_states) # 对最终的隐藏状态进行归一化处理
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,) # 如果需要输出所有隐藏状态,则保存最终的隐藏状态
if not return_dict:
return tuple(v for v in [hidden_states, cache_params, all_hidden_states] if v is not None)
# 如果不需要返回字典形式的输出,则返回一个元组,包含非空的hidden_states、cache_params和all_hidden_states
return MambaOutput(
last_hidden_state=hidden_states, # 返回MambaOutput对象,包括最终的隐藏状态
cache_params=cache_params if use_cache else None, # 如果使用缓存,则返回缓存参数,否则为None
hidden_states=all_hidden_states, # 返回所有的隐藏状态
)
@add_start_docstrings(
"""
The MAMBA Model transformer with a language modeling head on top (linear layer with weights tied to the input
embeddings).
""",
MAMBA_START_DOCSTRING,
)
class MambaForCausalLM(MambaPreTrainedModel):
_tied_weights_keys = ["lm_head.weight"]
def __init__(self, config):
super().__init__(config)
self.backbone = MambaModel(config) # 初始化 MambaModel 作为 backbone
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
# 初始化线性层 lm_head,用于语言建模头部,权重与输入嵌入层相关联
# 初始化后处理
self.post_init()
def get_output_embeddings(self):
return self.lm_head # 返回 lm_head 作为输出嵌入层
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings # 设置新的输出嵌入层
def get_input_embeddings(self):
return self.backbone.get_input_embeddings() # 获取输入嵌入层
def set_input_embeddings(self, new_embeddings):
return self.backbone.set_input_embeddings(new_embeddings) # 设置新的输入嵌入层
def _update_model_kwargs_for_generation(
self, outputs: ModelOutput, model_kwargs: Dict[str, Any], **kwargs
) -> Dict[str, Any]:
model_kwargs["cache_params"] = outputs.get("cache_params", None)
return model_kwargs
# 更新用于生成的模型参数,包括缓存参数
def prepare_inputs_for_generation(
self, input_ids, cache_params: Optional[MambaCache] = None, inputs_embeds=None, attention_mask=None, **kwargs
):
# 如果传递了状态,则只使用输入 IDs 的最后一个标记
if cache_params is not None:
input_ids = input_ids[:, -1].unsqueeze(-1)
if inputs_embeds is not None and cache_params is None:
model_inputs = {"inputs_embeds": inputs_embeds}
else:
model_inputs = {"input_ids": input_ids}
model_inputs["cache_params"] = cache_params
return model_inputs
# 为生成准备输入数据,支持输入 IDs 或嵌入张量,以及缓存参数
@add_start_docstrings_to_model_forward(MAMBA_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=MambaCausalLMOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
cache_params: Optional[MambaCache] = None,
labels: Optional[torch.LongTensor] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
use_cache: Optional[bool] = None,
**kwargs, # for now we need this for generation
):
# 此处是模型的前向传播方法,输入参数包括 input_ids、inputs_embeds 等等,用于生成或训练模型
) -> Union[Tuple, MambaCausalLMOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
`labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
"""
# 根据需要确定是否返回字典形式的输出
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# 调用模型的主体部分进行前向计算,获取模型输出
mamba_outputs = self.backbone(
input_ids,
cache_params=cache_params,
inputs_embeds=inputs_embeds,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
use_cache=use_cache,
)
# 获取模型主体输出中的隐藏状态
hidden_states = mamba_outputs[0]
# 使用语言模型头部计算逻辑回归结果
logits = self.lm_head(hidden_states.to(self.lm_head.weight.dtype)).float()
# 初始化损失为None
loss = None
if labels is not None:
# 将标签移到正确的设备上,以支持模型并行计算
labels = labels.to(logits.device)
# 将预测的logits向左移动一个位置,以对齐标签
shift_logits = logits[..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous()
# 使用交叉熵损失函数计算损失
loss_fct = CrossEntropyLoss()
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
# 如果不返回字典形式的输出,构造输出元组
if not return_dict:
output = (logits,) + mamba_outputs[1:]
return ((loss,) + output) if loss is not None else output
# 返回自定义的输出类MambaCausalLMOutput,包括损失、logits和其他额外的模型输出
return MambaCausalLMOutput(
loss=loss,
logits=logits,
cache_params=mamba_outputs.cache_params,
hidden_states=mamba_outputs.hidden_states,
)