Transformers Source Code Analysis (73)
.\models\mbart\modeling_tf_mbart.py
""" TF 2.0 MBart model."""
from __future__ import annotations
import random
from typing import Optional, Tuple, Union
import tensorflow as tf
from ...activations_tf import get_tf_activation
from ...modeling_tf_outputs import (
TFBaseModelOutput,
TFBaseModelOutputWithPastAndCrossAttentions,
TFSeq2SeqLMOutput,
TFSeq2SeqModelOutput,
)
from ...modeling_tf_utils import (
TFCausalLanguageModelingLoss,
TFModelInputType,
TFPreTrainedModel,
keras,
keras_serializable,
unpack_inputs,
)
from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax
from ...utils import (
add_code_sample_docstrings,
add_end_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_mbart import MBartConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "facebook/mbart-large-cc25"
_CONFIG_FOR_DOC = "MBartConfig"
LARGE_NEGATIVE = -1e8
def shift_tokens_right(input_ids: tf.Tensor, pad_token_id: int):
"""
Shift input ids one token to the right, and wrap the last non-pad token (the <LID> token). Note that, unlike other Bart-like models, MBart does not have a single `decoder_start_token_id`.
"""
if pad_token_id is None:
raise ValueError("self.model.config.pad_token_id has to be defined.")
input_ids = tf.where(
input_ids == -100, tf.fill(shape_list(input_ids), tf.cast(pad_token_id, input_ids.dtype)), input_ids
)
language_id_index = (
tf.reduce_sum(tf.cast(tf.math.not_equal(input_ids, pad_token_id), dtype=input_ids.dtype), axis=-1) - 1
)
language_id_index = tf.stack(
[tf.range(shape_list(input_ids)[0], dtype=input_ids.dtype), language_id_index], axis=-1
)
languages_ids = tf.gather_nd(input_ids, language_id_index)
shifted_input_ids = tf.concat([tf.expand_dims(languages_ids, axis=-1), input_ids[:, :-1]], axis=-1)
return shifted_input_ids
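To make the MBart-specific shift concrete, here is a small usage sketch (not part of the original file). The token IDs are made up: 250004 stands in for a language-id token such as en_XX, and 1 plays the role of the pad token; it assumes a TensorFlow-enabled transformers install.
```
import tensorflow as tf
from transformers.models.mbart.modeling_tf_mbart import shift_tokens_right

# One sequence whose last non-pad token is the language id (<LID>).
input_ids = tf.constant([[57, 821, 9, 250004, 1, 1]], dtype=tf.int32)

shifted = shift_tokens_right(input_ids, pad_token_id=1)
print(shifted.numpy())  # [[250004 57 821 9 250004 1]]
# The <LID> token is wrapped around to position 0 and everything else moves one step to the right.
```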
def _make_causal_mask(input_ids_shape: tf.TensorShape, past_key_values_length: int = 0):
"""
Make a causal (look-ahead) mask for decoder self-attention.
"""
bsz = input_ids_shape[0]
tgt_len = input_ids_shape[1]
mask = tf.ones((tgt_len, tgt_len)) * LARGE_NEGATIVE
mask_cond = tf.range(shape_list(mask)[-1])
mask = tf.where(mask_cond < tf.reshape(mask_cond + 1, (shape_list(mask)[-1], 1)), 0.0, mask)
if past_key_values_length > 0:
mask = tf.concat([tf.zeros((tgt_len, past_key_values_length)), mask], axis=-1)
return tf.tile(mask[None, None, :, :], (bsz, 1, 1, 1))
def _expand_mask(mask: tf.Tensor, tgt_len: Optional[int] = None):
"""
Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
"""
src_len = shape_list(mask)[1]
tgt_len = tgt_len if tgt_len is not None else src_len
one_cst = tf.constant(1.0)
mask = tf.cast(mask, dtype=one_cst.dtype)
expanded_mask = tf.tile(mask[:, None, None, :], (1, 1, tgt_len, 1))
return (one_cst - expanded_mask) * LARGE_NEGATIVE
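Both helpers return additive masks: 0.0 where attention is allowed and LARGE_NEGATIVE where it is blocked. A quick shape check, assuming the two private helpers are imported from this module:
```
import tensorflow as tf
from transformers.models.mbart.modeling_tf_mbart import _expand_mask, _make_causal_mask

causal = _make_causal_mask(tf.TensorShape([2, 4]))  # batch of 2, target length 4
print(causal.shape)                                 # (2, 1, 4, 4)

padding = tf.constant([[1, 1, 1, 0]])               # 1 = keep, 0 = pad
print(_expand_mask(padding).shape)                  # (1, 1, 4, 4)
```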
class TFMBartLearnedPositionalEmbedding(keras.layers.Embedding):
"""
This module learns positional embeddings up to a fixed maximum size.
"""
def __init__(self, num_embeddings: int, embedding_dim: int, **kwargs):
self.offset = 2
super().__init__(num_embeddings + self.offset, embedding_dim, **kwargs)
def call(
self,
input_shape: Optional[tf.TensorShape] = None,
past_key_values_length: int = 0,
position_ids: tf.Tensor | None = None,
):
"""Input is expected to be of size [bsz x seqlen]."""
if position_ids is None:
seq_len = input_shape[1]
position_ids = tf.range(seq_len, delta=1, name="range")
position_ids += past_key_values_length
offset_dtype = position_ids.dtype if isinstance(position_ids, tf.Tensor) else tf.int32
return super().call(position_ids + tf.constant(self.offset, dtype=offset_dtype))
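The `offset = 2` mirrors fairseq's learned positional embeddings, which reserve the first two rows of the table. A tiny sketch of the index arithmetic (values are illustrative, not taken from the module):
```
import tensorflow as tf

past_key_values_length = 3                                    # tokens already in the cache
seq_len = 2                                                   # new tokens in this step
position_ids = tf.range(seq_len) + past_key_values_length    # [3, 4]
rows_looked_up = position_ids + 2                             # [5, 6]: position i lives at row i + offset
```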
class TFMBartAttention(keras.layers.Layer):
"""Multi-headed attention from "Attention Is All You Need"""
def __init__(
self,
embed_dim: int,
num_heads: int,
dropout: float = 0.0,
is_decoder: bool = False,
bias: bool = True,
**kwargs,
):
super().__init__(**kwargs)
self.embed_dim = embed_dim
self.num_heads = num_heads
self.dropout = keras.layers.Dropout(dropout)
self.head_dim = embed_dim // num_heads
if (self.head_dim * num_heads) != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
f" and `num_heads`: {num_heads})."
)
self.scaling = self.head_dim**-0.5
self.is_decoder = is_decoder
self.k_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj")
self.q_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="q_proj")
self.v_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="v_proj")
self.out_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="out_proj")
def _shape(self, tensor: tf.Tensor, seq_len: int, bsz: int):
return tf.transpose(tf.reshape(tensor, (bsz, seq_len, self.num_heads, self.head_dim)), (0, 2, 1, 3))
def call(
self,
hidden_states: tf.Tensor,
key_value_states: tf.Tensor | None = None,
past_key_value: Tuple[Tuple[tf.Tensor]] | None = None,
attention_mask: tf.Tensor | None = None,
layer_head_mask: tf.Tensor | None = None,
training: Optional[bool] = False,
):
pass
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "k_proj", None) is not None:
with tf.name_scope(self.k_proj.name):
self.k_proj.build([None, None, self.embed_dim])
if getattr(self, "q_proj", None) is not None:
with tf.name_scope(self.q_proj.name):
self.q_proj.build([None, None, self.embed_dim])
if getattr(self, "v_proj", None) is not None:
with tf.name_scope(self.v_proj.name):
self.v_proj.build([None, None, self.embed_dim])
if getattr(self, "out_proj", None) is not None:
with tf.name_scope(self.out_proj.name):
self.out_proj.build([None, None, self.embed_dim])
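The body of `TFMBartAttention.call` is not reproduced above. As a rough orientation only, the core per-head computation is standard scaled dot-product attention; the sketch below is a simplification under that assumption (the real method also reshapes across heads, applies `layer_head_mask`, reuses `past_key_value`, and uses `stable_softmax`):
```
import tensorflow as tf

def sdpa_sketch(query, key, value, additive_mask=None):
    # query/key/value: (..., seq_len, head_dim); additive_mask uses LARGE_NEGATIVE at blocked positions
    head_dim = tf.cast(tf.shape(query)[-1], query.dtype)
    scores = tf.matmul(query, key, transpose_b=True) / tf.math.sqrt(head_dim)
    if additive_mask is not None:
        scores = scores + additive_mask
    weights = tf.nn.softmax(scores, axis=-1)
    return tf.matmul(weights, value), weights
```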
class TFMBartEncoderLayer(keras.layers.Layer):
def __init__(self, config: MBartConfig, **kwargs):
super().__init__(**kwargs)
self.embed_dim = config.d_model
self.self_attn = TFMBartAttention(
self.embed_dim, config.encoder_attention_heads, dropout=config.attention_dropout, name="self_attn"
)
self.self_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm")
self.dropout = keras.layers.Dropout(config.dropout)
self.activation_fn = get_tf_activation(config.activation_function)
self.activation_dropout = keras.layers.Dropout(config.activation_dropout)
self.fc1 = keras.layers.Dense(config.encoder_ffn_dim, name="fc1")
self.fc2 = keras.layers.Dense(self.embed_dim, name="fc2")
self.final_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")
self.config = config
def call(
self,
hidden_states: tf.Tensor,
attention_mask: tf.Tensor,
layer_head_mask: tf.Tensor,
training: Optional[bool] = False,
):
"""
Args:
hidden_states (`tf.Tensor`): input to the layer, of shape *(batch, seq_len, embed_dim)*
attention_mask (`tf.Tensor`): attention mask of size *(batch, 1, tgt_len, src_len)*,
where padding elements are indicated by very large negative values.
layer_head_mask (`tf.Tensor`): mask for the attention heads in a given layer, of size *(encoder_attention_heads,)*
"""
residual = hidden_states
hidden_states = self.self_attn_layer_norm(hidden_states)
hidden_states, self_attn_weights, _ = self.self_attn(
hidden_states=hidden_states, attention_mask=attention_mask, layer_head_mask=layer_head_mask
)
tf.debugging.assert_equal(
shape_list(hidden_states),
shape_list(residual),
message=f"Self attn modified the shape of query {shape_list(residual)} to {shape_list(hidden_states)}",
)
hidden_states = self.dropout(hidden_states, training=training)
hidden_states = residual + hidden_states
residual = hidden_states
hidden_states = self.final_layer_norm(hidden_states)
hidden_states = self.activation_fn(self.fc1(hidden_states))
hidden_states = self.activation_dropout(hidden_states, training=training)
hidden_states = self.fc2(hidden_states)
hidden_states = self.dropout(hidden_states, training=training)
hidden_states = residual + hidden_states
return hidden_states, self_attn_weights
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "self_attn", None) is not None:
with tf.name_scope(self.self_attn.name):
self.self_attn.build(None)
if getattr(self, "self_attn_layer_norm", None) is not None:
with tf.name_scope(self.self_attn_layer_norm.name):
self.self_attn_layer_norm.build([None, None, self.embed_dim])
if getattr(self, "fc1", None) is not None:
with tf.name_scope(self.fc1.name):
self.fc1.build([None, None, self.embed_dim])
if getattr(self, "fc2", None) is not None:
with tf.name_scope(self.fc2.name):
self.fc2.build([None, None, self.config.encoder_ffn_dim])
if getattr(self, "final_layer_norm", None) is not None:
with tf.name_scope(self.final_layer_norm.name):
self.final_layer_norm.build([None, None, self.embed_dim])
class TFMBartDecoderLayer(keras.layers.Layer):
def __init__(self, config: MBartConfig, **kwargs):
super().__init__(**kwargs)
self.embed_dim = config.d_model
self.self_attn = TFMBartAttention(
embed_dim=self.embed_dim,
num_heads=config.decoder_attention_heads,
dropout=config.attention_dropout,
name="self_attn",
is_decoder=True,
)
self.dropout = keras.layers.Dropout(config.dropout)
self.activation_fn = get_tf_activation(config.activation_function)
self.activation_dropout = keras.layers.Dropout(config.activation_dropout)
self.self_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm")
self.encoder_attn = TFMBartAttention(
self.embed_dim,
config.decoder_attention_heads,
dropout=config.attention_dropout,
name="encoder_attn",
is_decoder=True,
)
self.encoder_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="encoder_attn_layer_norm")
self.fc1 = keras.layers.Dense(config.decoder_ffn_dim, name="fc1")
self.fc2 = keras.layers.Dense(self.embed_dim, name="fc2")
self.final_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")
self.config = config
def call(
self,
hidden_states: tf.Tensor,
attention_mask: tf.Tensor | None = None,
encoder_hidden_states: tf.Tensor | None = None,
encoder_attention_mask: tf.Tensor | None = None,
layer_head_mask: tf.Tensor | None = None,
cross_attn_layer_head_mask: tf.Tensor | None = None,
past_key_value: Tuple[tf.Tensor] | None = None,
training: Optional[bool] = False,
):
pass
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "self_attn", None) is not None:
with tf.name_scope(self.self_attn.name):
self.self_attn.build(None)
if getattr(self, "self_attn_layer_norm", None) is not None:
with tf.name_scope(self.self_attn_layer_norm.name):
self.self_attn_layer_norm.build([None, None, self.embed_dim])
if getattr(self, "encoder_attn", None) is not None:
with tf.name_scope(self.encoder_attn.name):
self.encoder_attn.build(None)
if getattr(self, "encoder_attn_layer_norm", None) is not None:
with tf.name_scope(self.encoder_attn_layer_norm.name):
self.encoder_attn_layer_norm.build([None, None, self.embed_dim])
if getattr(self, "fc1", None) is not None:
with tf.name_scope(self.fc1.name):
self.fc1.build([None, None, self.embed_dim])
if getattr(self, "fc2", None) is not None:
with tf.name_scope(self.fc2.name):
self.fc2.build([None, None, self.config.decoder_ffn_dim])
if getattr(self, "final_layer_norm", None) is not None:
with tf.name_scope(self.final_layer_norm.name):
self.final_layer_norm.build([None, None, self.embed_dim])
class TFMBartPreTrainedModel(TFPreTrainedModel):
config_class = MBartConfig
base_model_prefix = "model"
MBART_START_DOCSTRING = r"""
This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
behavior.
<Tip>
TensorFlow models and layers in `transformers` accept two formats as input:
- having all inputs as keyword arguments (like PyTorch models), or
- having all inputs as a list, tuple or dict in the first positional argument.
The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
positional argument:
- a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
- a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
`model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
- a dictionary with one or several input Tensors associated to the input names given in the docstring:
`model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
Note that when creating models and layers with
[subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
about any of this, as you can just pass inputs like you would to any other Python function!
</Tip>
Args:
config ([`MBartConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights.
"""
MBART_INPUTS_DOCSTRING = r"""
"""
MBART_GENERATION_EXAMPLE = r"""
Translation example:
```
>>> from transformers import AutoTokenizer, TFMBartForConditionalGeneration
>>> model = TFMBartForConditionalGeneration.from_pretrained("facebook/mbart-large-en-ro")
>>> tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-en-ro")
>>> example_english_phrase = "42 is the answer"
>>> inputs = tokenizer(example_english_phrase, return_tensors="tf")  # encode the English phrase into model-ready tensors

>>> # Translate: beam search with 4 beams, capped at 5 generated tokens
>>> generated_ids = model.generate(**inputs, num_beams=4, max_length=5)

>>> # Decode the generated ids, skipping special tokens, and take the first result
>>> tokenizer.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
'42 este răspuns'
```
Mask filling example:
```
>>> from transformers import AutoTokenizer, TFMBartForConditionalGeneration
>>> import tensorflow as tf
>>> # Load the pretrained MBart conditional generation model and its tokenizer
>>> model = TFMBartForConditionalGeneration.from_pretrained("facebook/mbart-large-cc25")
>>> tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-cc25")

>>> # de_DE is the language symbol id <LID> for German
>>> TXT = "</s> Meine Freunde sind <mask> nett aber sie essen zu viel Kuchen. </s> de_DE"

>>> # Encode the masked German sentence without adding special tokens, then get the logits
>>> input_ids = tokenizer([TXT], add_special_tokens=False, return_tensors="tf")["input_ids"]
>>> logits = model(input_ids).logits

>>> # Locate the <mask> position and softmax the vocabulary logits at that position
>>> masked_index = tf.where(input_ids[0] == tokenizer.mask_token_id)[0, 0]
>>> probs = tf.nn.softmax(logits[0, masked_index], axis=0)

>>> # Take the five most likely tokens and decode them
>>> values, predictions = tf.math.top_k(probs, 5)
>>> tokenizer.decode(predictions).split()
['nett', 'sehr', 'ganz', 'nicht', 'so']
```
"""
@keras_serializable
class TFMBartEncoder(keras.layers.Layer):
config_class = MBartConfig
"""
Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
[`TFMBartEncoderLayer`].
Args:
config: MBartConfig
"""
def __init__(self, config: MBartConfig, embed_tokens: Optional[keras.layers.Embedding] = None, **kwargs):
super().__init__(**kwargs)
self.config = config
self.dropout = keras.layers.Dropout(config.dropout)
self.layerdrop = config.encoder_layerdrop
self.padding_idx = config.pad_token_id
self.max_source_positions = config.max_position_embeddings
self.embed_scale = tf.math.sqrt(float(config.d_model)) if config.scale_embedding else 1.0
self.embed_tokens = embed_tokens
self.embed_positions = TFMBartLearnedPositionalEmbedding(
config.max_position_embeddings,
config.d_model,
name="embed_positions",
)
self.layers = [TFMBartEncoderLayer(config, name=f"layers.{i}") for i in range(config.encoder_layers)]
self.layernorm_embedding = keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding")
self.layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="layer_norm")
self.embed_dim = config.d_model
def get_embed_tokens(self):
return self.embed_tokens
def set_embed_tokens(self, embed_tokens):
self.embed_tokens = embed_tokens
@unpack_inputs
def call(
self,
input_ids: TFModelInputType | None = None,
inputs_embeds: tf.Tensor | None = None,
attention_mask: tf.Tensor | None = None,
head_mask: tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: Optional[bool] = False,
):
pass
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embed_positions", None) is not None:
with tf.name_scope(self.embed_positions.name):
self.embed_positions.build(None)
if getattr(self, "layernorm_embedding", None) is not None:
with tf.name_scope(self.layernorm_embedding.name):
self.layernorm_embedding.build([None, None, self.embed_dim])
if getattr(self, "layer_norm", None) is not None:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build([None, None, self.config.d_model])
if getattr(self, "layers", None) is not None:
for layer in self.layers:
with tf.name_scope(layer.name):
layer.build(None)
@keras_serializable
class TFMBartDecoder(keras.layers.Layer):
config_class = MBartConfig
"""
Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`TFMBartDecoderLayer`]
Args:
config: MBartConfig
The configuration object of the MBart model, containing all of its settings and hyperparameters.
embed_tokens: output embedding
An optional embedding layer used to map input tokens to vector representations.
"""
def __init__(self, config: MBartConfig, embed_tokens: Optional[keras.layers.Embedding] = None, **kwargs):
super().__init__(**kwargs)
self.config = config
self.padding_idx = config.pad_token_id
self.embed_tokens = embed_tokens
self.layerdrop = config.decoder_layerdrop
self.embed_positions = TFMBartLearnedPositionalEmbedding(
config.max_position_embeddings,
config.d_model,
name="embed_positions",
)
self.embed_scale = tf.math.sqrt(float(config.d_model)) if config.scale_embedding else 1.0
self.layers = [TFMBartDecoderLayer(config, name=f"layers.{i}") for i in range(config.decoder_layers)]
self.layernorm_embedding = keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding")
self.layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="layer_norm")
self.dropout = keras.layers.Dropout(config.dropout)
def get_embed_tokens(self):
return self.embed_tokens
def set_embed_tokens(self, embed_tokens):
self.embed_tokens = embed_tokens
@unpack_inputs
def call(
self,
input_ids: TFModelInputType = None,
inputs_embeds: tf.Tensor | None = None,
attention_mask: tf.Tensor | None = None,
position_ids: tf.Tensor | None = None,
encoder_hidden_states: tf.Tensor | None = None,
encoder_attention_mask: tf.Tensor | None = None,
head_mask: tf.Tensor | None = None,
cross_attn_head_mask: tf.Tensor | None = None,
past_key_values: Tuple[Tuple[tf.Tensor]] | None = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: Optional[bool] = False,
) -> Union[
TFBaseModelOutputWithPastAndCrossAttentions, Tuple[tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor]
]:
pass
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embed_positions", None) is not None:
with tf.name_scope(self.embed_positions.name):
self.embed_positions.build(None)
if getattr(self, "layernorm_embedding", None) is not None:
with tf.name_scope(self.layernorm_embedding.name):
self.layernorm_embedding.build([None, None, self.config.d_model])
if getattr(self, "layer_norm", None) is not None:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build([None, None, self.config.d_model])
if getattr(self, "layers", None) is not None:
for layer in self.layers:
with tf.name_scope(layer.name):
layer.build(None)
@keras_serializable
class TFMBartMainLayer(keras.layers.Layer):
config_class = MBartConfig
def __init__(self, config: MBartConfig, **kwargs):
super().__init__(**kwargs)
self.config = config
self.shared = keras.layers.Embedding(
input_dim=config.vocab_size,
output_dim=config.d_model,
embeddings_initializer=keras.initializers.TruncatedNormal(stddev=self.config.init_std),
name="model.shared",
)
self.shared.load_weight_prefix = "model.shared"
self.encoder = TFMBartEncoder(config, self.shared, name="encoder")
self.decoder = TFMBartDecoder(config, self.shared, name="decoder")
def get_input_embeddings(self):
return self.shared
def set_input_embeddings(self, new_embeddings):
self.shared = new_embeddings
self.encoder.embed_tokens = self.shared
self.decoder.embed_tokens = self.shared
@unpack_inputs
def call(
self,
input_ids: TFModelInputType = None,
attention_mask: tf.Tensor | None = None,
decoder_input_ids: tf.Tensor | None = None,
decoder_attention_mask: tf.Tensor | None = None,
decoder_position_ids: tf.Tensor | None = None,
head_mask: tf.Tensor | None = None,
decoder_head_mask: tf.Tensor | None = None,
cross_attn_head_mask: tf.Tensor | None = None,
encoder_outputs: Optional[Union[Tuple, TFBaseModelOutput]] = None,
past_key_values: Tuple[Tuple[tf.Tensor]] | None = None,
inputs_embeds: tf.Tensor | None = None,
decoder_inputs_embeds: tf.Tensor | None = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: Optional[bool] = False,
**kwargs,
) -> Union[TFSeq2SeqModelOutput, tf.Tensor]:
if decoder_input_ids is None and decoder_inputs_embeds is None:
use_cache = False
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
if decoder_input_ids is None and input_ids is not None:
decoder_input_ids = shift_tokens_right(input_ids, self.config.pad_token_id)
if encoder_outputs is None:
encoder_outputs = self.encoder(
input_ids=input_ids,
attention_mask=attention_mask,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
elif return_dict and not isinstance(encoder_outputs, TFBaseModelOutput):
encoder_outputs = TFBaseModelOutput(
last_hidden_state=encoder_outputs[0],
hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
)
elif not return_dict and not isinstance(encoder_outputs, tuple):
encoder_outputs = encoder_outputs.to_tuple()
decoder_outputs = self.decoder(
decoder_input_ids,
attention_mask=decoder_attention_mask,
position_ids=decoder_position_ids,
encoder_hidden_states=encoder_outputs[0],
encoder_attention_mask=attention_mask,
head_mask=decoder_head_mask,
cross_attn_head_mask=cross_attn_head_mask,
past_key_values=past_key_values,
inputs_embeds=decoder_inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
if not return_dict:
return decoder_outputs + encoder_outputs
return TFSeq2SeqModelOutput(
last_hidden_state=decoder_outputs.last_hidden_state,
past_key_values=decoder_outputs.past_key_values,
decoder_hidden_states=decoder_outputs.hidden_states,
decoder_attentions=decoder_outputs.attentions,
cross_attentions=decoder_outputs.cross_attentions,
encoder_last_hidden_state=encoder_outputs.last_hidden_state,
encoder_hidden_states=encoder_outputs.hidden_states,
encoder_attentions=encoder_outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
with tf.name_scope(self.shared.load_weight_prefix + "/" + self.shared.name + "/"):
self.shared.build(None)
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
if getattr(self, "decoder", None) is not None:
with tf.name_scope(self.decoder.name):
self.decoder.build(None)
@add_start_docstrings(
"The bare MBART Model outputting raw hidden-states without any specific head on top.",
MBART_START_DOCSTRING,
)
class TFMBartModel(TFMBartPreTrainedModel):
def __init__(self, config: MBartConfig, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.model = TFMBartMainLayer(config, name="model")
def get_encoder(self):
return self.model.encoder
def get_decoder(self):
return self.model.decoder
@unpack_inputs
@add_start_docstrings_to_model_forward(MBART_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFSeq2SeqModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: TFModelInputType = None,
attention_mask: tf.Tensor | None = None,
decoder_input_ids: tf.Tensor | None = None,
decoder_attention_mask: tf.Tensor | None = None,
decoder_position_ids: tf.Tensor | None = None,
head_mask: tf.Tensor | None = None,
decoder_head_mask: tf.Tensor | None = None,
cross_attn_head_mask: tf.Tensor | None = None,
encoder_outputs: Optional[Union[Tuple, TFBaseModelOutput]] = None,
past_key_values: Tuple[Tuple[tf.Tensor]] | None = None,
inputs_embeds: tf.Tensor | None = None,
decoder_inputs_embeds: tf.Tensor | None = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: Optional[bool] = False,
**kwargs,
) -> Union[TFSeq2SeqModelOutput, Tuple[tf.Tensor]]:
outputs = self.model(
input_ids=input_ids,
attention_mask=attention_mask,
decoder_input_ids=decoder_input_ids,
decoder_attention_mask=decoder_attention_mask,
decoder_position_ids=decoder_position_ids,
head_mask=head_mask,
decoder_head_mask=decoder_head_mask,
cross_attn_head_mask=cross_attn_head_mask,
encoder_outputs=encoder_outputs,
past_key_values=past_key_values,
inputs_embeds=inputs_embeds,
decoder_inputs_embeds=decoder_inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
return outputs
def serving_output(self, output):
pkv = tf.tuple(output.past_key_values)[1] if self.config.use_cache else None
dec_hs = tf.convert_to_tensor(output.decoder_hidden_states) if self.config.output_hidden_states else None
dec_attns = tf.convert_to_tensor(output.decoder_attentions) if self.config.output_attentions else None
cross_attns = tf.convert_to_tensor(output.cross_attentions) if self.config.output_attentions else None
enc_hs = tf.convert_to_tensor(output.encoder_hidden_states) if self.config.output_hidden_states else None
enc_attns = tf.convert_to_tensor(output.encoder_attentions) if self.config.output_attentions else None
return TFSeq2SeqModelOutput(
last_hidden_state=output.last_hidden_state,
past_key_values=pkv,
decoder_hidden_states=dec_hs,
decoder_attentions=dec_attns,
cross_attentions=cross_attns,
encoder_last_hidden_state=output.encoder_last_hidden_state,
encoder_hidden_states=enc_hs,
encoder_attentions=enc_attns,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "model", None) is not None:
with tf.name_scope(self.model.name):
self.model.build(None)
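A minimal usage sketch for the bare model, matching the checkpoint used in the docstrings above (downloads the weights on first run and assumes TensorFlow is installed):
```
from transformers import AutoTokenizer, TFMBartModel

tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-cc25")
model = TFMBartModel.from_pretrained("facebook/mbart-large-cc25")

inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
outputs = model(**inputs)
print(outputs.last_hidden_state.shape)  # (1, sequence_length, 1024) for the large checkpoint
```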
class BiasLayer(keras.layers.Layer):
"""
Bias as a layer. It is used for serialization purposes: `keras.Model.save_weights` stores on a per-layer basis,
so all weights have to be registered in a layer.
"""
def __init__(self, shape, initializer, trainable, name, **kwargs):
super().__init__(name=name, **kwargs)
self.bias = self.add_weight(name=name, shape=shape, initializer=initializer, trainable=trainable)
def call(self, x):
return x + self.bias
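A quick sketch of what `BiasLayer` does: it registers a single weight (so `save_weights` can serialize it) and simply adds it to its input. The shapes here are illustrative:
```
import tensorflow as tf
from transformers.models.mbart.modeling_tf_mbart import BiasLayer

bias = BiasLayer(name="final_logits_bias", shape=[1, 4], initializer="zeros", trainable=False)
logits = tf.zeros((2, 3, 4))   # (batch, seq_len, vocab_size=4)
print(bias(logits).shape)      # (2, 3, 4): the bias broadcasts over batch and sequence
```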
@add_start_docstrings(
"The MBART Model with a language modeling head. Can be used for summarization, after fine-tuning the pretrained models.",
MBART_START_DOCSTRING,
)
class TFMBartForConditionalGeneration(TFMBartPreTrainedModel, TFCausalLanguageModelingLoss):
_keys_to_ignore_on_load_unexpected = [
r"model.encoder.embed_tokens.weight",
r"model.decoder.embed_tokens.weight",
]
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.model = TFMBartMainLayer(config, name="model")
self.use_cache = config.use_cache
self.bias_layer = BiasLayer(
name="final_logits_bias", shape=[1, config.vocab_size], initializer="zeros", trainable=False
)
def get_decoder(self):
return self.model.decoder
def get_encoder(self):
return self.model.encoder
def get_output_embeddings(self):
return self.get_input_embeddings()
def set_output_embeddings(self, value):
self.set_input_embeddings(value)
def get_bias(self):
return {"final_logits_bias": self.bias_layer.bias}
def set_bias(self, value):
vocab_size = value["final_logits_bias"].shape[-1]
self.bias_layer = BiasLayer(
name="final_logits_bias", shape=[1, vocab_size], initializer="zeros", trainable=False
)
self.bias_layer.bias.assign(value["final_logits_bias"])
@unpack_inputs
@add_start_docstrings_to_model_forward(MBART_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=TFSeq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
@add_end_docstrings(MBART_GENERATION_EXAMPLE)
def call(
self,
input_ids: TFModelInputType = None,
attention_mask: tf.Tensor | None = None,
decoder_input_ids: tf.Tensor | None = None,
decoder_attention_mask: tf.Tensor | None = None,
decoder_position_ids: tf.Tensor | None = None,
head_mask: tf.Tensor | None = None,
decoder_head_mask: tf.Tensor | None = None,
cross_attn_head_mask: tf.Tensor | None = None,
encoder_outputs: Optional[TFBaseModelOutput] = None,
past_key_values: Tuple[Tuple[tf.Tensor]] = None,
inputs_embeds: tf.Tensor | None = None,
decoder_inputs_embeds: tf.Tensor | None = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: tf.Tensor | None = None,
training: Optional[bool] = False,
) -> Union[TFSeq2SeqLMOutput, Tuple[tf.Tensor]]:
"""
labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Returns:
Returns either a TFSeq2SeqLMOutput object or a tuple of tf.Tensor depending on `return_dict`.
"""
if labels is not None:
labels = tf.where(
labels == self.config.pad_token_id,
tf.cast(tf.fill(shape_list(labels), -100), labels.dtype),
labels,
)
use_cache = False
if decoder_input_ids is None and decoder_inputs_embeds is None:
decoder_input_ids = shift_tokens_right(labels, self.config.pad_token_id)
outputs = self.model(
input_ids,
attention_mask=attention_mask,
decoder_input_ids=decoder_input_ids,
encoder_outputs=encoder_outputs,
decoder_attention_mask=decoder_attention_mask,
decoder_position_ids=decoder_position_ids,
head_mask=head_mask,
decoder_head_mask=decoder_head_mask,
cross_attn_head_mask=cross_attn_head_mask,
past_key_values=past_key_values,
inputs_embeds=inputs_embeds,
decoder_inputs_embeds=decoder_inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
lm_logits = tf.matmul(outputs[0], self.model.shared.weights, transpose_b=True)
lm_logits = self.bias_layer(lm_logits)
masked_lm_loss = None if labels is None else self.hf_compute_loss(labels, lm_logits)
if not return_dict:
output = (lm_logits,) + outputs[1:]
return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
else:
return TFSeq2SeqLMOutput(
loss=masked_lm_loss,
logits=lm_logits,
past_key_values=outputs.past_key_values,
decoder_hidden_states=outputs.decoder_hidden_states,
decoder_attentions=outputs.decoder_attentions,
cross_attentions=outputs.cross_attentions,
encoder_last_hidden_state=outputs.encoder_last_hidden_state,
encoder_hidden_states=outputs.encoder_hidden_states,
encoder_attentions=outputs.encoder_attentions,
)
def serving_output(self, output):
pkv = tf.tuple(output.past_key_values)[1] if self.config.use_cache else None
dec_hs = tf.convert_to_tensor(output.decoder_hidden_states) if self.config.output_hidden_states else None
dec_attns = tf.convert_to_tensor(output.decoder_attentions) if self.config.output_attentions else None
cross_attns = tf.convert_to_tensor(output.cross_attentions) if self.config.output_attentions else None
enc_hs = tf.convert_to_tensor(output.encoder_hidden_states) if self.config.output_hidden_states else None
enc_attns = tf.convert_to_tensor(output.encoder_attentions) if self.config.output_attentions else None
return TFSeq2SeqLMOutput(
logits=output.logits,
past_key_values=pkv,
decoder_hidden_states=dec_hs,
decoder_attentions=dec_attns,
cross_attentions=cross_attns,
encoder_last_hidden_state=output.encoder_last_hidden_state,
encoder_hidden_states=enc_hs,
encoder_attentions=enc_attns,
)
def prepare_inputs_for_generation(
self,
decoder_input_ids,
past_key_values=None,
attention_mask=None,
decoder_attention_mask=None,
head_mask=None,
decoder_head_mask=None,
cross_attn_head_mask=None,
use_cache=None,
encoder_outputs=None,
**kwargs,
):
if past_key_values is not None:
decoder_input_ids = decoder_input_ids[:, -1:]
if decoder_attention_mask is not None:
decoder_position_ids = tf.math.cumsum(decoder_attention_mask, axis=-1, exclusive=True)[:, -1:]
elif past_key_values is not None:
decoder_position_ids = past_key_values[0][0].shape[2]
else:
decoder_position_ids = tf.range(decoder_input_ids.shape[1])
return {
"input_ids": None,
"encoder_outputs": encoder_outputs,
"past_key_values": past_key_values,
"decoder_input_ids": decoder_input_ids,
"attention_mask": attention_mask,
"decoder_attention_mask": decoder_attention_mask,
"decoder_position_ids": decoder_position_ids,
"head_mask": head_mask,
"decoder_head_mask": decoder_head_mask,
"cross_attn_head_mask": cross_attn_head_mask,
"use_cache": use_cache,
}
def prepare_decoder_input_ids_from_labels(self, labels: tf.Tensor):
return shift_tokens_right(labels, self.config.pad_token_id)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "model", None) is not None:
with tf.name_scope(self.model.name):
self.model.build(None)
if getattr(self, "bias_layer", None) is not None:
with tf.name_scope(self.bias_layer.name):
self.bias_layer.build(None)
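To summarize the training-time label handling in `call` above: pad positions in `labels` are rewritten to -100 so the loss ignores them, and when no decoder inputs are given they are derived from the labels with the MBart-style shift. A hedged sketch with made-up IDs (1 = pad, 250004 = a language id):
```
import tensorflow as tf
from transformers.models.mbart.modeling_tf_mbart import shift_tokens_right

labels = tf.constant([[57, 821, 9, 250004, 1]], dtype=tf.int32)
masked_labels = tf.where(labels == 1, tf.fill(tf.shape(labels), -100), labels)
decoder_input_ids = shift_tokens_right(labels, pad_token_id=1)

print(masked_labels.numpy())      # [[57 821 9 250004 -100]]
print(decoder_input_ids.numpy())  # [[250004 57 821 9 250004]]
```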
.\models\mbart\tokenization_mbart.py
import os
from shutil import copyfile
from typing import Any, Dict, List, Optional, Tuple
import sentencepiece as spm
from ...tokenization_utils import AddedToken, BatchEncoding, PreTrainedTokenizer
from ...utils import logging
logger = logging.get_logger(__name__)
SPIECE_UNDERLINE = "▁"
VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"facebook/mbart-large-en-ro": (
"https://huggingface.co/facebook/mbart-large-en-ro/resolve/main/sentencepiece.bpe.model"
),
"facebook/mbart-large-cc25": (
"https://huggingface.co/facebook/mbart-large-cc25/resolve/main/sentencepiece.bpe.model"
),
}
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"facebook/mbart-large-en-ro": 1024,
"facebook/mbart-large-cc25": 1024,
}
FAIRSEQ_LANGUAGE_CODES = [
"ar_AR", "cs_CZ", "de_DE", "en_XX", "es_XX", "et_EE", "fi_FI", "fr_XX", "gu_IN", "hi_IN",
"it_IT", "ja_XX", "kk_KZ", "ko_KR", "lt_LT", "lv_LV", "my_MM", "ne_NP", "nl_XX", "ro_RO",
"ru_RU", "si_LK", "tr_TR", "vi_VN", "zh_CN"
]
class MBartTokenizer(PreTrainedTokenizer):
"""
Construct an MBART tokenizer.
Adapted from [`RobertaTokenizer`] and [`XLNetTokenizer`]. Based on
[SentencePiece](https://github.com/google/sentencepiece).
The tokenization method is `<tokens> <eos> <language code>` for source language documents, and `<language code>
<tokens> <eos>` for target language documents.
Examples:
```
>>> from transformers import MBartTokenizer
>>> tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-en-ro", src_lang="en_XX", tgt_lang="ro_RO")
>>> example_english_phrase = " UN Chief Says There Is No Military Solution in Syria"
>>> expected_translation_romanian = "Şeful ONU declară că nu există o soluţie militară în Siria"
>>> inputs = tokenizer(example_english_phrase, text_target=expected_translation_romanian, return_tensors="pt")
```
"""
vocab_files_names = VOCAB_FILES_NAMES
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
model_input_names = ["input_ids", "attention_mask"]
prefix_tokens: List[int] = []
suffix_tokens: List[int] = []
def __init__(
self,
vocab_file,
bos_token="<s>",
eos_token="</s>",
sep_token="</s>",
cls_token="<s>",
unk_token="<unk>",
pad_token="<pad>",
mask_token="<mask>",
tokenizer_file=None,
src_lang=None,
tgt_lang=None,
sp_model_kwargs: Optional[Dict[str, Any]] = None,
additional_special_tokens=None,
**kwargs,
):
mask_token = (
AddedToken(mask_token, lstrip=True, normalized=False) if isinstance(mask_token, str) else mask_token
)
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(str(vocab_file))
self.vocab_file = vocab_file
self.fairseq_tokens_to_ids = {"<s>": 0, "<pad>": 1, "</s>": 2, "<unk>": 3}
self.fairseq_offset = 1
self.sp_model_size = len(self.sp_model)
self.lang_code_to_id = {
code: self.sp_model_size + i + self.fairseq_offset for i, code in enumerate(FAIRSEQ_LANGUAGE_CODES)
}
self.id_to_lang_code = {v: k for k, v in self.lang_code_to_id.items()}
self.fairseq_tokens_to_ids["<mask>"] = len(self.sp_model) + len(self.lang_code_to_id) + self.fairseq_offset
self.fairseq_tokens_to_ids.update(self.lang_code_to_id)
self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}
_additional_special_tokens = list(self.lang_code_to_id.keys())
if additional_special_tokens is not None:
_additional_special_tokens.extend(
[t for t in additional_special_tokens if t not in _additional_special_tokens]
)
super().__init__(
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
sep_token=sep_token,
cls_token=cls_token,
pad_token=pad_token,
mask_token=mask_token,
tokenizer_file=None,
src_lang=src_lang,
tgt_lang=tgt_lang,
additional_special_tokens=_additional_special_tokens,
sp_model_kwargs=self.sp_model_kwargs,
**kwargs,
)
self._src_lang = src_lang if src_lang is not None else "en_XX"
self.cur_lang_code_id = self.lang_code_to_id[self._src_lang]
self.set_src_lang_special_tokens(self._src_lang)
def __getstate__(self):
state = self.__dict__.copy()
state["sp_model"] = None
state["sp_model_proto"] = self.sp_model.serialized_model_proto()
return state
def __setstate__(self, d):
self.__dict__ = d
if not hasattr(self, "sp_model_kwargs"):
self.sp_model_kwargs = {}
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.LoadFromSerializedProto(self.sp_model_proto)
@property
def vocab_size(self):
return len(self.sp_model) + len(self.lang_code_to_id) + self.fairseq_offset + 1
@property
def src_lang(self) -> str:
return self._src_lang
@src_lang.setter
def src_lang(self, new_src_lang: str) -> None:
self._src_lang = new_src_lang
self.set_src_lang_special_tokens(self._src_lang)
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer's `prepare_for_model` method.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
return super().get_special_tokens_mask(
token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
)
prefix_ones = [1] * len(self.prefix_tokens)
suffix_ones = [1] * len(self.suffix_tokens)
if token_ids_1 is None:
return prefix_ones + ([0] * len(token_ids_0)) + suffix_ones
return prefix_ones + ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
and adding special tokens. An MBART sequence has the following format, where `X` represents the sequence:
- `input_ids` (for the encoder): `X [eos, src_lang_code]`
- `decoder_input_ids` (for the decoder): `X [eos, tgt_lang_code]`
BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a separator.
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of input IDs with the appropriate special tokens.
"""
if token_ids_1 is None:
return self.prefix_tokens + token_ids_0 + self.suffix_tokens
return self.prefix_tokens + token_ids_0 + token_ids_1 + self.suffix_tokens
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed, to be used in a sequence-pair classification task. MBART does
not make use of token type ids, therefore a list of zeros is returned.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of zeros.
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
def _build_translation_inputs(
self, raw_inputs, return_tensors: str, src_lang: Optional[str], tgt_lang: Optional[str], **extra_kwargs
):
"""用于翻译管道,准备用于 generate 函数的输入"""
if src_lang is None or tgt_lang is None:
raise ValueError("Translation requires a `src_lang` and a `tgt_lang` for this model")
self.src_lang = src_lang
inputs = self(raw_inputs, add_special_tokens=True, return_tensors=return_tensors, **extra_kwargs)
tgt_lang_id = self.convert_tokens_to_ids(tgt_lang)
inputs["forced_bos_token_id"] = tgt_lang_id
return inputs
def get_vocab(self):
"""
Return the vocabulary, mapping tokens to their corresponding IDs.
Returns:
dict: A dictionary containing all tokens and their IDs.
"""
vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
vocab.update(self.added_tokens_encoder)
return vocab
def _tokenize(self, text: str) -> List[str]:
"""
Tokenize the text with the SentencePiece subword model.
Args:
text (str): The text to tokenize.
Returns:
List[str]: The list of subword strings.
"""
return self.sp_model.encode(text, out_type=str)
def _convert_token_to_id(self, token):
"""Converts a token (str) into an id using the vocabulary.
Args:
token (str): The token to convert.
Returns:
int: The corresponding id from the fairseq_tokens_to_ids dictionary
if present, otherwise uses the SentencePiece model to fetch
the id. Returns unk_token_id if the SentencePiece model returns 0.
"""
if token in self.fairseq_tokens_to_ids:
return self.fairseq_tokens_to_ids[token]
spm_id = self.sp_model.PieceToId(token)
return spm_id + self.fairseq_offset if spm_id else self.unk_token_id
def _convert_id_to_token(self, index):
"""Converts an index (integer) into a token (str) using the vocabulary.
Args:
index (int): The index to convert into a token.
Returns:
str: The corresponding token from the fairseq_ids_to_tokens dictionary
if present, otherwise uses the SentencePiece model to fetch
the token.
"""
if index in self.fairseq_ids_to_tokens:
return self.fairseq_ids_to_tokens[index]
return self.sp_model.IdToPiece(index - self.fairseq_offset)
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (strings for sub-words) into a single string.
Args:
tokens (List[str]): List of tokens to concatenate.
Returns:
str: The concatenated string formed from tokens, with SPIECE_UNDERLINE
replaced by a space and leading/trailing whitespace removed.
"""
out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
return out_string
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
"""Saves the vocabulary to a directory.
Args:
save_directory (str): The directory path where the vocabulary will be saved.
filename_prefix (Optional[str]): Optional prefix for the vocabulary file name.
Returns:
Tuple[str]: A tuple containing the path of the saved vocabulary file.
"""
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
out_vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
copyfile(self.vocab_file, out_vocab_file)
elif not os.path.isfile(self.vocab_file):
with open(out_vocab_file, "wb") as fi:
content_spiece_model = self.sp_model.serialized_model_proto()
fi.write(content_spiece_model)
return (out_vocab_file,)
def prepare_seq2seq_batch(
self,
src_texts: List[str],
src_lang: str = "en_XX",
tgt_texts: Optional[List[str]] = None,
tgt_lang: str = "ro_RO",
**kwargs,
) -> BatchEncoding:
"""Prepares a batch for sequence-to-sequence model.
Args:
src_texts (List[str]): List of source texts.
src_lang (str, optional): Source language code. Defaults to "en_XX".
tgt_texts (Optional[List[str]], optional): List of target texts. Defaults to None.
tgt_lang (str, optional): Target language code. Defaults to "ro_RO".
**kwargs: Additional keyword arguments passed to the superclass method.
Returns:
BatchEncoding: The prepared batch containing encoded inputs for the model.
"""
self.src_lang = src_lang
self.tgt_lang = tgt_lang
return super().prepare_seq2seq_batch(src_texts, tgt_texts, **kwargs)
def _switch_to_input_mode(self):
"""Switches the model to input mode by setting source language special tokens."""
return self.set_src_lang_special_tokens(self.src_lang)
def _switch_to_target_mode(self):
"""Switches the model to target mode by setting target language special tokens."""
return self.set_tgt_lang_special_tokens(self.tgt_lang)
def set_src_lang_special_tokens(self, src_lang) -> None:
"""Resets special tokens to match the source language settings.
Args:
src_lang (str): Source language code.
Returns:
None
"""
self.cur_lang_code = self.lang_code_to_id[src_lang]
self.prefix_tokens = []
self.suffix_tokens = [self.eos_token_id, self.cur_lang_code]
def set_tgt_lang_special_tokens(self, lang: str) -> None:
"""设置目标语言的特殊标记。无前缀,后缀为[eos, tgt_lang_code]。"""
self.cur_lang_code = self.lang_code_to_id[lang]
self.prefix_tokens = []
self.suffix_tokens = [self.eos_token_id, self.cur_lang_code]
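A short sketch of the fairseq ID bookkeeping defined in `__init__` above (requires `sentencepiece` and downloads the checkpoint): SentencePiece pieces are shifted by `fairseq_offset`, the 25 language codes sit directly after the SentencePiece vocabulary, and the suffix tokens reflect the current source language.
```
from transformers import MBartTokenizer

tok = MBartTokenizer.from_pretrained("facebook/mbart-large-cc25", src_lang="en_XX")
print(tok.fairseq_offset)                                         # 1
print(tok.convert_ids_to_tokens([tok.lang_code_to_id["ro_RO"]]))  # ['ro_RO']
print(tok.suffix_tokens == [tok.eos_token_id, tok.lang_code_to_id["en_XX"]])  # True
```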
.\models\mbart\tokenization_mbart_fast.py
import os
from shutil import copyfile
from typing import List, Optional, Tuple
from tokenizers import processors
from ...tokenization_utils import AddedToken, BatchEncoding
from ...tokenization_utils_fast import PreTrainedTokenizerFast
from ...utils import is_sentencepiece_available, logging
if is_sentencepiece_available():
from .tokenization_mbart import MBartTokenizer
else:
MBartTokenizer = None
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model", "tokenizer_file": "tokenizer.json"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"facebook/mbart-large-en-ro": (
"https://huggingface.co/facebook/mbart-large-en-ro/resolve/main/sentencepiece.bpe.model"
),
"facebook/mbart-large-cc25": (
"https://huggingface.co/facebook/mbart-large-cc25/resolve/main/sentencepiece.bpe.model"
),
},
"tokenizer_file": {
"facebook/mbart-large-en-ro": "https://huggingface.co/facebook/mbart-large-en-ro/resolve/main/tokenizer.json",
"facebook/mbart-large-cc25": "https://huggingface.co/facebook/mbart-large-cc25/resolve/main/tokenizer.json",
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"facebook/mbart-large-en-ro": 1024,
"facebook/mbart-large-cc25": 1024,
}
FAIRSEQ_LANGUAGE_CODES = ["ar_AR", "cs_CZ", "de_DE", "en_XX", "es_XX", "et_EE", "fi_FI", "fr_XX", "gu_IN", "hi_IN", "it_IT", "ja_XX", "kk_KZ", "ko_KR", "lt_LT", "lv_LV", "my_MM", "ne_NP", "nl_XX", "ro_RO", "ru_RU", "si_LK", "tr_TR", "vi_VN", "zh_CN"]
class MBartTokenizerFast(PreTrainedTokenizerFast):
"""
Construct a "fast" MBART tokenizer (backed by HuggingFace's *tokenizers* library). Based on
[BPE](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=BPE#models).
This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
refer to this superclass for more information regarding those methods.
The tokenization method is `<tokens> <eos> <language code>` for source language documents, and `<language code>
<tokens> <eos>` for target language documents.
Examples:
```
>>> from transformers import MBartTokenizerFast
>>> tokenizer = MBartTokenizerFast.from_pretrained("facebook/mbart-large-en-ro", src_lang="en_XX", tgt_lang="ro_RO")
>>> example_english_phrase = " UN Chief Says There Is No Military Solution in Syria"
>>> expected_translation_romanian = "Şeful ONU declară că nu există o soluţie militară în Siria"
>>> inputs = tokenizer(example_english_phrase, text_target=expected_translation_romanian, return_tensors="pt")
```
"""
# Class attributes: constants shared by all MBartTokenizerFast instances
vocab_files_names = VOCAB_FILES_NAMES  # Names of the vocabulary files used by the tokenizer
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES  # Maximum input sizes, from the pretrained positional embedding sizes
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP  # Map from checkpoint name to pretrained vocabulary files
model_input_names = ["input_ids", "attention_mask"]  # Names of the inputs expected by the model
slow_tokenizer_class = MBartTokenizer  # The corresponding slow (SentencePiece-based) tokenizer class
prefix_tokens: List[int] = []  # Special tokens prepended to every sequence
suffix_tokens: List[int] = []  # Special tokens appended to every sequence
def __init__(
self,
vocab_file=None,
tokenizer_file=None,
bos_token="<s>",
eos_token="</s>",
sep_token="</s>",
cls_token="<s>",
unk_token="<unk>",
pad_token="<pad>",
mask_token="<mask>",
src_lang=None,
tgt_lang=None,
additional_special_tokens=None,
**kwargs,
):
# If mask_token is a string, wrap it in an AddedToken that strips the space on its left but not its right
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
# Start from a copy of FAIRSEQ_LANGUAGE_CODES as the additional special tokens
_additional_special_tokens = FAIRSEQ_LANGUAGE_CODES.copy()
# Add any user-supplied additional special tokens that are not already in the list
if additional_special_tokens is not None:
_additional_special_tokens.extend(
[t for t in additional_special_tokens if t not in _additional_special_tokens]
)
# Call the parent initializer to set up the instance attributes
super().__init__(
vocab_file=vocab_file,
tokenizer_file=tokenizer_file,
bos_token=bos_token,
eos_token=eos_token,
sep_token=sep_token,
cls_token=cls_token,
unk_token=unk_token,
pad_token=pad_token,
mask_token=mask_token,
src_lang=src_lang,
tgt_lang=tgt_lang,
additional_special_tokens=_additional_special_tokens,
**kwargs,
)
# Instance attributes: vocab file, language-code-to-ID map, current source language code, target language, and special tokens
self.vocab_file = vocab_file
self.lang_code_to_id = {
lang_code: self.convert_tokens_to_ids(lang_code) for lang_code in FAIRSEQ_LANGUAGE_CODES
}
self._src_lang = src_lang if src_lang is not None else "en_XX"
self.cur_lang_code = self.convert_tokens_to_ids(self._src_lang)
self.tgt_lang = tgt_lang
self.set_src_lang_special_tokens(self._src_lang)
@property
def can_save_slow_tokenizer(self) -> bool:
# The slow tokenizer can only be saved if the original vocab file is available
return os.path.isfile(self.vocab_file) if self.vocab_file else False
@property
def src_lang(self) -> str:
# Return the current source language
return self._src_lang
@src_lang.setter
def src_lang(self, new_src_lang: str) -> None:
# Set the new source language and refresh the special tokens
self._src_lang = new_src_lang
self.set_src_lang_special_tokens(self._src_lang)
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
and adding special tokens. The special tokens depend on calling set_lang.
An MBART sequence has the following format, where `X` represents the sequence:
- `input_ids` (for the encoder): `X [eos, src_lang_code]`
- `decoder_input_ids` (for the decoder): `X [eos, tgt_lang_code]`
BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a separator.
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of input IDs with the appropriate special tokens.
"""
if token_ids_1 is None:
return self.prefix_tokens + token_ids_0 + self.suffix_tokens
# Pairs are not the expected use case, but the logic is kept for API consistency
return self.prefix_tokens + token_ids_0 + token_ids_1 + self.suffix_tokens
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed, to be used in a sequence-pair classification task. mBART does
not make use of token type ids, therefore a list of zeros is returned.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of zeros.
"""
sep = [self.sep_token_id]  # ID of the separator token
cls = [self.cls_token_id]  # ID of the classifier token
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]  # all zeros: special tokens plus the single sequence
return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]  # all zeros: special tokens plus both sequences
def _build_translation_inputs(
self, raw_inputs, return_tensors: str, src_lang: Optional[str], tgt_lang: Optional[str], **extra_kwargs
):
"""由翻译管道使用,准备生成函数的输入"""
if src_lang is None or tgt_lang is None:
raise ValueError("Translation requires a `src_lang` and a `tgt_lang` for this model")
self.src_lang = src_lang # 设置源语言属性
inputs = self(raw_inputs, add_special_tokens=True, return_tensors=return_tensors, **extra_kwargs) # 使用模型处理原始输入,添加特殊标记
tgt_lang_id = self.convert_tokens_to_ids(tgt_lang) # 将目标语言转换为其对应的 ID
inputs["forced_bos_token_id"] = tgt_lang_id # 在输入中添加强制的 BOS 标记 ID
return inputs # 返回处理后的输入
# Prepare a batch for a sequence-to-sequence model; defaults to English (en_XX) as source and Romanian (ro_RO) as target
def prepare_seq2seq_batch(
self,
src_texts: List[str],
src_lang: str = "en_XX",
tgt_texts: Optional[List[str]] = None,
tgt_lang: str = "ro_RO",
**kwargs,
) -> BatchEncoding:
self.src_lang = src_lang  # set the source language
self.tgt_lang = tgt_lang  # set the target language
# Delegate to the parent class to prepare the seq2seq batch
return super().prepare_seq2seq_batch(src_texts, tgt_texts, **kwargs)
# Switch to input mode: apply the source-language special tokens
def _switch_to_input_mode(self):
return self.set_src_lang_special_tokens(self.src_lang)
# Switch to target mode: apply the target-language special tokens
def _switch_to_target_mode(self):
return self.set_tgt_lang_special_tokens(self.tgt_lang)
# Reset the special tokens to the source-language setting: no prefix, suffix = [eos, src_lang_code]
def set_src_lang_special_tokens(self, src_lang) -> None:
"""Reset the special tokens to the source lang setting. No prefix and suffix=[eos, src_lang_code]."""
self.cur_lang_code = self.convert_tokens_to_ids(src_lang)  # convert the source language code to its ID
self.prefix_tokens = []  # no prefix tokens
self.suffix_tokens = [self.eos_token_id, self.cur_lang_code]  # suffix: end-of-sequence token followed by the language code
prefix_tokens_str = self.convert_ids_to_tokens(self.prefix_tokens)  # prefix tokens as strings
suffix_tokens_str = self.convert_ids_to_tokens(self.suffix_tokens)  # suffix tokens as strings
# Rebuild the tokenizer's post-processor with a template that appends the suffix tokens
self._tokenizer.post_processor = processors.TemplateProcessing(
single=prefix_tokens_str + ["$A"] + suffix_tokens_str,
pair=prefix_tokens_str + ["$A", "$B"] + suffix_tokens_str,
special_tokens=list(zip(prefix_tokens_str + suffix_tokens_str, self.prefix_tokens + self.suffix_tokens)),
)
# Reset the special tokens to the target-language setting: no prefix, suffix = [eos, tgt_lang_code]
def set_tgt_lang_special_tokens(self, lang: str) -> None:
"""Reset the special tokens to the target language setting. No prefix and suffix=[eos, tgt_lang_code]."""
self.cur_lang_code = self.convert_tokens_to_ids(lang)  # convert the target language code to its ID
self.prefix_tokens = []  # no prefix tokens
self.suffix_tokens = [self.eos_token_id, self.cur_lang_code]  # suffix: end-of-sequence token followed by the language code
prefix_tokens_str = self.convert_ids_to_tokens(self.prefix_tokens)  # prefix tokens as strings
suffix_tokens_str = self.convert_ids_to_tokens(self.suffix_tokens)  # suffix tokens as strings
# Rebuild the tokenizer's post-processor with a template that appends the suffix tokens
self._tokenizer.post_processor = processors.TemplateProcessing(
single=prefix_tokens_str + ["$A"] + suffix_tokens_str,
pair=prefix_tokens_str + ["$A", "$B"] + suffix_tokens_str,
special_tokens=list(zip(prefix_tokens_str + suffix_tokens_str, self.prefix_tokens + self.suffix_tokens)),
)
# Save the vocabulary to a directory; takes the directory path and an optional filename prefix, and returns a tuple with the path of the written file
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
# The fast tokenizer can only save the slow tokenizer's vocabulary if the original vocab file is available
if not self.can_save_slow_tokenizer:
raise ValueError(
"Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
"tokenizer."
)
# If the save directory does not exist, log an error and return
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory.")
return
# Build the output vocabulary file path from the optional prefix and the standard vocab file name
out_vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
# Copy the current vocab file to the output path if it is not already the same file
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
copyfile(self.vocab_file, out_vocab_file)
# Return a tuple containing the path of the written vocabulary file
return (out_vocab_file,)
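Tying the two language modes together, a hedged end-to-end sketch (downloads the checkpoint; `text_target=` routes the second text through the target-language post-processor set up above):
```
from transformers import MBartTokenizerFast

tok = MBartTokenizerFast.from_pretrained("facebook/mbart-large-en-ro", src_lang="en_XX", tgt_lang="ro_RO")
src = tok("UN Chief Says There Is No Military Solution in Syria")
tgt = tok(text_target="Şeful ONU declară că nu există o soluţie militară în Siria")

print(tok.convert_ids_to_tokens(src["input_ids"])[-2:])  # ['</s>', 'en_XX']
print(tok.convert_ids_to_tokens(tgt["input_ids"])[-2:])  # ['</s>', 'ro_RO']
```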
.\models\mbart\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_flax_available,
is_sentencepiece_available,
is_tf_available,
is_tokenizers_available,
is_torch_available,
)
_import_structure = {"configuration_mbart": ["MBART_PRETRAINED_CONFIG_ARCHIVE_MAP", "MBartConfig", "MBartOnnxConfig"]}
try:
if not is_sentencepiece_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_mbart"] = ["MBartTokenizer"]
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_mbart_fast"] = ["MBartTokenizerFast"]
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_mbart"] = [
"MBART_PRETRAINED_MODEL_ARCHIVE_LIST",
"MBartForCausalLM",
"MBartForConditionalGeneration",
"MBartForQuestionAnswering",
"MBartForSequenceClassification",
"MBartModel",
"MBartPreTrainedModel",
]
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_tf_mbart"] = [
"TFMBartForConditionalGeneration",
"TFMBartModel",
"TFMBartPreTrainedModel",
]
try:
if not is_flax_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_flax_mbart"] = [
"FlaxMBartForConditionalGeneration",
"FlaxMBartForQuestionAnswering",
"FlaxMBartForSequenceClassification",
"FlaxMBartModel",
"FlaxMBartPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_mbart import MBART_PRETRAINED_CONFIG_ARCHIVE_MAP, MBartConfig, MBartOnnxConfig
try:
if not is_sentencepiece_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_mbart import MBartTokenizer
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_mbart_fast import MBartTokenizerFast
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_mbart import (
MBART_PRETRAINED_MODEL_ARCHIVE_LIST,
MBartForCausalLM,
MBartForConditionalGeneration,
MBartForQuestionAnswering,
MBartForSequenceClassification,
MBartModel,
MBartPreTrainedModel,
)
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_tf_mbart import (
TFMBartForConditionalGeneration,
TFMBartModel,
TFMBartPreTrainedModel,
)
try:
if not is_flax_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_flax_mbart import (
FlaxMBartForConditionalGeneration,
FlaxMBartForQuestionAnswering,
FlaxMBartForSequenceClassification,
FlaxMBartModel,
FlaxMBartPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
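Because of the `_LazyModule` registration above, importing the package is cheap and the heavy submodules are only imported on first attribute access. A small sketch (only the configuration is touched, so no optional backend is required):
```
from transformers.models import mbart

# Attribute access triggers the import of configuration_mbart via _LazyModule.
config = mbart.MBartConfig()
print(config.model_type, config.d_model)  # mbart 1024
```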
.\models\mbart50\tokenization_mbart50.py
import os
from shutil import copyfile
from typing import Any, Dict, List, Optional, Tuple
import sentencepiece as spm
from ...tokenization_utils import AddedToken, BatchEncoding, PreTrainedTokenizer
from ...utils import logging
logger = logging.get_logger(__name__)
SPIECE_UNDERLINE = "▁"
VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"facebook/mbart-large-50-one-to-many-mmt": (
"https://huggingface.co/facebook/mbart-large-50-one-to-many-mmt/resolve/main/sentencepiece.bpe.model"
),
}
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"facebook/mbart-large-50-one-to-many-mmt": 1024,
}
FAIRSEQ_LANGUAGE_CODES = ["ar_AR", "cs_CZ", "de_DE", "en_XX", "es_XX", "et_EE", "fi_FI", "fr_XX", "gu_IN", "hi_IN", "it_IT", "ja_XX", "kk_KZ", "ko_KR", "lt_LT", "lv_LV", "my_MM", "ne_NP", "nl_XX", "ro_RO", "ru_RU", "si_LK", "tr_TR", "vi_VN", "zh_CN", "af_ZA", "az_AZ", "bn_IN", "fa_IR", "he_IL", "hr_HR", "id_ID", "ka_GE", "km_KH", "mk_MK", "ml_IN", "mn_MN", "mr_IN", "pl_PL", "ps_AF", "pt_XX", "sv_SE", "sw_KE", "ta_IN", "te_IN", "th_TH", "tl_XX", "uk_UA", "ur_PK", "xh_ZA", "gl_ES", "sl_SI"]
class MBart50Tokenizer(PreTrainedTokenizer):
"""
Construct a MBart50 tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).
This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
this superclass for more information regarding those methods.
"""
Args:
vocab_file (`str`):
Path to the vocabulary file.
src_lang (`str`, *optional*):
A string representing the source language.
tgt_lang (`str`, *optional*):
A string representing the target language.
eos_token (`str`, *optional*, defaults to `"</s>"`):
The end of sequence token.
sep_token (`str`, *optional*, defaults to `"</s>"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens.
cls_token (`str`, *optional*, defaults to `"<s>"`):
The classifier token which is used when doing sequence classification (classification of the whole sequence
instead of per-token classification). It is the first token of the sequence when built with special tokens.
unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
pad_token (`str`, *optional*, defaults to `"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
mask_token (`str`, *optional*, defaults to `"<mask>"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
sp_model_kwargs (`dict`, *optional*):
Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
to set:
- `enable_sampling`: Enable subword regularization.
- `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
- `nbest_size = {0,1}`: No sampling is performed.
- `nbest_size > 1`: samples from the nbest_size results.
- `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
using forward-filtering-and-backward-sampling algorithm.
- `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
BPE-dropout.
Examples:
```
>>> from transformers import MBart50Tokenizer
>>> tokenizer = MBart50Tokenizer.from_pretrained("facebook/mbart-large-50", src_lang="en_XX", tgt_lang="ro_RO")
>>> src_text = " UN Chief Says There Is No Military Solution in Syria"
>>> tgt_text = "Şeful ONU declară că nu există o soluţie militară în Siria"
>>> model_inputs = tokenizer(src_text, text_target=tgt_text, return_tensors="pt")
```"""
vocab_files_names = VOCAB_FILES_NAMES
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
model_input_names = ["input_ids", "attention_mask"]
prefix_tokens: List[int] = []
suffix_tokens: List[int] = []
def __init__(
self,
vocab_file,
src_lang=None,
tgt_lang=None,
eos_token="</s>",
sep_token="</s>",
cls_token="<s>",
unk_token="<unk>",
pad_token="<pad>",
mask_token="<mask>",
sp_model_kwargs: Optional[Dict[str, Any]] = None,
**kwargs,
) -> None:
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
kwargs["additional_special_tokens"] = kwargs.get("additional_special_tokens", []) or []
kwargs["additional_special_tokens"] += [
code for code in FAIRSEQ_LANGUAGE_CODES if code not in kwargs["additional_special_tokens"]
]
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(str(vocab_file))
self.vocab_file = vocab_file
self.fairseq_tokens_to_ids = {"<s>": 0, "<pad>": 1, "</s>": 2, "<unk>": 3}
self.fairseq_offset = 1
self.sp_model_size = len(self.sp_model)
self.lang_code_to_id = {
code: self.sp_model_size + i + self.fairseq_offset for i, code in enumerate(FAIRSEQ_LANGUAGE_CODES)
}
self.id_to_lang_code = {v: k for k, v in self.lang_code_to_id.items()}
self.fairseq_tokens_to_ids["<mask>"] = len(self.sp_model) + len(self.lang_code_to_id) + self.fairseq_offset
self.fairseq_tokens_to_ids.update(self.lang_code_to_id)
self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}
super().__init__(
src_lang=src_lang,
tgt_lang=tgt_lang,
eos_token=eos_token,
unk_token=unk_token,
sep_token=sep_token,
cls_token=cls_token,
pad_token=pad_token,
mask_token=mask_token,
sp_model_kwargs=self.sp_model_kwargs,
**kwargs,
)
self._src_lang = src_lang if src_lang is not None else "en_XX"
self.cur_lang_code_id = self.lang_code_to_id[self._src_lang]
self.set_src_lang_special_tokens(self._src_lang)
@property
def vocab_size(self) -> int:
return len(self.sp_model) + len(self.lang_code_to_id) + self.fairseq_offset + 1
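The resulting ID layout is: the four fairseq specials at 0-3, the SentencePiece pieces shifted by `fairseq_offset`, then the language codes, and finally `<mask>`. A toy sketch with hypothetical sizes (the real SentencePiece model is much larger):
```
# Hypothetical sizes for illustration only.
sp_model_size, fairseq_offset = 10, 1
codes = ["ar_AR", "cs_CZ"]
lang_code_to_id = {code: sp_model_size + i + fairseq_offset for i, code in enumerate(codes)}
mask_id = sp_model_size + len(codes) + fairseq_offset
print(lang_code_to_id, mask_id)  # {'ar_AR': 11, 'cs_CZ': 12} 13
```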
@property
def src_lang(self) -> str:
return self._src_lang
@src_lang.setter
def src_lang(self, new_src_lang: str) -> None:
self._src_lang = new_src_lang
self.set_src_lang_special_tokens(self._src_lang)
def __getstate__(self) -> Dict:
state = self.__dict__.copy()
state["sp_model"] = None
return state
def __setstate__(self, d: Dict) -> None:
self.__dict__ = d
if not hasattr(self, "sp_model_kwargs"):
self.sp_model_kwargs = {}
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(self.vocab_file)
def get_vocab(self) -> Dict:
vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
vocab.update(self.added_tokens_encoder)
return vocab
def _tokenize(self, text: str) -> List[str]:
return self.sp_model.encode(text, out_type=str)
def _convert_token_to_id(self, token: str) -> int:
"""Converts a token (str) in an id using the vocab."""
if token in self.fairseq_tokens_to_ids:
return self.fairseq_tokens_to_ids[token]
spm_id = self.sp_model.PieceToId(token)
return spm_id + self.fairseq_offset if spm_id else self.unk_token_id
def _convert_id_to_token(self, index: int) -> str:
"""Converts an index (integer) in a token (str) using the vocab."""
if index in self.fairseq_ids_to_tokens:
return self.fairseq_ids_to_tokens[index]
return self.sp_model.IdToPiece(index - self.fairseq_offset)
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (string) in a single string."""
current_sub_tokens = []
out_string = ""
prev_is_special = False
for token in tokens:
if token in self.all_special_tokens:
if not prev_is_special:
out_string += " "
out_string += self.sp_model.decode(current_sub_tokens) + token
prev_is_special = True
current_sub_tokens = []
else:
current_sub_tokens.append(token)
prev_is_special = False
out_string += self.sp_model.decode(current_sub_tokens)
return out_string.strip()
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
out_vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
copyfile(self.vocab_file, out_vocab_file)
elif not os.path.isfile(self.vocab_file):
with open(out_vocab_file, "wb") as fi:
content_spiece_model = self.sp_model.serialized_model_proto()
fi.write(content_spiece_model)
return (out_vocab_file,)
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` method.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
return super().get_special_tokens_mask(
token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
)
prefix_ones = [1] * len(self.prefix_tokens)
suffix_ones = [1] * len(self.suffix_tokens)
if token_ids_1 is None:
return prefix_ones + ([0] * len(token_ids_0)) + suffix_ones
return prefix_ones + ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones
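With the MBart-50 convention of `prefix_tokens = [src_lang_code]` and `suffix_tokens = [eos]` (set further below), the mask for a single sequence is simply a 1, then zeros, then a 1. A tiny illustration with hypothetical IDs:
```
prefix_ones, suffix_ones = [1], [1]          # one prefix token, one suffix token
token_ids_0 = [100, 101, 102]                # hypothetical sequence token IDs
print(prefix_ones + [0] * len(token_ids_0) + suffix_ones)  # [1, 0, 0, 0, 1]
```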
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. An MBART-50 sequence has the following format, where `X` represents the sequence:
- `input_ids` (for encoder) `[src_lang_code] X [eos]`
- `labels`: (for decoder) `[tgt_lang_code] X [eos]`
BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a
separator.
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
if token_ids_1 is None:
return self.prefix_tokens + token_ids_0 + self.suffix_tokens
return self.prefix_tokens + token_ids_0 + token_ids_1 + self.suffix_tokens
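A short usage sketch of the `[src_lang_code] X [eos]` format (assuming the `facebook/mbart-large-50` checkpoint and its SentencePiece model can be downloaded):
```
from transformers import MBart50Tokenizer

tok = MBart50Tokenizer.from_pretrained("facebook/mbart-large-50", src_lang="en_XX")
print(tok.convert_ids_to_tokens(tok("Hello").input_ids))
# Roughly ['en_XX', '▁Hello', '</s>']: language code first, </s> last, no BOS.
```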
def _build_translation_inputs(
self, raw_inputs, return_tensors: str, src_lang: Optional[str], tgt_lang: Optional[str], **extra_kwargs
):
"""Used by translation pipeline, to prepare inputs for the generate function"""
if src_lang is None or tgt_lang is None:
raise ValueError("Translation requires a `src_lang` and a `tgt_lang` for this model")
self.src_lang = src_lang
inputs = self(raw_inputs, add_special_tokens=True, return_tensors=return_tensors, **extra_kwargs)
tgt_lang_id = self.convert_tokens_to_ids(tgt_lang)
inputs["forced_bos_token_id"] = tgt_lang_id
return inputs
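This is what lets the translation pipeline pin the first generated token to the target language: the returned encoding carries `forced_bos_token_id`, which is then passed on to `generate()`. A hedged sketch (same checkpoint assumption as above):
```
from transformers import MBart50Tokenizer

tok = MBart50Tokenizer.from_pretrained("facebook/mbart-large-50")
inputs = tok._build_translation_inputs("Hello", return_tensors="pt", src_lang="en_XX", tgt_lang="ro_RO")
print(inputs["forced_bos_token_id"] == tok.convert_tokens_to_ids("ro_RO"))  # True
```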
def prepare_seq2seq_batch(
self,
src_texts: List[str],
src_lang: str = "en_XX",
tgt_texts: Optional[List[str]] = None,
tgt_lang: str = "ro_RO",
**kwargs,
) -> BatchEncoding:
self.src_lang = src_lang
self.tgt_lang = tgt_lang
return super().prepare_seq2seq_batch(src_texts, tgt_texts, **kwargs)
def _switch_to_input_mode(self):
return self.set_src_lang_special_tokens(self.src_lang)
def _switch_to_target_mode(self):
return self.set_tgt_lang_special_tokens(self.tgt_lang)
def set_src_lang_special_tokens(self, src_lang: str) -> None:
"""Reset the special tokens to the source lang setting. prefix=[src_lang_code] and suffix=[eos]."""
self.cur_lang_code_id = self.lang_code_to_id[src_lang]
self.prefix_tokens = [self.cur_lang_code_id]
self.suffix_tokens = [self.eos_token_id]
def set_tgt_lang_special_tokens(self, tgt_lang: str) -> None:
"""重设特殊标记以适应目标语言设置。前缀=[tgt_lang_code] 和 后缀=[eos]。"""
self.cur_lang_code_id = self.lang_code_to_id[tgt_lang]
self.prefix_tokens = [self.cur_lang_code_id]
self.suffix_tokens = [self.eos_token_id]
.\models\mbart50\tokenization_mbart50_fast.py
import os
from shutil import copyfile
from typing import List, Optional, Tuple
from tokenizers import processors
from ...tokenization_utils import AddedToken, BatchEncoding
from ...tokenization_utils_fast import PreTrainedTokenizerFast
from ...utils import is_sentencepiece_available, logging
if is_sentencepiece_available():
from .tokenization_mbart50 import MBart50Tokenizer
else:
MBart50Tokenizer = None
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model", "tokenizer_file": "tokenizer.json"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"facebook/mbart-large-50-one-to-many-mmt": (
"https://huggingface.co/facebook/mbart-large-50-one-to-many-mmt/resolve/main/sentencepiece.bpe.model"
),
},
"tokenizer_file": {
"facebook/mbart-large-50-one-to-many-mmt": (
"https://huggingface.co/facebook/mbart-large-50-one-to-many-mmt/resolve/main/tokenizer.json"
),
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"facebook/mbart-large-50-one-to-many-mmt": 1024,
}
FAIRSEQ_LANGUAGE_CODES = ["ar_AR", "cs_CZ", "de_DE", "en_XX", "es_XX", "et_EE", "fi_FI", "fr_XX", "gu_IN", "hi_IN", "it_IT", "ja_XX", "kk_KZ", "ko_KR", "lt_LT", "lv_LV", "my_MM", "ne_NP", "nl_XX", "ro_RO", "ru_RU", "si_LK", "tr_TR", "vi_VN", "zh_CN", "af_ZA", "az_AZ", "bn_IN", "fa_IR", "he_IL", "hr_HR", "id_ID", "ka_GE", "km_KH", "mk_MK", "ml_IN", "mn_MN", "mr_IN", "pl_PL", "ps_AF", "pt_XX", "sv_SE", "sw_KE", "ta_IN", "te_IN", "th_TH", "tl_XX", "uk_UA", "ur_PK", "xh_ZA", "gl_ES", "sl_SI"]
from transformers import MBart50TokenizerFast
tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50", src_lang="en_XX", tgt_lang="ro_RO")
src_text = " UN Chief Says There Is No Military Solution in Syria"
tgt_text = "Şeful ONU declară că nu există o soluţie militară în Siria"
model_inputs = tokenizer(src_text, text_target=tgt_text, return_tensors="pt")
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
kwargs["additional_special_tokens"] = kwargs.get("additional_special_tokens", []) or []
kwargs["additional_special_tokens"] += [
code for code in FAIRSEQ_LANGUAGE_CODES if code not in kwargs["additional_special_tokens"]
]
super().__init__(
vocab_file,
src_lang=src_lang,
tgt_lang=tgt_lang,
tokenizer_file=tokenizer_file,
eos_token=eos_token,
sep_token=sep_token,
cls_token=cls_token,
unk_token=unk_token,
pad_token=pad_token,
mask_token=mask_token,
**kwargs,
)
self.vocab_file = vocab_file
self.lang_code_to_id = {
lang_code: self.convert_tokens_to_ids(lang_code) for lang_code in FAIRSEQ_LANGUAGE_CODES
}
self._src_lang = src_lang if src_lang is not None else "en_XX"
self.tgt_lang = tgt_lang
self.cur_lang_code_id = self.lang_code_to_id[self._src_lang]
self.set_src_lang_special_tokens(self._src_lang)
@property
def can_save_slow_tokenizer(self) -> bool:
return os.path.isfile(self.vocab_file) if self.vocab_file else False
@property
def src_lang(self) -> str:
return self._src_lang
@src_lang.setter
def src_lang(self, new_src_lang: str) -> None:
self._src_lang = new_src_lang
self.set_src_lang_special_tokens(self._src_lang)
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. The special tokens depend on calling set_lang.
An MBART-50 sequence has the following format, where `X` represents the sequence:
- `input_ids` (for encoder) `[src_lang_code] X [eos]`
- `labels`: (for decoder) `[tgt_lang_code] X [eos]`
BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a
separator.
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
if token_ids_1 is None:
return self.prefix_tokens + token_ids_0 + self.suffix_tokens
return self.prefix_tokens + token_ids_0 + token_ids_1 + self.suffix_tokens
def prepare_seq2seq_batch(
self,
src_texts: List[str],
src_lang: str = "en_XX",
tgt_texts: Optional[List[str]] = None,
tgt_lang: str = "ro_RO",
**kwargs,
) -> BatchEncoding:
self.src_lang = src_lang
self.tgt_lang = tgt_lang
return super().prepare_seq2seq_batch(src_texts, tgt_texts, **kwargs)
def _switch_to_input_mode(self):
return self.set_src_lang_special_tokens(self.src_lang)
def _switch_to_target_mode(self):
return self.set_tgt_lang_special_tokens(self.tgt_lang)
def set_src_lang_special_tokens(self, src_lang: str) -> None:
"""Reset the special tokens to the source lang setting. prefix=[src_lang_code] and suffix=[eos]."""
self.cur_lang_code_id = self.convert_tokens_to_ids(src_lang)
self.prefix_tokens = [self.cur_lang_code_id]
self.suffix_tokens = [self.eos_token_id]
prefix_tokens_str = self.convert_ids_to_tokens(self.prefix_tokens)
suffix_tokens_str = self.convert_ids_to_tokens(self.suffix_tokens)
self._tokenizer.post_processor = processors.TemplateProcessing(
single=prefix_tokens_str + ["$A"] + suffix_tokens_str,
pair=prefix_tokens_str + ["$A", "$B"] + suffix_tokens_str,
special_tokens=list(zip(prefix_tokens_str + suffix_tokens_str, self.prefix_tokens + self.suffix_tokens)),
)
def set_tgt_lang_special_tokens(self, tgt_lang: str) -> None:
"""Reset the special tokens to the target language setting. prefix=[src_lang_code] and suffix=[eos]."""
self.cur_lang_code_id = self.convert_tokens_to_ids(tgt_lang)
self.prefix_tokens = [self.cur_lang_code_id]
self.suffix_tokens = [self.eos_token_id]
prefix_tokens_str = self.convert_ids_to_tokens(self.prefix_tokens)
suffix_tokens_str = self.convert_ids_to_tokens(self.suffix_tokens)
self._tokenizer.post_processor = processors.TemplateProcessing(
single=prefix_tokens_str + ["$A"] + suffix_tokens_str,
pair=prefix_tokens_str + ["$A", "$B"] + suffix_tokens_str,
special_tokens=list(zip(prefix_tokens_str + suffix_tokens_str, self.prefix_tokens + self.suffix_tokens)),
)
def _build_translation_inputs(
self, raw_inputs, return_tensors: str, src_lang: Optional[str], tgt_lang: Optional[str], **extra_kwargs
):
"""Used by translation pipeline, to prepare inputs for the generate function"""
if src_lang is None or tgt_lang is None:
raise ValueError("Translation requires a `src_lang` and a `tgt_lang` for this model")
self.src_lang = src_lang
inputs = self(raw_inputs, add_special_tokens=True, return_tensors=return_tensors, **extra_kwargs)
tgt_lang_id = self.convert_tokens_to_ids(tgt_lang)
inputs["forced_bos_token_id"] = tgt_lang_id
return inputs
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
if not self.can_save_slow_tokenizer:
raise ValueError(
"Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
"tokenizer."
)
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
out_vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
copyfile(self.vocab_file, out_vocab_file)
return (out_vocab_file,)
.\models\mbart50\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_sentencepiece_available, is_tokenizers_available
_import_structure = {}
try:
if not is_sentencepiece_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_mbart50"] = ["MBart50Tokenizer"]
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_mbart50_fast"] = ["MBart50TokenizerFast"]
if TYPE_CHECKING:
try:
if not is_sentencepiece_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_mbart50 import MBart50Tokenizer
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_mbart50_fast import MBart50TokenizerFast
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\mega\configuration_mega.py
""" MEGA configuration"""
from collections import OrderedDict
from typing import Mapping
from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfig
from ...utils import logging
logger = logging.get_logger(__name__)
MEGA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"mnaylor/mega-base-wikitext": "https://huggingface.co/mnaylor/mega-base-wikitext/resolve/main/config.json",
}
class MegaConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`MegaModel`]. It is used to instantiate a Mega
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the Mega
[mnaylor/mega-base-wikitext](https://huggingface.co/mnaylor/mega-base-wikitext) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Examples:
```
>>> from transformers import MegaConfig, MegaModel
>>> # Initializing a Mega configuration
>>> configuration = MegaConfig()
>>> # Initializing a model (with random weights) from the configuration
>>> model = MegaModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
model_type = "mega"
def __init__(
self,
vocab_size=30522,
hidden_size=128,
num_hidden_layers=4,
intermediate_size=256,
ema_projection_size=16,
bidirectional=True,
shared_representation_size=64,
use_chunking=False,
chunk_size=-1,
truncation=None,
normalize_before_mega=True,
normalization_type="scalenorm",
norm_affine=True,
activation="silu",
attention_activation="softmax",
dropout_prob=0.1,
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
use_feature_dropout=False,
use_normalized_ffn=True,
nffn_hidden_size=256,
normalize_before_ffn=True,
nffn_activation_dropout_prob=0.1,
max_positions=2048,
add_token_type_embeddings=False,
type_vocab_size=2,
initializer_range=0.02,
ema_delta_alpha_range=0.2,
ema_beta_range=0.02,
ema_gamma_omega_range=1.0,
pad_token_id=1,
bos_token_id=0,
eos_token_id=2,
relative_positional_bias="rotary",
classifier_dropout=None,
use_cache=True,
add_lm_hidden_dense_layer=True,
**kwargs,
):
super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.activation = activation
self.attention_activation = attention_activation
self.intermediate_size = intermediate_size
self.ema_projection_size = ema_projection_size
self.bidirectional = bidirectional
self.shared_representation_size = shared_representation_size
self.use_chunking = use_chunking
self.chunk_size = chunk_size
self.truncation = truncation
self.normalize_before_mega = normalize_before_mega
self.normalization_type = normalization_type
self.norm_affine = norm_affine
self.dropout_prob = dropout_prob
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.use_feature_dropout = use_feature_dropout
self.use_normalized_ffn = use_normalized_ffn
self.nffn_hidden_size = nffn_hidden_size
self.normalize_before_ffn = normalize_before_ffn
self.nffn_activation_dropout_prob = nffn_activation_dropout_prob
self.max_positions = max_positions
self.add_token_type_embeddings = add_token_type_embeddings
self.type_vocab_size = type_vocab_size
self.initializer_range = initializer_range
self.ema_delta_alpha_range = ema_delta_alpha_range
self.ema_beta_range = ema_beta_range
self.ema_gamma_omega_range = ema_gamma_omega_range
self.relative_positional_bias = relative_positional_bias
self.use_cache = use_cache
self.classifier_dropout = classifier_dropout
self.add_lm_hidden_dense_layer = add_lm_hidden_dense_layer
self.num_attention_heads = 1
class MegaOnnxConfig(OnnxConfig):
@property
def inputs(self) -> Mapping[str, Mapping[int, str]]:
if self.task == "multiple-choice":
dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"}
else:
dynamic_axis = {0: "batch", 1: "sequence"}
return OrderedDict(
[
("input_ids", dynamic_axis),
("attention_mask", dynamic_axis),
]
)
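For the default task, the dynamic axes resolve to batch and sequence only. A quick sketch (assuming a transformers version that still ships the Mega module):
```
from transformers import MegaConfig
from transformers.models.mega.configuration_mega import MegaOnnxConfig

onnx_config = MegaOnnxConfig(MegaConfig())
print(dict(onnx_config.inputs))
# {'input_ids': {0: 'batch', 1: 'sequence'}, 'attention_mask': {0: 'batch', 1: 'sequence'}}
```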
.\models\mega\convert_mega_original_pytorch_checkpoint_to_pytorch.py
"""
Convert Mega pretrained checkpoint. Built to convert the Masked LM checkpoint located at
https://huggingface.co/mnaylor/mega-wikitext-103
Requirements:
- clone the Mega repo and install fairseq from there
1. git clone https://github.com/facebookresearch/mega.git
2. cd mega && pip install -e .
- clone the pretrained weights for the original implementation from the hugging face repo
* use this location as the path for pretrained weights
"""
import argparse
import os
import pickle as pkl
import torch
from torch import nn
from transformers import AutoTokenizer, MegaConfig, MegaForMaskedLM
try:
from fairseq.modules.mega_layer import MegaEncoderLayer
except ImportError:
raise ImportError("You need to install the version of fairseq from the Mega repo!")
class MegaLM(nn.Module):
"The base class for our Mega encoder - given input IDs, embed text and return encoder output"
def __init__(self, mega_args, depth, vocab_size):
super().__init__()
self.mega_args = mega_args
self.embedding_layer = nn.Embedding(vocab_size, self.mega_args.encoder_embed_dim)
self.encoders = nn.ModuleList([MegaEncoderLayer(self.mega_args) for _ in range(depth)])
self.depth = depth
def forward(self, input_ids, attention_mask, batch_first=True, ignore_mask_value=0):
"""
Code for a forward pass - expects input_ids and attention_mask to come from a Hugging Face tokenizer as PyTorch
tensors, and returns a tensor of size (batch, n_classes) containing classification logits
Other options:
- batch_first: boolean indicating whether the batch dimension is first in input_ids (default: True, which
aligns with the HF tokenizer behavior)
- ignore_mask_value: the value in attention_mask that identifies tokens that should be ignored (default: 0,
which aligns with HF tokenizer)
"""
if batch_first:
input_ids = input_ids.T
if ignore_mask_value == 0:
attention_mask = 1 - attention_mask
embeds = self.embedding_layer(input_ids)
for encoder in self.encoders:
embeds = encoder(embeds, attention_mask)
if batch_first:
return torch.transpose(embeds, 0, 1)
else:
return embeds
class OriginalMegaForMaskedLM(nn.Module):
"A wrapper class for doing masked language modeling with Mega"
def __init__(self, mega_args, depth, vocab_size):
super().__init__()
self.mega = MegaLM(mega_args, depth, vocab_size)
self.mlm_head = nn.Linear(mega_args.encoder_embed_dim, vocab_size)
self.dropout = nn.Dropout(p=0.1)
def forward(self, input_ids, attention_mask, batch_first=True, ignore_mask_value=0):
"""
Perform a forward pass through the Mega encoder and the MLM head, and return logits for each vocabulary entry.
If `batch_first` is True (the default, matching the Hugging Face tokenizer behavior), the output has shape
(batch size, sequence length, vocab size); otherwise it is (sequence length, batch size, vocab size).
"""
encoder_output = self.mega(input_ids, attention_mask, batch_first, ignore_mask_value)
return self.mlm_head(self.dropout(encoder_output))
def convert_checkpoint_to_huggingface(pretrained_checkpoint_path, output_path, includes_tokenizer):
with open(os.path.join(pretrained_checkpoint_path, "model_args.pkl"), "rb") as f:
mega_original_args = pkl.load(f)
original_mlm = OriginalMegaForMaskedLM(**mega_original_args).eval()
print(
"Original Mega encoder:",
original_mlm.mega.load_state_dict(
torch.load(os.path.join(pretrained_checkpoint_path, "encoder_weights.pt"), map_location="cpu")
),
)
print(
"Original Mega MLM layer:",
original_mlm.mlm_head.load_state_dict(
torch.load(os.path.join(pretrained_checkpoint_path, "mlm_head_weights.pt"), map_location="cpu")
),
)
hf_config = MegaConfig(
num_hidden_layers=mega_original_args["depth"],
vocab_size=mega_original_args["vocab_size"],
hidden_size=mega_original_args["mega_args"].encoder_embed_dim,
shared_representation_size=mega_original_args["mega_args"].encoder_z_dim,
intermediate_size=mega_original_args["mega_args"].encoder_hidden_dim,
ema_projection_size=mega_original_args["mega_args"].encoder_n_dim,
dropout_prob=mega_original_args["mega_args"].dropout,
attention_probs_dropout_prob=mega_original_args["mega_args"].attention_dropout,
hidden_dropout_prob=mega_original_args["mega_args"].hidden_dropout,
activation=mega_original_args["mega_args"].activation_fn,
attention_activation=mega_original_args["mega_args"].attention_activation_fn,
bidirectional=mega_original_args["mega_args"].bidirectional,
use_chunking=mega_original_args["mega_args"].encoder_chunk_size > 0,
chunk_size=mega_original_args["mega_args"].encoder_chunk_size,
truncation=mega_original_args["mega_args"].truncation_length,
normalization_type=mega_original_args["mega_args"].normalization_type,
normalize_before_mega=True,
norm_affine=True,
use_feature_dropout=mega_original_args["mega_args"].feature_dropout,
relative_positional_bias=mega_original_args["mega_args"].rel_pos_bias,
max_positions=mega_original_args["mega_args"].max_source_positions,
nffn_hidden_size=mega_original_args["mega_args"].encoder_ffn_embed_dim,
normalize_before_ffn=mega_original_args["mega_args"].normalize_before,
nffn_activation_dropout_prob=0.0,
add_token_type_embeddings=False,
add_lm_hidden_dense_layer=False,
)
hf_mlm = MegaForMaskedLM(hf_config).eval()
hf_mlm.mega.embedding_layer.word_embeddings.weight = original_mlm.mega.embedding_layer.weight
original_state_dict = original_mlm.mega.encoders.state_dict()
updated_keys = {}
for module_name in original_state_dict.keys():
new_module_name = None
if "beta" in module_name:
if "move.beta" in module_name:
new_module_name = module_name.replace("move.beta", "ema_gate.ema_expansion_matrix")
elif "mega_layer.beta" in module_name:
new_module_name = module_name.replace("beta", "qk_bias")
else:
new_module_name = module_name.replace("beta", "b_param")
elif "gamma" in module_name:
if "move.gamma" in module_name:
new_module_name = module_name.replace("move.gamma", "ema_gate.kernel_projection_matrix")
elif "mega_layer.gamma" in module_name:
new_module_name = module_name.replace("gamma", "qk_weight")
else:
new_module_name = module_name.replace("gamma", "g_param")
elif "move.alpha" in module_name:
new_module_name = module_name.replace("move.alpha", "ema_gate.decay_factor")
elif "move.delta" in module_name:
new_module_name = module_name.replace("move.delta", "ema_gate.damping_factor")
elif "omega" in module_name:
new_module_name = module_name.replace("move.omega", "ema_gate.residual_weight")
if new_module_name:
updated_keys[module_name] = new_module_name
if len(updated_keys) != 0:
print(f"Renaming these keys: {updated_keys.keys()}")
else:
print("No need to rename state dict entries")
for old, new in updated_keys.items():
original_state_dict[new] = original_state_dict.pop(old)
print("HF Mega encoder:", hf_mlm.mega.layers.load_state_dict(original_state_dict))
print(
"HF Mega MLM layer:",
hf_mlm.mlm_head.load_state_dict(
torch.load(os.path.join(pretrained_checkpoint_path, "mlm_head_weights.pt"), map_location="cpu")
),
)
input_ids = torch.randint(0, hf_config.vocab_size, size=(4, 256))
input_mask = torch.ones_like(input_ids)
input_mask[:, -10:] = 0
original_output = original_mlm(input_ids, input_mask, batch_first=True, ignore_mask_value=0)
hf_output = hf_mlm(input_ids, input_mask)[0]
print(f"original output {original_output.shape}")
print(f"hf output {hf_output.shape}")
print(f"max diff: {(original_output - hf_output).max()}")
success = torch.allclose(original_output, hf_output, atol=1e-3)
if success:
print("Yay!")
hf_mlm.save_pretrained(output_path)
else:
raise RuntimeError(f"Something's broken :(\nOriginal:\n{original_output}\n\nHF\n{hf_output}\n{hf_mlm}")
if includes_tokenizer:
print("Transferring tokenizer")
tokenizer = AutoTokenizer.from_pretrained(pretrained_checkpoint_path)
tokenizer.save_pretrained(output_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--pretrained_checkpoint_path",
default=None,
type=str,
required=True,
help="Point to the directory containing your model weights using the official Mega repo",
)
parser.add_argument(
"--output_path", default=None, type=str, required=True, help="Location to save the Hugging Face version"
)
parser.add_argument(
"--includes_tokenizer",
action="store_true",
help="Use this flag if there is a Hugging Face tokenizer in the original checkpoint repo",
)
args = parser.parse_args()
convert_checkpoint_to_huggingface(args.pretrained_checkpoint_path, args.output_path, args.includes_tokenizer)
This is the entry point of the command-line program: it uses the argparse library to parse the arguments above and calls `convert_checkpoint_to_huggingface` to convert the pretrained checkpoint.
.\models\mega\modeling_mega.py
"""
PyTorch MEGA model.
"""
import math
from typing import List, Optional, Tuple, Union
import torch
import torch.nn.functional as F
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN
from ...modeling_outputs import (
BaseModelOutputWithPoolingAndCrossAttentions,
CausalLMOutputWithCrossAttentions,
MaskedLMOutput,
MultipleChoiceModelOutput,
QuestionAnsweringModelOutput,
SequenceClassifierOutput,
TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import ALL_LAYERNORM_LAYERS
from ...utils import (
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_mega import MegaConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "mnaylor/mega-base-wikitext"
_CONFIG_FOR_DOC = "MegaConfig"
MEGA_PRETRAINED_MODEL_ARCHIVE_LIST = [
"mnaylor/mega-base-wikitext",
]
class MegaEmbeddings(nn.Module):
"""
Mega's basic implementation does not incorporate token type embeddings, so this is a stripped-down version of
RoBERTa's embeddings which optionally includes token types
"""
def __init__(self, config: MegaConfig):
super().__init__()
self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
self.use_token_types = config.add_token_type_embeddings
if self.use_token_types:
self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
self.register_buffer(
"token_type_ids", torch.zeros(config.max_positions, dtype=torch.long).expand((1, -1)), persistent=False
)
self.padding_idx = config.pad_token_id
def forward(self, input_ids=None, token_type_ids=None, inputs_embeds=None):
if (input_ids is None) and (inputs_embeds is None):
raise ValueError("Must provide one of input_ids or inputs_embeds")
elif input_ids is not None:
input_shape = input_ids.size()
device = input_ids.device
inputs_embeds = self.word_embeddings(input_ids)
else:
input_shape = inputs_embeds.size()[:-1]
device = inputs_embeds.device
if self.use_token_types:
if token_type_ids is None:
if hasattr(self, "token_type_ids"):
buffered_token_type_ids = self.token_type_ids[:, : input_shape[1]]
buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], input_shape[1])
token_type_ids = buffered_token_type_ids_expanded
else:
token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
token_type_embeddings = self.token_type_embeddings(token_type_ids)
embeddings = inputs_embeds + token_type_embeddings
else:
embeddings = inputs_embeds
return embeddings
class MegaSimpleRelativePositionalBias(nn.Module):
"""
Simple relative positional embeddings copied from the Mega repo; renamed variables for better readability
"""
def __init__(self, config: MegaConfig):
super().__init__()
self.config = config
self.max_positions = self.config.max_positions if self.config.chunk_size < 0 else self.config.chunk_size
self.rel_pos_bias = nn.Parameter(torch.Tensor(2 * config.max_positions - 1))
def forward(self, seq_len):
if seq_len > self.max_positions:
raise ValueError("Sequence length {} going beyond max length {}".format(seq_len, self.max_positions))
bias = self.rel_pos_bias[(self.max_positions - seq_len) : (self.max_positions + seq_len - 1)]
tile = F.pad(bias, (0, seq_len))
tile = torch.tile(tile, (seq_len,))
tile = tile[:-seq_len]
# reshape to (seq_len, 3*seq_len - 2) and keep the centered seq_len columns, so entry (i, j) is the bias at relative offset j - i
tile = tile.view(seq_len, 3 * seq_len - 2)
start = (2 * seq_len - 1) // 2
end = tile.size(1) - start
return tile[:, start:end]
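A toy illustration of the pad-and-tile trick above, with an `arange` standing in for the learned `rel_pos_bias` parameter; entry (i, j) of the final matrix is the bias at relative offset j - i:
```
import torch
import torch.nn.functional as F

seq_len = 3
bias = torch.arange(2 * seq_len - 1, dtype=torch.float)   # stand-in for the learned vector, length 2*L - 1
tile = F.pad(bias, (0, seq_len))                          # length 3*L - 1
tile = torch.tile(tile, (seq_len,))[:-seq_len]            # length L * (3*L - 2)
tile = tile.view(seq_len, 3 * seq_len - 2)
start = (2 * seq_len - 1) // 2
print(tile[:, start : tile.size(1) - start])
# tensor([[2., 3., 4.],
#         [1., 2., 3.],
#         [0., 1., 2.]])
```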
class MegaRotaryRelativePositionalBias(nn.Module):
"""
Rotary relative bias for positional information; similar in concept to RoPE (i.e. RoFormer) but taken from the Mega
repo due to differences in implementation.
When initialized, produces a positional bias which ranges from position 0 to config.max_positions, but can
extrapolate to longer sequences. Can be indexed according to input position IDs
"""
def __init__(self, config: MegaConfig):
super().__init__()
if config.hidden_size % 2 != 0:
raise RuntimeError("Rotary positional bias requires `hidden_size` to be a multiple of 2")
self.config = config
self.embed_dim = config.shared_representation_size
self.max_positions = self.config.max_positions if self.config.chunk_size < 0 else self.config.chunk_size
self.sine, self.cosine = MegaRotaryRelativePositionalBias.get_sinusoid_embeddings(
config.max_positions, self.embed_dim
)
self.alpha = nn.Parameter(torch.Tensor(1, self.embed_dim))
self.b_param = nn.Parameter(torch.Tensor(1, self.embed_dim))
self.register_buffer("_float_tensor", torch.FloatTensor([0.0]))
@staticmethod
def get_sinusoid_embeddings(max_positions: int, embedding_dim: int):
half_dim = embedding_dim // 2
emb = math.log(10000) / half_dim
emb = torch.exp(torch.arange(half_dim, dtype=torch.int64).float() * -emb)
emb = torch.arange(max_positions, dtype=torch.float).unsqueeze(1) * emb.unsqueeze(0)
return torch.sin(emb), torch.cos(emb)
def rotary(self, input):
seq_len, embed_dim = input.size()
chunk_1, chunk_2 = torch.chunk(input, 2, dim=-1)
if self.sine is None or seq_len > self.sine.size(0):
self.sine, self.cosine = MegaRotaryRelativePositionalBias.get_sinusoid_embeddings(seq_len, embed_dim)
self.max_positions = seq_len
self.sine = self.sine.to(self._float_tensor)
self.cosine = self.cosine.to(self._float_tensor)
sin = self.sine[:seq_len]
cos = self.cosine[:seq_len]
return torch.cat([chunk_1 * cos - chunk_2 * sin, chunk_2 * cos + chunk_1 * sin], dim=1)
def forward(self, seq_len):
rotary_alpha = self.rotary(self.alpha.expand(seq_len, self.embed_dim))
rotary_beta = self.rotary(self.b_param.expand(seq_len, self.embed_dim))
bias = torch.einsum("mk,nk->mn", rotary_alpha, rotary_beta)
return bias
class MegaDropout(nn.Module):
"""
A unified class for standard dropout functionality and featurewise dropout.
The original fairseq Mega repo used 2 classes for these, which included some unnecessary handling of training logic
and an unused `inplace` option. The original implementation used torch.nn.functional instead of submodules, which
is retained here as well.
"""
def __init__(self, dropout_probability, is_featurewise=False):
super().__init__()
self.dropout_probability = dropout_probability
self.is_featurewise = is_featurewise
def forward(self, input, batch_first: bool = False):
if self.is_featurewise:
if batch_first:
return F.dropout2d(
input.transpose(-1, -2), p=self.dropout_probability, training=self.training
).transpose(-1, -2)
else:
if input.dim() != 3:
raise ValueError(
"Feature dropout inputs must be exactly 3-dimensional if inputs are ordered [sequence length, batch size, hidden dimension]"
)
return F.dropout2d(input.permute(1, 2, 0), p=self.dropout_probability, training=self.training).permute(
2, 0, 1
)
else:
return F.dropout(input, p=self.dropout_probability, training=self.training)
class MegaRMSNorm(nn.Module):
"""
RMSNorm used in Mega implementation. Differs from T5's RMSNorm by applying the weight prior to taking the square
root (as opposed to after in T5)
"""
def __init__(self, number_features, eps=1e-6, affine=True):
super().__init__()
self.num_features = number_features
self.eps = eps
self.affine = affine
if affine:
self.weight = nn.Parameter(torch.Tensor(self.num_features))
else:
self.register_parameter("weight", None)
def forward(self, input):
mean_square = torch.mean(torch.square(input), dim=-1, keepdim=True)
if self.weight is not None:
input = input * self.weight
input = input * torch.rsqrt(mean_square + self.eps)
return input
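Written out, the layer computes y = (x * w) / sqrt(mean(x^2) + eps), with the mean square taken on the unscaled input and the weight applied before the inverse square root (unlike T5). A minimal equivalent sketch:
```
import torch

x = torch.randn(2, 4)
weight = torch.ones(4)   # stand-in for the learned weight vector
eps = 1e-6
y = (x * weight) * torch.rsqrt(torch.mean(x * x, dim=-1, keepdim=True) + eps)
print(y.shape)  # torch.Size([2, 4])
```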
class MegaScaleNorm(nn.Module):
"""
Scale normalization introduced in MEGA which is similar to RMSNorm, but uses a single parameter for scalar
multiplication instead of a vector, and applies over a specified dimension
"""
def __init__(self, dim, eps=1e-6, affine=True):
super().__init__()
self.dim = dim
self.eps = eps
self.affine = affine
if affine:
self.scalar = nn.Parameter(torch.Tensor(1))
else:
self.register_parameter("scalar", None)
def forward(self, input):
mean_square = torch.mean(torch.square(input), dim=self.dim, keepdim=True)
if self.scalar is not None:
input = self.scalar * input
output = input * torch.rsqrt(mean_square + self.eps)
return output
"""
A wrapper class for various layer normalization options used in Mega. Used to handle differences in expectations on
input axis locations for different normalization methods.
"""
def __init__(self, norm_type, embedding_dim, eps=1e-5, affine=True, export=False):
super().__init__()
if norm_type == "layernorm":
self.norm = nn.LayerNorm(embedding_dim, eps, elementwise_affine=affine)
elif norm_type == "scalenorm":
self.norm = MegaScaleNorm(dim=-1, eps=eps, affine=affine)
elif norm_type == "rmsnorm":
self.norm = MegaRMSNorm(embedding_dim, eps=eps, affine=affine)
elif norm_type == "batchnorm":
self.norm = nn.BatchNorm1d(embedding_dim, eps=eps, affine=affine)
elif norm_type == "syncbatchnorm":
self.norm = nn.SyncBatchNorm(embedding_dim, eps=eps, affine=affine)
else:
raise ValueError("Unknown norm type: {}".format(norm_type))
def forward(self, input):
if isinstance(self.norm, nn.modules.batchnorm._BatchNorm):
if input.dim() != 3:
raise ValueError("BatchNorm inputs must be exactly 3-dimensional")
input = input.permute(1, 2, 0)
input = self.norm(input)
return input.permute(2, 0, 1)
else:
return self.norm(input)
ALL_LAYERNORM_LAYERS.append(MegaSequenceNorm)
class MegaMultiDimensionDampedEma(nn.Module):
"""
Mega's Exponential Moving Average layer, largely left unmodified from the original repo with the exception of
variable names and moving away from the stateful representation of incremental decoding state. See
"https://arxiv.org/abs/2209.10655" for more details.
"""
def __init__(self, config: MegaConfig):
super().__init__()
self.config = config
self.embed_dim = config.hidden_size
self.ndim = config.ema_projection_size
self.bidirectional = config.bidirectional
self.truncation = config.truncation
self.scale = math.sqrt(1.0 / self.ndim)
kernel_dim = 2 * config.hidden_size if self.bidirectional else config.hidden_size
self.damping_factor = nn.Parameter(torch.Tensor(kernel_dim, self.ndim, 1))
self.decay_factor = nn.Parameter(torch.Tensor(kernel_dim, self.ndim, 1))
self.ema_expansion_matrix = nn.Parameter(torch.Tensor(kernel_dim, self.ndim, 1))
self.kernel_projection_matrix = nn.Parameter(torch.Tensor(kernel_dim, self.ndim))
self.residual_weight = nn.Parameter(torch.Tensor(config.hidden_size))
self._kernel = None
self._coeffs = None
def _compute_ema_coefficients(self):
self._coeffs = None
damping_factor = torch.sigmoid(self.damping_factor)
decay_factor = torch.sigmoid(self.decay_factor)
previous_timestep_weight = 1.0 - damping_factor * decay_factor
return damping_factor, previous_timestep_weight
def _compute_efficient_ema_kernel(self, length: int):
self._kernel = None
damping_factor, previous_timestep_weight = self._compute_ema_coefficients()
vander = torch.arange(length).to(damping_factor).view(1, 1, length) * torch.log(previous_timestep_weight)
kernel = (damping_factor * self.ema_expansion_matrix) * torch.exp(vander)
return torch.einsum("dnl,dn->dl", kernel, self.kernel_projection_matrix * self.scale)
def get_ema_coefficients(self):
if self.training:
return self._compute_ema_coefficients()
else:
if self._coeffs is None:
self._coeffs = self._compute_ema_coefficients()
return self._coeffs
def get_ema_kernel(self, length: int):
kernel_size = length if self.truncation is None else min(self.truncation, length)
if self.training:
return self._compute_efficient_ema_kernel(kernel_size)
else:
if self._kernel is None or self._kernel.size(-1) < kernel_size:
self._kernel = self._compute_efficient_ema_kernel(kernel_size)
return self._kernel[..., :kernel_size]
def fft_convolution(self, inputs, kernel, length):
inputs_fft = torch.fft.rfft(inputs.float(), n=2 * length)
kernel_fft = torch.fft.rfft(kernel.float(), n=2 * length)
convolved_sequence = torch.fft.irfft(inputs_fft * kernel_fft, n=2 * length)
return convolved_sequence
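Multiplying the length-2L real FFTs implements a zero-padded linear convolution, so the first `length` outputs equal a causal convolution of the input with the EMA kernel. A small self-contained check with toy shapes:
```
import torch

length = 5
x = torch.randn(1, 1, length)   # (batch, dim, length)
k = torch.randn(1, length)      # (dim, length) EMA kernel
x_fft = torch.fft.rfft(x.float(), n=2 * length)
k_fft = torch.fft.rfft(k.float(), n=2 * length)
fft_out = torch.fft.irfft(x_fft * k_fft, n=2 * length)[..., :length]
# Explicit causal convolution: y[t] = sum_{s <= t} k[t - s] * x[s]
direct = torch.stack([(k[0, : t + 1].flip(0) * x[0, 0, : t + 1]).sum() for t in range(length)])
print(torch.allclose(fft_out[0, 0], direct, atol=1e-4))  # True
```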
def ema_step(self, inputs, length, past_state=None):
if length == 1:
return self.one_ema_step(inputs, past_state=past_state)
damping_factor, previous_timestep_weight = self.get_ema_coefficients()
vander = torch.arange(length + 1).to(damping_factor).view(1, 1, length + 1) * torch.log(
previous_timestep_weight
)
vander = torch.exp(vander)
if past_state is not None:
past_ema_proj = vander[:, :, 1:] * (self.kernel_projection_matrix * self.scale).unsqueeze(-1)
past_ema_state = torch.einsum("bdn,dnl->bdl", past_state, past_ema_proj)
past_vandermonde = vander[:, :, -1] * past_state
else:
past_ema_state = None
past_vandermonde = None
vander = vander[:, :, :-1]
kernel = (damping_factor * self.ema_expansion_matrix) * vander
kernel_proj = torch.einsum("dnl,dn->dl", kernel, self.kernel_projection_matrix * self.scale)
ema_output = self.fft_convolution(inputs, kernel_proj, length=length)[..., 0:length]
ema_output = ema_output.type_as(inputs)
if past_ema_state is not None:
ema_output = ema_output + past_ema_state
updated_hidden_state = torch.einsum("bdl,dnl->bdn", inputs, torch.flip(kernel, dims=[2]))
if past_vandermonde is not None:
updated_hidden_state = updated_hidden_state + past_vandermonde
return ema_output.permute(2, 0, 1), updated_hidden_state
def one_ema_step(self, inputs, past_state=None):
damping_factor, previous_timestep_weight = self.get_ema_coefficients()
updated_state = (damping_factor * self.ema_expansion_matrix).squeeze(-1) * inputs
if past_state is not None:
updated_state = updated_state + previous_timestep_weight.squeeze(-1) * past_state
out = torch.einsum("bdn,dn->bd", updated_state, self.kernel_projection_matrix * self.scale)
return out.unsqueeze(0), updated_state
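A toy restatement of the single-step recurrence implemented above, h_t = (alpha * beta) x_t + (1 - alpha * delta) h_{t-1}, followed by the projection back to the model dimension (all shapes hypothetical):
```
import torch

batch, dim, ndim = 1, 4, 2
x_t = torch.randn(batch, dim, 1)         # current input step
h_prev = torch.zeros(batch, dim, ndim)   # previous EMA hidden state
alpha = torch.rand(dim, ndim)            # sigmoid(damping_factor)
delta = torch.rand(dim, ndim)            # sigmoid(decay_factor)
beta = torch.randn(dim, ndim)            # ema_expansion_matrix
eta = torch.randn(dim, ndim)             # kernel_projection_matrix * scale
h_t = (alpha * beta) * x_t + (1 - alpha * delta) * h_prev
y_t = torch.einsum("bdn,dn->bd", h_t, eta)
print(y_t.shape)  # torch.Size([1, 4])
```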
class MegaGatedCrossAttention(nn.Module):
"""
Gated Structured State Attention for use in encoder-decoder model. See Mega paper for more details. Only
modifications from original implementation are variable names, removing the unnecessary `before_attn_fn` and
`static_kv` arguments, and the stateful representation of incremental decoder state.
"""
def __init__(self, config: MegaConfig):
super().__init__()
self.config = config
self.activation = ACT2FN[self.config.activation]
self.attention_activation = self.config.attention_activation
self.scaling = self.config.shared_representation_size**-0.5 if self.attention_activation == "softmax" else None
self.dropout = MegaDropout(self.config.dropout_prob, is_featurewise=self.config.use_feature_dropout)
self.hidden_dropout = MegaDropout(
self.config.hidden_dropout_prob, is_featurewise=self.config.use_feature_dropout
)
self.attention_dropout = MegaDropout(self.config.attention_probs_dropout_prob, is_featurewise=False)
self.prenorm = self.config.normalize_before_mega
self.norm = MegaSequenceNorm(
self.config.normalization_type, self.config.hidden_size, affine=self.config.norm_affine
)
self.k_proj = nn.Linear(self.config.hidden_size, self.config.shared_representation_size)
self.v_proj = nn.Linear(self.config.hidden_size, self.config.hidden_size)
self.q_proj = nn.Linear(
self.config.hidden_size, 2 * self.config.hidden_size + self.config.shared_representation_size
)
self.h_proj = nn.Linear(self.config.hidden_size, self.config.hidden_size)
if self.config.relative_positional_bias == "simple":
self.rel_pos_bias = MegaSimpleRelativePositionalBias(config)
elif self.config.relative_positional_bias == "rotary":
self.rel_pos_bias = MegaRotaryRelativePositionalBias(config)
else:
raise ValueError("unknown relative position bias: {}".format(self.config.relative_positional_bias))
self.softmax = nn.Softmax(dim=-1)
def element_attention(self, query, key, key_padding_mask, pidx):
bsz, src_len, _ = key.size()
tgt_len = query.size(1) if pidx is None else pidx + 1
if key_padding_mask is not None:
lengths = key_padding_mask.sum(dim=-1).view(bsz, 1, 1)
else:
lengths = src_len
bias = self.rel_pos_bias(max(tgt_len, src_len))[:, :src_len]
if pidx is not None:
if query.size(1) != 1:
raise ValueError("Position offset provided with queries longer than 1 token")
bias = bias[pidx]
else:
bias = bias[:tgt_len]
qk = torch.bmm(query, key.transpose(1, 2)) / lengths + bias
attn_weights = ACT2FN[self.attention_activation](qk).type_as(qk)
if key_padding_mask is not None:
attn_weights = attn_weights * key_padding_mask.unsqueeze(1)
return attn_weights
def softmax_attention(self, query, key, key_padding_mask, pidx):
bsz, src_len, _ = key.size()
tgt_len = query.size(1) if pidx is None else pidx + 1
bias = self.rel_pos_bias(max(tgt_len, src_len))[:, :src_len]
if pidx is not None:
if query.size(1) != 1:
raise ValueError("Position offset provided with queries longer than 1 token")
bias = bias[pidx]
else:
bias = bias[:tgt_len]
query = query * self.scaling
qk = torch.bmm(query, key.transpose(1, 2)) + bias
if key_padding_mask is not None:
qk = qk.masked_fill((1 - key_padding_mask).unsqueeze(1).to(torch.bool), float("-inf"))
attn_weights = self.softmax(qk).type_as(qk)
return attn_weights
def forward(
self,
query,
key: Optional[torch.Tensor],
value: Optional[torch.Tensor],
key_padding_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[Tuple[torch.Tensor]] = None,
output_attentions: bool = False,
use_cache: bool = False,
"""
Pure PyTorch implementation of Mega block; see https://arxiv.org/abs/2209.10655 and original fairseq implementation
at https://github.com/facebookresearch/mega (copyright Meta Research, licensed under MIT License)
Differences from original implementation include hidden state refactor and fixed inconsistency with additive /
multiplicative attention masks
"""
def __init__(self, config: MegaConfig):
super().__init__()
self.config = config
self.activation = ACT2FN[self.config.activation]
self.scaling = (
self.config.shared_representation_size**-0.5 if self.config.attention_activation == "softmax" else None
)
self.dropout = MegaDropout(self.config.dropout_prob, is_featurewise=self.config.use_feature_dropout)
self.hidden_dropout = MegaDropout(
self.config.hidden_dropout_prob, is_featurewise=self.config.use_feature_dropout
)
self.attention_dropout = MegaDropout(self.config.attention_probs_dropout_prob, is_featurewise=False)
self.norm = MegaSequenceNorm(
self.config.normalization_type, self.config.hidden_size, affine=self.config.norm_affine
)
self.ema_gate = MegaMultiDimensionDampedEma(config)
self.v_proj = nn.Linear(self.config.hidden_size, self.config.intermediate_size)
self.mx_proj = nn.Linear(
self.config.hidden_size,
self.config.shared_representation_size + self.config.intermediate_size + 2 * self.config.hidden_size,
)
self.h_proj = nn.Linear(self.config.intermediate_size, self.config.hidden_size)
self.qk_weight = nn.Parameter(torch.Tensor(2, self.config.shared_representation_size))
self.qk_bias = nn.Parameter(torch.Tensor(2, self.config.shared_representation_size))
if self.config.relative_positional_bias == "simple":
self.rel_pos_bias = MegaSimpleRelativePositionalBias(config)
elif self.config.relative_positional_bias == "rotary":
self.rel_pos_bias = MegaRotaryRelativePositionalBias(config)
else:
raise ValueError(f"Unknown relative positional bias: {self.config.relative_positional_bias}")
self.softmax = nn.Softmax(dim=-1)
self.attention_function = (
self.softmax_attention if self.config.attention_activation == "softmax" else self.element_attention
)
def element_attention(self, query, key, padding_mask, causal_mask):
"""
Apply element-wise attention via relu^2 or laplace. Same as original implementation but with standardized
causal attention mask. Expects the Hugging Face standard attention mask paradigm: 1 for not masked, and 0 for
masked.
"""
seq_len = key.size(2)
if padding_mask is not None:
lengths = padding_mask.sum(-1, keepdim=True)
lengths = lengths.clamp(min=1.0).unsqueeze(-1)
else:
lengths = seq_len
if causal_mask is not None:
lengths = causal_mask.sum(dim=-1, keepdim=True)
bias = self.rel_pos_bias(seq_len)
if seq_len != query.size(2):
if query.size(2) != 1:
raise ValueError("Size mismatch between Q and K in element attention")
bias = bias[-1:]
qk = torch.matmul(query, key.transpose(2, 3)) / lengths + bias
attn_weights = ACT2FN[self.config.attention_activation](qk).type_as(qk)
if padding_mask is not None:
attn_weights = attn_weights * padding_mask.unsqueeze(2)
if causal_mask is not None:
attn_weights = attn_weights * causal_mask
return attn_weights
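A rough sketch of the element-wise attention normalization used above, computing squared-ReLU ("relu2") attention by hand; the shapes and the explicit `torch.relu(...) ** 2` are illustrative assumptions, since the model actually looks the activation up in `ACT2FN`:

import torch

# toy shapes: (batch, chunks, positions, dim) as used inside the Mega attention block
query = torch.randn(1, 1, 4, 8)
key = torch.randn(1, 1, 4, 8)
padding_mask = torch.tensor([[[1.0, 1.0, 1.0, 0.0]]])   # (batch, chunks, src): 1 = keep, 0 = pad

# normalize scores by the number of real tokens instead of sqrt(d), as element_attention does
lengths = padding_mask.sum(-1, keepdim=True).clamp(min=1.0).unsqueeze(-1)   # (1, 1, 1, 1)
qk = torch.matmul(query, key.transpose(2, 3)) / lengths                     # (1, 1, 4, 4)

attn_weights = torch.relu(qk) ** 2                        # squared-ReLU ("relu2") attention
attn_weights = attn_weights * padding_mask.unsqueeze(2)   # multiplicatively zero out padded keys
print(attn_weights[..., -1])                              # the padded key column is all zeros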
def softmax_attention(self, query, key, padding_mask, causal_mask):
"Standard softmax self-attention, as in the original Transformer paper"
seq_len = key.size(2)
bias = self.rel_pos_bias(seq_len)
if seq_len != query.size(2):
if query.size(2) != 1:
raise ValueError("Size mismatch between Q and K in softmax attention")
bias = bias[-1:]
query = query * self.scaling
qk = torch.matmul(query, key.transpose(2, 3)) + bias
if causal_mask is not None:
additive_causal_mask = torch.zeros_like(causal_mask, dtype=qk.dtype)
additive_causal_mask = additive_causal_mask.masked_fill((1 - causal_mask).bool(), float("-inf"))
qk = qk + additive_causal_mask
if padding_mask is not None:
padding_mask = 1 - padding_mask
padding_mask_all = padding_mask.all(dim=-1, keepdim=True)
padding_mask = torch.logical_and(padding_mask, ~padding_mask_all)
qk = qk.masked_fill(padding_mask.unsqueeze(2).to(torch.bool), float("-inf"))
attn_weights = self.softmax(qk).type_as(qk)
return attn_weights
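The causal/padding handling above is exactly the inconsistency the class docstring says was fixed: the `[0, 1]` causal mask becomes an additive `-inf` mask, and sequences whose keys are all padding are skipped so the softmax stays finite. A small standalone sketch with invented sizes:

import torch

# HF convention: 1 = real token / visible, 0 = padding / hidden
causal_mask = torch.tril(torch.ones(3, 3, dtype=torch.long))
padding_mask = torch.tensor([[[1, 1, 0]],
                             [[0, 0, 0]]])               # (batch=2, chunks=1, src=3); 2nd row all padding
qk = torch.zeros(2, 1, 3, 3)                             # stand-in attention scores

# multiplicative 0/1 causal mask -> additive -inf mask
additive_causal_mask = torch.zeros_like(causal_mask, dtype=qk.dtype)
additive_causal_mask = additive_causal_mask.masked_fill((1 - causal_mask).bool(), float("-inf"))
qk = qk + additive_causal_mask

# invert the padding mask to "1 = masked", but skip rows that would become entirely -inf
inverted = 1 - padding_mask
all_masked = inverted.all(dim=-1, keepdim=True)
inverted = torch.logical_and(inverted.bool(), ~all_masked)
qk = qk.masked_fill(inverted.unsqueeze(2), float("-inf"))

attn = torch.softmax(qk, dim=-1)
assert not torch.isnan(attn).any()                       # the all-padding sequence stays finite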
def forward(
self,
input,
padding_mask: Optional[torch.Tensor] = None,
causal_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[Tuple[torch.Tensor]] = None,
output_attentions=False,
use_cache=False,
class MegaNormalizedFeedForwardNetwork(nn.Module):
"""
Normalized feed-forward network used in Mega blocks. Left as-is from original Mega repo aside from retrieving args
from Hugging Face config
"""
def __init__(self, config: MegaConfig):
super().__init__()
self.config = config
self.hidden_dim = config.nffn_hidden_size
self.act_fn = config.activation
self.activation = ACT2FN[config.activation]
self.dropout = MegaDropout(self.config.dropout_prob, is_featurewise=self.config.use_feature_dropout)
self.hidden_dropout = MegaDropout(
self.config.nffn_activation_dropout_prob, is_featurewise=self.config.use_feature_dropout
)
self.prenorm = self.config.normalize_before_ffn
self.norm = MegaSequenceNorm(
self.config.normalization_type, self.config.hidden_size, affine=self.config.norm_affine
)
self.fc1 = nn.Linear(self.config.hidden_size, self.config.nffn_hidden_size)
self.fc2 = nn.Linear(self.config.nffn_hidden_size, self.config.hidden_size)
def forward(self, inputs):
residual = inputs
if self.prenorm:
inputs = self.norm(inputs)
hidden = self.activation(self.fc1(inputs))
hidden = self.hidden_dropout(hidden)
output = self.fc2(hidden)
output = self.dropout(output)
output = output + residual
if not self.prenorm:
output = self.norm(output)
return output
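A reduced sketch of the same residual pattern, showing how `normalize_before_ffn` toggles between pre-norm and post-norm; the sizes and the use of `nn.LayerNorm` as a stand-in for `MegaSequenceNorm` are assumptions made for brevity:

import torch
from torch import nn

class TinyNormalizedFFN(nn.Module):
    def __init__(self, hidden_size=8, ffn_size=16, prenorm=True):
        super().__init__()
        self.prenorm = prenorm
        self.norm = nn.LayerNorm(hidden_size)        # stand-in for MegaSequenceNorm
        self.fc1 = nn.Linear(hidden_size, ffn_size)
        self.fc2 = nn.Linear(ffn_size, hidden_size)

    def forward(self, inputs):
        residual = inputs
        if self.prenorm:                              # pre-norm: normalize before the FFN
            inputs = self.norm(inputs)
        output = self.fc2(torch.relu(self.fc1(inputs)))
        output = output + residual                    # residual connection in both variants
        if not self.prenorm:                          # post-norm: normalize after the residual add
            output = self.norm(output)
        return output

x = torch.randn(2, 5, 8)
print(TinyNormalizedFFN(prenorm=False)(x).shape)      # torch.Size([2, 5, 8])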
class MegaBlock(nn.Module):
def __init__(self, config: MegaConfig):
super().__init__()
self.seq_len_dim = 1
self.mega_layer = MegaMovingAverageGatedAttention(config)
self.nffn = MegaNormalizedFeedForwardNetwork(config) if config.use_normalized_ffn else None
self.is_decoder = config.is_decoder
self.add_cross_attention = config.add_cross_attention
if self.add_cross_attention:
if not self.is_decoder:
raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
self.cross_attn = MegaGatedCrossAttention(config)
else:
self.cross_attn = None
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.LongTensor] = None,
causal_mask: Optional[torch.LongTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_value: Optional[Tuple[torch.FloatTensor]] = None,
output_attentions: Optional[bool] = False,
use_cache: bool = False,
class MegaPooler(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.activation = nn.Tanh()
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
first_token_tensor = hidden_states[:, 0]
pooled_output = self.dense(first_token_tensor)
pooled_output = self.activation(pooled_output)
return pooled_output
class MegaPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = MegaConfig
base_model_prefix = "mega"
supports_gradient_checkpointing = False
_no_split_modules = ["MegaMovingAverageGatedAttention"]
def _init_weights(self, module):
"""Initialize the weights"""
if isinstance(module, MegaMultiDimensionDampedEma):
with torch.no_grad():
nn.init.normal_(module.damping_factor, mean=0.0, std=self.config.ema_delta_alpha_range)
nn.init.normal_(module.decay_factor, mean=0.0, std=self.config.ema_delta_alpha_range)
val = torch.ones(self.config.ema_projection_size, 1)
if self.config.ema_projection_size > 1:
idx = torch.tensor(list(range(1, self.config.ema_projection_size, 2)))
val.index_fill_(0, idx, -1.0)
module.ema_expansion_matrix.normal_(mean=0.0, std=self.config.ema_beta_range).add_(val)
nn.init.normal_(module.kernel_projection_matrix, mean=0.0, std=self.config.ema_gamma_omega_range)
nn.init.normal_(module.residual_weight, mean=0.0, std=self.config.ema_gamma_omega_range)
elif isinstance(module, MegaSimpleRelativePositionalBias):
nn.init.normal_(module.rel_pos_bias, mean=0.0, std=self.config.initializer_range)
elif isinstance(module, MegaRotaryRelativePositionalBias):
nn.init.normal_(module.alpha, mean=0.0, std=self.config.initializer_range)
nn.init.normal_(module.b_param, mean=0.0, std=self.config.initializer_range)
elif isinstance(module, MegaScaleNorm):
if self.config.norm_affine:
nn.init.constant_(module.scalar, 1.0)
elif isinstance(module, MegaRMSNorm):
if self.config.norm_affine:
nn.init.constant_(module.weight, 1.0)
elif isinstance(module, MegaMovingAverageGatedAttention):
nn.init.normal_(module.qk_weight, mean=0.0, std=self.config.initializer_range)
nn.init.constant_(module.qk_bias, 0.0)
elif isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
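The EMA branch of `_init_weights` centers `ema_expansion_matrix` on a vector of alternating `+1`/`-1` values built with `index_fill_`; a standalone sketch of just that pattern, with a made-up size and std:

import torch

ema_projection_size = 6
val = torch.ones(ema_projection_size, 1)
# flip the sign at every odd index, exactly as _init_weights does for the EMA expansion matrix
idx = torch.tensor(list(range(1, ema_projection_size, 2)))
val.index_fill_(0, idx, -1.0)
print(val.squeeze(-1))          # tensor([ 1., -1.,  1., -1.,  1., -1.])

# the parameter itself is then Gaussian noise centered on this alternating +1/-1 pattern
ema_expansion = torch.empty(ema_projection_size, 1).normal_(mean=0.0, std=0.02).add_(val)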
MEGA_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`MegaConfig`]): Model configuration class with all the parameters of the
model. Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
MEGA_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `({0})`):
Indices of input sequence tokens in the vocabulary.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values are selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
Segment token indices to indicate the first and second portions of the inputs. Indices are selected in `[0, 1]`:
- 0 corresponds to a *sentence A* token,
- 1 corresponds to a *sentence B* token.
This parameter can only be used when the model is initialized with `add_token_type_embeddings` set to `True`. All values in this tensor should always be < config.type_vocab_size.
[What are token type IDs?](../glossary#token-type-ids)
inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@add_start_docstrings(
"The bare MEGA Model transformer outputting raw hidden-states without any specific head on top.",
MEGA_START_DOCSTRING,
)
class MegaModel(MegaPreTrainedModel):
"""
The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
cross-attention is added after self-attention, following the architecture described in *Mega: Moving Average
Equipped Gated Attention*_ by Xuezhe Ma, Chunting Zhou, Xiang Kong, Junxian He, Liangke Gui, Graham Neubig,
Jonathan May, and Luke Zettlemoyer
To behave as a decoder the model needs to be initialized with the `is_decoder` argument of the configuration set to
`True` and `bidirectional` set to `False`. To be used in a Seq2Seq model, the model needs to be initialized with both
`is_decoder=True` and `bidirectional=False` argument as well as `add_cross_attention` set to `True`; an
`encoder_hidden_states` is then expected as an input to the forward pass.
.. _*Mega: Moving Average Equipped Gated Attention*: https://arxiv.org/abs/2209.10655
"""
def __init__(self, config: MegaConfig, add_pooling_layer=True):
super().__init__(config)
self.config = config
self.embedding_layer = MegaEmbeddings(config)
self.layers = nn.ModuleList([MegaBlock(config) for _ in range(config.num_hidden_layers)])
self.pooler = MegaPooler(config) if add_pooling_layer else None
self.post_init()
def get_input_embeddings(self):
return self.embedding_layer.word_embeddings
def set_input_embeddings(self, value):
self.embedding_layer.word_embeddings = value
@add_start_docstrings_to_model_forward(MEGA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPoolingAndCrossAttentions,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
@add_start_docstrings(
"""MEGA Model with a `language modeling` head on top for CLM fine-tuning.""", MEGA_START_DOCSTRING
)
class MegaForCausalLM(MegaPreTrainedModel):
_tied_weights_keys = ["lm_head.weight"]
def __init__(self, config: MegaConfig):
super().__init__(config)
if not config.is_decoder:
logger.warning("If you want to use `MegaForCausalLM` as a standalone, add `is_decoder=True.`")
self.mega = MegaModel(config, add_pooling_layer=False)
if config.add_lm_hidden_dense_layer:
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.hidden_activation = nn.Tanh()
else:
self.dense = None
self.hidden_activation = None
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size)
self.post_init()
def get_output_embeddings(self):
return self.lm_head
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
past_key_values: Tuple[Tuple[torch.FloatTensor]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, **model_kwargs):
input_shape = input_ids.shape
if attention_mask is None:
attention_mask = input_ids.new_ones(input_shape)
if past_key_values is not None:
input_ids = input_ids[:, -1:]
return {"input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": past_key_values}
def _reorder_cache(self, past_key_values, beam_idx):
reordered_past = ()
for layer_past in past_key_values:
reordered_past += (
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
)
return reordered_past
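A toy illustration of what `_reorder_cache` does during beam search: each cached tensor is re-indexed along the batch dimension so it follows the surviving beams (the shapes and beam indices below are invented):

import torch

# one layer's cache, e.g. (self-attn key, self-attn value), batch dimension first
layer_past = (torch.arange(6.).reshape(3, 2), torch.arange(6., 12.).reshape(3, 2))
past_key_values = (layer_past,)

beam_idx = torch.tensor([2, 0, 0])     # beams 2 and 0 survived; beam 0 was duplicated

reordered = tuple(
    tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past)
    for layer_past in past_key_values
)
print(reordered[0][0])                 # the key cache rows now follow the new beam order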
@add_start_docstrings("""MEGA Model with a `language modeling` head on top.""", MEGA_START_DOCSTRING)
class MegaForMaskedLM(MegaPreTrainedModel):
_tied_weights_keys = ["mlm_head.weight"]
def __init__(self, config: MegaConfig):
super().__init__(config)
if config.is_decoder:
logger.warning(
"If you want to use `MegaForMaskedLM`, set `config.is_decoder=False` for "
"bi-directional self-attention."
)
self.mega = MegaModel(config, add_pooling_layer=False)
if config.add_lm_hidden_dense_layer:
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.hidden_activation = nn.Tanh()
else:
self.dense = None
self.hidden_activation = None
self.mlm_head = nn.Linear(config.hidden_size, config.vocab_size)
self.dropout = nn.Dropout(config.dropout_prob)
self.post_init()
def get_output_embeddings(self):
return self.mlm_head
def set_output_embeddings(self, new_embeddings):
self.mlm_head = new_embeddings
@add_start_docstrings_to_model_forward(MEGA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=MaskedLMOutput,
config_class=_CONFIG_FOR_DOC,
mask="<mask>",
expected_output="' Paris'",
expected_loss=0.1,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], MaskedLMOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
kwargs (`Dict[str, any]`, optional, defaults to *{}*):
Used to hide legacy arguments that have been deprecated.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.mega(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
inputs_embeds=inputs_embeds,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
if self.dense is not None:
sequence_output = self.dense(sequence_output)
sequence_output = self.hidden_activation(sequence_output)
prediction_scores = self.mlm_head(sequence_output)
masked_lm_loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
if not return_dict:
output = (prediction_scores,) + outputs[2:]
return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
return MaskedLMOutput(
loss=masked_lm_loss,
logits=prediction_scores,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
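A hedged usage sketch for the masked-LM head above; the checkpoint name is an assumption (any MEGA checkpoint with an MLM head and a `<mask>` token would do) and downloading it requires network access:

import torch
from transformers import AutoTokenizer, MegaForMaskedLM

checkpoint = "mnaylor/mega-base-wikitext"   # assumed checkpoint name, used here only for illustration
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = MegaForMaskedLM.from_pretrained(checkpoint)

inputs = tokenizer("The capital of France is <mask>.", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits

mask_index = (inputs.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]
predicted_id = logits[0, mask_index].argmax(-1)
print(tokenizer.decode(predicted_id))       # prediction for the masked position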
@add_start_docstrings(
"""
MEGA Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
output) e.g. for GLUE tasks.
""",
MEGA_START_DOCSTRING,
)
class MegaForSequenceClassification(MegaPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.config = config
self.mega = MegaModel(config, add_pooling_layer=False)
self.classifier = MegaClassificationHead(config)
self.post_init()
@add_start_docstrings_to_model_forward(MEGA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=SequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.mega(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
logits = self.classifier(sequence_output)
loss = None
if labels is not None:
if self.config.problem_type is None:
if self.num_labels == 1:
self.config.problem_type = "regression"
elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
self.config.problem_type = "single_label_classification"
else:
self.config.problem_type = "multi_label_classification"
if self.config.problem_type == "regression":
loss_fct = MSELoss()
if self.num_labels == 1:
loss = loss_fct(logits.squeeze(), labels.squeeze())
else:
loss = loss_fct(logits, labels)
elif self.config.problem_type == "single_label_classification":
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(logits, labels)
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return SequenceClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
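The loss branch above infers `problem_type` from `num_labels` and the label dtype. A compact sketch of the same decision table in isolation; the helper function is illustrative and not part of the model:

import torch
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

def sequence_classification_loss(logits, labels, num_labels):
    # mirrors the problem_type inference used in MegaForSequenceClassification.forward
    if num_labels == 1:                                               # regression
        return MSELoss()(logits.squeeze(), labels.squeeze().float())
    if labels.dtype in (torch.long, torch.int):                       # single-label classification
        return CrossEntropyLoss()(logits.view(-1, num_labels), labels.view(-1))
    return BCEWithLogitsLoss()(logits, labels)                        # multi-label classification

logits = torch.randn(4, 3)
print(sequence_classification_loss(logits, torch.tensor([0, 2, 1, 1]), num_labels=3))
print(sequence_classification_loss(logits, torch.randint(0, 2, (4, 3)).float(), num_labels=3))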
@add_start_docstrings(
"""
MEGA Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
softmax) e.g. for RocStories/SWAG tasks.
""",
MEGA_START_DOCSTRING,
)
class MegaForMultipleChoice(MegaPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.mega = MegaModel(config)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, 1)
self.post_init()
@add_start_docstrings_to_model_forward(MEGA_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=MultipleChoiceModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], MultipleChoiceModelOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
`input_ids` above)
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
flat_inputs_embeds = (
inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
if inputs_embeds is not None
else None
)
outputs = self.mega(
flat_input_ids,
token_type_ids=flat_token_type_ids,
attention_mask=flat_attention_mask,
inputs_embeds=flat_inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
pooled_output = outputs[1]
pooled_output = self.dropout(pooled_output)
logits = self.classifier(pooled_output)
reshaped_logits = logits.view(-1, num_choices)
loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
loss = loss_fct(reshaped_logits, labels)
if not return_dict:
output = (reshaped_logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return MultipleChoiceModelOutput(
loss=loss,
logits=reshaped_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
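The multiple-choice head flattens `(batch, num_choices, seq_len)` inputs into `(batch * num_choices, seq_len)` before the encoder call and folds the per-choice scores back afterwards; a shape-only sketch with invented sizes:

import torch

batch_size, num_choices, seq_len, hidden = 2, 4, 7, 16
input_ids = torch.randint(0, 100, (batch_size, num_choices, seq_len))

flat_input_ids = input_ids.view(-1, input_ids.size(-1))          # (8, 7): one row per choice
pooled_output = torch.randn(flat_input_ids.size(0), hidden)      # stand-in for the pooler output
logits = torch.nn.Linear(hidden, 1)(pooled_output)               # one score per (example, choice)
reshaped_logits = logits.view(-1, num_choices)                   # back to (2, 4) for CrossEntropyLoss
print(reshaped_logits.shape)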
"""
MEGA Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
Named-Entity-Recognition (NER) tasks.
"""
@add_start_docstrings(
"""
MEGA Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
Named-Entity-Recognition (NER) tasks.
""",
MEGA_START_DOCSTRING,
)
class MegaForTokenClassification(MegaPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.mega = MegaModel(config, add_pooling_layer=False)
classifier_dropout = (
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
self.dropout = nn.Dropout(classifier_dropout)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.post_init()
@add_start_docstrings_to_model_forward(MEGA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TokenClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.mega(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
sequence_output = self.dropout(sequence_output)
logits = self.classifier(sequence_output)
loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return TokenClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
class MegaClassificationHead(nn.Module):
"""用于句子级分类任务的头部模块。"""
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
classifier_dropout = (
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
self.dropout = nn.Dropout(classifier_dropout)
self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
def forward(self, features, **kwargs):
x = features[:, 0, :]
x = self.dropout(x)
x = self.dense(x)
x = torch.tanh(x)
x = self.dropout(x)
x = self.out_proj(x)
return x
@add_start_docstrings(
"""
MEGA Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
MEGA_START_DOCSTRING,
)
class MegaForQuestionAnswering(MegaPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.mega = MegaModel(config, add_pooling_layer=False)
self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
self.post_init()
@add_start_docstrings_to_model_forward(MEGA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=QuestionAnsweringModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
start_positions: Optional[torch.LongTensor] = None,
end_positions: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], QuestionAnsweringModelOutput]:
r"""
start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.mega(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
logits = self.qa_outputs(sequence_output)
start_logits, end_logits = logits.split(1, dim=-1)
start_logits = start_logits.squeeze(-1).contiguous()
end_logits = end_logits.squeeze(-1).contiguous()
total_loss = None
if start_positions is not None and end_positions is not None:
if len(start_positions.size()) > 1:
start_positions = start_positions.squeeze(-1)
if len(end_positions.size()) > 1:
end_positions = end_positions.squeeze(-1)
ignored_index = start_logits.size(1)
start_positions = start_positions.clamp(0, ignored_index)
end_positions = end_positions.clamp(0, ignored_index)
loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
start_loss = loss_fct(start_logits, start_positions)
end_loss = loss_fct(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2
if not return_dict:
output = (start_logits, end_logits) + outputs[2:]
return ((total_loss,) + output) if total_loss is not None else output
return QuestionAnsweringModelOutput(
loss=total_loss,
start_logits=start_logits,
end_logits=end_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
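Finally, a sketch of the span-extraction post-processing above: the QA head emits two logits per token, which are split into start/end scores, and gold positions outside the sequence are clamped to an ignored index (toy sizes are assumptions):

import torch
from torch.nn import CrossEntropyLoss

batch, seq_len = 2, 10
logits = torch.randn(batch, seq_len, 2)                    # stand-in for the qa_outputs projection

start_logits, end_logits = logits.split(1, dim=-1)
start_logits = start_logits.squeeze(-1).contiguous()       # (2, 10)
end_logits = end_logits.squeeze(-1).contiguous()

start_positions = torch.tensor([3, 50])                    # 50 is deliberately out of range
end_positions = torch.tensor([5, 60])
ignored_index = start_logits.size(1)
start_positions = start_positions.clamp(0, ignored_index)  # out-of-range labels become `ignored_index`
end_positions = end_positions.clamp(0, ignored_index)

loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
total_loss = (loss_fct(start_logits, start_positions) + loss_fct(end_logits, end_positions)) / 2
print(total_loss)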