Transformers Source Code Walkthrough (95)
.\models\roberta\convert_roberta_original_pytorch_checkpoint_to_pytorch.py
import argparse
import pathlib
import fairseq
import torch
from fairseq.models.roberta import RobertaModel as FairseqRobertaModel
from fairseq.modules import TransformerSentenceEncoderLayer
from packaging import version
from transformers import RobertaConfig, RobertaForMaskedLM, RobertaForSequenceClassification
from transformers.models.bert.modeling_bert import (
BertIntermediate,
BertLayer,
BertOutput,
BertSelfAttention,
BertSelfOutput,
)
from transformers.utils import logging
if version.parse(fairseq.__version__) < version.parse("0.9.0"):
raise Exception("requires fairseq >= 0.9.0")
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
SAMPLE_TEXT = "Hello world! cécé herlolip"
def convert_roberta_checkpoint_to_pytorch(
roberta_checkpoint_path: str, pytorch_dump_folder_path: str, classification_head: bool
):
"""
复制/粘贴/调整 RoBERTa 的权重以适应我们的 BERT 结构。
"""
roberta = FairseqRobertaModel.from_pretrained(roberta_checkpoint_path)
roberta.eval()
roberta_sent_encoder = roberta.model.encoder.sentence_encoder
config = RobertaConfig(
vocab_size=roberta_sent_encoder.embed_tokens.num_embeddings,
hidden_size=roberta.args.encoder_embed_dim,
num_hidden_layers=roberta.args.encoder_layers,
num_attention_heads=roberta.args.encoder_attention_heads,
intermediate_size=roberta.args.encoder_ffn_embed_dim,
max_position_embeddings=514,
type_vocab_size=1,
layer_norm_eps=1e-5,
)
if classification_head:
config.num_labels = roberta.model.classification_heads["mnli"].out_proj.weight.shape[0]
print("Our BERT config:", config)
model = RobertaForSequenceClassification(config) if classification_head else RobertaForMaskedLM(config)
model.eval()
model.roberta.embeddings.word_embeddings.weight = roberta_sent_encoder.embed_tokens.weight
model.roberta.embeddings.position_embeddings.weight = roberta_sent_encoder.embed_positions.weight
model.roberta.embeddings.token_type_embeddings.weight.data = torch.zeros_like(
model.roberta.embeddings.token_type_embeddings.weight
)
model.roberta.embeddings.LayerNorm.weight = roberta_sent_encoder.emb_layer_norm.weight
model.roberta.embeddings.LayerNorm.bias = roberta_sent_encoder.emb_layer_norm.bias
for i in range(config.num_hidden_layers):
layer: BertLayer = model.roberta.encoder.layer[i]
roberta_layer: TransformerSentenceEncoderLayer = roberta_sent_encoder.layers[i]
self_attn: BertSelfAttention = layer.attention.self
assert (
roberta_layer.self_attn.k_proj.weight.data.shape
== roberta_layer.self_attn.q_proj.weight.data.shape
== roberta_layer.self_attn.v_proj.weight.data.shape
== torch.Size((config.hidden_size, config.hidden_size))
)
self_attn.query.weight.data = roberta_layer.self_attn.q_proj.weight
self_attn.query.bias.data = roberta_layer.self_attn.q_proj.bias
self_attn.key.weight.data = roberta_layer.self_attn.k_proj.weight
self_attn.key.bias.data = roberta_layer.self_attn.k_proj.bias
self_attn.value.weight.data = roberta_layer.self_attn.v_proj.weight
self_attn.value.bias.data = roberta_layer.self_attn.v_proj.bias
self_output: BertSelfOutput = layer.attention.output
assert self_output.dense.weight.shape == roberta_layer.self_attn.out_proj.weight.shape
self_output.dense.weight = roberta_layer.self_attn.out_proj.weight
self_output.dense.bias = roberta_layer.self_attn.out_proj.bias
self_output.LayerNorm.weight = roberta_layer.self_attn_layer_norm.weight
self_output.LayerNorm.bias = roberta_layer.self_attn_layer_norm.bias
intermediate: BertIntermediate = layer.intermediate
assert intermediate.dense.weight.shape == roberta_layer.fc1.weight.shape
intermediate.dense.weight = roberta_layer.fc1.weight
intermediate.dense.bias = roberta_layer.fc1.bias
bert_output: BertOutput = layer.output
assert bert_output.dense.weight.shape == roberta_layer.fc2.weight.shape
bert_output.dense.weight = roberta_layer.fc2.weight
bert_output.dense.bias = roberta_layer.fc2.bias
bert_output.LayerNorm.weight = roberta_layer.final_layer_norm.weight
bert_output.LayerNorm.bias = roberta_layer.final_layer_norm.bias
if classification_head:
model.classifier.dense.weight = roberta.model.classification_heads["mnli"].dense.weight
model.classifier.dense.bias = roberta.model.classification_heads["mnli"].dense.bias
model.classifier.out_proj.weight = roberta.model.classification_heads["mnli"].out_proj.weight
model.classifier.out_proj.bias = roberta.model.classification_heads["mnli"].out_proj.bias
else:
model.lm_head.dense.weight = roberta.model.encoder.lm_head.dense.weight
model.lm_head.dense.bias = roberta.model.encoder.lm_head.dense.bias
model.lm_head.layer_norm.weight = roberta.model.encoder.lm_head.layer_norm.weight
model.lm_head.layer_norm.bias = roberta.model.encoder.lm_head.layer_norm.bias
model.lm_head.decoder.weight = roberta.model.encoder.lm_head.weight
model.lm_head.decoder.bias = roberta.model.encoder.lm_head.bias
input_ids: torch.Tensor = roberta.encode(SAMPLE_TEXT).unsqueeze(0)
our_output = model(input_ids)[0]
if classification_head:
their_output = roberta.model.classification_heads["mnli"](roberta.extract_features(input_ids))
else:
their_output = roberta.model(input_ids)[0]
print(our_output.shape, their_output.shape)
max_absolute_diff = torch.max(torch.abs(our_output - their_output)).item()
print(f"max_absolute_diff = {max_absolute_diff}")
success = torch.allclose(our_output, their_output, atol=1e-3)
print("Do both models output the same tensors?", "🔥" if success else "💩")
if not success:
raise Exception("Something went wRoNg")
pathlib.Path(pytorch_dump_folder_path).mkdir(parents=True, exist_ok=True)
print(f"Saving model to {pytorch_dump_folder_path}")
model.save_pretrained(pytorch_dump_folder_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--roberta_checkpoint_path", default=None, type=str, required=True, help="Path the official PyTorch dump."
)
parser.add_argument(
"--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
)
parser.add_argument(
"--classification_head", action="store_true", help="Whether to convert a final classification head."
)
args = parser.parse_args()
convert_roberta_checkpoint_to_pytorch(
args.roberta_checkpoint_path, args.pytorch_dump_folder_path, args.classification_head
)
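For reference, here is a hedged sketch of how this conversion script is typically driven. It is not part of the file above; the paths are placeholders for a locally downloaded fairseq checkpoint directory and an output folder, and only the CLI flags defined above are real.
# Illustrative usage only (paths are hypothetical):
#
#   python convert_roberta_original_pytorch_checkpoint_to_pytorch.py \
#       --roberta_checkpoint_path ./roberta.base \
#       --pytorch_dump_folder_path ./converted-roberta-base
#
# Afterwards the converted folder can be loaded with the regular transformers API:
from transformers import RobertaForMaskedLM, RobertaTokenizer
model = RobertaForMaskedLM.from_pretrained("./converted-roberta-base")
tokenizer = RobertaTokenizer.from_pretrained("FacebookAI/roberta-base")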
.\models\roberta\modeling_flax_roberta.py
ROBERTA_START_DOCSTRING = r"""
This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading, saving and converting weights from PyTorch models)
This model is also a
[flax.linen.Module](https://flax.readthedocs.io/en/latest/api_reference/flax.linen/module.html) subclass. Use it as
a regular Flax linen Module and refer to the Flax documentation for all matter related to general usage and
behavior.
Finally, this model supports inherent JAX features such as:
- [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
- [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
- [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
- [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)
Parameters:
config ([`RobertaConfig`]): Model configuration class with all the parameters of the
model. Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights.
# Long docstring describing the input arguments accepted by the RoBERTa models
ROBERTA_INPUTS_DOCSTRING = r"""
Args:
input_ids (`numpy.ndarray` of shape `({0})`):
Indices of input sequence tokens in the vocabulary.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary
attention_mask (`numpy.ndarray` of shape `({0})`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary
token_type_ids (`numpy.ndarray` of shape `({0})`, *optional*):
Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
1]`:
- 0 corresponds to a *sentence A* token,
- 1 corresponds to a *sentence B* token.
[What are token type IDs?](../glossary
position_ids (`numpy.ndarray` of shape `({0})`, *optional*):
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
config.max_position_embeddings - 1]`.
head_mask (`numpy.ndarray` of shape `({0})`, *optional*):
Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
# FlaxRobertaEmbeddings: an nn.Module that builds the model's input embeddings
class FlaxRobertaEmbeddings(nn.Module):
"""Construct the embeddings from word, position and token_type embeddings."""
# the model configuration, typed as RobertaConfig
config: RobertaConfig
# dtype used for the computation, jnp.float32 by default
dtype: jnp.dtype = jnp.float32  # the dtype of the computation
def setup(self):
# word embedding table mapping vocabulary ids to hidden-size vectors
self.word_embeddings = nn.Embed(
self.config.vocab_size,
self.config.hidden_size,
embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
dtype=self.dtype,
)
# position embedding table mapping position ids to hidden-size vectors
self.position_embeddings = nn.Embed(
self.config.max_position_embeddings,
self.config.hidden_size,
embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
dtype=self.dtype,
)
# token-type (segment) embedding table mapping type ids to hidden-size vectors
self.token_type_embeddings = nn.Embed(
self.config.type_vocab_size,
self.config.hidden_size,
embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
dtype=self.dtype,
)
# layer normalization with the configured epsilon
self.LayerNorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype)
# dropout with the configured hidden dropout rate
self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob)
def __call__(self, input_ids, token_type_ids, position_ids, attention_mask, deterministic: bool = True):
# look up word embeddings for the input ids
inputs_embeds = self.word_embeddings(input_ids.astype("i4"))
# look up position embeddings for the position ids
position_embeds = self.position_embeddings(position_ids.astype("i4"))
# look up token-type embeddings for the segment ids
token_type_embeddings = self.token_type_embeddings(token_type_ids.astype("i4"))
# sum the three embedding contributions
hidden_states = inputs_embeds + token_type_embeddings + position_embeds
# apply layer normalization
hidden_states = self.LayerNorm(hidden_states)
# apply dropout
hidden_states = self.dropout(hidden_states, deterministic=deterministic)
# return the final hidden representation
return hidden_states
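Note that RoBERTa computes position ids differently from BERT: positions start at padding_idx + 1 and pad tokens keep the padding position, which is why max_position_embeddings is 514 rather than 512. The snippet below is a standalone, simplified 2-D sketch of the idea behind create_position_ids_from_input_ids (used further down in this file); the helper name is mine, not the library's.
import jax.numpy as jnp

def roberta_position_ids(input_ids, padding_idx=1):
    # non-pad tokens get consecutive positions starting at padding_idx + 1;
    # pad tokens keep position == padding_idx
    mask = (input_ids != padding_idx).astype("i4")
    return jnp.cumsum(mask, axis=1) * mask + padding_idx

ids = jnp.array([[0, 31414, 232, 2, 1, 1]])  # <s> ... </s> <pad> <pad>
print(roberta_position_ids(ids))             # [[2 3 4 5 1 1]]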
# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertSelfAttention with Bert -> Roberta
class FlaxRobertaSelfAttention(nn.Module):
# class attributes: the model config, `causal` toggling causal (decoder-style) attention,
# and `dtype`, the data type used for the computation (jnp.float32 by default)
config: RobertaConfig
causal: bool = False
dtype: jnp.dtype = jnp.float32  # the dtype of the computation
def setup(self):
# dimensionality of each attention head
self.head_dim = self.config.hidden_size // self.config.num_attention_heads
# hidden_size must be divisible by num_attention_heads, otherwise raise a ValueError
if self.config.hidden_size % self.config.num_attention_heads != 0:
raise ValueError(
f"`config.hidden_size`: {self.config.hidden_size} has to be a multiple of "
f"`config.num_attention_heads`: {self.config.num_attention_heads}"
)
# query, key and value projection layers, each of output size hidden_size, with normal-initialized weights
self.query = nn.Dense(
self.config.hidden_size,
dtype=self.dtype,
kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
)
self.key = nn.Dense(
self.config.hidden_size,
dtype=self.dtype,
kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
)
self.value = nn.Dense(
self.config.hidden_size,
dtype=self.dtype,
kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
)
# when causal=True, build a causal mask so that self-attention cannot look at future positions
if self.causal:
self.causal_mask = make_causal_mask(
jnp.ones((1, self.config.max_position_embeddings), dtype="bool"), dtype="bool"
)
# split the hidden states into attention heads, giving shape (batch_size, seq_length, num_attention_heads, head_dim)
def _split_heads(self, hidden_states):
return hidden_states.reshape(hidden_states.shape[:2] + (self.config.num_attention_heads, self.head_dim))
# merge the attention heads back into the hidden states, giving shape (batch_size, seq_length, hidden_size)
def _merge_heads(self, hidden_states):
return hidden_states.reshape(hidden_states.shape[:2] + (self.config.hidden_size,))
@nn.compact
# Copied from transformers.models.bart.modeling_flax_bart.FlaxBartAttention._concatenate_to_cache
def _concatenate_to_cache(self, key, value, query, attention_mask):
"""
This function takes projected key, value states from a single input token and concatenates the states to cached
states from previous steps. This function is slightly adapted from the official Flax repository:
https://github.com/google/flax/blob/491ce18759622506588784b4fca0e4bf05f8c8cd/flax/linen/attention.py
"""
# check whether the cache has already been initialized
is_initialized = self.has_variable("cache", "cached_key")
# fetch or create the cached keys and values, zero-initialized to match the input shape and dtype
cached_key = self.variable("cache", "cached_key", jnp.zeros, key.shape, key.dtype)
cached_value = self.variable("cache", "cached_value", jnp.zeros, value.shape, value.dtype)
# fetch or create the cache index, starting at integer 0
cache_index = self.variable("cache", "cache_index", lambda: jnp.array(0, dtype=jnp.int32))
if is_initialized:
# unpack the batch dims, maximum length, number of heads and per-head depth
*batch_dims, max_length, num_heads, depth_per_head = cached_key.value.shape
# update the key and value caches with the new 1d spatial slices
cur_index = cache_index.value
indices = (0,) * len(batch_dims) + (cur_index, 0, 0)
key = lax.dynamic_update_slice(cached_key.value, key, indices)
value = lax.dynamic_update_slice(cached_value.value, value, indices)
# write the updated keys and values back into the cache
cached_key.value = key
cached_value.value = value
# advance the cache index by the number of cache vectors just written
num_updated_cache_vectors = query.shape[1]
cache_index.value = cache_index.value + num_updated_cache_vectors
# causal mask for the cached decoder self-attention: a query position should only attend to
# key positions that have already been generated and cached, not the remaining zero entries
pad_mask = jnp.broadcast_to(
jnp.arange(max_length) < cur_index + num_updated_cache_vectors,
tuple(batch_dims) + (1, num_updated_cache_vectors, max_length),
)
# combine the causal pad mask with the incoming attention mask
attention_mask = combine_masks(pad_mask, attention_mask)
# return the updated key, value and attention mask
return key, value, attention_mask
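To make the cache bookkeeping above concrete, here is a small standalone toy example (not part of the model code) of how lax.dynamic_update_slice writes the key of the current decoding step into a preallocated cache buffer at the current cache index.
import jax.numpy as jnp
from jax import lax

# toy cache of shape (batch=1, max_length=4, num_heads=1, head_dim=2), already holding 2 steps
cached_key = jnp.zeros((1, 4, 1, 2)).at[:, :2].set(1.0)
new_key = jnp.full((1, 1, 1, 2), 7.0)   # projected key of the single current token
cur_index = 2                            # current value of the "cache_index" variable

updated = lax.dynamic_update_slice(cached_key, new_key, (0, cur_index, 0, 0))
# positions 0-1 keep the old keys, position 2 now holds new_key, position 3 is still empty;
# the pad_mask above would be jnp.arange(4) < 3, i.e. [True, True, True, False]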
# output projection applied after RoBERTa's self-attention
class FlaxRobertaSelfOutput(nn.Module):
config: RobertaConfig  # the RoBERTa model configuration
dtype: jnp.dtype = jnp.float32  # the dtype of the computation
def setup(self):
# dense layer projecting back to hidden_size, with normal-initialized weights
self.dense = nn.Dense(
self.config.hidden_size,
kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
dtype=self.dtype,
)
# LayerNorm used to normalize the hidden states
self.LayerNorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype)
# dropout layer for regularization
self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob)
def __call__(self, hidden_states, input_tensor, deterministic: bool = True):
# project the hidden states
hidden_states = self.dense(hidden_states)
# apply dropout
hidden_states = self.dropout(hidden_states, deterministic=deterministic)
# add the residual connection and apply LayerNorm
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
# RoBERTa attention block: self-attention followed by its output projection
class FlaxRobertaAttention(nn.Module):
config: RobertaConfig  # the RoBERTa model configuration
causal: bool = False  # whether to use causal attention
dtype: jnp.dtype = jnp.float32  # the dtype of the computation
def setup(self):
# the self-attention layer
self.self = FlaxRobertaSelfAttention(self.config, causal=self.causal, dtype=self.dtype)
# the output projection applied to the attention results
self.output = FlaxRobertaSelfOutput(self.config, dtype=self.dtype)
def __call__(
self,
hidden_states,
attention_mask,
layer_head_mask,
key_value_states=None,
init_cache=False,
deterministic=True,
output_attentions: bool = False,
):
# run self-attention (or cross-attention when key_value_states is given)
attn_outputs = self.self(
hidden_states,
attention_mask,
layer_head_mask=layer_head_mask,
key_value_states=key_value_states,
init_cache=init_cache,
deterministic=deterministic,
output_attentions=output_attentions,
)
# the attention output is the first element
attn_output = attn_outputs[0]
# project the attention output and combine it with the residual hidden states
hidden_states = self.output(attn_output, hidden_states, deterministic=deterministic)
outputs = (hidden_states,)
if output_attentions:
outputs += (attn_outputs[1],)  # also return the attention weights when requested
return outputs
# 定义一个用于Roberta模型中间层的类
class FlaxRobertaIntermediate(nn.Module):
config: RobertaConfig # Roberta模型的配置信息
dtype: jnp.dtype = jnp.float32 # 计算时使用的数据类型
def setup(self):
# 定义一个全连接层,将隐藏状态映射到中间大小,使用正态分布初始化权重
self.dense = nn.Dense(
self.config.intermediate_size,
kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
dtype=self.dtype,
)
# 激活函数,根据配置选择激活函数类型
self.activation = ACT2FN[self.config.hidden_act]
# 定义类的方法 __call__,用于对输入的 hidden_states 进行处理并返回结果
def __call__(self, hidden_states):
# 将输入的 hidden_states 通过全连接层 dense 进行线性变换
hidden_states = self.dense(hidden_states)
# 将线性变换后的结果通过激活函数 activation 进行非线性变换
hidden_states = self.activation(hidden_states)
# 返回经过线性变换和激活函数处理后的 hidden_states 结果
return hidden_states
# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertOutput with Bert -> Roberta
class FlaxRobertaOutput(nn.Module):
config: RobertaConfig # Roberta 模型的配置对象
dtype: jnp.dtype = jnp.float32 # 计算过程中使用的数据类型
def setup(self):
# 定义全连接层,将隐藏状态映射到指定大小的输出空间
self.dense = nn.Dense(
self.config.hidden_size,
kernel_init=jax.nn.initializers.normal(self.config.initializer_range), # 使用正态分布初始化权重
dtype=self.dtype,
)
# 定义 Dropout 层,用于随机屏蔽神经元,防止过拟合
self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob)
# 定义 LayerNorm 层,用于归一化隐藏状态
self.LayerNorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype)
def __call__(self, hidden_states, attention_output, deterministic: bool = True):
# 全连接层处理隐藏状态
hidden_states = self.dense(hidden_states)
# 应用 Dropout 层
hidden_states = self.dropout(hidden_states, deterministic=deterministic)
# 应用 LayerNorm 层,并将注意力输出加到处理后的隐藏状态上
hidden_states = self.LayerNorm(hidden_states + attention_output)
return hidden_states
# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertLayer with Bert -> Roberta
class FlaxRobertaLayer(nn.Module):
config: RobertaConfig # Roberta 模型的配置对象
dtype: jnp.dtype = jnp.float32 # 计算过程中使用的数据类型
def setup(self):
# 定义 Roberta 自注意力层
self.attention = FlaxRobertaAttention(self.config, causal=self.config.is_decoder, dtype=self.dtype)
# 定义 Roberta 中间层
self.intermediate = FlaxRobertaIntermediate(self.config, dtype=self.dtype)
# 定义 Roberta 输出层
self.output = FlaxRobertaOutput(self.config, dtype=self.dtype)
# 如果配置中包含跨注意力机制,则定义 Roberta 交叉注意力层
if self.config.add_cross_attention:
self.crossattention = FlaxRobertaAttention(self.config, causal=False, dtype=self.dtype)
def __call__(
self,
hidden_states,
attention_mask,
layer_head_mask,
encoder_hidden_states: Optional[jnp.ndarray] = None,
encoder_attention_mask: Optional[jnp.ndarray] = None,
init_cache: bool = False,
deterministic: bool = True,
output_attentions: bool = False,
):
# Self Attention
# run self-attention over the hidden states, passing the attention mask and related arguments
attention_outputs = self.attention(
hidden_states,
attention_mask,
layer_head_mask=layer_head_mask,
init_cache=init_cache,
deterministic=deterministic,
output_attentions=output_attentions,
)
# 获取注意力计算的输出
attention_output = attention_outputs[0]
# Cross-Attention Block
# 如果存在编码器的隐藏状态,执行交叉注意力计算
if encoder_hidden_states is not None:
# 使用 self.crossattention 方法进行交叉注意力计算,传入自注意力输出、编码器注意力掩码、编码器隐藏状态等参数
cross_attention_outputs = self.crossattention(
attention_output,
attention_mask=encoder_attention_mask,
layer_head_mask=layer_head_mask,
key_value_states=encoder_hidden_states,
deterministic=deterministic,
output_attentions=output_attentions,
)
# 获取交叉注意力计算的输出作为最终的注意力输出
attention_output = cross_attention_outputs[0]
# 使用 self.intermediate 方法进行隐藏状态的中间层处理
hidden_states = self.intermediate(attention_output)
# 使用 self.output 方法生成最终的输出隐藏状态
hidden_states = self.output(hidden_states, attention_output, deterministic=deterministic)
# 输出为隐藏状态的元组
outputs = (hidden_states,)
# 如果需要输出注意力权重信息
if output_attentions:
# 将自注意力的注意力权重添加到输出元组中
outputs += (attention_outputs[1],)
# 如果存在编码器的隐藏状态,将交叉注意力的注意力权重也添加到输出元组中
if encoder_hidden_states is not None:
outputs += (cross_attention_outputs[1],)
# 返回最终的输出元组
return outputs
# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertLayerCollection with Bert -> Roberta
class FlaxRobertaLayerCollection(nn.Module):
config: RobertaConfig  # configuration of type RobertaConfig
dtype: jnp.dtype = jnp.float32  # the dtype of the computation
gradient_checkpointing: bool = False  # whether to use gradient checkpointing
def setup(self):
if self.gradient_checkpointing:
# with gradient checkpointing enabled, wrap FlaxRobertaLayer with remat
FlaxRobertaCheckpointLayer = remat(FlaxRobertaLayer, static_argnums=(5, 6, 7))
# build the list of checkpointed layers, each named by its string index
self.layers = [
FlaxRobertaCheckpointLayer(self.config, name=str(i), dtype=self.dtype)
for i in range(self.config.num_hidden_layers)
]
else:
# otherwise build plain FlaxRobertaLayer instances, each named by its string index
self.layers = [
FlaxRobertaLayer(self.config, name=str(i), dtype=self.dtype)
for i in range(self.config.num_hidden_layers)
]
def __call__(
self,
hidden_states,
attention_mask,
head_mask,
encoder_hidden_states: Optional[jnp.ndarray] = None,
encoder_attention_mask: Optional[jnp.ndarray] = None,
init_cache: bool = False,
deterministic: bool = True,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
):
# 初始化空元组或 None,根据 output_attentions 的值确定是否返回注意力信息
all_attentions = () if output_attentions else None
# 初始化空元组或 None,根据 output_hidden_states 的值确定是否返回隐藏状态信息
all_hidden_states = () if output_hidden_states else None
# 初始化空元组或 None,根据 output_attentions 和 encoder_hidden_states 的值确定是否返回交叉注意力信息
all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
# 检查 head_mask 是否正确指定了每层的屏蔽信息
if head_mask is not None:
if head_mask.shape[0] != (len(self.layers)):
# 抛出异常,提示 head_mask 应该对应于 self.layers 的层数
raise ValueError(
f"The head_mask should be specified for {len(self.layers)} layers, but it is for "
f"{head_mask.shape[0]}."
)
# 遍历所有层,进行前向传播计算
for i, layer in enumerate(self.layers):
# 如果输出隐藏状态信息,则将当前隐藏状态加入到 all_hidden_states 中
if output_hidden_states:
all_hidden_states += (hidden_states,)
# 调用当前层的前向传播函数
layer_outputs = layer(
hidden_states,
attention_mask,
head_mask[i] if head_mask is not None else None,
encoder_hidden_states,
encoder_attention_mask,
init_cache,
deterministic,
output_attentions,
)
# 更新当前隐藏状态为当前层的输出的第一个元素(通常是隐藏状态)
hidden_states = layer_outputs[0]
# 如果输出注意力信息,则将当前层的注意力加入到 all_attentions 中
if output_attentions:
all_attentions += (layer_outputs[1],)
# 如果同时存在 encoder_hidden_states,则将当前层的交叉注意力加入到 all_cross_attentions 中
if encoder_hidden_states is not None:
all_cross_attentions += (layer_outputs[2],)
# 如果输出隐藏状态信息,则将最终的隐藏状态加入到 all_hidden_states 中
if output_hidden_states:
all_hidden_states += (hidden_states,)
# 整理最终的输出结果
outputs = (hidden_states, all_hidden_states, all_attentions, all_cross_attentions)
# 如果 return_dict 为 False,则返回 outputs 中非 None 的部分作为元组
if not return_dict:
return tuple(v for v in outputs if v is not None)
# 如果 return_dict 为 True,则将输出整理成 FlaxBaseModelOutputWithPastAndCrossAttentions 类的对象返回
return FlaxBaseModelOutputWithPastAndCrossAttentions(
last_hidden_state=hidden_states,
hidden_states=all_hidden_states,
attentions=all_attentions,
cross_attentions=all_cross_attentions,
)
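The remat wrapping in the setup above trades compute for memory: a rematerialized layer recomputes its intermediate activations during the backward pass instead of storing them, and static_argnums marks the plain-Python arguments (here init_cache, deterministic, output_attentions) that must stay static. Below is a minimal standalone sketch with names of my own choosing, not library code.
import jax
import jax.numpy as jnp
import flax.linen as nn

class Block(nn.Module):
    features: int = 4

    @nn.compact
    def __call__(self, x, scale, use_bias: bool):
        # use_bias is a plain Python bool, so it must be declared static for remat
        return nn.Dense(self.features, use_bias=use_bias)(x) * scale

# argument index 2 (use_bias) is static, mirroring static_argnums=(5, 6, 7) above
RematBlock = nn.remat(Block, static_argnums=(2,))
x = jnp.ones((2, 4))
params = RematBlock(features=4).init(jax.random.PRNGKey(0), x, 1.0, True)
y = RematBlock(features=4).apply(params, x, 1.0, True)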
# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertEncoder with Bert -> Roberta
class FlaxRobertaEncoder(nn.Module):
config: RobertaConfig  # configuration of type RobertaConfig
dtype: jnp.dtype = jnp.float32  # the dtype of the computation
gradient_checkpointing: bool = False  # whether to use gradient checkpointing
def setup(self):
self.layer = FlaxRobertaLayerCollection( # 初始化 FlaxRobertaLayerCollection 实例
self.config, # 使用给定的配置信息
dtype=self.dtype, # 使用指定的数据类型
gradient_checkpointing=self.gradient_checkpointing, # 梯度检查点设置
)
def __call__( # 定义对象调用时的行为
self,
hidden_states, # 输入的隐藏状态张量
attention_mask, # 注意力掩码张量
head_mask, # 头部掩码张量
encoder_hidden_states: Optional[jnp.ndarray] = None, # 编码器隐藏状态(可选)
encoder_attention_mask: Optional[jnp.ndarray] = None, # 编码器注意力掩码(可选)
init_cache: bool = False, # 是否初始化缓存
deterministic: bool = True, # 是否确定性计算
output_attentions: bool = False, # 是否输出注意力权重
output_hidden_states: bool = False, # 是否输出隐藏状态
return_dict: bool = True, # 是否以字典形式返回结果
):
return self.layer( # 调用 FlaxRobertaLayerCollection 的前向传播
hidden_states,
attention_mask,
head_mask=head_mask,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
init_cache=init_cache,
deterministic=deterministic,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertPooler with Bert -> Roberta
class FlaxRobertaPooler(nn.Module):
config: RobertaConfig  # configuration of type RobertaConfig
dtype: jnp.dtype = jnp.float32  # the dtype of the computation
def setup(self):
self.dense = nn.Dense(  # dense projection used for pooling
self.config.hidden_size,  # output size equals the configured hidden size
kernel_init=jax.nn.initializers.normal(self.config.initializer_range),  # normal-initialized weights
dtype=self.dtype,  # computation dtype
)
def __call__(self, hidden_states):
cls_hidden_state = hidden_states[:, 0]  # take the hidden state of the first (<s>) token of each sample
cls_hidden_state = self.dense(cls_hidden_state)  # project it through the dense layer
return nn.tanh(cls_hidden_state)  # return the tanh-activated result
class FlaxRobertaLMHead(nn.Module):
config: RobertaConfig # 使用 RobertaConfig 类型的配置信息
dtype: jnp.dtype = jnp.float32 # 计算时的数据类型为 jnp.float32
bias_init: Callable[..., np.ndarray] = jax.nn.initializers.zeros # 偏置初始化函数为零初始化
def setup(self):
self.dense = nn.Dense( # 初始化第一个密集连接层
self.config.hidden_size, # 输出大小为配置中的隐藏大小
dtype=self.dtype, # 使用指定的数据类型
kernel_init=jax.nn.initializers.normal(self.config.initializer_range), # 使用正态分布初始化权重
)
self.layer_norm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype) # 初始化 LayerNorm 层
self.decoder = nn.Dense( # 初始化解码器密集连接层
self.config.vocab_size, # 输出大小为词汇表大小
dtype=self.dtype, # 使用指定的数据类型
use_bias=False, # 不使用偏置
kernel_init=jax.nn.initializers.normal(self.config.initializer_range), # 使用正态分布初始化权重
)
self.bias = self.param("bias", self.bias_init, (self.config.vocab_size,)) # 初始化偏置参数
# forward pass: takes the hidden states and an optional shared embedding matrix
def __call__(self, hidden_states, shared_embedding=None):
# linear transformation of the hidden states
hidden_states = self.dense(hidden_states)
# GELU activation
hidden_states = ACT2FN["gelu"](hidden_states)
# layer normalization
hidden_states = self.layer_norm(hidden_states)
# if a shared embedding matrix is provided, use it as the decoder's kernel (weight tying)
if shared_embedding is not None:
# call the decoder with the transposed shared embedding injected as its kernel parameter
hidden_states = self.decoder.apply({"params": {"kernel": shared_embedding.T}}, hidden_states)
else:
# otherwise use the decoder's own parameters
hidden_states = self.decoder(hidden_states)
# cast the bias to the module's dtype
bias = jnp.asarray(self.bias, self.dtype)
# add the bias to the logits
hidden_states += bias
# return the final hidden states (vocabulary logits)
return hidden_states
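The shared_embedding branch above is how weight tying is implemented in Flax: instead of binding its own kernel, the decoder Dense is applied with the transposed word-embedding matrix injected as its parameter. Below is a standalone toy illustration of that mechanism; the shapes and values are made up.
import jax.numpy as jnp
import flax.linen as nn

vocab_size, hidden = 10, 4
decoder = nn.Dense(vocab_size, use_bias=False)
shared_embedding = jnp.ones((vocab_size, hidden))   # stands in for word_embeddings.embedding
hidden_states = jnp.ones((1, 3, hidden))

# the kernel of a Dense layer has shape (in_features, out_features), hence the transpose
logits = decoder.apply({"params": {"kernel": shared_embedding.T}}, hidden_states)
print(logits.shape)  # (1, 3, 10)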
class FlaxRobertaClassificationHead(nn.Module):
config: RobertaConfig # 定义一个属性 config,类型为 RobertaConfig,用于存储配置信息
dtype: jnp.dtype = jnp.float32 # 定义一个属性 dtype,默认为 jnp.float32
def setup(self):
self.dense = nn.Dense(
self.config.hidden_size, # 使用 config 中的 hidden_size 初始化一个全连接层
dtype=self.dtype, # 指定数据类型为 dtype
kernel_init=jax.nn.initializers.normal(self.config.initializer_range), # 使用正态分布初始化权重
)
classifier_dropout = (
self.config.classifier_dropout # 获取 config 中的 classifier_dropout 属性
if self.config.classifier_dropout is not None # 如果不为 None,则使用该值
else self.config.hidden_dropout_prob # 否则使用 config 中的 hidden_dropout_prob 属性的值
)
self.dropout = nn.Dropout(rate=classifier_dropout) # 使用 classifier_dropout 率初始化一个 Dropout 层
self.out_proj = nn.Dense(
self.config.num_labels, # 使用 config 中的 num_labels 初始化一个全连接层
dtype=self.dtype, # 指定数据类型为 dtype
kernel_init=jax.nn.initializers.normal(self.config.initializer_range), # 使用正态分布初始化权重
)
def __call__(self, hidden_states, deterministic=True):
hidden_states = hidden_states[:, 0, :] # 仅保留每个样本的第一个 token 的隐藏状态
hidden_states = self.dropout(hidden_states, deterministic=deterministic) # 应用 Dropout
hidden_states = self.dense(hidden_states) # 全连接层处理隐藏状态
hidden_states = nn.tanh(hidden_states) # 使用双曲正切作为激活函数
hidden_states = self.dropout(hidden_states, deterministic=deterministic) # 再次应用 Dropout
hidden_states = self.out_proj(hidden_states) # 使用最后一个全连接层进行最终的分类预测
return hidden_states
class FlaxRobertaPreTrainedModel(FlaxPreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = RobertaConfig # 类属性,指定配置类为 RobertaConfig
base_model_prefix = "roberta" # 类属性,指定基础模型前缀为 "roberta"
module_class: nn.Module = None # 类属性,用于存储模块类,默认为 None
def __init__(
self,
config: RobertaConfig,
input_shape: Tuple = (1, 1),
seed: int = 0,
dtype: jnp.dtype = jnp.float32,
_do_init: bool = True,
gradient_checkpointing: bool = False,
**kwargs,
):
module = self.module_class(config=config, dtype=dtype, gradient_checkpointing=gradient_checkpointing, **kwargs)
super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init)
# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertPreTrainedModel.enable_gradient_checkpointing
def enable_gradient_checkpointing(self):
self._module = self.module_class(
config=self.config, # 使用当前实例的 config 属性初始化模块
dtype=self.dtype, # 使用当前实例的 dtype 属性指定数据类型
gradient_checkpointing=True, # 启用梯度检查点
)
# 初始化权重方法,用于模型参数初始化
def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple, params: FrozenDict = None) -> FrozenDict:
# 初始化输入张量
input_ids = jnp.zeros(input_shape, dtype="i4")
# token_type_ids初始化为与input_ids相同形状的全1张量
token_type_ids = jnp.ones_like(input_ids)
# 根据input_ids创建position_ids,并用config中的pad_token_id进行填充
position_ids = create_position_ids_from_input_ids(input_ids, self.config.pad_token_id)
# attention_mask初始化为与input_ids相同形状的全1张量
attention_mask = jnp.ones_like(input_ids)
# head_mask初始化为形状为(config.num_hidden_layers, config.num_attention_heads)的全1张量
head_mask = jnp.ones((self.config.num_hidden_layers, self.config.num_attention_heads))
# 划分随机数生成器rng,分为params和dropout两个部分
params_rng, dropout_rng = jax.random.split(rng)
rngs = {"params": params_rng, "dropout": dropout_rng}
# 如果config中包含cross-attention,初始化encoder_hidden_states和encoder_attention_mask
if self.config.add_cross_attention:
encoder_hidden_states = jnp.zeros(input_shape + (self.config.hidden_size,))
encoder_attention_mask = attention_mask
# 使用module的init方法初始化模块,传入必要参数,不返回字典
module_init_outputs = self.module.init(
rngs,
input_ids,
attention_mask,
token_type_ids,
position_ids,
head_mask,
encoder_hidden_states,
encoder_attention_mask,
return_dict=False,
)
else:
# 使用module的init方法初始化模块,传入必要参数,不返回字典
module_init_outputs = self.module.init(
rngs, input_ids, attention_mask, token_type_ids, position_ids, head_mask, return_dict=False
)
# 从初始化输出中获取随机参数
random_params = module_init_outputs["params"]
# 如果传入了params,则将随机参数与params进行融合
if params is not None:
random_params = flatten_dict(unfreeze(random_params))
params = flatten_dict(unfreeze(params))
for missing_key in self._missing_keys:
# 将随机参数中缺失的键添加到params中
params[missing_key] = random_params[missing_key]
self._missing_keys = set()
# 返回融合后的冻结params
return freeze(unflatten_dict(params))
else:
# 否则直接返回随机参数
return random_params
# cache-initialization helper, copied from the corresponding method elsewhere in the transformers library
def init_cache(self, batch_size, max_length):
r"""
Args:
batch_size (`int`):
batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache.
max_length (`int`):
maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized cache.
"""
# 初始化用于检索缓存的输入变量
input_ids = jnp.ones((batch_size, max_length), dtype="i4")
# attention_mask初始化为与input_ids相同形状的全1张量
attention_mask = jnp.ones_like(input_ids, dtype="i4")
# position_ids根据input_ids广播而来,形状与input_ids相同
position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape)
# 使用module的init方法初始化模块,传入必要参数,返回不包含字典的初始化变量
init_variables = self.module.init(
jax.random.PRNGKey(0), input_ids, attention_mask, position_ids, return_dict=False, init_cache=True
)
# 返回解冻的初始化变量中的cache
return unfreeze(init_variables["cache"])
# add the inputs docstring to the model's forward (__call__) method
@add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
# __call__ makes the model instance callable like a function
def __call__(
self,
# input token IDs fed to the model
input_ids,
# attention mask marking which positions the model should attend to
attention_mask=None,
# token type IDs distinguishing tokens of different segments
token_type_ids=None,
# position IDs giving each token's position in the sequence
position_ids=None,
# head mask controlling which attention heads are active
head_mask=None,
# encoder hidden states, used when the model acts as a decoder
encoder_hidden_states=None,
# encoder attention mask marking which encoder positions to attend to
encoder_attention_mask=None,
# optional dict of model parameters
params: dict = None,
# PRNG key used for dropout
dropout_rng: jax.random.PRNGKey = None,
# whether we are in training mode (controls dropout)
train: bool = False,
# whether to return the attention matrices
output_attentions: Optional[bool] = None,
# whether to return the hidden states
output_hidden_states: Optional[bool] = None,
# whether to return a ModelOutput dict instead of a plain tuple
return_dict: Optional[bool] = None,
# cached key/value states used for fast auto-regressive decoding
past_key_values: dict = None,
):
# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertModule with Bert -> Roberta
class FlaxRobertaModule(nn.Module):
config: RobertaConfig
dtype: jnp.dtype = jnp.float32 # 计算中的数据类型
add_pooling_layer: bool = True # 是否添加池化层
gradient_checkpointing: bool = False # 是否使用梯度检查点
def setup(self):
self.embeddings = FlaxRobertaEmbeddings(self.config, dtype=self.dtype) # 初始化Roberta模型的嵌入层
self.encoder = FlaxRobertaEncoder(
self.config,
dtype=self.dtype,
gradient_checkpointing=self.gradient_checkpointing,
) # 初始化Roberta模型的编码器
self.pooler = FlaxRobertaPooler(self.config, dtype=self.dtype) # 初始化Roberta模型的池化层
def __call__(
self,
input_ids,
attention_mask,
token_type_ids: Optional[jnp.ndarray] = None,
position_ids: Optional[jnp.ndarray] = None,
head_mask: Optional[jnp.ndarray] = None,
encoder_hidden_states: Optional[jnp.ndarray] = None,
encoder_attention_mask: Optional[jnp.ndarray] = None,
init_cache: bool = False,
deterministic: bool = True,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
):
# 当token_type_ids未传递时,确保其正确初始化为零数组
if token_type_ids is None:
token_type_ids = jnp.zeros_like(input_ids)
# 当position_ids未传递时,确保其正确初始化为广播到适当形状的数组
if position_ids is None:
position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape)
# 使用嵌入层处理输入,生成隐藏状态
hidden_states = self.embeddings(
input_ids, token_type_ids, position_ids, attention_mask, deterministic=deterministic
)
# 使用编码器处理隐藏状态,返回模型输出
outputs = self.encoder(
hidden_states,
attention_mask,
head_mask=head_mask,
deterministic=deterministic,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
init_cache=init_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
hidden_states = outputs[0]  # hidden states returned by the encoder
# apply the pooling layer when one was requested, otherwise leave `pooled` as None
pooled = self.pooler(hidden_states) if self.add_pooling_layer else None
# if a plain tuple is requested instead of a dict
if not return_dict:
# don't include `pooled` when it is None
if pooled is None:
return (hidden_states,) + outputs[1:]
return (hidden_states, pooled) + outputs[1:]
# return the base model output with pooling and cross attentions
return FlaxBaseModelOutputWithPoolingAndCrossAttentions(
last_hidden_state=hidden_states,
pooler_output=pooled,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
cross_attentions=outputs.cross_attentions,
)
@add_start_docstrings(
"The bare RoBERTa Model transformer outputting raw hidden-states without any specific head on top.",
ROBERTA_START_DOCSTRING,
)
class FlaxRobertaModel(FlaxRobertaPreTrainedModel):
module_class = FlaxRobertaModule
# attach an example call docstring to the model class
append_call_sample_docstring(FlaxRobertaModel, _CHECKPOINT_FOR_DOC, FlaxBaseModelOutputWithPooling, _CONFIG_FOR_DOC)
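As a quick sanity check of the bare model class, the usual usage pattern looks roughly like this (the checkpoint name is the one used by this file's docstring constants); this snippet is illustrative and not part of the library source.
from transformers import AutoTokenizer, FlaxRobertaModel

tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
model = FlaxRobertaModel.from_pretrained("FacebookAI/roberta-base")

inputs = tokenizer("Hello world!", return_tensors="np")
outputs = model(**inputs)
print(outputs.last_hidden_state.shape)  # (1, sequence_length, 768)
print(outputs.pooler_output.shape)      # (1, 768)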
class FlaxRobertaForMaskedLMModule(nn.Module):
config: RobertaConfig
dtype: jnp.dtype = jnp.float32
gradient_checkpointing: bool = False
def setup(self):
# 初始化 RoBERTa 模型
self.roberta = FlaxRobertaModule(
config=self.config,
add_pooling_layer=False,
dtype=self.dtype,
gradient_checkpointing=self.gradient_checkpointing,
)
# 初始化 RoBERTa 语言模型头部
self.lm_head = FlaxRobertaLMHead(config=self.config, dtype=self.dtype)
def __call__(
self,
input_ids,
attention_mask,
token_type_ids,
position_ids,
head_mask,
deterministic: bool = True,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
):
# 调用 RoBERTa 模型,获取模型输出
outputs = self.roberta(
input_ids,
attention_mask,
token_type_ids,
position_ids,
head_mask,
deterministic=deterministic,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
hidden_states = outputs[0]
# fetch the shared word embedding matrix when the config ties input and output embeddings
if self.config.tie_word_embeddings:
shared_embedding = self.roberta.variables["params"]["embeddings"]["word_embeddings"]["embedding"]
else:
shared_embedding = None
# compute the prediction scores (logits) over the vocabulary
logits = self.lm_head(hidden_states, shared_embedding=shared_embedding)
# 根据 return_dict 决定返回的格式
if not return_dict:
return (logits,) + outputs[1:]
# 返回 RoBERTa 对象的输出作为 MaskedLM 的输出
return FlaxMaskedLMOutput(
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
@add_start_docstrings("""RoBERTa Model with a `language modeling` head on top.""", ROBERTA_START_DOCSTRING)
class FlaxRobertaForMaskedLM(FlaxRobertaPreTrainedModel):
module_class = FlaxRobertaForMaskedLMModule
# 调用函数向模型类添加示例文档字符串
append_call_sample_docstring(
FlaxRobertaForMaskedLM,
_CHECKPOINT_FOR_DOC,
FlaxBaseModelOutputWithPooling,
_CONFIG_FOR_DOC,
mask="<mask>",
)
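Here is a rough fill-mask example for the masked-LM head defined above; it is a standalone sketch that simply picks the highest-scoring token at the <mask> position, without beam search or top-k filtering.
from transformers import AutoTokenizer, FlaxRobertaForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
model = FlaxRobertaForMaskedLM.from_pretrained("FacebookAI/roberta-base")

inputs = tokenizer("The capital of France is <mask>.", return_tensors="np")
logits = model(**inputs).logits

mask_index = int((inputs["input_ids"][0] == tokenizer.mask_token_id).argmax())
predicted_id = int(logits[0, mask_index].argmax(-1))
print(tokenizer.decode([predicted_id]))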
class FlaxRobertaForSequenceClassificationModule(nn.Module):
config: RobertaConfig
dtype: jnp.dtype = jnp.float32
gradient_checkpointing: bool = False
def setup(self):
# 初始化 RoBERTa 模型
self.roberta = FlaxRobertaModule(
config=self.config,
dtype=self.dtype,
add_pooling_layer=False,
gradient_checkpointing=self.gradient_checkpointing,
)
# 初始化 RoBERTa 序列分类头部
self.classifier = FlaxRobertaClassificationHead(config=self.config, dtype=self.dtype)
# 定义一个方法,使得对象可以像函数一样被调用,接受多个输入参数和多个关键字参数
def __call__(
self,
input_ids,
attention_mask,
token_type_ids,
position_ids,
head_mask,
deterministic: bool = True, # 控制模型行为的布尔参数,默认为True
output_attentions: bool = False, # 是否输出注意力权重的布尔参数,默认为False
output_hidden_states: bool = False, # 是否输出隐藏状态的布尔参数,默认为False
return_dict: bool = True, # 是否返回结果字典的布尔参数,默认为True
):
# 使用预训练模型进行前向传播
outputs = self.roberta(
input_ids,
attention_mask,
token_type_ids,
position_ids,
head_mask,
deterministic=deterministic,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# 提取模型输出的序列输出(通常是最后一层隐藏状态)
sequence_output = outputs[0]
# 使用分类器对序列输出进行分类,得到预测的逻辑回归结果
logits = self.classifier(sequence_output, deterministic=deterministic)
# 如果不要求返回结果字典,则返回一个元组,包含逻辑回归结果和其他输出
if not return_dict:
return (logits,) + outputs[1:]
# 否则,返回一个FlaxSequenceClassifierOutput对象,包含逻辑回归结果、隐藏状态和注意力权重
return FlaxSequenceClassifierOutput(
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
@add_start_docstrings(
"""
Roberta Model transformer with a sequence classification/regression head on top (a linear layer on top of the
pooled output) e.g. for GLUE tasks.
""",
ROBERTA_START_DOCSTRING,
)
# 使用装饰器添加文档字符串,描述这是一个在Roberta模型基础上构建的序列分类/回归模型,顶部有一个线性层作为池化输出的一部分,例如用于GLUE任务。
class FlaxRobertaForSequenceClassification(FlaxRobertaPreTrainedModel):
module_class = FlaxRobertaForSequenceClassificationModule
append_call_sample_docstring(
FlaxRobertaForSequenceClassification,
_CHECKPOINT_FOR_DOC,
FlaxSequenceClassifierOutput,
_CONFIG_FOR_DOC,
)
# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertForMultipleChoiceModule with Bert->Roberta, self.bert->self.roberta
class FlaxRobertaForMultipleChoiceModule(nn.Module):
config: RobertaConfig
dtype: jnp.dtype = jnp.float32
gradient_checkpointing: bool = False
def setup(self):
# 初始化Roberta模块,包括配置、数据类型和梯度检查点设置
self.roberta = FlaxRobertaModule(
config=self.config,
dtype=self.dtype,
gradient_checkpointing=self.gradient_checkpointing,
)
# 使用配置中的隐藏层dropout率初始化dropout层
self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob)
# 初始化分类器层,输出为1,数据类型与配置中的隐藏层一致
self.classifier = nn.Dense(1, dtype=self.dtype)
def __call__(
self,
input_ids,
attention_mask,
token_type_ids,
position_ids,
head_mask,
deterministic: bool = True,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
):
# number of answer choices per example
num_choices = input_ids.shape[1]
# flatten the (batch_size, num_choices, seq_len) inputs so every choice is scored as its own sequence
input_ids = input_ids.reshape(-1, input_ids.shape[-1]) if input_ids is not None else None
attention_mask = attention_mask.reshape(-1, attention_mask.shape[-1]) if attention_mask is not None else None
token_type_ids = token_type_ids.reshape(-1, token_type_ids.shape[-1]) if token_type_ids is not None else None
position_ids = position_ids.reshape(-1, position_ids.shape[-1]) if position_ids is not None else None
# 调用Roberta模型
outputs = self.roberta(
input_ids,
attention_mask,
token_type_ids,
position_ids,
head_mask,
deterministic=deterministic,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# take the pooled output
pooled_output = outputs[1]
# apply dropout to the pooled output (behavior controlled by `deterministic`)
pooled_output = self.dropout(pooled_output, deterministic=deterministic)
# pass the result through the classifier to get one logit per flattened sequence
logits = self.classifier(pooled_output)
# reshape the logits back to (batch_size, num_choices)
reshaped_logits = logits.reshape(-1, num_choices)
if not return_dict:
# return a plain tuple when no dict output is requested
return (reshaped_logits,) + outputs[2:]
# return the multiple-choice output, including logits, hidden states and attentions
return FlaxMultipleChoiceModelOutput(
logits=reshaped_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
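The reshape dance in the module above is easiest to see on toy shapes: every (example, choice) pair is scored as an independent sequence and the per-sequence scores are folded back into one row per example. A standalone illustration, with arbitrary numbers:
import jax.numpy as jnp

batch_size, num_choices, seq_len = 2, 3, 5
input_ids = jnp.arange(batch_size * num_choices * seq_len).reshape(batch_size, num_choices, seq_len)

flat = input_ids.reshape(-1, input_ids.shape[-1])   # (6, 5): fed through RoBERTa in one batch
logits = jnp.zeros((flat.shape[0], 1))              # the classifier yields one score per flattened sequence
reshaped_logits = logits.reshape(-1, num_choices)   # (2, 3): one score per candidate choice
print(flat.shape, reshaped_logits.shape)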
@add_start_docstrings(
"""
Roberta Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
softmax) e.g. for RocStories/SWAG tasks.
""",
# ROBERTA_START_DOCSTRING supplies the shared model-level documentation
ROBERTA_START_DOCSTRING,
)
# FlaxRobertaForMultipleChoice just points its module_class at FlaxRobertaForMultipleChoiceModule
class FlaxRobertaForMultipleChoice(FlaxRobertaPreTrainedModel):
module_class = FlaxRobertaForMultipleChoiceModule
# overwrite the call docstring of FlaxRobertaForMultipleChoice, formatting ROBERTA_INPUTS_DOCSTRING for multiple-choice inputs
overwrite_call_docstring(
FlaxRobertaForMultipleChoice, ROBERTA_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")
)
# append an example call docstring to FlaxRobertaForMultipleChoice
append_call_sample_docstring(
FlaxRobertaForMultipleChoice,
_CHECKPOINT_FOR_DOC,
FlaxMultipleChoiceModelOutput,
_CONFIG_FOR_DOC,
)
# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertForTokenClassificationModule with Bert->Roberta, self.bert->self.roberta
class FlaxRobertaForTokenClassificationModule(nn.Module):
config: RobertaConfig
dtype: jnp.dtype = jnp.float32
gradient_checkpointing: bool = False
def setup(self):
# 使用 FlaxRobertaModule 创建 self.roberta,配置为不添加池化层,是否进行梯度检查点由 self.gradient_checkpointing 控制
self.roberta = FlaxRobertaModule(
config=self.config,
dtype=self.dtype,
add_pooling_layer=False,
gradient_checkpointing=self.gradient_checkpointing,
)
# 根据配置设置分类器的 dropout 率
classifier_dropout = (
self.config.classifier_dropout
if self.config.classifier_dropout is not None
else self.config.hidden_dropout_prob
)
# 创建 dropout 层
self.dropout = nn.Dropout(rate=classifier_dropout)
# 创建分类器,输出维度为 self.config.num_labels
self.classifier = nn.Dense(self.config.num_labels, dtype=self.dtype)
def __call__(
self,
input_ids,
attention_mask,
token_type_ids,
position_ids,
head_mask,
deterministic: bool = True,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
):
# 调用 self.roberta 进行模型前向传播
outputs = self.roberta(
input_ids,
attention_mask,
token_type_ids,
position_ids,
head_mask,
deterministic=deterministic,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# 获取模型的隐藏状态
hidden_states = outputs[0]
# 对隐藏状态应用 dropout
hidden_states = self.dropout(hidden_states, deterministic=deterministic)
# 对应用 dropout 后的隐藏状态应用分类器,得到 logits
logits = self.classifier(hidden_states)
# 如果不返回字典,则返回 logits 以及 outputs 中的其余部分
if not return_dict:
return (logits,) + outputs[1:]
# 返回 FlaxTokenClassifierOutput 对象,包含 logits、隐藏状态和注意力权重
return FlaxTokenClassifierOutput(
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
# 为 FlaxRobertaForTokenClassification 类添加起始文档字符串,描述其为在隐藏状态之上具有标记分类头的 Roberta 模型,例如用于命名实体识别 (NER) 任务
@add_start_docstrings(
"""
Roberta Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
Named-Entity-Recognition (NER) tasks.
""",
ROBERTA_START_DOCSTRING,
)
class FlaxRobertaForTokenClassification(FlaxRobertaPreTrainedModel):
module_class = FlaxRobertaForTokenClassificationModule
# 向 FlaxRobertaForTokenClassification 类的示例调用文档字符串附加示例代码和相关说明
append_call_sample_docstring(
FlaxRobertaForTokenClassification,
_CHECKPOINT_FOR_DOC,
FlaxTokenClassifierOutput,
_CONFIG_FOR_DOC,
)
# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertForQuestionAnsweringModule with Bert->Roberta, self.bert->self.roberta
class FlaxRobertaForQuestionAnsweringModule(nn.Module):
config: RobertaConfig
dtype: jnp.dtype = jnp.float32
gradient_checkpointing: bool = False
def setup(self):
# 初始化Roberta模型作为self.roberta,不包含池化层,支持梯度检查点
self.roberta = FlaxRobertaModule(
config=self.config,
dtype=self.dtype,
add_pooling_layer=False,
gradient_checkpointing=self.gradient_checkpointing,
)
# 初始化用于QA任务的输出层self.qa_outputs,包含num_labels个输出单元
self.qa_outputs = nn.Dense(self.config.num_labels, dtype=self.dtype)
def __call__(
self,
input_ids,
attention_mask,
token_type_ids,
position_ids,
head_mask,
deterministic: bool = True,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
):
# 调用Roberta模型self.roberta进行前向传播
outputs = self.roberta(
input_ids,
attention_mask,
token_type_ids,
position_ids,
head_mask,
deterministic=deterministic,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# take the hidden states from the outputs
hidden_states = outputs[0]
# run the hidden states through self.qa_outputs to get start- and end-position logits
logits = self.qa_outputs(hidden_states)
start_logits, end_logits = jnp.split(logits, self.config.num_labels, axis=-1)
start_logits = start_logits.squeeze(-1)
end_logits = end_logits.squeeze(-1)
# if no dict is requested, return the logits together with any extra outputs
if not return_dict:
return (start_logits, end_logits) + outputs[1:]
# return the QA model output with start/end logits, hidden states and attentions
return FlaxQuestionAnsweringModelOutput(
start_logits=start_logits,
end_logits=end_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
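Downstream, the start/end logits produced above are usually turned into an answer span. Below is a deliberately naive standalone sketch of that post-processing; it picks the argmax of each head independently and ignores invalid or out-of-context spans.
import jax.numpy as jnp

start_logits = jnp.array([[0.1, 2.0, 0.3, 0.1]])
end_logits = jnp.array([[0.0, 0.2, 0.1, 3.0]])

start_index = int(start_logits[0].argmax())
end_index = int(end_logits[0].argmax())
answer_token_span = (start_index, end_index)  # would be mapped back to text with the tokenizer's offsets
print(answer_token_span)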
@add_start_docstrings(
"""
为抽取式问答任务(如SQuAD)设计的Roberta模型,顶部有一个用于span分类的线性层,
用于计算`span start logits`和`span end logits`的隐藏状态输出。
""",
ROBERTA_START_DOCSTRING,
)
class FlaxRobertaForQuestionAnswering(FlaxRobertaPreTrainedModel):
# 使用FlaxRobertaForQuestionAnsweringModule作为模型类
module_class = FlaxRobertaForQuestionAnsweringModule
# 添加调用示例的文档字符串
append_call_sample_docstring(
FlaxRobertaForQuestionAnswering,
_CHECKPOINT_FOR_DOC,
FlaxQuestionAnsweringModelOutput,
_CONFIG_FOR_DOC,
)
class FlaxRobertaForCausalLMModule(nn.Module):
config: RobertaConfig
dtype: jnp.dtype = jnp.float32
gradient_checkpointing: bool = False
def setup(self):
# 初始化Roberta模型作为self.roberta,不包含池化层,支持梯度检查点
self.roberta = FlaxRobertaModule(
config=self.config,
add_pooling_layer=False,
dtype=self.dtype,
gradient_checkpointing=self.gradient_checkpointing,
)
# 初始化用于Causal LM任务的LM头部self.lm_head
self.lm_head = FlaxRobertaLMHead(config=self.config, dtype=self.dtype)
# 定义一个特殊方法 __call__,用于实现对象的可调用行为
def __call__(
self,
input_ids,
attention_mask,
position_ids,
token_type_ids: Optional[jnp.ndarray] = None,
head_mask: Optional[jnp.ndarray] = None,
encoder_hidden_states: Optional[jnp.ndarray] = None,
encoder_attention_mask: Optional[jnp.ndarray] = None,
init_cache: bool = False,
deterministic: bool = True,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
):
# 调用模型的主体部分
outputs = self.roberta(
input_ids,
attention_mask,
token_type_ids,
position_ids,
head_mask,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
init_cache=init_cache,
deterministic=deterministic,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# 从模型输出中获取隐藏状态
hidden_states = outputs[0]
# 如果配置允许词嵌入共享
if self.config.tie_word_embeddings:
# 获取共享的词嵌入层
shared_embedding = self.roberta.variables["params"]["embeddings"]["word_embeddings"]["embedding"]
else:
shared_embedding = None
# 计算预测分数(logits)
logits = self.lm_head(hidden_states, shared_embedding=shared_embedding)
# 如果不要求返回字典形式的结果,则返回元组形式的输出
if not return_dict:
return (logits,) + outputs[1:]
# 返回带有交叉注意力的因果语言模型输出对象
return FlaxCausalLMOutputWithCrossAttentions(
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
cross_attentions=outputs.cross_attentions,
)
"""
Roberta Model with a language modeling head on top (a linear layer on top of the hidden-states output) e.g for
autoregressive tasks.
"""
# FlaxRobertaForCausalLM: RoBERTa with a language-modeling head on top, intended for autoregressive (causal) generation tasks
@add_start_docstrings(
"""
Roberta Model with a language modeling head on top (a linear layer on top of the hidden-states output) e.g for
autoregressive tasks.
""",
ROBERTA_START_DOCSTRING,
)
class FlaxRobertaForCausalLM(FlaxRobertaPreTrainedModel):
# 设置模型的主体类为 FlaxRobertaForCausalLMModule
module_class = FlaxRobertaForCausalLMModule
# prepare the inputs used during generation, including the initialized cache and attention mask
def prepare_inputs_for_generation(self, input_ids, max_length, attention_mask: Optional[jax.Array] = None):
# read off the batch size and sequence length of the inputs
batch_size, seq_length = input_ids.shape
# initialize the past_key_values cache
past_key_values = self.init_cache(batch_size, max_length)
# Note: usually one would have to put 0s in the attention_mask for positions beyond input_ids.shape[-1]
# and before cache_length. But since the decoder uses a causal mask, those positions are masked anyway.
# So a single static attention_mask can be created here, which is more efficient to compile.
extended_attention_mask = jnp.ones((batch_size, max_length), dtype="i4")
if attention_mask is not None:
# derive the position IDs from the attention mask
position_ids = attention_mask.cumsum(axis=-1) - 1
# copy the provided attention_mask into the static extended_attention_mask
extended_attention_mask = lax.dynamic_update_slice(extended_attention_mask, attention_mask, (0, 0))
else:
# when no attention_mask is given, broadcast sequential position IDs
position_ids = jnp.broadcast_to(jnp.arange(seq_length, dtype="i4")[None, :], (batch_size, seq_length))
return {
"past_key_values": past_key_values,
"attention_mask": extended_attention_mask,
"position_ids": position_ids,
}
# update the generation inputs between steps: carry over past_key_values and advance position_ids
def update_inputs_for_generation(self, model_outputs, model_kwargs):
model_kwargs["past_key_values"] = model_outputs.past_key_values
model_kwargs["position_ids"] = model_kwargs["position_ids"][:, -1:] + 1
return model_kwargs
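These two hooks are driven by generate() from the Flax generation mixin. Below is a heavily hedged sketch of the plumbing: FacebookAI/roberta-base is not trained autoregressively, so the generated text itself is meaningless, and loading it into the causal-LM class will emit weight-initialization warnings; the point is only to show where prepare_inputs_for_generation and update_inputs_for_generation come into play.
from transformers import AutoTokenizer, FlaxRobertaForCausalLM

tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
model = FlaxRobertaForCausalLM.from_pretrained("FacebookAI/roberta-base", is_decoder=True)

inputs = tokenizer("Hello", return_tensors="np")
# generate() calls prepare_inputs_for_generation once, then update_inputs_for_generation per step
outputs = model.generate(inputs["input_ids"], max_length=10, pad_token_id=tokenizer.pad_token_id)
print(tokenizer.decode(outputs.sequences[0]))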
# append an example call docstring, specifying the model class, checkpoint, output class and config
append_call_sample_docstring(
FlaxRobertaForCausalLM,
_CHECKPOINT_FOR_DOC,
FlaxCausalLMOutputWithCrossAttentions,
_CONFIG_FOR_DOC,
)
.\models\roberta\modeling_roberta.py
"""PyTorch RoBERTa model."""
import math
from typing import List, Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN, gelu
from ...modeling_outputs import (
BaseModelOutputWithPastAndCrossAttentions,
BaseModelOutputWithPoolingAndCrossAttentions,
CausalLMOutputWithCrossAttentions,
MaskedLMOutput,
MultipleChoiceModelOutput,
QuestionAnsweringModelOutput,
SequenceClassifierOutput,
TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import (
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_roberta import RobertaConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "FacebookAI/roberta-base"
_CONFIG_FOR_DOC = "RobertaConfig"
ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST = [
"FacebookAI/roberta-base",
"FacebookAI/roberta-large",
"FacebookAI/roberta-large-mnli",
"distilbert/distilroberta-base",
"openai-community/roberta-base-openai-detector",
"openai-community/roberta-large-openai-detector",
]
class RobertaEmbeddings(nn.Module):
"""
Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
"""
def __init__(self, config):
super().__init__()
self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
self.register_buffer(
"token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
)
self.padding_idx = config.pad_token_id
self.position_embeddings = nn.Embedding(
config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
)
def forward(
self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0
):
if position_ids is None:
if input_ids is not None:
position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length)
else:
position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)
if input_ids is not None:
input_shape = input_ids.size()
else:
input_shape = inputs_embeds.size()[:-1]
seq_length = input_shape[1]
if token_type_ids is None:
if hasattr(self, "token_type_ids"):
buffered_token_type_ids = self.token_type_ids[:, :seq_length]
buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
token_type_ids = buffered_token_type_ids_expanded
else:
token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
if inputs_embeds is None:
inputs_embeds = self.word_embeddings(input_ids)
token_type_embeddings = self.token_type_embeddings(token_type_ids)
embeddings = inputs_embeds + token_type_embeddings
if self.position_embedding_type == "absolute":
position_embeddings = self.position_embeddings(position_ids)
embeddings += position_embeddings
embeddings = self.LayerNorm(embeddings)
embeddings = self.dropout(embeddings)
return embeddings
def create_position_ids_from_inputs_embeds(self, inputs_embeds):
"""
We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.
Args:
inputs_embeds: torch.Tensor
Returns: torch.Tensor
"""
input_shape = inputs_embeds.size()[:-1]
sequence_length = input_shape[1]
position_ids = torch.arange(
self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
)
return position_ids.unsqueeze(0).expand(input_shape)
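create_position_ids_from_input_ids, referenced in the forward pass above but not shown in this excerpt, is the padding-aware counterpart of the method we just saw. Below is a hedged sketch of what it does; the underscored helper name stresses that this is an illustration, not the library function itself.
import torch

def _position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
    # non-pad tokens get consecutive positions starting after padding_idx; pad tokens stay at padding_idx
    mask = input_ids.ne(padding_idx).int()
    incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
    return incremental_indices.long() + padding_idx

ids = torch.tensor([[0, 31414, 232, 2, 1, 1]])           # 1 is RoBERTa's pad_token_id
print(_position_ids_from_input_ids(ids, padding_idx=1))  # tensor([[2, 3, 4, 5, 1, 1]])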
class RobertaSelfAttention(nn.Module):
def __init__(self, config, position_embedding_type=None):
super().__init__()
if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
raise ValueError(
f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
f"heads ({config.num_attention_heads})"
)
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.query = nn.Linear(config.hidden_size, self.all_head_size)
self.key = nn.Linear(config.hidden_size, self.all_head_size)
self.value = nn.Linear(config.hidden_size, self.all_head_size)
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
self.position_embedding_type = position_embedding_type or getattr(
config, "position_embedding_type", "absolute"
)
if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
self.max_position_embeddings = config.max_position_embeddings
self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)
self.is_decoder = config.is_decoder
def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
x = x.view(new_x_shape)
return x.permute(0, 2, 1, 3)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
output_attentions: Optional[bool] = False,
class RobertaSelfOutput(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
class RobertaAttention(nn.Module):
def __init__(self, config, position_embedding_type=None):
super().__init__()
self.self = RobertaSelfAttention(config, position_embedding_type=position_embedding_type)
self.output = RobertaSelfOutput(config)
self.pruned_heads = set()
def prune_heads(self, heads):
if len(heads) == 0:
return
heads, index = find_pruneable_heads_and_indices(
heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
)
self.self.query = prune_linear_layer(self.self.query, index)
self.self.key = prune_linear_layer(self.self.key, index)
self.self.value = prune_linear_layer(self.self.value, index)
self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
self.pruned_heads = self.pruned_heads.union(heads)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor]:
self_outputs = self.self(
hidden_states,
attention_mask,
head_mask,
encoder_hidden_states,
encoder_attention_mask,
past_key_value,
output_attentions,
)
attention_output = self.output(self_outputs[0], hidden_states)
outputs = (attention_output,) + self_outputs[1:]
return outputs
class RobertaIntermediate(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states
class RobertaOutput(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
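Taken together, `RobertaIntermediate` and `RobertaOutput` form the position-wise feed-forward block. The sketch below re-implements that data flow with plain `torch.nn` layers (toy sizes, GELU assumed as the activation, as in roberta-base):

```python
import torch
from torch import nn

hidden_size, intermediate_size = 768, 3072
fc1 = nn.Linear(hidden_size, intermediate_size)   # RobertaIntermediate.dense
fc2 = nn.Linear(intermediate_size, hidden_size)   # RobertaOutput.dense
layer_norm = nn.LayerNorm(hidden_size, eps=1e-5)
dropout = nn.Dropout(0.1)

attention_output = torch.randn(2, 10, hidden_size)
x = nn.functional.gelu(fc1(attention_output))     # expand + activation
x = dropout(fc2(x))                               # project back + dropout
ffn_output = layer_norm(x + attention_output)     # residual add, then LayerNorm

print(ffn_output.shape)  # torch.Size([2, 10, 768])
```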
class RobertaLayer(nn.Module):
def __init__(self, config):
super().__init__()
self.chunk_size_feed_forward = config.chunk_size_feed_forward
self.seq_len_dim = 1
self.attention = RobertaAttention(config)
self.is_decoder = config.is_decoder
self.add_cross_attention = config.add_cross_attention
if self.add_cross_attention:
if not self.is_decoder:
raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
self.crossattention = RobertaAttention(config, position_embedding_type="absolute")
self.intermediate = RobertaIntermediate(config)
self.output = RobertaOutput(config)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor]:
self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
self_attention_outputs = self.attention(
hidden_states,
attention_mask,
head_mask,
output_attentions=output_attentions,
past_key_value=self_attn_past_key_value,
)
attention_output = self_attention_outputs[0]
if self.is_decoder:
outputs = self_attention_outputs[1:-1]
present_key_value = self_attention_outputs[-1]
else:
outputs = self_attention_outputs[1:]
cross_attn_present_key_value = None
if self.is_decoder and encoder_hidden_states is not None:
if not hasattr(self, "crossattention"):
raise ValueError(
f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers"
" by setting `config.add_cross_attention=True`"
)
cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
cross_attention_outputs = self.crossattention(
attention_output,
attention_mask,
head_mask,
encoder_hidden_states,
encoder_attention_mask,
cross_attn_past_key_value,
output_attentions,
)
attention_output = cross_attention_outputs[0]
outputs = outputs + cross_attention_outputs[1:-1]
cross_attn_present_key_value = cross_attention_outputs[-1]
present_key_value = present_key_value + cross_attn_present_key_value
layer_output = apply_chunking_to_forward(
self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
)
outputs = (layer_output,) + outputs
if self.is_decoder:
outputs = outputs + (present_key_value,)
return outputs
def feed_forward_chunk(self, attention_output):
intermediate_output = self.intermediate(attention_output)
layer_output = self.output(intermediate_output, attention_output)
return layer_output
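`apply_chunking_to_forward` only changes how the feed-forward block is evaluated, not its result: it slices the input along the sequence dimension and concatenates the chunked outputs. A minimal sketch, assuming a recent transformers version where the helper is importable from `transformers.pytorch_utils` (the same place this file imports it from):

```python
import torch
from transformers.pytorch_utils import apply_chunking_to_forward

def feed_forward(x):
    # stand-in for intermediate + output; any per-position function works here
    return x * 2.0

hidden = torch.randn(2, 8, 16)                               # (batch, seq_len, hidden)
out_full = apply_chunking_to_forward(feed_forward, 0, 1, hidden)      # chunk_size=0: plain call
out_chunked = apply_chunking_to_forward(feed_forward, 4, 1, hidden)   # chunks of 4 along dim 1

print(torch.allclose(out_full, out_chunked))                 # True: chunking trades memory for compute
```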
class RobertaEncoder(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
self.layer = nn.ModuleList([RobertaLayer(config) for _ in range(config.num_hidden_layers)])
self.gradient_checkpointing = False
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = False,
output_hidden_states: Optional[bool] = False,
return_dict: Optional[bool] = True,
) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
all_hidden_states = () if output_hidden_states else None
all_self_attentions = () if output_attentions else None
all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
if self.gradient_checkpointing and self.training:
if use_cache:
logger.warning_once(
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
)
use_cache = False
next_decoder_cache = () if use_cache else None
for i, layer_module in enumerate(self.layer):
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
layer_head_mask = head_mask[i] if head_mask is not None else None
past_key_value = past_key_values[i] if past_key_values is not None else None
if self.gradient_checkpointing and self.training:
layer_outputs = self._gradient_checkpointing_func(
layer_module.__call__,
hidden_states,
attention_mask,
layer_head_mask,
encoder_hidden_states,
encoder_attention_mask,
past_key_value,
output_attentions,
)
else:
layer_outputs = layer_module(
hidden_states,
attention_mask,
layer_head_mask,
encoder_hidden_states,
encoder_attention_mask,
past_key_value,
output_attentions,
)
hidden_states = layer_outputs[0]
if use_cache:
next_decoder_cache += (layer_outputs[-1],)
if output_attentions:
all_self_attentions = all_self_attentions + (layer_outputs[1],)
if self.config.add_cross_attention:
all_cross_attentions = all_cross_attentions + (layer_outputs[2],)
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
return tuple(
v
for v in [
hidden_states,
next_decoder_cache,
all_hidden_states,
all_self_attentions,
all_cross_attentions,
]
if v is not None
)
return BaseModelOutputWithPastAndCrossAttentions(
last_hidden_state=hidden_states,
past_key_values=next_decoder_cache,
hidden_states=all_hidden_states,
attentions=all_self_attentions,
cross_attentions=all_cross_attentions,
)
class RobertaPooler(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.activation = nn.Tanh()
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
first_token_tensor = hidden_states[:, 0]
pooled_output = self.dense(first_token_tensor)
pooled_output = self.activation(pooled_output)
return pooled_output
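`RobertaPooler` simply applies a dense layer plus `tanh` to the hidden state of the first token (`<s>`, the analogue of BERT's `[CLS]`). A standalone sketch with toy tensors:

```python
import torch
from torch import nn

hidden_size = 768
dense = nn.Linear(hidden_size, hidden_size)
hidden_states = torch.randn(2, 10, hidden_size)   # encoder output, (batch, seq_len, hidden)

first_token = hidden_states[:, 0]                 # <s> token
pooled = torch.tanh(dense(first_token))

print(pooled.shape)  # torch.Size([2, 768])
```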
class RobertaPreTrainedModel(PreTrainedModel):
"""
一个抽象类,用于处理权重初始化以及一个简单的接口,用于下载和加载预训练模型。
"""
config_class = RobertaConfig
base_model_prefix = "roberta"
supports_gradient_checkpointing = True
_no_split_modules = ["RobertaEmbeddings", "RobertaSelfAttention"]
def _init_weights(self, module):
"""初始化权重"""
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
ROBERTA_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its models (such as downloading or saving the model, resizing the input embeddings, pruning heads etc.).
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and behavior.
Parameters:
config ([`RobertaConfig`]): Model configuration class with all the parameters of the model. Initializing with a
config file does not load the weights associated with the model, only the configuration.
Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
ROBERTA_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `({0})`):
# 输入序列标记的索引,对应于词汇表中的标记
# 可以使用 [`AutoTokenizer`] 获取这些索引。参见 [`PreTrainedTokenizer.encode`] 和
# [`PreTrainedTokenizer.__call__`] 获取更多详情。
# [什么是输入 ID?](../glossary#input-ids)
attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
# 遮罩,用于在填充的标记索引上避免执行注意力计算。遮罩的值选在 `[0, 1]`:
# - 对于 **未遮罩的** 标记,值为 1,
# - 对于 **遮罩的** 标记,值为 0。
# [什么是注意力遮罩?](../glossary#attention-mask)
token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
# 段标记索引,指示输入的第一部分和第二部分。索引值选在 `[0,1]`:
# - 0 对应 *句子 A* 的标记,
# - 1 对应 *句子 B* 的标记。
# 当模型用 `type_vocab_size` 参数初始化且值 >= 2 时才能使用此参数。此张量中的所有值应始终 < type_vocab_size。
# [什么是标记类型 ID?](../glossary#token-type-ids)
position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
# 输入序列标记在位置嵌入中的位置索引。选择范围在 `[0, config.max_position_embeddings - 1]`。
# [什么是位置 ID?](../glossary#position-ids)
head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
# 用于屏蔽自注意力模块中选定头部的遮罩。遮罩的值选在 `[0, 1]`:
# - 1 表示头部 **未被屏蔽**,
# - 0 表示头部 **被屏蔽**。
inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
# 可选地,您可以直接传递嵌入表示而不是传递 `input_ids`。这在您想要更精确地控制如何将 `input_ids` 索引转换为相关向量时很有用,而不是使用模型内部的嵌入查找矩阵。
output_attentions (`bool`, *optional*):
# 是否返回所有注意力层的注意力张量。有关更多详细信息,请参见返回的张量中的 `attentions`。
output_hidden_states (`bool`, *optional*):
# 是否返回所有层的隐藏状态。有关更多详细信息,请参见返回的张量中的 `hidden_states`。
return_dict (`bool`, *optional*):
# 是否返回 [`~utils.ModelOutput`] 而不是普通的元组。
"""
@add_start_docstrings(
"The bare RoBERTa Model transformer outputting raw hidden-states without any specific head on top.",
ROBERTA_START_DOCSTRING,
)
"""
"""
class RobertaModel(RobertaPreTrainedModel):
"""
RoBERTa 模型可以作为编码器(只有自注意力)或解码器使用,后者在自注意力层之间增加了一个交叉注意力层,
遵循 *Attention is all you need*_ 中描述的架构,由 Ashish Vaswani、Noam Shazeer、Niki Parmar、Jakob Uszkoreit、
Llion Jones、Aidan N. Gomez、Lukasz Kaiser 和 Illia Polosukhin 提出。
若要作为解码器使用,模型需要使用配置中设置 `is_decoder` 参数为 `True` 进行初始化。
若要在 Seq2Seq 模型中使用,模型需要同时设置 `is_decoder` 参数和 `add_cross_attention` 参数为 `True`,
并期望在前向传播中输入 `encoder_hidden_states`。
.. _*Attention is all you need*: https://arxiv.org/abs/1706.03762
"""
def __init__(self, config, add_pooling_layer=True):
super().__init__(config)
self.config = config
self.embeddings = RobertaEmbeddings(config)
self.encoder = RobertaEncoder(config)
self.pooler = RobertaPooler(config) if add_pooling_layer else None
self.post_init()
def get_input_embeddings(self):
return self.embeddings.word_embeddings
def set_input_embeddings(self, value):
self.embeddings.word_embeddings = value
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
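A hedged usage sketch of head pruning through the public `prune_heads` API; the checkpoint name and head choices below are just examples, and downloading the weights requires network access:

```python
from transformers import RobertaModel

model = RobertaModel.from_pretrained("FacebookAI/roberta-base")
# remove heads 0 and 1 of layer 0, and head 2 of layer 2
model.prune_heads({0: [0, 1], 2: [2]})

# the query/key/value projections of the affected layers shrink accordingly
print(model.encoder.layer[0].attention.self.num_attention_heads)  # 10 (out of 12)
print(model.encoder.layer[2].attention.self.num_attention_heads)  # 11
```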
@add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPoolingAndCrossAttentions,
config_class=_CONFIG_FOR_DOC,
)
"""
# 定义 Transformer 模型的前向传播方法,接受多个输入参数
def forward(
self,
input_ids: Optional[torch.Tensor] = None, # 输入的 token IDs 张量,可选
attention_mask: Optional[torch.Tensor] = None, # 注意力掩码张量,可选
token_type_ids: Optional[torch.Tensor] = None, # token 类型 IDs 张量,可选
position_ids: Optional[torch.Tensor] = None, # 位置 IDs 张量,可选
head_mask: Optional[torch.Tensor] = None, # 头部掩码张量,可选
inputs_embeds: Optional[torch.Tensor] = None, # 输入的嵌入张量,可选
encoder_hidden_states: Optional[torch.Tensor] = None, # 编码器隐藏状态张量,可选
encoder_attention_mask: Optional[torch.Tensor] = None, # 编码器注意力掩码张量,可选
past_key_values: Optional[List[torch.FloatTensor]] = None, # 过去的键值对列表,可选
use_cache: Optional[bool] = None, # 是否使用缓存,可选
output_attentions: Optional[bool] = None, # 是否输出注意力张量,可选
output_hidden_states: Optional[bool] = None, # 是否输出隐藏状态,可选
return_dict: Optional[bool] = None, # 是否以字典形式返回结果,可选
) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]:
# 定义 RoBERTa 语言模型,用于条件语言建模 fine-tuning
@add_start_docstrings(
"""RoBERTa Model with a `language modeling` head on top for CLM fine-tuning.""", ROBERTA_START_DOCSTRING
)
class RobertaForCausalLM(RobertaPreTrainedModel):
# 共享权重的键名列表
_tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]
# 初始化函数,接受一个配置对象 config
def __init__(self, config):
# 调用父类的初始化方法
super().__init__(config)
# 如果配置中指定不是解码器,则记录警告日志
if not config.is_decoder:
logger.warning("If you want to use `RobertaLMHeadModel` as a standalone, add `is_decoder=True.`")
# 初始化 RoBERTa 模型,不包含池化层
self.roberta = RobertaModel(config, add_pooling_layer=False)
# 初始化 RoBERTa 语言建模头部
self.lm_head = RobertaLMHead(config)
# 执行初始化权重并应用最终处理
self.post_init()
# 返回语言建模头部的输出嵌入层
def get_output_embeddings(self):
return self.lm_head.decoder
# 设置语言建模头部的输出嵌入层
def set_output_embeddings(self, new_embeddings):
self.lm_head.decoder = new_embeddings
# 前向传播函数,接受多个输入参数并返回预测结果
@add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
past_key_values: Tuple[Tuple[torch.FloatTensor]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
# 准备生成的输入数据,根据需要动态创建解码器的注意力遮罩
def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, **model_kwargs):
input_shape = input_ids.shape
# 如果未提供注意力遮罩,则使用全为1的遮罩
if attention_mask is None:
attention_mask = input_ids.new_ones(input_shape)
# 如果存在过去键值,则截取解码器输入的 ID
if past_key_values is not None:
past_length = past_key_values[0][0].shape[2]
# 一些生成方法可能只传递最后一个输入 ID
if input_ids.shape[1] > past_length:
remove_prefix_length = past_length
else:
# 默认行为:保留最后一个 ID
remove_prefix_length = input_ids.shape[1] - 1
input_ids = input_ids[:, remove_prefix_length:]
return {"input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": past_key_values}
# 重新排序缓存中的过去键值对,以适应束搜索的索引顺序
def _reorder_cache(self, past_key_values, beam_idx):
# 初始化一个空的重排序后的过去键值对元组
reordered_past = ()
# 遍历每一层的过去键值对
for layer_past in past_key_values:
# 对于每个层的过去状态,根据束搜索的索引重新选择对应的过去状态
reordered_past += (
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
)
# 返回重新排序后的过去键值对元组
return reordered_past
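A standalone sketch of what `_reorder_cache` does during beam search: every cached key/value tensor is re-indexed along the batch (beam) dimension with `index_select` (toy shapes assumed):

```python
import torch

beam_idx = torch.tensor([2, 0, 0, 1])            # which old beam each new slot comes from
layer_past = (
    torch.randn(4, 12, 5, 64),                   # cached keys   (beams, heads, seq, head_size)
    torch.randn(4, 12, 5, 64),                   # cached values
)

reordered = tuple(past_state.index_select(0, beam_idx) for past_state in layer_past)
print(reordered[0].shape)                               # torch.Size([4, 12, 5, 64])
print(torch.equal(reordered[0][1], layer_past[0][0]))   # True: slot 1 now holds old beam 0
```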
# 使用自定义的文档字符串注释 RoBERTa 模型,包含了一个顶部的语言建模头部
@add_start_docstrings("""RoBERTa Model with a `language modeling` head on top.""", ROBERTA_START_DOCSTRING)
class RobertaForMaskedLM(RobertaPreTrainedModel):
# 定义一个列表,包含了需要共享权重的键名
_tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]
# 初始化方法,接受一个配置对象作为参数
def __init__(self, config):
# 调用父类的初始化方法
super().__init__(config)
# 如果配置中指定了 is_decoder 为 True,发出警告信息
if config.is_decoder:
logger.warning(
"If you want to use `RobertaForMaskedLM` make sure `config.is_decoder=False` for "
"bi-directional self-attention."
)
# 创建一个 RoBERTa 模型,不包含池化层
self.roberta = RobertaModel(config, add_pooling_layer=False)
# 创建一个 RoBERTa 语言建模头部
self.lm_head = RobertaLMHead(config)
# 执行额外的初始化操作和最终处理
self.post_init()
# 返回语言建模头部的输出嵌入
def get_output_embeddings(self):
return self.lm_head.decoder
# 设置新的输出嵌入到语言建模头部
def set_output_embeddings(self, new_embeddings):
self.lm_head.decoder = new_embeddings
# 使用文档字符串和代码示例的注释来注释 forward 方法
@add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=MaskedLMOutput,
config_class=_CONFIG_FOR_DOC,
mask="<mask>",
expected_output="' Paris'",
expected_loss=0.1,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], MaskedLMOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
kwargs (`Dict[str, any]`, optional, defaults to *{}*):
Used to hide legacy arguments that have been deprecated.
"""
# Determine if the output should be returned as a dictionary or not
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Forward pass through the Roberta model
outputs = self.roberta(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# Extract the sequence output from the model outputs
sequence_output = outputs[0]
# Generate prediction scores using the language model head
prediction_scores = self.lm_head(sequence_output)
masked_lm_loss = None
if labels is not None:
# Move labels tensor to the device used for prediction_scores
labels = labels.to(prediction_scores.device)
# Define the loss function for masked language modeling
loss_fct = CrossEntropyLoss()
# Compute masked language modeling loss
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
if not return_dict:
# Prepare output tuple without returning a dictionary
output = (prediction_scores,) + outputs[2:]
return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
# Return MaskedLMOutput named tuple if return_dict is True
return MaskedLMOutput(
loss=masked_lm_loss,
logits=prediction_scores,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
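A hedged end-to-end usage sketch for `RobertaForMaskedLM` (assumes network access to the `FacebookAI/roberta-base` checkpoint; the predicted token is typically " Paris" for this prompt, but that is not guaranteed):

```python
import torch
from transformers import AutoTokenizer, RobertaForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
model = RobertaForMaskedLM.from_pretrained("FacebookAI/roberta-base")

inputs = tokenizer("The capital of France is <mask>.", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits

# locate the <mask> position and take the highest-scoring vocabulary id there
mask_pos = (inputs.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]
predicted_id = logits[0, mask_pos].argmax(-1)
print(tokenizer.decode(predicted_id))
```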
# 定义一个用于 RoBERTa 的语言模型头部的类
class RobertaLMHead(nn.Module):
"""Roberta Head for masked language modeling."""
def __init__(self, config):
super().__init__()
# 创建一个线性层,将输入大小映射到隐藏大小
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
# 创建一个层归一化层,用于标准化隐藏状态
self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
# 创建一个线性层,将隐藏大小映射到词汇表大小
self.decoder = nn.Linear(config.hidden_size, config.vocab_size)
# 创建一个偏置参数,用于解码器线性层
self.bias = nn.Parameter(torch.zeros(config.vocab_size))
# 将解码器的偏置设置为自定义的偏置参数
self.decoder.bias = self.bias
def forward(self, features, **kwargs):
# 输入特征经过线性层变换
x = self.dense(features)
# 使用 GELU 激活函数进行非线性变换
x = gelu(x)
# 输入经过层归一化处理
x = self.layer_norm(x)
# 将隐藏状态映射回词汇表大小,并加上偏置
x = self.decoder(x)
return x
def _tie_weights(self):
# 如果解码器的偏置参数设备类型是 "meta",则将其与自定义偏置参数绑定
# 这是为了加速兼容性和保持向后兼容性
if self.decoder.bias.device.type == "meta":
self.decoder.bias = self.bias
else:
# 否则,将自定义偏置参数与解码器的偏置参数绑定
self.bias = self.decoder.bias
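Because `lm_head.decoder` is returned by `get_output_embeddings`, its weight is tied to the input word embeddings when the model is loaded. A hedged check (assumes network access and the default `tie_word_embeddings=True`):

```python
from transformers import RobertaForMaskedLM

model = RobertaForMaskedLM.from_pretrained("FacebookAI/roberta-base")

# after tie_weights() (run automatically inside from_pretrained), the decoder and the
# input word embeddings share the same underlying storage
same_storage = (
    model.lm_head.decoder.weight.data_ptr()
    == model.roberta.embeddings.word_embeddings.weight.data_ptr()
)
print(same_storage)  # True
```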
@add_start_docstrings(
"""
RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the
pooled output) e.g. for GLUE tasks.
""",
ROBERTA_START_DOCSTRING,
)
class RobertaForSequenceClassification(RobertaPreTrainedModel):
def __init__(self, config):
super().__init__(config)
# 初始化 RoBERTa 模型的分类/回归头部
self.num_labels = config.num_labels
self.config = config
# 创建 RoBERTa 模型,不包含池化层
self.roberta = RobertaModel(config, add_pooling_layer=False)
# 创建 RoBERTa 分类头部
self.classifier = RobertaClassificationHead(config)
# 初始化权重并应用最终处理
self.post_init()
@add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint="cardiffnlp/twitter-roberta-base-emotion",
output_type=SequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
expected_output="'optimism'",
expected_loss=0.08,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
# 如果 return_dict 不为 None,则使用其值;否则使用 self.config.use_return_dict 的值
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# 将输入传递给 RoBERTa 模型进行处理,并获取输出结果
outputs = self.roberta(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# 从 RoBERTa 模型的输出中获取序列输出
sequence_output = outputs[0]
# 将序列输出传递给分类器模型获取 logits
logits = self.classifier(sequence_output)
loss = None
if labels is not None:
# 将 labels 移动到正确的设备以启用模型并行处理
labels = labels.to(logits.device)
# 根据问题类型自动推断配置中的 problem_type
if self.config.problem_type is None:
if self.num_labels == 1:
self.config.problem_type = "regression"
elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
self.config.problem_type = "single_label_classification"
else:
self.config.problem_type = "multi_label_classification"
# 根据 problem_type 计算损失函数
if self.config.problem_type == "regression":
loss_fct = MSELoss()
if self.num_labels == 1:
loss = loss_fct(logits.squeeze(), labels.squeeze())
else:
loss = loss_fct(logits, labels)
elif self.config.problem_type == "single_label_classification":
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(logits, labels)
# 如果 return_dict 为 False,则返回 logits 和可能的额外输出
if not return_dict:
output = (logits,) + outputs[2:] # 保留 logits 和额外的 hidden states
return ((loss,) + output) if loss is not None else output
# 返回一个 SequenceClassifierOutput 对象,包含 loss、logits、hidden states 和 attentions
return SequenceClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
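A hedged usage sketch for `RobertaForSequenceClassification`, using the same emotion checkpoint referenced in the docstring decorator above (assumes network access; the label set comes from that checkpoint's config):

```python
import torch
from transformers import AutoTokenizer, RobertaForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-emotion")
model = RobertaForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-emotion")

inputs = tokenizer("I can't wait for the concert tonight!", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits

predicted_class = logits.argmax(-1).item()
print(model.config.id2label[predicted_class])   # e.g. 'joy' or 'optimism', depending on the input
```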
# 使用多项选择分类头部的 RoBERTa 模型(在汇总输出之上有一个线性层和 softmax),例如用于 RocStories/SWAG 任务
@add_start_docstrings(
"""
Roberta Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
softmax) e.g. for RocStories/SWAG tasks.
""",
ROBERTA_START_DOCSTRING,
)
class RobertaForMultipleChoice(RobertaPreTrainedModel):
def __init__(self, config):
super().__init__(config)
# 初始化 RoBERTa 模型
self.roberta = RobertaModel(config)
# Dropout 层
self.dropout = nn.Dropout(config.hidden_dropout_prob)
# 分类器线性层,输出维度为1(用于多项选择任务)
self.classifier = nn.Linear(config.hidden_size, 1)
# 初始化权重并应用最终处理
self.post_init()
@add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=MultipleChoiceModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], MultipleChoiceModelOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
`input_ids` above)
"""
# 根据返回值字典是否为空来确定是否使用预设的返回值设置
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# 计算输入张量的第二维大小,即选择数量
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
# 如果存在input_ids,则将其展平为二维张量,否则为None
flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
# 如果存在position_ids,则将其展平为二维张量,否则为None
flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
# 如果存在token_type_ids,则将其展平为二维张量,否则为None
flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
# 如果存在attention_mask,则将其展平为二维张量,否则为None
flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
# 如果存在inputs_embeds,则将其展平为三维张量,否则为None
flat_inputs_embeds = (
inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
if inputs_embeds is not None
else None
)
# 调用RoBERTa模型进行前向传播
outputs = self.roberta(
flat_input_ids,
position_ids=flat_position_ids,
token_type_ids=flat_token_type_ids,
attention_mask=flat_attention_mask,
head_mask=head_mask,
inputs_embeds=flat_inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# 提取汇总输出
pooled_output = outputs[1]
# 对汇总输出应用dropout
pooled_output = self.dropout(pooled_output)
# 使用分类器得到logits
logits = self.classifier(pooled_output)
# 将logits重新调整形状为(batch_size, num_choices)
reshaped_logits = logits.view(-1, num_choices)
loss = None
# 如果存在标签,则计算交叉熵损失
if labels is not None:
# 将标签移动到正确的设备上以实现模型并行计算
labels = labels.to(reshaped_logits.device)
loss_fct = CrossEntropyLoss()
loss = loss_fct(reshaped_logits, labels)
# 如果不需要返回字典,则返回输出元组
if not return_dict:
output = (reshaped_logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
# 如果需要返回字典,则返回多选模型输出
return MultipleChoiceModelOutput(
loss=loss,
logits=reshaped_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
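The core of the multiple-choice head is the reshape dance around the shared encoder. A standalone shape sketch (toy sizes assumed):

```python
import torch

# (batch, num_choices, seq_len) inputs are collapsed to (batch * num_choices, seq_len) before the
# shared encoder, and the per-choice scores are reshaped back to (batch, num_choices) for the loss.
batch, num_choices, seq_len = 2, 4, 16
input_ids = torch.randint(0, 50265, (batch, num_choices, seq_len))

flat_input_ids = input_ids.view(-1, input_ids.size(-1))
print(flat_input_ids.shape)                      # torch.Size([8, 16])

logits = torch.randn(batch * num_choices, 1)     # classifier output: one score per choice
reshaped_logits = logits.view(-1, num_choices)
print(reshaped_logits.shape)                     # torch.Size([2, 4])
```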
# 定义一个带有标记分类头部的 RoBERTa 模型类,用于例如命名实体识别(NER)任务
@add_start_docstrings(
"""
Roberta Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
Named-Entity-Recognition (NER) tasks.
""",
ROBERTA_START_DOCSTRING,
)
class RobertaForTokenClassification(RobertaPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
# 初始化 RoBERTa 模型,不包括汇聚层
self.roberta = RobertaModel(config, add_pooling_layer=False)
# 根据配置设定分类器的 dropout 率,若未设置则使用隐藏层的 dropout 率
classifier_dropout = (
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
self.dropout = nn.Dropout(classifier_dropout)
# 创建一个线性层,将 RoBERTa 隐藏层的输出转换为分类标签
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
# 初始化权重并进行最终处理
self.post_init()
@add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint="Jean-Baptiste/roberta-large-ner-english",
output_type=TokenClassifierOutput,
config_class=_CONFIG_FOR_DOC,
expected_output="['O', 'ORG', 'ORG', 'O', 'O', 'O', 'O', 'O', 'LOC', 'O', 'LOC', 'LOC']",
expected_loss=0.01,
)
# 前向传播函数,接受 RoBERTa 的输入并输出分类结果
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
# 如果没有指定 return_dict,则根据配置决定是否使用返回字典
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# 使用 RoBERTa 模型处理输入数据,并获取输出结果
outputs = self.roberta(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# 从 RoBERTa 模型输出中获取序列输出
sequence_output = outputs[0]
# 对序列输出进行 dropout 处理
sequence_output = self.dropout(sequence_output)
# 将 dropout 后的序列输出送入分类器得到 logits
logits = self.classifier(sequence_output)
# 初始化损失值
loss = None
# 如果给定了标签,则计算交叉熵损失
if labels is not None:
# 将标签移到正确的设备上以支持模型并行计算
labels = labels.to(logits.device)
loss_fct = CrossEntropyLoss()
# 计算损失值
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
# 如果不要求返回字典,则组织输出结果
if not return_dict:
output = (logits,) + outputs[2:] # 这里的 outputs[2:] 包含额外的隐藏状态
return ((loss,) + output) if loss is not None else output
# 构造 TokenClassifierOutput 对象用于返回结果
return TokenClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
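A hedged usage sketch for `RobertaForTokenClassification`, using the NER checkpoint referenced in the docstring decorator above (assumes network access; the exact tags depend on the checkpoint):

```python
import torch
from transformers import AutoTokenizer, RobertaForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("Jean-Baptiste/roberta-large-ner-english")
model = RobertaForTokenClassification.from_pretrained("Jean-Baptiste/roberta-large-ner-english")

inputs = tokenizer("HuggingFace is based in New York City", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits               # (batch, seq_len, num_labels)

predictions = logits.argmax(-1)[0]
print([model.config.id2label[p.item()] for p in predictions])
```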
class RobertaClassificationHead(nn.Module):
"""Head for sentence-level classification tasks."""
def __init__(self, config):
super().__init__()
# 定义一个全连接层,输入和输出维度均为 config.hidden_size
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
# 确定分类器的 dropout 率,如果未提供,则使用隐藏层 dropout 率
classifier_dropout = (
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
# 定义一个 dropout 层,应用上述确定的 dropout 率
self.dropout = nn.Dropout(classifier_dropout)
# 定义一个全连接层,将隐藏状态映射到类别数 config.num_labels
self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
def forward(self, features, **kwargs):
# 从 features 中选择第一个 token 的隐藏状态作为输出,类似于取 [CLS] token
x = features[:, 0, :] # take <s> token (equiv. to [CLS])
x = self.dropout(x) # 应用 dropout
x = self.dense(x) # 应用全连接层
x = torch.tanh(x) # 应用 tanh 激活函数
x = self.dropout(x) # 再次应用 dropout
x = self.out_proj(x) # 应用最终的全连接层映射到输出类别数
return x
@add_start_docstrings(
"""
Roberta Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
ROBERTA_START_DOCSTRING,
)
class RobertaForQuestionAnswering(RobertaPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
# 初始化 RoBERTa 模型,不包含池化层
self.roberta = RobertaModel(config, add_pooling_layer=False)
# 定义一个全连接层,将 RoBERTa 的隐藏状态映射到类别数 config.num_labels
self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
# 初始化权重并进行最终处理
self.post_init()
@add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint="deepset/roberta-base-squad2",
output_type=QuestionAnsweringModelOutput,
config_class=_CONFIG_FOR_DOC,
expected_output="' puppet'",
expected_loss=0.86,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
start_positions: Optional[torch.LongTensor] = None,
end_positions: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], QuestionAnsweringModelOutput]:
r"""
start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
"""
# Determine whether to use return_dict or default based on configuration
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Pass input data to the Roberta model for processing
outputs = self.roberta(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# Extract the sequence output from the Roberta model's outputs
sequence_output = outputs[0]
# Compute logits for question answering start and end positions
logits = self.qa_outputs(sequence_output)
start_logits, end_logits = logits.split(1, dim=-1)
start_logits = start_logits.squeeze(-1).contiguous()
end_logits = end_logits.squeeze(-1).contiguous()
total_loss = None
if start_positions is not None and end_positions is not None:
# Handle multi-GPU scenarios by adjusting tensor dimensions
if len(start_positions.size()) > 1:
start_positions = start_positions.squeeze(-1)
if len(end_positions.size()) > 1:
end_positions = end_positions.squeeze(-1)
# Clamp positions to ensure they are within valid range
ignored_index = start_logits.size(1)
start_positions = start_positions.clamp(0, ignored_index)
end_positions = end_positions.clamp(0, ignored_index)
# Define loss function and compute start/end losses
loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
start_loss = loss_fct(start_logits, start_positions)
end_loss = loss_fct(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2
if not return_dict:
# Prepare output tuple without using return_dict
output = (start_logits, end_logits) + outputs[2:]
return ((total_loss,) + output) if total_loss is not None else output
# Return structured output using QuestionAnsweringModelOutput class
return QuestionAnsweringModelOutput(
loss=total_loss,
start_logits=start_logits,
end_logits=end_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
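A hedged usage sketch for `RobertaForQuestionAnswering`, using the SQuAD2 checkpoint referenced above (assumes network access): the answer span is read off the argmax of the start/end logits.

```python
import torch
from transformers import AutoTokenizer, RobertaForQuestionAnswering

tokenizer = AutoTokenizer.from_pretrained("deepset/roberta-base-squad2")
model = RobertaForQuestionAnswering.from_pretrained("deepset/roberta-base-squad2")

question, context = "Who was Jim Henson?", "Jim Henson was a nice puppet."
inputs = tokenizer(question, context, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

start = outputs.start_logits.argmax()
end = outputs.end_logits.argmax()
answer_ids = inputs.input_ids[0, start : end + 1]
print(tokenizer.decode(answer_ids))              # typically ' puppet' or a short span around it
```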
def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
"""
Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
are ignored. This is modified from fairseq's `utils.make_positions`.
Args:
input_ids: torch.Tensor, input tensor containing token ids
padding_idx: int, index of padding token in the vocabulary
past_key_values_length: int, length of past key values to consider for incremental indexing
Returns:
torch.Tensor, tensor of position ids corresponding to input_ids
"""
# 创建一个掩码,标记非填充符号的位置为1,填充符号的位置为0
mask = input_ids.ne(padding_idx).int()
# 根据掩码累积计算位置索引,加上过去关键值长度,然后乘以掩码以忽略填充符号
incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
# 返回最终的位置 ids,加上填充索引以确保填充符号仍然为填充索引
return incremental_indices.long() + padding_idx
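A numeric walk-through of `create_position_ids_from_input_ids` with `padding_idx = 1` (RoBERTa's `<pad>` id) and `past_key_values_length = 0`: real tokens are numbered from 2 upward, padding keeps position 1.

```python
import torch

padding_idx = 1
input_ids = torch.tensor([[0, 31414, 232, 2, 1, 1]])   # last two tokens are padding

mask = input_ids.ne(padding_idx).int()                  # [[1, 1, 1, 1, 0, 0]]
incremental_indices = torch.cumsum(mask, dim=1).type_as(mask) * mask
position_ids = incremental_indices.long() + padding_idx

print(position_ids)                                     # tensor([[2, 3, 4, 5, 1, 1]])
```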
.\models\roberta\modeling_tf_roberta.py
""" TF 2.0 RoBERTa 模型。"""
from __future__ import annotations
import math
import warnings
from typing import Optional, Tuple, Union
import numpy as np
import tensorflow as tf
from ...activations_tf import get_tf_activation
from ...modeling_tf_outputs import (
TFBaseModelOutputWithPastAndCrossAttentions,
TFBaseModelOutputWithPoolingAndCrossAttentions,
TFCausalLMOutputWithCrossAttentions,
TFMaskedLMOutput,
TFMultipleChoiceModelOutput,
TFQuestionAnsweringModelOutput,
TFSequenceClassifierOutput,
TFTokenClassifierOutput,
)
from ...modeling_tf_utils import (
TFCausalLanguageModelingLoss,
TFMaskedLanguageModelingLoss,
TFModelInputType,
TFMultipleChoiceLoss,
TFPreTrainedModel,
TFQuestionAnsweringLoss,
TFSequenceClassificationLoss,
TFTokenClassificationLoss,
get_initializer,
keras,
keras_serializable,
unpack_inputs,
)
from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax
from ...utils import (
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
)
from .configuration_roberta import RobertaConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "FacebookAI/roberta-base"
_CONFIG_FOR_DOC = "RobertaConfig"
TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST = [
"FacebookAI/roberta-base",
"FacebookAI/roberta-large",
"FacebookAI/roberta-large-mnli",
"distilbert/distilroberta-base",
]
class TFRobertaEmbeddings(keras.layers.Layer):
"""
BertEmbeddings 的变种,用于处理位置编码索引的微小调整。
"""
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.padding_idx = 1
self.config = config
self.hidden_size = config.hidden_size
self.max_position_embeddings = config.max_position_embeddings
self.initializer_range = config.initializer_range
self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
def build(self, input_shape=None):
with tf.name_scope("word_embeddings"):
self.weight = self.add_weight(
name="weight",
shape=[self.config.vocab_size, self.hidden_size],
initializer=get_initializer(self.initializer_range),
)
with tf.name_scope("token_type_embeddings"):
self.token_type_embeddings = self.add_weight(
name="embeddings",
shape=[self.config.type_vocab_size, self.hidden_size],
initializer=get_initializer(self.initializer_range),
)
with tf.name_scope("position_embeddings"):
self.position_embeddings = self.add_weight(
name="embeddings",
shape=[self.max_position_embeddings, self.hidden_size],
initializer=get_initializer(self.initializer_range),
)
if self.built:
return
self.built = True
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
def create_position_ids_from_input_ids(self, input_ids, past_key_values_length=0):
"""
Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding
symbols are ignored. This is modified from fairseq's `utils.make_positions`.
Args:
input_ids: tf.Tensor
Returns: tf.Tensor
"""
mask = tf.cast(tf.math.not_equal(input_ids, self.padding_idx), dtype=input_ids.dtype)
incremental_indices = (tf.math.cumsum(mask, axis=1) + past_key_values_length) * mask
return incremental_indices + self.padding_idx
def call(
self,
input_ids=None,
position_ids=None,
token_type_ids=None,
inputs_embeds=None,
past_key_values_length=0,
training=False,
):
"""
Applies embedding based on inputs tensor.
Returns:
final_embeddings (`tf.Tensor`): output embedding tensor.
"""
assert not (input_ids is None and inputs_embeds is None)
if input_ids is not None:
check_embeddings_within_bounds(input_ids, self.config.vocab_size)
inputs_embeds = tf.gather(params=self.weight, indices=input_ids)
input_shape = shape_list(inputs_embeds)[:-1]
if token_type_ids is None:
token_type_ids = tf.fill(dims=input_shape, value=0)
if position_ids is None:
if input_ids is not None:
position_ids = self.create_position_ids_from_input_ids(
input_ids=input_ids, past_key_values_length=past_key_values_length
)
else:
position_ids = tf.expand_dims(
tf.range(start=self.padding_idx + 1, limit=input_shape[-1] + self.padding_idx + 1), axis=0
)
position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids)
token_type_embeds = tf.gather(params=self.token_type_embeddings, indices=token_type_ids)
final_embeddings = inputs_embeds + position_embeds + token_type_embeds
final_embeddings = self.LayerNorm(inputs=final_embeddings)
final_embeddings = self.dropout(inputs=final_embeddings, training=training)
return final_embeddings
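A pure-TensorFlow sketch of the lookup-and-sum performed in `TFRobertaEmbeddings.call` (tiny toy tables; LayerNorm and dropout omitted for brevity):

```python
import tensorflow as tf

vocab_size, hidden = 10, 4
word_table = tf.random.normal((vocab_size, hidden))
pos_table = tf.random.normal((8, hidden))
type_table = tf.zeros((1, hidden))               # RoBERTa uses a single token type

input_ids = tf.constant([[0, 5, 7, 2]])
padding_idx = 1
mask = tf.cast(tf.math.not_equal(input_ids, padding_idx), dtype=input_ids.dtype)
position_ids = tf.math.cumsum(mask, axis=1) * mask + padding_idx   # [[2, 3, 4, 5]]

embeddings = (
    tf.gather(word_table, input_ids)
    + tf.gather(pos_table, position_ids)
    + tf.gather(type_table, tf.zeros_like(input_ids))
)
print(embeddings.shape)                          # (1, 4, 4)
```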
class TFRobertaPooler(keras.layers.Layer):
def __init__(self, config: RobertaConfig, **kwargs):
super().__init__(**kwargs)
self.dense = keras.layers.Dense(
units=config.hidden_size,
kernel_initializer=get_initializer(config.initializer_range),
activation="tanh",
name="dense",
)
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
first_token_tensor = hidden_states[:, 0]
pooled_output = self.dense(inputs=first_token_tensor)
return pooled_output
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
class TFRobertaSelfAttention(keras.layers.Layer):
def __init__(self, config: RobertaConfig, **kwargs):
super().__init__(**kwargs)
if config.hidden_size % config.num_attention_heads != 0:
raise ValueError(
f"The hidden size ({config.hidden_size}) is not a multiple of the number "
f"of attention heads ({config.num_attention_heads})"
)
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.sqrt_att_head_size = math.sqrt(self.attention_head_size)
self.query = keras.layers.Dense(
units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query"
)
self.key = keras.layers.Dense(
units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key"
)
self.value = keras.layers.Dense(
units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value"
)
self.dropout = keras.layers.Dropout(rate=config.attention_probs_dropout_prob)
self.is_decoder = config.is_decoder
self.config = config
def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor:
tensor = tf.reshape(tensor=tensor, shape=(batch_size, -1, self.num_attention_heads, self.attention_head_size))
return tf.transpose(tensor, perm=[0, 2, 1, 3])
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "query", None) is not None:
with tf.name_scope(self.query.name):
self.query.build([None, None, self.config.hidden_size])
if getattr(self, "key", None) is not None:
with tf.name_scope(self.key.name):
self.key.build([None, None, self.config.hidden_size])
if getattr(self, "value", None) is not None:
with tf.name_scope(self.value.name):
self.value.build([None, None, self.config.hidden_size])
class TFRobertaSelfOutput(keras.layers.Layer):
def __init__(self, config: RobertaConfig, **kwargs):
super().__init__(**kwargs)
self.dense = keras.layers.Dense(
units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
hidden_states = self.dropout(inputs=hidden_states, training=training)
hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
class TFRobertaAttention(keras.layers.Layer):
def __init__(self, config: RobertaConfig, **kwargs):
super().__init__(**kwargs)
self.self_attention = TFRobertaSelfAttention(config, name="self")
self.dense_output = TFRobertaSelfOutput(config, name="output")
def prune_heads(self, heads):
raise NotImplementedError
def call(
self,
input_tensor: tf.Tensor,
attention_mask: tf.Tensor,
head_mask: tf.Tensor,
encoder_hidden_states: tf.Tensor,
encoder_attention_mask: tf.Tensor,
past_key_value: Tuple[tf.Tensor],
output_attentions: bool,
training: bool = False,
) -> Tuple[tf.Tensor]:
self_outputs = self.self_attention(
hidden_states=input_tensor,
attention_mask=attention_mask,
head_mask=head_mask,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
past_key_value=past_key_value,
output_attentions=output_attentions,
training=training,
)
attention_output = self.dense_output(
hidden_states=self_outputs[0], input_tensor=input_tensor, training=training
)
outputs = (attention_output,) + self_outputs[1:]
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "self_attention", None) is not None:
with tf.name_scope(self.self_attention.name):
self.self_attention.build(None)
if getattr(self, "dense_output", None) is not None:
with tf.name_scope(self.dense_output.name):
self.dense_output.build(None)
class TFRobertaIntermediate(keras.layers.Layer):
def __init__(self, config: RobertaConfig, **kwargs):
super().__init__(**kwargs)
self.dense = keras.layers.Dense(
units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = get_tf_activation(config.hidden_act)
else:
self.intermediate_act_fn = config.hidden_act
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
class TFRobertaOutput(keras.layers.Layer):
def __init__(self, config: RobertaConfig, **kwargs):
super().__init__(**kwargs)
self.dense = keras.layers.Dense(
units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
hidden_states = self.dropout(inputs=hidden_states, training=training)
hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.intermediate_size])
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
class TFRobertaLayer(keras.layers.Layer):
def __init__(self, config: RobertaConfig, **kwargs):
super().__init__(**kwargs)
self.attention = TFRobertaAttention(config, name="attention")
self.is_decoder = config.is_decoder
self.add_cross_attention = config.add_cross_attention
if self.add_cross_attention:
if not self.is_decoder:
raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
self.crossattention = TFRobertaAttention(config, name="crossattention")
self.intermediate = TFRobertaIntermediate(config, name="intermediate")
self.bert_output = TFRobertaOutput(config, name="output")
def call(
self,
hidden_states: tf.Tensor,
attention_mask: tf.Tensor,
head_mask: tf.Tensor,
encoder_hidden_states: tf.Tensor | None,
encoder_attention_mask: tf.Tensor | None,
past_key_value: Tuple[tf.Tensor] | None,
output_attentions: bool,
training: bool = False,
) -> Tuple[tf.Tensor]:
self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
self_attention_outputs = self.attention(
input_tensor=hidden_states,
attention_mask=attention_mask,
head_mask=head_mask,
encoder_hidden_states=None,
encoder_attention_mask=None,
past_key_value=self_attn_past_key_value,
output_attentions=output_attentions,
training=training,
)
attention_output = self_attention_outputs[0]
if self.is_decoder:
outputs = self_attention_outputs[1:-1]
present_key_value = self_attention_outputs[-1]
else:
outputs = self_attention_outputs[1:]
cross_attn_present_key_value = None
if self.is_decoder and encoder_hidden_states is not None:
if not hasattr(self, "crossattention"):
raise ValueError(
f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers"
" by setting `config.add_cross_attention=True`"
)
cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
cross_attention_outputs = self.crossattention(
input_tensor=attention_output,
attention_mask=attention_mask,
head_mask=head_mask,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
past_key_value=cross_attn_past_key_value,
output_attentions=output_attentions,
training=training,
)
attention_output = cross_attention_outputs[0]
outputs = outputs + cross_attention_outputs[1:-1]
cross_attn_present_key_value = cross_attention_outputs[-1]
present_key_value = present_key_value + cross_attn_present_key_value
intermediate_output = self.intermediate(hidden_states=attention_output)
layer_output = self.bert_output(
hidden_states=intermediate_output,
input_tensor=attention_output,
training=training
)
outputs = (layer_output,) + outputs
if self.is_decoder:
outputs = outputs + (present_key_value,)
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "attention", None) is not None:
with tf.name_scope(self.attention.name):
self.attention.build(None)
if getattr(self, "intermediate", None) is not None:
with tf.name_scope(self.intermediate.name):
self.intermediate.build(None)
if getattr(self, "bert_output", None) is not None:
with tf.name_scope(self.bert_output.name):
self.bert_output.build(None)
if getattr(self, "crossattention", None) is not None:
with tf.name_scope(self.crossattention.name):
self.crossattention.build(None)
class TFRobertaEncoder(keras.layers.Layer):
def __init__(self, config: RobertaConfig, **kwargs):
super().__init__(**kwargs)
self.config = config
self.layer = [TFRobertaLayer(config, name=f"layer_._{i}") for i in range(config.num_hidden_layers)]
def call(
self,
hidden_states: tf.Tensor,
attention_mask: tf.Tensor,
head_mask: tf.Tensor,
encoder_hidden_states: tf.Tensor | None,
encoder_attention_mask: tf.Tensor | None,
past_key_values: Tuple[Tuple[tf.Tensor]] | None,
use_cache: Optional[bool],
output_attentions: bool,
output_hidden_states: bool,
return_dict: bool,
training: bool = False,
) -> Union[TFBaseModelOutputWithPastAndCrossAttentions, Tuple[tf.Tensor]]:
all_hidden_states = () if output_hidden_states else None
all_attentions = () if output_attentions else None
all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
next_decoder_cache = () if use_cache else None
for i, layer_module in enumerate(self.layer):
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
past_key_value = past_key_values[i] if past_key_values is not None else None
layer_outputs = layer_module(
hidden_states=hidden_states,
attention_mask=attention_mask,
head_mask=head_mask[i],
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
past_key_value=past_key_value,
output_attentions=output_attentions,
training=training,
)
hidden_states = layer_outputs[0]
if use_cache:
next_decoder_cache += (layer_outputs[-1],)
if output_attentions:
all_attentions = all_attentions + (layer_outputs[1],)
if self.config.add_cross_attention and encoder_hidden_states is not None:
all_cross_attentions = all_cross_attentions + (layer_outputs[2],)
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
return tuple(
v for v in [hidden_states, all_hidden_states, all_attentions, all_cross_attentions] if v is not None
)
return TFBaseModelOutputWithPastAndCrossAttentions(
last_hidden_state=hidden_states,
past_key_values=next_decoder_cache,
hidden_states=all_hidden_states,
attentions=all_attentions,
cross_attentions=all_cross_attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "layer", None) is not None:
for layer in self.layer:
with tf.name_scope(layer.name):
layer.build(None)
@keras_serializable
class TFRobertaMainLayer(keras.layers.Layer):
config_class = RobertaConfig
def __init__(self, config, add_pooling_layer=True, **kwargs):
super().__init__(**kwargs)
self.config = config
self.is_decoder = config.is_decoder
self.num_hidden_layers = config.num_hidden_layers
self.initializer_range = config.initializer_range
self.output_attentions = config.output_attentions
self.output_hidden_states = config.output_hidden_states
self.return_dict = config.use_return_dict
self.encoder = TFRobertaEncoder(config, name="encoder")
self.pooler = TFRobertaPooler(config, name="pooler") if add_pooling_layer else None
self.embeddings = TFRobertaEmbeddings(config, name="embeddings")
def get_input_embeddings(self) -> keras.layers.Layer:
return self.embeddings
def set_input_embeddings(self, value: tf.Variable):
self.embeddings.weight = value
self.embeddings.vocab_size = shape_list(value)[0]
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
raise NotImplementedError
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
encoder_hidden_states: np.ndarray | tf.Tensor | None = None,
encoder_attention_mask: np.ndarray | tf.Tensor | None = None,
past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: bool = False,
):
        # The full forward pass (embedding lookup, extended attention mask, encoder
        # call and optional pooling) is elided in this excerpt.
        ...

    def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
if getattr(self, "pooler", None) is not None:
with tf.name_scope(self.pooler.name):
self.pooler.build(None)
if getattr(self, "embeddings", None) is not None:
with tf.name_scope(self.embeddings.name):
self.embeddings.build(None)
class TFRobertaPreTrainedModel(TFPreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = RobertaConfig
base_model_prefix = "roberta"
ROBERTA_START_DOCSTRING = r"""
This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
behavior.
<Tip>
TensorFlow models and layers in `transformers` accept two formats as input:
- having all inputs as keyword arguments (like PyTorch models), or
- having all inputs as a list, tuple or dict in the first positional argument.
The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
positional argument:
- a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
- a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
`model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
- a dictionary with one or several input Tensors associated to the input names given in the docstring:
`model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
Note that when creating models and layers with
[subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
about any of this, as you can just pass inputs like you would to any other Python function!
</Tip>
Parameters:
config ([`RobertaConfig`]): Model configuration class with all the parameters of the
model. Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
ROBERTA_INPUTS_DOCSTRING = r"""
"""
@add_start_docstrings(
"The bare RoBERTa Model transformer outputting raw hidden-states without any specific head on top.",
ROBERTA_START_DOCSTRING,
)
class TFRobertaModel(TFRobertaPreTrainedModel):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.roberta = TFRobertaMainLayer(config, name="roberta")
@unpack_inputs
@add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFBaseModelOutputWithPoolingAndCrossAttentions,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
encoder_hidden_states: np.ndarray | tf.Tensor | None = None,
encoder_attention_mask: np.ndarray | tf.Tensor | None = None,
past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: Optional[bool] = False,
) -> Union[Tuple, TFBaseModelOutputWithPoolingAndCrossAttentions]:
r"""
encoder_hidden_states (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
the model is configured as a decoder.
encoder_attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
past_key_values (`Tuple[Tuple[tf.Tensor]]` of length `config.n_layers`)
contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
`decoder_input_ids` of shape `(batch_size, sequence_length)`.
use_cache (`bool`, *optional*, defaults to `True`):
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
`past_key_values`). Set to `False` during training, `True` during generation
"""
outputs = self.roberta(
input_ids=input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
past_key_values=past_key_values,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roberta", None) is not None:
with tf.name_scope(self.roberta.name):
self.roberta.build(None)
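The `past_key_values`/`use_cache` arguments documented in `call` above enable incremental decoding when the model is configured as a decoder. A hedged sketch, assuming a config with `is_decoder=True` (public RoBERTa checkpoints are not trained this way, so only the shapes are meaningful here):

import tensorflow as tf
from transformers import AutoTokenizer, RobertaConfig, TFRobertaModel

tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
config = RobertaConfig.from_pretrained("FacebookAI/roberta-base", is_decoder=True)
model = TFRobertaModel.from_pretrained("FacebookAI/roberta-base", config=config)

ids = tokenizer("Hello world", return_tensors="tf")["input_ids"]
step1 = model(ids, use_cache=True)                 # full pass, fills the cache
next_id = tf.constant([[tokenizer.eos_token_id]])  # pretend this is the next generated token
step2 = model(next_id, past_key_values=step1.past_key_values, use_cache=True)
print(step2.last_hidden_state.shape)               # (1, 1, hidden_size): only the new token is processed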
class TFRobertaLMHead(keras.layers.Layer):
"""Roberta Head for masked language modeling."""
def __init__(self, config, input_embeddings, **kwargs):
super().__init__(**kwargs)
self.config = config
self.hidden_size = config.hidden_size
self.dense = keras.layers.Dense(
config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
self.layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm")
self.act = get_tf_activation("gelu")
self.decoder = input_embeddings
def build(self, input_shape=None):
self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias")
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
if getattr(self, "layer_norm", None) is not None:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build([None, None, self.config.hidden_size])
def get_output_embeddings(self):
return self.decoder
def set_output_embeddings(self, value):
self.decoder.weight = value
self.decoder.vocab_size = shape_list(value)[0]
def get_bias(self):
return {"bias": self.bias}
def set_bias(self, value):
self.bias = value["bias"]
self.config.vocab_size = shape_list(value["bias"])[0]
def call(self, hidden_states):
hidden_states = self.dense(hidden_states)
hidden_states = self.act(hidden_states)
hidden_states = self.layer_norm(hidden_states)
seq_length = shape_list(tensor=hidden_states)[1]
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.hidden_size])
hidden_states = tf.matmul(a=hidden_states, b=self.decoder.weight, transpose_b=True)
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.config.vocab_size])
hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias)
return hidden_states
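`TFRobertaLMHead` ties the output projection to the input embedding matrix: after the dense/GELU/LayerNorm transform, logits are obtained by multiplying with the transposed embedding table and adding a vocabulary-sized bias. A small numerical sketch of that final step (random data, shapes only):

import tensorflow as tf

hidden_size, vocab_size, seq_len = 8, 12, 3
embedding_matrix = tf.random.normal((vocab_size, hidden_size))  # shared with the input embeddings
bias = tf.zeros((vocab_size,))
h = tf.random.normal((1, seq_len, hidden_size))                 # output of dense -> gelu -> layer_norm

flat = tf.reshape(h, (-1, hidden_size))
logits = tf.matmul(flat, embedding_matrix, transpose_b=True)    # same matmul as in `call` above
logits = tf.reshape(logits, (-1, seq_len, vocab_size)) + bias
print(logits.shape)  # (1, 3, 12)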
@add_start_docstrings("""RoBERTa Model with a `language modeling` head on top.""", ROBERTA_START_DOCSTRING)
class TFRobertaForMaskedLM(TFRobertaPreTrainedModel, TFMaskedLanguageModelingLoss):
    _keys_to_ignore_on_load_unexpected = [r"pooler", r"lm_head.decoder.weight"]

    def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.roberta = TFRobertaMainLayer(config, add_pooling_layer=False, name="roberta")
self.lm_head = TFRobertaLMHead(config, self.roberta.embeddings, name="lm_head")
def get_lm_head(self):
return self.lm_head
def get_prefix_bias_name(self):
warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning)
return self.name + "/" + self.lm_head.name
@unpack_inputs
@add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFMaskedLMOutput,
config_class=_CONFIG_FOR_DOC,
mask="<mask>",
expected_output="' Paris'",
expected_loss=0.1,
)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: np.ndarray | tf.Tensor | None = None,
training: Optional[bool] = False,
) -> Union[TFMaskedLMOutput, Tuple[tf.Tensor]]:
r"""
labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
"""
outputs = self.roberta(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
sequence_output = outputs[0]
prediction_scores = self.lm_head(sequence_output)
loss = None if labels is None else self.hf_compute_loss(labels, prediction_scores)
if not return_dict:
output = (prediction_scores,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return TFMaskedLMOutput(
loss=loss,
logits=prediction_scores,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roberta", None) is not None:
with tf.name_scope(self.roberta.name):
self.roberta.build(None)
if getattr(self, "lm_head", None) is not None:
with tf.name_scope(self.lm_head.name):
self.lm_head.build(None)
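A minimal usage sketch for the masked-LM head; it mirrors the `expected_output` declared in the code-sample decorator above, so the decoded token should be something like " Paris", but treat the result as illustrative:

import tensorflow as tf
from transformers import AutoTokenizer, TFRobertaForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
model = TFRobertaForMaskedLM.from_pretrained("FacebookAI/roberta-base")

inputs = tokenizer("The capital of France is <mask>.", return_tensors="tf")
logits = model(**inputs).logits
mask_index = int(tf.where(inputs["input_ids"][0] == tokenizer.mask_token_id)[0, 0])
predicted_id = int(tf.argmax(logits[0, mask_index]))
print(tokenizer.decode([predicted_id]))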
class TFRobertaForCausalLM(TFRobertaPreTrainedModel, TFCausalLanguageModelingLoss):
_keys_to_ignore_on_load_unexpected = [r"pooler", r"lm_head.decoder.weight"]
def __init__(self, config: RobertaConfig, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
if not config.is_decoder:
logger.warning("If you want to use `TFRobertaLMHeadModel` as a standalone, add `is_decoder=True.`")
self.roberta = TFRobertaMainLayer(config, add_pooling_layer=False, name="roberta")
self.lm_head = TFRobertaLMHead(config, input_embeddings=self.roberta.embeddings, name="lm_head")
def get_lm_head(self):
return self.lm_head
def get_prefix_bias_name(self):
warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning)
return self.name + "/" + self.lm_head.name
def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, **model_kwargs):
input_shape = input_ids.shape
if attention_mask is None:
attention_mask = tf.ones(input_shape)
if past_key_values is not None:
input_ids = input_ids[:, -1:]
return {"input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": past_key_values}
@unpack_inputs
@add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFCausalLMOutputWithCrossAttentions,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
encoder_hidden_states: np.ndarray | tf.Tensor | None = None,
encoder_attention_mask: np.ndarray | tf.Tensor | None = None,
past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: np.ndarray | tf.Tensor | None = None,
training: Optional[bool] = False,
    ) -> Union[TFCausalLMOutputWithCrossAttentions, Tuple[tf.Tensor]]:
        # The decoder forward pass (cross-attention inputs, cache handling, LM head
        # projection and shifted-label loss) is elided in this excerpt.
        ...

    def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roberta", None) is not None:
with tf.name_scope(self.roberta.name):
self.roberta.build(None)
if getattr(self, "lm_head", None) is not None:
with tf.name_scope(self.lm_head.name):
self.lm_head.build(None)
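`prepare_inputs_for_generation` above is what lets `generate()` feed only the newest token once a cache exists. A hedged generation sketch; RoBERTa was not pretrained as a decoder, so the continuation will be meaningless and serves only to show the API:

from transformers import AutoTokenizer, TFRobertaForCausalLM

tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
model = TFRobertaForCausalLM.from_pretrained("FacebookAI/roberta-base", is_decoder=True)

inputs = tokenizer("Hello, my dog is", return_tensors="tf")
generated = model.generate(inputs["input_ids"], max_new_tokens=5)
print(tokenizer.decode(generated[0]))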
class TFRobertaClassificationHead(keras.layers.Layer):
"""Head for sentence-level classification tasks."""
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.dense = keras.layers.Dense(
config.hidden_size,
kernel_initializer=get_initializer(config.initializer_range),
activation="tanh",
name="dense",
)
classifier_dropout = (
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
self.dropout = keras.layers.Dropout(classifier_dropout)
self.out_proj = keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="out_proj"
)
self.config = config
def call(self, features, training=False):
x = features[:, 0, :]
x = self.dropout(x, training=training)
x = self.dense(x)
x = self.dropout(x, training=training)
x = self.out_proj(x)
return x
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
if getattr(self, "out_proj", None) is not None:
with tf.name_scope(self.out_proj.name):
self.out_proj.build([None, None, self.config.hidden_size])
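Unlike BERT's pooler, this head reads the hidden state of the first token (`<s>`, RoBERTa's `[CLS]` equivalent) directly from the sequence output; that is what `features[:, 0, :]` selects. A toy sketch of the slicing:

import tensorflow as tf

batch_size, seq_len, hidden_size = 2, 7, 16
sequence_output = tf.random.normal((batch_size, seq_len, hidden_size))
cls_states = sequence_output[:, 0, :]  # the <s> token representation fed to the head
print(cls_states.shape)                # (2, 16)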
@add_start_docstrings(
"""
RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the
pooled output) e.g. for GLUE tasks.
""",
ROBERTA_START_DOCSTRING,
)
class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel, TFSequenceClassificationLoss):
_keys_to_ignore_on_load_unexpected = [r"pooler", r"lm_head"]
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
self.roberta = TFRobertaMainLayer(config, add_pooling_layer=False, name="roberta")
self.classifier = TFRobertaClassificationHead(config, name="classifier")
@unpack_inputs
@add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint="cardiffnlp/twitter-roberta-base-emotion",
output_type=TFSequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
expected_output="'optimism'",
expected_loss=0.08,
)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: np.ndarray | tf.Tensor | None = None,
training: Optional[bool] = False,
) -> Union[TFSequenceClassifierOutput, Tuple[tf.Tensor]]:
r"""
labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss). If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
outputs = self.roberta(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
sequence_output = outputs[0]
logits = self.classifier(sequence_output, training=training)
loss = None if labels is None else self.hf_compute_loss(labels, logits)
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return TFSequenceClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roberta", None) is not None:
with tf.name_scope(self.roberta.name):
self.roberta.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build(None)
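A usage sketch with the emotion checkpoint referenced in the code-sample decorator above (assuming it ships TF weights; otherwise `from_pt=True` can be passed to `from_pretrained`):

import tensorflow as tf
from transformers import AutoTokenizer, TFRobertaForSequenceClassification

name = "cardiffnlp/twitter-roberta-base-emotion"
tokenizer = AutoTokenizer.from_pretrained(name)
model = TFRobertaForSequenceClassification.from_pretrained(name)

inputs = tokenizer("What a wonderful day!", return_tensors="tf")
logits = model(**inputs).logits
print(model.config.id2label[int(tf.argmax(logits, axis=-1)[0])])  # e.g. "optimism"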
@add_start_docstrings(
"""
Roberta Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
softmax) e.g. for RocStories/SWAG tasks.
""",
ROBERTA_START_DOCSTRING,
)
class TFRobertaForMultipleChoice(TFRobertaPreTrainedModel, TFMultipleChoiceLoss):
_keys_to_ignore_on_load_unexpected = [r"lm_head"]
_keys_to_ignore_on_load_missing = [r"dropout"]
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.roberta = TFRobertaMainLayer(config, name="roberta")
self.dropout = keras.layers.Dropout(config.hidden_dropout_prob)
self.classifier = keras.layers.Dense(
1, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFMultipleChoiceModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: np.ndarray | tf.Tensor | None = None,
training: Optional[bool] = False,
) -> Union[TFMultipleChoiceModelOutput, Tuple[tf.Tensor]]:
r"""
labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices]`
where `num_choices` is the size of the second dimension of the input tensors. (See `input_ids` above)
"""
if input_ids is not None:
num_choices = shape_list(input_ids)[1]
seq_length = shape_list(input_ids)[2]
else:
num_choices = shape_list(inputs_embeds)[1]
seq_length = shape_list(inputs_embeds)[2]
flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) if input_ids is not None else None
flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None
flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None
flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None
outputs = self.roberta(
flat_input_ids,
flat_attention_mask,
flat_token_type_ids,
flat_position_ids,
head_mask,
inputs_embeds,
output_attentions,
output_hidden_states,
return_dict=return_dict,
training=training,
)
pooled_output = outputs[1]
pooled_output = self.dropout(pooled_output, training=training)
logits = self.classifier(pooled_output)
reshaped_logits = tf.reshape(logits, (-1, num_choices))
loss = None if labels is None else self.hf_compute_loss(labels, reshaped_logits)
if not return_dict:
output = (reshaped_logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return TFMultipleChoiceModelOutput(
loss=loss,
logits=reshaped_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roberta", None) is not None:
with tf.name_scope(self.roberta.name):
self.roberta.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_size])
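The multiple-choice head expects inputs of shape `(batch_size, num_choices, seq_len)` and flattens them before the shared encoder, as the reshapes in `call` show. A hedged sketch of preparing such a batch:

import tensorflow as tf
from transformers import AutoTokenizer, TFRobertaForMultipleChoice

tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
model = TFRobertaForMultipleChoice.from_pretrained("FacebookAI/roberta-base")

prompt = "The glass fell off the table and"
choices = ["it broke into pieces.", "it started to sing."]
enc = tokenizer([prompt, prompt], choices, return_tensors="tf", padding=True)
inputs = {k: tf.expand_dims(v, 0) for k, v in enc.items()}  # (2, seq_len) -> (1, 2, seq_len)
logits = model(**inputs).logits
print(logits.shape)  # (1, 2): one score per choice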
"""
RoBERTa Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
Named-Entity-Recognition (NER) tasks.
"""
class TFRobertaForTokenClassification(TFRobertaPreTrainedModel, TFTokenClassificationLoss):
_keys_to_ignore_on_load_unexpected = [r"pooler", r"lm_head"]
_keys_to_ignore_on_load_missing = [r"dropout"]
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
self.roberta = TFRobertaMainLayer(config, add_pooling_layer=False, name="roberta")
classifier_dropout = (
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
self.dropout = keras.layers.Dropout(classifier_dropout)
self.classifier = keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint="ydshieh/roberta-large-ner-english",
output_type=TFTokenClassifierOutput,
config_class=_CONFIG_FOR_DOC,
expected_output="['O', 'ORG', 'ORG', 'O', 'O', 'O', 'O', 'O', 'LOC', 'O', 'LOC', 'LOC']",
expected_loss=0.01,
)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: np.ndarray | tf.Tensor | None = None,
training: Optional[bool] = False,
    ) -> Union[TFTokenClassifierOutput, Tuple[tf.Tensor]]:
r"""
labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
outputs = self.roberta(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
sequence_output = outputs[0]
sequence_output = self.dropout(sequence_output, training=training)
logits = self.classifier(sequence_output)
loss = None if labels is None else self.hf_compute_loss(labels, logits)
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return TFTokenClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roberta", None) is not None:
with tf.name_scope(self.roberta.name):
self.roberta.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_size])
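A short usage sketch with the NER checkpoint referenced in the code-sample decorator above (illustrative only):

import tensorflow as tf
from transformers import AutoTokenizer, TFRobertaForTokenClassification

name = "ydshieh/roberta-large-ner-english"
tokenizer = AutoTokenizer.from_pretrained(name)
model = TFRobertaForTokenClassification.from_pretrained(name)

inputs = tokenizer("HuggingFace is based in New York City", return_tensors="tf")
logits = model(**inputs).logits
predictions = tf.argmax(logits, axis=-1)[0]
print([model.config.id2label[int(i)] for i in predictions])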
"""
RoBERTa Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
"""
class TFRobertaForQuestionAnswering(TFRobertaPreTrainedModel, TFQuestionAnsweringLoss):
_keys_to_ignore_on_load_unexpected = [r"pooler", r"lm_head"]
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
self.roberta = TFRobertaMainLayer(config, add_pooling_layer=False, name="roberta")
self.qa_outputs = keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint="ydshieh/roberta-base-squad2",
output_type=TFQuestionAnsweringModelOutput,
config_class=_CONFIG_FOR_DOC,
expected_output="' puppet'",
expected_loss=0.86,
)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
start_positions: np.ndarray | tf.Tensor | None = None,
end_positions: np.ndarray | tf.Tensor | None = None,
training: Optional[bool] = False,
) -> Union[TFQuestionAnsweringModelOutput, Tuple[tf.Tensor]]:
r"""
start_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
are not taken into account for computing the loss.
end_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
are not taken into account for computing the loss.
"""
outputs = self.roberta(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
sequence_output = outputs[0]
logits = self.qa_outputs(sequence_output)
start_logits, end_logits = tf.split(logits, 2, axis=-1)
start_logits = tf.squeeze(start_logits, axis=-1)
end_logits = tf.squeeze(end_logits, axis=-1)
loss = None
if start_positions is not None and end_positions is not None:
labels = {"start_position": start_positions}
labels["end_position"] = end_positions
loss = self.hf_compute_loss(labels, (start_logits, end_logits))
if not return_dict:
output = (start_logits, end_logits) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return TFQuestionAnsweringModelOutput(
loss=loss,
start_logits=start_logits,
end_logits=end_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roberta", None) is not None:
with tf.name_scope(self.roberta.name):
self.roberta.build(None)
if getattr(self, "qa_outputs", None) is not None:
with tf.name_scope(self.qa_outputs.name):
self.qa_outputs.build([None, None, self.config.hidden_size])
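A usage sketch for span extraction with the SQuAD2 checkpoint referenced in the code-sample decorator above; taking the argmax of the start/end logits is the simplest possible decoding and is only illustrative:

import tensorflow as tf
from transformers import AutoTokenizer, TFRobertaForQuestionAnswering

name = "ydshieh/roberta-base-squad2"
tokenizer = AutoTokenizer.from_pretrained(name)
model = TFRobertaForQuestionAnswering.from_pretrained(name)

question, context = "Where do I live?", "My name is Tim and I live in Sweden."
inputs = tokenizer(question, context, return_tensors="tf")
outputs = model(**inputs)
start = int(tf.argmax(outputs.start_logits, axis=-1)[0])
end = int(tf.argmax(outputs.end_logits, axis=-1)[0])
print(tokenizer.decode(inputs["input_ids"][0][start : end + 1]))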
.\models\roberta\tokenization_roberta.py
"""RoBERTa 的分词类。"""
import json
import os
from functools import lru_cache
from typing import List, Optional, Tuple
import regex as re
from ...tokenization_utils import AddedToken, PreTrainedTokenizer
from ...utils import logging
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {
"vocab_file": "vocab.json",
"merges_file": "merges.txt",
}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"FacebookAI/roberta-base": "https://huggingface.co/FacebookAI/roberta-base/resolve/main/vocab.json",
"FacebookAI/roberta-large": "https://huggingface.co/FacebookAI/roberta-large/resolve/main/vocab.json",
"FacebookAI/roberta-large-mnli": "https://huggingface.co/FacebookAI/roberta-large-mnli/resolve/main/vocab.json",
"distilbert/distilroberta-base": "https://huggingface.co/distilbert/distilroberta-base/resolve/main/vocab.json",
"openai-community/roberta-base-openai-detector": "https://huggingface.co/openai-community/roberta-base-openai-detector/resolve/main/vocab.json",
"openai-community/roberta-large-openai-detector": (
"https://huggingface.co/openai-community/roberta-large-openai-detector/resolve/main/vocab.json"
),
},
"merges_file": {
"FacebookAI/roberta-base": "https://huggingface.co/FacebookAI/roberta-base/resolve/main/merges.txt",
"FacebookAI/roberta-large": "https://huggingface.co/FacebookAI/roberta-large/resolve/main/merges.txt",
"FacebookAI/roberta-large-mnli": "https://huggingface.co/FacebookAI/roberta-large-mnli/resolve/main/merges.txt",
"distilbert/distilroberta-base": "https://huggingface.co/distilbert/distilroberta-base/resolve/main/merges.txt",
"openai-community/roberta-base-openai-detector": "https://huggingface.co/openai-community/roberta-base-openai-detector/resolve/main/merges.txt",
"openai-community/roberta-large-openai-detector": (
"https://huggingface.co/openai-community/roberta-large-openai-detector/resolve/main/merges.txt"
),
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"FacebookAI/roberta-base": 512,
"FacebookAI/roberta-large": 512,
"FacebookAI/roberta-large-mnli": 512,
"distilbert/distilroberta-base": 512,
"openai-community/roberta-base-openai-detector": 512,
"openai-community/roberta-large-openai-detector": 512,
}
@lru_cache()
def bytes_to_unicode():
"""
    Returns a list of UTF-8 bytes and a mapping to unicode strings, specifically avoiding the whitespace and control
    characters that the BPE (byte-pair encoding) code cannot handle.
    The reversible BPE codes work on unicode strings, which means you need a large number of unicode characters in
    your vocab if you want to avoid UNK tokens. For something like a 10B-token dataset you end up needing around 5K
    characters for decent coverage, a significant share of a normal ~32K BPE vocab. To avoid that, we keep a lookup
    table between UTF-8 bytes and unicode strings.
"""
bs = (
list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
)
cs = bs[:]
n = 0
for b in range(2**8):
if b not in bs:
bs.append(b)
cs.append(2**8 + n)
n += 1
cs = [chr(n) for n in cs]
return dict(zip(bs, cs))
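A quick illustration of the table this function builds: printable bytes map to themselves, while bytes that would trip up the BPE code (such as the space byte 0x20) are shifted into unused code points, which is why word-initial RoBERTa/GPT-2 vocabulary entries start with "Ġ":

table = bytes_to_unicode()
print(table[ord("A")])  # 'A'
print(table[ord(" ")])  # 'Ġ'  (space is remapped to chr(256 + 32))
print(len(table))       # 256: every byte value gets a unique unicode character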
def get_pairs(word):
"""
    Return the set of symbol pairs in a word.
    A word is represented as a tuple of symbols (symbols being variable-length strings).
"""
pairs = set()
prev_char = word[0]
for char in word[1:]:
pairs.add((prev_char, char))
prev_char = char
return pairs
class RobertaTokenizer(PreTrainedTokenizer):
"""
    Constructs a RoBERTa tokenizer, derived from the GPT-2 tokenizer, using byte-level Byte-Pair-Encoding.

    This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece), so a word
    will be encoded differently depending on whether or not it is at the beginning of the sentence (without a
    leading space).

    You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer or when
    calling it on some text, but since the model was not pretrained this way, it might yield a decrease in
    performance.

    <Tip>

    When used with `is_split_into_words=True`, this tokenizer will add a space before each word (even the first
    one).

    </Tip>

    This tokenizer inherits from [`PreTrainedTokenizer`], which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.
    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        merges_file (`str`):
            Path to the merges file.
        errors (`str`, *optional*, defaults to `"replace"`):
            Paradigm to follow when decoding bytes to UTF-8. See
            [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
        bos_token (`str`, *optional*, defaults to `"<s>"`):
            The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier
            token.
            <Tip>
            When building a sequence using special tokens, this is not the token that is used for the beginning of
            sequence. The token used is the `cls_token`.
            </Tip>
        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.
            <Tip>
            When building a sequence using special tokens, this is not the token that is used for the end of
            sequence. The token used is the `sep_token`.
            </Tip>
        sep_token (`str`, *optional*, defaults to `"</s>"`):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
            for sequence classification or for a text and a question for question answering. It is also used as the
            last token of a sequence built with special tokens.
        cls_token (`str`, *optional*, defaults to `"<s>"`):
            The classifier token which is used when doing sequence classification (classification of the whole
            sequence instead of per-token classification). It is the first token of the sequence when built with
            special tokens.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be
            this token instead.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        mask_token (`str`, *optional*, defaults to `"<mask>"`):
            The token used for masking values. This is the token used when training this model with masked language
            modeling. This is the token which the model will try to predict.
        add_prefix_space (`bool`, *optional*, defaults to `False`):
            Whether or not to add an initial space to the input. This allows to treat the leading word just as any
            other word. (The RoBERTa tokenizer detects the beginning of a word by the preceding space.)
    """
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["input_ids", "attention_mask"]
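A quick illustration of the prefix-space behaviour described in the class docstring: the same word gets different ids depending on whether it is preceded by a space (the exact ids depend on the vocabulary and are shown here only as an example):

from transformers import RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained("FacebookAI/roberta-base")
print(tokenizer("Hello world")["input_ids"])   # e.g. [0, 31414, 232, 2]
print(tokenizer(" Hello world")["input_ids"])  # e.g. [0, 20920, 232, 2]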
def __init__(
self,
vocab_file,
merges_file,
errors="replace",
bos_token="<s>",
eos_token="</s>",
sep_token="</s>",
cls_token="<s>",
unk_token="<unk>",
pad_token="<pad>",
mask_token="<mask>",
add_prefix_space=False,
**kwargs,
):
bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token
cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token
mask_token = (
AddedToken(mask_token, lstrip=True, rstrip=False, normalized=False)
if isinstance(mask_token, str)
else mask_token
)
with open(vocab_file, encoding="utf-8") as vocab_handle:
self.encoder = json.load(vocab_handle)
self.decoder = {v: k for k, v in self.encoder.items()}
self.errors = errors
self.byte_encoder = bytes_to_unicode()
self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
with open(merges_file, encoding="utf-8") as merges_handle:
bpe_merges = merges_handle.read().split("\n")[1:-1]
bpe_merges = [tuple(merge.split()) for merge in bpe_merges]
self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
self.cache = {}
self.add_prefix_space = add_prefix_space
self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
super().__init__(
errors=errors,
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
sep_token=sep_token,
cls_token=cls_token,
pad_token=pad_token,
mask_token=mask_token,
add_prefix_space=add_prefix_space,
**kwargs,
)
@property
def vocab_size(self):
return len(self.encoder)
def get_vocab(self):
vocab = dict(self.encoder).copy()
vocab.update(self.added_tokens_encoder)
return vocab
def _tokenize(self, text):
"""Tokenize a string."""
bpe_tokens = []
for token in re.findall(self.pat, text):
token = "".join(
self.byte_encoder[b] for b in token.encode("utf-8")
)
bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
return bpe_tokens
def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
return self.encoder.get(token, self.encoder.get(self.unk_token))
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
return self.decoder.get(index)
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (string) in a single string."""
text = "".join(tokens)
text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
return text
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
merge_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
)
with open(vocab_file, "w", encoding="utf-8") as f:
f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
index = 0
with open(merge_file, "w", encoding="utf-8") as writer:
writer.write("#version: 0.2\n")
for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
if index != token_index:
logger.warning(
f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
" Please check that the tokenizer is not corrupted!"
)
index = token_index
writer.write(" ".join(bpe_tokens) + "\n")
index += 1
return vocab_file, merge_file
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. A RoBERTa sequence has the following format:
- single sequence: `<s> X </s>`
- pair of sequences: `<s> A </s></s> B </s>`
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
if token_ids_1 is None:
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
cls = [self.cls_token_id]
sep = [self.sep_token_id]
return cls + token_ids_0 + sep + sep + token_ids_1 + sep
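For roberta-base, `<s>` has id 0 and `</s>` has id 2, so the layouts described in the docstring look like this (the 100/200/300 ids are placeholders):

from transformers import RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained("FacebookAI/roberta-base")
print(tokenizer.build_inputs_with_special_tokens([100, 200]))         # [0, 100, 200, 2]            -> <s> A </s>
print(tokenizer.build_inputs_with_special_tokens([100], [200, 300]))  # [0, 100, 2, 2, 200, 300, 2] -> <s> A </s></s> B </s>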
    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` method.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
return super().get_special_tokens_mask(
token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
)
if token_ids_1 is None:
return [1] + ([0] * len(token_ids_0)) + [1]
return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. RoBERTa does not
make use of token type ids, therefore a list of zeros is returned.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of zeros.
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
"""
Prepare text for tokenization, potentially adding a prefix space if required.
Args:
text (str): The input text to be tokenized.
is_split_into_words (bool, optional): Whether the text is already split into words.
**kwargs: Additional keyword arguments.
Returns:
tuple: A tuple containing the modified text and remaining keyword arguments.
"""
add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space)
if (is_split_into_words or add_prefix_space) and (len(text) > 0 and not text[0].isspace()):
text = " " + text
return (text, kwargs)