Transformers Source Code Walkthrough (126)
.\models\xlm\modeling_xlm.py
"""
PyTorch XLM model.
"""
import itertools
import math
from dataclasses import dataclass
from typing import Dict, Optional, Tuple, Union
import numpy as np
import torch
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import gelu
from ...modeling_outputs import (
BaseModelOutput,
MaskedLMOutput,
MultipleChoiceModelOutput,
QuestionAnsweringModelOutput,
SequenceClassifierOutput,
TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel, SequenceSummary, SQuADHead
from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import (
ModelOutput,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_xlm import XLMConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "FacebookAI/xlm-mlm-en-2048"
_CONFIG_FOR_DOC = "XLMConfig"
XLM_PRETRAINED_MODEL_ARCHIVE_LIST = [
"FacebookAI/xlm-mlm-en-2048",
"FacebookAI/xlm-mlm-ende-1024",
"FacebookAI/xlm-mlm-enfr-1024",
"FacebookAI/xlm-mlm-enro-1024",
"FacebookAI/xlm-mlm-tlm-xnli15-1024",
"FacebookAI/xlm-mlm-xnli15-1024",
"FacebookAI/xlm-clm-enfr-1024",
"FacebookAI/xlm-clm-ende-1024",
"FacebookAI/xlm-mlm-17-1280",
"FacebookAI/xlm-mlm-100-1280",
]
def create_sinusoidal_embeddings(n_pos, dim, out):
"""
Create sinusoidal positional embeddings.
Args:
- n_pos (int): Number of positions.
- dim (int): Dimension of embeddings.
- out (Tensor): Output tensor to store the embeddings.
This function computes sinusoidal embeddings based on position and dimension,
storing them in the provided output tensor.
"""
position_enc = np.array([[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for pos in range(n_pos)])
out[:, 0::2] = torch.FloatTensor(np.sin(position_enc[:, 0::2]))
out[:, 1::2] = torch.FloatTensor(np.cos(position_enc[:, 1::2]))
out.detach_()
out.requires_grad = False
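A minimal usage sketch of this helper (toy sizes, chosen only for illustration; it assumes the function above and its `torch`/`numpy` imports are in scope): it fills an `nn.Embedding` weight in place with fixed sine/cosine values and freezes it.

```python
import torch
from torch import nn

n_pos, dim = 8, 6  # arbitrary toy sizes, not tied to any checkpoint
emb = nn.Embedding(n_pos, dim)
with torch.no_grad():  # allow the in-place writes into the leaf parameter
    create_sinusoidal_embeddings(n_pos, dim, out=emb.weight)

print(emb.weight.requires_grad)  # False: the positional table is frozen, not learned
print(emb.weight[0])             # even indices hold sin values, odd indices hold cos values
```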
def get_masks(slen, lengths, causal, padding_mask=None):
"""
Generate masks for hidden states and optionally an attention mask.
Args:
- slen (int): Sequence length.
- lengths (Tensor): Lengths of each sequence in a batch.
- causal (bool): If True, generate a causal (triangular) attention mask.
- padding_mask (Tensor, optional): Mask indicating padded elements.
Returns:
- Tensor: Mask for hidden states.
This function generates a mask to hide elements beyond the actual length
of each sequence, and optionally a causal attention mask if specified.
"""
alen = torch.arange(slen, dtype=torch.long, device=lengths.device)
if padding_mask is not None:
mask = padding_mask
else:
assert lengths.max().item() <= slen
mask = alen < lengths[:, None]
bs = lengths.size(0)
if causal:
attn_mask = alen[None, None, :].repeat(bs, slen, 1) <= alen[None, :, None]
else:
attn_mask = mask
assert mask.size() == (bs, slen)
assert causal is False or attn_mask.size() == (bs, slen, slen)
return mask, attn_mask
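To make the returned shapes concrete, here is a small hedged example (batch of two with made-up lengths, assuming the helper above is in scope):

```python
import torch

lengths = torch.tensor([5, 3])  # true (un-padded) length of each sequence
slen = 5                        # padded sequence length of the batch

mask, attn_mask = get_masks(slen, lengths, causal=False)
print(mask)                          # (2, 5) boolean mask: True up to each length, False on padding
print(torch.equal(mask, attn_mask))  # True: without causality the two masks coincide

_, causal_attn = get_masks(slen, lengths, causal=True)
print(causal_attn.shape)             # (2, 5, 5): each query position only sees itself and earlier positions
```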
class MultiHeadAttention(nn.Module):
NEW_ID = itertools.count()
def __init__(self, n_heads, dim, config):
super().__init__()
self.layer_id = next(MultiHeadAttention.NEW_ID)
self.dim = dim
self.n_heads = n_heads
self.dropout = config.attention_dropout
assert self.dim % self.n_heads == 0
self.q_lin = nn.Linear(dim, dim)
self.k_lin = nn.Linear(dim, dim)
self.v_lin = nn.Linear(dim, dim)
self.out_lin = nn.Linear(dim, dim)
self.pruned_heads = set()
def prune_heads(self, heads):
attention_head_size = self.dim // self.n_heads
if len(heads) == 0:
return
heads, index = find_pruneable_heads_and_indices(heads, self.n_heads, attention_head_size, self.pruned_heads)
self.q_lin = prune_linear_layer(self.q_lin, index)
self.k_lin = prune_linear_layer(self.k_lin, index)
self.v_lin = prune_linear_layer(self.v_lin, index)
self.out_lin = prune_linear_layer(self.out_lin, index, dim=1)
self.n_heads = self.n_heads - len(heads)
self.dim = attention_head_size * self.n_heads
self.pruned_heads = self.pruned_heads.union(heads)
def forward(self, input, mask, kv=None, cache=None, head_mask=None, output_attentions=False):
"""
Self-attention (if kv is None) or attention over source sentence (provided by kv).
"""
bs, qlen, dim = input.size()
if kv is None:
klen = qlen if cache is None else cache["slen"] + qlen
else:
klen = kv.size(1)
n_heads = self.n_heads
dim_per_head = self.dim // n_heads
mask_reshape = (bs, 1, qlen, klen) if mask.dim() == 3 else (bs, 1, 1, klen)
def shape(x):
"""projection"""
return x.view(bs, -1, self.n_heads, dim_per_head).transpose(1, 2)
def unshape(x):
"""compute context"""
return x.transpose(1, 2).contiguous().view(bs, -1, self.n_heads * dim_per_head)
q = shape(self.q_lin(input))
if kv is None:
k = shape(self.k_lin(input))
v = shape(self.v_lin(input))
elif cache is None or self.layer_id not in cache:
k = v = kv
k = shape(self.k_lin(k))
v = shape(self.v_lin(v))
if cache is not None:
if self.layer_id in cache:
if kv is None:
k_, v_ = cache[self.layer_id]
k = torch.cat([k_, k], dim=2)
v = torch.cat([v_, v], dim=2)
else:
k, v = cache[self.layer_id]
cache[self.layer_id] = (k, v)
q = q / math.sqrt(dim_per_head)
scores = torch.matmul(q, k.transpose(2, 3))
mask = (mask == 0).view(mask_reshape).expand_as(scores)
scores.masked_fill_(mask, torch.finfo(scores.dtype).min)
weights = nn.functional.softmax(scores.float(), dim=-1).type_as(scores)
weights = nn.functional.dropout(weights, p=self.dropout, training=self.training)
if head_mask is not None:
weights = weights * head_mask
context = torch.matmul(weights, v)
context = unshape(context)
outputs = (self.out_lin(context),)
if output_attentions:
outputs = outputs + (weights,)
return outputs
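A rough sketch of how the incremental `cache` is consumed during step-by-step decoding. It assumes the `MultiHeadAttention` class above is in scope and uses `XLMConfig` only for its default `emb_dim`/`n_heads`/`attention_dropout` values; the shapes are the point, not the numbers.

```python
import torch
from transformers import XLMConfig

config = XLMConfig()  # defaults: emb_dim=2048, n_heads=16, attention_dropout=0.1
attn = MultiHeadAttention(config.n_heads, config.emb_dim, config).eval()

bs = 2
cache = {"slen": 0}                                # the model normally maintains this counter
x1 = torch.randn(bs, 1, config.emb_dim)            # first decoding step: one new token
out1 = attn(x1, mask=torch.ones(bs, 1), cache=cache)[0]
cache["slen"] += 1                                 # keys/values for step 1 are now stored under layer_id

x2 = torch.randn(bs, 1, config.emb_dim)            # second step: attends over cached + new positions
out2 = attn(x2, mask=torch.ones(bs, 2), cache=cache)[0]
print(out1.shape, out2.shape)                      # both (2, 1, 2048); klen grew from 1 to 2 internally
```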
class TransformerFFN(nn.Module):
def __init__(self, in_dim, dim_hidden, out_dim, config):
super().__init__()
self.dropout = config.dropout
self.lin1 = nn.Linear(in_dim, dim_hidden)
self.lin2 = nn.Linear(dim_hidden, out_dim)
self.act = gelu if config.gelu_activation else nn.functional.relu
self.chunk_size_feed_forward = config.chunk_size_feed_forward
self.seq_len_dim = 1
def forward(self, input):
return apply_chunking_to_forward(self.ff_chunk, self.chunk_size_feed_forward, self.seq_len_dim, input)
def ff_chunk(self, input):
x = self.lin1(input)
x = self.act(x)
x = self.lin2(x)
x = nn.functional.dropout(x, p=self.dropout, training=self.training)
return x
class XLMPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = XLMConfig
load_tf_weights = None
base_model_prefix = "transformer"
def __init__(self, *inputs, **kwargs):
super().__init__(*inputs, **kwargs)
@property
def dummy_inputs(self):
inputs_list = torch.tensor([[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]])
attns_list = torch.tensor([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]])
if self.config.use_lang_emb and self.config.n_langs > 1:
langs_list = torch.tensor([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]])
else:
langs_list = None
return {"input_ids": inputs_list, "attention_mask": attns_list, "langs": langs_list}
def _init_weights(self, module):
"""Initialize the weights."""
if isinstance(module, nn.Embedding):
if self.config is not None and self.config.embed_init_std is not None:
nn.init.normal_(module.weight, mean=0, std=self.config.embed_init_std)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
if isinstance(module, nn.Linear):
if self.config is not None and self.config.init_std is not None:
nn.init.normal_(module.weight, mean=0, std=self.config.init_std)
if module.bias is not None:
nn.init.constant_(module.bias, 0.0)
if isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
@dataclass
class XLMForQuestionAnsweringOutput(ModelOutput):
"""
Base class for outputs of question answering models using a `SquadHead`.
"""
Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned if both `start_positions` and `end_positions` are provided):
分类损失,作为起始标记和结束标记分类损失的总和(如果提供了 `start_positions` 和 `end_positions`)。
start_top_log_probs (`torch.FloatTensor` of shape `(batch_size, config.start_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
开始标记可能性的对数概率,对应于前 `config.start_n_top` 个可能性(使用 Beam Search)。
start_top_index (`torch.LongTensor` of shape `(batch_size, config.start_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
开始标记可能性的索引,对应于前 `config.start_n_top` 个可能性(使用 Beam Search)。
end_top_log_probs (`torch.FloatTensor` of shape `(batch_size, config.start_n_top * config.end_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
结束标记可能性的对数概率,对应于前 `config.start_n_top * config.end_n_top` 个可能性(使用 Beam Search)。
end_top_index (`torch.LongTensor` of shape `(batch_size, config.start_n_top * config.end_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
结束标记可能性的索引,对应于前 `config.start_n_top * config.end_n_top` 个可能性(使用 Beam Search)。
cls_logits (`torch.FloatTensor` of shape `(batch_size,)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
答案是否不可能的标签的对数概率。
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
模型每层的隐藏状态,包括初始嵌入输出,形状为 `(batch_size, sequence_length, hidden_size)`。
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
自注意力机制注意力权重,用于计算自注意力头中的加权平均值,形状为 `(batch_size, num_heads, sequence_length, sequence_length)`。
"""
XLM_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`XLMConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
XLM_INPUTS_DOCSTRING = r"""
"""
@add_start_docstrings(
"The bare XLM Model transformer outputting raw hidden-states without any specific head on top.",
XLM_START_DOCSTRING,
)
class XLMModel(XLMPreTrainedModel):
"""
XLM Model class inheriting from XLMPreTrainedModel.
"""
def get_input_embeddings(self):
"""
Returns the input embeddings of the model.
"""
return self.embeddings
def set_input_embeddings(self, new_embeddings):
"""
Set the input embeddings of the model to new_embeddings.
"""
self.embeddings = new_embeddings
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model.
Args:
heads_to_prune (dict): Dictionary of {layer_num: list of heads to prune in this layer}.
See base class PreTrainedModel.
"""
for layer, heads in heads_to_prune.items():
self.attentions[layer].prune_heads(heads)
@add_start_docstrings_to_model_forward(XLM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
langs: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
lengths: Optional[torch.Tensor] = None,
cache: Optional[Dict[str, torch.Tensor]] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
"""
Forward pass of the XLM model.
Args:
input_ids (torch.Tensor, optional): Indices of input sequence tokens in the vocabulary.
attention_mask (torch.Tensor, optional): Mask to avoid performing attention on padding token indices.
langs (torch.Tensor, optional): Language IDs for multilingual checkpoints, used to select the language embedding.
token_type_ids (torch.Tensor, optional): Segment token indices to indicate first and second portions of the inputs.
position_ids (torch.Tensor, optional): Indices of positions of each input sequence tokens in the position embeddings.
lengths (torch.Tensor, optional): Lengths of each sequence to avoid masking beyond the sequence length.
cache (Dict[str, torch.Tensor], optional): Dictionary with precomputed hidden-states.
head_mask (torch.Tensor, optional): Mask to nullify selected heads of the self-attention modules.
inputs_embeds (torch.Tensor, optional): External embeddings for the input tokens.
output_attentions (bool, optional): Whether to output the attentions weights.
output_hidden_states (bool, optional): Whether to output the hidden states.
return_dict (bool, optional): Whether to return a dictionary instead of a tuple of outputs.
Returns:
BaseModelOutput: Model output that contains various elements depending on the configuration.
"""
class XLMPredLayer(nn.Module):
"""
Prediction layer (cross_entropy or adaptive_softmax).
"""
def __init__(self, config):
super().__init__()
self.asm = config.asm
self.n_words = config.n_words
self.pad_index = config.pad_index
dim = config.emb_dim
if config.asm is False:
self.proj = nn.Linear(dim, config.n_words, bias=True)
else:
self.proj = nn.AdaptiveLogSoftmaxWithLoss(
in_features=dim,
n_classes=config.n_words,
cutoffs=config.asm_cutoffs,
div_value=config.asm_div_value,
head_bias=True,
)
def forward(self, x, y=None):
"""计算损失,并可选地计算分数。"""
outputs = ()
if self.asm is False:
scores = self.proj(x)
outputs = (scores,) + outputs
if y is not None:
loss = nn.functional.cross_entropy(scores.view(-1, self.n_words), y.view(-1), reduction="mean")
outputs = (loss,) + outputs
else:
scores = self.proj.log_prob(x)
outputs = (scores,) + outputs
if y is not None:
_, loss = self.proj(x, y)
outputs = (loss,) + outputs
return outputs
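A hedged sketch of the two tuple shapes this layer returns, using `XLMConfig` defaults (where `asm=False`, so the head is a plain `nn.Linear`); it assumes the `XLMPredLayer` class above is in scope.

```python
import torch
from transformers import XLMConfig

config = XLMConfig()                 # asm=False by default -> plain linear projection to the vocabulary
pred_layer = XLMPredLayer(config)

hidden = torch.randn(2, 7, config.emb_dim)
(scores,) = pred_layer(hidden)       # without labels the tuple holds only the logits
print(scores.shape)                  # (2, 7, config.n_words)

labels = torch.randint(0, config.n_words, (2, 7))
loss, scores = pred_layer(hidden, labels)  # with labels the loss is prepended
print(float(loss) > 0.0)
```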
"""
The XLM Model transformer with a language modeling head on top (linear layer with weights tied to the input
embeddings).
"""
class XLMWithLMHeadModel(XLMPreTrainedModel):
_tied_weights_keys = ["pred_layer.proj.weight"]
def __init__(self, config):
super().__init__(config)
self.transformer = XLMModel(config)
self.pred_layer = XLMPredLayer(config)
self.post_init()
def get_output_embeddings(self):
return self.pred_layer.proj
def set_output_embeddings(self, new_embeddings):
self.pred_layer.proj = new_embeddings
def prepare_inputs_for_generation(self, input_ids, **kwargs):
mask_token_id = self.config.mask_token_id
lang_id = self.config.lang_id
effective_batch_size = input_ids.shape[0]
mask_token = torch.full((effective_batch_size, 1), mask_token_id, dtype=torch.long, device=input_ids.device)
input_ids = torch.cat([input_ids, mask_token], dim=1)
if lang_id is not None:
langs = torch.full_like(input_ids, lang_id)
else:
langs = None
return {"input_ids": input_ids, "langs": langs}
"""
Forward 方法的函数签名注释,描述了输入参数和输出的相关文档字符串。
Parameters:
input_ids (Optional[torch.Tensor]): 输入的 token IDs 张量,默认为 None。
attention_mask (Optional[torch.Tensor]): 注意力掩码张量,默认为 None。
langs (Optional[torch.Tensor]): 语言 ID 张量,默认为 None。
token_type_ids (Optional[torch.Tensor]): token 类型 ID 张量,默认为 None。
position_ids (Optional[torch.Tensor]): 位置 ID 张量,默认为 None。
lengths (Optional[torch.Tensor]): 长度张量,默认为 None。
cache (Optional[Dict[str, torch.Tensor]]): 缓存字典,默认为 None。
head_mask (Optional[torch.Tensor]): 头部掩码张量,默认为 None。
inputs_embeds (Optional[torch.Tensor]): 输入嵌入张量,默认为 None。
labels (Optional[torch.Tensor]): 标签张量,默认为 None。
output_attentions (Optional[bool]): 是否输出注意力,默认为 None。
output_hidden_states (Optional[bool]): 是否输出隐藏状态,默认为 None。
return_dict (Optional[bool]): 是否返回字典,默认为 None。
"""
@add_start_docstrings_to_model_forward(XLM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=MaskedLMOutput,
config_class=_CONFIG_FOR_DOC,
mask="<special1>",
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
langs: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
lengths: Optional[torch.Tensor] = None,
cache: Optional[Dict[str, torch.Tensor]] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, MaskedLMOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
`labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
transformer_outputs = self.transformer(
input_ids,
attention_mask=attention_mask,
langs=langs,
token_type_ids=token_type_ids,
position_ids=position_ids,
lengths=lengths,
cache=cache,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
output = transformer_outputs[0]
outputs = self.pred_layer(output, labels)
if not return_dict:
return outputs + transformer_outputs[1:]
return MaskedLMOutput(
loss=outputs[0] if labels is not None else None,
logits=outputs[0] if labels is None else outputs[1],
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
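For context, a hedged end-to-end usage sketch with the checkpoint referenced in the docstrings (this downloads the weights on first use and needs `sacremoses` installed for the tokenizer):

```python
import torch
from transformers import XLMTokenizer, XLMWithLMHeadModel

tokenizer = XLMTokenizer.from_pretrained("FacebookAI/xlm-mlm-en-2048")
model = XLMWithLMHeadModel.from_pretrained("FacebookAI/xlm-mlm-en-2048")

text = f"Paris is the capital of {tokenizer.mask_token}."
inputs = tokenizer(text, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits  # MaskedLMOutput.logits, shape (1, seq_len, vocab_size)

mask_positions = (inputs["input_ids"] == tokenizer.mask_token_id).nonzero(as_tuple=True)[1]
print(tokenizer.decode(logits[0, mask_positions].argmax(dim=-1)))  # model's guess for the masked word
```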
class XLMForSequenceClassification(XLMPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.config = config
self.transformer = XLMModel(config)
self.sequence_summary = SequenceSummary(config)
self.post_init()
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
langs: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
lengths: Optional[torch.Tensor] = None,
cache: Optional[Dict[str, torch.Tensor]] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, SequenceClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
transformer_outputs = self.transformer(
input_ids,
attention_mask=attention_mask,
langs=langs,
token_type_ids=token_type_ids,
position_ids=position_ids,
lengths=lengths,
cache=cache,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
output = transformer_outputs[0]
logits = self.sequence_summary(output)
loss = None
if labels is not None:
if self.config.problem_type is None:
if self.num_labels == 1:
self.config.problem_type = "regression"
elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
self.config.problem_type = "single_label_classification"
else:
self.config.problem_type = "multi_label_classification"
if self.config.problem_type == "regression":
loss_fct = MSELoss()
if self.num_labels == 1:
loss = loss_fct(logits.squeeze(), labels.squeeze())
else:
loss = loss_fct(logits, labels)
elif self.config.problem_type == "single_label_classification":
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(logits, labels)
if not return_dict:
output = (logits,) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output
return SequenceClassifierOutput(
loss=loss,
logits=logits,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
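The `problem_type` dispatch above can be exercised in isolation with plain tensors; a minimal sketch (arbitrary numbers) of which loss each branch applies:

```python
import torch
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

# num_labels == 1 -> "regression": mean-squared error on the squeezed logits
print(MSELoss()(torch.randn(4, 1).squeeze(), torch.randn(4)))

# integer labels with num_labels > 1 -> "single_label_classification": cross-entropy
print(CrossEntropyLoss()(torch.randn(4, 3), torch.tensor([0, 2, 1, 1])))

# float multi-hot labels -> "multi_label_classification": per-label binary cross-entropy
print(BCEWithLogitsLoss()(torch.randn(4, 3), torch.randint(0, 2, (4, 3)).float()))
```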
@add_start_docstrings(
"""
XLM Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
XLM_START_DOCSTRING,
)
This block declares an XLM model with a span-classification head on top for extractive question-answering tasks such as SQuAD; the docstring above states the model's overall purpose.
class XLMForQuestionAnsweringSimple(XLMPreTrainedModel):
Defines a class named `XLMForQuestionAnsweringSimple`, inheriting from `XLMPreTrainedModel`, for the simple (span-only) question-answering variant.
def __init__(self, config):
super().__init__(config)
self.transformer = XLMModel(config)
self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
self.post_init()
The constructor first calls the parent constructor to set up the model configuration, then builds an `XLMModel` instance as `transformer` and a linear layer `qa_outputs` that predicts answer start and end positions, and finally calls `post_init()` to initialize the weights and apply any final processing.
@add_start_docstrings_to_model_forward(XLM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=QuestionAnsweringModelOutput,
config_class=_CONFIG_FOR_DOC,
)
These decorators attach docstrings to the `forward` method, describing the input and output format of the forward pass and linking to example code and the model configuration.
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
langs: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
lengths: Optional[torch.Tensor] = None,
cache: Optional[Dict[str, torch.Tensor]] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
start_positions: Optional[torch.Tensor] = None,
end_positions: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, QuestionAnsweringModelOutput]:
The `forward` method implements the forward pass of the simple QA head, taking `input_ids`, `attention_mask` and the other arguments listed above.
For comparison, the beam-search variant `XLMForQuestionAnswering` (built on the imported `SQuADHead`) takes the same arguments plus the SQuAD-specific `is_impossible`, `cls_index` and `p_mask` in its own `forward` signature:
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
langs: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
lengths: Optional[torch.Tensor] = None,
cache: Optional[Dict[str, torch.Tensor]] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
start_positions: Optional[torch.Tensor] = None,
end_positions: Optional[torch.Tensor] = None,
is_impossible: Optional[torch.Tensor] = None,
cls_index: Optional[torch.Tensor] = None,
p_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
@add_start_docstrings(
"""
XLM Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
Named-Entity-Recognition (NER) tasks.
""",
XLM_START_DOCSTRING,
)
class XLMForTokenClassification(XLMPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.transformer = XLMModel(config)
self.dropout = nn.Dropout(config.dropout)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.post_init()
@add_start_docstrings_to_model_forward(XLM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TokenClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
langs: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
lengths: Optional[torch.Tensor] = None,
cache: Optional[Dict[str, torch.Tensor]] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, TokenClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.transformer(
input_ids,
attention_mask=attention_mask,
langs=langs,
token_type_ids=token_type_ids,
position_ids=position_ids,
lengths=lengths,
cache=cache,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
sequence_output = self.dropout(sequence_output)
logits = self.classifier(sequence_output)
loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
if not return_dict:
output = (logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
return TokenClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
@add_start_docstrings(
"""
XLM Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
softmax) e.g. for RocStories/SWAG tasks.
""",
XLM_START_DOCSTRING,
)
class XLMForMultipleChoice(XLMPreTrainedModel):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.transformer = XLMModel(config)
self.sequence_summary = SequenceSummary(config)
self.logits_proj = nn.Linear(config.num_labels, 1)
self.post_init()
@add_start_docstrings_to_model_forward(XLM_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=MultipleChoiceModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
langs: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
lengths: Optional[torch.Tensor] = None,
cache: Optional[Dict[str, torch.Tensor]] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, MultipleChoiceModelOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
`input_ids` above)
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
langs = langs.view(-1, langs.size(-1)) if langs is not None else None
inputs_embeds = (
inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
if inputs_embeds is not None
else None
)
if lengths is not None:
logger.warning(
"The `lengths` parameter cannot be used with the XLM multiple choice models. Please use the "
"attention mask instead."
)
lengths = None
transformer_outputs = self.transformer(
input_ids=input_ids,
attention_mask=attention_mask,
langs=langs,
token_type_ids=token_type_ids,
position_ids=position_ids,
lengths=lengths,
cache=cache,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
output = transformer_outputs[0]
logits = self.sequence_summary(output)
logits = self.logits_proj(logits)
reshaped_logits = logits.view(-1, num_choices)
loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
loss = loss_fct(reshaped_logits, labels)
if not return_dict:
output = (reshaped_logits,) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output
return MultipleChoiceModelOutput(
loss=loss,
logits=reshaped_logits,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
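A short sketch of the reshaping the multiple-choice head performs (shapes are illustrative): choices are folded into the batch before the shared transformer runs, and the per-choice scores are folded back for a softmax over choices.

```python
import torch

batch_size, num_choices, seq_len = 2, 4, 7
input_ids = torch.randint(0, 100, (batch_size, num_choices, seq_len))

flat_input_ids = input_ids.view(-1, input_ids.size(-1))       # (8, 7): one row per (example, choice)
print(flat_input_ids.shape)

per_choice_scores = torch.randn(batch_size * num_choices, 1)  # stand-in for the logits_proj output
reshaped_logits = per_choice_scores.view(-1, num_choices)     # (2, 4): one score per choice
print(reshaped_logits.shape)
```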
.\models\xlm\tokenization_xlm.py
"""Tokenization classes for XLM."""
import json
import os
import re
import sys
import unicodedata
from typing import List, Optional, Tuple
from ...tokenization_utils import PreTrainedTokenizer
from ...utils import logging
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {
"vocab_file": "vocab.json",
"merges_file": "merges.txt",
}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"FacebookAI/xlm-mlm-en-2048": "https://huggingface.co/FacebookAI/xlm-mlm-en-2048/resolve/main/vocab.json",
"FacebookAI/xlm-mlm-ende-1024": "https://huggingface.co/FacebookAI/xlm-mlm-ende-1024/resolve/main/vocab.json",
"FacebookAI/xlm-mlm-enfr-1024": "https://huggingface.co/FacebookAI/xlm-mlm-enfr-1024/resolve/main/vocab.json",
"FacebookAI/xlm-mlm-enro-1024": "https://huggingface.co/FacebookAI/xlm-mlm-enro-1024/resolve/main/vocab.json",
"FacebookAI/xlm-mlm-tlm-xnli15-1024": "https://huggingface.co/FacebookAI/xlm-mlm-tlm-xnli15-1024/resolve/main/vocab.json",
"FacebookAI/xlm-mlm-xnli15-1024": "https://huggingface.co/FacebookAI/xlm-mlm-xnli15-1024/resolve/main/vocab.json",
"FacebookAI/xlm-clm-enfr-1024": "https://huggingface.co/FacebookAI/xlm-clm-enfr-1024/resolve/main/vocab.json",
"FacebookAI/xlm-clm-ende-1024": "https://huggingface.co/FacebookAI/xlm-clm-ende-1024/resolve/main/vocab.json",
"FacebookAI/xlm-mlm-17-1280": "https://huggingface.co/FacebookAI/xlm-mlm-17-1280/resolve/main/vocab.json",
"FacebookAI/xlm-mlm-100-1280": "https://huggingface.co/FacebookAI/xlm-mlm-100-1280/resolve/main/vocab.json",
},
"merges_file": {
"FacebookAI/xlm-mlm-en-2048": "https://huggingface.co/FacebookAI/xlm-mlm-en-2048/resolve/main/merges.txt",
"FacebookAI/xlm-mlm-ende-1024": "https://huggingface.co/FacebookAI/xlm-mlm-ende-1024/resolve/main/merges.txt",
"FacebookAI/xlm-mlm-enfr-1024": "https://huggingface.co/FacebookAI/xlm-mlm-enfr-1024/resolve/main/merges.txt",
"FacebookAI/xlm-mlm-enro-1024": "https://huggingface.co/FacebookAI/xlm-mlm-enro-1024/resolve/main/merges.txt",
"FacebookAI/xlm-mlm-tlm-xnli15-1024": "https://huggingface.co/FacebookAI/xlm-mlm-tlm-xnli15-1024/resolve/main/merges.txt",
"FacebookAI/xlm-mlm-xnli15-1024": "https://huggingface.co/FacebookAI/xlm-mlm-xnli15-1024/resolve/main/merges.txt",
"FacebookAI/xlm-clm-enfr-1024": "https://huggingface.co/FacebookAI/xlm-clm-enfr-1024/resolve/main/merges.txt",
"FacebookAI/xlm-clm-ende-1024": "https://huggingface.co/FacebookAI/xlm-clm-ende-1024/resolve/main/merges.txt",
"FacebookAI/xlm-mlm-17-1280": "https://huggingface.co/FacebookAI/xlm-mlm-17-1280/resolve/main/merges.txt",
"FacebookAI/xlm-mlm-100-1280": "https://huggingface.co/FacebookAI/xlm-mlm-100-1280/resolve/main/merges.txt",
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"FacebookAI/xlm-mlm-en-2048": 512,
"FacebookAI/xlm-mlm-ende-1024": 512,
"FacebookAI/xlm-mlm-enfr-1024": 512,
"FacebookAI/xlm-mlm-enro-1024": 512,
"FacebookAI/xlm-mlm-tlm-xnli15-1024": 512,
"FacebookAI/xlm-mlm-xnli15-1024": 512,
"FacebookAI/xlm-clm-enfr-1024": 512,
"FacebookAI/xlm-clm-ende-1024": 512,
"FacebookAI/xlm-mlm-17-1280": 512,
"FacebookAI/xlm-mlm-100-1280": 512,
}
PRETRAINED_INIT_CONFIGURATION = {
"FacebookAI/xlm-mlm-en-2048": {"do_lowercase_and_remove_accent": True},
"FacebookAI/xlm-mlm-ende-1024": {
"do_lowercase_and_remove_accent": True,
"id2lang": {0: "de", 1: "en"},
"lang2id": {"de": 0, "en": 1},
},
"FacebookAI/xlm-mlm-enfr-1024": {
"do_lowercase_and_remove_accent": True,
"id2lang": {0: "en", 1: "fr"},
"lang2id": {"en": 0, "fr": 1},
},
"FacebookAI/xlm-mlm-enro-1024": {
"do_lowercase_and_remove_accent": True,
"id2lang": {0: "en", 1: "ro"},
"lang2id": {"en": 0, "ro": 1},
},
"FacebookAI/xlm-mlm-tlm-xnli15-1024": {
"do_lowercase_and_remove_accent": True,
"id2lang": {
0: "ar",
1: "bg",
2: "de",
3: "el",
4: "en",
5: "es",
6: "fr",
7: "hi",
8: "ru",
9: "sw",
10: "th",
11: "tr",
12: "ur",
13: "vi",
14: "zh",
},
"lang2id": {
"ar": 0,
"bg": 1,
"de": 2,
"el": 3,
"en": 4,
"es": 5,
"fr": 6,
"hi": 7,
"ru": 8,
"sw": 9,
"th": 10,
"tr": 11,
"ur": 12,
"vi": 13,
"zh": 14,
},
},
"FacebookAI/xlm-mlm-xnli15-1024": {
"do_lowercase_and_remove_accent": True,
"id2lang": {
0: "ar",
1: "bg",
2: "de",
3: "el",
4: "en",
5: "es",
6: "fr",
7: "hi",
8: "ru",
9: "sw",
10: "th",
11: "tr",
12: "ur",
13: "vi",
14: "zh",
},
"lang2id": {
"ar": 0,
"bg": 1,
"de": 2,
"el": 3,
"en": 4,
"es": 5,
"fr": 6,
"hi": 7,
"ru": 8,
"sw": 9,
"th": 10,
"tr": 11,
"ur": 12,
"vi": 13,
"zh": 14,
},
},
"FacebookAI/xlm-clm-enfr-1024": {
"do_lowercase_and_remove_accent": True,
"id2lang": {0: "en", 1: "fr"},
"lang2id": {"en": 0, "fr": 1},
},
"FacebookAI/xlm-clm-ende-1024": {
"do_lowercase_and_remove_accent": True,
"id2lang": {0: "de", 1: "en"},
"lang2id": {"de": 0, "en": 1},
},
"FacebookAI/xlm-mlm-17-1280": {
"do_lowercase_and_remove_accent": False,
"id2lang": {
0: "ar",
1: "de",
2: "en",
3: "es",
4: "fr",
5: "hi",
6: "it",
7: "ja",
8: "ko",
9: "nl",
10: "pl",
11: "pt",
12: "ru",
13: "sv",
14: "tr",
15: "vi",
16: "zh",
},
"lang2id": {
"ar": 0,
"de": 1,
"en": 2,
"es": 3,
"fr": 4,
"hi": 5,
"it": 6,
"ja": 7,
"ko": 8,
"nl": 9,
"pl": 10,
"pt": 11,
"ru": 12,
"sv": 13,
"tr": 14,
"vi": 15,
"zh": 16,
},
},
}
def get_pairs(word):
"""
Return set of symbol pairs in a word. word is represented as tuple of symbols (symbols being variable-length
strings)
"""
pairs = set()
prev_char = word[0]
for char in word[1:]:
pairs.add((prev_char, char))
prev_char = char
return pairs
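A one-line example of what this helper yields for a word already split into BPE symbols (the trailing `</w>` marks the end of the word):

```python
print(get_pairs(("l", "o", "w", "e", "r</w>")))
# -> {('l', 'o'), ('o', 'w'), ('w', 'e'), ('e', 'r</w>')} (a set, so ordering may differ)
```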
def lowercase_and_remove_accent(text):
"""
Lowercase and strips accents from a piece of text based on
https://github.com/facebookresearch/XLM/blob/master/tools/lowercase_and_remove_accent.py
"""
text = " ".join(text)
text = text.lower()
text = unicodedata.normalize("NFD", text)
output = []
for char in text:
cat = unicodedata.category(char)
if cat == "Mn":
continue
output.append(char)
return "".join(output).lower().split(" ")
def replace_unicode_punct(text):
"""
Port of https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/replace-unicode-punctuation.perl
"""
text = text.replace(",", ",")
text = re.sub(r"。\s*", ". ", text)
text = text.replace("、", ",")
text = text.replace("”", '"')
text = text.replace("“", '"')
text = text.replace("∶", ":")
text = text.replace(":", ":")
text = text.replace("?", "?")
text = text.replace("《", '"')
text = text.replace("》", '"')
text = text.replace(")", ")")
text = text.replace("!", "!")
text = text.replace("(", "(")
text = text.replace(";", ";")
text = text.replace("1", "1")
text = text.replace("」", '"')
text = text.replace("「", '"')
text = text.replace("0", "0")
text = text.replace("3", "3")
text = text.replace("2", "2")
text = text.replace("5", "5")
text = text.replace("6", "6")
text = text.replace("9", "9")
text = text.replace("7", "7")
text = text.replace("8", "8")
text = text.replace("4", "4")
text = re.sub(r".\s*", ". ", text)
text = text.replace("~", "~")
text = text.replace("’", "'")
text = text.replace("…", "...")
text = text.replace("━", "-")
text = text.replace("〈", "<")
text = text.replace("〉", ">")
text = text.replace("【", "[")
text = text.replace("】", "]")
text = text.replace("%", "%")
return text
def remove_non_printing_char(text):
"""
Port of https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/remove-non-printing-char.perl
"""
output = []
for char in text:
cat = unicodedata.category(char)
if cat.startswith("C"):
continue
output.append(char)
return "".join(output)
def romanian_preprocessing(text):
"""Sennrich's WMT16 scripts for Romanian preprocessing, used by model `FacebookAI/xlm-mlm-enro-1024`"""
text = text.replace("\u015e", "\u0218").replace("\u015f", "\u0219")
text = text.replace("\u0162", "\u021a").replace("\u0163", "\u021b")
text = text.replace("\u0218", "S").replace("\u0219", "s")
text = text.replace("\u021a", "T").replace("\u021b", "t")
text = text.replace("\u0102", "A").replace("\u0103", "a")
text = text.replace("\u00C2", "A").replace("\u00E2", "a")
text = text.replace("\u00CE", "I").replace("\u00EE", "i")
return text
class XLMTokenizer(PreTrainedTokenizer):
"""
Construct an XLM tokenizer. Based on Byte-Pair Encoding. The tokenization process is the following:
- Moses preprocessing and tokenization for most supported languages.
- Language specific tokenization for Chinese (Jieba), Japanese (KyTea) and Thai (PyThaiNLP).
- Optionally lowercases and normalizes all inputs text.
- The arguments `special_tokens` and the function `set_special_tokens`, can be used to add additional symbols (like
"__classify__") to a vocabulary.
- The `lang2id` attribute maps the languages supported by the model with their IDs if provided (automatically set
for pretrained vocabularies).
- The `id2lang` attributes does reverse mapping if provided (automatically set for pretrained vocabularies).
This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
this superclass for more information regarding those methods.
"""
Args:
vocab_file (`str`):
Vocabulary file.
merges_file (`str`):
Merges file.
unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
bos_token (`str`, *optional*, defaults to `"<s>"`):
The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
<Tip>
When building a sequence using special tokens, this is not the token that is used for the beginning of
sequence. The token used is the `cls_token`.
</Tip>
sep_token (`str`, *optional*, defaults to `"</s>"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens.
pad_token (`str`, *optional*, defaults to `"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
cls_token (`str`, *optional*, defaults to `"</s>"`):
The classifier token which is used when doing sequence classification (classification of the whole sequence
instead of per-token classification). It is the first token of the sequence when built with special tokens.
mask_token (`str`, *optional*, defaults to `"<special1>"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
additional_special_tokens (`List[str]`, *optional*, defaults to `['<special0>', '<special1>', '<special2>', '<special3>', '<special4>', '<special5>', '<special6>', '<special7>', '<special8>', '<special9>']`):
List of additional special tokens.
lang2id (`Dict[str, int]`, *optional*):
Dictionary mapping languages string identifiers to their IDs.
id2lang (`Dict[int, str]`, *optional*):
Dictionary mapping language IDs to their string identifiers.
do_lowercase_and_remove_accent (`bool`, *optional*, defaults to `True`):
Whether to lowercase and remove accents when tokenizing.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
def __init__(
self,
vocab_file,
merges_file,
unk_token="<unk>",
bos_token="<s>",
sep_token="</s>",
pad_token="<pad>",
cls_token="</s>",
mask_token="<special1>",
additional_special_tokens=[
"<special0>",
"<special1>",
"<special2>",
"<special3>",
"<special4>",
"<special5>",
"<special6>",
"<special7>",
"<special8>",
"<special9>",
],
lang2id=None,
id2lang=None,
do_lowercase_and_remove_accent=True,
**kwargs,
):
try:
import sacremoses
except ImportError:
raise ImportError(
"You need to install sacremoses to use XLMTokenizer. "
"See https://pypi.org/project/sacremoses/ for installation."
)
self.sm = sacremoses
self.cache_moses_punct_normalizer = {}
self.cache_moses_tokenizer = {}
self.lang_with_custom_tokenizer = {"zh", "th", "ja"}
self.do_lowercase_and_remove_accent = do_lowercase_and_remove_accent
self.lang2id = lang2id
self.id2lang = id2lang
if lang2id is not None and id2lang is not None:
assert len(lang2id) == len(id2lang)
self.ja_word_tokenizer = None
self.zh_word_tokenizer = None
with open(vocab_file, encoding="utf-8") as vocab_handle:
self.encoder = json.load(vocab_handle)
self.decoder = {v: k for k, v in self.encoder.items()}
with open(merges_file, encoding="utf-8") as merges_handle:
merges = merges_handle.read().split("\n")[:-1]
merges = [tuple(merge.split()[:2]) for merge in merges]
self.bpe_ranks = dict(zip(merges, range(len(merges))))
self.cache = {}
super().__init__(
unk_token=unk_token,
bos_token=bos_token,
sep_token=sep_token,
pad_token=pad_token,
cls_token=cls_token,
mask_token=mask_token,
additional_special_tokens=additional_special_tokens,
lang2id=lang2id,
id2lang=id2lang,
do_lowercase_and_remove_accent=do_lowercase_and_remove_accent,
**kwargs,
)
@property
def do_lower_case(self):
return self.do_lowercase_and_remove_accent
def moses_punct_norm(self, text, lang):
if lang not in self.cache_moses_punct_normalizer:
punct_normalizer = self.sm.MosesPunctNormalizer(lang=lang)
self.cache_moses_punct_normalizer[lang] = punct_normalizer
else:
punct_normalizer = self.cache_moses_punct_normalizer[lang]
return punct_normalizer.normalize(text)
def moses_tokenize(self, text, lang):
if lang not in self.cache_moses_tokenizer:
moses_tokenizer = self.sm.MosesTokenizer(lang=lang)
self.cache_moses_tokenizer[lang] = moses_tokenizer
else:
moses_tokenizer = self.cache_moses_tokenizer[lang]
return moses_tokenizer.tokenize(text, return_str=False, escape=False)
def moses_pipeline(self, text, lang):
text = replace_unicode_punct(text)
text = self.moses_punct_norm(text, lang)
text = remove_non_printing_char(text)
return text
def ja_tokenize(self, text):
if self.ja_word_tokenizer is None:
try:
import Mykytea
self.ja_word_tokenizer = Mykytea.Mykytea(
f"-model {os.path.expanduser('~')}/local/share/kytea/model.bin"
)
except (AttributeError, ImportError):
logger.error(
"Make sure you install KyTea (https://github.com/neubig/kytea) and it's python wrapper"
" (https://github.com/chezou/Mykytea-python) with the following steps"
)
logger.error("1. git clone git@github.com:neubig/kytea.git && cd kytea")
logger.error("2. autoreconf -i")
logger.error("3. ./configure --prefix=$HOME/local")
logger.error("4. make && make install")
logger.error("5. pip install kytea")
raise
return list(self.ja_word_tokenizer.getWS(text))
@property
def vocab_size(self):
return len(self.encoder)
def get_vocab(self):
return dict(self.encoder, **self.added_tokens_encoder)
def bpe(self, token):
word = tuple(token[:-1]) + (token[-1] + "</w>",)
if token in self.cache:
return self.cache[token]
pairs = get_pairs(word)
if not pairs:
return token + "</w>"
while True:
bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
if bigram not in self.bpe_ranks:
break
first, second = bigram
new_word = []
i = 0
while i < len(word):
try:
j = word.index(first, i)
except ValueError:
new_word.extend(word[i:])
break
else:
new_word.extend(word[i:j])
i = j
if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
new_word.append(first + second)
i += 2
else:
new_word.append(word[i])
i += 1
new_word = tuple(new_word)
word = new_word
if len(word) == 1:
break
else:
pairs = get_pairs(word)
word = " ".join(word)
if word == "\n </w>":
word = "\n</w>"
self.cache[token] = word
return word
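A toy, hedged trace of the merge loop above, driving the method with a hand-built rank table instead of a real `merges.txt` (the stand-in object only carries the two attributes `bpe` actually reads, `cache` and `bpe_ranks`; the ranks themselves are illustrative):

```python
from types import SimpleNamespace
from transformers import XLMTokenizer

# Illustrative merge ranks: lower rank = merged earlier.
stub = SimpleNamespace(cache={}, bpe_ranks={("l", "o"): 0, ("lo", "w"): 1, ("e", "r</w>"): 2})
print(XLMTokenizer.bpe(stub, "lower"))
# -> "low er</w>": "l o" then "lo w" merge first, then "e r</w>"; remaining pieces are space-joined
```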
def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
return self.encoder.get(token, self.encoder.get(self.unk_token))
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
return self.decoder.get(index, self.unk_token)
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (string) in a single string."""
out_string = "".join(tokens).replace("</w>", " ").strip()
return out_string
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. An XLM sequence has the following format:
- single sequence: `<s> X </s>`
- pair of sequences: `<s> A </s> B </s>`
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
bos = [self.bos_token_id]
sep = [self.sep_token_id]
if token_ids_1 is None:
return bos + token_ids_0 + sep
return bos + token_ids_0 + sep + token_ids_1 + sep
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` method.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
return super().get_special_tokens_mask(
token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
)
if token_ids_1 is not None:
return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
else:
return [1] + ([0] * len(token_ids_0)) + [1]
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. An XLM sequence
pair mask has the following format:
```
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | second sequence |
```
If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
else:
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
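A hedged usage sketch of the three helpers above with placeholder id lists (the checkpoint is loaded only to get the real special-token ids; this downloads the vocab and needs `sacremoses`):

```python
from transformers import XLMTokenizer

tok = XLMTokenizer.from_pretrained("FacebookAI/xlm-mlm-en-2048")
ids_a, ids_b = [41, 42, 43], [51, 52]  # placeholder token ids, purely illustrative

print(tok.build_inputs_with_special_tokens(ids_a))             # <s> A </s>
print(tok.build_inputs_with_special_tokens(ids_a, ids_b))      # <s> A </s> B </s>
print(tok.get_special_tokens_mask(ids_a, ids_b))               # 1s mark the added special tokens
print(tok.create_token_type_ids_from_sequences(ids_a, ids_b))  # 0s for the first segment, 1s for the second
```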
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
merge_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
)
with open(vocab_file, "w", encoding="utf-8") as f:
f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
index = 0
with open(merge_file, "w", encoding="utf-8") as writer:
for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
if index != token_index:
logger.warning(
f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
" Please check that the tokenizer is not corrupted!"
)
index = token_index
writer.write(" ".join(bpe_tokens) + "\n")
index += 1
return vocab_file, merge_file
def __getstate__(self):
state = self.__dict__.copy()
state["sm"] = None
return state
def __setstate__(self, d):
self.__dict__ = d
try:
import sacremoses
except ImportError:
raise ImportError(
"You need to install sacremoses to use XLMTokenizer. "
"See https://pypi.org/project/sacremoses/ for installation."
)
self.sm = sacremoses
.\models\xlm\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_tf_available, is_torch_available
_import_structure = {
"configuration_xlm": ["XLM_PRETRAINED_CONFIG_ARCHIVE_MAP", "XLMConfig", "XLMOnnxConfig"],
"tokenization_xlm": ["XLMTokenizer"],
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_xlm"] = [
"XLM_PRETRAINED_MODEL_ARCHIVE_LIST",
"XLMForMultipleChoice",
"XLMForQuestionAnswering",
"XLMForQuestionAnsweringSimple",
"XLMForSequenceClassification",
"XLMForTokenClassification",
"XLMModel",
"XLMPreTrainedModel",
"XLMWithLMHeadModel",
]
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_tf_xlm"] = [
"TF_XLM_PRETRAINED_MODEL_ARCHIVE_LIST",
"TFXLMForMultipleChoice",
"TFXLMForQuestionAnsweringSimple",
"TFXLMForSequenceClassification",
"TFXLMForTokenClassification",
"TFXLMMainLayer",
"TFXLMModel",
"TFXLMPreTrainedModel",
"TFXLMWithLMHeadModel",
]
if TYPE_CHECKING:
from .configuration_xlm import XLM_PRETRAINED_CONFIG_ARCHIVE_MAP, XLMConfig, XLMOnnxConfig
from .tokenization_xlm import XLMTokenizer
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_xlm import (
XLM_PRETRAINED_MODEL_ARCHIVE_LIST,
XLMForMultipleChoice,
XLMForQuestionAnswering,
XLMForQuestionAnsweringSimple,
XLMForSequenceClassification,
XLMForTokenClassification,
XLMModel,
XLMPreTrainedModel,
XLMWithLMHeadModel,
)
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_tf_xlm import (
TF_XLM_PRETRAINED_MODEL_ARCHIVE_LIST,
TFXLMForMultipleChoice,
TFXLMForQuestionAnsweringSimple,
TFXLMForSequenceClassification,
TFXLMForTokenClassification,
TFXLMMainLayer,
TFXLMModel,
TFXLMPreTrainedModel,
TFXLMWithLMHeadModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\xlm_prophetnet\configuration_xlm_prophetnet.py
from typing import Callable, Optional, Union
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
XLM_PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"microsoft/xprophetnet-large-wiki100-cased": (
"https://huggingface.co/microsoft/xprophetnet-large-wiki100-cased/resolve/main/config.json"
),
}
class XLMProphetNetConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`XLMProphetNetModel`]. It is used to instantiate a
XLMProphetNet model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the XLMProphetNet
[microsoft/xprophetnet-large-wiki100-cased](https://huggingface.co/microsoft/xprophetnet-large-wiki100-cased)
architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
"""
model_type = "xlm-prophetnet"
keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {
"num_attention_heads": "num_encoder_attention_heads",
}
def __init__(
self,
activation_dropout: Optional[float] = 0.1,
activation_function: Optional[Union[str, Callable]] = "gelu",
vocab_size: Optional[int] = 30522,
hidden_size: Optional[int] = 1024,
encoder_ffn_dim: Optional[int] = 4096,
num_encoder_layers: Optional[int] = 12,
num_encoder_attention_heads: Optional[int] = 16,
decoder_ffn_dim: Optional[int] = 4096,
num_decoder_layers: Optional[int] = 12,
num_decoder_attention_heads: Optional[int] = 16,
attention_dropout: Optional[float] = 0.1,
dropout: Optional[float] = 0.1,
max_position_embeddings: Optional[int] = 512,
init_std: Optional[float] = 0.02,
is_encoder_decoder: Optional[bool] = True,
add_cross_attention: Optional[bool] = True,
decoder_start_token_id: Optional[int] = 0,
ngram: Optional[int] = 2,
num_buckets: Optional[int] = 32,
relative_max_distance: Optional[int] = 128,
disable_ngram_loss: Optional[bool] = False,
eps: Optional[float] = 0.0,
use_cache: Optional[bool] = True,
pad_token_id: Optional[int] = 0,
bos_token_id: Optional[int] = 1,
eos_token_id: Optional[int] = 2,
**kwargs,
):
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.encoder_ffn_dim = encoder_ffn_dim
self.num_encoder_layers = num_encoder_layers
self.num_encoder_attention_heads = num_encoder_attention_heads
self.decoder_ffn_dim = decoder_ffn_dim
self.num_decoder_layers = num_decoder_layers
self.num_decoder_attention_heads = num_decoder_attention_heads
self.max_position_embeddings = max_position_embeddings
self.init_std = init_std
self.activation_function = activation_function
self.ngram = ngram
self.num_buckets = num_buckets
self.relative_max_distance = relative_max_distance
self.disable_ngram_loss = disable_ngram_loss
self.eps = eps
self.attention_dropout = attention_dropout
self.activation_dropout = activation_dropout
self.dropout = dropout
self.use_cache = use_cache
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
is_encoder_decoder=is_encoder_decoder,
add_cross_attention=add_cross_attention,
decoder_start_token_id=decoder_start_token_id,
**kwargs,
)
@property
def num_hidden_layers(self) -> int:
return self.num_encoder_layers + self.num_decoder_layers
@num_hidden_layers.setter
def num_hidden_layers(self, value):
raise NotImplementedError(
"This model does not support the setting of `num_hidden_layers`. Please set `num_encoder_layers` and"
" `num_decoder_layers`."
)
.\models\xlm_prophetnet\modeling_xlm_prophetnet.py
""" PyTorch XLM-ProphetNet model."""
import copy
import math
import warnings
from dataclasses import dataclass
from typing import Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import Tensor, nn
from torch.nn import LayerNorm
from ...activations import ACT2FN
from ...modeling_outputs import BaseModelOutput
from ...modeling_utils import PreTrainedModel
from ...utils import (
ModelOutput,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_xlm_prophetnet import XLMProphetNetConfig
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "XLMProphetNetConfig"
XLM_PROPHETNET_PRETRAINED_MODEL_ARCHIVE_LIST = [
"microsoft/xprophetnet-large-wiki100-cased",
]
XLM_PROPHETNET_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
Original ProphetNet code can be found [here](https://github.com/microsoft/ProphetNet). Checkpoints were converted
from original Fairseq checkpoints. For more information on the checkpoint conversion, please take a look at the
file `convert_prophetnet_original_pytorch_checkpoint_to_pytorch.py`.
This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage and
behavior.
Parameters:
config ([`XLMProphetNetConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
XLM_PROPHETNET_INPUTS_DOCSTRING = r"""
"""
# Copied from src.transformers.models.prophetnet.modeling_prophetnet.PROPHETNET_STANDALONE_INPUTS_DOCSTRING with ProphetNet->XLMProphetNet
XLM_PROPHETNET_STANDALONE_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
# Copied from transformers.models.prophetnet.modeling_prophetnet.softmax
def softmax(hidden_state, dim, onnx_trace=False):
"""
Applies softmax function along a specific dimension of the input tensor.
Args:
hidden_state (torch.Tensor): Input tensor to apply softmax.
dim (int): Dimension along which softmax will be computed.
onnx_trace (bool, optional): Whether to trace the operation for ONNX compatibility.
Returns:
torch.Tensor: Tensor after applying softmax along the specified dimension.
"""
if onnx_trace:
return nn.functional.softmax(hidden_state.float(), dim=dim)
else:
return nn.functional.softmax(hidden_state, dim=dim, dtype=torch.float32)
# Copied from transformers.models.prophetnet.modeling_prophetnet.ngram_attention_bias
def ngram_attention_bias(sequence_length, ngram, device, dtype):
"""
Compute n-gram attention bias tensor for ProphetNet.
Args:
sequence_length (int): Length of the input sequence.
ngram (int): Size of the n-gram.
device (torch.device): Device on which to allocate the tensors.
dtype (torch.dtype): Data type of the tensors.
Returns:
torch.Tensor: N-gram attention bias tensor of shape (ngram, sequence_length, 2 * sequence_length).
"""
left_block = (
torch.ones((ngram, sequence_length, sequence_length), device=device, dtype=dtype) * torch.finfo(dtype).min
)
right_block = left_block.detach().clone()
# create bias
for stream_idx in range(ngram):
right_block[stream_idx].fill_diagonal_(0, wrap=False)
left_block[stream_idx].triu_(-stream_idx + 1)
left_block[:, :, 0] = 0
return torch.cat([left_block, right_block], dim=2)
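# A minimal sketch of what the n-gram bias above produces for a tiny case. It mirrors the
# construction above with plain torch (rather than importing the function), so the shape and the
# allowed/forbidden pattern are easy to inspect.
import torch

def tiny_ngram_attention_bias(sequence_length, ngram, dtype=torch.float32):
    neg_inf = torch.finfo(dtype).min
    left = torch.full((ngram, sequence_length, sequence_length), neg_inf, dtype=dtype)
    right = left.clone()
    for stream_idx in range(ngram):
        right[stream_idx].fill_diagonal_(0, wrap=False)  # predict stream sees its own position
        left[stream_idx].triu_(-stream_idx + 1)          # plus the causal prefix of the main stream
    left[:, :, 0] = 0
    return torch.cat([left, right], dim=2)

bias = tiny_ngram_attention_bias(sequence_length=4, ngram=2)
print(bias.shape)         # torch.Size([2, 4, 8])
print((bias == 0).int())  # 1 where attention is allowed, 0 where it is masked out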
# Compute relative position buckets for the given number of buckets, maximum distance, and relative positions
def compute_relative_buckets(num_buckets, max_distance, relative_positions, is_bidirectional=False):
"""
This function computes individual parts of the relative position buckets. For more detail, see paper.
"""
# Invert the relative positions (represent them as negatives)
inv_relative_positions = -relative_positions
# Initialize the relative position buckets
rel_positions_bucket = 0
# Bidirectional case
if is_bidirectional:
# Halve the number of buckets
num_buckets = num_buckets // 2
# Positions whose inverted relative position is negative go into the upper half of the buckets
rel_positions_bucket = (
rel_positions_bucket
+ torch.lt(inv_relative_positions, torch.zeros_like(inv_relative_positions)).int() * num_buckets
)
# Take the absolute value of the relative positions
inv_relative_positions = torch.abs(inv_relative_positions)
else:
# Clamp negative relative positions to zero
inv_relative_positions = torch.max(inv_relative_positions, torch.zeros_like(inv_relative_positions))
# Half of the buckets cover exact (small) offsets
max_exact = num_buckets // 2
# Check which positions count as "small" distances
is_small = torch.lt(inv_relative_positions, max_exact)
# For large distances, compute the bucket index on a logarithmic scale
val_if_large = max_exact + torch.log(inv_relative_positions.float() / max_exact) / math.log(
max_distance / max_exact
) * (num_buckets - max_exact)
# Clamp the bucket index to the valid range
val_if_large = torch.min(val_if_large, torch.ones_like(val_if_large) * (num_buckets - 1)).int()
# Use the exact value for small distances and the logarithmic value for large ones
rel_positions_bucket = rel_positions_bucket + torch.where(is_small, inv_relative_positions.int(), val_if_large)
# Return the relative position buckets
return rel_positions_bucket
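# A small usage sketch of the bucketing above (assuming `compute_relative_buckets` as defined in
# this file is in scope). For a length-4 sequence, relative_positions[i][j] = j - i; future
# (causal) offsets collapse into bucket 0 and past offsets get their own buckets.
import torch

pos = torch.arange(4)
relative_positions = pos[None, None, :] - pos[None, :, None]  # shape [1, 4, 4]

buckets = compute_relative_buckets(32, 128, relative_positions, is_bidirectional=False)
print(buckets)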
# Copied from transformers.models.prophetnet.modeling_prophetnet.compute_all_stream_relative_buckets
# Compute the relative position buckets for all streams
def compute_all_stream_relative_buckets(num_buckets, max_distance, position_ids):
"""
This function computes both main and predict relative position buckets. For more detail, see paper.
"""
# Main stream relative positions
main_stream_relative_positions = position_ids.unsqueeze(1).repeat(1, position_ids.size(-1), 1)
main_stream_relative_positions = main_stream_relative_positions - position_ids.unsqueeze(-1)
# Predict stream relative positions
predicting_stream_relative_positions = torch.cat((position_ids - 1, position_ids), dim=-1).unsqueeze(1)
predicting_stream_relative_positions = predicting_stream_relative_positions.repeat(1, position_ids.size(-1), 1)
predicting_stream_relative_positions = predicting_stream_relative_positions - position_ids.unsqueeze(-1)
# Compute the main and predict stream position buckets
main_relative_position_buckets = compute_relative_buckets(
num_buckets, max_distance, main_stream_relative_positions, is_bidirectional=False
)
predict_relative_position_buckets = compute_relative_buckets(
num_buckets, max_distance, predicting_stream_relative_positions, is_bidirectional=False
)
# Return the buckets for both streams
return main_relative_position_buckets, predict_relative_position_buckets
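# A usage sketch for the helper above (assuming `compute_all_stream_relative_buckets` is in
# scope). position_ids follow the decoder convention of starting at padding_idx + 1 = 1; the main
# stream gets a [1, seq, seq] bucket grid and the predict stream a [1, seq, 2*seq] one.
import torch

position_ids = torch.arange(1, 6).unsqueeze(0)  # shape [1, 5]
main_buckets, predict_buckets = compute_all_stream_relative_buckets(
    num_buckets=32, max_distance=128, position_ids=position_ids
)
print(main_buckets.shape)     # torch.Size([1, 5, 5])
print(predict_buckets.shape)  # torch.Size([1, 5, 10])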
# Copied from transformers.models.prophetnet.modeling_prophetnet.ProphetNetSeq2SeqLMOutput,
# the sequence-to-sequence language model output used by XLMProphetNet
@dataclass
class XLMProphetNetSeq2SeqLMOutput(ModelOutput):
"""
Base class for sequence-to-sequence language models outputs.
"""
loss: Optional[torch.FloatTensor] = None
logits: torch.FloatTensor = None
logits_ngram: Optional[torch.FloatTensor] = None
# Optional tuples of torch tensors storing the decoder states and the attention outputs
past_key_values: Optional[Tuple[torch.FloatTensor]] = None
decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
decoder_ngram_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
decoder_ngram_attentions: Optional[Tuple[torch.FloatTensor]] = None
cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
encoder_last_hidden_state: Optional[torch.FloatTensor] = None
encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
# Property returning the decoder cross-attentions while emitting a deprecation warning
@property
def decoder_cross_attentions(self):
warnings.warn(
"`decoder_cross_attentions` is deprecated and will be removed soon. Please use `cross_attentions`"
" instead.",
FutureWarning,
)
return self.cross_attentions
@dataclass
# XLMProphetNetSeq2SeqModelOutput, a ModelOutput subclass storing the encoder/decoder outputs, including pre-computed hidden states that can speed up sequential decoding
class XLMProphetNetSeq2SeqModelOutput(ModelOutput):
"""
Base class for model encoder's outputs that also contains : pre-computed hidden states that can speed up sequential
decoding.
"""
# Last hidden state of the main stream, torch.FloatTensor
last_hidden_state: torch.FloatTensor
# Optional: last hidden state of the n-gram (predict) stream, torch.FloatTensor
last_hidden_state_ngram: Optional[torch.FloatTensor] = None
# Optional: cached key/value pairs used to speed up sequential decoding, Tuple[torch.FloatTensor]
past_key_values: Optional[Tuple[torch.FloatTensor]] = None
# Optional: decoder hidden states, Tuple[torch.FloatTensor]
decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
# Optional: decoder n-gram hidden states, Tuple[torch.FloatTensor]
decoder_ngram_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
# Optional: decoder attention weights, Tuple[torch.FloatTensor]
decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
# Optional: decoder n-gram attention weights, Tuple[torch.FloatTensor]
decoder_ngram_attentions: Optional[Tuple[torch.FloatTensor]] = None
# Optional: cross-attention weights, Tuple[torch.FloatTensor]
cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
# Optional: last hidden state of the encoder, torch.FloatTensor
encoder_last_hidden_state: Optional[torch.FloatTensor] = None
# Optional: encoder hidden states, Tuple[torch.FloatTensor]
encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
# Optional: encoder attention weights, Tuple[torch.FloatTensor]
encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
@property
def decoder_cross_attentions(self):
# Warn that `decoder_cross_attentions` is deprecated and `cross_attentions` should be used instead
warnings.warn(
"`decoder_cross_attentions` is deprecated and will be removed soon. Please use `cross_attentions`"
" instead.",
FutureWarning,
)
# Return the cross-attention weights
return self.cross_attentions
@dataclass
# XLMProphetNetDecoderModelOutput, a ModelOutput subclass storing the decoder model outputs
class XLMProphetNetDecoderModelOutput(ModelOutput):
"""
Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).
"""
# Last hidden state of the main stream, torch.FloatTensor
last_hidden_state: torch.FloatTensor
# Optional: last hidden state of the n-gram (predict) stream, torch.FloatTensor
last_hidden_state_ngram: Optional[torch.FloatTensor] = None
# Optional: cached key/value pairs used to speed up sequential decoding, Tuple[torch.FloatTensor]
past_key_values: Optional[Tuple[torch.FloatTensor]] = None
# Optional: hidden states, Tuple[torch.FloatTensor]
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
# Optional: n-gram hidden states, Tuple[torch.FloatTensor]
hidden_states_ngram: Optional[Tuple[torch.FloatTensor]] = None
# Optional: attention weights, Tuple[torch.FloatTensor]
attentions: Optional[Tuple[torch.FloatTensor]] = None
# Optional: n-gram attention weights, Tuple[torch.FloatTensor]
ngram_attentions: Optional[Tuple[torch.FloatTensor]] = None
# Optional: cross-attention weights, Tuple[torch.FloatTensor]
cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
@dataclass
# XLMProphetNetDecoderLMOutput, a ModelOutput subclass storing the decoder language model outputs
class XLMProphetNetDecoderLMOutput(ModelOutput):
"""
Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).
"""
# Optional: loss value, torch.FloatTensor
loss: Optional[torch.FloatTensor] = None
# Prediction logits, torch.FloatTensor
logits: torch.FloatTensor = None
# Optional: n-gram prediction logits, torch.FloatTensor
logits_ngram: Optional[torch.FloatTensor] = None
# Optional: cached key/value pairs used to speed up sequential decoding, Tuple[torch.FloatTensor]
past_key_values: Optional[Tuple[torch.FloatTensor]] = None
# Optional: hidden states, Tuple[torch.FloatTensor]
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
# Optional: n-gram hidden states, Tuple[torch.FloatTensor]
hidden_states_ngram: Optional[Tuple[torch.FloatTensor]] = None
# Optional: attention weights, Tuple[torch.FloatTensor]
attentions: Optional[Tuple[torch.FloatTensor]] = None
# Optional: n-gram attention weights, Tuple[torch.FloatTensor]
ngram_attentions: Optional[Tuple[torch.FloatTensor]] = None
cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
# Copied from transformers.models.prophetnet.modeling_prophetnet.ProphetNetPreTrainedModel with ProphetNet->XLMProphetNet
class XLMProphetNetPreTrainedModel(PreTrainedModel):
# The configuration class is XLMProphetNetConfig
config_class = XLMProphetNetConfig
# The base model prefix is "prophetnet"
base_model_prefix = "prophetnet"
# Gradient checkpointing is supported
supports_gradient_checkpointing = True
# Initialize module weights, using a different scheme depending on the module type
def _init_weights(self, module):
if isinstance(module, nn.Linear):
# Initialize linear layer weights from a normal distribution
module.weight.data.normal_(mean=0.0, std=self.config.init_std)
# Zero the bias if one exists
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
# Initialize embedding weights from a normal distribution
module.weight.data.normal_(mean=0.0, std=self.config.init_std)
# If a padding_idx is set, zero the weight row at that position
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
# Shift the input ids one position to the right, used to build decoder inputs
def _shift_right(self, input_ids):
decoder_start_token_id = self.config.decoder_start_token_id
pad_token_id = self.config.pad_token_id
# Make sure decoder_start_token_id is defined; it is usually set to pad_token_id
assert decoder_start_token_id is not None, (
"self.model.config.decoder_start_token_id has to be defined. In XLMProphetNet it is usually set to the"
" pad_token_id. See XLMProphetNet docs for more information"
)
# Shift the inputs to the right by one position
shifted_input_ids = input_ids.new_zeros(input_ids.shape)
shifted_input_ids[..., 1:] = input_ids[..., :-1].clone()
shifted_input_ids[..., 0] = decoder_start_token_id
# Make sure pad_token_id is defined; it replaces any -100 values coming from labels
assert pad_token_id is not None, "self.model.config.pad_token_id has to be defined."
shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)
# Make sure all values in shifted_input_ids are non-negative
assert torch.all(shifted_input_ids >= 0).item(), "Verify that `shifted_input_ids` has only positive values"
return shifted_input_ids
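# A minimal sketch of the shifting performed by `_shift_right`, applied to toy labels outside the
# model (decoder_start_token_id and pad_token_id are assumed to be 0 here, matching the config
# defaults above): the sequence moves one step to the right and -100 becomes the pad id.
import torch

labels = torch.tensor([[52, 713, 26, 2, -100, -100]])
decoder_start_token_id, pad_token_id = 0, 0

shifted = labels.new_zeros(labels.shape)
shifted[..., 1:] = labels[..., :-1].clone()
shifted[..., 0] = decoder_start_token_id
shifted.masked_fill_(shifted == -100, pad_token_id)
print(shifted)  # tensor([[  0,  52, 713,  26,   2,   0]])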
# Copied from transformers.models.prophetnet.modeling_prophetnet.ProphetNetPositionalEmbeddings with ProphetNet->XLMProphetNet
class XLMProphetNetPositionalEmbeddings(nn.Embedding):
"""
This module learns positional embeddings up to a fixed maximum size. Padding ids are ignored by either offsetting
based on padding_idx or by setting padding_idx to None and ensuring that the appropriate position ids are passed to
the forward function.
"""
def __init__(self, config: XLMProphetNetConfig) -> None:
# The maximum length is config.max_position_embeddings
self.max_length = config.max_position_embeddings
super().__init__(config.max_position_embeddings, config.hidden_size, config.pad_token_id)
# Forward pass of the positional embedding layer
def forward(self, inputs_shape, device, attention_mask=None, past_key_values=None, position_ids=None):
# Either position_ids is None or padding_idx is not set
assert (position_ids is None) or (
self.padding_idx is None
), "If position_ids is pre-computed then padding_idx should not be set."
# If position_ids is not given, derive it here
if position_ids is None:
# With past_key_values (single-step decoding), every token gets the same position id
if past_key_values is not None:
# Number of input tokens already stored in the cache
prev_num_input_ids = past_key_values[0][0].shape[2]
# Total number of input tokens seen so far
num_input_ids = inputs_shape[1] + prev_num_input_ids
# The new position id is padding_idx + num_input_ids
position_ids = torch.ones((1, 1), dtype=torch.long, device=device) * (
int(self.padding_idx + num_input_ids)
)
else:
# If no attention_mask is given, default to all ones
if attention_mask is None:
attention_mask = torch.ones(inputs_shape, dtype=torch.long, device=device)
# Derive position_ids from input_ids / attention_mask
position_ids = (
torch.cumsum(attention_mask, dim=1).type_as(attention_mask) * attention_mask
).long() + self.padding_idx
# Make sure position_ids do not exceed max_length - 1
position_ids = position_ids.clamp(0, self.max_length - 1)
# Call the parent forward and also return the computed position_ids
return super().forward(position_ids), position_ids
# Private helper that simply calls the parent forward on precomputed position_ids
def _forward(self, position_ids):
return super().forward(position_ids)
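# A small sketch of the cumsum trick used in `forward` above to derive position ids from an
# attention mask: padded positions stay at padding_idx, real tokens count up from padding_idx + 1.
import torch

padding_idx = 0  # config.pad_token_id
attention_mask = torch.tensor([[1, 1, 1, 0, 0],
                               [1, 1, 1, 1, 1]])

position_ids = (torch.cumsum(attention_mask, dim=1).type_as(attention_mask)
                * attention_mask).long() + padding_idx
print(position_ids)
# tensor([[1, 2, 3, 0, 0],
#         [1, 2, 3, 4, 5]])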
# Copied from transformers.models.prophetnet.modeling_prophetnet.ProphetNetAttention with ProphetNet->XLMProphetNet
class XLMProphetNetAttention(nn.Module):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
def __init__(
self,
config: XLMProphetNetConfig,
num_attn_heads: int,
):
super().__init__()
hidden_size = config.hidden_size
self.attention_dropout = config.attention_dropout # 设置注意力(dropout)的概率
self.dropout = config.dropout # 设置全连接层(dropout)的概率
self.num_attn_heads = num_attn_heads # 设置注意力头的数量
self.head_dim = hidden_size // num_attn_heads # 计算每个注意力头的维度
assert self.head_dim * num_attn_heads == hidden_size, (
"`config.hidden_size` must be divisible by `config.num_encoder_attention_heads` and"
" `config.num_decoder_attention_heads`"
)
self.key_proj = nn.Linear(hidden_size, hidden_size) # 创建线性层,用于计算键的投影
self.value_proj = nn.Linear(hidden_size, hidden_size) # 创建线性层,用于计算值的投影
self.query_proj = nn.Linear(hidden_size, hidden_size) # 创建线性层,用于计算查询的投影
self.out_proj = nn.Linear(hidden_size, hidden_size) # 创建线性层,用于输出投影
def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
return tensor.view(bsz, seq_len, self.num_attn_heads, self.head_dim).transpose(1, 2).contiguous() # 重新形状张量,以便进行多头注意力计算
def forward(
self,
hidden_states,
key_value_states: Optional[Tensor] = None,
attention_mask: Optional[Tensor] = None,
layer_head_mask: Optional[Tensor] = None,
past_key_value: Optional[Tuple[Tensor]] = None,
output_attentions: bool = False,
# Copied from transformers.models.prophetnet.modeling_prophetnet.ProphetNetFeedForward with ProphetNet->XLMProphetNet
class XLMProphetNetFeedForward(nn.Module):
"""
This is the residual two feed-forward layer block based on the original Transformer implementation.
"""
def __init__(self, config: XLMProphetNetConfig, ffn_dim: int):
super().__init__()
self.activation_fn = ACT2FN[config.activation_function] # 设置激活函数
self.intermediate = nn.Linear(config.hidden_size, ffn_dim) # 创建线性层,用于中间变换
self.output = nn.Linear(ffn_dim, config.hidden_size) # 创建线性层,用于输出变换
self.activation_dropout = config.activation_dropout # 设置激活(dropout)的概率
self.dropout = config.dropout # 设置全连接层(dropout)的概率
def forward(self, hidden_states):
hidden_states = self.intermediate(hidden_states) # 中间变换
hidden_states = self.activation_fn(hidden_states) # 激活函数处理
hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) # 激活(dropout)
hidden_states = self.output(hidden_states) # 输出变换
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) # 全连接层(dropout)
return hidden_states
# Copied from transformers.models.prophetnet.modeling_prophetnet.ProphetNetNgramSelfAttention with ProphetNet->XLMProphetNet
class XLMProphetNetNgramSelfAttention(nn.Module):
# 初始化方法,接受一个配置对象 config:XLMProphetNetConfig
def __init__(self, config: XLMProphetNetConfig):
# 调用父类的初始化方法
super().__init__()
# 设置隐藏层大小为 config 中的 hidden_size
self.hidden_size = config.hidden_size
# 设置桶的数量为 config 中的 num_buckets
self.num_buckets = config.num_buckets
# 设置相对最大距离为 config 中的 relative_max_distance
self.relative_max_distance = config.relative_max_distance
# 设置注意力头的数量为 config 中的 num_decoder_attention_heads
self.num_attn_heads = config.num_decoder_attention_heads
# 设置全连接层的 dropout 率为 config 中的 dropout
self.dropout = config.dropout
# 设置注意力机制的 dropout 率为 config 中的 attention_dropout
self.attention_dropout = config.attention_dropout
# 设置每个注意力头的维度为 hidden_size / num_attn_heads
self.head_dim = config.hidden_size // self.num_attn_heads
# 设置 ngram 参数为 config 中的 ngram
# 断言条件:确保 hidden_size 能够被 num_attn_heads 整除
assert (
self.head_dim * self.num_attn_heads == config.hidden_size
), "config.hidden_size must be divisible by num_attn_heads"
# key, value, query 的投影层
self.key_proj = nn.Linear(config.hidden_size, config.hidden_size)
self.value_proj = nn.Linear(config.hidden_size, config.hidden_size)
self.query_proj = nn.Linear(config.hidden_size, config.hidden_size)
# 输出投影层
self.out_proj = nn.Linear(config.hidden_size, config.hidden_size)
# 相对位置编码嵌入层
self.relative_pos_embeddings = nn.Linear(config.hidden_size, self.num_buckets * self.num_attn_heads)
# 用于 ONNX 运行时的标志,默认为 False
self.onnx_trace = False
# 将张量 tensor 重新整形为 (batch_size, seq_len, num_attn_heads, head_dim),并进行转置和连续性处理
def _shape(self, tensor, seq_len, batch_size):
return tensor.view(batch_size, seq_len, self.num_attn_heads, self.head_dim).transpose(1, 2).contiguous()
# 准备模型用于 ONNX 导出时设置 onnx_trace 标志为 True
def prepare_for_onnx_export_(self):
self.onnx_trace = True
# 前向传播方法
def forward(
self,
hidden_states,
past_key_value: Optional[Tuple[Tensor]] = None,
attention_mask=None,
layer_head_mask=None,
extended_predict_attention_mask=None,
main_relative_position_buckets=None,
predict_relative_position_buckets=None,
position_ids=None,
):
# Compute the main-stream relative position embeddings
def get_main_relative_pos_embeddings(
self, hidden_states, attn_weights, position_ids, main_relative_position_buckets
):
# input hidden_states [batch_size, sequence_length, hidden_size]
# input attn_weights [batch_size, num_heads, sequence_length, sequence_length]
# input position_ids [batch_size, sequence_length] or [1,1]
batch_size, num_attn_heads, tgt_len, src_len = attn_weights.shape
# 将注意力权重张量重新调整形状为 [batch_size, num_heads, tgt_len, src_len]
attn_weights = attn_weights.view(batch_size, num_attn_heads, tgt_len, src_len)
# 如果未提供主要相对位置桶,则计算它们
if main_relative_position_buckets is None:
batch_size, sequence_length = hidden_states.shape[:2]
# 生成相对位置张量,维度为 [batch_size, sequence_length, sequence_length+1]
relative_positions = (
torch.arange(1, attn_weights.shape[-1] + 1)
.unsqueeze(0)
.unsqueeze(0)
.repeat(batch_size, sequence_length, 1)
.to(position_ids.device)
)
# 计算相对位置差,并减去位置 ID,形成相对位置差矩阵
relative_positions = relative_positions - position_ids.unsqueeze(0).repeat(batch_size, sequence_length, 1)
# 计算主要相对位置桶,用于后续的注意力计算
main_relative_position_buckets = compute_relative_buckets(
self.num_buckets, self.relative_max_distance, relative_positions, False
)
# 计算相对位置编码张量,形状为 [batch_size, sequence_length, num_buckets * num_heads]
rel_pos_embeddings = self.relative_pos_embeddings(hidden_states)
# 调整相对位置编码张量的形状为 [batch_size, sequence_length, num_buckets, num_heads]
rel_pos_embeddings = rel_pos_embeddings.view(
rel_pos_embeddings.shape[:2] + (self.num_buckets, self.num_attn_heads)
)
# 将维度重新排列为 [batch_size, num_heads, sequence_length, num_buckets]
rel_pos_embeddings = rel_pos_embeddings.permute(0, 3, 1, 2)
# 调整形状为 [batch_size, num_heads, sequence_length, num_buckets * 1]
rel_pos_embeddings = rel_pos_embeddings.reshape(attn_weights.shape[:3] + (-1,))
# 将主要相对位置桶扩展到所有头部,形状为 [batch_size * num_heads * sequence_length, sequence_length]
main_relative_position_buckets = main_relative_position_buckets.repeat(1, self.num_attn_heads, 1)
# 调整形状为 [batch_size * num_heads * sequence_length, sequence_length],并转换为长整型
main_relative_position_buckets = main_relative_position_buckets.view(
-1, main_relative_position_buckets.shape[-1]
).long()
# 调整相对位置编码张量的形状,以匹配相应的主要相对位置桶
rel_pos_embeddings = rel_pos_embeddings.reshape(-1, rel_pos_embeddings.size(-1))
# 使用索引从相对位置编码张量中聚合主要相对位置桶对应的编码
main_relative_pos_embeddings = torch.gather(rel_pos_embeddings, dim=1, index=main_relative_position_buckets)
# 调整形状为 [batch_size, num_heads, tgt_len, num_buckets]
main_relative_pos_embeddings = main_relative_pos_embeddings.view(batch_size, num_attn_heads, tgt_len, -1)
# 返回主要相对位置编码张量
return main_relative_pos_embeddings
# Compute the predict-stream relative position embeddings
def get_predict_relative_pos_embeddings(
self, hidden_states, attn_weights, position_ids, predict_relative_position_buckets=None
):
# 获取 hidden_states 的 batch_size 和 sequence_length 维度大小
# hidden_states 的形状为 [batch_size, sequence_length, ngram, hidden_size]
batch_size, sequence_length = hidden_states.shape[0:2]
# 如果 predict_relative_position_buckets 为 None,则计算相对位置
if predict_relative_position_buckets is None:
# 获取 attn_weights 的 key_sequence_length 维度大小
key_sequence_length = attn_weights.shape[-1]
# 断言检查 position_ids 是否正确,应为 1 2 3 4 5 ... (key_sequence_length - 1)
assert (
position_ids[0][0] == key_sequence_length - 1
), "`position_ids` are incorrect. They should be of the format 1 2 3 4 5 ... (key_sequence_length - 1)"
# 创建相对位置张量 relative_positions
relative_positions = (
torch.arange(0, key_sequence_length)
.unsqueeze(0)
.unsqueeze(0)
.repeat(batch_size, sequence_length, 1)
.to(position_ids.device)
)
# 计算相对位置偏移量
relative_positions = relative_positions - position_ids.unsqueeze(0).repeat(batch_size, sequence_length, 1)
# 使用 compute_relative_buckets 计算预测相对位置的桶
predict_relative_position_buckets = compute_relative_buckets(
self.num_buckets, self.relative_max_distance, relative_positions, False
)
# 将 hidden_states 的 ngram 维度与 sequence_length 维度互换位置
hidden_states = hidden_states.transpose(1, 2)
# 计算相对位置嵌入 rel_pos_embeddings
rel_pos_embeddings = self.relative_pos_embeddings(hidden_states)
# 调整 rel_pos_embeddings 的形状为 [batch_size, ngram, sequence_length, num_buckets, num_heads]
rel_pos_embeddings = rel_pos_embeddings.view(
hidden_states.shape[:-1] + (self.num_buckets, self.num_attn_heads)
)
# 将 rel_pos_embeddings 的维度顺序重新排列为 [batch_size, ngram, sequence_length, num_heads, num_buckets]
rel_pos_embeddings = rel_pos_embeddings.permute(0, 2, 1, 4, 3)
# 将 rel_pos_embeddings 展开为二维张量 [batch_size * ngram * sequence_length * num_heads, num_buckets]
rel_pos_embeddings = rel_pos_embeddings.reshape(-1, self.num_buckets)
# 将 predict_relative_position_buckets 在第 0 维度上增加一个维度
predict_relative_position_buckets = predict_relative_position_buckets.unsqueeze(0)
# 在第 0 维度上重复 predict_relative_position_buckets self.ngram 次,
# 在第 1 维度上重复 batch_size 次,第 2 维度上重复 num_attn_heads 次,最后一维度不变
predict_relative_position_buckets = predict_relative_position_buckets.repeat(
self.ngram, 1, self.num_attn_heads, 1
)
# 将 predict_relative_position_buckets 重塑为二维张量 [ngram * batch_size * num_heads * sequence_length, -1]
predict_relative_position_buckets = predict_relative_position_buckets.view(
-1, predict_relative_position_buckets.size(-1)
).long()
# 使用 torch.gather 根据 predict_relative_position_buckets 从 rel_pos_embeddings 中获取预测的相对位置嵌入
predict_relative_pos_embeddings = torch.gather(
rel_pos_embeddings, dim=1, index=predict_relative_position_buckets
)
# 将预测的相对位置嵌入 predict_relative_pos_embeddings 重新调整为形状 [batch_size, gram, num_heads, sequence_length, -1]
predict_relative_pos_embeddings = predict_relative_pos_embeddings.view(
batch_size, self.ngram, self.num_attn_heads, sequence_length, -1
)
# 返回预测的相对位置嵌入 predict_relative_pos_embeddings
return predict_relative_pos_embeddings
# Copied from transformers.models.prophetnet.modeling_prophetnet.ProphetNetEncoderLayer with ProphetNet->XLMProphetNet, Prophetnet->XLMProphetnet
class XLMProphetNetEncoderLayer(nn.Module):
"""
Encoder block for XLMProphetnet
"""
def __init__(self, config: XLMProphetNetConfig):
super().__init__()
# 1st residual block
# 定义自注意力机制层,使用XLMProphetNetAttention模块,配置头数为config.num_encoder_attention_heads
self.self_attn = XLMProphetNetAttention(config, config.num_encoder_attention_heads)
# 定义Layer Normalization层,用于自注意力输出的归一化
self.self_attn_layer_norm = LayerNorm(config.hidden_size)
# 2nd residual block
# 定义前馈神经网络层,使用XLMProphetNetFeedForward模块,配置隐藏层大小为config.encoder_ffn_dim
self.feed_forward = XLMProphetNetFeedForward(config, config.encoder_ffn_dim)
# 定义Layer Normalization层,用于前馈神经网络输出的归一化
self.feed_forward_layer_norm = LayerNorm(config.hidden_size)
def forward(
self,
hidden_states,
attention_mask,
layer_head_mask,
output_attentions: bool = False,
):
# 1st residual block
# 执行自注意力机制,获取注意力输出、注意力权重和无用信息,更新隐藏状态
attention_output, attn_weights, _ = self.self_attn(
hidden_states=hidden_states,
attention_mask=attention_mask,
layer_head_mask=layer_head_mask,
output_attentions=output_attentions,
)
# 对注意力输出和原始隐藏状态进行残差连接后,再进行Layer Normalization
hidden_states = self.self_attn_layer_norm(attention_output + hidden_states)
# 2nd residual block
# 执行前馈神经网络,得到前馈网络的输出
feed_forward_output = self.feed_forward(hidden_states)
# 对前馈网络的输出和原始隐藏状态进行残差连接后,再进行Layer Normalization
hidden_states = self.feed_forward_layer_norm(feed_forward_output + hidden_states)
# 组装输出结果
outputs = (hidden_states,)
if output_attentions:
outputs += (attn_weights,) # 如果需要输出注意力权重,则添加到输出结果中
return outputs
# Copied from transformers.models.prophetnet.modeling_prophetnet.ProphetNetDecoderLayer with Prophetnet->XLMProphetnet, ProphetNet->XLMProphetNet
class XLMProphetNetDecoderLayer(nn.Module):
"""
Decoder block for XLMProphetnet
"""
def __init__(self, config: XLMProphetNetConfig):
super().__init__()
# 1st residual block
# 定义N-gram自注意力机制层,使用XLMProphetNetNgramSelfAttention模块
self.self_attn = XLMProphetNetNgramSelfAttention(config)
# 定义Layer Normalization层,用于自注意力输出的归一化
self.self_attn_layer_norm = LayerNorm(config.hidden_size)
# 2nd residual block
# 如果配置了交叉注意力,定义交叉注意力机制层,使用XLMProphetNetAttention模块,配置头数为config.num_decoder_attention_heads
if config.add_cross_attention:
self.cross_attn = XLMProphetNetAttention(config, config.num_decoder_attention_heads)
# 定义Layer Normalization层,用于交叉注意力输出的归一化
self.cross_attn_layer_norm = LayerNorm(config.hidden_size)
# 3rd residual block
# 定义前馈神经网络层,使用XLMProphetNetFeedForward模块,配置隐藏层大小为config.decoder_ffn_dim
self.feed_forward = XLMProphetNetFeedForward(config, config.decoder_ffn_dim)
# 定义Layer Normalization层,用于前馈神经网络输出的归一化
self.feed_forward_layer_norm = LayerNorm(config.hidden_size)
def forward(
self,
hidden_states,
attention_mask=None,
encoder_hidden_states=None,
encoder_attn_mask=None,
layer_head_mask=None,
cross_attn_layer_head_mask=None,
extended_predict_attention_mask=None,
main_relative_position_buckets=None,
predict_relative_position_buckets=None,
position_ids=None,
past_key_value=None,
use_cache: bool = True,
output_attentions: bool = False,
):
# 1st residual block
# 执行N-gram自注意力机制,更新隐藏状态
attention_output = self.self_attn(
hidden_states=hidden_states,
attention_mask=attention_mask,
layer_head_mask=layer_head_mask,
extended_predict_attention_mask=extended_predict_attention_mask,
main_relative_position_buckets=main_relative_position_buckets,
predict_relative_position_buckets=predict_relative_position_buckets,
position_ids=position_ids,
past_key_value=past_key_value,
use_cache=use_cache,
output_attentions=output_attentions,
)
# 对注意力输出和原始隐藏状态进行残差连接后,再进行Layer Normalization
hidden_states = self.self_attn_layer_norm(attention_output + hidden_states)
# 2nd residual block
if encoder_hidden_states is not None:
# 执行交叉注意力机制,获取注意力输出,更新隐藏状态
cross_attention_output = self.cross_attn(
hidden_states=hidden_states,
attention_mask=encoder_attn_mask,
layer_head_mask=cross_attn_layer_head_mask,
encoder_hidden_states=encoder_hidden_states,
output_attentions=output_attentions,
)
# 对交叉注意力输出和原始隐藏状态进行残差连接后,再进行Layer Normalization
hidden_states = self.cross_attn_layer_norm(cross_attention_output + hidden_states)
# 3rd residual block
# 执行前馈神经网络,得到前馈网络的输出
feed_forward_output = self.feed_forward(hidden_states)
# 对前馈网络的输出和原始隐藏状态进行残差连接后,再进行Layer Normalization
hidden_states = self.feed_forward_layer_norm(feed_forward_output + hidden_states)
# 组装输出结果
outputs = (hidden_states,)
if output_attentions:
outputs += (attention_output[-1],) # 如果需要输出注意力权重,则添加到输出结果中
return outputs
):
# 1st residual block
# 如果过去的键/值对存在,则从中获取自注意力缓存的键/值对的前两个位置,否则设为 None
self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
# 使用自注意力模型处理隐藏状态,生成 ngram_attention_output 是自注意力输出,self_attn_weights 是自注意力权重,self_attn_weights_ngram 是 ngram 注意力权重,present_key_value 是当前的键/值对
ngram_attention_output, self_attn_weights, self_attn_weights_ngram, present_key_value = self.self_attn(
hidden_states=hidden_states,
past_key_value=self_attn_past_key_value,
attention_mask=attention_mask,
layer_head_mask=layer_head_mask,
extended_predict_attention_mask=extended_predict_attention_mask,
main_relative_position_buckets=main_relative_position_buckets,
predict_relative_position_buckets=predict_relative_position_buckets,
position_ids=position_ids,
)
# 将自注意力输出与原始隐藏状态相加,并进行 Layer Normalization
hidden_states = self.self_attn_layer_norm(hidden_states + ngram_attention_output)
# 如果过去的键/值对存在,则从中获取交叉注意力缓存的键/值对的后两个位置,否则设为 None
cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
cross_attn_weights = None
if encoder_hidden_states is not None:
# 2nd residual block
# 如果编码器的隐藏状态存在,则使用交叉注意力模型处理隐藏状态与编码器的键/值状态,生成 attention_output 是交叉注意力输出,cross_attn_weights 是交叉注意力权重,cross_attn_present_key_value 是当前的键/值对
attention_output, cross_attn_weights, cross_attn_present_key_value = self.cross_attn(
hidden_states=hidden_states,
key_value_states=encoder_hidden_states,
attention_mask=encoder_attn_mask,
layer_head_mask=cross_attn_layer_head_mask,
past_key_value=cross_attn_past_key_value,
output_attentions=output_attentions,
)
# 将交叉注意力输出与原始隐藏状态相加,并进行 Layer Normalization
hidden_states = self.cross_attn_layer_norm(attention_output + hidden_states)
# 将交叉注意力的键/值对添加到 present_key_value 中的后两个位置
present_key_value = present_key_value + cross_attn_present_key_value
# 3rd residual block
# 使用前馈神经网络处理隐藏状态,生成 feed_forward_output 是前馈神经网络输出
feed_forward_output = self.feed_forward(hidden_states)
# 将前馈神经网络的输出与原始隐藏状态相加,并进行 Layer Normalization
hidden_states = self.feed_forward_layer_norm(feed_forward_output + hidden_states)
# 将最终的隐藏状态作为输出
outputs = (hidden_states,)
# 如果需要输出注意力权重,则将自注意力和交叉注意力权重添加到输出中
if output_attentions:
outputs += (self_attn_weights, self_attn_weights_ngram, cross_attn_weights)
# 如果需要使用缓存,则将当前的键/值对添加到输出中
if use_cache:
outputs += (present_key_value,)
# 返回最终的输出元组
return outputs
# Add a start docstring describing the standalone encoder part of the XLMProphetNetModel
@add_start_docstrings(
"The standalone encoder part of the XLMProphetNetModel.",
XLM_PROPHETNET_START_DOCSTRING,
)
# Copied from transformers.models.prophetnet.modeling_prophetnet.ProphetNetEncoder with these changes: microsoft/prophetnet-large-uncased->patrickvonplaten/xprophetnet-large-uncased-standalone, ProphetNet->XLMProphetNet, PROPHETNET->XLM_PROPHETNET
class XLMProphetNetEncoder(XLMProphetNetPreTrainedModel):
r"""
word_embeddings (`torch.nn.Embeddings` of shape `(config.vocab_size, config.hidden_size)`, *optional*):
The word embedding parameters. This can be used to initialize [`XLMProphetNetEncoder`] with pre-defined word
embeddings instead of randomly initialized word embeddings.
"""
def __init__(self, config: XLMProphetNetConfig, word_embeddings: nn.Embedding = None):
super().__init__(config)
# 初始化词嵌入,如果未提供则随机初始化,并设置填充索引
self.word_embeddings = (
word_embeddings
if word_embeddings is not None
else nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
)
# 初始化位置嵌入
self.position_embeddings = XLMProphetNetPositionalEmbeddings(config)
# 初始化嵌入层的 LayerNorm
self.embeddings_layer_norm = LayerNorm(config.hidden_size)
# 创建编码器层列表,每层都是 XLMProphetNetEncoderLayer 类的实例
self.layers = nn.ModuleList([XLMProphetNetEncoderLayer(config) for _ in range(config.num_encoder_layers)])
# 是否使用梯度检查点
self.gradient_checkpointing = False
# 初始化权重并应用最终处理
self.post_init()
def get_input_embeddings(self):
# 返回输入的词嵌入
return self.word_embeddings
def set_input_embeddings(self, value):
# 设置输入的词嵌入
self.word_embeddings = value
# 添加起始文档字符串到模型的 forward 方法,提供 XLM_PROPHETNET_STANDALONE_INPUTS_DOCSTRING 描述
@add_start_docstrings_to_model_forward(XLM_PROPHETNET_STANDALONE_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=BaseModelOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
# Initialize the model from the given config and optional word embeddings
def __init__(self, config: XLMProphetNetConfig, word_embeddings: Optional[nn.Embedding] = None):
# 调用父类的初始化方法
super().__init__(config)
# 从配置中获取参数并设置为对象属性
self.ngram = config.ngram # ngram 参数
self.num_buckets = config.num_buckets # 桶的数量
self.relative_max_distance = config.relative_max_distance # 相对最大距离
self.dropout = config.dropout # dropout 比率
self.max_target_positions = config.max_position_embeddings # 最大目标位置数
# 如果提供了词嵌入,则使用提供的;否则创建一个新的词嵌入对象
self.word_embeddings = (
word_embeddings
if word_embeddings is not None
else nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
)
# 创建位置嵌入对象
self.position_embeddings = XLMProphetNetPositionalEmbeddings(config)
# 创建 ngram 嵌入对象
self.ngram_embeddings = nn.Embedding(self.ngram, config.hidden_size, None)
# 创建多个解码层,并组成一个模块列表
self.layers = nn.ModuleList([XLMProphetNetDecoderLayer(config) for _ in range(config.num_decoder_layers)])
# 创建用于层归一化的对象
self.embeddings_layer_norm = LayerNorm(config.hidden_size)
# 初始化梯度检查点标志为 False
self.gradient_checkpointing = False
# 执行初始化权重和最终处理步骤
self.post_init()
# 返回模型的输入词嵌入
def get_input_embeddings(self):
return self.word_embeddings
# 设置模型的输入词嵌入
def set_input_embeddings(self, value):
self.word_embeddings = value
# 前向传播函数,具有详细的文档字符串和输出文档的替换
@add_start_docstrings_to_model_forward(XLM_PROPHETNET_STANDALONE_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=XLMProphetNetDecoderModelOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
inputs_embeds: Optional[torch.Tensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
# Compute buffered relative position buckets
def compute_buffered_relative_buckets(self, position_ids):
# 获取批次大小和序列长度
batch_size, sequence_length = position_ids.shape
# 创建从1到self.max_target_positions的整数序列,并移到设备上
position_ids = torch.arange(1, self.max_target_positions).to(position_ids.device).repeat(batch_size, 1)
# 计算主相对桶和预测相对桶
main_relative_buckets, predict_relative_buckets = compute_all_stream_relative_buckets(
self.num_buckets, self.relative_max_distance, position_ids
)
# 缓冲主相对桶
main_relative_buckets = main_relative_buckets[:, :sequence_length, :sequence_length].repeat(batch_size, 1, 1)
# 缓冲预测相对桶,包括当前目标位置和扩展的序列长度部分
predict_relative_buckets = torch.cat(
[
predict_relative_buckets[:, :sequence_length, :sequence_length],
predict_relative_buckets[
:, :sequence_length, self.max_target_positions : self.max_target_positions + sequence_length
],
],
2,
).repeat(batch_size, 1, 1)
# 返回主相对桶和预测相对桶
return main_relative_buckets, predict_relative_buckets
# Prepare the (causal) attention mask
def prepare_attention_mask(self, hidden_states, attention_mask):
# 获取批次大小和序列长度
batch_size, seq_length = hidden_states.shape[:2]
# 获取因果掩码,用最小值填充
causal_mask = torch.full(
(seq_length, seq_length),
torch.finfo(hidden_states.dtype).min,
dtype=hidden_states.dtype,
device=hidden_states.device,
)
causal_mask = torch.triu(causal_mask, 1) # 取上三角部分作为因果掩码
# 扩展因果掩码以适应批次和注意力头数
extended_causal_mask = causal_mask[:seq_length, :seq_length][None, None, :, :].expand(
(batch_size, self.config.num_decoder_attention_heads) + causal_mask.shape
)
# 添加常规注意力掩码
if attention_mask is not None:
extended_attention_mask = (1.0 - attention_mask[:, None, None, :]) * torch.finfo(hidden_states.dtype).min
extended_attention_mask = extended_causal_mask + extended_attention_mask
else:
extended_attention_mask = extended_causal_mask
# 将注意力掩码转换为hidden_states的dtype并返回
return extended_attention_mask.to(hidden_states.dtype)
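# A minimal sketch of the causal part of `prepare_attention_mask`: an additive mask that is 0 on
# and below the diagonal and the dtype minimum ("minus infinity") strictly above it.
import torch

seq_length, dtype = 4, torch.float32
causal_mask = torch.triu(
    torch.full((seq_length, seq_length), torch.finfo(dtype).min, dtype=dtype), 1
)
print((causal_mask == 0).int())
# tensor([[1, 0, 0, 0],
#         [1, 1, 0, 0],
#         [1, 1, 1, 0],
#         [1, 1, 1, 1]], dtype=torch.int32)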
# Prepare the attention mask for the predict stream
def prepare_predict_attention_mask(self, hidden_states, attention_mask):
# 获取批次大小和序列长度
batch_size, seq_length = hidden_states.shape[:2]
# 获取预测用因果掩码
predict_causal_mask = ngram_attention_bias(
self.max_target_positions, self.ngram, hidden_states.device, hidden_states.dtype
)
# 将因果掩码按照特定规则连接起来,以适应预测流的需要
predict_causal_mask = torch.cat(
[
predict_causal_mask[:, :seq_length, :seq_length],
predict_causal_mask[
:, :seq_length, self.max_target_positions : self.max_target_positions + seq_length
],
],
dim=-1,
)
# 扩展因果掩码以适应批次和注意力头数目
extended_predict_causal_mask = predict_causal_mask[None, None, :, :, :].expand(
(batch_size, self.config.num_decoder_attention_heads) + predict_causal_mask.shape
)
# 添加常规注意力掩码(如果有)
if attention_mask is not None:
# 根据注意力掩码生成扩展的注意力掩码,负无穷处保持不变
extended_attention_mask = (1.0 - attention_mask[:, None, None, None, :]) * torch.finfo(self.dtype).min
extended_attention_mask = extended_attention_mask.expand(
(batch_size, self.config.num_decoder_attention_heads, self.ngram, seq_length, seq_length)
)
# 预测流的注意力掩码应始终为0,将其连接到扩展的注意力掩码中
extended_attention_mask = torch.cat(
[extended_attention_mask, torch.zeros_like(extended_attention_mask)], dim=-1
)
extended_predict_attention_mask = extended_predict_causal_mask + extended_attention_mask
else:
extended_predict_attention_mask = extended_predict_causal_mask
# 返回最终的扩展预测注意力掩码,转换为隐藏状态的数据类型
return extended_predict_attention_mask.to(hidden_states.dtype)
# Add a docstring to the XLMProphetNetModel class, describing it as a XLMProphetNetPreTrainedModel subclass that outputs raw hidden states
@add_start_docstrings(
"The bare XLMProphetNet Model outputting raw hidden-states without any specific head on top.",
XLM_PROPHETNET_START_DOCSTRING,
)
# XLMProphetNetModel, copied from transformers.models.prophetnet.modeling_prophetnet.ProphetNetModel and adapted:
# the original checkpoint microsoft/prophetnet-large-uncased is replaced by patrickvonplaten/xprophetnet-large-uncased-standalone,
# the class name ProphetNetModel becomes XLMProphetNetModel, and related constants and strings are renamed accordingly
class XLMProphetNetModel(XLMProphetNetPreTrainedModel):
# 指定了 encoder 和 decoder 共享权重的键名列表
_tied_weights_keys = ["encoder.word_embeddings.weight", "decoder.word_embeddings.weight"]
def __init__(self, config: XLMProphetNetConfig):
super().__init__(config)
# 初始化词嵌入层,使用配置中的词汇大小、隐藏层大小和填充标识符
self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
# 复制配置以初始化编码器和解码器,确保配置的一致性和独立性
encoder_config = copy.deepcopy(config)
encoder_config.is_encoder_decoder = False
encoder_config.use_cache = False
# 初始化编码器,传入编码器配置和词嵌入层
self.encoder = XLMProphetNetEncoder(encoder_config, self.word_embeddings)
decoder_config = copy.deepcopy(config)
decoder_config.is_decoder = True
decoder_config.is_encoder_decoder = False
# 初始化解码器,传入解码器配置和词嵌入层
self.decoder = XLMProphetNetDecoder(decoder_config, self.word_embeddings)
# 初始化权重并应用最终处理
self.post_init()
# 获取输入词嵌入层
def get_input_embeddings(self):
return self.word_embeddings
# 设置输入词嵌入层
def set_input_embeddings(self, value):
self.word_embeddings = value
self.encoder.word_embeddings = self.word_embeddings
self.decoder.word_embeddings = self.word_embeddings
# 绑定编码器和解码器的词嵌入权重
def _tie_weights(self):
if self.config.tie_word_embeddings:
self._tie_or_clone_weights(self.encoder.word_embeddings, self.word_embeddings)
self._tie_or_clone_weights(self.decoder.word_embeddings, self.word_embeddings)
# 获取编码器对象
def get_encoder(self):
return self.encoder
# 获取解码器对象
def get_decoder(self):
return self.decoder
# 前向传播函数,接受多个输入参数并返回模型输出
@add_start_docstrings_to_model_forward(XLM_PROPHETNET_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=XLMProphetNetSeq2SeqModelOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
decoder_input_ids: Optional[torch.Tensor] = None,
decoder_attention_mask: Optional[torch.BoolTensor] = None,
head_mask: Optional[torch.Tensor] = None,
decoder_head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
encoder_outputs: Optional[Tuple] = None,
past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
inputs_embeds: Optional[torch.Tensor] = None,
decoder_inputs_embeds: Optional[torch.Tensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
"The XLMProphetNet Model with a language modeling head. Can be used for sequence generation tasks.",
# 定义了一个字符串,描述了带有语言建模头部的 XLMProphetNet 模型,适用于序列生成任务。
XLM_PROPHETNET_START_DOCSTRING,
# 引用了常量 XLM_PROPHETNET_START_DOCSTRING,可能是用于生成模型文档字符串的起始标记。
# Copied from transformers.models.prophetnet.modeling_prophetnet.ProphetNetForConditionalGeneration with microsoft/prophetnet-large-uncased->patrickvonplaten/xprophetnet-large-uncased-standalone, ProphetNet->XLMProphetNet, PROPHETNET->XLM_PROPHETNET
class XLMProphetNetForConditionalGeneration(XLMProphetNetPreTrainedModel):
_tied_weights_keys = ["encoder.word_embeddings.weight", "decoder.word_embeddings.weight", "lm_head.weight"]
def __init__(self, config: XLMProphetNetConfig):
super().__init__(config)
self.prophetnet = XLMProphetNetModel(config) # 初始化ProphetNet模型
self.padding_idx = config.pad_token_id # 设置填充索引
self.disable_ngram_loss = config.disable_ngram_loss # 禁用N-gram损失
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) # 初始化线性层用于语言建模头
# Initialize weights and apply final processing
self.post_init() # 调用后续初始化方法
def get_output_embeddings(self):
return self.lm_head # 返回语言建模头的权重
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings # 设置新的语言建模头权重
def _tie_weights(self):
if self.config.tie_word_embeddings:
self._tie_or_clone_weights(self.prophetnet.word_embeddings, self.lm_head) # 如果需要,则绑定或克隆词嵌入的权重到语言建模头
def get_input_embeddings(self):
return self.prophetnet.word_embeddings # 返回ProphetNet模型的词嵌入层
@add_start_docstrings_to_model_forward(XLM_PROPHETNET_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=XLMProphetNetSeq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
decoder_input_ids: Optional[torch.Tensor] = None,
decoder_attention_mask: Optional[torch.BoolTensor] = None,
head_mask: Optional[torch.Tensor] = None,
decoder_head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
encoder_outputs: Optional[torch.Tensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
inputs_embeds: Optional[torch.Tensor] = None,
decoder_inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
"""
Forward pass of XLMProphetNetForConditionalGeneration, taking the inputs listed above and returning the model outputs.
"""
# Detailed forward logic omitted here...
# 计算损失函数,用于模型训练过程中的损失计算
def _compute_loss(self, logits, labels, ignore_index=-100):
# 创建与labels相同维度的零张量,用于存储扩展后的目标标签
expend_targets = labels.new_zeros(self.config.ngram, labels.size(0), labels.size(1)).fill_(ignore_index)
# 根据配置参数ngram扩展目标标签,用于计算ngram损失
for i in range(self.config.ngram):
if i > 0 and self.disable_ngram_loss:
break
expend_targets[i, :, :] = labels
# 调整logits的维度顺序以便计算损失
logits = logits.transpose(0, 1).contiguous()
# 对logits进行log_softmax操作,用于计算负对数似然损失
lprobs = nn.functional.log_softmax(
logits.view(-1, logits.size(-1)),
dim=-1,
dtype=torch.float32,
)
# 计算负对数似然损失
loss = nn.functional.nll_loss(lprobs, expend_targets.view(-1), reduction="mean")
# 如果配置中的平滑因子eps大于0,则进行标签平滑处理
if self.config.eps > 0.0:
smooth_loss = -lprobs.sum(dim=-1, keepdim=True)
non_masked_tokens = expend_targets.ne(ignore_index).view(-1)
smooth_loss = smooth_loss[non_masked_tokens]
smooth_loss = smooth_loss.mean()
eps_i = self.config.eps / lprobs.size(-1)
loss = (1.0 - self.config.eps) * loss + eps_i * smooth_loss
# 返回计算得到的损失
return loss
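# A toy re-run of the loss computation above outside the model (a sketch; `ngram`, `eps` and the
# tensor shapes are made-up values): targets are stacked once per n-gram stream, a mean NLL loss
# is taken over all streams, and label smoothing mixes in the uniform distribution with weight eps.
import torch
from torch import nn

ngram, eps = 2, 0.1
logits = torch.randn(1, ngram, 3, 5)   # [batch, ngram, seq_len, vocab]
labels = torch.tensor([[1, 4, 2]])     # [batch, seq_len]

expend_targets = labels.new_zeros(ngram, 1, 3).fill_(-100)
for i in range(ngram):
    expend_targets[i, :, :] = labels

logits = logits.transpose(0, 1).contiguous()
lprobs = nn.functional.log_softmax(logits.view(-1, logits.size(-1)), dim=-1, dtype=torch.float32)
loss = nn.functional.nll_loss(lprobs, expend_targets.view(-1), reduction="mean")

smooth_loss = -lprobs.sum(dim=-1, keepdim=True)
non_masked_tokens = expend_targets.ne(-100).view(-1)
smooth_loss = smooth_loss[non_masked_tokens].mean()
loss = (1.0 - eps) * loss + (eps / lprobs.size(-1)) * smooth_loss
print(loss)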
# 生成过程中准备输入,返回用于生成的输入参数字典
def prepare_inputs_for_generation(
self,
decoder_input_ids,
past_key_values=None,
attention_mask=None,
head_mask=None,
decoder_head_mask=None,
cross_attn_head_mask=None,
use_cache=None,
encoder_outputs=None,
**kwargs,
):
# 断言encoder_outputs不为None,确保生成过程中有编码器输出
assert encoder_outputs is not None, "`encoder_outputs` have to be passed for generation."
# 如果有过去的键值,将decoder_input_ids限制为最后一个token
if past_key_values:
decoder_input_ids = decoder_input_ids[:, -1:]
# 返回生成过程所需的参数字典
return {
"input_ids": None, # encoder_outputs已定义,不需要input_ids
"encoder_outputs": encoder_outputs,
"past_key_values": past_key_values,
"decoder_input_ids": decoder_input_ids,
"attention_mask": attention_mask,
"head_mask": head_mask,
"decoder_head_mask": decoder_head_mask,
"cross_attn_head_mask": cross_attn_head_mask,
"use_cache": use_cache,
}
# Prepare decoder input ids from the labels, used during decoder generation
def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
return self._shift_right(labels)
# Reorder the cached states for beam search during generation
@staticmethod
def _reorder_cache(past_key_values, beam_idx):
reordered_past = ()
for layer_past in past_key_values:
# 对每一层的缓存数据进行重新排序,以便与beam搜索结果匹配
reordered_past += (
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past[:2])
+ layer_past[2:],
)
return reordered_past
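# A toy illustration of the cache reordering above for beam search (the shapes are made up, and
# this simplified sketch reorders every cached tensor, whereas the method above only re-indexes
# the first two self-attention states of each layer): rows are re-selected along the batch*beam
# dimension according to `beam_idx`.
import torch

past_key_values = ((torch.randn(4, 2, 3, 8), torch.randn(4, 2, 3, 8)),)  # one layer: (key, value)
beam_idx = torch.tensor([0, 0, 2, 3])  # beam 1 dropped, beam 0 duplicated

reordered_past = ()
for layer_past in past_key_values:
    reordered_past += (
        tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
    )
print(reordered_past[0][0].shape)  # torch.Size([4, 2, 3, 8]), rows now follow beam_idx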
# 获取编码器模型
def get_encoder(self):
return self.prophetnet.encoder
# 获取解码器模型
def get_decoder(self):
return self.prophetnet.decoder
@add_start_docstrings(
"The standalone decoder part of the XLMProphetNetModel with a lm head on top. The model can be used for causal"
" language modeling.",
XLM_PROPHETNET_START_DOCSTRING,
)
# Definition of XLMProphetNetForCausalLM, which inherits from XLMProphetNetPreTrainedModel.
# It is the standalone decoder part of the XLMProphetNet model with a language modeling head on top,
# and can be used for causal language modeling.
class XLMProphetNetForCausalLM(XLMProphetNetPreTrainedModel):
# 静态成员变量,用于指定需要共享权重的层
_tied_weights_keys = [
"prophetnet.word_embeddings.weight",
"prophetnet.decoder.word_embeddings.weight",
"lm_head.weight",
]
def __init__(self, config: XLMProphetNetConfig):
# 设置用于条件语言建模的配置
config = copy.deepcopy(config)
config.is_decoder = True # 设置为解码器
config.is_encoder_decoder = False # 不是编码解码模型
super().__init__(config) # 调用父类构造函数,初始化配置
self.prophetnet = XLMProphetNetDecoderWrapper(config) # 初始化 XLMProphetNetDecoderWrapper
self.padding_idx = config.pad_token_id # 设置填充符索引
self.disable_ngram_loss = config.disable_ngram_loss # 是否禁用 ngram 损失
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
# 初始化语言建模头,线性层映射到词汇表大小,无偏置
# 初始化权重并进行最终处理
self.post_init()
def get_input_embeddings(self):
# 返回输入嵌入层,即 prophetnet 解码器的词嵌入层
return self.prophetnet.decoder.word_embeddings
def set_input_embeddings(self, value):
# 设置输入嵌入层
self.prophetnet.decoder.word_embeddings = value
def get_output_embeddings(self):
# 返回输出嵌入层,即语言建模头
return self.lm_head
def set_output_embeddings(self, new_embeddings):
# 设置输出嵌入层
self.lm_head = new_embeddings
def _tie_weights(self):
# 如果配置要求共享词嵌入权重,则共享解码器词嵌入层和语言建模头的权重
if self.config.tie_word_embeddings:
self._tie_or_clone_weights(self.prophetnet.decoder.word_embeddings, self.lm_head)
def set_decoder(self, decoder):
# 设置解码器
self.prophetnet.decoder = decoder
def get_decoder(self):
# 获取解码器
return self.prophetnet.decoder
@add_start_docstrings_to_model_forward(XLM_PROPHETNET_STANDALONE_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=XLMProphetNetDecoderLMOutput, config_class=_CONFIG_FOR_DOC)
# 重写 forward 方法,添加模型输入的文档字符串和输出的类型说明
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
# Forward pass of the model, taking the inputs and control flags listed above
# return_dict controls whether the outputs are returned as a ModelOutput instead of a plain tuple
# 定义一个方法用于计算损失函数,接收模型预测的logits、真实标签、以及一个忽略索引值(默认为-100)
def _compute_loss(self, logits, labels, ignore_index=-100):
# 创建一个与labels相同数据类型和形状的全零张量,填充值为ignore_index,形状为(self.config.ngram, labels.size(0), labels.size(1))
expend_targets = labels.new_zeros(self.config.ngram, labels.size(0), labels.size(1)).fill_(ignore_index)
# 循环创建ngram个维度的标签张量,用于损失计算
for i in range(self.config.ngram):
# 如果i大于0并且self.disable_ngram_loss为True,则退出循环
if i > 0 and self.disable_ngram_loss:
break
# 将labels复制到第i维的标签张量中
expend_targets[i, :, :] = labels
# 转置logits张量,使其形状变为(序列长度, 批次大小, 类别数),并保证内存连续性
logits = logits.transpose(0, 1).contiguous()
# 对logits进行log_softmax操作,计算对数概率,dim=-1表示沿着最后一个维度进行softmax操作,dtype=torch.float32指定数据类型
lprobs = nn.functional.log_softmax(
logits.view(-1, logits.size(-1)), # 将logits视图展平为二维张量
dim=-1,
dtype=torch.float32,
)
# 计算负对数似然损失,将lprobs视图展平为一维张量,expend_targets也展平为一维张量,reduction="mean"表示计算均值
loss = nn.functional.nll_loss(lprobs, expend_targets.view(-1), reduction="mean")
# 如果配置参数self.config.eps大于0.0
if self.config.eps > 0.0:
# 计算平滑损失,对lprobs在最后一个维度求和并保持维度不变
smooth_loss = -lprobs.sum(dim=-1, keepdim=True)
# 获取非遮蔽标记的令牌,即expend_targets不等于ignore_index的元素视图
non_masked_tokens = expend_targets.ne(ignore_index).view(-1)
# 根据非遮蔽标记的令牌,重新计算smooth_loss的均值
smooth_loss = smooth_loss[non_masked_tokens].mean()
# 计算eps_i,即self.config.eps除以lprobs的最后一个维度的长度
eps_i = self.config.eps / lprobs.size(-1)
# 计算最终损失,结合平滑损失和eps_i的影响
loss = (1.0 - self.config.eps) * loss + eps_i * smooth_loss
# 返回计算得到的损失值
return loss
# 定义一个方法,准备生成过程中的输入参数,接收input_ids等参数及其它关键字参数
def prepare_inputs_for_generation(
self,
input_ids,
past_key_values=None,
attention_mask=None,
head_mask=None,
use_cache=None,
**kwargs,
):
# 如果注意力掩码为None,则创建一个与input_ids形状相同的全1张量作为注意力掩码
if attention_mask is None:
attention_mask = input_ids.new_ones(input_ids.shape)
# 如果past_key_values不为None,则只保留input_ids的最后一个时间步作为输入
if past_key_values:
input_ids = input_ids[:, -1:]
# 返回准备好的输入参数字典,包括input_ids、attention_mask、head_mask、past_key_values和use_cache
# input_ids不需要在这里定义,因为encoder_outputs已经定义了
return {
"input_ids": input_ids,
"attention_mask": attention_mask,
"head_mask": head_mask,
"past_key_values": past_key_values,
"use_cache": use_cache,
}
@staticmethod
# 从transformers.models.bart.modeling_bart.BartForCausalLM._reorder_cache中复制而来的方法
def _reorder_cache(past_key_values, beam_idx):
# 重新排序过的过去键值对的元组
reordered_past = ()
# 遍历过去的每一层键值对
for layer_past in past_key_values:
# 对每个过去状态,根据beam_idx重新排序,并放置到reordered_past中
reordered_past += (
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
)
# 返回重新排序后的过去键值对
return reordered_past
# Copied from transformers.models.prophetnet.modeling_prophetnet.ProphetNetDecoderWrapper with ProphetNet->XLMProphetNet, prophetnet->XLMProphetNet
class XLMProphetNetDecoderWrapper(XLMProphetNetPreTrainedModel):
"""
This is a wrapper class, so that [`XLMProphetNetForCausalLM`] can correctly be loaded from pretrained XLMProphetNet classes.
"""
def __init__(self, config: XLMProphetNetConfig):
super().__init__(config)
# 初始化词嵌入层,使用给定的词汇表大小、隐藏大小和填充标记ID
self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
# 初始化解码器,传入配置和词嵌入层
self.decoder = XLMProphetNetDecoder(config, word_embeddings=self.word_embeddings)
# 初始化权重并应用最终处理
self.post_init()
def _tie_weights(self):
# 将词嵌入层的权重与解码器的输入嵌入层权重绑定
self._tie_or_clone_weights(self.word_embeddings, self.decoder.get_input_embeddings())
def forward(self, *args, **kwargs):
# 前向传播,调用解码器的前向方法
return self.decoder(*args, **kwargs)
.\models\xlm_prophetnet\tokenization_xlm_prophetnet.py
import collections
import os
from shutil import copyfile
from typing import Any, Dict, List, Optional, Tuple
from ...tokenization_utils import PreTrainedTokenizer
from ...utils import logging
logger = logging.get_logger(__name__)
SPIECE_UNDERLINE = "▁"
VOCAB_FILES_NAMES = {"vocab_file": "prophetnet.tokenizer"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"microsoft/xprophetnet-large-wiki100-cased": (
"https://huggingface.co/microsoft/xprophetnet-large-wiki100-cased/resolve/main/prophetnet.tokenizer"
),
}
}
PRETRAINED_INIT_CONFIGURATION = {
"microsoft/xprophetnet-large-wiki100-cased": {"do_lower_case": False},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"microsoft/xprophetnet-large-wiki100-cased": 512,
}
def load_vocab(vocab_file):
"""
将词汇文件加载到字典中
:param vocab_file: 词汇文件路径和名称
:type vocab_file: str
"""
vocab = collections.OrderedDict()
with open(vocab_file, "r", encoding="utf-8") as reader:
tokens = reader.readlines()
for index, token in enumerate(tokens):
token = token.rstrip("\n")
vocab[token] = index
return vocab
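# A quick usage sketch of `load_vocab` with a throwaway vocabulary file (the file contents here
# are hypothetical, one token per line; assumes the function defined above is in scope):
import tempfile

with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False, encoding="utf-8") as f:
    f.write("[PAD]\n[CLS]\n[SEP]\nhello\n")
    vocab_path = f.name

vocab = load_vocab(vocab_path)
print(vocab)  # OrderedDict([('[PAD]', 0), ('[CLS]', 1), ('[SEP]', 2), ('hello', 3)])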
class XLMProphetNetTokenizer(PreTrainedTokenizer):
"""
Args:
vocab_file (`str`):
bos_token (`str`, *optional*, defaults to `"[SEP]"`):
eos_token (`str`, *optional*, defaults to `"[SEP]"`):
sep_token (`str`, *optional*, defaults to `"[SEP]"`):
unk_token (`str`, *optional*, defaults to `"[UNK]"`):
pad_token (`str`, *optional*, defaults to `"[PAD]"`):
cls_token (`str`, *optional*, defaults to `"[CLS]"`):
mask_token (`str`, *optional*, defaults to `"[MASK]"`):
sp_model_kwargs (`dict`, *optional*):
Attributes:
sp_model (`SentencePieceProcessor`):
The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
"""
# 定义类变量,指定了模型使用的词汇文件名列表
vocab_files_names = VOCAB_FILES_NAMES
# 定义类变量,指定了预训练模型使用的词汇文件映射
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
# 定义类变量,指定了预训练模型的最大输入尺寸
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
# 定义类变量,指定了模型的输入名称列表
model_input_names = ["input_ids", "attention_mask"]
def __init__(
self,
vocab_file,
bos_token="[SEP]",
eos_token="[SEP]",
sep_token="[SEP]",
unk_token="[UNK]",
pad_token="[PAD]",
cls_token="[CLS]",
mask_token="[MASK]",
sp_model_kwargs: Optional[Dict[str, Any]] = None,
**kwargs,
) -> None:
# 如果没有提供 sp_model_kwargs 则设为空字典
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
try:
# 尝试导入 sentencepiece 库
import sentencepiece as spm
except ImportError:
# 如果导入失败,给出警告并提示用户安装 SentencePiece 库的链接和安装指令
logger.warning(
"You need to install SentencePiece to use XLMRobertaTokenizer: https://github.com/google/sentencepiece"
" pip install sentencepiece"
)
raise
# 初始化 sp_model 属性,使用给定的 sp_model_kwargs 创建 SentencePieceProcessor 对象
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
# 加载词汇文件到 sp_model
self.sp_model.Load(str(vocab_file))
# 保存词汇文件路径到 vocab_file 属性
self.vocab_file = vocab_file
# The original fairseq vocab and the spm vocab have to be "aligned":
# Vocab | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9
# -------- | ------- | ------- | ------ | ------- | --- | --- | --- | ----- | ----- | ----
# fairseq | '<s>' | '<pad>' | '</s>' | '<unk>' | ',' | '.' | '▁' | 's' | '▁de' | '-'
# spm | '<unk>' | '<s>' | '</s>' | ',' | '.' | '▁' | 's' | '▁de' | '-' | '▁a'
# Put the special tokens and the [unused] tokens into the vocabulary
self.fairseq_tokens_to_ids = {"[PAD]": 0, "[CLS]": 1, "[SEP]": 2, "[UNK]": 3, "[MASK]": 4}
for i in range(10):
tok = f"[unused{i}]"
self.fairseq_tokens_to_ids[tok] = 5 + i
# The first "real" token "," is at position 15 in the model vocab and at position 3 in the spm vocab
self.fairseq_offset = 12
# Build fairseq_ids_to_tokens, which maps an id back to its token
self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}
# TODO ArthurZ fairseq_ids_to_tokens should be removed
# 调用父类的初始化方法,传入各种特殊 token 和 sp_model_kwargs 等参数
super().__init__(
bos_token=bos_token,
eos_token=eos_token,
sep_token=sep_token,
unk_token=unk_token,
pad_token=pad_token,
cls_token=cls_token,
mask_token=mask_token,
sp_model_kwargs=self.sp_model_kwargs,
**kwargs,
)
@property
def can_save_slow_tokenizer(self) -> bool:
# A slow tokenizer can only be saved if its vocabulary file still exists on disk
return os.path.isfile(self.vocab_file) if self.vocab_file else False
def __getstate__(self):
# Return a copy of the state with sp_model set to None so the object can be pickled
state = self.__dict__.copy()
state["sp_model"] = None
return state
def __setstate__(self, d):
self.__dict__ = d  # restore the attribute dictionary from `d`
try:
import sentencepiece as spm  # try to import the sentencepiece library
except ImportError:
logger.warning(
"You need to install SentencePiece to use XLMRobertaTokenizer: https://github.com/google/sentencepiece"
" pip install sentencepiece"
)
raise  # remind the user that SentencePiece must be installed
# For backward compatibility
if not hasattr(self, "sp_model_kwargs"):
self.sp_model_kwargs = {}  # default to an empty dict if the attribute is missing
# Re-create the SentencePieceProcessor from `self.sp_model_kwargs` and reload the vocabulary file
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(self.vocab_file)
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` method.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
return super().get_special_tokens_mask(
token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
)  # if special tokens are already present, defer to the parent implementation
if token_ids_1 is None:
return ([0] * len(token_ids_0)) + [1]  # mask for a single sequence plus its trailing [SEP]
return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]  # mask for a sequence pair
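As a quick sanity check, the mask built above marks only the appended `[SEP]` positions; the token IDs in this standalone sketch are made up for illustration.
token_ids_0 = [11, 12, 13]
token_ids_1 = [21, 22]
# single sequence: X X X [SEP] -> [0, 0, 0, 1]
print(([0] * len(token_ids_0)) + [1])
# pair: A A A [SEP] B B [SEP] -> [0, 0, 0, 1, 0, 0, 1]
print(([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1])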
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. XLMProphetNet
does not make use of token type ids, therefore a list of zeros is returned.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of zeros.
"""
sep = [self.sep_token_id]  # separator token id
if token_ids_1 is None:
return len(token_ids_0 + sep) * [0]  # token type ids for a single sequence
return len(token_ids_0 + sep + sep + token_ids_1 + sep) * [0]  # token type ids for a sequence pair
@property
def vocab_size(self):
return len(self.sp_model) + self.fairseq_offset  # spm vocabulary plus the fairseq-specific tokens
def get_vocab(self):
vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}  # build the token -> id map
vocab.update(self.added_tokens_encoder)  # include tokens added after initialization
return vocab
def _tokenize(self, text: str) -> List[str]:
"""Tokenizes the given text with the SentencePiece model and returns the list of sub-word pieces."""
return self.sp_model.encode(text, out_type=str)
def _convert_token_to_id(self, token):
"""Converts a token (str) into its corresponding ID using the vocabulary."""
# Check if the token exists in the predefined Fairseq tokens to IDs mapping
if token in self.fairseq_tokens_to_ids:
return self.fairseq_tokens_to_ids[token]
# Obtain the token's ID from the SentencePiece model
spm_id = self.sp_model.PieceToId(token)
# Return the ID with an offset specific to Fairseq or the unknown token ID if not found
return spm_id + self.fairseq_offset if spm_id else self.unk_token_id
def _convert_id_to_token(self, index):
"""Converts an index (integer) into its corresponding token (str) using the vocabulary."""
# Check if the index exists in the predefined Fairseq IDs to tokens mapping
if index in self.fairseq_ids_to_tokens:
return self.fairseq_ids_to_tokens[index]
# Convert the index to a token using the SentencePiece model adjusted by Fairseq offset
return self.sp_model.IdToPiece(index - self.fairseq_offset)
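A minimal sketch of the offset arithmetic used by `_convert_token_to_id` / `_convert_id_to_token` above; the tiny stand-in vocabulary below is hypothetical, not the real SentencePiece model.
class ToySpm:
    pieces = {"<unk>": 0, "<s>": 1, "</s>": 2, ",": 3, ".": 4}

    def PieceToId(self, piece):
        return self.pieces.get(piece, 0)  # 0 is <unk> in the spm vocab

    def IdToPiece(self, idx):
        return {v: k for k, v in self.pieces.items()}[idx]

fairseq_tokens_to_ids = {"[PAD]": 0, "[CLS]": 1, "[SEP]": 2, "[UNK]": 3, "[MASK]": 4}
fairseq_tokens_to_ids.update({f"[unused{i}]": 5 + i for i in range(10)})
fairseq_offset = 12  # "," is spm id 3 but model-vocab id 15, hence the offset of 12

sp = ToySpm()
spm_id = sp.PieceToId(",")          # 3
model_id = spm_id + fairseq_offset  # 15
assert sp.IdToPiece(model_id - fairseq_offset) == ","
print(model_id)  # 15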
def convert_tokens_to_string(self, tokens):
"""
Converts a sequence of tokens (strings for sub-words) into a single concatenated string,
replacing special sub-word marker with spaces and stripping leading/trailing spaces.
"""
out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
return out_string
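For instance, SentencePiece pieces are glued back together by replacing the `▁` marker with a space; the pieces below are purely illustrative.
tokens = ["▁Hel", "lo", "▁world"]
print("".join(tokens).replace("▁", " ").strip())  # "Hello world"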
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
"""
Saves the current vocabulary to the specified directory.
Args:
save_directory (str): Directory path where the vocabulary file should be saved.
filename_prefix (Optional[str]): Optional prefix for the vocabulary file name.
Returns:
Tuple[str]: Tuple containing the path of the saved vocabulary file.
"""
# Ensure the provided directory path exists; otherwise, log an error
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
# Define the output vocabulary file path based on the provided directory and filename prefix
out_vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
# Copy the current vocabulary file if it differs from the destination path and exists
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
copyfile(self.vocab_file, out_vocab_file)
# If the current vocabulary file doesn't exist, write the serialized SentencePiece model to the output file
elif not os.path.isfile(self.vocab_file):
with open(out_vocab_file, "wb") as fi:
content_spiece_model = self.sp_model.serialized_model_proto()
fi.write(content_spiece_model)
return (out_vocab_file,)
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Builds model inputs by concatenating a sequence or pair of sequences with special tokens.
Args:
token_ids_0 (List[int]): List of token IDs for the first sequence.
token_ids_1 (Optional[List[int]]): Optional list of token IDs for the second sequence in a pair.
Returns:
List[int]: List of input IDs with added special tokens for model input.
"""
# If only one sequence is provided, concatenate it with the separator token
if token_ids_1 is None:
return token_ids_0 + [self.sep_token_id]
# Concatenate both sequences with separator tokens in between
sep = [self.sep_token_id]
return token_ids_0 + sep + token_ids_1 + sep
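A hedged illustration of the two layouts built above; the IDs are invented and `sep_token_id` is assumed to resolve to 2, the fairseq id registered for "[SEP]" in `__init__`.
sep_token_id = 2  # assumption: "[SEP]" -> 2 in fairseq_tokens_to_ids
token_ids_0, token_ids_1 = [100, 101], [200]
print(token_ids_0 + [sep_token_id])                                  # single sequence: A A [SEP]
print(token_ids_0 + [sep_token_id] + token_ids_1 + [sep_token_id])   # pair: A A [SEP] B [SEP]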
.\models\xlm_prophetnet\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_sentencepiece_available, is_torch_available
_import_structure = {
"configuration_xlm_prophetnet": ["XLM_PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP", "XLMProphetNetConfig"],
}
try:
if not is_sentencepiece_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_xlm_prophetnet"] = ["XLMProphetNetTokenizer"]
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_xlm_prophetnet"] = [
"XLM_PROPHETNET_PRETRAINED_MODEL_ARCHIVE_LIST",
"XLMProphetNetDecoder",
"XLMProphetNetEncoder",
"XLMProphetNetForCausalLM",
"XLMProphetNetForConditionalGeneration",
"XLMProphetNetModel",
"XLMProphetNetPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_xlm_prophetnet import XLM_PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP, XLMProphetNetConfig
try:
if not is_sentencepiece_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_xlm_prophetnet import XLMProphetNetTokenizer
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_xlm_prophetnet import (
XLM_PROPHETNET_PRETRAINED_MODEL_ARCHIVE_LIST,
XLMProphetNetDecoder,
XLMProphetNetEncoder,
XLMProphetNetForCausalLM,
XLMProphetNetForConditionalGeneration,
XLMProphetNetModel,
XLMProphetNetPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
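A small usage sketch of the lazy-import pattern above: importing the package itself is cheap, and the heavy submodules are only loaded when an attribute is first accessed (and only if the optional dependencies are installed).
from transformers.models.xlm_prophetnet import XLMProphetNetConfig  # resolved lazily on first access

config = XLMProphetNetConfig()  # building the config needs neither torch nor sentencepiece
print(config.model_type)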
.\models\xlm_roberta\configuration_xlm_roberta.py
""" XLM-RoBERTa configuration"""
from collections import OrderedDict
from typing import Mapping
from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfig
from ...utils import logging
logger = logging.get_logger(__name__)
XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"FacebookAI/xlm-roberta-base": "https://huggingface.co/FacebookAI/xlm-roberta-base/resolve/main/config.json",
"FacebookAI/xlm-roberta-large": "https://huggingface.co/FacebookAI/xlm-roberta-large/resolve/main/config.json",
"FacebookAI/xlm-roberta-large-finetuned-conll02-dutch": (
"https://huggingface.co/FacebookAI/xlm-roberta-large-finetuned-conll02-dutch/resolve/main/config.json"
),
"FacebookAI/xlm-roberta-large-finetuned-conll02-spanish": (
"https://huggingface.co/FacebookAI/xlm-roberta-large-finetuned-conll02-spanish/resolve/main/config.json"
),
"FacebookAI/xlm-roberta-large-finetuned-conll03-english": (
"https://huggingface.co/FacebookAI/xlm-roberta-large-finetuned-conll03-english/resolve/main/config.json"
),
"FacebookAI/xlm-roberta-large-finetuned-conll03-german": (
"https://huggingface.co/FacebookAI/xlm-roberta-large-finetuned-conll03-german/resolve/main/config.json"
),
}
class XLMRobertaConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`XLMRobertaModel`] or a [`TFXLMRobertaModel`]. It
is used to instantiate a XLM-RoBERTa model according to the specified arguments, defining the model architecture.
Instantiating a configuration with the defaults will yield a similar configuration to that of the XLMRoBERTa
[FacebookAI/xlm-roberta-base](https://huggingface.co/FacebookAI/xlm-roberta-base) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Examples:
```
>>> from transformers import XLMRobertaConfig, XLMRobertaModel
>>> # Initializing a XLM-RoBERTa FacebookAI/xlm-roberta-base style configuration
>>> configuration = XLMRobertaConfig()
>>> # Initializing a model (with random weights) from the FacebookAI/xlm-roberta-base style configuration
"""
>>> model = XLMRobertaModel(configuration)
>>>
>>> configuration = model.config
```
model_type = "xlm-roberta"
def __init__(
self,
vocab_size=30522,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=2,
initializer_range=0.02,
layer_norm_eps=1e-12,
pad_token_id=1,
bos_token_id=0,
eos_token_id=2,
position_embedding_type="absolute",
use_cache=True,
classifier_dropout=None,
**kwargs,
):
super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.hidden_act = hidden_act
self.intermediate_size = intermediate_size
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.type_vocab_size = type_vocab_size
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.position_embedding_type = position_embedding_type
self.use_cache = use_cache
self.classifier_dropout = classifier_dropout
class XLMRobertaOnnxConfig(OnnxConfig):
@property
def inputs(self) -> Mapping[str, Mapping[int, str]]:
if self.task == "multiple-choice":
dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"}
else:
dynamic_axis = {0: "batch", 1: "sequence"}
return OrderedDict(
[
("input_ids", dynamic_axis),
("attention_mask", dynamic_axis),
]
)
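For reference, the mapping returned above for the default (non multiple-choice) task looks like the following; this is a standalone sketch, not a call into the ONNX exporter.
from collections import OrderedDict

dynamic_axis = {0: "batch", 1: "sequence"}
onnx_inputs = OrderedDict([("input_ids", dynamic_axis), ("attention_mask", dynamic_axis)])
print(onnx_inputs)
# OrderedDict([('input_ids', {0: 'batch', 1: 'sequence'}), ('attention_mask', {0: 'batch', 1: 'sequence'})])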
.\models\xlm_roberta\modeling_flax_xlm_roberta.py
"""
Flax XLM-RoBERTa model.
"""
from typing import Callable, Optional, Tuple
import flax.linen as nn
import jax
import jax.numpy as jnp
import numpy as np
from flax.core.frozen_dict import FrozenDict, freeze, unfreeze
from flax.linen import combine_masks, make_causal_mask
from flax.linen import partitioning as nn_partitioning
from flax.linen.attention import dot_product_attention_weights
from flax.traverse_util import flatten_dict, unflatten_dict
from jax import lax
from ...modeling_flax_outputs import (
FlaxBaseModelOutputWithPastAndCrossAttentions,
FlaxBaseModelOutputWithPooling,
FlaxBaseModelOutputWithPoolingAndCrossAttentions,
FlaxCausalLMOutputWithCrossAttentions,
FlaxMaskedLMOutput,
FlaxMultipleChoiceModelOutput,
FlaxQuestionAnsweringModelOutput,
FlaxSequenceClassifierOutput,
FlaxTokenClassifierOutput,
)
from ...modeling_flax_utils import ACT2FN, FlaxPreTrainedModel, append_call_sample_docstring, overwrite_call_docstring
from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging
from .configuration_xlm_roberta import XLMRobertaConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "FacebookAI/xlm-roberta-base"
_CONFIG_FOR_DOC = "XLMRobertaConfig"
remat = nn_partitioning.remat
FLAX_XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST = [
"FacebookAI/xlm-roberta-base",
"FacebookAI/xlm-roberta-large",
]
def create_position_ids_from_input_ids(input_ids, padding_idx):
"""
Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
are ignored. This is modified from fairseq's `utils.make_positions`.
Args:
input_ids: jnp.ndarray  # array of input ids
padding_idx: int  # index of the padding symbol
Returns: jnp.ndarray  # new array of position ids
"""
mask = (input_ids != padding_idx).astype("i4")
if mask.ndim > 2:
mask = mask.reshape((-1, mask.shape[-1]))
incremental_indices = jnp.cumsum(mask, axis=1).astype("i4") * mask
incremental_indices = incremental_indices.reshape(input_ids.shape)
else:
incremental_indices = jnp.cumsum(mask, axis=1).astype("i4") * mask
return incremental_indices.astype("i4") + padding_idx
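A worked example of the computation above with assumed values: padding positions keep `padding_idx`, real tokens count up from `padding_idx + 1`.
import jax.numpy as jnp

input_ids = jnp.array([[5, 6, 7, 1, 1]])  # 1 is the padding index in this made-up example
padding_idx = 1
mask = (input_ids != padding_idx).astype("i4")
position_ids = jnp.cumsum(mask, axis=1).astype("i4") * mask + padding_idx
print(position_ids)  # [[2 3 4 1 1]]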
XLM_ROBERTA_START_DOCSTRING = r"""
This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading, saving and converting weights from PyTorch models)
This model is also a
[flax.linen.Module](https://flax.readthedocs.io/en/latest/api_reference/flax.linen/module.html) subclass. Use it as
a regular Flax linen Module and refer to the Flax documentation for all matter related to general usage and
behavior.
Finally, this model supports inherent JAX features such as:
- [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
- [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
- [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
- [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)
Parameters:
config ([`XLMRobertaConfig`]): Model configuration class with all the parameters of the
model. Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights.
"""
XLM_ROBERTA_INPUTS_DOCSTRING = r"""
Args:
input_ids (`numpy.ndarray` of shape `({0})`):
Indices of input sequence tokens in the vocabulary.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`numpy.ndarray` of shape `({0})`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
token_type_ids (`numpy.ndarray` of shape `({0})`, *optional*):
Segment token indices to indicate the first and second portions of the inputs. Indices are selected in `[0, 1]`:
- 0 corresponds to a *sentence A* token,
- 1 corresponds to a *sentence B* token.
[What are token type IDs?](../glossary#token-type-ids)
position_ids (`numpy.ndarray` of shape `({0})`, *optional*):
Indices of positions of each input sequence token in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
head_mask (`numpy.ndarray` of shape `({0})`, *optional*):
Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertEmbeddings with Bert->XLMRoberta
class FlaxXLMRobertaEmbeddings(nn.Module):
"""Construct the embeddings from word, position and token_type embeddings."""
config: XLMRobertaConfig # 类型提示:XLMRoberta 模型配置对象
dtype: jnp.dtype = jnp.float32 # 计算使用的数据类型,默认为单精度浮点型
def setup(self):
# 初始化词嵌入层,用于将输入的词 ID 映射成对应的词向量
self.word_embeddings = nn.Embed(
self.config.vocab_size,
self.config.hidden_size,
embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
dtype=self.dtype,
)
# 初始化位置嵌入层,用于表示词的位置信息
self.position_embeddings = nn.Embed(
self.config.max_position_embeddings,
self.config.hidden_size,
embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
dtype=self.dtype,
)
# 初始化类型嵌入层,用于区分不同类型的输入(如句子 A 和句子 B)
self.token_type_embeddings = nn.Embed(
self.config.type_vocab_size,
self.config.hidden_size,
embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
dtype=self.dtype,
)
# 初始化 Layer Normalization 层,用于归一化隐藏状态
self.LayerNorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype)
# 初始化 Dropout 层,用于在训练过程中随机丢弃部分隐藏状态,防止过拟合
self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob)
def __call__(self, input_ids, token_type_ids, position_ids, attention_mask, deterministic: bool = True):
# Embed
# 将输入的词 ID 转换为词嵌入向量
inputs_embeds = self.word_embeddings(input_ids.astype("i4"))
# 将位置 ID 转换为位置嵌入向量
position_embeds = self.position_embeddings(position_ids.astype("i4"))
# 将类型 ID 转换为类型嵌入向量
token_type_embeddings = self.token_type_embeddings(token_type_ids.astype("i4"))
# Sum all embeddings
# 将词嵌入向量、位置嵌入向量和类型嵌入向量相加得到最终的隐藏状态
hidden_states = inputs_embeds + token_type_embeddings + position_embeds
# Layer Norm
# 对隐藏状态进行 Layer Normalization 处理
hidden_states = self.LayerNorm(hidden_states)
# 对归一化后的隐藏状态进行 Dropout 操作,以防止过拟合
hidden_states = self.dropout(hidden_states, deterministic=deterministic)
return hidden_states
# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertSelfAttention with Bert->XLMRoberta
class FlaxXLMRobertaSelfAttention(nn.Module):
config: XLMRobertaConfig # 类型提示:XLMRoberta 模型配置对象
causal: bool = False # 是否是因果注意力(自回归/自回归式),默认为否
dtype: jnp.dtype = jnp.float32 # 计算使用的数据类型,默认为单精度浮点型
# 在模型设置过程中调用,计算每个注意力头的维度
def setup(self):
# 将隐藏层大小除以注意力头的数量,以确定每个头的维度
self.head_dim = self.config.hidden_size // self.config.num_attention_heads
# Raise if the hidden size is not divisible by the number of attention heads
if self.config.hidden_size % self.config.num_attention_heads != 0:
raise ValueError(
f"`config.hidden_size`: {self.config.hidden_size} has to be a multiple of `config.num_attention_heads` "
f": {self.config.num_attention_heads}"
)
# 初始化查询、键、值网络层,用于注意力机制
self.query = nn.Dense(
self.config.hidden_size,
dtype=self.dtype,
kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
)
self.key = nn.Dense(
self.config.hidden_size,
dtype=self.dtype,
kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
)
self.value = nn.Dense(
self.config.hidden_size,
dtype=self.dtype,
kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
)
# 如果启用因果注意力机制,则创建一个因果掩码
if self.causal:
self.causal_mask = make_causal_mask(
jnp.ones((1, self.config.max_position_embeddings), dtype="bool"), dtype="bool"
)
# 将隐藏状态张量分割为多个注意力头
def _split_heads(self, hidden_states):
return hidden_states.reshape(hidden_states.shape[:2] + (self.config.num_attention_heads, self.head_dim))
# 将多个注意力头的张量合并回隐藏状态张量
def _merge_heads(self, hidden_states):
return hidden_states.reshape(hidden_states.shape[:2] + (self.config.hidden_size,))
# 使用 nn.compact 修饰器,定义一个函数,此处功能与特定的函数一致
@nn.compact
# Copied from transformers.models.bart.modeling_flax_bart.FlaxBartAttention._concatenate_to_cache
def _concatenate_to_cache(self, key, value, query, attention_mask):
"""
This function takes projected key, value states from a single input token and concatenates the states to cached
states from previous steps. This function is slightly adapted from the official Flax repository:
https://github.com/google/flax/blob/491ce18759622506588784b4fca0e4bf05f8c8cd/flax/linen/attention.py
"""
# 检测是否通过缺少现有缓存数据来初始化。
is_initialized = self.has_variable("cache", "cached_key")
# 获取或初始化缓存的键和值,使用零张量填充,维度和类型与输入的key和value相同。
cached_key = self.variable("cache", "cached_key", jnp.zeros, key.shape, key.dtype)
cached_value = self.variable("cache", "cached_value", jnp.zeros, value.shape, value.dtype)
# 获取或初始化缓存索引,初始化为整数0。
cache_index = self.variable("cache", "cache_index", lambda: jnp.array(0, dtype=jnp.int32))
if is_initialized:
# 提取批处理维度、最大长度、头数和每头深度
*batch_dims, max_length, num_heads, depth_per_head = cached_key.value.shape
# 使用新的一维空间片段更新键和值缓存
cur_index = cache_index.value
indices = (0,) * len(batch_dims) + (cur_index, 0, 0)
key = lax.dynamic_update_slice(cached_key.value, key, indices)
value = lax.dynamic_update_slice(cached_value.value, value, indices)
# 更新缓存中的键和值
cached_key.value = key
cached_value.value = value
# 更新缓存索引,增加更新的缓存向量数量
num_updated_cache_vectors = query.shape[1]
cache_index.value = cache_index.value + num_updated_cache_vectors
# 生成用于缓存解码器自注意力的因果掩码:我们的单个查询位置只应关注已生成和缓存的键位置,而不是剩余的零元素。
pad_mask = jnp.broadcast_to(
jnp.arange(max_length) < cur_index + num_updated_cache_vectors,
tuple(batch_dims) + (1, num_updated_cache_vectors, max_length),
)
# 将因果掩码与输入的注意力掩码结合起来
attention_mask = combine_masks(pad_mask, attention_mask)
# 返回更新后的键、值和注意力掩码
return key, value, attention_mask
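A reduced sketch of the cache write performed above, with assumed shapes: one freshly projected key vector is written at the current cache index, and the pad mask only allows attention to the already-filled slots.
import jax.numpy as jnp
from jax import lax

max_length, num_heads, head_dim = 4, 2, 3
cached_key = jnp.zeros((1, max_length, num_heads, head_dim))  # (batch, max_len, heads, depth)
new_key = jnp.ones((1, 1, num_heads, head_dim))               # projection of the current token
cur_index = 2
cached_key = lax.dynamic_update_slice(cached_key, new_key, (0, cur_index, 0, 0))
pad_mask = jnp.arange(max_length) < cur_index + 1             # attend only to filled positions
print(cached_key[0, :, 0, 0])  # [0. 0. 1. 0.]
print(pad_mask)                # [ True  True  True False]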
# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertSelfOutput with Bert->XLMRoberta
# 定义了一个用于 XLMRoberta 模型的自注意力输出层
class FlaxXLMRobertaSelfOutput(nn.Module):
config: XLMRobertaConfig # 类型注解,指定配置类 XLMRobertaConfig
dtype: jnp.dtype = jnp.float32 # 计算中使用的数据类型
def setup(self):
# 初始化全连接层,输出维度为配置中指定的隐藏大小,使用正态分布初始化权重
self.dense = nn.Dense(
self.config.hidden_size,
kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
dtype=self.dtype,
)
# 初始化 LayerNorm 层,epsilon 参数由配置类 XLMRobertaConfig 提供
self.LayerNorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype)
# 初始化 Dropout 层,dropout 率由配置类 XLMRobertaConfig 提供
self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob)
def __call__(self, hidden_states, input_tensor, deterministic: bool = True):
# 前向传播函数,接收隐藏状态、输入张量和一个布尔值作为参数
# 通过全连接层处理隐藏状态
hidden_states = self.dense(hidden_states)
# 使用 Dropout 层对处理后的隐藏状态进行随机失活
hidden_states = self.dropout(hidden_states, deterministic=deterministic)
# 将处理后的隐藏状态与输入张量相加,并通过 LayerNorm 层处理
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertAttention with Bert->XLMRoberta
# 定义了一个用于 XLMRoberta 模型的注意力机制层
class FlaxXLMRobertaAttention(nn.Module):
config: XLMRobertaConfig # 类型注解,指定配置类 XLMRobertaConfig
causal: bool = False # 是否启用因果关系的布尔值,默认为 False
dtype: jnp.dtype = jnp.float32 # 计算中使用的数据类型
def setup(self):
# 初始化自注意力层,使用 XLMRobertaSelfAttention 类处理
self.self = FlaxXLMRobertaSelfAttention(self.config, causal=self.causal, dtype=self.dtype)
# 初始化自注意力输出层,使用 FlaxXLMRobertaSelfOutput 类处理
self.output = FlaxXLMRobertaSelfOutput(self.config, dtype=self.dtype)
def __call__(
self,
hidden_states,
attention_mask,
layer_head_mask,
key_value_states=None,
init_cache=False,
deterministic=True,
output_attentions: bool = False,
):
# 前向传播函数,接收多个参数用于处理注意力机制
# 使用 self.self 处理自注意力计算,得到注意力输出
attn_outputs = self.self(
hidden_states,
attention_mask,
layer_head_mask=layer_head_mask,
key_value_states=key_value_states,
init_cache=init_cache,
deterministic=deterministic,
output_attentions=output_attentions,
)
attn_output = attn_outputs[0]
# 使用 self.output 处理注意力输出,得到最终的隐藏状态
hidden_states = self.output(attn_output, hidden_states, deterministic=deterministic)
outputs = (hidden_states,)
if output_attentions:
outputs += (attn_outputs[1],)
return outputs
# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertIntermediate with Bert->XLMRoberta
# 定义了一个用于 XLMRoberta 模型的中间层
class FlaxXLMRobertaIntermediate(nn.Module):
config: XLMRobertaConfig # 类型注解,指定配置类 XLMRobertaConfig
dtype: jnp.dtype = jnp.float32 # 计算中使用的数据类型
def setup(self):
# 初始化全连接层,输出维度为配置中指定的中间大小,使用正态分布初始化权重
self.dense = nn.Dense(
self.config.intermediate_size,
kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
dtype=self.dtype,
)
# 初始化激活函数,激活函数类型由配置类 XLMRobertaConfig 提供
self.activation = ACT2FN[self.config.hidden_act]
# 定义一个类中的特殊方法 __call__(),用于将对象实例像函数一样调用
def __call__(self, hidden_states):
# 将输入的隐藏状态数据通过全连接层进行线性变换
hidden_states = self.dense(hidden_states)
# 对变换后的隐藏状态数据应用激活函数
hidden_states = self.activation(hidden_states)
# 返回经过线性变换和激活函数处理后的隐藏状态数据
return hidden_states
# 从 transformers.models.bert.modeling_flax_bert.FlaxBertOutput 复制而来,将 Bert 替换为 XLMRoberta
class FlaxXLMRobertaOutput(nn.Module):
config: XLMRobertaConfig # XLMRoberta 模型的配置信息
dtype: jnp.dtype = jnp.float32 # 计算时使用的数据类型
def setup(self):
# 初始化一个全连接层,输出大小为 config.hidden_size
self.dense = nn.Dense(
self.config.hidden_size,
kernel_init=jax.nn.initializers.normal(self.config.initializer_range), # 使用正态分布初始化权重
dtype=self.dtype,
)
# 初始化一个 Dropout 层,丢弃率为 config.hidden_dropout_prob
self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob)
# 初始化一个 LayerNorm 层,epsilon 为 config.layer_norm_eps
self.LayerNorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype)
def __call__(self, hidden_states, attention_output, deterministic: bool = True):
# 通过全连接层处理 hidden_states
hidden_states = self.dense(hidden_states)
# 应用 Dropout 处理 hidden_states
hidden_states = self.dropout(hidden_states, deterministic=deterministic)
# 应用 LayerNorm 处理 hidden_states 和 attention_output 的和
hidden_states = self.LayerNorm(hidden_states + attention_output)
return hidden_states
# 从 transformers.models.bert.modeling_flax_bert.FlaxBertLayer 复制而来,将 Bert 替换为 XLMRoberta
class FlaxXLMRobertaLayer(nn.Module):
config: XLMRobertaConfig # XLMRoberta 模型的配置信息
dtype: jnp.dtype = jnp.float32 # 计算时使用的数据类型
def setup(self):
# 初始化 self.attention 为 FlaxXLMRobertaAttention 实例
self.attention = FlaxXLMRobertaAttention(self.config, causal=self.config.is_decoder, dtype=self.dtype)
# 初始化 self.intermediate 为 FlaxXLMRobertaIntermediate 实例
self.intermediate = FlaxXLMRobertaIntermediate(self.config, dtype=self.dtype)
# 初始化 self.output 为 FlaxXLMRobertaOutput 实例
self.output = FlaxXLMRobertaOutput(self.config, dtype=self.dtype)
# 如果配置中包含交叉注意力,初始化 self.crossattention 为 FlaxXLMRobertaAttention 实例
if self.config.add_cross_attention:
self.crossattention = FlaxXLMRobertaAttention(self.config, causal=False, dtype=self.dtype)
def __call__(
self,
hidden_states,
attention_mask,
layer_head_mask,
encoder_hidden_states: Optional[jnp.ndarray] = None,
encoder_attention_mask: Optional[jnp.ndarray] = None,
init_cache: bool = False,
deterministic: bool = True,
output_attentions: bool = False,
):
# Forward pass of FlaxXLMRobertaLayer: self-attention, optional cross-attention, then the feed-forward block
# Self Attention
# 使用 self.attention 方法进行自注意力计算,处理隐藏状态和注意力掩码
attention_outputs = self.attention(
hidden_states,
attention_mask,
layer_head_mask=layer_head_mask,
init_cache=init_cache,
deterministic=deterministic,
output_attentions=output_attentions,
)
attention_output = attention_outputs[0]
# Cross-Attention Block
# 如果存在编码器的隐藏状态,则进行交叉注意力计算
if encoder_hidden_states is not None:
cross_attention_outputs = self.crossattention(
attention_output,
attention_mask=encoder_attention_mask,
layer_head_mask=layer_head_mask,
key_value_states=encoder_hidden_states,
deterministic=deterministic,
output_attentions=output_attentions,
)
attention_output = cross_attention_outputs[0]
# 经过 self.intermediate 层的处理
hidden_states = self.intermediate(attention_output)
# 经过 self.output 层的处理,得到最终输出的隐藏状态
hidden_states = self.output(hidden_states, attention_output, deterministic=deterministic)
# 将隐藏状态打包成输出元组
outputs = (hidden_states,)
# 如果需要输出注意力信息
if output_attentions:
# 添加自注意力信息到输出元组
outputs += (attention_outputs[1],)
# 如果存在编码器的隐藏状态,则添加交叉注意力信息到输出元组
if encoder_hidden_states is not None:
outputs += (cross_attention_outputs[1],)
# 返回最终的输出元组
return outputs
# 从 transformers.models.bert.modeling_flax_bert.FlaxBertLayerCollection 复制并修改为 FlaxXLMRobertaLayerCollection
class FlaxXLMRobertaLayerCollection(nn.Module):
config: XLMRobertaConfig # 类型提示,指定配置对象为 XLMRobertaConfig 类型
dtype: jnp.dtype = jnp.float32 # 计算中使用的数据类型,默认为 jnp.float32
gradient_checkpointing: bool = False # 是否使用梯度检查点,默认为 False
def setup(self):
if self.gradient_checkpointing:
# 如果开启梯度检查点,使用 remat 函数对 FlaxXLMRobertaLayer 进行重建
FlaxXLMRobertaCheckpointLayer = remat(FlaxXLMRobertaLayer, static_argnums=(5, 6, 7))
# 创建一个包含检查点层的列表,每层的名称为索引号字符串
self.layers = [
FlaxXLMRobertaCheckpointLayer(self.config, name=str(i), dtype=self.dtype)
for i in range(self.config.num_hidden_layers)
]
else:
# 如果未开启梯度检查点,创建一个 FlaxXLMRobertaLayer 的列表,每层的名称为索引号字符串
self.layers = [
FlaxXLMRobertaLayer(self.config, name=str(i), dtype=self.dtype)
for i in range(self.config.num_hidden_layers)
]
def __call__(
self,
hidden_states,
attention_mask,
head_mask,
encoder_hidden_states: Optional[jnp.ndarray] = None,
encoder_attention_mask: Optional[jnp.ndarray] = None,
init_cache: bool = False,
deterministic: bool = True,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
):
# Run all transformer layers, optionally collecting hidden states and attention weights along the way
all_attentions = () if output_attentions else None
all_hidden_states = () if output_hidden_states else None
all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
# 检查是否需要创建头部遮罩(head_mask),确保头部遮罩的层数与模型层数一致
if head_mask is not None:
if head_mask.shape[0] != (len(self.layers)):
raise ValueError(
f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.shape[0]}."
)
# 遍历模型的每一层并进行前向传播
for i, layer in enumerate(self.layers):
# 如果需要输出隐藏状态,则将当前层的隐藏状态加入到列表中
if output_hidden_states:
all_hidden_states += (hidden_states,)
# 调用当前层的前向传播函数,获取当前层的输出
layer_outputs = layer(
hidden_states,
attention_mask,
head_mask[i] if head_mask is not None else None,
encoder_hidden_states,
encoder_attention_mask,
init_cache,
deterministic,
output_attentions,
)
# 更新当前层的隐藏状态
hidden_states = layer_outputs[0]
# 如果需要输出注意力权重,将当前层的注意力权重加入到列表中
if output_attentions:
all_attentions += (layer_outputs[1],)
# 如果存在编码器的隐藏状态,将当前层的交叉注意力权重加入到列表中
if encoder_hidden_states is not None:
all_cross_attentions += (layer_outputs[2],)
# 如果需要输出隐藏状态,将最后一层的隐藏状态加入到列表中
if output_hidden_states:
all_hidden_states += (hidden_states,)
# 构建模型的输出
outputs = (hidden_states, all_hidden_states, all_attentions, all_cross_attentions)
# 如果不需要以字典形式返回结果,则返回元组形式的输出
if not return_dict:
return tuple(v for v in outputs if v is not None)
# 以带过去和交叉注意力的 Flax 模型输出格式返回结果
return FlaxBaseModelOutputWithPastAndCrossAttentions(
last_hidden_state=hidden_states,
hidden_states=all_hidden_states,
attentions=all_attentions,
cross_attentions=all_cross_attentions,
)
# 从transformers.models.bert.modeling_flax_bert.FlaxBertEncoder复制代码,并将Bert->XLMRoberta
class FlaxXLMRobertaEncoder(nn.Module):
config: XLMRobertaConfig
dtype: jnp.dtype = jnp.float32 # 计算的数据类型
gradient_checkpointing: bool = False # 是否使用梯度检查点
def setup(self):
self.layer = FlaxXLMRobertaLayerCollection(
self.config,
dtype=self.dtype,
gradient_checkpointing=self.gradient_checkpointing,
)
def __call__(
self,
hidden_states,
attention_mask,
head_mask,
encoder_hidden_states: Optional[jnp.ndarray] = None,
encoder_attention_mask: Optional[jnp.ndarray] = None,
init_cache: bool = False,
deterministic: bool = True,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
):
return self.layer(
hidden_states,
attention_mask,
head_mask=head_mask,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
init_cache=init_cache,
deterministic=deterministic,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# 从transformers.models.bert.modeling_flax_bert.FlaxBertPooler复制代码,并将Bert->XLMRoberta
class FlaxXLMRobertaPooler(nn.Module):
config: XLMRobertaConfig
dtype: jnp.dtype = jnp.float32 # 计算的数据类型
def setup(self):
self.dense = nn.Dense(
self.config.hidden_size,
kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
dtype=self.dtype,
)
def __call__(self, hidden_states):
cls_hidden_state = hidden_states[:, 0] # 取第一个位置的CLS隐藏状态
cls_hidden_state = self.dense(cls_hidden_state) # 通过全连接层进行处理
return nn.tanh(cls_hidden_state) # 返回经过tanh激活的CLS隐藏状态
# 从transformers.models.roberta.modeling_flax_roberta.FlaxRobertaLMHead复制代码,并将Roberta->XLMRoberta
class FlaxXLMRobertaLMHead(nn.Module):
config: XLMRobertaConfig
dtype: jnp.dtype = jnp.float32 # 计算的数据类型
bias_init: Callable[..., np.ndarray] = jax.nn.initializers.zeros # 偏置初始化函数
def setup(self):
self.dense = nn.Dense(
self.config.hidden_size,
dtype=self.dtype,
kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
)
self.layer_norm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype) # 层归一化
self.decoder = nn.Dense(
self.config.vocab_size,
dtype=self.dtype,
use_bias=False,
kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
)
self.bias = self.param("bias", self.bias_init, (self.config.vocab_size,)) # 偏置参数
# 定义一个对象的调用方法,接受隐藏状态和共享嵌入作为参数
def __call__(self, hidden_states, shared_embedding=None):
# 将隐藏状态通过全连接层进行线性变换
hidden_states = self.dense(hidden_states)
# 使用 GELU 激活函数处理隐藏状态
hidden_states = ACT2FN["gelu"](hidden_states)
# 对处理后的隐藏状态进行 Layer Normalization
hidden_states = self.layer_norm(hidden_states)
# 如果提供了共享的嵌入向量,则将其作为参数应用到解码器中
if shared_embedding is not None:
hidden_states = self.decoder.apply({"params": {"kernel": shared_embedding.T}}, hidden_states)
else:
# 否则,直接使用解码器处理隐藏状态
hidden_states = self.decoder(hidden_states)
# 将偏置转换为 JAX 数组,并加到隐藏状态上
bias = jnp.asarray(self.bias, self.dtype)
hidden_states += bias
# 返回处理后的隐藏状态
return hidden_states
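A hedged sketch (shapes invented) of the weight-tying trick used above: when `tie_word_embeddings` is set, the decoder `Dense` layer is applied with a kernel that is simply the transposed word-embedding matrix.
import jax.numpy as jnp
import flax.linen as nn

hidden_size, vocab_size = 4, 10
decoder = nn.Dense(vocab_size, use_bias=False)
shared_embedding = jnp.full((vocab_size, hidden_size), 0.1)  # stand-in for the embedding table
hidden_states = jnp.ones((1, 2, hidden_size))

# A Dense kernel has shape (in_features, out_features), i.e. the embedding matrix transposed
logits = decoder.apply({"params": {"kernel": shared_embedding.T}}, hidden_states)
print(logits.shape)  # (1, 2, 10)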
# 从 transformers.models.roberta.modeling_flax_roberta.FlaxRobertaClassificationHead 复制而来,将 Roberta 替换为 XLMRoberta
class FlaxXLMRobertaClassificationHead(nn.Module):
config: XLMRobertaConfig # 类的配置信息,使用 XLMRobertaConfig
dtype: jnp.dtype = jnp.float32 # 数据类型设置为 jnp.float32
def setup(self):
# 初始化一个全连接层,输出大小为 config.hidden_size,使用正态分布初始化权重
self.dense = nn.Dense(
self.config.hidden_size,
dtype=self.dtype,
kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
)
# 设置分类器的 dropout 率为 config.classifier_dropout,如果为 None,则使用 config.hidden_dropout_prob
classifier_dropout = (
self.config.classifier_dropout
if self.config.classifier_dropout is not None
else self.config.hidden_dropout_prob
)
self.dropout = nn.Dropout(rate=classifier_dropout) # 设置 dropout 层
# 初始化一个全连接层,输出大小为 config.num_labels,使用正态分布初始化权重
self.out_proj = nn.Dense(
self.config.num_labels,
dtype=self.dtype,
kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
)
def __call__(self, hidden_states, deterministic=True):
hidden_states = hidden_states[:, 0, :] # 取 <s> 标记对应的隐藏状态 (等同于 [CLS])
hidden_states = self.dropout(hidden_states, deterministic=deterministic) # 应用 dropout
hidden_states = self.dense(hidden_states) # 应用全连接层
hidden_states = nn.tanh(hidden_states) # 应用 tanh 激活函数
hidden_states = self.dropout(hidden_states, deterministic=deterministic) # 再次应用 dropout
hidden_states = self.out_proj(hidden_states) # 应用输出投影层
return hidden_states
# 从 transformers.models.roberta.modeling_flax_roberta.FlaxRobertaPreTrainedModel 复制而来,将 Roberta 替换为 XLMRoberta,roberta 替换为 xlm-roberta,ROBERTA 替换为 XLM_ROBERTA
class FlaxXLMRobertaPreTrainedModel(FlaxPreTrainedModel):
"""
处理权重初始化和简单接口以下载和加载预训练模型的抽象类。
"""
config_class = XLMRobertaConfig # 配置类为 XLMRobertaConfig
base_model_prefix = "xlm-roberta" # 基础模型前缀为 "xlm-roberta"
module_class: nn.Module = None # 模块类设置为 None
def __init__(
self,
config: XLMRobertaConfig,
input_shape: Tuple = (1, 1),
seed: int = 0,
dtype: jnp.dtype = jnp.float32,
_do_init: bool = True,
gradient_checkpointing: bool = False,
**kwargs,
):
# 初始化一个模块类对象,使用给定的配置、数据类型和参数
module = self.module_class(config=config, dtype=dtype, gradient_checkpointing=gradient_checkpointing, **kwargs)
super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init)
# 从 transformers.models.bert.modeling_flax_bert.FlaxBertPreTrainedModel.enable_gradient_checkpointing 复制而来
def enable_gradient_checkpointing(self):
self._module = self.module_class(
config=self.config,
dtype=self.dtype,
gradient_checkpointing=True,
)
# 初始化模型的权重
def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple, params: FrozenDict = None) -> FrozenDict:
# 初始化输入张量
input_ids = jnp.zeros(input_shape, dtype="i4") # 创建全零的输入张量
token_type_ids = jnp.ones_like(input_ids) # 创建与输入张量形状相同的全一张量
position_ids = create_position_ids_from_input_ids(input_ids, self.config.pad_token_id) # 根据输入张量创建位置编码
attention_mask = jnp.ones_like(input_ids) # 创建与输入张量形状相同的全一注意力掩码
head_mask = jnp.ones((self.config.num_hidden_layers, self.config.num_attention_heads)) # 创建全一的头部掩码
# 分割随机数生成器
params_rng, dropout_rng = jax.random.split(rng)
rngs = {"params": params_rng, "dropout": dropout_rng}
if self.config.add_cross_attention:
# 如果配置要求添加交叉注意力,初始化编码器隐藏状态和注意力掩码
encoder_hidden_states = jnp.zeros(input_shape + (self.config.hidden_size,))
encoder_attention_mask = attention_mask
# 使用模型初始化,并返回模型初始化的输出
module_init_outputs = self.module.init(
rngs,
input_ids,
attention_mask,
token_type_ids,
position_ids,
head_mask,
encoder_hidden_states,
encoder_attention_mask,
return_dict=False,
)
else:
# 否则,使用模型初始化,仅传入基本参数
module_init_outputs = self.module.init(
rngs, input_ids, attention_mask, token_type_ids, position_ids, head_mask, return_dict=False
)
# 从模型初始化的输出中获取随机参数
random_params = module_init_outputs["params"]
if params is not None:
# 如果提供了预定义参数,则将随机参数展开并填充缺失的键
random_params = flatten_dict(unfreeze(random_params))
params = flatten_dict(unfreeze(params))
for missing_key in self._missing_keys:
params[missing_key] = random_params[missing_key]
self._missing_keys = set()
return freeze(unflatten_dict(params)) # 冻结填充后的参数并返回
else:
return random_params # 否则,返回随机初始化的参数
# 从 transformers.models.bart.modeling_flax_bart.FlaxBartDecoderPreTrainedModel.init_cache 复制过来的方法
def init_cache(self, batch_size, max_length):
r"""
Args:
batch_size (`int`):
用于快速自回归解码的批大小。定义初始化缓存时的批大小。
max_length (`int`):
自回归解码的最大可能长度。定义初始化缓存的序列长度。
"""
# 初始化用于检索缓存的输入变量
input_ids = jnp.ones((batch_size, max_length), dtype="i4") # 创建全一的输入张量
attention_mask = jnp.ones_like(input_ids, dtype="i4") # 创建与输入张量形状相同的全一注意力掩码
position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape) # 广播位置编码
# 使用模型初始化,并返回初始化变量的缓存部分
init_variables = self.module.init(
jax.random.PRNGKey(0), input_ids, attention_mask, position_ids, return_dict=False, init_cache=True
)
return unfreeze(init_variables["cache"])
@add_start_docstrings_to_model_forward(XLM_ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
def __call__(
self,
input_ids,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
params: dict = None,
dropout_rng: jax.random.PRNGKey = None,
train: bool = False,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
past_key_values: dict = None,
):
# 从transformers.models.bert.modeling_flax_bert.FlaxBertModule复制代码,并将Bert->XLMRoberta
class FlaxXLMRobertaModule(nn.Module):
# 使用XLMRobertaConfig配置
config: XLMRobertaConfig
# 计算时的数据类型
dtype: jnp.dtype = jnp.float32 # the dtype of the computation
# 是否添加池化层,默认为True
add_pooling_layer: bool = True
# 是否使用梯度检查点
gradient_checkpointing: bool = False
def setup(self):
# 初始化嵌入层
self.embeddings = FlaxXLMRobertaEmbeddings(self.config, dtype=self.dtype)
# 初始化编码器
self.encoder = FlaxXLMRobertaEncoder(
self.config,
dtype=self.dtype,
gradient_checkpointing=self.gradient_checkpointing,
)
# 初始化池化层
self.pooler = FlaxXLMRobertaPooler(self.config, dtype=self.dtype)
def __call__(
self,
input_ids,
attention_mask,
token_type_ids: Optional[jnp.ndarray] = None,
position_ids: Optional[jnp.ndarray] = None,
head_mask: Optional[jnp.ndarray] = None,
encoder_hidden_states: Optional[jnp.ndarray] = None,
encoder_attention_mask: Optional[jnp.ndarray] = None,
init_cache: bool = False,
deterministic: bool = True,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
):
# 确保当token_type_ids未传入时被正确初始化为全零数组
if token_type_ids is None:
token_type_ids = jnp.zeros_like(input_ids)
# 确保当position_ids未传入时被正确初始化
if position_ids is None:
position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape)
# 通过嵌入层计算隐藏状态
hidden_states = self.embeddings(
input_ids, token_type_ids, position_ids, attention_mask, deterministic=deterministic
)
# 使用编码器计算输出
outputs = self.encoder(
hidden_states,
attention_mask,
head_mask=head_mask,
deterministic=deterministic,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
init_cache=init_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# 获取编码器的隐藏状态
hidden_states = outputs[0]
# 如果需要添加池化层,则计算池化结果
pooled = self.pooler(hidden_states) if self.add_pooling_layer else None
# 如果不需要返回字典形式的结果
if not return_dict:
# 如果池化结果为None,则不返回它
if pooled is None:
return (hidden_states,) + outputs[1:]
return (hidden_states, pooled) + outputs[1:]
# 返回包含池化结果和交叉注意力的FlaxBaseModelOutputWithPoolingAndCrossAttentions对象
return FlaxBaseModelOutputWithPoolingAndCrossAttentions(
last_hidden_state=hidden_states,
pooler_output=pooled,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
cross_attentions=outputs.cross_attentions,
)
@add_start_docstrings(
"The bare XLM RoBERTa Model transformer outputting raw hidden-states without any specific head on top.",
XLM_ROBERTA_START_DOCSTRING,
)
class FlaxXLMRobertaModel(FlaxXLMRobertaPreTrainedModel):
module_class = FlaxXLMRobertaModule
append_call_sample_docstring(FlaxXLMRobertaModel, _CHECKPOINT_FOR_DOC, FlaxBaseModelOutputWithPooling, _CONFIG_FOR_DOC)
# 从 transformers.models.roberta.modeling_flax_roberta.FlaxRobertaForMaskedLMModule 复制并修改为 XLMRoberta
class FlaxXLMRobertaForMaskedLMModule(nn.Module):
config: XLMRobertaConfig
dtype: jnp.dtype = jnp.float32
gradient_checkpointing: bool = False
def setup(self):
# 初始化 XLM-Roberta 模型,配置为不添加池化层,使用指定数据类型和梯度检查点
self.roberta = FlaxXLMRobertaModule(
config=self.config,
add_pooling_layer=False,
dtype=self.dtype,
gradient_checkpointing=self.gradient_checkpointing,
)
# 初始化 XLM-Roberta 语言模型头部
self.lm_head = FlaxXLMRobertaLMHead(config=self.config, dtype=self.dtype)
def __call__(
self,
input_ids,
attention_mask,
token_type_ids,
position_ids,
head_mask,
deterministic: bool = True,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
):
# 调用 XLM-Roberta 模型
outputs = self.roberta(
input_ids,
attention_mask,
token_type_ids,
position_ids,
head_mask,
deterministic=deterministic,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
hidden_states = outputs[0]
# 如果配置指定共享词嵌入,则获取共享的词嵌入
if self.config.tie_word_embeddings:
shared_embedding = self.roberta.variables["params"]["embeddings"]["word_embeddings"]["embedding"]
else:
shared_embedding = None
# 计算预测分数
logits = self.lm_head(hidden_states, shared_embedding=shared_embedding)
if not return_dict:
return (logits,) + outputs[1:]
# 返回 XLM-Roberta 遮蔽语言建模的输出
return FlaxMaskedLMOutput(
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
@add_start_docstrings("""XLM RoBERTa Model with a `language modeling` head on top.""", XLM_ROBERTA_START_DOCSTRING)
class FlaxXLMRobertaForMaskedLM(FlaxXLMRobertaPreTrainedModel):
module_class = FlaxXLMRobertaForMaskedLMModule
append_call_sample_docstring(
FlaxXLMRobertaForMaskedLM,
_CHECKPOINT_FOR_DOC,
FlaxBaseModelOutputWithPooling,
_CONFIG_FOR_DOC,
mask="<mask>",
)
# 从 transformers.models.roberta.modeling_flax_roberta.FlaxRobertaForSequenceClassificationModule 复制并修改为 XLMRoberta
class FlaxXLMRobertaForSequenceClassificationModule(nn.Module):
config: XLMRobertaConfig
dtype: jnp.dtype = jnp.float32
gradient_checkpointing: bool = False
# 在对象初始化时设置模型结构
def setup(self):
self.roberta = FlaxXLMRobertaModule(
config=self.config, # 使用给定配置初始化模型
dtype=self.dtype, # 设定数据类型
add_pooling_layer=False, # 禁用池化层
gradient_checkpointing=self.gradient_checkpointing, # 设置梯度检查点
)
self.classifier = FlaxXLMRobertaClassificationHead(config=self.config, dtype=self.dtype) # 初始化分类头部模块
# 对象调用时执行的函数,用于模型推断
def __call__(
self,
input_ids, # 输入的token id序列
attention_mask, # 注意力掩码
token_type_ids, # token类型id
position_ids, # 位置id
head_mask, # 头部掩码
deterministic: bool = True, # 是否使用确定性计算
output_attentions: bool = False, # 是否输出注意力权重
output_hidden_states: bool = False, # 是否输出隐藏状态
return_dict: bool = True, # 是否返回字典形式的结果
):
# 执行RoBERTa模型的前向传播
outputs = self.roberta(
input_ids,
attention_mask,
token_type_ids,
position_ids,
head_mask,
deterministic=deterministic,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0] # 提取序列输出
logits = self.classifier(sequence_output, deterministic=deterministic) # 使用分类头部预测logits
if not return_dict:
return (logits,) + outputs[1:] # 返回tuple形式的输出
return FlaxSequenceClassifierOutput(
logits=logits, # 返回分类的logits
hidden_states=outputs.hidden_states, # 返回隐藏状态
attentions=outputs.attentions, # 返回注意力权重
)
@add_start_docstrings(
"""
XLM Roberta Model transformer with a sequence classification/regression head on top (a linear layer on top of the
pooled output) e.g. for GLUE tasks.
""",
XLM_ROBERTA_START_DOCSTRING,
)
"""
XLM Roberta模型转换器,顶部带有序列分类/回归头部(即池化输出的顶部线性层),例如用于GLUE任务。
"""
append_call_sample_docstring(
FlaxXLMRobertaForSequenceClassification,
_CHECKPOINT_FOR_DOC,
FlaxSequenceClassifierOutput,
_CONFIG_FOR_DOC,
)
"""
将示例调用的文档字符串附加到FlaxXLMRobertaForSequenceClassification类的文档中,
包括_CHECKPOINT_FOR_DOC、FlaxSequenceClassifierOutput和_CONFIG_FOR_DOC。
"""
# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertForMultipleChoiceModule with Bert->XLMRoberta, with self.bert->self.roberta
"""
从transformers.models.bert.modeling_flax_bert.FlaxBertForMultipleChoiceModule复制,
将Bert替换为XLMRoberta,将self.bert替换为self.roberta。
"""
class FlaxXLMRobertaForMultipleChoiceModule(nn.Module):
config: XLMRobertaConfig
dtype: jnp.dtype = jnp.float32
gradient_checkpointing: bool = False
def setup(self):
self.roberta = FlaxXLMRobertaModule(
config=self.config,
dtype=self.dtype,
gradient_checkpointing=self.gradient_checkpointing,
)
self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob)
self.classifier = nn.Dense(1, dtype=self.dtype)
def __call__(
self,
input_ids,
attention_mask,
token_type_ids,
position_ids,
head_mask,
deterministic: bool = True,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
):
num_choices = input_ids.shape[1]
input_ids = input_ids.reshape(-1, input_ids.shape[-1]) if input_ids is not None else None
attention_mask = attention_mask.reshape(-1, attention_mask.shape[-1]) if attention_mask is not None else None
token_type_ids = token_type_ids.reshape(-1, token_type_ids.shape[-1]) if token_type_ids is not None else None
position_ids = position_ids.reshape(-1, position_ids.shape[-1]) if position_ids is not None else None
# Model
outputs = self.roberta(
input_ids,
attention_mask,
token_type_ids,
position_ids,
head_mask,
deterministic=deterministic,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
pooled_output = outputs[1]
pooled_output = self.dropout(pooled_output, deterministic=deterministic)
logits = self.classifier(pooled_output)
reshaped_logits = logits.reshape(-1, num_choices)
if not return_dict:
return (reshaped_logits,) + outputs[2:]
return FlaxMultipleChoiceModelOutput(
logits=reshaped_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
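The reshaping in the module above can be summarized with assumed sizes: the choice dimension is folded into the batch for the encoder, and the per-choice scores are unfolded again for the final softmax over choices.
import jax.numpy as jnp

batch, num_choices, seq_len = 2, 4, 8
input_ids = jnp.zeros((batch, num_choices, seq_len), dtype="i4")
flat_input_ids = input_ids.reshape(-1, input_ids.shape[-1])   # (8, 8): encoder sees batch * choices rows
per_choice_logits = jnp.zeros((batch * num_choices, 1))       # one score per (example, choice)
reshaped_logits = per_choice_logits.reshape(-1, num_choices)  # (2, 4): softmax over choices per example
print(flat_input_ids.shape, reshaped_logits.shape)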
"""
XLM Roberta模型,带有多选分类头部(即池化输出的顶部线性层和)。
"""
@add_start_docstrings(
"""
XLM Roberta Model with a multiple choice classification head on top (a linear layer on top of the pooled output and
""",
XLM_ROBERTA_START_DOCSTRING,
)
"""
XLM Roberta模型,带有多选分类头部(即池化输出的顶部线性层和)
"""
a softmax) e.g. for RocStories/SWAG tasks.
"""
XLM-RoBERTa 模型的起始文档字符串,用于生成模型文档说明。
"""
)
class FlaxXLMRobertaForMultipleChoice(FlaxXLMRobertaPreTrainedModel):
module_class = FlaxXLMRobertaForMultipleChoiceModule
overwrite_call_docstring(
FlaxXLMRobertaForMultipleChoice, XLM_ROBERTA_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")
)
append_call_sample_docstring(
FlaxXLMRobertaForMultipleChoice,
_CHECKPOINT_FOR_DOC,
FlaxMultipleChoiceModelOutput,
_CONFIG_FOR_DOC,
)
# 从FlaxBertForTokenClassificationModule复制并修改为FlaxXLMRobertaForTokenClassificationModule,将self.bert->self.roberta
class FlaxXLMRobertaForTokenClassificationModule(nn.Module):
config: XLMRobertaConfig
dtype: jnp.dtype = jnp.float32
gradient_checkpointing: bool = False
def setup(self):
# 初始化时创建FlaxXLMRobertaModule实例,并传入配置、数据类型、是否梯度检查点、不添加池化层
self.roberta = FlaxXLMRobertaModule(
config=self.config,
dtype=self.dtype,
add_pooling_layer=False,
gradient_checkpointing=self.gradient_checkpointing,
)
# 根据配置设置分类器的dropout率,若未指定则使用隐藏层的dropout率
classifier_dropout = (
self.config.classifier_dropout
if self.config.classifier_dropout is not None
else self.config.hidden_dropout_prob
)
# 创建一个dropout层,用于隐藏状态
self.dropout = nn.Dropout(rate=classifier_dropout)
# 创建一个全连接层,输出维度为配置文件中指定的标签数
self.classifier = nn.Dense(self.config.num_labels, dtype=self.dtype)
def __call__(
self,
input_ids,
attention_mask,
token_type_ids,
position_ids,
head_mask,
deterministic: bool = True,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
):
# 调用self.roberta进行模型推断
outputs = self.roberta(
input_ids,
attention_mask,
token_type_ids,
position_ids,
head_mask,
deterministic=deterministic,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# 从输出中获取隐藏状态,并在推断时使用dropout层
hidden_states = outputs[0]
hidden_states = self.dropout(hidden_states, deterministic=deterministic)
# 使用分类器预测标签
logits = self.classifier(hidden_states)
# 若return_dict为False,则返回元组形式的输出
if not return_dict:
return (logits,) + outputs[1:]
# 否则返回FlaxTokenClassifierOutput对象,包含logits、隐藏状态和注意力机制
return FlaxTokenClassifierOutput(
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
@add_start_docstrings(
"""
XLM Roberta Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks.
""",
XLM_ROBERTA_START_DOCSTRING,
)
class FlaxXLMRobertaForTokenClassification(FlaxXLMRobertaPreTrainedModel):
module_class = FlaxXLMRobertaForTokenClassificationModule
append_call_sample_docstring(
FlaxXLMRobertaForTokenClassification,
_CHECKPOINT_FOR_DOC,
FlaxTokenClassifierOutput,
_CONFIG_FOR_DOC,
)
# 从 transformers.models.bert.modeling_flax_bert.FlaxBertForQuestionAnsweringModule 复制代码到这里,并将 Bert->XLMRoberta,self.bert->self.roberta
class FlaxXLMRobertaForQuestionAnsweringModule(nn.Module):
# 使用 XLMRobertaConfig 配置类
config: XLMRobertaConfig
# 数据类型默认为 jnp.float32
dtype: jnp.dtype = jnp.float32
# 是否使用梯度检查点,默认为 False
gradient_checkpointing: bool = False
def setup(self):
# 初始化 self.roberta 为 FlaxXLMRobertaModule
self.roberta = FlaxXLMRobertaModule(
config=self.config,
dtype=self.dtype,
add_pooling_layer=False,
gradient_checkpointing=self.gradient_checkpointing,
)
# 初始化 self.qa_outputs 为 nn.Dense,输出大小为 self.config.num_labels
self.qa_outputs = nn.Dense(self.config.num_labels, dtype=self.dtype)
def __call__(
self,
input_ids,
attention_mask,
token_type_ids,
position_ids,
head_mask,
deterministic: bool = True,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
):
# 调用 self.roberta 进行模型计算
outputs = self.roberta(
input_ids,
attention_mask,
token_type_ids,
position_ids,
head_mask,
deterministic=deterministic,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# 从模型输出中获取隐藏状态
hidden_states = outputs[0]
# 计算起始和结束位置的 logits
logits = self.qa_outputs(hidden_states)
start_logits, end_logits = jnp.split(logits, self.config.num_labels, axis=-1)
start_logits = start_logits.squeeze(-1)
end_logits = end_logits.squeeze(-1)
# 如果 return_dict 为 False,则返回 tuple 类型
if not return_dict:
return (start_logits, end_logits) + outputs[1:]
# 如果 return_dict 为 True,则返回 FlaxQuestionAnsweringModelOutput 类型
return FlaxQuestionAnsweringModelOutput(
start_logits=start_logits,
end_logits=end_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
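With `num_labels == 2`, the split above simply separates the last dimension of the QA head output into start and end logits; a minimal illustration with assumed shapes:
import jax.numpy as jnp

logits = jnp.zeros((2, 8, 2))  # (batch, seq_len, num_labels=2)
start_logits, end_logits = jnp.split(logits, 2, axis=-1)
print(start_logits.squeeze(-1).shape, end_logits.squeeze(-1).shape)  # (2, 8) (2, 8)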
@add_start_docstrings(
"""
XLM Roberta Model with a span classification head on top for extractive question-answering tasks like SQuAD (linear
layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
XLM_ROBERTA_START_DOCSTRING,
)
# 继承自 FlaxXLMRobertaPreTrainedModel 的 XLMRoberta 问题回答模型类
class FlaxXLMRobertaForQuestionAnswering(FlaxXLMRobertaPreTrainedModel):
# 指定模块类为 FlaxXLMRobertaForQuestionAnsweringModule
module_class = FlaxXLMRobertaForQuestionAnsweringModule
# 从 transformers.models.roberta.modeling_flax_roberta.FlaxRobertaForCausalLMModule 复制代码到这里,并将 Roberta->XLMRoberta
class FlaxXLMRobertaForCausalLMModule(nn.Module):
# 使用 XLMRobertaConfig 配置类
config: XLMRobertaConfig
# 数据类型默认为 jnp.float32
dtype: jnp.dtype = jnp.float32
# 是否使用梯度检查点,默认为 False
gradient_checkpointing: bool = False
# 在模型设置方法中初始化 RoBERTa 模型和语言模型头部
def setup(self):
self.roberta = FlaxXLMRobertaModule(
config=self.config,
add_pooling_layer=False,
dtype=self.dtype,
gradient_checkpointing=self.gradient_checkpointing,
)
self.lm_head = FlaxXLMRobertaLMHead(config=self.config, dtype=self.dtype)
# 在调用方法中执行模型的前向传播
def __call__(
self,
input_ids,
attention_mask,
position_ids,
token_type_ids: Optional[jnp.ndarray] = None,
head_mask: Optional[jnp.ndarray] = None,
encoder_hidden_states: Optional[jnp.ndarray] = None,
encoder_attention_mask: Optional[jnp.ndarray] = None,
init_cache: bool = False,
deterministic: bool = True,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
):
# 调用 RoBERTa 模型的前向传播,并传入所有必要的参数
outputs = self.roberta(
input_ids,
attention_mask,
token_type_ids,
position_ids,
head_mask,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
init_cache=init_cache,
deterministic=deterministic,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# 获取模型的隐藏状态作为输入特征
hidden_states = outputs[0]
# 根据配置决定是否共享词嵌入层
if self.config.tie_word_embeddings:
shared_embedding = self.roberta.variables["params"]["embeddings"]["word_embeddings"]["embedding"]
else:
shared_embedding = None
# 计算语言模型头部的预测分数
logits = self.lm_head(hidden_states, shared_embedding=shared_embedding)
# 如果不要求返回字典形式的输出,则返回元组
if not return_dict:
return (logits,) + outputs[1:]
# 返回带有交叉注意力的因果语言建模输出
return FlaxCausalLMOutputWithCrossAttentions(
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
cross_attentions=outputs.cross_attentions,
)
# 使用装饰器为类添加文档字符串,描述该类是在 XLM Roberta 模型基础上添加了语言建模头部的变体,用于自回归任务
@add_start_docstrings(
"""
XLM Roberta Model with a language modeling head on top (a linear layer on top of the hidden-states output) e.g for
autoregressive tasks.
""",
XLM_ROBERTA_START_DOCSTRING,
)
# 从 transformers.models.roberta.modeling_flax_roberta.FlaxRobertaForCausalLM 复制过来,并将 Roberta 改为 XLMRoberta
class FlaxXLMRobertaForCausalLM(FlaxXLMRobertaPreTrainedModel):
# 使用 FlaxXLMRobertaForCausalLMModule 作为模块类
module_class = FlaxXLMRobertaForCausalLMModule
# Prepare the inputs used at each generation step, given the prompt token IDs and the target maximum length
def prepare_inputs_for_generation(self, input_ids, max_length, attention_mask: Optional[jax.Array] = None):
# Initialize the cache
batch_size, seq_length = input_ids.shape
# Initialize the past key/value cache with self.init_cache
past_key_values = self.init_cache(batch_size, max_length)
# Note: usually one would have to put 0 in the attention_mask for positions x > input_ids.shape[-1] and
# x < cache_length, but since the decoder uses a causal mask those positions are masked already. So a static
# attention_mask is created here, which is more efficient for compilation.
extended_attention_mask = jnp.ones((batch_size, max_length), dtype="i4")
# If an attention_mask is given, derive position ids from its cumulative sum and copy it into the static mask
if attention_mask is not None:
position_ids = attention_mask.cumsum(axis=-1) - 1
extended_attention_mask = lax.dynamic_update_slice(extended_attention_mask, attention_mask, (0, 0))
else:
# Otherwise broadcast position_ids of shape (batch_size, seq_length)
position_ids = jnp.broadcast_to(jnp.arange(seq_length, dtype="i4")[None, :], (batch_size, seq_length))
return {
"past_key_values": past_key_values,
"attention_mask": extended_attention_mask,
"position_ids": position_ids,
}
# After each generation step, carry over the returned past_key_values and advance the position ids
def update_inputs_for_generation(self, model_outputs, model_kwargs):
model_kwargs["past_key_values"] = model_outputs.past_key_values
model_kwargs["position_ids"] = model_kwargs["position_ids"][:, -1:] + 1
return model_kwargs
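A hedged sketch of how the two helpers above cooperate during incremental decoding; the model call itself is omitted, only the mask and position-id bookkeeping is shown with made-up values.
import jax.numpy as jnp
from jax import lax

batch_size, max_length = 1, 6
prompt_mask = jnp.ones((batch_size, 3), dtype="i4")  # attention mask of a 3-token prompt

# prepare_inputs_for_generation: static mask of length max_length, positions from the real mask
extended_attention_mask = lax.dynamic_update_slice(
    jnp.ones((batch_size, max_length), dtype="i4"), prompt_mask, (0, 0)
)
position_ids = prompt_mask.cumsum(axis=-1) - 1
print(extended_attention_mask)  # [[1 1 1 1 1 1]]
print(position_ids)             # [[0 1 2]]

# update_inputs_for_generation: after each decoding step only the next position id is kept
position_ids = position_ids[:, -1:] + 1
print(position_ids)             # [[3]]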
# Attach an example-call docstring to FlaxXLMRobertaForCausalLM describing how to call the class for generation
append_call_sample_docstring(
FlaxXLMRobertaForCausalLM,
_CHECKPOINT_FOR_DOC,
FlaxCausalLMOutputWithCrossAttentions,
_CONFIG_FOR_DOC,
)