Transformers Source Code Analysis (128)
.\models\xlm_roberta_xl\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
_import_structure = {
"configuration_xlm_roberta_xl": [
"XLM_ROBERTA_XL_PRETRAINED_CONFIG_ARCHIVE_MAP",
"XLMRobertaXLConfig",
"XLMRobertaXLOnnxConfig",
],
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_xlm_roberta_xl"] = [
"XLM_ROBERTA_XL_PRETRAINED_MODEL_ARCHIVE_LIST",
"XLMRobertaXLForCausalLM",
"XLMRobertaXLForMaskedLM",
"XLMRobertaXLForMultipleChoice",
"XLMRobertaXLForQuestionAnswering",
"XLMRobertaXLForSequenceClassification",
"XLMRobertaXLForTokenClassification",
"XLMRobertaXLModel",
"XLMRobertaXLPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_xlm_roberta_xl import (
XLM_ROBERTA_XL_PRETRAINED_CONFIG_ARCHIVE_MAP,
XLMRobertaXLConfig,
XLMRobertaXLOnnxConfig,
)
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_xlm_roberta_xl import (
XLM_ROBERTA_XL_PRETRAINED_MODEL_ARCHIVE_LIST,
XLMRobertaXLForCausalLM,
XLMRobertaXLForMaskedLM,
XLMRobertaXLForMultipleChoice,
XLMRobertaXLForQuestionAnswering,
XLMRobertaXLForSequenceClassification,
XLMRobertaXLForTokenClassification,
XLMRobertaXLModel,
XLMRobertaXLPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure)
.\models\xlnet\configuration_xlnet.py
import warnings
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"xlnet/xlnet-base-cased": "https://huggingface.co/xlnet/xlnet-base-cased/resolve/main/config.json",
"xlnet/xlnet-large-cased": "https://huggingface.co/xlnet/xlnet-large-cased/resolve/main/config.json",
}
class XLNetConfig(PretrainedConfig):
"""
    This is the configuration class to store the configuration of an [`XLNetModel`] or a [`TFXLNetModel`]. It is used
    to instantiate an XLNet model according to the specified arguments, defining the model architecture. Instantiating
    a configuration with the defaults will yield a configuration similar to that of the
    [xlnet/xlnet-large-cased](https://huggingface.co/xlnet/xlnet-large-cased) architecture.
    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.
    Examples:
    ```
    >>> from transformers import XLNetConfig, XLNetModel
    >>> # Initializing an XLNet configuration
    >>> configuration = XLNetConfig()
    >>> # Initializing a model (with random weights) from the configuration
    >>> model = XLNetModel(configuration)
    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
"""
model_type = "xlnet"
keys_to_ignore_at_inference = ["mems"]
attribute_map = {
"n_token": "vocab_size",
"hidden_size": "d_model",
"num_attention_heads": "n_head",
"num_hidden_layers": "n_layer",
}
def __init__(
self,
vocab_size=32000,
d_model=1024,
n_layer=24,
n_head=16,
d_inner=4096,
ff_activation="gelu",
untie_r=True,
attn_type="bi",
initializer_range=0.02,
layer_norm_eps=1e-12,
dropout=0.1,
mem_len=512,
reuse_len=None,
use_mems_eval=True,
use_mems_train=False,
bi_data=False,
clamp_len=-1,
same_length=False,
summary_type="last",
summary_use_proj=True,
summary_activation="tanh",
summary_last_dropout=0.1,
start_n_top=5,
end_n_top=5,
pad_token_id=5,
bos_token_id=1,
eos_token_id=2,
**kwargs,
):
"""Constructs XLNetConfig."""
self.vocab_size = vocab_size
self.d_model = d_model
self.n_layer = n_layer
self.n_head = n_head
if d_model % n_head != 0:
raise ValueError(f"'d_model % n_head' ({d_model % n_head}) should be equal to 0")
if "d_head" in kwargs:
if kwargs["d_head"] != d_model // n_head:
raise ValueError(
f"`d_head` ({kwargs['d_head']}) should be equal to `d_model // n_head` ({d_model // n_head})"
)
self.d_head = d_model // n_head
self.ff_activation = ff_activation
self.d_inner = d_inner
self.untie_r = untie_r
self.attn_type = attn_type
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.dropout = dropout
self.mem_len = mem_len
self.reuse_len = reuse_len
self.bi_data = bi_data
self.clamp_len = clamp_len
self.same_length = same_length
self.summary_type = summary_type
self.summary_use_proj = summary_use_proj
self.summary_activation = summary_activation
self.summary_last_dropout = summary_last_dropout
self.start_n_top = start_n_top
self.end_n_top = end_n_top
self.bos_token_id = bos_token_id
self.pad_token_id = pad_token_id
self.eos_token_id = eos_token_id
if "use_cache" in kwargs:
warnings.warn(
"The `use_cache` argument is deprecated and will be removed in a future version, use `use_mems_eval`"
" instead.",
FutureWarning,
)
use_mems_eval = kwargs["use_cache"]
self.use_mems_eval = use_mems_eval
self.use_mems_train = use_mems_train
super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
@property
def max_position_embeddings(self):
logger.info(f"The model {self.model_type} is one of the few models that has no sequence length limit.")
return -1
@max_position_embeddings.setter
def max_position_embeddings(self, value):
raise NotImplementedError(
f"The model {self.model_type} is one of the few models that has no sequence length limit."
)
.\models\xlnet\convert_xlnet_original_tf_checkpoint_to_pytorch.py
"""Convert BERT checkpoint."""
import argparse
import os
import torch
from transformers import (
XLNetConfig,
XLNetForQuestionAnswering,
XLNetForSequenceClassification,
XLNetLMHeadModel,
load_tf_weights_in_xlnet,
)
from transformers.utils import CONFIG_NAME, WEIGHTS_NAME, logging
GLUE_TASKS_NUM_LABELS = {
"cola": 2,
"mnli": 3,
"mrpc": 2,
"sst-2": 2,
"sts-b": 1,
"qqp": 2,
"qnli": 2,
"rte": 2,
"wnli": 2,
}
logging.set_verbosity_info()
def convert_xlnet_checkpoint_to_pytorch(
tf_checkpoint_path, bert_config_file, pytorch_dump_folder_path, finetuning_task=None
):
config = XLNetConfig.from_json_file(bert_config_file)
finetuning_task = finetuning_task.lower() if finetuning_task is not None else ""
if finetuning_task in GLUE_TASKS_NUM_LABELS:
print(f"Building PyTorch XLNetForSequenceClassification model from configuration: {config}")
config.finetuning_task = finetuning_task
config.num_labels = GLUE_TASKS_NUM_LABELS[finetuning_task]
model = XLNetForSequenceClassification(config)
elif "squad" in finetuning_task:
config.finetuning_task = finetuning_task
model = XLNetForQuestionAnswering(config)
else:
model = XLNetLMHeadModel(config)
load_tf_weights_in_xlnet(model, config, tf_checkpoint_path)
pytorch_weights_dump_path = os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME)
pytorch_config_dump_path = os.path.join(pytorch_dump_folder_path, CONFIG_NAME)
print(f"Save PyTorch model to {os.path.abspath(pytorch_weights_dump_path)}")
torch.save(model.state_dict(), pytorch_weights_dump_path)
print(f"Save configuration file to {os.path.abspath(pytorch_config_dump_path)}")
with open(pytorch_config_dump_path, "w", encoding="utf-8") as f:
f.write(config.to_json_string())
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path."
)
parser.add_argument(
"--xlnet_config_file",
default=None,
type=str,
required=True,
help=(
"The config json file corresponding to the pre-trained XLNet model. \n"
"This specifies the model architecture."
),
)
parser.add_argument(
"--pytorch_dump_folder_path",
default=None,
type=str,
required=True,
help="Path to the folder to store the PyTorch model or dataset/vocab.",
)
parser.add_argument(
"--finetuning_task",
default=None,
type=str,
help="Name of a task on which the XLNet TensorFlow model was fine-tuned",
)
args = parser.parse_args()
print(args)
convert_xlnet_checkpoint_to_pytorch(
args.tf_checkpoint_path, args.xlnet_config_file, args.pytorch_dump_folder_path, args.finetuning_task
)
.\models\xlnet\modeling_tf_xlnet.py
"""
TF 2.0 XLNet model.
"""
from __future__ import annotations
import warnings
from dataclasses import dataclass
from typing import List, Optional, Tuple, Union
import numpy as np
import tensorflow as tf
from ...activations_tf import get_tf_activation
from ...modeling_tf_utils import (
TFCausalLanguageModelingLoss,
TFModelInputType,
TFMultipleChoiceLoss,
TFPreTrainedModel,
TFQuestionAnsweringLoss,
TFSequenceClassificationLoss,
TFSequenceSummary,
TFSharedEmbeddings,
TFTokenClassificationLoss,
get_initializer,
keras,
keras_serializable,
unpack_inputs,
)
from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax
from ...utils import (
ModelOutput,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_xlnet import XLNetConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "xlnet/xlnet-base-cased"
_CONFIG_FOR_DOC = "XLNetConfig"
TF_XLNET_PRETRAINED_MODEL_ARCHIVE_LIST = [
"xlnet/xlnet-base-cased",
"xlnet/xlnet-large-cased",
]
class TFXLNetRelativeAttention(keras.layers.Layer):
"""
    TensorFlow 2.0 implementation of the relative attention layer.
    Args:
        config (XLNetConfig): Configuration object for the XLNet model.
    Raises:
        ValueError: If the hidden size in the config is not a multiple of the number of attention heads.
    Attributes:
        n_head (int): Number of attention heads.
        d_head (int): Hidden size of each attention head.
        d_model (int): Hidden size of the model.
        scale (float): Scaling factor used in the attention computation.
        initializer_range (float): Initializer range.
        output_attentions (bool): Whether to output the attention weights.
        layer_norm (keras.layers.LayerNormalization): Layer normalization applied to each sub-layer output.
        dropout (keras.layers.Dropout): Dropout layer.
        config (XLNetConfig): Configuration object for the XLNet model.
"""
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
if config.d_model % config.n_head != 0:
raise ValueError(
f"The hidden size ({config.d_model}) is not a multiple of the number of attention "
f"heads ({config.n_head}"
)
self.n_head = config.n_head
self.d_head = config.d_head
self.d_model = config.d_model
self.scale = 1 / (config.d_head**0.5)
self.initializer_range = config.initializer_range
self.output_attentions = config.output_attentions
self.layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm")
self.dropout = keras.layers.Dropout(config.dropout)
self.config = config
def build(self, input_shape=None):
initializer = get_initializer(self.initializer_range)
self.q = self.add_weight(
shape=(self.d_model, self.n_head, self.d_head), initializer=initializer, trainable=True, name="q"
)
self.k = self.add_weight(
shape=(self.d_model, self.n_head, self.d_head), initializer=initializer, trainable=True, name="k"
)
self.v = self.add_weight(
shape=(self.d_model, self.n_head, self.d_head), initializer=initializer, trainable=True, name="v"
)
self.o = self.add_weight(
shape=(self.d_model, self.n_head, self.d_head), initializer=initializer, trainable=True, name="o"
)
self.r = self.add_weight(
shape=(self.d_model, self.n_head, self.d_head), initializer=initializer, trainable=True, name="r"
)
self.r_r_bias = self.add_weight(
shape=(self.n_head, self.d_head), initializer="zeros", trainable=True, name="r_r_bias"
)
self.r_s_bias = self.add_weight(
shape=(self.n_head, self.d_head), initializer="zeros", trainable=True, name="r_s_bias"
)
self.r_w_bias = self.add_weight(
shape=(self.n_head, self.d_head), initializer="zeros", trainable=True, name="r_w_bias"
)
self.seg_embed = self.add_weight(
shape=(2, self.n_head, self.d_head), initializer=initializer, trainable=True, name="seg_embed"
)
if self.built:
return
self.built = True
if getattr(self, "layer_norm", None) is not None:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build([None, None, self.config.d_model])
def prune_heads(self, heads):
raise NotImplementedError
def rel_shift(self, x, klen=-1):
"""perform relative shift to form the relative attention score."""
x_size = shape_list(x)
x = tf.reshape(x, (x_size[1], x_size[0], x_size[2], x_size[3]))
x = x[1:, ...]
x = tf.reshape(x, (x_size[0], x_size[1] - 1, x_size[2], x_size[3]))
x = x[:, 0:klen, :, :]
return x
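    # Illustrative walk-through of rel_shift: for x of shape (qlen, klen, bsz, n_head), where
    # x[i, j] scores query i against relative offset j, swapping the first two axes, dropping
    # the first row, and reshaping back effectively left-shifts row i by i positions. After
    # slicing to klen columns, x[i, j] is aligned with absolute key position j. This is the
    # standard Transformer-XL trick that avoids an explicit gather over relative positions.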
def rel_attn_core(
self, q_head, k_head_h, v_head_h, k_head_r, seg_mat, attn_mask, head_mask, output_attentions, training=False
):
"""Core relative positional attention operations."""
ac = tf.einsum("ibnd,jbnd->ijbn", q_head + self.r_w_bias, k_head_h)
bd = tf.einsum("ibnd,jbnd->ijbn", q_head + self.r_r_bias, k_head_r)
bd = self.rel_shift(bd, klen=shape_list(ac)[1])
if seg_mat is None:
ef = 0
else:
ef = tf.einsum("ibnd,snd->ibns", q_head + self.r_s_bias, self.seg_embed)
ef = tf.einsum("ijbs,ibns->ijbn", seg_mat, ef)
attn_score = (ac + bd + ef) * self.scale
if attn_mask is not None:
if attn_mask.dtype == tf.float16 or attn_mask.dtype == tf.bfloat16:
attn_score = attn_score - 65500 * attn_mask
else:
attn_score = attn_score - 1e30 * attn_mask
attn_prob = stable_softmax(attn_score, axis=1)
attn_prob = self.dropout(attn_prob, training=training)
if head_mask is not None:
attn_prob = attn_prob * head_mask
attn_vec = tf.einsum("ijbn,jbnd->ibnd", attn_prob, v_head_h)
if output_attentions:
return attn_vec, attn_prob
return attn_vec
def post_attention(self, h, attn_vec, residual=True, training=False):
"""Post-attention processing."""
attn_out = tf.einsum("ibnd,hnd->ibh", attn_vec, self.o)
attn_out = self.dropout(attn_out, training=training)
if residual:
attn_out = attn_out + h
output = self.layer_norm(attn_out)
return output
def call(
self,
h,
g,
attn_mask_h,
attn_mask_g,
r,
seg_mat,
mems: np.ndarray | tf.Tensor | None = None,
target_mapping: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = False,
        training: bool = False,
    ):
        ...  # full attention computation elided in this excerpt
class TFXLNetFeedForward(keras.layers.Layer):
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm")
self.layer_1 = keras.layers.Dense(
config.d_inner, kernel_initializer=get_initializer(config.initializer_range), name="layer_1"
)
self.layer_2 = keras.layers.Dense(
config.d_model, kernel_initializer=get_initializer(config.initializer_range), name="layer_2"
)
self.dropout = keras.layers.Dropout(config.dropout)
if isinstance(config.ff_activation, str):
self.activation_function = get_tf_activation(config.ff_activation)
else:
self.activation_function = config.ff_activation
self.config = config
def call(self, inp, training=False):
output = inp
output = self.layer_1(output)
output = self.activation_function(output)
output = self.dropout(output, training=training)
output = self.layer_2(output)
output = self.dropout(output, training=training)
output = self.layer_norm(output + inp)
return output
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "layer_norm", None) is not None:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build([None, None, self.config.d_model])
if getattr(self, "layer_1", None) is not None:
with tf.name_scope(self.layer_1.name):
self.layer_1.build([None, None, self.config.d_model])
if getattr(self, "layer_2", None) is not None:
with tf.name_scope(self.layer_2.name):
self.layer_2.build([None, None, self.config.d_inner])
class TFXLNetLayer(keras.layers.Layer):
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.rel_attn = TFXLNetRelativeAttention(config, name="rel_attn")
self.ff = TFXLNetFeedForward(config, name="ff")
self.dropout = keras.layers.Dropout(config.dropout)
def call(
self,
output_h,
output_g,
non_tgt_mask,
attn_mask,
pos_emb,
seg_mat,
mems: np.ndarray | tf.Tensor | None = None,
target_mapping: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = False,
training: bool = False,
):
outputs = self.rel_attn(
output_h,
output_g,
non_tgt_mask,
attn_mask,
pos_emb,
seg_mat,
mems,
target_mapping,
head_mask,
output_attentions,
training=training,
)
output_h, output_g = outputs[:2]
if output_g is not None:
output_g = self.ff(output_g, training=training)
output_h = self.ff(output_h, training=training)
outputs = (output_h, output_g) + outputs[2:]
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "rel_attn", None) is not None:
with tf.name_scope(self.rel_attn.name):
self.rel_attn.build(None)
if getattr(self, "ff", None) is not None:
with tf.name_scope(self.ff.name):
self.ff.build(None)
class TFXLNetLMHead(keras.layers.Layer):
def __init__(self, config, input_embeddings, **kwargs):
super().__init__(**kwargs)
self.config = config
self.input_embeddings = input_embeddings
def build(self, input_shape):
self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias")
super().build(input_shape)
def get_output_embeddings(self):
return self.input_embeddings
def set_output_embeddings(self, value):
self.input_embeddings.weight = value
self.input_embeddings.vocab_size = shape_list(value)[0]
def get_bias(self):
return {"bias": self.bias}
def set_bias(self, value):
self.bias = value["bias"]
self.config.vocab_size = shape_list(value["bias"])[0]
def call(self, hidden_states):
hidden_states = self.input_embeddings(hidden_states, mode="linear")
hidden_states = hidden_states + self.bias
return hidden_states
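The head above implements weight tying: in `"linear"` mode, the shared embedding layer multiplies hidden states by the transpose of the embedding matrix. A minimal NumPy sketch of the same computation (illustrative shapes, not the library call):

```python
import numpy as np

vocab_size, d_model = 32000, 1024
embedding = np.random.randn(vocab_size, d_model).astype(np.float32)  # shared input embedding
bias = np.zeros(vocab_size, dtype=np.float32)                        # the head's own bias term

hidden = np.random.randn(2, 7, d_model).astype(np.float32)  # (batch, seq, d_model)
logits = hidden @ embedding.T + bias                         # (batch, seq, vocab_size)
assert logits.shape == (2, 7, vocab_size)
```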
@keras_serializable
class TFXLNetMainLayer(keras.layers.Layer):
config_class = XLNetConfig
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.config = config
self.output_hidden_states = config.output_hidden_states
self.output_attentions = config.output_attentions
self.return_dict = config.return_dict
self.mem_len = config.mem_len
self.reuse_len = config.reuse_len
self.d_model = config.d_model
self.same_length = config.same_length
self.attn_type = config.attn_type
self.bi_data = config.bi_data
self.clamp_len = config.clamp_len
self.n_layer = config.n_layer
self.use_bfloat16 = config.use_bfloat16
self.initializer_range = config.initializer_range
self.word_embedding = TFSharedEmbeddings(
config.vocab_size, config.d_model, initializer_range=config.initializer_range, name="word_embedding"
)
self.layer = [TFXLNetLayer(config, name=f"layer_._{i}") for i in range(config.n_layer)]
self.dropout = keras.layers.Dropout(config.dropout)
self.use_mems_eval = config.use_mems_eval
self.use_mems_train = config.use_mems_train
def get_input_embeddings(self):
return self.word_embedding
def set_input_embeddings(self, value):
self.word_embedding.weight = value
self.word_embedding.vocab_size = shape_list(value)[0]
def build(self, input_shape=None):
initializer = get_initializer(self.initializer_range)
self.mask_emb = self.add_weight(
shape=(1, 1, self.d_model), initializer=initializer, trainable=True, name="mask_emb"
)
if self.built:
return
self.built = True
if getattr(self, "word_embedding", None) is not None:
with tf.name_scope(self.word_embedding.name):
self.word_embedding.build(None)
if getattr(self, "layer", None) is not None:
for layer in self.layer:
with tf.name_scope(layer.name):
layer.build(None)
def _prune_heads(self, heads_to_prune):
raise NotImplementedError
def create_mask(self, qlen, mlen):
"""
Creates causal attention mask. Float mask where 1.0 indicates masked, 0.0 indicates not-masked.
Args:
            qlen: Length of the query.
            mlen: Length of the memory.
"""
attn_mask = tf.ones([qlen, qlen])
mask_u = tf.linalg.band_part(attn_mask, 0, -1)
mask_dia = tf.linalg.band_part(attn_mask, 0, 0)
attn_mask_pad = tf.zeros([qlen, mlen])
ret = tf.concat([attn_mask_pad, mask_u - mask_dia], 1)
if self.same_length:
mask_l = tf.linalg.band_part(attn_mask, -1, 0)
ret = tf.concat([ret[:, :qlen] + mask_l - mask_dia, ret[:, qlen:]], 1)
return ret
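    # Illustrative example: for qlen=3 and mlen=2 with same_length=False, create_mask returns
    #   [[0, 0, 0, 1, 1],
    #    [0, 0, 0, 0, 1],
    #    [0, 0, 0, 0, 0]]
    # Every query can attend to all memory positions and to itself; 1.0 marks the masked
    # (future) positions.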
def cache_mem(self, curr_out, prev_mem):
if self.reuse_len is not None and self.reuse_len > 0:
curr_out = curr_out[: self.reuse_len]
if self.mem_len is None or self.mem_len == 0:
cutoff = 0
else:
cutoff = -self.mem_len
if prev_mem is None:
new_mem = curr_out[cutoff:]
else:
new_mem = tf.concat([prev_mem, curr_out], 0)[cutoff:]
return tf.stop_gradient(new_mem)
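    # Illustrative example: with reuse_len=None and mem_len=3, if prev_mem holds steps
    # [m1, m2, m3] and curr_out holds [h1, h2], the concatenation is [m1, m2, m3, h1, h2]
    # and the -mem_len cutoff keeps [m3, h1, h2]. tf.stop_gradient detaches the result, so
    # gradients never flow back through the cached memories.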
    def positional_embedding(self, pos_seq, inv_freq, bsz=None):
sinusoid_inp = tf.einsum("i,d->id", pos_seq, inv_freq)
pos_emb = tf.concat([tf.sin(sinusoid_inp), tf.cos(sinusoid_inp)], axis=-1)
pos_emb = pos_emb[:, None, :]
if bsz is not None:
pos_emb = tf.tile(pos_emb, [1, bsz, 1])
return pos_emb
def relative_positional_encoding(self, qlen, klen, bsz=None):
"""create relative positional encoding."""
freq_seq = tf.range(0, self.d_model, 2.0)
inv_freq = 1 / (10000 ** (freq_seq / self.d_model))
if self.attn_type == "bi":
beg, end = klen, -qlen
elif self.attn_type == "uni":
beg, end = klen, -1
else:
raise ValueError(f"Unknown `attn_type` {self.attn_type}.")
if self.bi_data:
fwd_pos_seq = tf.range(beg, end, -1.0)
bwd_pos_seq = tf.range(-beg, -end, 1.0)
if self.clamp_len > 0:
fwd_pos_seq = tf.clip_by_value(fwd_pos_seq, -self.clamp_len, self.clamp_len)
bwd_pos_seq = tf.clip_by_value(bwd_pos_seq, -self.clamp_len, self.clamp_len)
if bsz is not None:
if bsz % 2 != 0:
raise ValueError(f"With bi_data, the batch size {bsz} should be divisible by 2")
fwd_pos_emb = self.positional_embedding(fwd_pos_seq, inv_freq, bsz // 2)
bwd_pos_emb = self.positional_embedding(bwd_pos_seq, inv_freq, bsz // 2)
else:
fwd_pos_emb = self.positional_embedding(fwd_pos_seq, inv_freq)
bwd_pos_emb = self.positional_embedding(bwd_pos_seq, inv_freq)
pos_emb = tf.concat([fwd_pos_emb, bwd_pos_emb], axis=1)
else:
fwd_pos_seq = tf.range(beg, end, -1.0)
if self.clamp_len > 0:
fwd_pos_seq = tf.clip_by_value(fwd_pos_seq, -self.clamp_len, self.clamp_len)
pos_emb = self.positional_embedding(fwd_pos_seq, inv_freq, bsz)
return pos_emb
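    # Illustrative example: with attn_type="bi", qlen=3 and klen=5 (two cached memory steps),
    # fwd_pos_seq = tf.range(5, -3, -1) = [5, 4, 3, 2, 1, 0, -1, -2]: one sinusoid per possible
    # relative distance between a query and a key, so pos_emb has shape (klen + qlen, 1, d_model)
    # (or (klen + qlen, bsz, d_model) when bsz is given) before rel_shift aligns it.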
@unpack_inputs
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
mems: np.ndarray | tf.Tensor | None = None,
perm_mask: np.ndarray | tf.Tensor | None = None,
target_mapping: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
input_mask: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
use_mems: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
        training: bool = False,
    ):
        ...  # full forward computation elided in this excerpt
class TFXLNetPreTrainedModel(TFPreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = XLNetConfig
base_model_prefix = "transformer"
@dataclass
class TFXLNetModelOutput(ModelOutput):
"""
Output type of [`TFXLNetModel`].
Args:
last_hidden_state (`tf.Tensor` of shape `(batch_size, num_predict, hidden_size)`):
Sequence of hidden-states at the last layer of the model.
`num_predict` corresponds to `target_mapping.shape[1]`. If `target_mapping` is `None`, then `num_predict`
corresponds to `sequence_length`.
mems (`List[tf.Tensor]` of length `config.n_layers`):
Contains pre-computed hidden-states. Can be used (see `mems` input) to speed up sequential decoding. The
token ids which have their past given to this model should not be passed as `input_ids` as they have
already been computed.
hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
last_hidden_state: tf.Tensor = None
mems: List[tf.Tensor] | None = None
hidden_states: Tuple[tf.Tensor, ...] | None = None
attentions: Tuple[tf.Tensor, ...] | None = None
@dataclass
class TFXLNetLMHeadModelOutput(ModelOutput):
"""
Output type of [`TFXLNetLMHeadModel`].
"""
pass
Args:
loss (`tf.Tensor` of shape *(1,)*, *optional*, returned when `labels` is provided):
如果提供了 `labels`,则返回的语言建模损失(用于下一个标记预测)。
logits (`tf.Tensor` of shape `(batch_size, num_predict, config.vocab_size)`):
语言建模头部的预测分数(在应用 SoftMax 之前的每个词汇标记的分数)。
`num_predict` 对应于 `target_mapping.shape[1]`。如果 `target_mapping` 是 `None`,则 `num_predict`
对应于 `sequence_length`。
mems (`List[tf.Tensor]` of length `config.n_layers`):
包含预计算的隐藏状态。可以用于加速顺序解码。已经计算过其过去的令牌 id 不应该作为 `input_ids` 传递,
因为它们已经被计算过。
hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
一个元组,包含 `tf.Tensor`(一个用于嵌入输出 + 每个层的输出)的形状为 `(batch_size, sequence_length, hidden_size)`。
模型每个层的输出以及初始嵌入输出的隐藏状态。
attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
一个元组,包含 `tf.Tensor`(每个层的一个)的形状为 `(batch_size, num_heads, sequence_length, sequence_length)`。
注意力 softmax 后的注意力权重,用于计算自注意力头中的加权平均值。
@dataclass
class TFXLNetForSequenceClassificationOutput(ModelOutput):
"""
    Output type of [`TFXLNetForSequenceClassification`].
    Args:
        loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `label` is provided):
            Classification (or regression if `config.num_labels==1`) loss.
        logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`):
            Classification (or regression if `config.num_labels==1`) scores (before SoftMax).
        mems (`List[tf.Tensor]` of length `config.n_layers`):
            Contains pre-computed hidden-states. Can be used to speed up sequential decoding. The token ids which
            have their past given to this model should not be passed as `input_ids` as they have already been
            computed.
        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size, sequence_length, hidden_size)`.
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
"""
loss: tf.Tensor | None = None
logits: tf.Tensor = None
mems: List[tf.Tensor] | None = None
hidden_states: Tuple[tf.Tensor, ...] | None = None
attentions: Tuple[tf.Tensor, ...] | None = None
@dataclass
class TFXLNetForTokenClassificationOutput(ModelOutput):
"""
    Output type of [`TFXLNetForTokenClassification`].
"""
loss: tf.Tensor | None = None
logits: tf.Tensor = None
mems: List[tf.Tensor] | None = None
hidden_states: Tuple[tf.Tensor, ...] | None = None
attentions: Tuple[tf.Tensor, ...] | None = None
@dataclass
class TFXLNetForMultipleChoiceOutput(ModelOutput):
"""
Output type of [`TFXLNetForMultipleChoice`].
Args:
        loss (`tf.Tensor` of shape *(1,)*, *optional*, returned when `labels` is provided):
            Classification loss.
        logits (`tf.Tensor` of shape `(batch_size, num_choices)`):
            `num_choices` is the second dimension of the input tensors. (See `input_ids` above.)
            Classification scores (before SoftMax).
        mems (`List[tf.Tensor]` of length `config.n_layers`):
            Contains pre-computed hidden-states. Can be used to speed up sequential decoding. The token ids which
            have their past given to this model should not be passed as `input_ids` as they have already been
            computed.
        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size, sequence_length, hidden_size)`.
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
"""
loss: tf.Tensor | None = None
logits: tf.Tensor = None
mems: List[tf.Tensor] | None = None
hidden_states: Tuple[tf.Tensor, ...] | None = None
attentions: Tuple[tf.Tensor, ...] | None = None
@dataclass
class TFXLNetForQuestionAnsweringSimpleOutput(ModelOutput):
"""
Output type of [`TFXLNetForQuestionAnsweringSimple`].
"""
Args:
loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
总的跨度抽取损失,由开始和结束位置的交叉熵之和组成。
start_logits (`tf.Tensor` of shape `(batch_size, sequence_length,)`):
跨度开始位置的分数(SoftMax 之前)。
end_logits (`tf.Tensor` of shape `(batch_size, sequence_length,)`):
跨度结束位置的分数(SoftMax 之前)。
mems (`List[tf.Tensor]` of length `config.n_layers`):
包含预先计算的隐藏状态的列表。可以用于加速序列解码。
给定到该模型的过去标记 id 不应作为 `input_ids` 传递,因为它们已经计算过。
hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
包含模型每一层输出的隐藏状态的元组。
第一个张量是嵌入层的输出,后续的张量是每一层输出的隐藏状态。
形状为 `(batch_size, sequence_length, hidden_size)`。
attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
包含自注意力机制 softmax 后的注意力权重的元组。
用于计算自注意力头部的加权平均值。
形状为 `(batch_size, num_heads, sequence_length, sequence_length)`。
"""
# 初始化变量,表示损失、开始位置分数、结束位置分数、预先计算的隐藏状态、每层的隐藏状态、每层的注意力权重
loss: tf.Tensor | None = None
start_logits: tf.Tensor = None
end_logits: tf.Tensor = None
mems: List[tf.Tensor] | None = None
hidden_states: Tuple[tf.Tensor, ...] | None = None
attentions: Tuple[tf.Tensor, ...] | None = None
"""
    This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)
This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
behavior.
<Tip>
TensorFlow models and layers in `transformers` accept two formats as input:
- having all inputs as keyword arguments (like PyTorch models), or
- having all inputs as a list, tuple or dict in the first positional argument.
The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
positional argument:
- a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
- a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
`model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
- a dictionary with one or several input Tensors associated to the input names given in the docstring:
`model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
Note that when creating models and layers with
[subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
about any of this, as you can just pass inputs like you would to any other Python function!
</Tip>
Parameters:
config ([`XLNetConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
# Constant string holding the input documentation for the XLNet model
XLNET_INPUTS_DOCSTRING = r"""
"""
# The @add_start_docstrings decorator attaches the model description to TFXLNetModel
@add_start_docstrings(
    "The bare XLNet Model transformer outputting raw hidden-states without any specific head on top.",
    XLNET_START_DOCSTRING,  # reuses the shared model docstring defined above
)
# TFXLNetModel inherits from TFXLNetPreTrainedModel
class TFXLNetModel(TFXLNetPreTrainedModel):
    # The constructor accepts a config object plus arbitrary positional and keyword arguments
    def __init__(self, config, *inputs, **kwargs):
        # Initialize the parent class
        super().__init__(config, *inputs, **kwargs)
        # Create the main XLNet layer, named "transformer"
        self.transformer = TFXLNetMainLayer(config, name="transformer")
    # unpack_inputs normalizes the supported input formats; the docstring decorators attach the forward
    # documentation and a code sample (checkpoint, output type, and config class) to the method
    @unpack_inputs
    @add_start_docstrings_to_model_forward(XLNET_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TFXLNetModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def call(
        self,
        input_ids: TFModelInputType | None = None,  # input token IDs
        attention_mask: np.ndarray | tf.Tensor | None = None,  # attention mask
        mems: np.ndarray | tf.Tensor | None = None,  # cached recurrent memories
        perm_mask: np.ndarray | tf.Tensor | None = None,  # permutation mask
        target_mapping: np.ndarray | tf.Tensor | None = None,  # target mapping for partial prediction
        token_type_ids: np.ndarray | tf.Tensor | None = None,  # token type IDs
        input_mask: np.ndarray | tf.Tensor | None = None,  # input mask (inverse of attention_mask)
        head_mask: np.ndarray | tf.Tensor | None = None,  # mask for attention heads
        inputs_embeds: np.ndarray | tf.Tensor | None = None,  # pre-computed input embeddings
        use_mems: Optional[bool] = None,  # whether to use recurrent memories
        output_attentions: Optional[bool] = None,  # whether to return attention weights
        output_hidden_states: Optional[bool] = None,  # whether to return all hidden states
        return_dict: Optional[bool] = None,  # whether to return a dict-style output
        training: bool = False,  # whether the model runs in training mode
    ) -> Union[TFXLNetModelOutput, Tuple[tf.Tensor]]:
        # Delegate the forward pass to the underlying Transformer
outputs = self.transformer(
input_ids=input_ids,
attention_mask=attention_mask,
mems=mems,
perm_mask=perm_mask,
target_mapping=target_mapping,
token_type_ids=token_type_ids,
input_mask=input_mask,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
use_mems=use_mems,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
return outputs
    def build(self, input_shape=None):
        # If the model has already been built, return immediately
        if self.built:
            return
        # Mark the model as built
        self.built = True
        # If the transformer layer exists, build it inside its name scope
        if getattr(self, "transformer", None) is not None:
            with tf.name_scope(self.transformer.name):
                self.transformer.build(None)
@add_start_docstrings(
"""
XLNet Model with a language modeling head on top (linear layer with weights tied to the input embeddings).
""",
XLNET_START_DOCSTRING,
)
class TFXLNetLMHeadModel(TFXLNetPreTrainedModel, TFCausalLanguageModelingLoss):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        # Main XLNet layer, built from the given config
        self.transformer = TFXLNetMainLayer(config, name="transformer")
        # Language modeling head, tied to the word embedding weights
        self.lm_loss = TFXLNetLMHead(config, self.transformer.word_embedding, name="lm_loss")
        # XLA generation is not supported
        self.supports_xla_generation = False
    def get_lm_head(self):
        # Return the language modeling head
        return self.lm_loss
    def get_prefix_bias_name(self):
        # Deprecated; use `get_bias` instead
        warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning)
        # Return the model name joined with the LM head name
        return self.name + "/" + self.lm_loss.name
    def prepare_inputs_for_generation(self, inputs, past_key_values=None, use_mems=None, **kwargs):
        # Add a dummy token at the end of the input sequence (it is not attended to)
        effective_batch_size = inputs.shape[0]
        dummy_token = tf.zeros((effective_batch_size, 1), dtype=inputs.dtype)
        # At every pass, the attention values for the new token and the last two generated tokens are computed;
        # the rest is reloaded from the `past` cache. A purely auto-regressive model would use offset = 1;
        # offset = 2 seems to compute slightly better.
        offset = 2
        if past_key_values:
            # With a cache, keep only the last `offset` tokens and append the dummy token
            input_ids = tf.concat([inputs[:, -offset:], dummy_token], axis=1)
        else:
            # Otherwise append the dummy token to the full input
            input_ids = tf.concat([inputs, dummy_token], axis=1)
        # Build the permutation mask so that previous tokens do not see the last token
        sequence_length = input_ids.shape[1]
        perm_mask = tf.zeros((effective_batch_size, sequence_length, sequence_length - 1))
        perm_mask_seq_end = tf.ones((effective_batch_size, sequence_length, 1))
        perm_mask = tf.concat([perm_mask, perm_mask_seq_end], axis=-1)
        # Only predict the last token
        target_mapping = tf.zeros((effective_batch_size, 1, sequence_length - 1))
        target_mapping_seq_end = tf.ones((effective_batch_size, 1, 1))
        target_mapping = tf.concat([target_mapping, target_mapping_seq_end], axis=-1)
        inputs = {
            "input_ids": input_ids,
            "perm_mask": perm_mask,
            "target_mapping": target_mapping,
            "use_mems": use_mems,
        }
        # If past key values are provided, reuse them as mems for faster decoding
        if past_key_values:
            inputs["mems"] = tuple(layer_past[:-offset, :, :] for layer_past in past_key_values)
        return inputs
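    # Illustrative shape walk-through: for a batch of 4 prompts with a past cache, input_ids
    # becomes the last offset=2 tokens plus one dummy token -> (4, 3); perm_mask is (4, 3, 3)
    # with ones only in its final column, hiding the dummy token from every query; and
    # target_mapping is (4, 1, 3), selecting just that final position, so exactly one new
    # token is predicted per sequence.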
@unpack_inputs
@add_start_docstrings_to_model_forward(XLNET_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=TFXLNetLMHeadModelOutput, config_class=_CONFIG_FOR_DOC)
    # Forward method; every argument is optional and may be a NumPy array or a TensorFlow tensor
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        mems: np.ndarray | tf.Tensor | None = None,
        perm_mask: np.ndarray | tf.Tensor | None = None,
        target_mapping: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        input_mask: np.ndarray | tf.Tensor | None = None,
        head_mask: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        use_mems: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: np.ndarray | tf.Tensor | None = None,
        training: bool = False,
    ) -> Union[TFXLNetLMHeadModelOutput, Tuple[tf.Tensor]]:
        # Forward pass: run the transformer, apply the LM head, and compute the loss when labels are given
        ...  # full body elided in this excerpt
    # Build the model's layers
    def build(self, input_shape=None):
        # If already built, return immediately
        if self.built:
            return
        # Mark as built
        self.built = True
        # If the transformer exists, build it inside its name scope
        if getattr(self, "transformer", None) is not None:
            with tf.name_scope(self.transformer.name):
                self.transformer.build(None)
        # If the LM head exists, build it inside its name scope
        if getattr(self, "lm_loss", None) is not None:
            with tf.name_scope(self.lm_loss.name):
                self.lm_loss.build(None)
# Docstring: XLNet with a sequence classification/regression head on top
@add_start_docstrings(
    """
    XLNet Model with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g.
    for GLUE tasks.
    """,
    XLNET_START_DOCSTRING,
)
class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel, TFSequenceClassificationLoss):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        # Number of labels
        self.num_labels = config.num_labels
        # Main XLNet layer, named "transformer"
        self.transformer = TFXLNetMainLayer(config, name="transformer")
        # Sequence summary layer, initialized with config.initializer_range, named "sequence_summary"
        self.sequence_summary = TFSequenceSummary(
            config, initializer_range=config.initializer_range, name="sequence_summary"
        )
        # Output projection producing the logits, with config.num_labels outputs, named "logits_proj"
        self.logits_proj = keras.layers.Dense(
            config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="logits_proj"
        )
        # Keep the model configuration
        self.config = config
    # Forward pass; the decorators below attach the input docstring and a code sample
    @unpack_inputs
    @add_start_docstrings_to_model_forward(XLNET_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TFXLNetForSequenceClassificationOutput,
        config_class=_CONFIG_FOR_DOC,
    )
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
mems: np.ndarray | tf.Tensor | None = None,
perm_mask: np.ndarray | tf.Tensor | None = None,
target_mapping: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
input_mask: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
use_mems: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: np.ndarray | tf.Tensor | None = None,
training: bool = False,
) -> Union[TFXLNetForSequenceClassificationOutput, Tuple[tf.Tensor]]:
r"""
labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
        # Run the transformer forward pass and collect its outputs
transformer_outputs = self.transformer(
input_ids=input_ids,
attention_mask=attention_mask,
mems=mems,
perm_mask=perm_mask,
target_mapping=target_mapping,
token_type_ids=token_type_ids,
input_mask=input_mask,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
use_mems=use_mems,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
        # The first element of the transformer output is the sequence of hidden states
        output = transformer_outputs[0]
        # Summarize the sequence into a single vector
        output = self.sequence_summary(output)
        # Project the summary into the logits space
        logits = self.logits_proj(output)
        # Compute the loss when labels are provided; otherwise leave it as None
        loss = None if labels is None else self.hf_compute_loss(labels, logits)
        # When return_dict is False, return a plain tuple
        if not return_dict:
            output = (logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output
        # When return_dict is True, return a TFXLNetForSequenceClassificationOutput object
return TFXLNetForSequenceClassificationOutput(
loss=loss,
logits=logits,
mems=transformer_outputs.mems,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
    def build(self, input_shape=None):
        # If the model is already built, return immediately
        if self.built:
            return
        # Mark the model as built
        self.built = True
        # Build the transformer inside its name scope if it exists
        if getattr(self, "transformer", None) is not None:
            with tf.name_scope(self.transformer.name):
                self.transformer.build(None)
        # Build the sequence summary layer inside its name scope if it exists
        if getattr(self, "sequence_summary", None) is not None:
            with tf.name_scope(self.sequence_summary.name):
                self.sequence_summary.build(None)
        # Build the logits projection inside its name scope if it exists
        if getattr(self, "logits_proj", None) is not None:
            with tf.name_scope(self.logits_proj.name):
                self.logits_proj.build([None, None, self.config.d_model])
# Docstring: XLNet with a multiple-choice classification head on top, e.g. for RocStories/SWAG tasks
@add_start_docstrings(
    """
    XLNET Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
    softmax) e.g. for RocStories/SWAG tasks.
    """,
    XLNET_START_DOCSTRING,  # reuses the shared XLNet model docstring
)
class TFXLNetForMultipleChoice(TFXLNetPreTrainedModel, TFMultipleChoiceLoss):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        # Main XLNet layer, named "transformer"
        self.transformer = TFXLNetMainLayer(config, name="transformer")
        # Sequence summary layer used to pool each choice
        self.sequence_summary = TFSequenceSummary(
            config, initializer_range=config.initializer_range, name="sequence_summary"
        )
        # Projection producing one score per choice
        self.logits_proj = keras.layers.Dense(
            1, kernel_initializer=get_initializer(config.initializer_range), name="logits_proj"
        )
        # Keep the model configuration
        self.config = config
    # Forward pass; the decorators attach the input docstring and a code sample
    @unpack_inputs
    @add_start_docstrings_to_model_forward(XLNET_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,  # example checkpoint
        output_type=TFXLNetForMultipleChoiceOutput,  # output type for the example
        config_class=_CONFIG_FOR_DOC,  # config class for the example
    )
def call(
self,
input_ids: TFModelInputType | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
input_mask: np.ndarray | tf.Tensor | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
mems: np.ndarray | tf.Tensor | None = None,
perm_mask: np.ndarray | tf.Tensor | None = None,
target_mapping: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
use_mems: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: np.ndarray | tf.Tensor | None = None,
        training: bool = False,
    ) -> Union[TFXLNetForMultipleChoiceOutput, Tuple[tf.Tensor]]:
        r"""
labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices]`
where `num_choices` is the size of the second dimension of the input tensors. (See `input_ids` above)
"""
        # If input_ids is provided, take its second dimension as num_choices
        if input_ids is not None:
            num_choices = shape_list(input_ids)[1]
            # ... and its third dimension as seq_length
            seq_length = shape_list(input_ids)[2]
        else:
            # Otherwise take num_choices from the second dimension of inputs_embeds
            num_choices = shape_list(inputs_embeds)[1]
            # ... and seq_length from its third dimension
            seq_length = shape_list(inputs_embeds)[2]
        # Flatten each input tensor to two dimensions, when it is not None
        flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) if input_ids is not None else None
        flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None
        flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None
        flat_input_mask = tf.reshape(input_mask, (-1, seq_length)) if input_mask is not None else None
        flat_inputs_embeds = (
            tf.reshape(inputs_embeds, (-1, seq_length, shape_list(inputs_embeds)[3]))
            if inputs_embeds is not None
            else None
        )
        # Call the transformer with the flattened inputs and the remaining arguments
transformer_outputs = self.transformer(
flat_input_ids,
flat_attention_mask,
mems,
perm_mask,
target_mapping,
flat_token_type_ids,
flat_input_mask,
head_mask,
flat_inputs_embeds,
use_mems,
output_attentions,
output_hidden_states,
return_dict=return_dict,
training=training,
)
        # The first element of the transformer output is the sequence of hidden states
        output = transformer_outputs[0]
        # Summarize each flattened sequence
        logits = self.sequence_summary(output)
        # Project the summaries to one score per choice
        logits = self.logits_proj(logits)
        # Reshape the scores back to (batch_size, num_choices)
        reshaped_logits = tf.reshape(logits, (-1, num_choices))
        # Compute the loss when labels are provided; otherwise leave it as None
        loss = None if labels is None else self.hf_compute_loss(labels, reshaped_logits)
        # When return_dict is False, return the reshaped logits plus any extra outputs as a tuple
        if not return_dict:
            output = (reshaped_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output
        # When return_dict is True, return a TFXLNetForMultipleChoiceOutput object
return TFXLNetForMultipleChoiceOutput(
loss=loss,
logits=reshaped_logits,
mems=transformer_outputs.mems,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
    # Build the model; return immediately if it was already built
    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        # Build the transformer inside its name scope if it exists
        if getattr(self, "transformer", None) is not None:
            with tf.name_scope(self.transformer.name):
                self.transformer.build(None)
        # Build the sequence summary inside its name scope if it exists
        if getattr(self, "sequence_summary", None) is not None:
            with tf.name_scope(self.sequence_summary.name):
                self.sequence_summary.build(None)
        # Build the logits projection inside its name scope if it exists
        if getattr(self, "logits_proj", None) is not None:
            with tf.name_scope(self.logits_proj.name):
                # The projection takes inputs of shape [None, None, self.config.d_model]
                self.logits_proj.build([None, None, self.config.d_model])
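The multiple-choice head works by folding the choices into the batch axis before the transformer and unfolding the per-choice scores afterwards. A standalone sketch of that reshape round-trip (illustrative shapes only):

```python
import tensorflow as tf

batch_size, num_choices, seq_length = 2, 4, 16
input_ids = tf.zeros((batch_size, num_choices, seq_length), dtype=tf.int32)

# Fold the choices into the batch axis before running the transformer ...
flat_input_ids = tf.reshape(input_ids, (-1, seq_length))  # (8, 16)

# ... then unfold the single score produced for each flattened row
logits = tf.zeros((batch_size * num_choices, 1))           # output of logits_proj
reshaped_logits = tf.reshape(logits, (-1, num_choices))    # (2, 4)
```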
"""
XLNet Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
Named-Entity-Recognition (NER) tasks.
"""
# 继承自TFXLNetPreTrainedModel和TFTokenClassificationLoss的XLNet模型,用于标记分类任务(如命名实体识别NER)。
class TFXLNetForTokenClassification(TFXLNetPreTrainedModel, TFTokenClassificationLoss):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        # Number of classification labels
        self.num_labels = config.num_labels
        # Main XLNet layer, named "transformer"
        self.transformer = TFXLNetMainLayer(config, name="transformer")
        # Classifier: a dense layer with config.num_labels outputs, initialized from config.initializer_range
        self.classifier = keras.layers.Dense(
            config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
        )
        # Keep the configuration
        self.config = config
    # Forward pass; the decorators unpack the inputs and attach the docstrings
@unpack_inputs
@add_start_docstrings_to_model_forward(XLNET_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFXLNetForTokenClassificationOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
mems: np.ndarray | tf.Tensor | None = None,
perm_mask: np.ndarray | tf.Tensor | None = None,
target_mapping: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
input_mask: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
use_mems: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: np.ndarray | tf.Tensor | None = None,
training: bool = False,
"""
进行模型的前向传播,接受多种输入参数,包括input_ids、attention_mask等,以及用于训练的labels和是否处于训练模式的training标志位。
"""
) -> Union`
) -> Union[TFXLNetForTokenClassificationOutput, Tuple[tf.Tensor]]:
r"""
labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
        # Run the transformer on the inputs and collect its outputs
transformer_outputs = self.transformer(
input_ids=input_ids,
attention_mask=attention_mask,
mems=mems,
perm_mask=perm_mask,
target_mapping=target_mapping,
token_type_ids=token_type_ids,
input_mask=input_mask,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
use_mems=use_mems,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
        # The first element of the transformer output is the sequence of hidden states
        output = transformer_outputs[0]
        # Run the classifier on the hidden states to obtain per-token logits
        logits = self.classifier(output)
        # Compute the loss when labels are provided; otherwise leave it as None
        loss = None if labels is None else self.hf_compute_loss(labels, logits)
        # When return_dict is False, return a tuple of logits plus the extra outputs
        if not return_dict:
            output = (logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output
        # When return_dict is True, build and return a TFXLNetForTokenClassificationOutput object
return TFXLNetForTokenClassificationOutput(
loss=loss,
logits=logits,
mems=transformer_outputs.mems,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
    def build(self, input_shape=None):
        # If already built, return immediately
        if self.built:
            return
        # Mark as built
        self.built = True
        # Build the transformer if it exists
        if getattr(self, "transformer", None) is not None:
            with tf.name_scope(self.transformer.name):
                self.transformer.build(None)
        # Build the classifier if it exists
        if getattr(self, "classifier", None) is not None:
            with tf.name_scope(self.classifier.name):
                self.classifier.build([None, None, self.config.hidden_size])
# Docstring: XLNet with a span-classification head for extractive question answering (e.g. SQuAD),
# computing `span start logits` and `span end logits` with a linear layer over the hidden states
@add_start_docstrings(
"""
XLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
XLNET_START_DOCSTRING,
)
class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel, TFQuestionAnsweringLoss):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        # Main XLNet layer, named "transformer"
        self.transformer = TFXLNetMainLayer(config, name="transformer")
        # Dense layer for the QA outputs, with config.num_labels outputs
        self.qa_outputs = keras.layers.Dense(
            config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
        )
        # Keep the configuration
        self.config = config
    # Forward pass; the decorators attach the input docstring and a code sample
    @unpack_inputs
    @add_start_docstrings_to_model_forward(XLNET_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TFXLNetForQuestionAnsweringSimpleOutput,
        config_class=_CONFIG_FOR_DOC,
    )
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
mems: np.ndarray | tf.Tensor | None = None,
perm_mask: np.ndarray | tf.Tensor | None = None,
target_mapping: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
input_mask: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
use_mems: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
start_positions: np.ndarray | tf.Tensor | None = None,
end_positions: np.ndarray | tf.Tensor | None = None,
training: bool = False,
) -> Union[TFXLNetForQuestionAnsweringSimpleOutput, Tuple[tf.Tensor]]:
r"""
start_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
end_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
"""
        # Run the transformer forward pass and collect its outputs
transformer_outputs = self.transformer(
input_ids=input_ids,
attention_mask=attention_mask,
mems=mems,
perm_mask=perm_mask,
target_mapping=target_mapping,
token_type_ids=token_type_ids,
input_mask=input_mask,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
use_mems=use_mems,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
        # Extract the sequence of hidden states from the transformer output
        sequence_output = transformer_outputs[0]
        # Feed the sequence output to the QA head to obtain start- and end-position logits
        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = tf.split(logits, 2, axis=-1)
        start_logits = tf.squeeze(start_logits, axis=-1)
        end_logits = tf.squeeze(end_logits, axis=-1)
        # Compute the loss when start and end position labels are provided
        loss = None
        if start_positions is not None and end_positions is not None:
            labels = {"start_position": start_positions}
            labels["end_position"] = end_positions
            loss = self.hf_compute_loss(labels, (start_logits, end_logits))
        # When return_dict is False, assemble the output tuple
        if not return_dict:
            output = (start_logits, end_logits) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output
        # When return_dict is True, build a TFXLNetForQuestionAnsweringSimpleOutput object
return TFXLNetForQuestionAnsweringSimpleOutput(
loss=loss,
start_logits=start_logits,
end_logits=end_logits,
mems=transformer_outputs.mems,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        # Build the transformer if it has been defined
        if getattr(self, "transformer", None) is not None:
            with tf.name_scope(self.transformer.name):
                self.transformer.build(None)
        # Build the QA output layer if it has been defined
        if getattr(self, "qa_outputs", None) is not None:
            with tf.name_scope(self.qa_outputs.name):
                self.qa_outputs.build([None, None, self.config.hidden_size])
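`qa_outputs` emits two numbers per token (`config.num_labels == 2` for SQuAD-style heads); the split-and-squeeze step pulls them apart into per-token start and end scores. A standalone sketch with illustrative tensors:

```python
import tensorflow as tf

batch_size, seq_length = 2, 8
logits = tf.random.normal((batch_size, seq_length, 2))  # shape produced by qa_outputs

start_logits, end_logits = tf.split(logits, 2, axis=-1)  # two (2, 8, 1) tensors
start_logits = tf.squeeze(start_logits, axis=-1)         # (2, 8): span-start score per token
end_logits = tf.squeeze(end_logits, axis=-1)             # (2, 8): span-end score per token
```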
.\models\xlnet\modeling_xlnet.py
import torch
from torch import nn
from ...utils import logging
logger = logging.get_logger(__name__)
def build_tf_xlnet_to_pytorch_map(model, config, tf_weights=None):
    """Build a map from TF variable names to the corresponding PyTorch parameters."""
tf_to_pt_map = {}
if hasattr(model, "transformer"):
if hasattr(model, "lm_loss"):
tf_to_pt_map["model/lm_loss/bias"] = model.lm_loss.bias
if hasattr(model, "sequence_summary") and "model/sequnece_summary/summary/kernel" in tf_weights:
tf_to_pt_map["model/sequnece_summary/summary/kernel"] = model.sequence_summary.summary.weight
tf_to_pt_map["model/sequnece_summary/summary/bias"] = model.sequence_summary.summary.bias
if (
hasattr(model, "logits_proj")
and config.finetuning_task is not None
and f"model/regression_{config.finetuning_task}/logit/kernel" in tf_weights
):
tf_to_pt_map[f"model/regression_{config.finetuning_task}/logit/kernel"] = model.logits_proj.weight
tf_to_pt_map[f"model/regression_{config.finetuning_task}/logit/bias"] = model.logits_proj.bias
model = model.transformer
tf_to_pt_map.update(
{
"model/transformer/word_embedding/lookup_table": model.word_embedding.weight,
"model/transformer/mask_emb/mask_emb": model.mask_emb,
}
)
for i, b in enumerate(model.layer):
layer_str = f"model/transformer/layer_{i}/"
tf_to_pt_map.update(
{
layer_str + "rel_attn/LayerNorm/gamma": b.rel_attn.layer_norm.weight,
layer_str + "rel_attn/LayerNorm/beta": b.rel_attn.layer_norm.bias,
layer_str + "rel_attn/o/kernel": b.rel_attn.o,
layer_str + "rel_attn/q/kernel": b.rel_attn.q,
layer_str + "rel_attn/k/kernel": b.rel_attn.k,
layer_str + "rel_attn/r/kernel": b.rel_attn.r,
layer_str + "rel_attn/v/kernel": b.rel_attn.v,
layer_str + "ff/LayerNorm/gamma": b.ff.layer_norm.weight,
layer_str + "ff/LayerNorm/beta": b.ff.layer_norm.bias,
layer_str + "ff/layer_1/kernel": b.ff.layer_1.weight,
layer_str + "ff/layer_1/bias": b.ff.layer_1.bias,
layer_str + "ff/layer_2/kernel": b.ff.layer_2.weight,
layer_str + "ff/layer_2/bias": b.ff.layer_2.bias,
}
)
if config.untie_r:
r_r_list = []
r_w_list = []
r_s_list = []
seg_embed_list = []
for b in model.layer:
r_r_list.append(b.rel_attn.r_r_bias)
r_w_list.append(b.rel_attn.r_w_bias)
r_s_list.append(b.rel_attn.r_s_bias)
seg_embed_list.append(b.rel_attn.seg_embed)
else:
r_r_list = [model.r_r_bias]
r_w_list = [model.r_w_bias]
r_s_list = [model.r_s_bias]
seg_embed_list = [model.seg_embed]
tf_to_pt_map.update(
{
"model/transformer/r_r_bias": r_r_list,
"model/transformer/r_w_bias": r_w_list,
"model/transformer/r_s_bias": r_s_list,
"model/transformer/seg_embed": seg_embed_list,
}
)
return tf_to_pt_map
def load_tf_weights_in_xlnet(model, config, tf_path):
"""Load tf checkpoints in a pytorch model"""
try:
import numpy as np
import tensorflow as tf
except ImportError:
logger.error(
"Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see "
"https://www.tensorflow.org/install/ for installation instructions."
)
raise
init_vars = tf.train.list_variables(tf_path)
tf_weights = {}
for name, shape in init_vars:
logger.info(f"Loading TF weight {name} with shape {shape}")
array = tf.train.load_variable(tf_path, name)
tf_weights[name] = array
tf_to_pt_map = build_tf_xlnet_to_pytorch_map(model, config, tf_weights)
for name, pointer in tf_to_pt_map.items():
logger.info(f"Importing {name}")
if name not in tf_weights:
logger.info(f"{name} not in tf pre-trained weights, skipping")
continue
array = tf_weights[name]
if "kernel" in name and ("ff" in name or "summary" in name or "logit" in name):
logger.info("Transposing")
array = np.transpose(array)
if isinstance(pointer, list):
assert (
len(pointer) == array.shape[0]
), f"Pointer length {len(pointer)} and array length {array.shape[0]} mismatched"
for i, p_i in enumerate(pointer):
arr_i = array[i, ...]
try:
assert (
p_i.shape == arr_i.shape
), f"Pointer shape {p_i.shape} and array shape {arr_i.shape} mismatched"
except AssertionError as e:
e.args += (p_i.shape, arr_i.shape)
raise
logger.info(f"Initialize PyTorch weight {name} for layer {i}")
p_i.data = torch.from_numpy(arr_i)
else:
try:
assert (
pointer.shape == array.shape
), f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched"
except AssertionError as e:
e.args += (pointer.shape, array.shape)
raise
logger.info(f"Initialize PyTorch weight {name}")
pointer.data = torch.from_numpy(array)
tf_weights.pop(name, None)
tf_weights.pop(name + "/Adam", None)
tf_weights.pop(name + "/Adam_1", None)
logger.info(f"Weights not copied to PyTorch model: {', '.join(tf_weights.keys())}")
return model
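A minimal sketch of how this conversion helper might be invoked (the checkpoint path is a placeholder, and the config must mirror the checkpoint's hyperparameters):

```
from transformers import XLNetConfig, XLNetLMHeadModel

config = XLNetConfig()  # must match the TF checkpoint's architecture
model = XLNetLMHeadModel(config)
# Populates the PyTorch parameters from the TF variables listed in the checkpoint.
model = load_tf_weights_in_xlnet(model, config, "/path/to/xlnet_model.ckpt")
```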
class XLNetRelativeAttention(nn.Module):
def __init__(self, config):
super().__init__()
if config.d_model % config.n_head != 0:
raise ValueError(
f"The hidden size ({config.d_model}) is not a multiple of the number of attention "
f"heads ({config.n_head})"
)
self.n_head = config.n_head
self.d_head = config.d_head
self.d_model = config.d_model
self.scale = 1 / (config.d_head**0.5)
self.q = nn.Parameter(torch.FloatTensor(config.d_model, self.n_head, self.d_head))
self.k = nn.Parameter(torch.FloatTensor(config.d_model, self.n_head, self.d_head))
self.v = nn.Parameter(torch.FloatTensor(config.d_model, self.n_head, self.d_head))
self.o = nn.Parameter(torch.FloatTensor(config.d_model, self.n_head, self.d_head))
self.r = nn.Parameter(torch.FloatTensor(config.d_model, self.n_head, self.d_head))
self.r_r_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head))
self.r_s_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head))
self.r_w_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head))
self.seg_embed = nn.Parameter(torch.FloatTensor(2, self.n_head, self.d_head))
self.layer_norm = nn.LayerNorm(config.d_model, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.dropout)
def prune_heads(self, heads):
raise NotImplementedError
@staticmethod
def rel_shift(x, klen=-1):
"""perform relative shift to form the relative attention score."""
x_size = x.shape
x = x.reshape(x_size[1], x_size[0], x_size[2], x_size[3])
x = x[1:, ...]
x = x.reshape(x_size[0], x_size[1] - 1, x_size[2], x_size[3])
x = torch.index_select(x, 1, torch.arange(klen, device=x.device, dtype=torch.long))
return x
@staticmethod
def rel_shift_bnij(x, klen=-1):
x_size = x.shape
x = x.reshape(x_size[0], x_size[1], x_size[3], x_size[2])
x = x[:, :, 1:, :]
x = x.reshape(x_size[0], x_size[1], x_size[2], x_size[3] - 1)
x = torch.index_select(x, 3, torch.arange(klen, device=x.device, dtype=torch.long))
return x
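The reshape/slice sequence in `rel_shift_bnij` realigns the last axis from relative offsets to absolute key positions. A small shape-level sketch (illustrative sizes only):

```
import torch

x = torch.arange(12.0).reshape(1, 1, 3, 4)  # (batch, heads, qlen, relative positions)
shifted = XLNetRelativeAttention.rel_shift_bnij(x, klen=3)
print(shifted.shape)  # torch.Size([1, 1, 3, 3]) -- one score per (query, key) pair
```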
def rel_attn_core(
self,
q_head,
k_head_h,
v_head_h,
k_head_r,
seg_mat=None,
attn_mask=None,
head_mask=None,
output_attentions=False,
):
"""Core relative positional attention operations."""
ac = torch.einsum("ibnd,jbnd->bnij", q_head + self.r_w_bias, k_head_h)
bd = torch.einsum("ibnd,jbnd->bnij", q_head + self.r_r_bias, k_head_r)
bd = self.rel_shift_bnij(bd, klen=ac.shape[3])
if seg_mat is None:
ef = 0
else:
ef = torch.einsum("ibnd,snd->ibns", q_head + self.r_s_bias, self.seg_embed)
ef = torch.einsum("ijbs,ibns->bnij", seg_mat, ef)
attn_score = (ac + bd + ef) * self.scale
if attn_mask is not None:
if attn_mask.dtype == torch.float16:
attn_score = attn_score - 65500 * torch.einsum("ijbn->bnij", attn_mask)
else:
attn_score = attn_score - 1e30 * torch.einsum("ijbn->bnij", attn_mask)
attn_prob = nn.functional.softmax(attn_score, dim=3)
attn_prob = self.dropout(attn_prob)
if head_mask is not None:
attn_prob = attn_prob * torch.einsum("ijbn->bnij", head_mask)
attn_vec = torch.einsum("bnij,jbnd->ibnd", attn_prob, v_head_h)
if output_attentions:
return attn_vec, torch.einsum("bnij->ijbn", attn_prob)
return attn_vec
def post_attention(self, h, attn_vec, residual=True):
"""Post-attention processing."""
attn_out = torch.einsum("ibnd,hnd->ibh", attn_vec, self.o)
attn_out = self.dropout(attn_out)
if residual:
attn_out = attn_out + h
output = self.layer_norm(attn_out)
return output
def forward(
self,
h,
g,
attn_mask_h,
attn_mask_g,
r,
seg_mat,
mems=None,
target_mapping=None,
head_mask=None,
output_attentions=False,
):
# (attention body omitted in this walkthrough)
class XLNetFeedForward(nn.Module):
def __init__(self, config):
super().__init__()
self.layer_norm = nn.LayerNorm(config.d_model, eps=config.layer_norm_eps)
self.layer_1 = nn.Linear(config.d_model, config.d_inner)
self.layer_2 = nn.Linear(config.d_inner, config.d_model)
self.dropout = nn.Dropout(config.dropout)
if isinstance(config.ff_activation, str):
self.activation_function = ACT2FN[config.ff_activation]
else:
self.activation_function = config.ff_activation
def forward(self, inp):
output = inp
output = self.layer_1(output)
output = self.activation_function(output)
output = self.dropout(output)
output = self.layer_2(output)
output = self.dropout(output)
output = self.layer_norm(output + inp)
return output
class XLNetLayer(nn.Module):
def __init__(self, config):
super().__init__()
self.rel_attn = XLNetRelativeAttention(config)
self.ff = XLNetFeedForward(config)
self.dropout = nn.Dropout(config.dropout)
self.chunk_size_feed_forward = config.chunk_size_feed_forward
self.seq_len_dim = 1
def forward(
self,
output_h,
output_g,
attn_mask_h,
attn_mask_g,
r,
seg_mat,
mems=None,
target_mapping=None,
head_mask=None,
output_attentions=False,
):
outputs = self.rel_attn(
output_h,
output_g,
attn_mask_h,
attn_mask_g,
r,
seg_mat,
mems=mems,
target_mapping=target_mapping,
head_mask=head_mask,
output_attentions=output_attentions,
)
output_h, output_g = outputs[:2]
if output_g is not None:
output_g = apply_chunking_to_forward(
self.ff_chunk, self.chunk_size_feed_forward, self.seq_len_dim, output_g
)
output_h = apply_chunking_to_forward(self.ff_chunk, self.chunk_size_feed_forward, self.seq_len_dim, output_h)
outputs = (output_h, output_g) + outputs[2:]
return outputs
def ff_chunk(self, output_x):
output_x = self.ff(output_x)
return output_x
class XLNetPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weight initialization and a simple interface for downloading and loading pretrained models.
"""
config_class = XLNetConfig
load_tf_weights = load_tf_weights_in_xlnet
base_model_prefix = "transformer"
def _init_weights(self, module):
"""Initialize the weights."""
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
elif isinstance(module, XLNetRelativeAttention):
for param in [
module.q,
module.k,
module.v,
module.o,
module.r,
module.r_r_bias,
module.r_s_bias,
module.r_w_bias,
module.seg_embed,
]:
param.data.normal_(mean=0.0, std=self.config.initializer_range)
elif isinstance(module, XLNetModel):
module.mask_emb.data.normal_(mean=0.0, std=self.config.initializer_range)
@dataclass
class XLNetModelOutput(ModelOutput):
"""
Output type of [`XLNetModel`].
Args:
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_predict, hidden_size)`):
Sequence of hidden-states at the last layer of the model.
`num_predict` corresponds to `target_mapping.shape[1]`. If `target_mapping` is `None`, then `num_predict`
corresponds to `sequence_length`.
mems (`List[torch.FloatTensor]` of length `config.n_layers`):
Contains pre-computed hidden-states. Can be used (see `mems` input) to speed up sequential decoding. The
token ids which have their past given to this model should not be passed as `input_ids` as they have
already been computed.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
last_hidden_state: torch.FloatTensor
mems: Optional[List[torch.FloatTensor]] = None
hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
@dataclass
class XLNetLMHeadModelOutput(ModelOutput):
"""
Output type of [`XLNetLMHeadModel`].
This class serves as the output specification for the XLNet language-modeling head.
Args:
loss (`torch.FloatTensor` of shape *(1,)*, *optional*, returned when `labels` is provided):
Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, num_predict, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
`num_predict` corresponds to `target_mapping.shape[1]`. If `target_mapping` is `None`, then `num_predict`
corresponds to `sequence_length`.
mems (`List[torch.FloatTensor]` of length `config.n_layers`):
Contains pre-computed hidden-states. Can be used (see `mems` input) to speed up sequential decoding. The
token ids which have their past given to this model should not be passed as `input_ids` as they have
already been computed.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
# Optional attributes representing outputs from the language model
loss: Optional[torch.FloatTensor] = None
logits: torch.FloatTensor = None
mems: Optional[List[torch.FloatTensor]] = None
hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
# Dataclass holding the outputs of XLNet for sequence classification tasks
@dataclass
class XLNetForSequenceClassificationOutput(ModelOutput):
"""
Output type of [`XLNetForSequenceClassification`].
Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Classification (or regression if `config.num_labels==1`) loss.
logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
Classification (or regression if `config.num_labels==1`) scores (before SoftMax).
mems (`List[torch.FloatTensor]` of length `config.n_layers`):
Contains pre-computed hidden-states that can be used to speed up sequential decoding. The token ids which
have their past given to this model should not be passed as `input_ids` again, as they have already been computed.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
"""
# Loss, returned when `labels` is provided
loss: Optional[torch.FloatTensor] = None
# Classification (or regression) scores of shape `(batch_size, config.num_labels)`
logits: torch.FloatTensor = None
# List of pre-computed hidden states, of length `config.n_layers`
mems: Optional[List[torch.FloatTensor]] = None
# Tuple of per-layer hidden states, including the initial embedding output
hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
# Tuple of per-layer attention weights
attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
# Dataclass holding the outputs of XLNet for token classification tasks
@dataclass
class XLNetForTokenClassificationOutput(ModelOutput):
"""
Output type of [`XLNetForTokenClassification`].
Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Classification loss.
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`):
Classification scores (before SoftMax).
mems (`List[torch.FloatTensor]` of length `config.n_layers`):
Contains pre-computed hidden-states that can be used to speed up sequential decoding. The token ids which
have their past given to this model should not be passed as `input_ids` again, as they have already been computed.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of the model's hidden states at the output of each layer plus the initial embedding output, each of
shape `(batch_size, sequence_length, hidden_size)`.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Attention weights after the softmax, of shape `(batch_size, num_heads, sequence_length, sequence_length)`,
used to compute the weighted average in the self-attention heads.
"""
loss: Optional[torch.FloatTensor] = None
logits: torch.FloatTensor = None
mems: Optional[List[torch.FloatTensor]] = None
hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
@dataclass
class XLNetForMultipleChoiceOutput(ModelOutput):
"""
Output type of [`XLNetForMultipleChoice`].
Args:
loss (`torch.FloatTensor` of shape *(1,)*, *optional*, returned when `labels` is provided):
Classification loss. Represents the loss value associated with the classification task.
logits (`torch.FloatTensor` of shape `(batch_size, num_choices)`):
Classification scores (before SoftMax). These scores represent the model's raw output for each choice in a multiple-choice scenario.
mems (`List[torch.FloatTensor]` of length `config.n_layers`):
Contains pre-computed hidden-states. Can be used to speed up sequential decoding by providing already computed hidden states from previous decoding steps.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple containing hidden-states of the model at the output of each layer plus the initial embedding outputs. This helps in accessing intermediate hidden states if needed.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple containing attention weights after the attention softmax. These weights are used to compute the weighted average in the self-attention heads.
Each element in the tuple corresponds to attention weights from different layers of the model.
"""
loss: Optional[torch.FloatTensor] = None
logits: torch.FloatTensor = None
mems: Optional[List[torch.FloatTensor]] = None
hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
@dataclass
class XLNetForQuestionAnsweringSimpleOutput(ModelOutput):
"""
Output type of [`XLNetForQuestionAnsweringSimple`].
This class represents the output structure for extractive question answering with XLNet, inheriting from
`ModelOutput` to provide a standardized way to represent model outputs in this context.
Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Total span-extraction loss: the sum of the cross-entropy losses for the start and end positions.
start_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length,)`):
Span-start scores (before SoftMax).
end_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length,)`):
Span-end scores (before SoftMax).
mems (`List[torch.FloatTensor]` of length `config.n_layers`):
Contains pre-computed hidden-states that can be used to speed up sequential decoding. The token ids which
have their past given to this model should not be passed as `input_ids` again, as they have already been computed.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of the model's hidden states at the output of each layer plus the initial embedding output, each of
shape `(batch_size, sequence_length, hidden_size)`.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of per-layer attention weights of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
"""
# Field types and default values
loss: Optional[torch.FloatTensor] = None
start_logits: torch.FloatTensor = None
end_logits: torch.FloatTensor = None
mems: Optional[List[torch.FloatTensor]] = None
hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
# Dataclass storing the outputs of the XLNet question-answering model, inheriting from ModelOutput
@dataclass
class XLNetForQuestionAnsweringOutput(ModelOutput):
"""
Output type of [`XLNetForQuestionAnswering`].
Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned if both `start_positions` and `end_positions` are provided):
Classification loss: the sum of the start-token and end-token classification losses (and the
`is_impossible` loss if `is_impossible` is provided).
start_top_log_probs (`torch.FloatTensor` of shape `(batch_size, config.start_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
Log probabilities for the top `config.start_n_top` start-token possibilities (beam search).
start_top_index (`torch.LongTensor` of shape `(batch_size, config.start_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
Indices for the top `config.start_n_top` start-token possibilities (beam search).
end_top_log_probs (`torch.FloatTensor` of shape `(batch_size, config.start_n_top * config.end_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
Log probabilities for the top `config.start_n_top * config.end_n_top` end-token possibilities (beam search).
end_top_index (`torch.LongTensor` of shape `(batch_size, config.start_n_top * config.end_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
Indices for the top `config.start_n_top * config.end_n_top` end-token possibilities (beam search).
cls_logits (`torch.FloatTensor` of shape `(batch_size,)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
Log probabilities for the `is_impossible` label of the answers.
mems (`List[torch.FloatTensor]` of length `config.n_layers`):
Contains pre-computed hidden-states that can be used to speed up sequential decoding. The token ids which
have their past given to this model should not be passed as `input_ids` again, as they have already been computed.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of per-layer attention weights (after the softmax) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`, used to compute the weighted average in the self-attention heads.
"""
# Optional loss tensor
loss: Optional[torch.FloatTensor] = None
# Optional log probabilities of the top start positions
start_top_log_probs: Optional[torch.FloatTensor] = None
# Optional indices of the top start positions
start_top_index: Optional[torch.LongTensor] = None
# Optional log probabilities of the top end positions
end_top_log_probs: Optional[torch.FloatTensor] = None
# Optional indices of the top end positions
end_top_index: Optional[torch.LongTensor] = None
# Optional classifier logits
cls_logits: Optional[torch.FloatTensor] = None
# Optional list of memory tensors
mems: Optional[List[torch.FloatTensor]] = None
# Optional tuple of hidden-state tensors
hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
# Optional tuple of attention tensors
attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
# Docstring for the XLNet model, describing its inheritance and common usage
XLNET_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`XLNetConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
# Docstring for the XLNet model inputs (currently empty)
XLNET_INPUTS_DOCSTRING = r"""
"""
# Decorator adding the docstring that introduces the XLNetModel class
@add_start_docstrings(
"The bare XLNet Model transformer outputting raw hidden-states without any specific head on top.",
XLNET_START_DOCSTRING,
)
# XLNetModel class, inheriting from XLNetPreTrainedModel
class XLNetModel(XLNetPreTrainedModel):
def __init__(self, config):
super().__init__(config)
# Model hyperparameters
self.mem_len = config.mem_len  # memory length
self.reuse_len = config.reuse_len  # reuse length
self.d_model = config.d_model  # model dimension
self.same_length = config.same_length  # whether to use the same attention length
self.attn_type = config.attn_type  # attention type
self.bi_data = config.bi_data  # whether to use bidirectional data
self.clamp_len = config.clamp_len  # clamping length
self.n_layer = config.n_layer  # number of layers
# Word embedding, mask embedding, layer stack, and dropout
self.word_embedding = nn.Embedding(config.vocab_size, config.d_model)  # word embedding layer
self.mask_emb = nn.Parameter(torch.FloatTensor(1, 1, config.d_model))  # mask embedding
self.layer = nn.ModuleList([XLNetLayer(config) for _ in range(config.n_layer)])  # stack of XLNetLayer modules
self.dropout = nn.Dropout(config.dropout)  # dropout layer
# Initialize weights and apply final processing
self.post_init()
# Get the input word embedding layer
def get_input_embeddings(self):
return self.word_embedding
# Set the input word embedding layer
def set_input_embeddings(self, new_embeddings):
self.word_embedding = new_embeddings
# Head pruning; not implemented yet
def _prune_heads(self, heads_to_prune):
raise NotImplementedError
def create_mask(self, qlen, mlen):
"""
Creates causal attention mask. Float mask where 1.0 indicates masked, 0.0 indicates not-masked.
Args:
qlen: Sequence length
mlen: Mask length
::

          same_length=False:          same_length=True:
          <mlen > <  qlen >           <mlen > <  qlen >
       ^ [0 0 0 0 0 1 1 1 1]         [0 0 0 0 0 1 1 1 1]
         [0 0 0 0 0 0 1 1 1]         [1 0 0 0 0 0 1 1 1]
    qlen [0 0 0 0 0 0 0 1 1]         [1 1 0 0 0 0 0 1 1]
         [0 0 0 0 0 0 0 0 1]         [1 1 1 0 0 0 0 0 1]
       v [0 0 0 0 0 0 0 0 0]         [1 1 1 1 0 0 0 0 0]
"""
# Start from an all-ones tensor of shape (qlen, qlen + mlen)
mask = torch.ones((qlen, qlen + mlen), device=self.device)
# When same_length is True the mask needs special handling
if self.same_length:
# Keep the strictly lower triangle of the qlen block and add it back after
# zeroing everything at or below diagonal offset mlen
mask_lo = mask[:, :qlen].tril(-1)
mask.triu_(mlen + 1)
mask[:, :qlen] += mask_lo
else:
# Otherwise keep only the entries strictly above diagonal offset mlen (future positions stay masked)
mask.triu_(mlen + 1)
return mask
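A quick sanity check of the mask layout (a sketch that assumes `model` is an initialized `XLNetModel` with `same_length=False`):

```
mask = model.create_mask(qlen=4, mlen=2)
# tensor([[0., 0., 0., 1., 1., 1.],
#         [0., 0., 0., 0., 1., 1.],
#         [0., 0., 0., 0., 0., 1.],
#         [0., 0., 0., 0., 0., 0.]])
# Row i (query) may attend to the 2 memory slots and to positions <= i; 1.0 marks masked entries.
```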
def cache_mem(self, curr_out, prev_mem):
# Cache the current output as the new memory state.
# If reuse_len is defined and positive, keep only the first reuse_len steps of the current output
if self.reuse_len is not None and self.reuse_len > 0:
curr_out = curr_out[: self.reuse_len]
# Choose the cutoff depending on whether mem_len is defined
if self.mem_len is None or self.mem_len == 0:
# If use_mems is active but mem_len is not defined, the model behaves like GPT-2 at inference
# and returns all of the past and current hidden states.
cutoff = 0
else:
# If use_mems is active and mem_len is defined, the model returns the last mem_len hidden states;
# this is the preferred setting for training and for generating long texts.
cutoff = -self.mem_len
# Without a previous memory, the truncated current output becomes the new memory
if prev_mem is None:
new_mem = curr_out[cutoff:]
else:
# Otherwise concatenate the previous memory with the current output and truncate at the cutoff
new_mem = torch.cat([prev_mem, curr_out], dim=0)[cutoff:]
return new_mem.detach()
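A shape-level sketch of the memory update (assumes `model` is an `XLNetModel` configured with `mem_len=3` and `reuse_len=None`; the sizes are illustrative):

```
import torch

prev_mem = torch.zeros(3, 1, 8)  # (mem_len, batch, d_model)
curr_out = torch.ones(2, 1, 8)   # (seq_len, batch, d_model)
new_mem = model.cache_mem(curr_out, prev_mem)
print(new_mem.shape)  # torch.Size([3, 1, 8]) -- the last mem_len steps of the concatenation
```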
@staticmethod
def positional_embedding(pos_seq, inv_freq, bsz=None):
# Build the positional encoding matrix from the position sequence and the inverse frequencies
# Compute the inputs to the sine and cosine functions
sinusoid_inp = torch.einsum("i,d->id", pos_seq, inv_freq)
# Concatenate the sine and cosine results to form the positional encoding
pos_emb = torch.cat([torch.sin(sinusoid_inp), torch.cos(sinusoid_inp)], dim=-1)
pos_emb = pos_emb[:, None, :]
# If a batch size is given, expand the positional encoding along the batch axis
if bsz is not None:
pos_emb = pos_emb.expand(-1, bsz, -1)
return pos_emb
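Since `positional_embedding` is a static method, its shape behavior can be checked in isolation (illustrative sizes only):

```
import torch

pos_seq = torch.arange(5, -1, -1.0)                             # 6 relative positions
inv_freq = 1 / torch.pow(10000, torch.arange(0.0, 8, 2.0) / 8)  # d_model = 8
pos_emb = XLNetModel.positional_embedding(pos_seq, inv_freq, bsz=2)
print(pos_emb.shape)  # torch.Size([6, 2, 8]) -- (positions, batch, d_model)
```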
def relative_positional_encoding(self, qlen, klen, bsz=None):
# Create the relative positional encoding.
freq_seq = torch.arange(0, self.d_model, 2.0, dtype=torch.int64).float()
# Inverse frequencies for the sinusoidal encoding
inv_freq = 1 / torch.pow(10000, (freq_seq / self.d_model))
if self.attn_type == "bi":
# Bidirectional attention: relative positions span from klen down to -qlen
beg, end = klen, -qlen
elif self.attn_type == "uni":
# Unidirectional attention: relative positions span from klen down to -1
beg, end = klen, -1
else:
# Unknown attention type
raise ValueError(f"Unknown `attn_type` {self.attn_type}.")
if self.bi_data:
# Build forward and backward position sequences
fwd_pos_seq = torch.arange(beg, end, -1.0, dtype=torch.int64).float()
bwd_pos_seq = torch.arange(-beg, -end, 1.0, dtype=torch.int64).float()
if self.clamp_len > 0:
# Clamp the position sequences to [-clamp_len, clamp_len]
fwd_pos_seq = fwd_pos_seq.clamp(-self.clamp_len, self.clamp_len)
bwd_pos_seq = bwd_pos_seq.clamp(-self.clamp_len, self.clamp_len)
if bsz is not None:
# Positional embeddings for the forward and backward streams (half the batch each)
fwd_pos_emb = self.positional_embedding(fwd_pos_seq, inv_freq, bsz // 2)
bwd_pos_emb = self.positional_embedding(bwd_pos_seq, inv_freq, bsz // 2)
else:
# Positional embeddings without batch expansion
fwd_pos_emb = self.positional_embedding(fwd_pos_seq, inv_freq)
bwd_pos_emb = self.positional_embedding(bwd_pos_seq, inv_freq)
# Concatenate the forward and backward positional embeddings
pos_emb = torch.cat([fwd_pos_emb, bwd_pos_emb], dim=1)
else:
# Forward position sequence only
fwd_pos_seq = torch.arange(beg, end, -1.0, dtype=torch.int64).float()
if self.clamp_len > 0:
fwd_pos_seq = fwd_pos_seq.clamp(-self.clamp_len, self.clamp_len)
pos_emb = self.positional_embedding(fwd_pos_seq, inv_freq, bsz)
# Return the positional embedding tensor
return pos_emb
@add_start_docstrings(
"""
XLNet Model with a language modeling head on top (linear layer with weights tied to the input embeddings).
""",
XLNET_START_DOCSTRING,
)
class XLNetLMHeadModel(XLNetPreTrainedModel):
_tied_weights_keys = ["lm_loss.weight"]
def __init__(self, config):
super().__init__(config)
self.attn_type = config.attn_type  # attention type from the config
self.same_length = config.same_length  # whether to use the same attention length
self.transformer = XLNetModel(config)  # the underlying XLNet model
self.lm_loss = nn.Linear(config.d_model, config.vocab_size, bias=True)  # language-modeling head
# Initialize weights and apply final processing
self.post_init()
def get_output_embeddings(self):
return self.lm_loss  # the language-modeling head's linear layer
def set_output_embeddings(self, new_embeddings):
self.lm_loss = new_embeddings  # set the new output embeddings
def prepare_inputs_for_generation(self, input_ids, past_key_values=None, use_mems=None, **kwargs):
# Add a dummy token at the end of the input (it is not attended to)
effective_batch_size = input_ids.shape[0]  # effective batch size
dummy_token = torch.zeros((effective_batch_size, 1), dtype=torch.long, device=input_ids.device)  # all-zero dummy token
# At each pass, attention values are computed for the new token and the two last generated tokens;
# the rest is reloaded from the cache. A purely auto-regressive model would use offset = 1,
# but using offset = 2 seems to compute slightly better.
offset = 2
if past_key_values:
input_ids = torch.cat([input_ids[:, -offset:], dummy_token], dim=1)  # keep the last tokens plus the dummy
else:
input_ids = torch.cat([input_ids, dummy_token], dim=1)  # append the dummy token
# Build the permutation mask so that previous tokens do not see the last token
sequence_length = input_ids.shape[1]
perm_mask = torch.zeros(
(effective_batch_size, sequence_length, sequence_length), dtype=torch.float, device=input_ids.device
)  # all-zero permutation mask
perm_mask[:, :, -1] = 1.0  # previous tokens are not allowed to see the last token
# We only predict the last token
target_mapping = torch.zeros(
(effective_batch_size, 1, sequence_length), dtype=torch.float, device=input_ids.device
)  # all-zero target mapping
target_mapping[:, 0, -1] = 1.0  # predict only the last token
inputs = {
"input_ids": input_ids,
"perm_mask": perm_mask,
"target_mapping": target_mapping,
"use_mems": use_mems,
}  # assemble the input dict
# If past key values are given in the model kwargs, use them for faster decoding
if past_key_values:
inputs["mems"] = tuple(layer_past[:-offset, :, :] for layer_past in past_key_values)
return inputs
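The same `perm_mask`/`target_mapping` recipe can be applied by hand; this mirrors the standard usage example for `XLNetLMHeadModel` (the checkpoint name and text are placeholders):

```
import torch
from transformers import AutoTokenizer, XLNetLMHeadModel

tokenizer = AutoTokenizer.from_pretrained("xlnet/xlnet-base-cased")
model = XLNetLMHeadModel.from_pretrained("xlnet/xlnet-base-cased")

input_ids = torch.tensor(tokenizer.encode("Hello, my dog is very <mask>", add_special_tokens=False)).unsqueeze(0)
perm_mask = torch.zeros((1, input_ids.shape[1], input_ids.shape[1]), dtype=torch.float)
perm_mask[:, :, -1] = 1.0  # previous tokens do not see the last token
target_mapping = torch.zeros((1, 1, input_ids.shape[1]), dtype=torch.float)
target_mapping[0, 0, -1] = 1.0  # predict only the last token

outputs = model(input_ids, perm_mask=perm_mask, target_mapping=target_mapping)
next_token_logits = outputs.logits  # (batch_size, 1, config.vocab_size)
```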
@add_start_docstrings_to_model_forward(XLNET_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=XLNetLMHeadModelOutput, config_class=_CONFIG_FOR_DOC)
# The model's forward pass
def forward(
self,
input_ids: Optional[torch.Tensor] = None,  # input token IDs
attention_mask: Optional[torch.Tensor] = None,  # attention mask
mems: Optional[torch.Tensor] = None,  # memory cache (past hidden states)
perm_mask: Optional[torch.Tensor] = None,  # permutation mask
target_mapping: Optional[torch.Tensor] = None,  # target mapping
token_type_ids: Optional[torch.Tensor] = None,  # token type IDs
input_mask: Optional[torch.Tensor] = None,  # input mask
head_mask: Optional[torch.Tensor] = None,  # head mask
inputs_embeds: Optional[torch.Tensor] = None,  # input embeddings
labels: Optional[torch.Tensor] = None,  # labels
use_mems: Optional[bool] = None,  # whether to use the memory cache
output_attentions: Optional[bool] = None,  # whether to return attention weights
output_hidden_states: Optional[bool] = None,  # whether to return hidden states
return_dict: Optional[bool] = None,  # whether to return a dict-style output
**kwargs,  # delete when `use_cache` is removed in XLNetModel
):
pass  # placeholder; the actual forward body is omitted in this walkthrough
@staticmethod
def _reorder_cache(mems: List[torch.Tensor], beam_idx: torch.Tensor) -> List[torch.Tensor]:
"""
重新排列 `mems` 缓存,如果调用了 `~PreTrainedModel.beam_search` 或 `~PreTrainedModel.beam_sample`,
这是为了确保在每个生成步骤中,`mems` 与正确的 `beam_idx` 匹配。
"""
# 使用 `beam_idx` 将每个层级的过去状态重新排序到对应的设备上
return [layer_past.index_select(1, beam_idx.to(layer_past.device)) for layer_past in mems]
# XLNet sequence classification/regression model with a linear layer on top of the pooled output, e.g. for GLUE tasks
@add_start_docstrings(
"""
XLNet Model with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g.
for GLUE tasks.
""",
XLNET_START_DOCSTRING,
)
class XLNetForSequenceClassification(XLNetPreTrainedModel):
def __init__(self, config):
# Call the parent class initializer
super().__init__(config)
# Number of labels
self.num_labels = config.num_labels
# Keep a reference to the config
self.config = config
# The underlying XLNet model
self.transformer = XLNetModel(config)
# Sequence summary layer
self.sequence_summary = SequenceSummary(config)
# Output projection mapping the summary to the number of labels
self.logits_proj = nn.Linear(config.d_model, config.num_labels)
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(XLNET_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=XLNetForSequenceClassificationOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
mems: Optional[torch.Tensor] = None,
perm_mask: Optional[torch.Tensor] = None,
target_mapping: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
input_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
use_mems: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
**kwargs,  # delete when `use_cache` is removed in XLNetModel
) -> Union[Tuple, XLNetForSequenceClassificationOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
# Decide whether to return a dict-style output
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Run the transformer on the inputs and collect its outputs
transformer_outputs = self.transformer(
input_ids,
attention_mask=attention_mask,
mems=mems,
perm_mask=perm_mask,
target_mapping=target_mapping,
token_type_ids=token_type_ids,
input_mask=input_mask,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
use_mems=use_mems,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
**kwargs,
)
# The first tensor of the transformer outputs is the model output
output = transformer_outputs[0]
# Summarize the sequence output
output = self.sequence_summary(output)
# Project the summary into logits space
logits = self.logits_proj(output)
loss = None
# Compute a loss when labels are provided
if labels is not None:
# If the problem type is not set in the config, infer it from the number of labels and the label dtype
if self.config.problem_type is None:
if self.num_labels == 1:
self.config.problem_type = "regression"
elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
self.config.problem_type = "single_label_classification"
else:
self.config.problem_type = "multi_label_classification"
# Pick the loss function matching the problem type
if self.config.problem_type == "regression":
loss_fct = MSELoss()
if self.num_labels == 1:
# Single-output regression: mean-squared-error loss on squeezed tensors
loss = loss_fct(logits.squeeze(), labels.squeeze())
else:
# Multi-output regression: mean-squared-error loss
loss = loss_fct(logits, labels)
elif self.config.problem_type == "single_label_classification":
# Single-label classification: cross-entropy loss
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
# Multi-label classification: binary cross-entropy with logits
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(logits, labels)
# Without return_dict, merge the loss and the other outputs into a tuple
if not return_dict:
output = (logits,) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output
# Return the XLNet outputs: loss, logits, mems, hidden states, and attentions
return XLNetForSequenceClassificationOutput(
loss=loss,
logits=logits,
mems=transformer_outputs.mems,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
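A minimal end-to-end sketch of the classification head (checkpoint name, text, and label are placeholders; a real run would use a fine-tuned checkpoint):

```
import torch
from transformers import AutoTokenizer, XLNetForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("xlnet/xlnet-base-cased")
model = XLNetForSequenceClassification.from_pretrained("xlnet/xlnet-base-cased", num_labels=2)

inputs = tokenizer("This movie was great!", return_tensors="pt")
outputs = model(**inputs, labels=torch.tensor([1]))
print(outputs.loss, outputs.logits.shape)  # scalar loss, logits of shape (1, 2)
```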
# XLNet model with a token classification head (a linear layer on top of the hidden-states output), e.g. for named-entity recognition
@add_start_docstrings(
"""
XLNet Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
Named-Entity-Recognition (NER) tasks.
""",
XLNET_START_DOCSTRING,
)
# XLNetForTokenClassification, inheriting from XLNetPreTrainedModel
class XLNetForTokenClassification(XLNetPreTrainedModel):
def __init__(self, config):
super().__init__(config)
# Number of labels
self.num_labels = config.num_labels
# The underlying XLNet model
self.transformer = XLNetModel(config)
# Classifier mapping hidden states to the label space
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
# Initialize weights and apply final processing
self.post_init()
# Forward pass: process the inputs and return the results
@add_start_docstrings_to_model_forward(XLNET_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=XLNetForTokenClassificationOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
mems: Optional[torch.Tensor] = None,
perm_mask: Optional[torch.Tensor] = None,
target_mapping: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
input_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
use_mems: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
**kwargs,  # delete when `use_cache` is removed in XLNetModel
) -> Union[Tuple, XLNetForTokenClassificationOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
# Decide whether to return a dict-style output; default comes from the config
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Pass the inputs through the transformer model
outputs = self.transformer(
input_ids,
attention_mask=attention_mask,
mems=mems,
perm_mask=perm_mask,
target_mapping=target_mapping,
token_type_ids=token_type_ids,
input_mask=input_mask,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
use_mems=use_mems,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# Sequence output of the transformer
sequence_output = outputs[0]
# Pass the sequence output through the classifier to produce logits
logits = self.classifier(sequence_output)
loss = None
# Compute the cross-entropy loss when labels are provided
if labels is not None:
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
# Without return_dict, return a tuple of logits and the other outputs
if not return_dict:
output = (logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
# With return_dict, return an XLNetForTokenClassificationOutput object
return XLNetForTokenClassificationOutput(
loss=loss,
logits=logits,
mems=outputs.mems,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
# XLNet model with a multiple-choice classification head, e.g. for RACE/SWAG tasks
@add_start_docstrings(
"""
XLNet Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
softmax) e.g. for RACE/SWAG tasks.
""",
XLNET_START_DOCSTRING,
)
class XLNetForMultipleChoice(XLNetPreTrainedModel):
def __init__(self, config):
super().__init__(config)
# The underlying XLNet model
self.transformer = XLNetModel(config)
# Sequence summary used to pool the sequence output
self.sequence_summary = SequenceSummary(config)
# Linear layer mapping the pooled output to a single score per choice
self.logits_proj = nn.Linear(config.d_model, 1)
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(XLNET_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=XLNetForMultipleChoiceOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
input_mask: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
mems: Optional[torch.Tensor] = None,
perm_mask: Optional[torch.Tensor] = None,
target_mapping: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
use_mems: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
**kwargs,  # delete when `use_cache` is removed in XLNetModel
) -> Union[Tuple, XLNetForMultipleChoiceOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
`input_ids` above)
"""
# Decide whether to return a dict-style output; default comes from the config
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Number of choices is the size of the second dimension of input_ids (or inputs_embeds)
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
# Flatten the choice dimension into the batch dimension
flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
flat_input_mask = input_mask.view(-1, input_mask.size(-1)) if input_mask is not None else None
flat_inputs_embeds = (
inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
if inputs_embeds is not None
else None
)
# Run the transformer on the flattened inputs
transformer_outputs = self.transformer(
flat_input_ids,
token_type_ids=flat_token_type_ids,
input_mask=flat_input_mask,
attention_mask=flat_attention_mask,
mems=mems,
perm_mask=perm_mask,
target_mapping=target_mapping,
head_mask=head_mask,
inputs_embeds=flat_inputs_embeds,
use_mems=use_mems,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
**kwargs,
)
# First tensor of the transformer outputs: the sequence output
output = transformer_outputs[0]
# Pool the sequence output with the sequence summary
output = self.sequence_summary(output)
# Project the pooled output to one score per choice
logits = self.logits_proj(output)
# Reshape the logits to (batch_size, num_choices) for the multiple-choice loss
reshaped_logits = logits.view(-1, num_choices)
loss = None
# Compute the cross-entropy loss when labels are provided
if labels is not None:
loss_fct = CrossEntropyLoss()
loss = loss_fct(reshaped_logits, labels.view(-1))
# Without return_dict, reassemble the outputs as a tuple
if not return_dict:
output = (reshaped_logits,) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output
# Build and return an XLNetForMultipleChoiceOutput with the loss, logits, and the other transformer outputs
return XLNetForMultipleChoiceOutput(
loss=loss,
logits=reshaped_logits,
mems=transformer_outputs.mems,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
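The flattening above expects inputs of shape `(batch_size, num_choices, seq_len)`; here is a minimal sketch of preparing a 2-choice example (the checkpoint name and texts are placeholders):

```
import torch
from transformers import AutoTokenizer, XLNetForMultipleChoice

tokenizer = AutoTokenizer.from_pretrained("xlnet/xlnet-base-cased")
model = XLNetForMultipleChoice.from_pretrained("xlnet/xlnet-base-cased")

prompt = "The cat sat on"
choices = ["the mat.", "the moon."]
enc = tokenizer([prompt, prompt], choices, return_tensors="pt", padding=True)
inputs = {k: v.unsqueeze(0) for k, v in enc.items()}  # add the batch dimension
outputs = model(**inputs, labels=torch.tensor([0]))
print(outputs.logits.shape)  # torch.Size([1, 2])
```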
@add_start_docstrings(
"""
XLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (linear
layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
XLNET_START_DOCSTRING,
)
class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
# The underlying XLNet model
self.transformer = XLNetModel(config)
# Linear layer producing span-start and span-end logits
self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(XLNET_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=XLNetForQuestionAnsweringSimpleOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
mems: Optional[torch.Tensor] = None,
perm_mask: Optional[torch.Tensor] = None,
target_mapping: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
input_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
start_positions: Optional[torch.Tensor] = None,
end_positions: Optional[torch.Tensor] = None,
use_mems: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
**kwargs, # delete when `use_cache` is removed in XLNetModel
) -> Union[Tuple, XLNetForQuestionAnsweringSimpleOutput]:
"""
Forward pass for XLNetForQuestionAnsweringSimple.
"""
# (body omitted in this walkthrough; it computes start/end logits from the transformer's sequence output)
# The `forward` below belongs to `XLNetForQuestionAnswering`, the beam-search QA head whose class
# definition and `__init__` are omitted in this walkthrough
def forward(
self,
input_ids: Optional[torch.Tensor] = None,  # input token IDs
attention_mask: Optional[torch.Tensor] = None,  # attention mask
mems: Optional[torch.Tensor] = None,  # memories (past hidden states)
perm_mask: Optional[torch.Tensor] = None,  # permutation mask
target_mapping: Optional[torch.Tensor] = None,  # target mapping
token_type_ids: Optional[torch.Tensor] = None,  # token type IDs
input_mask: Optional[torch.Tensor] = None,  # input mask
head_mask: Optional[torch.Tensor] = None,  # head mask
inputs_embeds: Optional[torch.Tensor] = None,  # input embeddings
start_positions: Optional[torch.Tensor] = None,  # start positions
end_positions: Optional[torch.Tensor] = None,  # end positions
is_impossible: Optional[torch.Tensor] = None,  # whether the question is unanswerable
cls_index: Optional[torch.Tensor] = None,  # classification token index
p_mask: Optional[torch.Tensor] = None,  # mask of tokens that cannot be part of the answer
use_mems: Optional[bool] = None,  # whether to use memories
output_attentions: Optional[bool] = None,  # whether to return attention weights
output_hidden_states: Optional[bool] = None,  # whether to return hidden states
return_dict: Optional[bool] = None,  # whether to return a dict-style output
**kwargs,  # delete when `use_cache` is removed in XLNetModel
):
# (body omitted in this walkthrough)
.\models\xlnet\tokenization_xlnet.py
import os
import unicodedata
from shutil import copyfile
from typing import Any, Dict, List, Optional, Tuple
import sentencepiece as spm
from ...tokenization_utils import AddedToken, PreTrainedTokenizer
from ...utils import SPIECE_UNDERLINE, logging
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"xlnet/xlnet-base-cased": "https://huggingface.co/xlnet/xlnet-base-cased/resolve/main/spiece.model",
"xlnet/xlnet-large-cased": "https://huggingface.co/xlnet/xlnet-large-cased/resolve/main/spiece.model",
}
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"xlnet/xlnet-base-cased": None,
"xlnet/xlnet-large-cased": None,
}
SEG_ID_A = 0
SEG_ID_B = 1
SEG_ID_CLS = 2
SEG_ID_SEP = 3
SEG_ID_PAD = 4
class XLNetTokenizer(PreTrainedTokenizer):
"""
Construct an XLNet tokenizer, based on [SentencePiece](https://github.com/google/sentencepiece).
This tokenizer inherits from `PreTrainedTokenizer`, which contains most of the main methods. Users should
refer to this superclass for more information regarding those methods.
Attributes:
sp_model (`SentencePieceProcessor`):
The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
padding_side = "left"
def __init__(
self,
vocab_file,
do_lower_case=False,
remove_space=True,
keep_accents=False,
bos_token="<s>",
eos_token="</s>",
unk_token="<unk>",
sep_token="<sep>",
pad_token="<pad>",
cls_token="<cls>",
mask_token="<mask>",
additional_special_tokens=["<eop>", "<eod>"],
sp_model_kwargs: Optional[Dict[str, Any]] = None,
**kwargs,
) -> None:
mask_token = AddedToken(mask_token, lstrip=True, special=True) if isinstance(mask_token, str) else mask_token
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
self.do_lower_case = do_lower_case
self.remove_space = remove_space
self.keep_accents = keep_accents
self.vocab_file = vocab_file
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(vocab_file)
super().__init__(
do_lower_case=do_lower_case,
remove_space=remove_space,
keep_accents=keep_accents,
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
sep_token=sep_token,
pad_token=pad_token,
cls_token=cls_token,
mask_token=mask_token,
additional_special_tokens=additional_special_tokens,
sp_model_kwargs=self.sp_model_kwargs,
**kwargs,
)
self._pad_token_type_id = 3
@property
def vocab_size(self):
return len(self.sp_model)
def get_vocab(self):
vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
vocab.update(self.added_tokens_encoder)
return vocab
def __getstate__(self):
state = self.__dict__.copy()
state["sp_model"] = None
return state
def __setstate__(self, d):
self.__dict__ = d
if not hasattr(self, "sp_model_kwargs"):
self.sp_model_kwargs = {}
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(self.vocab_file)
def preprocess_text(self, inputs):
if self.remove_space:
outputs = " ".join(inputs.strip().split())
else:
outputs = inputs
outputs = outputs.replace("``", '"').replace("''", '"')
if not self.keep_accents:
outputs = unicodedata.normalize("NFKD", outputs)
outputs = "".join([c for c in outputs if not unicodedata.combining(c)])
if self.do_lower_case:
outputs = outputs.lower()
return outputs
def _tokenize(self, text: str) -> List[str]:
"""Tokenize a string."""
text = self.preprocess_text(text)
pieces = self.sp_model.encode(text, out_type=str)
new_pieces = []
for piece in pieces:
if len(piece) > 1 and piece[-1] == str(",") and piece[-2].isdigit():
cur_pieces = self.sp_model.EncodeAsPieces(piece[:-1].replace(SPIECE_UNDERLINE, ""))
if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE:
if len(cur_pieces[0]) == 1:
cur_pieces = cur_pieces[1:]
else:
cur_pieces[0] = cur_pieces[0][1:]
cur_pieces.append(piece[-1])
new_pieces.extend(cur_pieces)
else:
new_pieces.append(piece)
return new_pieces
def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
return self.sp_model.PieceToId(token)
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
return self.sp_model.IdToPiece(index)
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (strings for sub-words) in a single string."""
out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
return out_string
def _decode(
self,
token_ids: List[int],
skip_special_tokens: bool = False,
clean_up_tokenization_spaces: bool = None,
spaces_between_special_tokens: bool = True,
**kwargs,
) -> str:
"""Decode a list of token IDs back into a string."""
self._decode_use_source_tokenizer = kwargs.pop("use_source_tokenizer", False)
filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)
sub_texts = []
current_sub_text = []
for token in filtered_tokens:
if skip_special_tokens and token in self.all_special_ids:
continue
if token in self.added_tokens_encoder:
if current_sub_text:
sub_texts.append(self.convert_tokens_to_string(current_sub_text))
current_sub_text = []
sub_texts.append(token)
else:
current_sub_text.append(token)
if current_sub_text:
sub_texts.append(self.convert_tokens_to_string(current_sub_text))
text = "".join(sub_texts)
clean_up_tokenization_spaces = (
clean_up_tokenization_spaces
if clean_up_tokenization_spaces is not None
else self.clean_up_tokenization_spaces
)
if clean_up_tokenization_spaces:
clean_text = self.clean_up_tokenization(text)
return clean_text
else:
return text
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
and adding special tokens. For XLNet, the sequences have the following format:
- single sequence: `X <sep> <cls>`
- pair of sequences: `A <sep> B <sep> <cls>`
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of input IDs with the appropriate special tokens.
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
if token_ids_1 is None:
return token_ids_0 + sep + cls
return token_ids_0 + sep + token_ids_1 + sep + cls
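A quick illustration of the layout (assumes `tok` is a loaded `XLNetTokenizer`; the IDs are placeholders):

```
ids = tok.build_inputs_with_special_tokens([10, 11], [20, 21])
# -> [10, 11, sep_id, 20, 21, sep_id, cls_id]
```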
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` method.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
return super().get_special_tokens_mask(
token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
)
if token_ids_1 is not None:
return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1, 1]
return ([0] * len(token_ids_0)) + [1, 1]
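For the same pair of sequences, the special-tokens mask marks each `<sep>` and the final `<cls>` (assumes `tok` is a loaded `XLNetTokenizer`):

```
mask = tok.get_special_tokens_mask([10, 11], [20, 21])
# -> [0, 0, 1, 0, 0, 1, 1]
```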
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. An XLNet
sequence pair mask has the following format:
```
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | second sequence |
```
If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
"""
sep = [self.sep_token_id]
cls_segment_id = [2]
if token_ids_1 is None:
return len(token_ids_0 + sep) * [0] + cls_segment_id
return len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + cls_segment_id
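And the matching token type IDs, with segment 2 reserved for `<cls>` (same assumptions as above):

```
type_ids = tok.create_token_type_ids_from_sequences([10, 11], [20, 21])
# -> [0, 0, 0, 1, 1, 1, 2]
```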
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
out_vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
copyfile(self.vocab_file, out_vocab_file)
elif not os.path.isfile(self.vocab_file):
with open(out_vocab_file, "wb") as fi:
content_spiece_model = self.sp_model.serialized_model_proto()
fi.write(content_spiece_model)
return (out_vocab_file,)
.\models\xlnet\tokenization_xlnet_fast.py
import os
from shutil import copyfile
from typing import List, Optional, Tuple
from ...tokenization_utils import AddedToken
from ...tokenization_utils_fast import PreTrainedTokenizerFast
from ...utils import is_sentencepiece_available, logging
if is_sentencepiece_available():
from .tokenization_xlnet import XLNetTokenizer
else:
XLNetTokenizer = None
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "spiece.model", "tokenizer_file": "tokenizer.json"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"xlnet/xlnet-base-cased": "https://huggingface.co/xlnet/xlnet-base-cased/resolve/main/spiece.model",
"xlnet/xlnet-large-cased": "https://huggingface.co/xlnet/xlnet-large-cased/resolve/main/spiece.model",
},
"tokenizer_file": {
"xlnet/xlnet-base-cased": "https://huggingface.co/xlnet/xlnet-base-cased/resolve/main/tokenizer.json",
"xlnet/xlnet-large-cased": "https://huggingface.co/xlnet/xlnet-large-cased/resolve/main/tokenizer.json",
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"xlnet/xlnet-base-cased": None,
"xlnet/xlnet-large-cased": None,
}
SPIECE_UNDERLINE = "▁"
SEG_ID_A = 0
SEG_ID_B = 1
SEG_ID_CLS = 2
SEG_ID_SEP = 3
SEG_ID_PAD = 4
class XLNetTokenizerFast(PreTrainedTokenizerFast):
"""
Construct a "fast" XLNet tokenizer (backed by HuggingFace's *tokenizers* library), based on the Unigram model.
This tokenizer inherits from `PreTrainedTokenizerFast`, which contains most of the main methods. Users should
refer to this superclass for more information regarding those methods.
Args:
vocab_file (`str`):
[SentencePiece](https://github.com/google/sentencepiece) file (generally has a .spm extension) that
contains the vocabulary necessary to instantiate a tokenizer.
do_lower_case (`bool`, *optional*, defaults to `False`):
Whether to lowercase the input when tokenizing.
remove_space (`bool`, *optional*, defaults to `True`):
Whether to strip the text when tokenizing (removing excess spaces before and after the string).
keep_accents (`bool`, *optional*, defaults to `False`):
Whether to keep accents when tokenizing.
bos_token (`str`, *optional*, defaults to `"<s>"`):
The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.
<Tip>
When building a sequence using special tokens, this is not the token that is used for the beginning of
sequence. The token used is the `cls_token`.
</Tip>
eos_token (`str`, *optional*, defaults to `"</s>"`):
The end of sequence token.
<Tip>
When building a sequence using special tokens, this is not the token that is used for the end of sequence.
The token used is the `sep_token`.
</Tip>
unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
sep_token (`str`, *optional*, defaults to `"<sep>"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens.
pad_token (`str`, *optional*, defaults to `"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
cls_token (`str`, *optional*, defaults to `"<cls>"`):
The classifier token which is used when doing sequence classification (classification of the whole sequence
instead of per-token classification). It is the first token of the sequence when built with special tokens.
mask_token (`str`, *optional*, defaults to `"<mask>"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
additional_special_tokens (`List[str]`, *optional*, defaults to `["<eop>", "<eod>"]`):
Additional special tokens used by the tokenizer.
Attributes:
sp_model (`SentencePieceProcessor`):
The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
"""
# Names of the vocabulary files used by this tokenizer (SentencePiece model and tokenizer.json)
vocab_files_names = VOCAB_FILES_NAMES
# Map from pretrained checkpoint names to their hosted vocabulary files
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
# Maximum model input sizes (None: XLNet has no hard positional limit)
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
# Pad sequences on the left
padding_side = "left"
# The corresponding slow tokenizer class
slow_tokenizer_class = XLNetTokenizer
def __init__(
self,
vocab_file=None,
tokenizer_file=None,
do_lower_case=False,
remove_space=True,
keep_accents=False,
bos_token="<s>",
eos_token="</s>",
unk_token="<unk>",
sep_token="<sep>",
pad_token="<pad>",
cls_token="<cls>",
mask_token="<mask>",
additional_special_tokens=["<eop>", "<eod>"],
**kwargs,
):
# If mask_token is a string, wrap it in an AddedToken so the space to its left is stripped
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
# Initialize the tokenizer object through the parent class constructor
super().__init__(
vocab_file=vocab_file,
tokenizer_file=tokenizer_file,
do_lower_case=do_lower_case,
remove_space=remove_space,
keep_accents=keep_accents,
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
sep_token=sep_token,
pad_token=pad_token,
cls_token=cls_token,
mask_token=mask_token,
additional_special_tokens=additional_special_tokens,
**kwargs,
)
# XLNet uses the special pad token type id 3
self._pad_token_type_id = 3
# Store the remaining attributes
self.do_lower_case = do_lower_case
self.remove_space = remove_space
self.keep_accents = keep_accents
self.vocab_file = vocab_file
@property
def can_save_slow_tokenizer(self) -> bool:
# The slow tokenizer can only be saved if the SentencePiece vocab_file is available
return os.path.isfile(self.vocab_file) if self.vocab_file else False
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
从一个序列或一对序列构建模型输入,用于序列分类任务,通过连接和添加特殊标记。对于 XLNet 模型,输入的格式如下:
- 单个序列: `X <sep> <cls>`
- 一对序列: `A <sep> B <sep> <cls>`
Args:
token_ids_0 (`List[int]`):
要添加特殊标记的 ID 列表。
token_ids_1 (`List[int]`, *optional*):
可选的第二个序列 ID 列表,用于序列对。
Returns:
`List[int]`: 包含适当特殊标记的输入 ID 列表。
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
if token_ids_1 is None:
return token_ids_0 + sep + cls
return token_ids_0 + sep + token_ids_1 + sep + cls
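For concreteness, here is a minimal sketch of the layout this method produces, using XLNet's conventional special-token ids (`<sep>` = 4, `<cls>` = 3) as illustrative values:
```
sep, cls = [4], [3]
token_ids_0, token_ids_1 = [10, 11], [20, 21]
single = token_ids_0 + sep + cls                    # [10, 11, 4, 3]
pair = token_ids_0 + sep + token_ids_1 + sep + cls  # [10, 11, 4, 20, 21, 4, 3]
```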
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. An XLNet
sequence pair mask has the following format:
```
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | second sequence |
```
If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
"""
# Separator token [SEP] used to separate sequences
sep = [self.sep_token_id]
# Classification segment ID indicating the segment for classification
cls_segment_id = [2]
# If only one sequence (`token_ids_1` is None), return a mask for the first sequence only
if token_ids_1 is None:
return len(token_ids_0 + sep) * [0] + cls_segment_id
# Otherwise, return a mask for both sequences
return len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + cls_segment_id
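A small sketch of the resulting mask, continuing the placeholder ids from the example above:
```
sep, cls_segment_id = [4], [2]
token_ids_0, token_ids_1 = [10, 11], [20, 21]
mask = len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + cls_segment_id
# -> [0, 0, 0, 1, 1, 1, 2]; the trailing 2 marks the <cls> segment
```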
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
# Check if the fast tokenizer can save vocabulary; raise an error if not
if not self.can_save_slow_tokenizer:
raise ValueError(
"Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
"tokenizer."
)
# Check if save_directory is a valid directory; log an error and return if not
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
# Define the output vocabulary file path
out_vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
# If the current vocabulary file path is different from the output path, copy the vocabulary file
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
copyfile(self.vocab_file, out_vocab_file)
# Return the path to the saved vocabulary file
return (out_vocab_file,)
.\models\xlnet\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_sentencepiece_available,
is_tf_available,
is_tokenizers_available,
is_torch_available,
)
_import_structure = {"configuration_xlnet": ["XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP", "XLNetConfig"]}
try:
if not is_sentencepiece_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_xlnet"] = ["XLNetTokenizer"]
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_xlnet_fast"] = ["XLNetTokenizerFast"]
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_xlnet"] = [
"XLNET_PRETRAINED_MODEL_ARCHIVE_LIST",
"XLNetForMultipleChoice",
"XLNetForQuestionAnswering",
"XLNetForQuestionAnsweringSimple",
"XLNetForSequenceClassification",
"XLNetForTokenClassification",
"XLNetLMHeadModel",
"XLNetModel",
"XLNetPreTrainedModel",
"load_tf_weights_in_xlnet",
]
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_tf_xlnet"] = [
"TF_XLNET_PRETRAINED_MODEL_ARCHIVE_LIST",
"TFXLNetForMultipleChoice",
"TFXLNetForQuestionAnsweringSimple",
"TFXLNetForSequenceClassification",
"TFXLNetForTokenClassification",
"TFXLNetLMHeadModel",
"TFXLNetMainLayer",
"TFXLNetModel",
"TFXLNetPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_xlnet import XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP, XLNetConfig
try:
if not is_sentencepiece_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_xlnet import XLNetTokenizer
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_xlnet_fast import XLNetTokenizerFast
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_xlnet import (
XLNET_PRETRAINED_MODEL_ARCHIVE_LIST,
XLNetForMultipleChoice,
XLNetForQuestionAnswering,
XLNetForQuestionAnsweringSimple,
XLNetForSequenceClassification,
XLNetForTokenClassification,
XLNetLMHeadModel,
XLNetModel,
XLNetPreTrainedModel,
load_tf_weights_in_xlnet,
)
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_tf_xlnet import (
TF_XLNET_PRETRAINED_MODEL_ARCHIVE_LIST,
TFXLNetForMultipleChoice,
TFXLNetForQuestionAnsweringSimple,
TFXLNetForSequenceClassification,
TFXLNetForTokenClassification,
TFXLNetLMHeadModel,
TFXLNetMainLayer,
TFXLNetModel,
TFXLNetPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
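As a rough sketch of what this lazy-module pattern buys: importing the package itself is cheap, and attribute access triggers the real submodule import on first use (usage below is illustrative):
```
import transformers.models.xlnet as xlnet_pkg  # cheap: heavy backends are not imported yet
config_cls = xlnet_pkg.XLNetConfig  # configuration_xlnet is imported here, on first attribute access
```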
.\models\xmod\configuration_xmod.py
from collections import OrderedDict
from typing import Mapping
from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfig
from ...utils import logging
logger = logging.get_logger(__name__)
XMOD_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"facebook/xmod-base": "https://huggingface.co/facebook/xmod-base/resolve/main/config.json",
"facebook/xmod-large-prenorm": "https://huggingface.co/facebook/xmod-large-prenorm/resolve/main/config.json",
"facebook/xmod-base-13-125k": "https://huggingface.co/facebook/xmod-base-13-125k/resolve/main/config.json",
"facebook/xmod-base-30-125k": "https://huggingface.co/facebook/xmod-base-30-125k/resolve/main/config.json",
"facebook/xmod-base-30-195k": "https://huggingface.co/facebook/xmod-base-30-195k/resolve/main/config.json",
"facebook/xmod-base-60-125k": "https://huggingface.co/facebook/xmod-base-60-125k/resolve/main/config.json",
"facebook/xmod-base-60-265k": "https://huggingface.co/facebook/xmod-base-60-265k/resolve/main/config.json",
"facebook/xmod-base-75-125k": "https://huggingface.co/facebook/xmod-base-75-125k/resolve/main/config.json",
"facebook/xmod-base-75-269k": "https://huggingface.co/facebook/xmod-base-75-269k/resolve/main/config.json",
}
class XmodConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`XmodModel`]. It is used to instantiate an X-MOD
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the
[facebook/xmod-base](https://huggingface.co/facebook/xmod-base) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Examples:
```
>>> from transformers import XmodConfig, XmodModel
>>> # Initializing an X-MOD facebook/xmod-base style configuration
>>> configuration = XmodConfig()
>>> # Initializing a model (with random weights) from the facebook/xmod-base style configuration
>>> model = XmodModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
class XmodOnnxConfig(OnnxConfig):
@property
def inputs(self) -> Mapping[str, Mapping[int, str]]:
if self.task == "multiple-choice":
dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"}
else:
dynamic_axis = {0: "batch", 1: "sequence"}
return OrderedDict(
[
("input_ids", dynamic_axis),
("attention_mask", dynamic_axis),
]
)
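For reference, a sketch of what the `inputs` property evaluates to for a default (non multiple-choice) task:
```
from collections import OrderedDict
expected = OrderedDict(
    [
        ("input_ids", {0: "batch", 1: "sequence"}),
        ("attention_mask", {0: "batch", 1: "sequence"}),
    ]
)
```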
.\models\xmod\convert_xmod_original_pytorch_checkpoint_to_pytorch.py
import argparse
from pathlib import Path
import fairseq
import torch
from fairseq.models.xmod import XMODModel as FairseqXmodModel
from packaging import version
from transformers import XmodConfig, XmodForMaskedLM, XmodForSequenceClassification
from transformers.utils import logging
if version.parse(fairseq.__version__) < version.parse("0.12.2"):
raise Exception("requires fairseq >= 0.12.2")
if version.parse(fairseq.__version__) > version.parse("2"):
raise Exception("requires fairseq < v2")
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
SAMPLE_TEXT = "Hello, World!"
SAMPLE_LANGUAGE = "en_XX"
def convert_xmod_checkpoint_to_pytorch(
xmod_checkpoint_path: str, pytorch_dump_folder_path: str, classification_head: bool
):
data_dir = Path("data_bin")
xmod = FairseqXmodModel.from_pretrained(
model_name_or_path=str(Path(xmod_checkpoint_path).parent),
checkpoint_file=Path(xmod_checkpoint_path).name,
_name="xmod_base",
arch="xmod_base",
task="multilingual_masked_lm",
data_name_or_path=str(data_dir),
bpe="sentencepiece",
sentencepiece_model=str(Path(xmod_checkpoint_path).parent / "sentencepiece.bpe.model"),
src_dict=str(data_dir / "dict.txt"),
)
xmod.eval()
print(xmod)
xmod_sent_encoder = xmod.model.encoder.sentence_encoder
config = XmodConfig(
vocab_size=xmod_sent_encoder.embed_tokens.num_embeddings,
hidden_size=xmod.cfg.model.encoder_embed_dim,
num_hidden_layers=xmod.cfg.model.encoder_layers,
num_attention_heads=xmod.cfg.model.encoder_attention_heads,
intermediate_size=xmod.cfg.model.encoder_ffn_embed_dim,
max_position_embeddings=514,
type_vocab_size=1,
layer_norm_eps=1e-5,
pre_norm=xmod.cfg.model.encoder_normalize_before,
adapter_reduction_factor=getattr(xmod.cfg.model, "bottleneck", 2),
adapter_layer_norm=xmod.cfg.model.adapter_layer_norm,
adapter_reuse_layer_norm=xmod.cfg.model.adapter_reuse_layer_norm,
ln_before_adapter=xmod.cfg.model.ln_before_adapter,
languages=xmod.cfg.model.languages,
)
if classification_head:
config.num_labels = xmod.model.classification_heads["mnli"].out_proj.weight.shape[0]
print("Our X-MOD config:", config)
model = XmodForSequenceClassification(config) if classification_head else XmodForMaskedLM(config)
model.eval()
model.roberta.embeddings.word_embeddings.weight = xmod_sent_encoder.embed_tokens.weight
model.roberta.embeddings.position_embeddings.weight = xmod_sent_encoder.embed_positions.weight
model.roberta.embeddings.token_type_embeddings.weight.data = torch.zeros_like(
model.roberta.embeddings.token_type_embeddings.weight
)
model.roberta.embeddings.LayerNorm.weight = xmod_sent_encoder.layernorm_embedding.weight
model.roberta.embeddings.LayerNorm.bias = xmod_sent_encoder.layernorm_embedding.bias
if xmod_sent_encoder.layer_norm is not None:
model.roberta.encoder.LayerNorm.weight = xmod_sent_encoder.layer_norm.weight
model.roberta.encoder.LayerNorm.bias = xmod_sent_encoder.layer_norm.bias
if classification_head:
model.classifier.dense.weight = xmod.model.classification_heads["mnli"].dense.weight
model.classifier.dense.bias = xmod.model.classification_heads["mnli"].dense.bias
model.classifier.out_proj.weight = xmod.model.classification_heads["mnli"].out_proj.weight
model.classifier.out_proj.bias = xmod.model.classification_heads["mnli"].out_proj.bias
else:
model.lm_head.dense.weight = xmod.model.encoder.lm_head.dense.weight
model.lm_head.dense.bias = xmod.model.encoder.lm_head.dense.bias
model.lm_head.layer_norm.weight = xmod.model.encoder.lm_head.layer_norm.weight
model.lm_head.layer_norm.bias = xmod.model.encoder.lm_head.layer_norm.bias
model.lm_head.decoder.weight = xmod.model.encoder.lm_head.weight
model.lm_head.decoder.bias = xmod.model.encoder.lm_head.bias
input_ids = xmod.encode(SAMPLE_TEXT).unsqueeze(0)
model.roberta.set_default_language(SAMPLE_LANGUAGE)
our_output = model(input_ids)[0]
if classification_head:
their_output = xmod.model.classification_heads["mnli"](xmod.extract_features(input_ids))
else:
their_output = xmod.model(input_ids, lang_id=[SAMPLE_LANGUAGE])[0]
print(our_output.shape, their_output.shape)
max_absolute_diff = torch.max(torch.abs(our_output - their_output)).item()
print(f"max_absolute_diff = {max_absolute_diff}")
success = torch.allclose(our_output, their_output, atol=1e-3)
print("Do both models output the same tensors?", "🔥" if success else "💩")
if not success:
raise Exception("Something went wRoNg")
Path(pytorch_dump_folder_path).mkdir(parents=True, exist_ok=True)
print(f"Saving model to {pytorch_dump_folder_path}")
model.save_pretrained(pytorch_dump_folder_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--xmod_checkpoint_path", default=None, type=str, required=True, help="Path the official PyTorch dump."
)
parser.add_argument(
"--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
)
parser.add_argument(
"--classification_head", action="store_true", help="Whether to convert a final classification head."
)
args = parser.parse_args()
convert_xmod_checkpoint_to_pytorch(
args.xmod_checkpoint_path, args.pytorch_dump_folder_path, args.classification_head
)
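A hedged invocation sketch of the converter (the paths below are placeholders, not real checkpoints):
```
convert_xmod_checkpoint_to_pytorch(
    xmod_checkpoint_path="/path/to/xmod/model.pt",  # placeholder checkpoint path
    pytorch_dump_folder_path="./xmod-base-converted",  # placeholder output directory
    classification_head=False,
)
```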
.\models\xmod\modeling_xmod.py
"""PyTorch X-MOD 模型。"""
import math
from typing import List, Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN, gelu
from ...modeling_outputs import (
BaseModelOutputWithPastAndCrossAttentions,
BaseModelOutputWithPoolingAndCrossAttentions,
CausalLMOutputWithCrossAttentions,
MaskedLMOutput,
MultipleChoiceModelOutput,
QuestionAnsweringModelOutput,
SequenceClassifierOutput,
TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging
from .configuration_xmod import XmodConfig
logger = logging.get_logger(__name__)
XMOD_PRETRAINED_MODEL_ARCHIVE_LIST = [
"facebook/xmod-base",
"facebook/xmod-large-prenorm",
"facebook/xmod-base-13-125k",
"facebook/xmod-base-30-125k",
"facebook/xmod-base-30-195k",
"facebook/xmod-base-60-125k",
"facebook/xmod-base-60-265k",
"facebook/xmod-base-75-125k",
"facebook/xmod-base-75-269k",
]
class XmodEmbeddings(nn.Module):
"""
与 BertEmbeddings 相同,但对于位置嵌入索引进行了微小调整。
"""
def __init__(self, config):
super().__init__()
self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
self.register_buffer(
"token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
)
self.padding_idx = config.pad_token_id
self.position_embeddings = nn.Embedding(
config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
)
def forward(
self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0
):
if position_ids is None:
if input_ids is not None:
position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length)
else:
position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)
if input_ids is not None:
input_shape = input_ids.size()
else:
input_shape = inputs_embeds.size()[:-1]
seq_length = input_shape[1]
if token_type_ids is None:
if hasattr(self, "token_type_ids"):
buffered_token_type_ids = self.token_type_ids[:, :seq_length]
buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
token_type_ids = buffered_token_type_ids_expanded
else:
token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
if inputs_embeds is None:
inputs_embeds = self.word_embeddings(input_ids)
token_type_embeddings = self.token_type_embeddings(token_type_ids)
embeddings = inputs_embeds + token_type_embeddings
if self.position_embedding_type == "absolute":
position_embeddings = self.position_embeddings(position_ids)
embeddings += position_embeddings
embeddings = self.LayerNorm(embeddings)
embeddings = self.dropout(embeddings)
return embeddings
def create_position_ids_from_inputs_embeds(self, inputs_embeds):
"""
Args:
inputs_embeds: torch.Tensor
Returns: torch.Tensor
"""
input_shape = inputs_embeds.size()[:-1]
sequence_length = input_shape[1]
position_ids = torch.arange(
self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
)
return position_ids.unsqueeze(0).expand(input_shape)
class XmodSelfAttention(nn.Module):
def __init__(self, config, position_embedding_type=None):
super().__init__()
if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
raise ValueError(
f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
f"heads ({config.num_attention_heads})"
)
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.query = nn.Linear(config.hidden_size, self.all_head_size)
self.key = nn.Linear(config.hidden_size, self.all_head_size)
self.value = nn.Linear(config.hidden_size, self.all_head_size)
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
self.position_embedding_type = position_embedding_type or getattr(
config, "position_embedding_type", "absolute"
)
if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
self.max_position_embeddings = config.max_position_embeddings
self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)
self.is_decoder = config.is_decoder
def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
x = x.view(new_x_shape)
return x.permute(0, 2, 1, 3)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor]:
pass
class XmodSelfOutput(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = hidden_states + input_tensor
return hidden_states
class XmodAttention(nn.Module):
def __init__(self, config, position_embedding_type=None):
super().__init__()
self.self = XmodSelfAttention(config, position_embedding_type=position_embedding_type)
self.output = XmodSelfOutput(config)
self.pruned_heads = set()
self.pre_norm = config.pre_norm
def prune_heads(self, heads):
if len(heads) == 0:
return
heads, index = find_pruneable_heads_and_indices(
heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
)
self.self.query = prune_linear_layer(self.self.query, index)
self.self.key = prune_linear_layer(self.self.key, index)
self.self.value = prune_linear_layer(self.self.value, index)
self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
self.pruned_heads = self.pruned_heads.union(heads)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor]:
residual = hidden_states
if self.pre_norm:
hidden_states = self.output.LayerNorm(hidden_states)
self_outputs = self.self(
hidden_states,
attention_mask,
head_mask,
encoder_hidden_states,
encoder_attention_mask,
past_key_value,
output_attentions,
)
attention_output = self.output(self_outputs[0], residual)
if not self.pre_norm:
attention_output = self.output.LayerNorm(attention_output)
outputs = (attention_output,) + self_outputs[1:]
return outputs
class XmodIntermediate(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states
class XmodAdapter(nn.Module):
def __init__(self, config):
super().__init__()
self.bottleneck_size = config.hidden_size // config.adapter_reduction_factor
self.dense1 = nn.Linear(config.hidden_size, self.bottleneck_size)
self.dense2 = nn.Linear(self.bottleneck_size, config.hidden_size)
if isinstance(config.hidden_act, str):
self.adapter_act_fn = ACT2FN[config.hidden_act]
else:
self.adapter_act_fn = config.hidden_act
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense1(hidden_states)
hidden_states = self.adapter_act_fn(hidden_states)
hidden_states = self.dense2(hidden_states)
return hidden_states
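To make the bottleneck concrete, a minimal shape check, assuming hidden_size=768 and adapter_reduction_factor=2 (illustrative values, not checkpoint defaults), with `XmodAdapter` as defined above in scope:
```
import torch
from types import SimpleNamespace

cfg = SimpleNamespace(hidden_size=768, adapter_reduction_factor=2, hidden_act="gelu")
adapter = XmodAdapter(cfg)
x = torch.randn(2, 5, 768)
assert adapter(x).shape == x.shape  # 768 -> 384 -> 768, shape-preserving
```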
class XmodOutput(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.ln_before_adapter = config.ln_before_adapter
self.dropout = nn.Dropout(config.hidden_dropout_prob)
if config.adapter_layer_norm:
self.adapter_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
else:
self.adapter_layer_norm = None
self.adapter_reuse_layer_norm = config.adapter_reuse_layer_norm
self.adapter_modules = nn.ModuleDict({})
for language in config.languages:
self.adapter_modules[str(language)] = XmodAdapter(config)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor, lang_ids: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = hidden_states + input_tensor
hidden_states = self.lang_adapter(lang_ids, hidden_states)
return hidden_states
def lang_adapter(self, lang_ids: torch.Tensor, hidden_states: torch.Tensor) -> torch.Tensor:
# Process consecutive samples that share the same lang_id in a single batched adapter call
lang_ids, lang_lengths = torch.unique_consecutive(lang_ids, return_counts=True)
if not self.ln_before_adapter:
residual = hidden_states
if self.adapter_layer_norm is not None:
hidden_states = self.adapter_layer_norm(hidden_states)
elif self.adapter_reuse_layer_norm:
hidden_states = self.LayerNorm(hidden_states)
if self.ln_before_adapter:
residual = hidden_states
split_hidden_states = torch.split(hidden_states, lang_lengths.tolist(), 0)
lang_wise_outputs = []
for i, (lang_id, split_hidden_state) in enumerate(zip(lang_ids, split_hidden_states)):
lang = list(self.adapter_modules.keys())[int(lang_id.item())]
lang_wise_outputs.append(self.adapter_modules[lang](split_hidden_state))
hidden_states = torch.cat(lang_wise_outputs, 0)
hidden_states = self.dropout(hidden_states)
hidden_states += residual
return hidden_states
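A small sketch of how the routing above groups samples (language ids are illustrative):
```
import torch

lang_ids = torch.tensor([0, 0, 1, 1, 1])
ids, lengths = torch.unique_consecutive(lang_ids, return_counts=True)
# ids -> tensor([0, 1]); lengths -> tensor([2, 3])
# hidden_states is then torch.split into chunks of sizes [2, 3]; each chunk
# passes through its language's adapter before being concatenated back in order.
```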
class XmodLayer(nn.Module):
def __init__(self, config):
super().__init__()
self.chunk_size_feed_forward = config.chunk_size_feed_forward
self.seq_len_dim = 1
self.attention = XmodAttention(config)
self.is_decoder = config.is_decoder
self.add_cross_attention = config.add_cross_attention
if self.add_cross_attention:
if not self.is_decoder:
raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
self.crossattention = XmodAttention(config, position_embedding_type="absolute")
self.intermediate = XmodIntermediate(config)
self.output = XmodOutput(config)
self.pre_norm = config.pre_norm
def forward(
self,
hidden_states: torch.Tensor,
lang_ids: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor]:
self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
self_attention_outputs = self.attention(
hidden_states,
attention_mask,
head_mask,
output_attentions=output_attentions,
past_key_value=self_attn_past_key_value,
)
attention_output = self_attention_outputs[0]
if self.is_decoder:
outputs = self_attention_outputs[1:-1]
present_key_value = self_attention_outputs[-1]
else:
outputs = self_attention_outputs[1:]
cross_attn_present_key_value = None
if self.is_decoder and encoder_hidden_states is not None:
if not hasattr(self, "crossattention"):
raise ValueError(
f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers"
" by setting `config.add_cross_attention=True`"
)
cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
cross_attention_outputs = self.crossattention(
attention_output,
attention_mask,
head_mask,
encoder_hidden_states,
encoder_attention_mask,
cross_attn_past_key_value,
output_attentions,
)
attention_output = cross_attention_outputs[0]
outputs = outputs + cross_attention_outputs[1:-1]
cross_attn_present_key_value = cross_attention_outputs[-1]
present_key_value = present_key_value + cross_attn_present_key_value
residual = attention_output
if self.pre_norm:
attention_output = self.output.LayerNorm(attention_output)
intermediate_output = apply_chunking_to_forward(
self.feed_forward_chunk,
self.chunk_size_feed_forward,
self.seq_len_dim,
attention_output,
)
layer_output = self.output(intermediate_output, residual, lang_ids)
if not self.pre_norm:
layer_output = self.output.LayerNorm(layer_output)
outputs = (layer_output,) + outputs
if self.is_decoder:
outputs = outputs + (present_key_value,)
return outputs
def feed_forward_chunk(self, attention_output):
return self.intermediate(attention_output)
class XmodEncoder(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
self.layer = nn.ModuleList([XmodLayer(config) for _ in range(config.num_hidden_layers)])
self.is_pre_norm = config.pre_norm
if self.is_pre_norm:
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.gradient_checkpointing = False
def forward(
self,
hidden_states: torch.Tensor,
lang_ids: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = False,
output_hidden_states: Optional[bool] = False,
return_dict: Optional[bool] = True,
):
pass
class XmodPooler(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.activation = nn.Tanh()
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
first_token_tensor = hidden_states[:, 0]
pooled_output = self.dense(first_token_tensor)
pooled_output = self.activation(pooled_output)
return pooled_output
class XmodPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = XmodConfig
base_model_prefix = "roberta"
supports_gradient_checkpointing = True
def _init_weights(self, module):
"""Initialize the weights"""
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
def set_default_language(self, language: str):
if language not in self.config.languages:
raise ValueError(
f"{self} does not have an adapter for {language}. Supported languages: {list(self.config.languages)}"
)
self.config.default_language = language
def freeze_embeddings_and_language_adapters(self):
logger.info("Freezing embeddings")
for parameter in self.roberta.embeddings.parameters():
parameter.requires_grad = False
logger.info("Freezing adapters")
for layer in self.roberta.encoder.layer:
if layer.output.adapter_layer_norm is not None:
for parameter in layer.output.adapter_layer_norm.parameters():
parameter.requires_grad = False
for parameter in layer.output.adapter_modules.parameters():
parameter.requires_grad = False
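A hedged usage sketch of the two helpers above (the checkpoint name and language code come from the archive list and are used here illustratively):
```
from transformers import XmodForSequenceClassification

model = XmodForSequenceClassification.from_pretrained("facebook/xmod-base", num_labels=2)
model.set_default_language("en_XX")
model.freeze_embeddings_and_language_adapters()  # fine-tune only the shared transformer weights
```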
XMOD_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `({0})`):
Indices of input sequence tokens in the vocabulary.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
lang_ids (`torch.LongTensor` of shape `({0})`, *optional*):
Indices of the language adapters that should be activated for each sample, respectively. Defaults to the
index that corresponds to `self.config.default_language`.
attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`:
- 0 corresponds to a *sentence A* token,
- 1 corresponds to a *sentence B* token.
[What are token type IDs?](../glossary#token-type-ids)
position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
Indices of positions of each input sequence token in the position embeddings. Selected in the range
`[0, config.max_position_embeddings - 1]`.
[What are position IDs?](../glossary#position-ids)
head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
model's internal embedding lookup matrix.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@add_start_docstrings(
"The bare X-MOD Model transformer outputting raw hidden-states without any specific head on top.",
XMOD_START_DOCSTRING,
)
class XmodModel(XmodPreTrainedModel):
"""
The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
cross-attention is added between the self-attention layers, following the architecture described in *Attention is
all you need*_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz
Kaiser and Illia Polosukhin.
To behave as a decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
to `True`. To be used in a Seq2Seq model, the model needs to be initialized with both the `is_decoder` argument and
`add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.
.. _*Attention is all you need*: https://arxiv.org/abs/1706.03762
"""
def __init__(self, config, add_pooling_layer=True):
super().__init__(config)
self.config = config
self.embeddings = XmodEmbeddings(config)
self.encoder = XmodEncoder(config)
self.pooler = XmodPooler(config) if add_pooling_layer else None
self.post_init()
def get_input_embeddings(self):
return self.embeddings.word_embeddings
def set_input_embeddings(self, value):
self.embeddings.word_embeddings = value
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
@add_start_docstrings_to_model_forward(XMOD_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
lang_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
pass
@add_start_docstrings(
"X-MOD Model with a `language modeling` head on top for CLM fine-tuning.",
XMOD_START_DOCSTRING,
)
class XmodForCausalLM(XmodPreTrainedModel):
_tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]
def __init__(self, config):
super().__init__(config)
if not config.is_decoder:
logger.warning("If you want to use `XmodLMHeadModel` as a standalone, add `is_decoder=True.`")
self.roberta = XmodModel(config, add_pooling_layer=False)
self.lm_head = XmodLMHead(config)
self.post_init()
def get_output_embeddings(self):
return self.lm_head.decoder
def set_output_embeddings(self, new_embeddings):
self.lm_head.decoder = new_embeddings
@add_start_docstrings_to_model_forward(XMOD_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
lang_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
past_key_values: Tuple[Tuple[torch.FloatTensor]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
pass
def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, **model_kwargs):
input_shape = input_ids.shape
if attention_mask is None:
attention_mask = input_ids.new_ones(input_shape)
if past_key_values is not None:
past_length = past_key_values[0][0].shape[2]
if input_ids.shape[1] > past_length:
remove_prefix_length = past_length
else:
remove_prefix_length = input_ids.shape[1] - 1
input_ids = input_ids[:, remove_prefix_length:]
return {"input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": past_key_values}
def _reorder_cache(self, past_key_values, beam_idx):
reordered_past = ()
for layer_past in past_key_values:
reordered_past += (
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
)
return reordered_past
@add_start_docstrings(
"""X-MOD Model with a `language modeling` head on top.""",
XMOD_START_DOCSTRING,
)
class XmodForMaskedLM(XmodPreTrainedModel):
_tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]
def __init__(self, config):
super().__init__(config)
if config.is_decoder:
logger.warning(
"If you want to use `XmodForMaskedLM` make sure `config.is_decoder=False` for "
"bi-directional self-attention."
)
self.roberta = XmodModel(config, add_pooling_layer=False)
self.lm_head = XmodLMHead(config)
self.post_init()
def get_output_embeddings(self):
return self.lm_head.decoder
def set_output_embeddings(self, new_embeddings):
self.lm_head.decoder = new_embeddings
@add_start_docstrings_to_model_forward(XMOD_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
lang_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], MaskedLMOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
kwargs (`Dict[str, any]`, optional, defaults to *{}*):
Used to hide legacy arguments that have been deprecated.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.roberta(
input_ids,
lang_ids=lang_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
prediction_scores = self.lm_head(sequence_output)
masked_lm_loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
if not return_dict:
output = (prediction_scores,) + outputs[2:]
return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
return MaskedLMOutput(
loss=masked_lm_loss,
logits=prediction_scores,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
class XmodLMHead(nn.Module):
"""Roberta Head for masked language modeling."""
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.decoder = nn.Linear(config.hidden_size, config.vocab_size)
self.bias = nn.Parameter(torch.zeros(config.vocab_size))
self.decoder.bias = self.bias
def forward(self, features, **kwargs):
x = self.dense(features)
x = gelu(x)
x = self.layer_norm(x)
x = self.decoder(x)
return x
def _tie_weights(self):
if self.decoder.bias.device.type == "meta":
self.decoder.bias = self.bias
else:
self.bias = self.decoder.bias
@add_start_docstrings(
"""
X-MOD Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
output) e.g. for GLUE tasks.
""",
XMOD_START_DOCSTRING,
)
class XmodForSequenceClassification(XmodPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.config = config
self.roberta = XmodModel(config, add_pooling_layer=False)
self.classifier = XmodClassificationHead(config)
self.post_init()
@add_start_docstrings_to_model_forward(XMOD_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
lang_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.roberta(
input_ids,
lang_ids=lang_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
logits = self.classifier(sequence_output)
loss = None
if labels is not None:
if self.config.problem_type is None:
if self.num_labels == 1:
self.config.problem_type = "regression"
elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
self.config.problem_type = "single_label_classification"
else:
self.config.problem_type = "multi_label_classification"
if self.config.problem_type == "regression":
loss_fct = MSELoss()
if self.num_labels == 1:
loss = loss_fct(logits.squeeze(), labels.squeeze())
else:
loss = loss_fct(logits, labels)
elif self.config.problem_type == "single_label_classification":
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(logits, labels)
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return SequenceClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
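A sketch of how `problem_type` is inferred from the labels above (tensor values are illustrative):
```
import torch

labels = torch.tensor([0, 2, 1])  # dtype torch.long with num_labels > 1 -> "single_label_classification"
ml_labels = torch.tensor([[1.0, 0.0, 1.0]])  # float labels -> "multi_label_classification"
# num_labels == 1 selects "regression" (MSELoss) regardless of label dtype
```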
"""
X-MOD Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
softmax) e.g. for RocStories/SWAG tasks.
"""
@add_start_docstrings(
"""
X-MOD Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
softmax) e.g. for RocStories/SWAG tasks.
""",
XMOD_START_DOCSTRING,
)
class XmodForMultipleChoice(XmodPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.roberta = XmodModel(config)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, 1)
self.post_init()
@add_start_docstrings_to_model_forward(XMOD_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
lang_ids: Optional[torch.LongTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], MultipleChoiceModelOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
`input_ids` above)
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
flat_lang_ids = lang_ids.repeat(input_ids.size(0) * input_ids.size(1)) if lang_ids is not None else None
flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
flat_inputs_embeds = (
inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
if inputs_embeds is not None
else None
)
outputs = self.roberta(
flat_input_ids,
lang_ids=flat_lang_ids,
position_ids=flat_position_ids,
token_type_ids=flat_token_type_ids,
attention_mask=flat_attention_mask,
head_mask=head_mask,
inputs_embeds=flat_inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
pooled_output = outputs[1]
pooled_output = self.dropout(pooled_output)
logits = self.classifier(pooled_output)
reshaped_logits = logits.view(-1, num_choices)
loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
loss = loss_fct(reshaped_logits, labels)
if not return_dict:
output = (reshaped_logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return MultipleChoiceModelOutput(
loss=loss,
logits=reshaped_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
"""
X-MOD Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
Named-Entity-Recognition (NER) tasks.
"""
@add_start_docstrings(
"""
XMOD_START_DOCSTRING,
"""
)
class XmodForTokenClassification(XmodPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.roberta = XmodModel(config, add_pooling_layer=False)
classifier_dropout = (
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
self.dropout = nn.Dropout(classifier_dropout)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.post_init()
@add_start_docstrings_to_model_forward(XMOD_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
lang_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.roberta(
input_ids,
lang_ids=lang_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
sequence_output = self.dropout(sequence_output)
logits = self.classifier(sequence_output)
loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return TokenClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
class XmodClassificationHead(nn.Module):
"""Head for sentence-level classification tasks."""
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
classifier_dropout = (
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
self.dropout = nn.Dropout(classifier_dropout)
self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
def forward(self, features, **kwargs):
x = features[:, 0, :]
x = self.dropout(x)
x = self.dense(x)
x = torch.tanh(x)
x = self.dropout(x)
x = self.out_proj(x)
return x
@add_start_docstrings(
"""
X-MOD Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
XMOD_START_DOCSTRING,
)
class XmodForQuestionAnswering(XmodPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.roberta = XmodModel(config, add_pooling_layer=False)
self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
self.post_init()
@add_start_docstrings_to_model_forward(XMOD_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
lang_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
start_positions: Optional[torch.LongTensor] = None,
end_positions: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], QuestionAnsweringModelOutput]:
r"""
start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.roberta(
input_ids,
lang_ids=lang_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
logits = self.qa_outputs(sequence_output)
start_logits, end_logits = logits.split(1, dim=-1)
start_logits = start_logits.squeeze(-1).contiguous()
end_logits = end_logits.squeeze(-1).contiguous()
total_loss = None
if start_positions is not None and end_positions is not None:
if len(start_positions.size()) > 1:
start_positions = start_positions.squeeze(-1)
if len(end_positions.size()) > 1:
end_positions = end_positions.squeeze(-1)
ignored_index = start_logits.size(1)
start_positions = start_positions.clamp(0, ignored_index)
end_positions = end_positions.clamp(0, ignored_index)
loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
start_loss = loss_fct(start_logits, start_positions)
end_loss = loss_fct(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2
if not return_dict:
output = (start_logits, end_logits) + outputs[2:]
return ((total_loss,) + output) if total_loss is not None else output
return QuestionAnsweringModelOutput(
loss=total_loss,
start_logits=start_logits,
end_logits=end_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
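For orientation, a minimal sketch of turning the two logit vectors into a greedy answer span (the tensors are illustrative):
```
import torch

start_logits = torch.tensor([[0.1, 2.0, 0.3, 0.1]])
end_logits = torch.tensor([[0.1, 0.2, 3.0, 0.1]])
start = start_logits.argmax(-1).item()  # 1
end = end_logits.argmax(-1).item()      # 2
# The predicted answer covers input tokens start..end inclusive.
```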
def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
"""
Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
are ignored. This is modified from fairseq's `utils.make_positions`.
Args:
input_ids: torch.Tensor Input tensor whose non-padding elements are assigned position numbers.
padding_idx: int Index of the padding symbol; non-padding symbols are replaced by their position numbers.
past_key_values_length: int Length of the cached past key values, used to offset the incremental indices.
Returns:
torch.Tensor A tensor of the same shape as the input, with non-padding symbols replaced by their position numbers.
"""
mask = input_ids.ne(padding_idx).int()
incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
return incremental_indices.long() + padding_idx
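A worked example of the arithmetic above, assuming padding_idx=1 (RoBERTa's convention, which X-MOD inherits):
```
import torch

input_ids = torch.tensor([[5, 17, 23, 1, 1]])  # 1 is the pad id here
mask = input_ids.ne(1).int()  # [[1, 1, 1, 0, 0]]
incremental = torch.cumsum(mask, dim=1).type_as(mask) * mask  # [[1, 2, 3, 0, 0]]
position_ids = incremental.long() + 1  # [[2, 3, 4, 1, 1]]
# Non-padding positions count up from padding_idx + 1; pads keep padding_idx.
```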