Transformers 源码解析(九十七)
.\models\roberta_prelayernorm\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_flax_available,
is_tf_available,
is_torch_available,
)
_import_structure = {
"configuration_roberta_prelayernorm": [
"ROBERTA_PRELAYERNORM_PRETRAINED_CONFIG_ARCHIVE_MAP",
"RobertaPreLayerNormConfig",
"RobertaPreLayerNormOnnxConfig",
],
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_roberta_prelayernorm"] = [
"ROBERTA_PRELAYERNORM_PRETRAINED_MODEL_ARCHIVE_LIST",
"RobertaPreLayerNormForCausalLM",
"RobertaPreLayerNormForMaskedLM",
"RobertaPreLayerNormForMultipleChoice",
"RobertaPreLayerNormForQuestionAnswering",
"RobertaPreLayerNormForSequenceClassification",
"RobertaPreLayerNormForTokenClassification",
"RobertaPreLayerNormModel",
"RobertaPreLayerNormPreTrainedModel",
]
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_tf_roberta_prelayernorm"] = [
"TF_ROBERTA_PRELAYERNORM_PRETRAINED_MODEL_ARCHIVE_LIST",
"TFRobertaPreLayerNormForCausalLM",
"TFRobertaPreLayerNormForMaskedLM",
"TFRobertaPreLayerNormForMultipleChoice",
"TFRobertaPreLayerNormForQuestionAnswering",
"TFRobertaPreLayerNormForSequenceClassification",
"TFRobertaPreLayerNormForTokenClassification",
"TFRobertaPreLayerNormMainLayer",
"TFRobertaPreLayerNormModel",
"TFRobertaPreLayerNormPreTrainedModel",
]
try:
if not is_flax_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_flax_roberta_prelayernorm"] = [
"FlaxRobertaPreLayerNormForCausalLM",
"FlaxRobertaPreLayerNormForMaskedLM",
"FlaxRobertaPreLayerNormForMultipleChoice",
"FlaxRobertaPreLayerNormForQuestionAnswering",
"FlaxRobertaPreLayerNormForSequenceClassification",
"FlaxRobertaPreLayerNormForTokenClassification",
"FlaxRobertaPreLayerNormModel",
"FlaxRobertaPreLayerNormPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_roberta_prelayernorm import (
ROBERTA_PRELAYERNORM_PRETRAINED_CONFIG_ARCHIVE_MAP,
RobertaPreLayerNormConfig,
RobertaPreLayerNormOnnxConfig,
)
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_roberta_prelayernorm import (
ROBERTA_PRELAYERNORM_PRETRAINED_MODEL_ARCHIVE_LIST,
RobertaPreLayerNormForCausalLM,
RobertaPreLayerNormForMaskedLM,
RobertaPreLayerNormForMultipleChoice,
RobertaPreLayerNormForQuestionAnswering,
RobertaPreLayerNormForSequenceClassification,
RobertaPreLayerNormForTokenClassification,
RobertaPreLayerNormModel,
RobertaPreLayerNormPreTrainedModel,
)
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_tf_roberta_prelayernorm import (
TF_ROBERTA_PRELAYERNORM_PRETRAINED_MODEL_ARCHIVE_LIST,
TFRobertaPreLayerNormForCausalLM,
TFRobertaPreLayerNormForMaskedLM,
TFRobertaPreLayerNormForMultipleChoice,
TFRobertaPreLayerNormForQuestionAnswering,
TFRobertaPreLayerNormForSequenceClassification,
TFRobertaPreLayerNormForTokenClassification,
TFRobertaPreLayerNormMainLayer,
TFRobertaPreLayerNormModel,
TFRobertaPreLayerNormPreTrainedModel,
)
try:
if not is_flax_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_flax_roberta_prelayernorm import (
FlaxRobertaPreLayerNormForCausalLM,
FlaxRobertaPreLayerNormForMaskedLM,
FlaxRobertaPreLayerNormForMultipleChoice,
FlaxRobertaPreLayerNormForQuestionAnswering,
FlaxRobertaPreLayerNormForSequenceClassification,
FlaxRobertaPreLayerNormForTokenClassification,
FlaxRobertaPreLayerNormModel,
FlaxRobertaPreLayerNormPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\roc_bert\configuration_roc_bert.py
""" RoCBert model configuration"""
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
ROC_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"weiweishi/roc-bert-base-zh": "https://huggingface.co/weiweishi/roc-bert-base-zh/resolve/main/config.json",
}
class RoCBertConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`RoCBertModel`]. It is used to instantiate a
RoCBert model according to the specified arguments, defining the model architecture. Instantiating a configuration
with the defaults will yield a similar configuration to that of the RoCBert
[weiweishi/roc-bert-base-zh](https://huggingface.co/weiweishi/roc-bert-base-zh) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
```
>>> from transformers import RoCBertModel, RoCBertConfig
>>> # Initializing a RoCBert weiweishi/roc-bert-base-zh style configuration
>>> configuration = RoCBertConfig()
>>> # Initializing a model from the weiweishi/roc-bert-base-zh style configuration
>>> model = RoCBertModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
model_type = "roc_bert"
def __init__(
self,
vocab_size=30522,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=2,
initializer_range=0.02,
layer_norm_eps=1e-12,
use_cache=True,
pad_token_id=0,
position_embedding_type="absolute",
classifier_dropout=None,
enable_pronunciation=True,
enable_shape=True,
pronunciation_embed_dim=768,
pronunciation_vocab_size=910,
shape_embed_dim=512,
shape_vocab_size=24858,
concat_input=True,
**kwargs,
):
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.hidden_act = hidden_act
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.initializer_range = initializer_range
self.type_vocab_size = type_vocab_size
self.layer_norm_eps = layer_norm_eps
self.use_cache = use_cache
self.enable_pronunciation = enable_pronunciation
self.enable_shape = enable_shape
self.pronunciation_embed_dim = pronunciation_embed_dim
self.pronunciation_vocab_size = pronunciation_vocab_size
self.shape_embed_dim = shape_embed_dim
self.shape_vocab_size = shape_vocab_size
self.concat_input = concat_input
self.position_embedding_type = position_embedding_type
self.classifier_dropout = classifier_dropout
super().__init__(pad_token_id=pad_token_id, **kwargs)
.\models\roc_bert\modeling_roc_bert.py
""" PyTorch RoCBert 模型."""
import math
import os
from typing import List, Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN
from ...modeling_outputs import (
BaseModelOutputWithPastAndCrossAttentions,
BaseModelOutputWithPoolingAndCrossAttentions,
CausalLMOutputWithCrossAttentions,
MaskedLMOutput,
MultipleChoiceModelOutput,
QuestionAnsweringModelOutput,
SequenceClassifierOutput,
TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import (
apply_chunking_to_forward,
find_pruneable_heads_and_indices,
prune_linear_layer,
)
from ...utils import (
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_roc_bert import RoCBertConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "weiweishi/roc-bert-base-zh"
_CONFIG_FOR_DOC = "RoCBertConfig"
_EXPECTED_OUTPUT_SHAPE = [1, 8, 768]
_CHECKPOINT_FOR_TOKEN_CLASSIFICATION = "ArthurZ/dummy-rocbert-ner"
_TOKEN_CLASS_EXPECTED_OUTPUT = [
"S-EVENT", "S-FAC", "I-ORDINAL", "I-ORDINAL", "E-ORG", "E-LANGUAGE", "E-ORG", "E-ORG", "E-ORG", "E-ORG",
"I-EVENT", "S-TIME", "S-TIME", "E-LANGUAGE", "S-TIME", "E-DATE", "I-ORDINAL", "E-QUANTITY", "E-LANGUAGE",
"S-TIME", "B-ORDINAL", "S-PRODUCT", "E-LANGUAGE", "E-LANGUAGE", "E-ORG", "E-LOC", "S-TIME", "I-ORDINAL",
"S-FAC", "O", "S-GPE", "I-EVENT", "S-GPE", "E-LANGUAGE", "E-ORG", "S-EVENT", "S-FAC", "S-FAC", "S-FAC",
"E-ORG", "S-FAC", "E-ORG", "S-GPE"
]
_TOKEN_CLASS_EXPECTED_LOSS = 3.62
_CHECKPOINT_FOR_SEQUENCE_CLASSIFICATION = "ArthurZ/dummy-rocbert-seq"
_SEQ_CLASS_EXPECTED_OUTPUT = "'financial news'"
_SEQ_CLASS_EXPECTED_LOSS = 2.31
_CHECKPOINT_FOR_QA = "ArthurZ/dummy-rocbert-qa"
_QA_EXPECTED_OUTPUT = "''"
_QA_EXPECTED_LOSS = 3.75
_QA_TARGET_START_INDEX = 14
_QA_TARGET_END_INDEX = 15
ROC_BERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"weiweishi/roc-bert-base-zh",
]
def load_tf_weights_in_roc_bert(model, config, tf_checkpoint_path):
"""Load tf checkpoints in a pytorch model."""
try:
import re
import numpy as np
import tensorflow as tf
except ImportError:
logger.error(
"Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
"https://www.tensorflow.org/install/ for installation instructions."
)
raise
tf_path = os.path.abspath(tf_checkpoint_path)
logger.info(f"Converting TensorFlow checkpoint from {tf_path}")
init_vars = tf.train.list_variables(tf_path)
names = []
arrays = []
for name, shape in init_vars:
logger.info(f"Loading TF weight {name} with shape {shape}")
array = tf.train.load_variable(tf_path, name)
names.append(name)
arrays.append(array)
for name, array in zip(names, arrays):
name = name.split("/")
if any(
n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"]
for n in name
):
logger.info(f"Skipping {'/'.join(name)}")
continue
pointer = model
for m_name in name:
if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
scope_names = re.split(r"_(\d+)", m_name)
else:
scope_names = [m_name]
if scope_names[0] == "kernel" or scope_names[0] == "gamma":
pointer = getattr(pointer, "weight")
elif scope_names[0] == "output_bias" or scope_names[0] == "beta":
pointer = getattr(pointer, "bias")
elif scope_names[0] == "output_weights":
pointer = getattr(pointer, "weight")
elif scope_names[0] == "squad":
pointer = getattr(pointer, "classifier")
else:
try:
pointer = getattr(pointer, scope_names[0])
except AttributeError:
logger.info(f"Skipping {'/'.join(name)}")
continue
if len(scope_names) >= 2:
num = int(scope_names[1])
pointer = pointer[num]
if m_name[-11:] == "_embeddings":
pointer = getattr(pointer, "weight")
elif m_name == "kernel":
array = np.transpose(array)
try:
if pointer.shape != array.shape:
raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched")
except ValueError as e:
e.args += (pointer.shape, array.shape)
raise
logger.info(f"Initialize PyTorch weight {name}")
pointer.data = torch.from_numpy(array)
return model
class RoCBertEmbeddings(nn.Module):
"""Construct the embeddings from word, position, shape, pronunciation and token_type embeddings."""
def __init__(self, config):
super().__init__()
self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
self.pronunciation_embed = nn.Embedding(
config.pronunciation_vocab_size, config.pronunciation_embed_dim, padding_idx=config.pad_token_id
)
self.shape_embed = nn.Embedding(
config.shape_vocab_size, config.shape_embed_dim, padding_idx=config.pad_token_id
)
self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
self.enable_pronunciation = config.enable_pronunciation
self.enable_shape = config.enable_shape
if config.concat_input:
input_dim = config.hidden_size
if self.enable_pronunciation:
pronunciation_dim = config.pronunciation_embed_dim
input_dim += pronunciation_dim
if self.enable_shape:
shape_dim = config.shape_embed_dim
input_dim += shape_dim
self.map_inputs_layer = torch.nn.Linear(input_dim, config.hidden_size)
else:
self.map_inputs_layer = None
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
self.register_buffer(
"token_type_ids",
torch.zeros(self.position_ids.size(), dtype=torch.long, device=self.position_ids.device),
persistent=False,
)
def forward(
self,
input_ids=None,
input_shape_ids=None,
input_pronunciation_ids=None,
token_type_ids=None,
position_ids=None,
inputs_embeds=None,
past_key_values_length=0,
):
pass
class RoCBertSelfAttention(nn.Module):
def __init__(self, config, position_embedding_type=None):
super().__init__()
if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
raise ValueError(
f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
f"heads ({config.num_attention_heads})"
)
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.query = nn.Linear(config.hidden_size, self.all_head_size)
self.key = nn.Linear(config.hidden_size, self.all_head_size)
self.value = nn.Linear(config.hidden_size, self.all_head_size)
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
self.position_embedding_type = position_embedding_type or getattr(
config, "position_embedding_type", "absolute"
)
if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
self.max_position_embeddings = config.max_position_embeddings
self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)
self.is_decoder = config.is_decoder
class RoCBertSelfOutput(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
class RoCBertAttention(nn.Module):
def __init__(self, config, position_embedding_type=None):
super().__init__()
self.self = RoCBertSelfAttention(config, position_embedding_type=position_embedding_type)
self.output = RoCBertSelfOutput(config)
self.pruned_heads = set()
def prune_heads(self, heads):
if len(heads) == 0:
return
heads, index = find_pruneable_heads_and_indices(
heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
)
self.self.query = prune_linear_layer(self.self.query, index)
self.self.key = prune_linear_layer(self.self.key, index)
self.self.value = prune_linear_layer(self.self.value, index)
self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
self.pruned_heads = self.pruned_heads.union(heads)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor]:
self_outputs = self.self(
hidden_states,
attention_mask,
head_mask,
encoder_hidden_states,
encoder_attention_mask,
past_key_value,
output_attentions,
)
attention_output = self.output(self_outputs[0], hidden_states)
outputs = (attention_output,) + self_outputs[1:]
return outputs
class RoCBertIntermediate(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states
class RoCBertOutput(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
class RoCBertLayer(nn.Module):
def __init__(self, config):
super().__init__()
self.chunk_size_feed_forward = config.chunk_size_feed_forward
self.seq_len_dim = 1
self.attention = RoCBertAttention(config)
self.is_decoder = config.is_decoder
self.add_cross_attention = config.add_cross_attention
if self.add_cross_attention:
if not self.is_decoder:
raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
self.crossattention = RoCBertAttention(config, position_embedding_type="absolute")
self.intermediate = RoCBertIntermediate(config)
self.output = RoCBertOutput(config)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor]:
self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
self_attention_outputs = self.attention(
hidden_states,
attention_mask,
head_mask,
output_attentions=output_attentions,
past_key_value=self_attn_past_key_value,
)
attention_output = self_attention_outputs[0]
if self.is_decoder:
outputs = self_attention_outputs[1:-1]
present_key_value = self_attention_outputs[-1]
else:
outputs = self_attention_outputs[1:]
cross_attn_present_key_value = None
if self.is_decoder and encoder_hidden_states is not None:
if not hasattr(self, "crossattention"):
raise ValueError(
f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers"
" by setting `config.add_cross_attention=True`"
)
cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
cross_attention_outputs = self.crossattention(
attention_output,
attention_mask,
head_mask,
encoder_hidden_states,
encoder_attention_mask,
cross_attn_past_key_value,
output_attentions,
)
attention_output = cross_attention_outputs[0]
outputs = outputs + cross_attention_outputs[1:-1]
cross_attn_present_key_value = cross_attention_outputs[-1]
present_key_value = present_key_value + cross_attn_present_key_value
layer_output = apply_chunking_to_forward(
self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
)
outputs = (layer_output,) + outputs
if self.is_decoder:
outputs = outputs + (present_key_value,)
return outputs
def feed_forward_chunk(self, attention_output):
intermediate_output = self.intermediate(attention_output)
layer_output = self.output(intermediate_output, attention_output)
return layer_output
class RoCBertEncoder(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
self.layer = nn.ModuleList([RoCBertLayer(config) for _ in range(config.num_hidden_layers)])
self.gradient_checkpointing = False
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = False,
output_hidden_states: Optional[bool] = False,
return_dict: Optional[bool] = True,
) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
all_hidden_states = () if output_hidden_states else None
all_self_attentions = () if output_attentions else None
all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
if self.gradient_checkpointing and self.training:
if use_cache:
logger.warning_once(
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
)
use_cache = False
next_decoder_cache = () if use_cache else None
for i, layer_module in enumerate(self.layer):
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
layer_head_mask = head_mask[i] if head_mask is not None else None
past_key_value = past_key_values[i] if past_key_values is not None else None
if self.gradient_checkpointing and self.training:
layer_outputs = self._gradient_checkpointing_func(
layer_module.__call__,
hidden_states,
attention_mask,
layer_head_mask,
encoder_hidden_states,
encoder_attention_mask,
past_key_value,
output_attentions,
)
else:
layer_outputs = layer_module(
hidden_states,
attention_mask,
layer_head_mask,
encoder_hidden_states,
encoder_attention_mask,
past_key_value,
output_attentions,
)
hidden_states = layer_outputs[0]
if use_cache:
next_decoder_cache += (layer_outputs[-1],)
if output_attentions:
all_self_attentions = all_self_attentions + (layer_outputs[1],)
if self.config.add_cross_attention:
all_cross_attentions = all_cross_attentions + (layer_outputs[2],)
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
return tuple(
v
for v in [
hidden_states,
next_decoder_cache,
all_hidden_states,
all_self_attentions,
all_cross_attentions,
]
if v is not None
)
return BaseModelOutputWithPastAndCrossAttentions(
last_hidden_state=hidden_states,
past_key_values=next_decoder_cache,
hidden_states=all_hidden_states,
attentions=all_self_attentions,
cross_attentions=all_cross_attentions,
)
class RoCBertPooler(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.activation = nn.Tanh()
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
first_token_tensor = hidden_states[:, 0]
pooled_output = self.dense(first_token_tensor)
pooled_output = self.activation(pooled_output)
return pooled_output
class RoCBertPredictionHeadTransform(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
if isinstance(config.hidden_act, str):
self.transform_act_fn = ACT2FN[config.hidden_act]
else:
self.transform_act_fn = config.hidden_act
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.transform_act_fn(hidden_states)
hidden_states = self.LayerNorm(hidden_states)
return hidden_states
class RoCBertLMPredictionHead(nn.Module):
def __init__(self, config):
super().__init__()
self.transform = RoCBertPredictionHeadTransform(config)
self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
self.bias = nn.Parameter(torch.zeros(config.vocab_size))
self.decoder.bias = self.bias
def forward(self, hidden_states):
hidden_states = self.transform(hidden_states)
hidden_states = self.decoder(hidden_states)
return hidden_states
class RoCBertOnlyMLMHead(nn.Module):
def __init__(self, config):
super().__init__()
self.predictions = RoCBertLMPredictionHead(config)
def forward(self, sequence_output: torch.Tensor) -> torch.Tensor:
prediction_scores = self.predictions(sequence_output)
return prediction_scores
class RoCBertPreTrainedModel(PreTrainedModel):
"""
RoCBert 模型的基类,继承自 PreTrainedModel
"""
config_class = RoCBertConfig
load_tf_weights = load_tf_weights_in_roc_bert
base_model_prefix = "roc_bert"
supports_gradient_checkpointing = True
def _init_weights(self, module):
"""Initialize the weights"""
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
ROC_BERT_START_DOCSTRING = r"""
This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
Parameters:
config ([`RoCBertConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
ROC_BERT_INPUTS_DOCSTRING = r"""
"""
@add_start_docstrings(
"The bare RoCBert Model transformer outputting raw hidden-states without any specific head on top.",
ROC_BERT_START_DOCSTRING,
)
class RoCBertModel(RoCBertPreTrainedModel):
"""
RoCBertModel 可以作为一个编码器(仅自注意力)或解码器使用。在解码器模式下,将在自注意力层之间添加交叉注意力层,
遵循 [Attention is all you need](https://arxiv.org/abs/1706.03762) 中描述的架构。
"""
def __init__(self, config, add_pooling_layer=True):
super().__init__(config)
self.config = config
self.embeddings = RoCBertEmbeddings(config)
self.encoder = RoCBertEncoder(config)
self.pooler = RoCBertPooler(config) if add_pooling_layer else None
self.post_init()
def get_input_embeddings(self):
return self.embeddings.word_embeddings
def set_input_embeddings(self, value):
self.embeddings.word_embeddings = value
def get_pronunciation_embeddings(self):
return self.embeddings.pronunciation_embed
def set_pronunciation_embeddings(self, value):
self.embeddings.pronunciation_embed = value
def get_shape_embeddings(self):
return self.embeddings.shape_embed
def set_shape_embeddings(self, value):
self.embeddings.shape_embed = value
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
@add_start_docstrings_to_model_forward(ROC_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPoolingAndCrossAttentions,
config_class=_CONFIG_FOR_DOC,
expected_output=_EXPECTED_OUTPUT_SHAPE,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
input_shape_ids: Optional[torch.Tensor] = None,
input_pronunciation_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
@add_start_docstrings(
"""
RoCBert Model with contrastive loss and masked_lm_loss during the pretraining.
""",
ROC_BERT_START_DOCSTRING,
)
class RoCBertForPreTraining(RoCBertPreTrainedModel):
_tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"]
def __init__(self, config):
super().__init__(config)
self.roc_bert = RoCBertModel(config)
self.cls = RoCBertOnlyMLMHead(config)
self.post_init()
def get_output_embeddings(self):
return self.cls.predictions.decoder
def set_output_embeddings(self, new_embeddings):
self.cls.predictions.decoder = new_embeddings
@add_start_docstrings_to_model_forward(ROC_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=MaskedLMOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
input_shape_ids: Optional[torch.Tensor] = None,
input_pronunciation_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
attack_input_ids: Optional[torch.Tensor] = None,
attack_input_shape_ids: Optional[torch.Tensor] = None,
attack_input_pronunciation_ids: Optional[torch.Tensor] = None,
attack_attention_mask: Optional[torch.Tensor] = None,
attack_token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels_input_ids: Optional[torch.Tensor] = None,
labels_input_shape_ids: Optional[torch.Tensor] = None,
labels_input_pronunciation_ids: Optional[torch.Tensor] = None,
labels_attention_mask: Optional[torch.Tensor] = None,
labels_token_type_ids: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
**kwargs,
):
"""
RoCBert Model with a `language modeling` head on top.
"""
def __init__(self, config):
super().__init__(config)
if config.is_decoder:
logger.warning(
"If you want to use `RoCBertForMaskedLM` make sure `config.is_decoder=False` for "
"bi-directional self-attention."
)
self.roc_bert = RoCBertModel(config, add_pooling_layer=False)
self.cls = RoCBertOnlyMLMHead(config)
self.post_init()
def get_output_embeddings(self):
return self.cls.predictions.decoder
def set_output_embeddings(self, new_embeddings):
self.cls.predictions.decoder = new_embeddings
@add_start_docstrings_to_model_forward(ROC_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
input_shape_ids: Optional[torch.Tensor] = None,
input_pronunciation_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], MaskedLMOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Example:
```
>>> from transformers import AutoTokenizer, RoCBertForMaskedLM
>>> import torch
>>> tokenizer = AutoTokenizer.from_pretrained("weiweishi/roc-bert-base-zh")
>>> model = RoCBertForMaskedLM.from_pretrained("weiweishi/roc-bert-base-zh")
>>> inputs = tokenizer("法国是首都[MASK].", return_tensors="pt")
>>> with torch.no_grad():
... logits = model(**inputs).logits
>>> # retrieve index of {mask}
>>> mask_token_index = (inputs.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]
>>> predicted_token_id = logits[0, mask_token_index].argmax(axis=-1)
>>> tokenizer.decode(predicted_token_id)
'.'
```
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.roc_bert(
input_ids,
input_shape_ids=input_shape_ids,
input_pronunciation_ids=input_pronunciation_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
prediction_scores = self.cls(sequence_output)
masked_lm_loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
if not return_dict:
output = (prediction_scores,) + outputs[2:]
return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
return MaskedLMOutput(
loss=masked_lm_loss,
logits=prediction_scores,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
):
input_shape = input_ids.shape
effective_batch_size = input_shape[0]
if self.config.pad_token_id is None:
raise ValueError("The PAD token should be defined for generation")
attention_mask = torch.cat([attention_mask, attention_mask.new_zeros((attention_mask.shape[0], 1))], dim=-1)
dummy_token = torch.full(
(effective_batch_size, 1), self.config.pad_token_id, dtype=torch.long, device=input_ids.device
)
input_ids = torch.cat([input_ids, dummy_token], dim=1)
if input_shape_ids is not None:
input_shape_ids = torch.cat([input_shape_ids, dummy_token], dim=1)
if input_pronunciation_ids is not None:
input_pronunciation_ids = torch.cat([input_pronunciation_ids, dummy_token], dim=1)
return {
"input_ids": input_ids,
"input_shape_ids": input_shape_ids,
"input_pronunciation_ids": input_pronunciation_ids,
"attention_mask": attention_mask,
}
class RoCBertForCausalLM(RoCBertPreTrainedModel):
_tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"]
def __init__(self, config):
super().__init__(config)
if not config.is_decoder:
logger.warning("If you want to use `RoCRoCBertForCausalLM` as a standalone, add `is_decoder=True.`")
self.roc_bert = RoCBertModel(config, add_pooling_layer=False)
self.cls = RoCBertOnlyMLMHead(config)
self.post_init()
def get_output_embeddings(self):
return self.cls.predictions.decoder
def set_output_embeddings(self, new_embeddings):
self.cls.predictions.decoder = new_embeddings
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
input_shape_ids: Optional[torch.Tensor] = None,
input_pronunciation_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[List[torch.Tensor]] = None,
labels: Optional[torch.Tensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
pass
def prepare_inputs_for_generation(
self,
input_ids,
input_shape_ids=None,
input_pronunciation_ids=None,
past_key_values=None,
attention_mask=None,
**model_kwargs,
):
pass
):
input_shape = input_ids.shape
if attention_mask is None:
attention_mask = input_ids.new_ones(input_shape)
if past_key_values is not None:
past_length = past_key_values[0][0].shape[2]
if input_ids.shape[1] > past_length:
remove_prefix_length = past_length
else:
remove_prefix_length = input_ids.shape[1] - 1
input_ids = input_ids[:, remove_prefix_length:]
if input_shape_ids is not None:
input_shape_ids = input_shape_ids[:, -1:]
if input_pronunciation_ids is not None:
input_pronunciation_ids = input_pronunciation_ids[:, -1:]
return {
"input_ids": input_ids,
"input_shape_ids": input_shape_ids,
"input_pronunciation_ids": input_pronunciation_ids,
"attention_mask": attention_mask,
"past_key_values": past_key_values,
}
def _reorder_cache(self, past_key_values, beam_idx):
reordered_past = ()
for layer_past in past_key_values:
reordered_past += (
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
)
return reordered_past
@add_start_docstrings(
"""RoCBert Model transformer with a sequence classification/regression head on top (a linear layer on top of
the pooled output) e.g. for GLUE tasks.""",
ROC_BERT_START_DOCSTRING,
)
class RoCBertForSequenceClassification(RoCBertPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.config = config
self.roc_bert = RoCBertModel(config)
classifier_dropout = (
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
self.dropout = nn.Dropout(classifier_dropout)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.post_init()
@add_start_docstrings_to_model_forward(ROC_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_SEQUENCE_CLASSIFICATION,
output_type=SequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
expected_output=_SEQ_CLASS_EXPECTED_OUTPUT,
expected_loss=_SEQ_CLASS_EXPECTED_LOSS,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
input_shape_ids: Optional[torch.Tensor] = None,
input_pronunciation_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.roc_bert(
input_ids,
input_shape_ids=input_shape_ids,
input_pronunciation_ids=input_pronunciation_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
pooled_output = outputs[1]
pooled_output = self.dropout(pooled_output)
logits = self.classifier(pooled_output)
loss = None
if labels is not None:
if self.config.problem_type is None:
if self.num_labels == 1:
self.config.problem_type = "regression"
elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
self.config.problem_type = "single_label_classification"
else:
self.config.problem_type = "multi_label_classification"
if self.config.problem_type == "regression":
loss_fct = MSELoss()
if self.num_labels == 1:
loss = loss_fct(logits.squeeze(), labels.squeeze())
else:
loss = loss_fct(logits, labels)
elif self.config.problem_type == "single_label_classification":
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(logits, labels)
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return SequenceClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
@add_start_docstrings(
"""RoCBert Model with a multiple choice classification head on top (a linear layer on top of
the pooled output and a softmax) e.g. for RocStories/SWAG tasks.""",
ROC_BERT_START_DOCSTRING,
)
class RoCBertForMultipleChoice(RoCBertPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.roc_bert = RoCBertModel(config)
classifier_dropout = (
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
self.dropout = nn.Dropout(classifier_dropout)
self.classifier = nn.Linear(config.hidden_size, 1)
self.post_init()
@add_start_docstrings_to_model_forward(
ROC_BERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")
)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=MultipleChoiceModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
input_shape_ids: Optional[torch.Tensor] = None,
input_pronunciation_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], MultipleChoiceModelOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
`input_ids` above)
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
input_shape_ids = input_shape_ids.view(-1, input_shape_ids.size(-1)) if input_shape_ids is not None else None
input_pronunciation_ids = (
input_pronunciation_ids.view(-1, input_pronunciation_ids.size(-1))
if input_pronunciation_ids is not None
else None
)
attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
inputs_embeds = (
inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
if inputs_embeds is not None
else None
)
outputs = self.roc_bert(
input_ids,
input_shape_ids=input_shape_ids,
input_pronunciation_ids=input_pronunciation_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
pooled_output = outputs[1]
pooled_output = self.dropout(pooled_output)
logits = self.classifier(pooled_output)
reshaped_logits = logits.view(-1, num_choices)
loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
loss = loss_fct(reshaped_logits, labels)
if not return_dict:
output = (reshaped_logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return MultipleChoiceModelOutput(
loss=loss,
logits=reshaped_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
@add_start_docstrings(
"""RoCBert Model with a token classification head on top (a linear layer on top of
the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks.""",
ROC_BERT_START_DOCSTRING,
)
class RoCBertForTokenClassification(RoCBertPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.roc_bert = RoCBertModel(config, add_pooling_layer=False)
classifier_dropout = (
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
self.dropout = nn.Dropout(classifier_dropout)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.post_init()
@add_start_docstrings_to_model_forward(ROC_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_TOKEN_CLASSIFICATION,
output_type=TokenClassifierOutput,
config_class=_CONFIG_FOR_DOC,
expected_output=_TOKEN_CLASS_EXPECTED_OUTPUT,
expected_loss=_TOKEN_CLASS_EXPECTED_LOSS,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
input_shape_ids: Optional[torch.Tensor] = None,
input_pronunciation_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
self,
input_ids: Optional[torch.Tensor] = None,
input_shape_ids: Optional[torch.Tensor] = None,
input_pronunciation_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
`
def forward(
input_ids: torch.LongTensor,
input_shape_ids: Optional[torch.LongTensor] = None,
input_pronunciation_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, TokenClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.roc_bert(
input_ids,
input_shape_ids=input_shape_ids,
input_pronunciation_ids=input_pronunciation_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
sequence_output = self.dropout(sequence_output)
logits = self.classifier(sequence_output)
loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
if not return_dict:
@add_start_docstrings(
"""RoCBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
layers on top of the hidden-states output to compute `span start logits` and `span end logits`).""",
ROC_BERT_START_DOCSTRING,
)
class RoCBertForQuestionAnswering(RoCBertPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.roc_bert = RoCBertModel(config, add_pooling_layer=False)
self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
self.post_init()
@add_start_docstrings_to_model_forward(ROC_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_QA,
output_type=QuestionAnsweringModelOutput,
config_class=_CONFIG_FOR_DOC,
qa_target_start_index=_QA_TARGET_START_INDEX,
qa_target_end_index=_QA_TARGET_END_INDEX,
expected_output=_QA_EXPECTED_OUTPUT,
expected_loss=_QA_EXPECTED_LOSS,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
input_shape_ids: Optional[torch.Tensor] = None,
input_pronunciation_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
start_positions: Optional[torch.Tensor] = None,
end_positions: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
.\models\roc_bert\tokenization_roc_bert.py
"""Tokenization classes for RoCBert."""
import collections
import itertools
import json
import os
import unicodedata
from typing import Dict, List, Optional, Tuple, Union
from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
from ...tokenization_utils_base import (
ENCODE_KWARGS_DOCSTRING,
ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING,
BatchEncoding,
EncodedInput,
EncodedInputPair,
PaddingStrategy,
PreTokenizedInput,
PreTokenizedInputPair,
TensorType,
TextInput,
TextInputPair,
TruncationStrategy,
)
from ...utils import add_end_docstrings, logging
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {
"vocab_file": "vocab.txt",
"word_shape_file": "word_shape.json",
"word_pronunciation_file": "word_pronunciation.json",
}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"weiweishi/roc-bert-base-zh": "https://huggingface.co/weiweishi/roc-bert-base-zh/resolve/main/vocab.txt"
},
"word_shape_file": {
"weiweishi/roc-bert-base-zh": "https://huggingface.co/weiweishi/roc-bert-base-zh/resolve/main/word_shape.json"
},
"word_pronunciation_file": {
"weiweishi/roc-bert-base-zh": (
"https://huggingface.co/weiweishi/roc-bert-base-zh/resolve/main/word_pronunciation.json"
)
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"weiweishi/roc-bert-base-zh": 512,
}
PRETRAINED_INIT_CONFIGURATION = {
"weiweishi/roc-bert-base-zh": {"do_lower_case": True},
}
def load_vocab(vocab_file):
"""Loads a vocabulary file into a dictionary."""
vocab = collections.OrderedDict()
with open(vocab_file, "r", encoding="utf-8") as reader:
tokens = reader.readlines()
for index, token in enumerate(tokens):
token = token.rstrip("\n")
vocab[token] = index
return vocab
def whitespace_tokenize(text):
"""Runs basic whitespace cleaning and splitting on a piece of text."""
text = text.strip()
if not text:
return []
tokens = text.split()
return tokens
class RoCBertTokenizer(PreTrainedTokenizer):
r"""
Args:
# 构建一个 RoCBert 分词器,基于 WordPiece。该分词器继承自 `PreTrainedTokenizer`,其中包含大多数主要方法。
# 用户应参考该超类以获取有关这些方法的更多信息。
def __init__(
self,
vocab_file: str,
word_shape_file: str,
word_pronunciation_file: str,
do_lower_case: bool = True,
do_basic_tokenize: bool = True,
never_split: Iterable[str] = None,
unk_token: str = "[UNK]",
sep_token: str = "[SEP]",
pad_token: str = "[PAD]",
cls_token: str = "[CLS]",
mask_token: str = "[MASK]",
tokenize_chinese_chars: bool = True,
strip_accents: bool = None
):
# 词汇表文件名
self.vocab_files_names = VOCAB_FILES_NAMES
# 预训练模型的词汇文件映射
self.pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
# 预训练模型的初始化配置
self.pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
# 预训练位置嵌入的最大输入尺寸
self.max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
def __init__(
self,
vocab_file,
word_shape_file,
word_pronunciation_file,
do_lower_case=True,
do_basic_tokenize=True,
never_split=None,
unk_token="[UNK]",
sep_token="[SEP]",
pad_token="[PAD]",
cls_token="[CLS]",
mask_token="[MASK]",
tokenize_chinese_chars=True,
strip_accents=None,
**kwargs,
):
# 检查并确保提供的文件路径有效,如果不存在或者不是文件,则抛出异常
for cur_file in [vocab_file, word_shape_file, word_pronunciation_file]:
if cur_file is None or not os.path.isfile(cur_file):
raise ValueError(
f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google "
"pretrained model use `tokenizer = RoCBertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
)
# 加载词汇表文件并存储到 self.vocab 中
self.vocab = load_vocab(vocab_file)
# 使用 UTF-8 编码打开词形文件,并加载其内容到 self.word_shape 中
with open(word_shape_file, "r", encoding="utf8") as in_file:
self.word_shape = json.load(in_file)
# 使用 UTF-8 编码打开发音文件,并加载其内容到 self.word_pronunciation 中
with open(word_pronunciation_file, "r", encoding="utf8") as in_file:
self.word_pronunciation = json.load(in_file)
# 创建一个从 ID 到 token 的有序字典,用于反向查找
self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
# 设置是否执行基本的分词操作
self.do_basic_tokenize = do_basic_tokenize
if do_basic_tokenize:
# 如果需要基本分词,则初始化 RoCBertBasicTokenizer 对象
self.basic_tokenizer = RoCBertBasicTokenizer(
do_lower_case=do_lower_case,
never_split=never_split,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
)
# 使用给定的未知 token 初始化 RoCBertWordpieceTokenizer 对象
self.wordpiece_tokenizer = RoCBertWordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))
# 调用父类的初始化方法,设置通用的 tokenizer 参数
super().__init__(
do_lower_case=do_lower_case,
do_basic_tokenize=do_basic_tokenize,
never_split=never_split,
unk_token=unk_token,
sep_token=sep_token,
pad_token=pad_token,
cls_token=cls_token,
mask_token=mask_token,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
**kwargs,
)
@property
def do_lower_case(self):
# 返回当前 tokenizer 是否执行小写化处理的状态
return self.basic_tokenizer.do_lower_case
@property
def vocab_size(self):
# 返回当前词汇表的大小(词汇表中 token 的数量)
return len(self.vocab)
# 从 transformers 库中复制的方法,返回当前 tokenizer 的完整词汇表(包括添加的特殊 token)
def get_vocab(self):
return dict(self.vocab, **self.added_tokens_encoder)
# 从 transformers 库中复制的方法,用于实现 tokenization_bert.BertTokenizer._tokenize 的功能
# 使用self对象的basic_tokenizer对文本进行基本的tokenization处理
def _tokenize(self, text, split_special_tokens=False):
# 初始化空列表,用于存储分割后的token
split_tokens = []
# 如果设置了do_basic_tokenize标志为True
if self.do_basic_tokenize:
# 调用basic_tokenizer的tokenize方法对文本进行处理
for token in self.basic_tokenizer.tokenize(
text, never_split=self.all_special_tokens if not split_special_tokens else None
):
# 如果token在never_split集合中,则直接加入split_tokens列表
if token in self.basic_tokenizer.never_split:
split_tokens.append(token)
# 否则使用wordpiece_tokenizer对token进行进一步的分割处理,并将结果加入split_tokens列表
else:
split_tokens += self.wordpiece_tokenizer.tokenize(token)
else:
# 如果do_basic_tokenize标志为False,则直接使用wordpiece_tokenizer对文本进行处理
split_tokens = self.wordpiece_tokenizer.tokenize(text)
# 返回处理后的token列表
return split_tokens
# 使用@add_end_docstrings注解来添加文档字符串,描述函数的各个参数
@add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
def prepare_for_model(
self,
ids: List[int],
shape_ids: List[int],
pronunciation_ids: List[int],
pair_ids: Optional[List[int]] = None,
pair_shape_ids: Optional[List[int]] = None,
pair_pronunciation_ids: Optional[List[int]] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = None,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
prepend_batch_axis: bool = False,
**kwargs,
):
# 函数用于将输入的ids、shape_ids和pronunciation_ids编码成模型输入
# 如果提供了pair_ids、pair_shape_ids和pair_pronunciation_ids,也会进行相应处理
# 设置是否添加特殊token,默认为True
# 设置padding策略,默认不进行padding
# 设置截断策略,默认不进行截断
# 设置最大长度限制,默认为None
# 设置stride,默认为0
# 设置是否将输入拆分成单词,默认为False
# 设置pad_to_multiple_of,用于设置多少的倍数进行padding,默认为None
# 设置返回的张量类型,默认为None
# 设置是否返回token类型ID,默认为None
# 设置是否返回注意力掩码,默认为None
# 设置是否返回溢出的token,默认为False
# 设置是否返回特殊token掩码,默认为False
# 设置是否返回偏移映射,默认为False
# 设置是否返回长度,默认为False
# 设置是否输出详细信息,默认为True
# 设置是否在前面添加批次轴,默认为False
# kwargs用于接收其它可能的关键字参数
pass # 函数体未提供,暂时占位
# 在类中定义一个方法 `_pad`,用于填充输入数据以达到指定的最大长度
def _pad(
# 接受一个字典或批量编码作为输入,其中键是字符串,值是编码后的输入
self,
encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
# 可选参数,指定填充后的最大长度
max_length: Optional[int] = None,
# 可选参数,填充策略,默认不进行填充
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
# 可选参数,指定填充后的长度是某数的倍数
pad_to_multiple_of: Optional[int] = None,
# 可选参数,是否返回注意力掩码
return_attention_mask: Optional[bool] = None,
) -> dict:
# 如果 return_attention_mask 为 None,则根据模型输入名称判断是否需要返回 attention_mask
if return_attention_mask is None:
return_attention_mask = "attention_mask" in self.model_input_names
# 获取必需的输入,通常为第一个模型输入的编码结果
required_input = encoded_inputs[self.model_input_names[0]]
# 根据 padding_strategy 是否为 LONGEST,确定 max_length
if padding_strategy == PaddingStrategy.LONGEST:
max_length = len(required_input)
# 如果 max_length 和 pad_to_multiple_of 都有值,并且 max_length 不是 pad_to_multiple_of 的倍数,调整 max_length
if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
# 判断是否需要进行填充
needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length
# 如果需要返回 attention_mask 并且 encoded_inputs 中没有 "attention_mask",则初始化 attention_mask
if return_attention_mask and "attention_mask" not in encoded_inputs:
encoded_inputs["attention_mask"] = [1] * len(required_input)
# 如果需要填充
if needs_to_be_padded:
difference = max_length - len(required_input)
# 如果填充在右侧
if self.padding_side == "right":
if return_attention_mask:
encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
if "token_type_ids" in encoded_inputs:
encoded_inputs["token_type_ids"] = (
encoded_inputs["token_type_ids"] + [self.pad_token_type_id] * difference
)
if "special_tokens_mask" in encoded_inputs:
encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
for key in ["input_shape_ids", "input_pronunciation_ids"]:
if key in encoded_inputs:
encoded_inputs[key] = encoded_inputs[key] + [self.pad_token_id] * difference
encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference
# 如果填充在左侧
elif self.padding_side == "left":
if return_attention_mask:
encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
if "token_type_ids" in encoded_inputs:
encoded_inputs["token_type_ids"] = [self.pad_token_type_id] * difference + encoded_inputs[
"token_type_ids"
]
if "special_tokens_mask" in encoded_inputs:
encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
for key in ["input_shape_ids", "input_pronunciation_ids"]:
if key in encoded_inputs:
encoded_inputs[key] = [self.pad_token_id] * difference + encoded_inputs[key]
encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
else:
# 如果填充策略不是 "left" 或 "right",抛出异常
raise ValueError("Invalid padding strategy:" + str(self.padding_side))
# 返回填充后的 encoded_inputs 字典
return encoded_inputs
# 定义一个方法用于批量编码文本或文本对,支持多种输入类型
@add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
def _batch_encode_plus(
self,
batch_text_or_text_pairs: Union[
List[TextInput],
List[TextInputPair],
List[PreTokenizedInput],
List[PreTokenizedInputPair],
List[EncodedInput],
List[EncodedInputPair],
],
add_special_tokens: bool = True, # 是否添加特殊标记
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, # 填充策略,默认不填充
truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE, # 截断策略,默认不截断
max_length: Optional[int] = None, # 最大长度限制
stride: int = 0, # 步长,默认为0
is_split_into_words: bool = False, # 输入是否已经分成单词
pad_to_multiple_of: Optional[int] = None, # 填充到指定的倍数
return_tensors: Optional[Union[str, TensorType]] = None, # 返回的张量类型
return_token_type_ids: Optional[bool] = None, # 是否返回token类型IDs
return_attention_mask: Optional[bool] = None, # 是否返回注意力掩码
return_overflowing_tokens: bool = False, # 是否返回溢出的tokens
return_special_tokens_mask: bool = False, # 是否返回特殊tokens的掩码
return_offsets_mapping: bool = False, # 是否返回偏移映射
return_length: bool = False, # 是否返回长度
verbose: bool = True, # 是否详细输出
**kwargs, # 其他关键字参数
):
# 方法功能的详细描述和补充参数文档参见ENCODE_KWARGS_DOCSTRING和ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING
pass
# 定义一个方法用于为模型准备批量数据,支持多种输入类型
def _batch_prepare_for_model(
self,
batch_ids_pairs: List[Union[PreTokenizedInputPair, Tuple[List[int], None]]],
batch_shape_ids_pairs: List[Union[PreTokenizedInputPair, Tuple[List[int], None]]],
batch_pronunciation_ids_pairs: List[Union[PreTokenizedInputPair, Tuple[List[int], None]]],
add_special_tokens: bool = True, # 是否添加特殊标记
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, # 填充策略,默认不填充
truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE, # 截断策略,默认不截断
max_length: Optional[int] = None, # 最大长度限制
stride: int = 0, # 步长,默认为0
pad_to_multiple_of: Optional[int] = None, # 填充到指定的倍数
return_tensors: Optional[str] = None, # 返回的张量类型
return_token_type_ids: Optional[bool] = None, # 是否返回token类型IDs
return_attention_mask: Optional[bool] = None, # 是否返回注意力掩码
return_overflowing_tokens: bool = False, # 是否返回溢出的tokens
return_special_tokens_mask: bool = False, # 是否返回特殊tokens的掩码
return_length: bool = False, # 是否返回长度
verbose: bool = True, # 是否详细输出
**kwargs, # 其他关键字参数
):
# 方法用于准备模型输入数据的详细描述和补充参数文档参见相应文档
pass
) -> BatchEncoding:
"""
准备一个输入 id 序列,或者一对输入 id 序列,以便可以被模型使用。它添加特殊标记,根据特殊标记截断序列,同时考虑特殊标记,并管理一个移动窗口(带有用户定义的步长)来处理溢出的标记。
Args:
batch_ids_pairs: tokenized input ids 或者 input ids pairs 的列表
batch_shape_ids_pairs: tokenized input shape ids 或者 input shape ids pairs 的列表
batch_pronunciation_ids_pairs: tokenized input pronunciation ids 或者 input pronunciation ids pairs 的列表
"""
batch_outputs = {}
for i, (first_ids, second_ids) in enumerate(batch_ids_pairs):
first_shape_ids, second_shape_ids = batch_shape_ids_pairs[i]
first_pronunciation_ids, second_pronunciation_ids = batch_pronunciation_ids_pairs[i]
outputs = self.prepare_for_model(
first_ids,
first_shape_ids,
first_pronunciation_ids,
pair_ids=second_ids,
pair_shape_ids=second_shape_ids,
pair_pronunciation_ids=second_pronunciation_ids,
add_special_tokens=add_special_tokens,
padding=PaddingStrategy.DO_NOT_PAD.value, # 在后续批处理中进行填充
truncation=truncation_strategy.value,
max_length=max_length,
stride=stride,
pad_to_multiple_of=None, # 在后续批处理中进行填充
return_attention_mask=False, # 在后续批处理中进行填充
return_token_type_ids=return_token_type_ids,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_length=return_length,
return_tensors=None, # 最终将整个批次转换为张量
prepend_batch_axis=False,
verbose=verbose,
)
for key, value in outputs.items():
if key not in batch_outputs:
batch_outputs[key] = []
batch_outputs[key].append(value)
batch_outputs = self.pad(
batch_outputs,
padding=padding_strategy.value,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
return_attention_mask=return_attention_mask,
)
batch_outputs = BatchEncoding(batch_outputs, tensor_type=return_tensors)
return batch_outputs
# 从 transformers.models.bert.tokenization_bert.BertTokenizer._convert_token_to_id 复制过来的
def _convert_token_to_id(self, token):
"""使用词汇表将一个标记(字符串)转换为对应的 id。"""
return self.vocab.get(token, self.vocab.get(self.unk_token))
# 使用 shape vocab 将给定的 token 转换成对应的 shape_id
def _convert_token_to_shape_id(self, token):
"""Converts a token (str) in an shape_id using the shape vocab."""
return self.word_shape.get(token, self.word_shape.get(self.unk_token))
# 将一组 token 转换成对应的 shape_ids 列表或单个 shape_id
def convert_tokens_to_shape_ids(self, tokens: Union[str, List[str]]) -> Union[int, List[int]]:
if tokens is None:
return None
ids = []
for token in tokens:
ids.append(self._convert_token_to_shape_id(token))
return ids
# 使用 pronunciation vocab 将给定的 token 转换成对应的 pronunciation_id
def _convert_token_to_pronunciation_id(self, token):
"""Converts a token (str) in an shape_id using the shape vocab."""
return self.word_pronunciation.get(token, self.word_pronunciation.get(self.unk_token))
# 将一组 token 转换成对应的 pronunciation_ids 列表或单个 pronunciation_id
def convert_tokens_to_pronunciation_ids(self, tokens: Union[str, List[str]]) -> Union[int, List[int]]:
if tokens is None:
return None
ids = []
for token in tokens:
ids.append(self._convert_token_to_pronunciation_id(token))
return ids
# 从词汇表中将给定的 index 转换成对应的 token
# 这里的词汇表是 ids_to_tokens 字典
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
return self.ids_to_tokens.get(index, self.unk_token)
# 将一组 token 序列转换成单个字符串,去除特殊标记 "##"
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (string) in a single string."""
out_string = " ".join(tokens).replace(" ##", "").strip()
return out_string
# 构建包含特殊 token 的模型输入序列,用于序列分类任务
# 可能是单个序列 `[CLS] X [SEP]` 或者序列对 `[CLS] A [SEP] B [SEP]`
def build_inputs_with_special_tokens(
self,
token_ids_0: List[int],
token_ids_1: Optional[List[int]] = None,
cls_token_id: int = None,
sep_token_id: int = None,
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. A BERT sequence has the following format:
- single sequence: `[CLS] X [SEP]`
- pair of sequences: `[CLS] A [SEP] B [SEP]`
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [input IDs](../glossary
"""
cls = [self.cls_token_id] if cls_token_id is None else [cls_token_id]
sep = [self.sep_token_id] if sep_token_id is None else [sep_token_id]
if token_ids_1 is None:
return cls + token_ids_0 + sep
return cls + token_ids_0 + sep + token_ids_1 + sep
# 获取特殊 token 的掩码
# 由于该函数被省略了,文档中提到的函数为 transformers.models.bert.tokenization_bert.BertTokenizer.get_special_tokens_mask
# 检查是否已经包含特殊标记,如果是,则调用父类方法返回特殊标记掩码
if already_has_special_tokens:
return super().get_special_tokens_mask(
token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
)
# 如果有第二个序列token_ids_1,则返回带有特殊标记的掩码列表
if token_ids_1 is not None:
return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
# 否则,只返回第一个序列token_ids_0带有特殊标记的掩码列表
return [1] + ([0] * len(token_ids_0)) + [1]
# 从给定的序列中创建token type IDs,用于序列对分类任务
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. A BERT sequence
pair mask has the following format:
```
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | second sequence |
```
If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [token type IDs](../glossary
"""
sep = [self.sep_token_id] # 分隔符token的ID列表
cls = [self.cls_token_id] # 类别标记token的ID列表
if token_ids_1 is None:
# 如果没有第二个序列,返回仅包含第一个序列的token type IDs列表(都是0)
return len(cls + token_ids_0 + sep) * [0]
# 否则,返回包含两个序列的token type IDs列表,第一个序列为0,第二个序列为1
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
# 将词汇表和相关文件保存到指定的目录中,并返回文件路径的元组
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str, str, str]:
index = 0 # 初始化索引变量
if os.path.isdir(save_directory): # 检查保存目录是否存在
# 构建词汇表文件的路径,如果有前缀,则添加前缀,否则直接使用文件名
vocab_file = os.path.join(
save_directory,
(filename_prefix + "-" if filename_prefix else "") + self.vocab_files_names["vocab_file"],
)
# 构建词形文件的路径,同上
word_shape_file = os.path.join(
save_directory,
(filename_prefix + "-" if filename_prefix else "") + self.vocab_files_names["word_shape_file"],
)
# 构建发音文件的路径,同上
word_pronunciation_file = os.path.join(
save_directory,
(filename_prefix + "-" if filename_prefix else "") + self.vocab_files_names["word_pronunciation_file"],
)
else:
# 如果目录不存在,则抛出值错误
raise ValueError(
f"Can't find a directory at path '{save_directory}'. To load the vocabulary from a Google "
"pretrained model use `tokenizer = RoCBertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
)
# 打开并写入词汇表文件
with open(vocab_file, "w", encoding="utf-8") as writer:
# 遍历词汇表中的每个单词及其索引
for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
# 检查索引是否连续,若不连续则记录警告
if index != token_index:
logger.warning(
f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
" Please check that the vocabulary is not corrupted!"
)
index = token_index
# 写入单词到文件中,并换行
writer.write(token + "\n")
index += 1
# 打开并写入词形文件,以 JSON 格式存储
with open(word_shape_file, "w", encoding="utf8") as writer:
json.dump(self.word_shape, writer, ensure_ascii=False, indent=4, separators=(", ", ": "))
# 打开并写入发音文件,以 JSON 格式存储
with open(word_pronunciation_file, "w", encoding="utf8") as writer:
json.dump(self.word_pronunciation, writer, ensure_ascii=False, indent=4, separators=(", ", ": "))
# 返回保存的文件路径的元组
return (
vocab_file,
word_shape_file,
word_pronunciation_file,
)
# 从 transformers.models.bert.tokenization_bert.BasicTokenizer 复制的 RoCBertBasicTokenizer 类定义,
# 将 BasicTokenizer 更名为 RoCBertBasicTokenizer
class RoCBertBasicTokenizer(object):
"""
构建一个 RoCBertBasicTokenizer 类,用于执行基本的分词(如标点符号分割、转换为小写等)。
Args:
do_lower_case (`bool`, *optional*, defaults to `True`):
是否在分词时将输入转换为小写。
never_split (`Iterable`, *optional*):
在分词时不会被拆分的标记集合。仅在 `do_basic_tokenize=True` 时有效。
tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
是否对中文字符进行分词。
对于日文,这个选项应该关闭(参见此 [issue](https://github.com/huggingface/transformers/issues/328))。
strip_accents (`bool`, *optional*):
是否去除所有的重音符号。如果未指定此选项,则将由 `lowercase` 的值确定(与原始 BERT 一样)。
do_split_on_punc (`bool`, *optional*, defaults to `True`):
在某些情况下,我们希望跳过基本的标点符号分割,以便后续的分词可以捕获单词的完整上下文,例如缩略词。
"""
def __init__(
self,
do_lower_case=True,
never_split=None,
tokenize_chinese_chars=True,
strip_accents=None,
do_split_on_punc=True,
):
if never_split is None:
never_split = []
self.do_lower_case = do_lower_case # 是否将输入转换为小写
self.never_split = set(never_split) # 在分词时不拆分的标记集合
self.tokenize_chinese_chars = tokenize_chinese_chars # 是否对中文字符进行分词
self.strip_accents = strip_accents # 是否去除所有重音符号
self.do_split_on_punc = do_split_on_punc # 是否在基本标点符号上进行分割
# Tokenize a piece of text using basic tokenization rules.
def tokenize(self, text, never_split=None):
"""
Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
Args:
never_split (`List[str]`, *optional*)
Kept for backward compatibility purposes. Now implemented directly at the base class level (see
[`PreTrainedTokenizer.tokenize`]) List of token not to split.
"""
# Create a set of tokens that should never be split during tokenization.
never_split = self.never_split.union(set(never_split)) if never_split else self.never_split
# Clean the input text.
text = self._clean_text(text)
# Handle Chinese character tokenization if enabled.
if self.tokenize_chinese_chars:
text = self._tokenize_chinese_chars(text)
# Normalize the text to Unicode NFC form to ensure uniform representation of characters.
unicode_normalized_text = unicodedata.normalize("NFC", text)
# Tokenize the text on whitespace.
orig_tokens = whitespace_tokenize(unicode_normalized_text)
split_tokens = []
# Iterate over each token and process according to tokenizer settings.
for token in orig_tokens:
if token not in never_split:
# Lowercase the token if required by the tokenizer settings.
if self.do_lower_case:
token = token.lower()
# Strip accents from the token if specified.
if self.strip_accents is not False:
token = self._run_strip_accents(token)
elif self.strip_accents:
token = self._run_strip_accents(token)
# Split the token on punctuation marks while respecting tokens in never_split.
split_tokens.extend(self._run_split_on_punc(token, never_split))
# Tokenize the final output tokens on whitespace and return them.
output_tokens = whitespace_tokenize(" ".join(split_tokens))
return output_tokens
# Remove accents (diacritical marks) from a piece of text.
def _run_strip_accents(self, text):
"""Strips accents from a piece of text."""
# Normalize the text to Unicode NFD form to decompose characters and accents.
text = unicodedata.normalize("NFD", text)
output = []
# Iterate over each character in the normalized text.
for char in text:
# Check the Unicode category of the character; "Mn" denotes a nonspacing mark (accents).
cat = unicodedata.category(char)
if cat == "Mn":
continue # Skip combining characters (accents).
output.append(char)
# Join the processed characters back into a string without accents.
return "".join(output)
def _run_split_on_punc(self, text, never_split=None):
"""Splits punctuation on a piece of text."""
# 如果不需要在标点符号处分割,或者文本在 never_split 中,直接返回文本列表
if not self.do_split_on_punc or (never_split is not None and text in never_split):
return [text]
# 将文本转换为字符列表
chars = list(text)
i = 0
start_new_word = True
output = []
# 遍历字符列表
while i < len(chars):
char = chars[i]
# 如果是标点符号,则将其作为新的单独列表项添加到 output 中
if _is_punctuation(char):
output.append([char])
start_new_word = True
else:
# 如果不是标点符号,根据 start_new_word 的状态判断是否开始一个新的单词
if start_new_word:
output.append([])
start_new_word = False
# 将字符添加到当前的最后一个列表项中
output[-1].append(char)
i += 1
# 将列表中的列表项连接成字符串并返回
return ["".join(x) for x in output]
def _tokenize_chinese_chars(self, text):
"""Adds whitespace around any CJK character."""
# 初始化输出列表
output = []
# 遍历文本中的每个字符
for char in text:
cp = ord(char)
# 如果字符是中日韩字符,则在其前后添加空格并加入输出列表
if self._is_chinese_char(cp):
output.append(" ")
output.append(char)
output.append(" ")
else:
# 否则直接加入输出列表
output.append(char)
# 将输出列表连接成字符串并返回
return "".join(output)
def _is_chinese_char(self, cp):
"""Checks whether CP is the codepoint of a CJK character."""
# 检查给定的码点是否在中日韩字符的 Unicode 块范围内
if (
(cp >= 0x4E00 and cp <= 0x9FFF)
or (cp >= 0x3400 and cp <= 0x4DBF) #
or (cp >= 0x20000 and cp <= 0x2A6DF) #
or (cp >= 0x2A700 and cp <= 0x2B73F) #
or (cp >= 0x2B740 and cp <= 0x2B81F) #
or (cp >= 0x2B820 and cp <= 0x2CEAF) #
or (cp >= 0xF900 and cp <= 0xFAFF)
or (cp >= 0x2F800 and cp <= 0x2FA1F) #
): #
return True
return False
def _clean_text(self, text):
"""Performs invalid character removal and whitespace cleanup on text."""
# 初始化输出列表
output = []
# 遍历文本中的每个字符
for char in text:
cp = ord(char)
# 如果字符是无效字符或控制字符,跳过
if cp == 0 or cp == 0xFFFD or _is_control(char):
continue
# 如果字符是空白字符,则替换为单个空格;否则直接加入输出列表
if _is_whitespace(char):
output.append(" ")
else:
output.append(char)
# 将输出列表连接成字符串并返回
return "".join(output)
# Copied from transformers.models.bert.tokenization_bert.WordpieceTokenizer with WordpieceTokenizer->RoCBertWordpieceTokenizer
class RoCBertWordpieceTokenizer(object):
"""Runs WordPiece tokenization."""
def __init__(self, vocab, unk_token, max_input_chars_per_word=100):
# 初始化方法,接受词汇表、未知token以及每个单词的最大字符数作为参数
self.vocab = vocab
self.unk_token = unk_token
self.max_input_chars_per_word = max_input_chars_per_word
def tokenize(self, text):
"""
Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
tokenization using the given vocabulary.
For example, `input = "unaffable"` wil return as output `["un", "##aff", "##able"]`.
Args:
text: A single token or whitespace separated tokens. This should have
already been passed through *BasicTokenizer*.
Returns:
A list of wordpiece tokens.
"""
# 将文本分词为 wordpiece tokens 的方法
output_tokens = []
for token in whitespace_tokenize(text):
chars = list(token)
if len(chars) > self.max_input_chars_per_word:
# 如果单词长度超过设定的最大字符数,将其标记为未知token
output_tokens.append(self.unk_token)
continue
is_bad = False
start = 0
sub_tokens = []
while start < len(chars):
end = len(chars)
cur_substr = None
while start < end:
substr = "".join(chars[start:end])
if start > 0:
substr = "##" + substr
if substr in self.vocab:
cur_substr = substr
break
end -= 1
if cur_substr is None:
is_bad = True
break
sub_tokens.append(cur_substr)
start = end
if is_bad:
# 如果无法找到匹配的子串,则将整个单词标记为未知token
output_tokens.append(self.unk_token)
else:
output_tokens.extend(sub_tokens)
return output_tokens
.\models\roc_bert\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_tokenizers_available, is_torch_available
_import_structure = {
"configuration_roc_bert": ["ROC_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "RoCBertConfig"],
"tokenization_roc_bert": ["RoCBertTokenizer"],
}
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
pass
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_roc_bert"] = [
"ROC_BERT_PRETRAINED_MODEL_ARCHIVE_LIST",
"RoCBertForCausalLM",
"RoCBertForMaskedLM",
"RoCBertForMultipleChoice",
"RoCBertForPreTraining",
"RoCBertForQuestionAnswering",
"RoCBertForSequenceClassification",
"RoCBertForTokenClassification",
"RoCBertLayer",
"RoCBertModel",
"RoCBertPreTrainedModel",
"load_tf_weights_in_roc_bert",
]
if TYPE_CHECKING:
from .configuration_roc_bert import ROC_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, RoCBertConfig
from .tokenization_roc_bert import RoCBertTokenizer
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
raise OptionalDependencyNotAvailable()
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_roc_bert import (
ROC_BERT_PRETRAINED_MODEL_ARCHIVE_LIST,
RoCBertForCausalLM,
RoCBertForMaskedLM,
RoCBertForMultipleChoice,
RoCBertForPreTraining,
RoCBertForQuestionAnswering,
RoCBertForSequenceClassification,
RoCBertForTokenClassification,
RoCBertLayer,
RoCBertModel,
RoCBertPreTrainedModel,
load_tf_weights_in_roc_bert,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\roformer\configuration_roformer.py
class RoFormerConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`RoFormerModel`]. It is used to instantiate an
RoFormer model according to the specified arguments, defining the model architecture. Instantiating a configuration
with the defaults will yield a similar configuration to that of the RoFormer
[junnyu/roformer_chinese_base](https://huggingface.co/junnyu/roformer_chinese_base) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
# 定义 RoFormer 模型的配置类,用于配置模型的各种参数
Args:
vocab_size (`int`, *optional*, defaults to 50000):
RoFormer 模型的词汇表大小,定义了可以由输入 `inputs_ids` 表示的不同 token 数量。
embedding_size (`int`, *optional*, defaults to None):
编码器层和池化层的维度。如果未提供,则默认为 `hidden_size`。
hidden_size (`int`, *optional*, defaults to 768):
编码器层和池化层的维度。
num_hidden_layers (`int`, *optional*, defaults to 12):
Transformer 编码器中隐藏层的数量。
num_attention_heads (`int`, *optional*, defaults to 12):
Transformer 编码器中每个注意力层的注意力头数。
intermediate_size (`int`, *optional*, defaults to 3072):
Transformer 编码器中 "intermediate"(即前馈)层的维度。
hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
编码器和池化器中的非线性激活函数(函数或字符串)。支持 `"gelu"`, `"relu"`, `"selu"` 和 `"gelu_new"`。
hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
嵌入层、编码器和池化器中所有全连接层的 dropout 概率。
attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
注意力概率的 dropout 比例。
max_position_embeddings (`int`, *optional*, defaults to 1536):
模型可能使用的最大序列长度。通常设置为一个较大的值(例如 512、1024 或 1536)。
type_vocab_size (`int`, *optional*, defaults to 2):
调用 [`RoFormerModel`] 或 [`TFRoFormerModel`] 时传递的 `token_type_ids` 的词汇表大小。
initializer_range (`float`, *optional*, defaults to 0.02):
初始化所有权重矩阵的截断正态初始化器的标准差。
layer_norm_eps (`float`, *optional*, defaults to 1e-12):
层归一化层使用的 epsilon。
is_decoder (`bool`, *optional*, defaults to `False`):
模型是否用作解码器。如果为 `False`,则模型用作编码器。
use_cache (`bool`, *optional*, defaults to `True`):
模型是否应返回最后的键/值注意力(不是所有模型都使用)。仅在 `config.is_decoder=True` 时相关。
rotary_value (`bool`, *optional*, defaults to `False`):
是否在值层应用旋转位置嵌入。
# 初始化一个 RoFormer 风格的配置对象
configuration = RoFormerConfig()
# 使用 RoFormer 配置对象初始化一个模型,模型的权重是随机初始化的
model = RoFormerModel(configuration)
# 获取模型的配置信息
configuration = model.config
# 定义 RoFormer 模型在 ONNX 格式中的配置类,继承自 OnnxConfig 类
class RoFormerOnnxConfig(OnnxConfig):
# 定义 inputs 属性,返回一个映射,将字符串映射到一个映射(字典),其键为整数,值为字符串
@property
def inputs(self) -> Mapping[str, Mapping[int, str]]:
# 如果任务类型为多选题
if self.task == "multiple-choice":
# 动态轴定义,将 0 对应到 "batch",1 对应到 "choice",2 对应到 "sequence"
dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"}
else:
# 否则,动态轴定义,将 0 对应到 "batch",1 对应到 "sequence"
dynamic_axis = {0: "batch", 1: "sequence"}
# 重新赋值动态轴定义,将 0 对应到 "batch",1 对应到 "sequence"
dynamic_axis = {0: "batch", 1: "sequence"}
# 返回有序字典,包含输入名称到动态轴定义的映射
return OrderedDict(
[
("input_ids", dynamic_axis), # 输入名称 "input_ids" 对应动态轴定义
("attention_mask", dynamic_axis), # 输入名称 "attention_mask" 对应动态轴定义
("token_type_ids", dynamic_axis), # 输入名称 "token_type_ids" 对应动态轴定义
]
)
.\models\roformer\convert_roformer_original_tf_checkpoint_to_pytorch.py
"""Convert RoFormer checkpoint."""
import argparse
import torch
from transformers import RoFormerConfig, RoFormerForMaskedLM, load_tf_weights_in_roformer
from transformers.utils import logging
logging.set_verbosity_info()
def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path):
config = RoFormerConfig.from_json_file(bert_config_file)
print(f"Building PyTorch model from configuration: {config}")
model = RoFormerForMaskedLM(config)
load_tf_weights_in_roformer(model, config, tf_checkpoint_path)
print(f"Save PyTorch model to {pytorch_dump_path}")
torch.save(model.state_dict(), pytorch_dump_path, _use_new_zipfile_serialization=False)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path."
)
parser.add_argument(
"--bert_config_file",
default=None,
type=str,
required=True,
help=(
"The config json file corresponding to the pre-trained BERT model. \n"
"This specifies the model architecture."
),
)
parser.add_argument(
"--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
)
args = parser.parse_args()
convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.bert_config_file, args.pytorch_dump_path)
.\models\roformer\modeling_flax_roformer.py
from typing import Callable, Optional, Tuple
import flax.linen as nn
import jax
import jax.numpy as jnp
import numpy as np
from flax.core.frozen_dict import FrozenDict, freeze, unfreeze
from flax.linen.attention import dot_product_attention_weights
from flax.traverse_util import flatten_dict, unflatten_dict
from jax import lax
from ...modeling_flax_outputs import (
FlaxBaseModelOutput,
FlaxMaskedLMOutput,
FlaxMultipleChoiceModelOutput,
FlaxQuestionAnsweringModelOutput,
FlaxSequenceClassifierOutput,
FlaxTokenClassifierOutput,
)
from ...modeling_flax_utils import ACT2FN, FlaxPreTrainedModel, append_call_sample_docstring, overwrite_call_docstring
from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging
from .configuration_roformer import RoFormerConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "junnyu/roformer_chinese_base"
_CONFIG_FOR_DOC = "RoFormerConfig"
FLAX_ROFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [
"junnyu/roformer_chinese_small",
"junnyu/roformer_chinese_base",
"junnyu/roformer_chinese_char_small",
"junnyu/roformer_chinese_char_base",
"junnyu/roformer_small_discriminator",
"junnyu/roformer_small_generator",
]
ROFORMER_START_DOCSTRING = r"""
This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading, saving and converting weights from PyTorch models)
This model is also a
[flax.linen.Module](https://flax.readthedocs.io/en/latest/api_reference/flax.linen/module.html) subclass. Use it as
a regular Flax linen Module and refer to the Flax documentation for all matter related to general usage and
behavior.
Finally, this model supports inherent JAX features such as:
- [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
- [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
- [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
- [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)
# Parameters:
# config ([`RoFormerConfig`]): 模型配置类,包含模型的所有参数。
# 初始化时使用配置文件不会加载与模型关联的权重,只加载配置信息。
# 可查看 [`~FlaxPreTrainedModel.from_pretrained`] 方法来加载模型权重。
# dtype (`jax.numpy.dtype`, *optional*, 默认为 `jax.numpy.float32`):
# 计算的数据类型。可以是 `jax.numpy.float32`、`jax.numpy.float16`(在GPU上)、`jax.numpy.bfloat16`(在TPU上)之一。
#
# 这可以用于在GPU或TPU上启用混合精度训练或半精度推断。如果指定了dtype,则所有计算将使用指定的数据类型。
#
# **注意,这只指定了计算的数据类型,并不影响模型参数的数据类型。**
#
# 如果想要更改模型参数的数据类型,请参见 [`~FlaxPreTrainedModel.to_fp16`] 和 [`~FlaxPreTrainedModel.to_bf16`]。
"""
ROFORMER_INPUTS_DOCSTRING = r"""
Args:
input_ids (`numpy.ndarray` of shape `({0})`):
Indices of input sequence tokens in the vocabulary.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`numpy.ndarray` of shape `({0})`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
token_type_ids (`numpy.ndarray` of shape `({0})`, *optional*):
Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
1]`:
- 0 corresponds to a *sentence A* token,
- 1 corresponds to a *sentence B* token.
[What are token type IDs?](../glossary#token-type-ids)
position_ids (`numpy.ndarray` of shape `({0})`, *optional*):
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
config.max_position_embeddings - 1]`.
head_mask (`numpy.ndarray` of shape `({0})`, `optional):
Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
def create_sinusoidal_positions(n_pos, dim):
position_enc = np.array([[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for pos in range(n_pos)])
sentinel = dim // 2 + dim % 2
out = np.zeros_like(position_enc)
out[:, 0:sentinel] = np.sin(position_enc[:, 0::2])
out[:, sentinel:] = np.cos(position_enc[:, 1::2])
return jnp.array(out)
class FlaxRoFormerEmbeddings(nn.Module):
"""Construct the embeddings from word and token_type embeddings."""
config: RoFormerConfig
dtype: jnp.dtype = jnp.float32
def setup(self):
self.word_embeddings = nn.Embed(
self.config.vocab_size,
self.config.hidden_size,
embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
)
self.token_type_embeddings = nn.Embed(
self.config.type_vocab_size,
self.config.hidden_size,
embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
)
self.LayerNorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype)
self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob)
def __call__(self, input_ids, token_type_ids, attention_mask, deterministic: bool = True):
inputs_embeds = self.word_embeddings(input_ids.astype("i4"))
token_type_embeddings = self.token_type_embeddings(token_type_ids.astype("i4"))
hidden_states = inputs_embeds + token_type_embeddings
hidden_states = self.LayerNorm(hidden_states)
hidden_states = self.dropout(hidden_states, deterministic=deterministic)
return hidden_states
class FlaxRoFormerSelfAttention(nn.Module):
config: RoFormerConfig
dtype: jnp.dtype = jnp.float32
def setup(self) -> None:
if self.config.hidden_size % self.config.num_attention_heads != 0:
raise ValueError(
"`config.hidden_size`: {self.config.hidden_size} has to be a multiple of `config.num_attention_heads` "
" : {self.config.num_attention_heads}"
)
self.query = nn.Dense(
self.config.hidden_size,
dtype=self.dtype,
kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
)
self.key = nn.Dense(
self.config.hidden_size,
dtype=self.dtype,
kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
)
self.value = nn.Dense(
self.config.hidden_size,
dtype=self.dtype,
kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
)
self.rotary_value = self.config.rotary_value
def __call__(
self,
hidden_states,
attention_mask,
sinusoidal_pos,
layer_head_mask,
deterministic=True,
output_attentions: bool = False,
):
head_dim = self.config.hidden_size // self.config.num_attention_heads
query_states = self.query(hidden_states).reshape(
hidden_states.shape[:2] + (self.config.num_attention_heads, head_dim)
)
value_states = self.value(hidden_states).reshape(
hidden_states.shape[:2] + (self.config.num_attention_heads, head_dim)
)
key_states = self.key(hidden_states).reshape(
hidden_states.shape[:2] + (self.config.num_attention_heads, head_dim)
)
if sinusoidal_pos is not None:
if self.rotary_value:
query_states, key_states, value_states = self.apply_rotary_position_embeddings(
sinusoidal_pos, query_states, key_states, value_states
)
else:
query_states, key_states = self.apply_rotary_position_embeddings(
sinusoidal_pos, query_states, key_states
)
if attention_mask is not None:
attention_mask = jnp.expand_dims(attention_mask, axis=(-3, -2))
attention_bias = lax.select(
attention_mask > 0,
jnp.full(attention_mask.shape, 0.0).astype(self.dtype),
jnp.full(attention_mask.shape, jnp.finfo(self.dtype).min).astype(self.dtype),
)
else:
attention_bias = None
dropout_rng = None
if not deterministic and self.config.attention_probs_dropout_prob > 0.0:
dropout_rng = self.make_rng("dropout")
attn_weights = dot_product_attention_weights(
query_states,
key_states,
bias=attention_bias,
dropout_rng=dropout_rng,
dropout_rate=self.config.attention_probs_dropout_prob,
broadcast_dropout=True,
deterministic=deterministic,
dtype=self.dtype,
precision=None,
)
if layer_head_mask is not None:
attn_weights = jnp.einsum("...hqk,h->...hqk", attn_weights, layer_head_mask)
attn_output = jnp.einsum("...hqk,...khd->...qhd", attn_weights, value_states)
attn_output = attn_output.reshape(attn_output.shape[:2] + (-1,))
outputs = (attn_output, attn_weights) if output_attentions else (attn_output,)
return outputs
@staticmethod
sin, cos = sinusoidal_pos.split(2, axis=-1)
sin_pos = jnp.stack([sin, sin], axis=-1).reshape(sinusoidal_pos.shape)
cos_pos = jnp.stack([cos, cos], axis=-1).reshape(sinusoidal_pos.shape)
def rotate_layer(layer, sin_pos, cos_pos):
rotate_half_layer = jnp.stack([-layer[..., 1::2], layer[..., ::2]], axis=-1).reshape(layer.shape)
rotary_matrix_cos = jnp.einsum("bslh,...sh->bslh", layer, cos_pos)
rotary_matrix_sin = jnp.einsum("bslh,...sh->bslh", rotate_half_layer, sin_pos)
return rotary_matrix_cos + rotary_matrix_sin
query_layer = rotate_layer(query_layer, sin_pos, cos_pos)
key_layer = rotate_layer(key_layer, sin_pos, cos_pos)
if value_layer is not None:
value_layer = rotate_layer(value_layer, sin_pos, cos_pos)
return query_layer, key_layer, value_layer
return query_layer, key_layer
class FlaxRoFormerSelfOutput(nn.Module):
config: RoFormerConfig
dtype: jnp.dtype = jnp.float32
def setup(self):
self.dense = nn.Dense(
self.config.hidden_size,
kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
dtype=self.dtype,
)
self.LayerNorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype)
self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob)
def __call__(self, hidden_states, input_tensor, deterministic: bool = True):
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states, deterministic=deterministic)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
class FlaxRoFormerAttention(nn.Module):
config: RoFormerConfig
dtype: jnp.dtype = jnp.float32
def setup(self):
self.self = FlaxRoFormerSelfAttention(self.config, dtype=self.dtype)
self.output = FlaxRoFormerSelfOutput(self.config, dtype=self.dtype)
def __call__(
self,
hidden_states,
attention_mask,
sinusoidal_pos,
layer_head_mask,
deterministic=True,
output_attentions: bool = False,
):
attn_outputs = self.self(
hidden_states,
attention_mask,
sinusoidal_pos,
layer_head_mask=layer_head_mask,
deterministic=deterministic,
output_attentions=output_attentions,
)
attn_output = attn_outputs[0]
hidden_states = self.output(attn_output, hidden_states, deterministic=deterministic)
outputs = (hidden_states,)
if output_attentions:
outputs += (attn_outputs[1],)
return outputs
class FlaxRoFormerIntermediate(nn.Module):
config: RoFormerConfig
dtype: jnp.dtype = jnp.float32
def setup(self):
self.dense = nn.Dense(
self.config.intermediate_size,
kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
dtype=self.dtype,
)
self.activation = ACT2FN[self.config.hidden_act]
def __call__(self, hidden_states):
hidden_states = self.dense(hidden_states)
hidden_states = self.activation(hidden_states)
return hidden_states
class FlaxRoFormerOutput(nn.Module):
config: RoFormerConfig
dtype: jnp.dtype = jnp.float32
def setup(self):
self.dense = nn.Dense(
self.config.hidden_size,
kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
dtype=self.dtype,
)
self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob)
self.LayerNorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype)
def __call__(self, hidden_states, attention_output, deterministic: bool = True):
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states, deterministic=deterministic)
hidden_states = self.LayerNorm(hidden_states + attention_output)
return hidden_states
class FlaxRoFormerLayer(nn.Module):
config: RoFormerConfig
dtype: jnp.dtype = jnp.float32
def setup(self):
self.attention = FlaxRoFormerAttention(self.config, dtype=self.dtype)
self.intermediate = FlaxRoFormerIntermediate(self.config, dtype=self.dtype)
self.output = FlaxRoFormerOutput(self.config, dtype=self.dtype)
def __call__(
self,
hidden_states,
attention_mask,
sinusiodal_pos,
layer_head_mask,
deterministic: bool = True,
output_attentions: bool = False,
):
attention_outputs = self.attention(
hidden_states,
attention_mask,
sinusiodal_pos,
layer_head_mask=layer_head_mask,
deterministic=deterministic,
output_attentions=output_attentions,
)
attention_output = attention_outputs[0]
hidden_states = self.intermediate(attention_output)
hidden_states = self.output(hidden_states, attention_output, deterministic=deterministic)
outputs = (hidden_states,)
if output_attentions:
outputs += (attention_outputs[1],)
return outputs
class FlaxRoFormerLayerCollection(nn.Module):
config: RoFormerConfig
dtype: jnp.dtype = jnp.float32
def setup(self):
self.layers = [
FlaxRoFormerLayer(self.config, name=str(i), dtype=self.dtype) for i in range(self.config.num_hidden_layers)
]
def __call__(
self,
hidden_states,
attention_mask,
sinusoidal_pos,
head_mask,
deterministic: bool = True,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
):
all_attentions = () if output_attentions else None
all_hidden_states = () if output_hidden_states else None
if head_mask is not None:
if head_mask.shape[0] != (len(self.layers)):
raise ValueError(
f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.shape[0]}."
)
for i, layer in enumerate(self.layers):
if output_hidden_states:
all_hidden_states += (hidden_states,)
layer_outputs = layer(
hidden_states,
attention_mask,
sinusoidal_pos,
layer_head_mask=head_mask[i] if head_mask is not None else None,
deterministic=deterministic,
output_attentions=output_attentions,
)
hidden_states = layer_outputs[0]
if output_attentions:
all_attentions += (layer_outputs[1],)
if output_hidden_states:
all_hidden_states += (hidden_states,)
outputs = (hidden_states,)
if not return_dict:
return tuple(v for v in outputs if v is not None)
return FlaxBaseModelOutput(
last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions
)
class FlaxRoFormerEncoder(nn.Module):
config: RoFormerConfig
dtype: jnp.dtype = jnp.float32
def setup(self):
self.embed_positions = create_sinusoidal_positions(
self.config.max_position_embeddings, self.config.hidden_size // self.config.num_attention_heads
)
self.layer = FlaxRoFormerLayerCollection(self.config, dtype=self.dtype)
def __call__(
self,
hidden_states,
attention_mask,
head_mask,
deterministic: bool = True,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
):
sinusoidal_pos = self.embed_positions[: hidden_states.shape[1], :]
return self.layer(
hidden_states,
attention_mask,
sinusoidal_pos,
head_mask,
deterministic=deterministic,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
class FlaxRoFormerPredictionHeadTransform(nn.Module):
config: RoFormerConfig
dtype: jnp.dtype = jnp.float32
def setup(self):
self.dense = nn.Dense(self.config.hidden_size, dtype=self.dtype)
self.activation = ACT2FN[self.config.hidden_act]
self.LayerNorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype)
def __call__(self, hidden_states):
hidden_states = self.dense(hidden_states)
hidden_states = self.activation(hidden_states)
return self.LayerNorm(hidden_states)
class FlaxRoFormerLMPredictionHead(nn.Module):
config: RoFormerConfig
dtype: jnp.dtype = jnp.float32
bias_init: Callable[..., np.ndarray] = jax.nn.initializers.zeros
def setup(self):
self.transform = FlaxRoFormerPredictionHeadTransform(self.config, dtype=self.dtype)
self.decoder = nn.Dense(self.config.vocab_size, dtype=self.dtype, use_bias=False)
self.bias = self.param("bias", self.bias_init, (self.config.vocab_size,))
def __call__(self, hidden_states, shared_embedding=None):
hidden_states = self.transform(hidden_states)
if shared_embedding is not None:
hidden_states = self.decoder.apply({"params": {"kernel": shared_embedding.T}}, hidden_states)
else:
hidden_states = self.decoder(hidden_states)
bias = jnp.asarray(self.bias, self.dtype)
hidden_states += bias
return hidden_states
class FlaxRoFormerOnlyMLMHead(nn.Module):
config: RoFormerConfig
dtype: jnp.dtype = jnp.float32
def setup(self):
self.predictions = FlaxRoFormerLMPredictionHead(self.config, dtype=self.dtype)
def __call__(self, hidden_states, shared_embedding=None):
hidden_states = self.predictions(hidden_states, shared_embedding=shared_embedding)
return hidden_states
class FlaxRoFormerClassificationHead(nn.Module):
config: RoFormerConfig
dtype: jnp.dtype = jnp.float32
def setup(self):
self.dense = nn.Dense(
self.config.hidden_size,
dtype=self.dtype,
kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
)
self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob)
self.out_proj = nn.Dense(
self.config.num_labels,
dtype=self.dtype,
kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
)
self.activation = ACT2FN[self.config.hidden_act]
def __call__(self, hidden_states, deterministic=True):
hidden_states = hidden_states[:, 0, :]
hidden_states = self.dropout(hidden_states, deterministic=deterministic)
hidden_states = self.dense(hidden_states)
hidden_states = self.activation(hidden_states)
hidden_states = self.dropout(hidden_states, deterministic=deterministic)
hidden_states = self.out_proj(hidden_states)
return hidden_states
class FlaxRoFormerPreTrainedModel(FlaxPreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = RoFormerConfig
base_model_prefix = "roformer"
module_class: nn.Module = None
def __init__(
self,
config: RoFormerConfig,
input_shape: Tuple = (1, 1),
seed: int = 0,
dtype: jnp.dtype = jnp.float32,
_do_init: bool = True,
**kwargs,
):
module = self.module_class(config=config, dtype=dtype, **kwargs)
super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init)
def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple, params: FrozenDict = None) -> FrozenDict:
input_ids = jnp.zeros(input_shape, dtype="i4")
token_type_ids = jnp.zeros_like(input_ids)
attention_mask = jnp.ones_like(input_ids)
head_mask = jnp.ones((self.config.num_hidden_layers, self.config.num_attention_heads))
params_rng, dropout_rng = jax.random.split(rng)
rngs = {"params": params_rng, "dropout": dropout_rng}
random_params = self.module.init(
rngs, input_ids, attention_mask, token_type_ids, head_mask, return_dict=False
)["params"]
if params is not None:
random_params = flatten_dict(unfreeze(random_params))
params = flatten_dict(unfreeze(params))
for missing_key in self._missing_keys:
params[missing_key] = random_params[missing_key]
self._missing_keys = set()
return freeze(unflatten_dict(params))
else:
return random_params
@add_start_docstrings_to_model_forward(ROFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
def __call__(
self,
input_ids,
attention_mask=None,
token_type_ids=None,
head_mask=None,
params: dict = None,
dropout_rng: jax.random.PRNGKey = None,
train: bool = False,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.return_dict
if token_type_ids is None:
token_type_ids = jnp.zeros_like(input_ids)
if attention_mask is None:
attention_mask = jnp.ones_like(input_ids)
if head_mask is None:
head_mask = jnp.ones((self.config.num_hidden_layers, self.config.num_attention_heads))
rngs = {}
if dropout_rng is not None:
rngs["dropout"] = dropout_rng
return self.module.apply(
{"params": params or self.params},
jnp.array(input_ids, dtype="i4"),
jnp.array(attention_mask, dtype="i4"),
jnp.array(token_type_ids, dtype="i4"),
jnp.array(head_mask, dtype="i4"),
not train,
output_attentions,
output_hidden_states,
return_dict,
rngs=rngs,
)
class FlaxRoFormerModule(nn.Module):
config: RoFormerConfig
dtype: jnp.dtype = jnp.float32
def setup(self):
self.embeddings = FlaxRoFormerEmbeddings(self.config, dtype=self.dtype)
self.encoder = FlaxRoFormerEncoder(self.config, dtype=self.dtype)
def __call__(
self,
input_ids,
attention_mask,
token_type_ids,
head_mask,
deterministic: bool = True,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
):
hidden_states = self.embeddings(input_ids, token_type_ids, attention_mask, deterministic=deterministic)
outputs = self.encoder(
hidden_states,
attention_mask,
head_mask=head_mask,
deterministic=deterministic,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
hidden_states = outputs[0]
if not return_dict:
return (hidden_states,) + outputs[1:]
return FlaxBaseModelOutput(
last_hidden_state=hidden_states,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
@add_start_docstrings(
"The bare RoFormer Model transformer outputting raw hidden-states without any specific head on top.",
ROFORMER_START_DOCSTRING,
)
class FlaxRoFormerModel(FlaxRoFormerPreTrainedModel):
module_class = FlaxRoFormerModule
append_call_sample_docstring(FlaxRoFormerModel, _CHECKPOINT_FOR_DOC, FlaxBaseModelOutput, _CONFIG_FOR_DOC)
class FlaxRoFormerForMaskedLMModule(nn.Module):
config: RoFormerConfig
dtype: jnp.dtype = jnp.float32
def setup(self):
self.roformer = FlaxRoFormerModule(config=self.config, dtype=self.dtype)
self.cls = FlaxRoFormerOnlyMLMHead(config=self.config, dtype=self.dtype)
def __call__(
self,
input_ids,
attention_mask,
token_type_ids,
head_mask,
deterministic: bool = True,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
outputs = self.roformer(
input_ids,
attention_mask,
token_type_ids,
head_mask,
deterministic=deterministic,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
hidden_states = outputs[0]
if self.config.tie_word_embeddings:
shared_embedding = self.roformer.variables["params"]["embeddings"]["word_embeddings"]["embedding"]
else:
shared_embedding = None
logits = self.cls(hidden_states, shared_embedding=shared_embedding)
if not return_dict:
return (logits,) + outputs[1:]
return FlaxMaskedLMOutput(
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
@add_start_docstrings("""RoFormer Model with a `language modeling` head on top.""", ROFORMER_START_DOCSTRING)
class FlaxRoFormerForMaskedLM(FlaxRoFormerPreTrainedModel):
module_class = FlaxRoFormerForMaskedLMModule
append_call_sample_docstring(
FlaxRoFormerForMaskedLM,
_CHECKPOINT_FOR_DOC,
FlaxMaskedLMOutput,
_CONFIG_FOR_DOC,
mask="<mask>",
)
class FlaxRoFormerForSequenceClassificationModule(nn.Module):
config: RoFormerConfig
dtype: jnp.dtype = jnp.float32
def setup(self):
self.roformer = FlaxRoFormerModule(config=self.config, dtype=self.dtype)
self.classifier = FlaxRoFormerClassificationHead(config=self.config, dtype=self.dtype)
def __call__(
self,
input_ids,
attention_mask,
token_type_ids,
head_mask,
deterministic: bool = True,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
):
outputs = self.roformer(
input_ids,
attention_mask,
token_type_ids,
head_mask,
deterministic=deterministic,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
logits = self.classifier(sequence_output, deterministic=deterministic)
if not return_dict:
return (logits,) + outputs[1:]
return FlaxSequenceClassifierOutput(
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
@add_start_docstrings(
"""
RoFormer Model transformer with a sequence classification/regression head on top (a linear layer on top of the
pooled output) e.g. for GLUE tasks.
""",
ROFORMER_START_DOCSTRING,
)
class FlaxRoFormerForSequenceClassification(FlaxRoFormerPreTrainedModel):
module_class = FlaxRoFormerForSequenceClassificationModule
append_call_sample_docstring(
FlaxRoFormerForSequenceClassification,
_CHECKPOINT_FOR_DOC,
FlaxSequenceClassifierOutput,
_CONFIG_FOR_DOC,
)
class FlaxRoFormerForMultipleChoiceModule(nn.Module):
config: RoFormerConfig
dtype: jnp.dtype = jnp.float32
def setup(self):
self.roformer = FlaxRoFormerModule(config=self.config, dtype=self.dtype)
self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob)
self.classifier = nn.Dense(1, dtype=self.dtype)
def __call__(
self,
input_ids,
attention_mask,
token_type_ids,
head_mask,
deterministic: bool = True,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
):
):
num_choices = input_ids.shape[1]
input_ids = input_ids.reshape(-1, input_ids.shape[-1])
attention_mask = attention_mask.reshape(-1, attention_mask.shape[-1])
token_type_ids = token_type_ids.reshape(-1, token_type_ids.shape[-1])
outputs = self.roformer(
input_ids,
attention_mask,
token_type_ids,
head_mask,
deterministic=deterministic,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
hidden_states = outputs[0]
pooled_output = hidden_states[:, -1]
pooled_output = self.dropout(pooled_output, deterministic=deterministic)
logits = self.classifier(pooled_output)
reshaped_logits = logits.reshape(-1, num_choices)
if not return_dict:
return (reshaped_logits,) + outputs[2:]
return FlaxMultipleChoiceModelOutput(
logits=reshaped_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
@add_start_docstrings(
"""
RoFormer Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
softmax) e.g. for RocStories/SWAG tasks.
""",
ROFORMER_START_DOCSTRING,
)
class FlaxRoFormerForMultipleChoice(FlaxRoFormerPreTrainedModel):
module_class = FlaxRoFormerForMultipleChoiceModule
overwrite_call_docstring(
FlaxRoFormerForMultipleChoice, ROFORMER_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")
)
append_call_sample_docstring(
FlaxRoFormerForMultipleChoice,
_CHECKPOINT_FOR_DOC,
FlaxMultipleChoiceModelOutput,
_CONFIG_FOR_DOC,
)
class FlaxRoFormerForTokenClassificationModule(nn.Module):
config: RoFormerConfig
dtype: jnp.dtype = jnp.float32
def setup(self):
self.roformer = FlaxRoFormerModule(config=self.config, dtype=self.dtype)
self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob)
self.classifier = nn.Dense(self.config.num_labels, dtype=self.dtype)
def __call__(
self,
input_ids,
attention_mask,
token_type_ids,
head_mask,
deterministic: bool = True,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
):
outputs = self.roformer(
input_ids,
attention_mask,
token_type_ids,
head_mask,
deterministic=deterministic,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
hidden_states = outputs[0]
hidden_states = self.dropout(hidden_states, deterministic=deterministic)
logits = self.classifier(hidden_states)
if not return_dict:
return (logits,) + outputs[1:]
return FlaxTokenClassifierOutput(
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
@add_start_docstrings(
"""
RoFormer Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
Named-Entity-Recognition (NER) tasks.
""",
ROFORMER_START_DOCSTRING,
)
class FlaxRoFormerForTokenClassification(FlaxRoFormerPreTrainedModel):
module_class = FlaxRoFormerForTokenClassificationModule
append_call_sample_docstring(
FlaxRoFormerForTokenClassification,
_CHECKPOINT_FOR_DOC,
FlaxTokenClassifierOutput,
_CONFIG_FOR_DOC,
)
class FlaxRoFormerForQuestionAnsweringModule(nn.Module):
config: RoFormerConfig
dtype: jnp.dtype = jnp.float32
def setup(self):
self.roformer = FlaxRoFormerModule(config=self.config, dtype=self.dtype)
self.qa_outputs = nn.Dense(self.config.num_labels, dtype=self.dtype)
def __call__(
self,
input_ids,
attention_mask,
token_type_ids,
head_mask,
deterministic: bool = True,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
):
outputs = self.roformer(
input_ids,
attention_mask,
token_type_ids,
head_mask,
deterministic=deterministic,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
hidden_states = outputs[0]
logits = self.qa_outputs(hidden_states)
start_logits, end_logits = logits.split(self.config.num_labels, axis=-1)
start_logits = start_logits.squeeze(-1)
end_logits = end_logits.squeeze(-1)
if not return_dict:
return (start_logits, end_logits) + outputs[1:]
return FlaxQuestionAnsweringModelOutput(
start_logits=start_logits,
end_logits=end_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
@add_start_docstrings(
"""
RoFormer Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
ROFORMER_START_DOCSTRING,
)
class FlaxRoFormerForQuestionAnswering(FlaxRoFormerPreTrainedModel):
module_class = FlaxRoFormerForQuestionAnsweringModule
append_call_sample_docstring(
FlaxRoFormerForQuestionAnswering,
_CHECKPOINT_FOR_DOC,
FlaxQuestionAnsweringModelOutput,
_CONFIG_FOR_DOC,
)
.\models\roformer\modeling_roformer.py
""" PyTorch RoFormer model."""
import math
import os
from typing import Optional, Tuple, Union
import numpy as np
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN
from ...modeling_outputs import (
BaseModelOutputWithPastAndCrossAttentions,
CausalLMOutputWithCrossAttentions,
MaskedLMOutput,
MultipleChoiceModelOutput,
QuestionAnsweringModelOutput,
SequenceClassifierOutput,
TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel, SequenceSummary
from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import (
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_roformer import RoFormerConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "junnyu/roformer_chinese_base"
_CONFIG_FOR_DOC = "RoFormerConfig"
ROFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [
"junnyu/roformer_chinese_small",
"junnyu/roformer_chinese_base",
"junnyu/roformer_chinese_char_small",
"junnyu/roformer_chinese_char_base",
"junnyu/roformer_small_discriminator",
"junnyu/roformer_small_generator",
]
class RoFormerSinusoidalPositionalEmbedding(nn.Embedding):
"""This module produces sinusoidal positional embeddings of any length."""
def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None) -> None:
super().__init__(num_positions, embedding_dim)
self.weight = self._init_weight(self.weight)
@staticmethod
def _init_weight(out: nn.Parameter) -> nn.Parameter:
"""
初始化权重矩阵,类似于 XLM 的 create_sinusoidal_embeddings 函数,但特征未交错。
余弦特征位于向量的后半部分。[dim // 2:]
"""
n_pos, dim = out.shape
position_enc = np.array(
[[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for pos in range(n_pos)]
)
out.requires_grad = False
sentinel = dim // 2 if dim % 2 == 0 else (dim // 2) + 1
out[:, 0:sentinel] = torch.FloatTensor(np.sin(position_enc[:, 0::2]))
out[:, sentinel:] = torch.FloatTensor(np.cos(position_enc[:, 1::2]))
out.detach_()
return out
@torch.no_grad()
def forward(self, input_ids_shape: torch.Size, past_key_values_length: int = 0) -> torch.Tensor:
"""`input_ids_shape` 期望为 [bsz x seqlen]。"""
bsz, seq_len = input_ids_shape[:2]
positions = torch.arange(
past_key_values_length, past_key_values_length + seq_len, dtype=torch.long, device=self.weight.device
)
return super().forward(positions)
def load_tf_weights_in_roformer(model, config, tf_checkpoint_path):
"""Load tf checkpoints in a pytorch model."""
try:
import re
import numpy as np
import tensorflow as tf
except ImportError:
logger.error(
"Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
"https://www.tensorflow.org/install/ for installation instructions."
)
raise
tf_path = os.path.abspath(tf_checkpoint_path)
logger.info(f"Converting TensorFlow checkpoint from {tf_path}")
init_vars = tf.train.list_variables(tf_path)
names = []
arrays = []
for name, shape in init_vars:
logger.info(f"Loading TF weight {name} with shape {shape}")
array = tf.train.load_variable(tf_path, name)
names.append(name.replace("bert", "roformer"))
arrays.append(array)
for name, array in zip(names, arrays):
name = name.split("/")
if any(
n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"]
for n in name
):
logger.info(f"Skipping {'/'.join(name)}")
continue
pointer = model
for m_name in name:
if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
scope_names = re.split(r"_(\d+)", m_name)
else:
scope_names = [m_name]
if scope_names[0] == "kernel" or scope_names[0] == "gamma":
pointer = getattr(pointer, "weight")
elif scope_names[0] == "output_bias" or scope_names[0] == "beta":
pointer = getattr(pointer, "bias")
elif scope_names[0] == "output_weights":
pointer = getattr(pointer, "weight")
elif scope_names[0] == "squad":
pointer = getattr(pointer, "classifier")
else:
try:
pointer = getattr(pointer, scope_names[0])
except AttributeError:
logger.info(f"Skipping {'/'.join(name)}")
continue
if len(scope_names) >= 2:
num = int(scope_names[1])
pointer = pointer[num]
if m_name[-11:] == "_embeddings":
pointer = getattr(pointer, "weight")
elif m_name == "kernel":
array = np.transpose(array)
try:
if not pointer.shape == array.shape:
raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched")
except AssertionError as e:
e.args += (pointer.shape, array.shape)
raise
logger.info(f"Initialize PyTorch weight {name}")
pointer.data = torch.from_numpy(array)
return model
class RoFormerSelfAttention(nn.Module):
def __init__(self, config):
super().__init__()
if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
raise ValueError(
f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
f"heads ({config.num_attention_heads})"
)
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.query = nn.Linear(config.hidden_size, self.all_head_size)
self.key = nn.Linear(config.hidden_size, self.all_head_size)
self.value = nn.Linear(config.hidden_size, self.all_head_size)
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
self.is_decoder = config.is_decoder
self.rotary_value = config.rotary_value
def transpose_for_scores(self, x):
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
x = x.view(*new_x_shape)
return x.permute(0, 2, 1, 3)
def forward(
self,
hidden_states,
attention_mask=None,
sinusoidal_pos=None,
head_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
past_key_value=None,
output_attentions=False,
):
if input_ids is not None:
input_shape = input_ids.size()
else:
input_shape = inputs_embeds.size()[:-1]
if inputs_embeds is None:
inputs_embeds = self.word_embeddings(input_ids)
if token_type_ids is None:
token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=inputs_embeds.device)
token_type_embeddings = self.token_type_embeddings(token_type_ids)
embeddings = inputs_embeds + token_type_embeddings
embeddings = self.LayerNorm(embeddings)
embeddings = self.dropout(embeddings)
return embeddings
def apply_rotary_position_embeddings(sinusoidal_pos, query_layer, key_layer, value_layer=None):
sin, cos = sinusoidal_pos.chunk(2, dim=-1)
sin_pos = torch.stack([sin, sin], dim=-1).reshape_as(sinusoidal_pos)
cos_pos = torch.stack([cos, cos], dim=-1).reshape_as(sinusoidal_pos)
rotate_half_query_layer = torch.stack([-query_layer[..., 1::2], query_layer[..., ::2]], dim=-1).reshape_as(
query_layer
)
query_layer = query_layer * cos_pos + rotate_half_query_layer * sin_pos
rotate_half_key_layer = torch.stack([-key_layer[..., 1::2], key_layer[..., ::2]], dim=-1).reshape_as(key_layer)
key_layer = key_layer * cos_pos + rotate_half_key_layer * sin_pos
if value_layer is not None:
rotate_half_value_layer = torch.stack([-value_layer[..., 1::2], value_layer[..., ::2]], dim=-1).reshape_as(
value_layer
)
value_layer = value_layer * cos_pos + rotate_half_value_layer * sin_pos
return query_layer, key_layer, value_layer
return query_layer, key_layer
class RoFormerSelfOutput(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
class RoFormerAttention(nn.Module):
def __init__(self, config):
super().__init__()
self.self = RoFormerSelfAttention(config)
self.output = RoFormerSelfOutput(config)
self.pruned_heads = set()
def prune_heads(self, heads):
if len(heads) == 0:
return
heads, index = find_pruneable_heads_and_indices(
heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
)
self.self.query = prune_linear_layer(self.self.query, index)
self.self.key = prune_linear_layer(self.self.key, index)
self.self.value = prune_linear_layer(self.self.value, index)
self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
self.pruned_heads = self.pruned_heads.union(heads)
def forward(
self,
hidden_states,
attention_mask=None,
sinusoidal_pos=None,
head_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
past_key_value=None,
output_attentions=False,
):
self_outputs = self.self(
hidden_states,
attention_mask,
sinusoidal_pos,
head_mask,
encoder_hidden_states,
encoder_attention_mask,
past_key_value,
output_attentions,
)
attention_output = self.output(self_outputs[0], hidden_states)
outputs = (attention_output,) + self_outputs[1:]
return outputs
class RoFormerIntermediate(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states
class RoFormerOutput(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
class RoFormerLayer(nn.Module):
def __init__(self, config):
super().__init__()
self.chunk_size_feed_forward = config.chunk_size_feed_forward
self.seq_len_dim = 1
self.attention = RoFormerAttention(config)
self.is_decoder = config.is_decoder
self.add_cross_attention = config.add_cross_attention
if self.add_cross_attention:
if not self.is_decoder:
raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
self.crossattention = RoFormerAttention(config)
self.intermediate = RoFormerIntermediate(config)
self.output = RoFormerOutput(config)
def forward(
self,
hidden_states,
attention_mask=None,
sinusoidal_pos=None,
head_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
past_key_value=None,
output_attentions=False,
self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
self_attention_outputs = self.attention(
hidden_states,
attention_mask,
sinusoidal_pos,
head_mask,
output_attentions=output_attentions,
past_key_value=self_attn_past_key_value,
)
attention_output = self_attention_outputs[0]
if self.is_decoder:
outputs = self_attention_outputs[1:-1]
present_key_value = self_attention_outputs[-1]
else:
outputs = self_attention_outputs[1:]
cross_attn_present_key_value = None
if self.is_decoder and encoder_hidden_states is not None:
if not hasattr(self, "crossattention"):
raise ValueError(
f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention "
"layers by setting `config.add_cross_attention=True`"
)
cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
cross_attention_outputs = self.crossattention(
attention_output,
attention_mask,
sinusoidal_pos,
head_mask,
encoder_hidden_states,
encoder_attention_mask,
cross_attn_past_key_value,
output_attentions,
)
attention_output = cross_attention_outputs[0]
outputs = outputs + cross_attention_outputs[1:-1]
cross_attn_present_key_value = cross_attention_outputs[-1]
present_key_value = present_key_value + cross_attn_present_key_value
layer_output = apply_chunking_to_forward(
self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
)
outputs = (layer_output,) + outputs
if self.is_decoder:
outputs = outputs + (present_key_value,)
return outputs
def feed_forward_chunk(self, attention_output):
intermediate_output = self.intermediate(attention_output)
layer_output = self.output(intermediate_output, attention_output)
return layer_output
class RoFormerEncoder(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
self.embed_positions = RoFormerSinusoidalPositionalEmbedding(
config.max_position_embeddings, config.hidden_size // config.num_attention_heads
)
self.layer = nn.ModuleList([RoFormerLayer(config) for _ in range(config.num_hidden_layers)])
self.gradient_checkpointing = False
def forward(
self,
hidden_states,
attention_mask=None,
head_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
past_key_values=None,
use_cache=None,
output_attentions=False,
output_hidden_states=False,
return_dict=True,
):
pass
class RoFormerPredictionHeadTransform(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.embedding_size)
if isinstance(config.hidden_act, str):
self.transform_act_fn = ACT2FN[config.hidden_act]
else:
self.transform_act_fn = config.hidden_act
self.LayerNorm = nn.LayerNorm(config.embedding_size, eps=config.layer_norm_eps)
def forward(self, hidden_states):
hidden_states = self.dense(hidden_states)
hidden_states = self.transform_act_fn(hidden_states)
hidden_states = self.LayerNorm(hidden_states)
return hidden_states
class RoFormerLMPredictionHead(nn.Module):
def __init__(self, config):
super().__init__()
self.transform = RoFormerPredictionHeadTransform(config)
self.decoder = nn.Linear(config.embedding_size, config.vocab_size, bias=False)
self.bias = nn.Parameter(torch.zeros(config.vocab_size))
self.decoder.bias = self.bias
def forward(self, hidden_states):
hidden_states = self.transform(hidden_states)
hidden_states = self.decoder(hidden_states)
return hidden_states
class RoFormerOnlyMLMHead(nn.Module):
def __init__(self, config):
super().__init__()
self.predictions = RoFormerLMPredictionHead(config)
def forward(self, sequence_output: torch.Tensor) -> torch.Tensor:
prediction_scores = self.predictions(sequence_output)
return prediction_scores
class RoFormerPreTrainedModel(PreTrainedModel):
"""
一个抽象类,处理权重初始化、下载和加载预训练模型的简单接口。
"""
config_class = RoFormerConfig
load_tf_weights = load_tf_weights_in_roformer
base_model_prefix = "roformer"
supports_gradient_checkpointing = True
def _init_weights(self, module):
"""Initialize the weights"""
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, RoFormerSinusoidalPositionalEmbedding):
pass
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
ROFORMER_START_DOCSTRING = r"""
This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
Parameters:
config ([`RoFormerConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
ROFORMER_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `({0})`):
Indices of input sequence tokens in the vocabulary.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
1]`:
- 0 corresponds to a *sentence A* token,
- 1 corresponds to a *sentence B* token.
[What are token type IDs?](../glossary#token-type-ids)
head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
model's internal embedding lookup matrix.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@add_start_docstrings(
"""
Add model-specific documentation to the given function.
Args:
**kwargs: Keyword arguments forwarded to the function.
This decorator helps in adding standardized documentation string (`ROFORMER_START_DOCSTRING`) to functions.
"""
)
"The bare RoFormer Model transformer outputting raw hidden-states without any specific head on top.",
ROFORMER_START_DOCSTRING,
class RoFormerModel(RoFormerPreTrainedModel):
"""
模型可以作为编码器(仅自注意力)或解码器运行,当作解码器时,在自注意力层之间添加了交叉注意力层,
遵循 [Attention is all you need](https://arxiv.org/abs/1706.03762) 中描述的架构,作者为 Ashish Vaswani,
Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser 和 Illia Polosukhin.
若要作为解码器运行,模型需要用配置中的 `is_decoder` 参数初始化为 `True`。
若要在 Seq2Seq 模型中使用,模型需要用 `is_decoder` 参数和 `add_cross_attention` 参数同时初始化为 `True`;
此时预期在前向传播中输入 `encoder_hidden_states`。
"""
def __init__(self, config):
super().__init__(config)
self.config = config
self.embeddings = RoFormerEmbeddings(config)
if config.embedding_size != config.hidden_size:
self.embeddings_project = nn.Linear(config.embedding_size, config.hidden_size)
self.encoder = RoFormerEncoder(config)
self.post_init()
def get_input_embeddings(self):
return self.embeddings.word_embeddings
def set_input_embeddings(self, value):
self.embeddings.word_embeddings = value
def _prune_heads(self, heads_to_prune):
"""
剪枝模型中的注意力头。heads_to_prune: {层编号: 要在该层中剪枝的头列表} 参见基类 PreTrainedModel
"""
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
@add_start_docstrings_to_model_forward(ROFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPastAndCrossAttentions,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
_tied_weights_keys = ["cls.predictions.decoder.bias", "cls.predictions.decoder.weight"]
def __init__(self, config):
super().__init__(config)
if config.is_decoder:
logger.warning(
"If you want to use `RoFormerForMaskedLM` make sure `config.is_decoder=False` for "
"bi-directional self-attention."
)
self.roformer = RoFormerModel(config)
self.cls = RoFormerOnlyMLMHead(config)
self.post_init()
def get_output_embeddings(self):
return self.cls.predictions.decoder
def set_output_embeddings(self, new_embeddings):
self.cls.predictions.decoder = new_embeddings
@add_start_docstrings_to_model_forward(ROFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=MaskedLMOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[MaskedLMOutput, Tuple[torch.Tensor]]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.roformer(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
prediction_scores = self.cls(sequence_output)
masked_lm_loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
if not return_dict:
output = (prediction_scores,) + outputs[1:]
return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
return MaskedLMOutput(
loss=masked_lm_loss,
logits=prediction_scores,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_kwargs):
input_shape = input_ids.shape
effective_batch_size = input_shape[0]
assert self.config.pad_token_id is not None, "The PAD token should be defined for generation"
attention_mask = torch.cat([attention_mask, attention_mask.new_zeros((attention_mask.shape[0], 1))], dim=-1)
dummy_token = torch.full(
(effective_batch_size, 1), self.config.pad_token_id, dtype=torch.long, device=input_ids.device
)
input_ids = torch.cat([input_ids, dummy_token], dim=1)
return {"input_ids": input_ids, "attention_mask": attention_mask}
@add_start_docstrings(
"""RoFormer Model with a `language modeling` head on top for CLM fine-tuning.""", ROFORMER_START_DOCSTRING
)
class RoFormerForCausalLM(RoFormerPreTrainedModel):
_tied_weights_keys = ["cls.predictions.decoder.bias", "cls.predictions.decoder.weight"]
def __init__(self, config):
super().__init__(config)
if not config.is_decoder:
logger.warning("If you want to use `RoFormerForCausalLM` as a standalone, add `is_decoder=True.`")
self.roformer = RoFormerModel(config)
self.cls = RoFormerOnlyMLMHead(config)
self.post_init()
def get_output_embeddings(self):
return self.cls.predictions.decoder
def set_output_embeddings(self, new_embeddings):
self.cls.predictions.decoder = new_embeddings
@add_start_docstrings_to_model_forward(ROFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
input_shape = input_ids.shape
if attention_mask is None:
attention_mask = input_ids.new_ones(input_shape)
if past_key_values is not None:
past_length = past_key_values[0][0].shape[2]
if input_ids.shape[1] > past_length:
remove_prefix_length = past_length
else:
remove_prefix_length = input_ids.shape[1] - 1
input_ids = input_ids[:, remove_prefix_length:]
return {"input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": past_key_values}
def _reorder_cache(self, past_key_values, beam_idx):
reordered_past = ()
for layer_past in past_key_values:
reordered_past += (
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past[:2])
+ layer_past[2:],
)
return reordered_past
class RoFormerClassificationHead(nn.Module):
"""Head for sentence-level classification tasks."""
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
self.config = config
def forward(self, features, **kwargs):
x = features[:, 0, :]
x = self.dropout(x)
x = self.dense(x)
x = ACT2FN[self.config.hidden_act](x)
x = self.dropout(x)
x = self.out_proj(x)
return x
@add_start_docstrings(
"""
RoFormer Model transformer with a sequence classification/regression head on top (a linear layer on top of the
pooled output) e.g. for GLUE tasks.
""",
ROFORMER_START_DOCSTRING,
)
class RoFormerForSequenceClassification(RoFormerPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.roformer = RoFormerModel(config)
self.classifier = RoFormerClassificationHead(config)
self.post_init()
@add_start_docstrings_to_model_forward(ROFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=SequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[SequenceClassifierOutput, Tuple[torch.Tensor]]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.roformer(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
logits = self.classifier(sequence_output)
loss = None
if labels is not None:
if self.config.problem_type is None:
if self.num_labels == 1:
self.config.problem_type = "regression"
elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
self.config.problem_type = "single_label_classification"
else:
self.config.problem_type = "multi_label_classification"
if self.config.problem_type == "regression":
loss_fct = MSELoss()
if self.num_labels == 1:
loss = loss_fct(logits.squeeze(), labels.squeeze())
else:
loss = loss_fct(logits, labels)
elif self.config.problem_type == "single_label_classification":
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(logits, labels)
if not return_dict:
output = (logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
return SequenceClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
@add_start_docstrings(
"""
RoFormer Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
softmax) e.g. for RocStories/SWAG tasks.
""",
ROFORMER_START_DOCSTRING,
)
class RoFormerForMultipleChoice(RoFormerPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.roformer = RoFormerModel(config)
self.sequence_summary = SequenceSummary(config)
self.classifier = nn.Linear(config.hidden_size, 1)
self.post_init()
@add_start_docstrings_to_model_forward(
ROFORMER_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")
)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=MultipleChoiceModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
) -> Union[MultipleChoiceModelOutput, Tuple[torch.Tensor]]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
`input_ids` above)
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
inputs_embeds = (
inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
if inputs_embeds is not None
else None
)
outputs = self.roformer(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
pooled_output = self.sequence_summary(sequence_output)
logits = self.classifier(pooled_output)
reshaped_logits = logits.view(-1, num_choices)
loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
loss = loss_fct(reshaped_logits, labels)
if not return_dict:
output = (reshaped_logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
return MultipleChoiceModelOutput(
loss=loss,
logits=reshaped_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
@add_start_docstrings(
"""
RoFormer Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
Named-Entity-Recognition (NER) tasks.
""",
ROFORMER_START_DOCSTRING,
)
class RoFormerForTokenClassification(RoFormerPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.roformer = RoFormerModel(config)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.post_init()
@add_start_docstrings_to_model_forward(ROFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TokenClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[TokenClassifierOutput, Tuple[torch.Tensor]]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.roformer(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
sequence_output = self.dropout(sequence_output)
logits = self.classifier(sequence_output)
loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
if not return_dict:
output = (logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
return TokenClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
@add_start_docstrings(
"""
RoFormer Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
ROFORMER_START_DOCSTRING,
)
class RoFormerForQuestionAnswering(RoFormerPreTrainedModel):
def __init__(self, config):
super().__init__(config)
config.num_labels = 2
self.num_labels = config.num_labels
self.roformer = RoFormerModel(config)
self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
self.post_init()
@add_start_docstrings_to_model_forward(ROFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=QuestionAnsweringModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
start_positions: Optional[torch.LongTensor] = None,
end_positions: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
r"""
start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.roformer(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
logits = self.qa_outputs(sequence_output)
start_logits, end_logits = logits.split(1, dim=-1)
start_logits = start_logits.squeeze(-1)
end_logits = end_logits.squeeze(-1)
total_loss = None
if start_positions is not None and end_positions is not None:
if len(start_positions.size()) > 1:
start_positions = start_positions.squeeze(-1)
if len(end_positions.size()) > 1:
end_positions = end_positions.squeeze(-1)
ignored_index = start_logits.size(1)
start_positions = start_positions.clamp(0, ignored_index)
end_positions = end_positions.clamp(0, ignored_index)
loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
start_loss = loss_fct(start_logits, start_positions)
end_loss = loss_fct(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2
if not return_dict:
output = (start_logits, end_logits) + outputs[1:]
return ((total_loss,) + output) if total_loss is not None else output
return QuestionAnsweringModelOutput(
loss=total_loss,
start_logits=start_logits,
end_logits=end_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)