Transformers Source Code Analysis (Part 23)
.\models\bros\processing_bros.py
"""
Processor class for Bros.
"""
from typing import List, Optional, Union
from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
from ...utils import TensorType
class BrosProcessor(ProcessorMixin):
r"""
Constructs a Bros processor which wraps a BERT tokenizer.
[`BrosProcessor`] offers all the functionalities of [`BertTokenizerFast`]. See the docstring of
[`~BrosProcessor.__call__`] and [`~BrosProcessor.decode`] for more information.
Args:
tokenizer (`BertTokenizerFast`, *optional*):
            An instance of [`BertTokenizerFast`]. The tokenizer is a required input.
"""
attributes = ["tokenizer"]
tokenizer_class = ("BertTokenizer", "BertTokenizerFast")
def __init__(self, tokenizer=None, **kwargs):
if tokenizer is None:
raise ValueError("You need to specify a `tokenizer`.")
super().__init__(tokenizer)
def __call__(
self,
text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = None,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
return_tensors: Optional[Union[str, TensorType]] = None,
**kwargs,
    ) -> BatchEncoding:
        """
        This method uses [`BertTokenizerFast.__call__`] to prepare text for the model. Please refer to the docstring
        of that method for more information.

        Args:
            text (`TextInput`, `PreTokenizedInput`, `List[TextInput]`, `List[PreTokenizedInput]`, *optional*):
                The input text to process; can be a single sequence or a batch.
            add_special_tokens (`bool`, *optional*, defaults to `True`):
                Whether to add special tokens such as `[CLS]` and `[SEP]`.
            padding (`bool`, `str` or [`PaddingStrategy`], *optional*, defaults to `False`):
                Strategy for padding sequences to the same length.
            truncation (`bool`, `str` or [`TruncationStrategy`], *optional*):
                Strategy for truncating sequences to a maximum length.
            max_length (`int`, *optional*):
                Maximum length of the returned sequences after truncation and padding.
            stride (`int`, *optional*, defaults to 0):
                Stride used when splitting text into overflowing chunks during truncation.
            pad_to_multiple_of (`int`, *optional*):
                Pad all sequences to a multiple of this value.
            return_token_type_ids (`bool`, *optional*):
                Whether to return token type IDs.
            return_attention_mask (`bool`, *optional*):
                Whether to return attention masks.
            return_overflowing_tokens (`bool`, *optional*, defaults to `False`):
                Whether to return overflowing tokens that were truncated.
            return_special_tokens_mask (`bool`, *optional*, defaults to `False`):
                Whether to return a mask indicating special tokens.
            return_offsets_mapping (`bool`, *optional*, defaults to `False`):
                Whether to return offsets mapping tokenized input to the original text.
            return_length (`bool`, *optional*, defaults to `False`):
                Whether to return the length of the output sequences.
            verbose (`bool`, *optional*, defaults to `True`):
                Whether to print informative messages during processing.
            return_tensors (`str` or [`TensorType`], *optional*):
                Type of tensors to return (e.g. `'pt'` for PyTorch tensors).
            **kwargs:
                Additional keyword arguments passed to the tokenizer.

        Returns:
            [`BatchEncoding`]: The tokenized inputs together with the requested masks/tensors.
        """
encoding = self.tokenizer(
text=text,
add_special_tokens=add_special_tokens,
padding=padding,
truncation=truncation,
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_length=return_length,
verbose=verbose,
return_tensors=return_tensors,
**kwargs,
)
return encoding
def batch_decode(self, *args, **kwargs):
"""
This method forwards all its arguments to BertTokenizerFast's `~PreTrainedTokenizer.batch_decode`. Please
refer to the docstring of this method for more information.
"""
return self.tokenizer.batch_decode(*args, **kwargs)
def decode(self, *args, **kwargs):
"""
This method forwards all its arguments to BertTokenizerFast's `~PreTrainedTokenizer.decode`. Please refer to
the docstring of this method for more information.
"""
return self.tokenizer.decode(*args, **kwargs)
@property
def model_input_names(self):
tokenizer_input_names = self.tokenizer.model_input_names
return list(dict.fromkeys(tokenizer_input_names))
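# Illustration (not part of the library source): a minimal usage sketch of BrosProcessor, assuming a
# BERT-style tokenizer can be downloaded; "bert-base-uncased" is only an illustrative checkpoint name.
from transformers import BertTokenizerFast, BrosProcessor

tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
processor = BrosProcessor(tokenizer=tokenizer)

# The processor simply forwards to the wrapped tokenizer, so the output is a regular BatchEncoding.
encoding = processor("Hello world", padding="max_length", max_length=8, return_tensors="pt")
print(encoding["input_ids"].shape)   # torch.Size([1, 8])
print(processor.model_input_names)   # e.g. ['input_ids', 'token_type_ids', 'attention_mask']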
.\models\bros\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_tokenizers_available, is_torch_available
_import_structure = {
"configuration_bros": ["BROS_PRETRAINED_CONFIG_ARCHIVE_MAP", "BrosConfig"],
}
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["processing_bros"] = ["BrosProcessor"]
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_bros"] = [
"BROS_PRETRAINED_MODEL_ARCHIVE_LIST",
"BrosPreTrainedModel",
"BrosModel",
"BrosForTokenClassification",
"BrosSpadeEEForTokenClassification",
"BrosSpadeELForTokenClassification",
]
if TYPE_CHECKING:
from .configuration_bros import BROS_PRETRAINED_CONFIG_ARCHIVE_MAP, BrosConfig
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .processing_bros import BrosProcessor
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_bros import (
BROS_PRETRAINED_MODEL_ARCHIVE_LIST,
BrosForTokenClassification,
BrosModel,
BrosPreTrainedModel,
BrosSpadeEEForTokenClassification,
BrosSpadeELForTokenClassification,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\byt5\convert_byt5_original_tf_checkpoint_to_pytorch.py
"""Convert T5 checkpoint."""
import argparse
from transformers import T5Config, T5ForConditionalGeneration, load_tf_weights_in_t5
from transformers.utils import logging
logging.set_verbosity_info()
def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path):
config = T5Config.from_json_file(config_file)
print(f"Building PyTorch model from configuration: {config}")
model = T5ForConditionalGeneration(config)
load_tf_weights_in_t5(model, config, tf_checkpoint_path)
print(f"Save PyTorch model to {pytorch_dump_path}")
model.save_pretrained(pytorch_dump_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path."
)
parser.add_argument(
"--config_file",
default=None,
type=str,
required=True,
help=(
"The config json file corresponding to the pre-trained T5 model. \nThis specifies the model architecture."
),
)
parser.add_argument(
"--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
)
args = parser.parse_args()
convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.config_file, args.pytorch_dump_path)
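# Illustration (not part of the library source): an equivalent command-line invocation of the script
# above. The paths below are placeholders, not real files.
#
#   python convert_byt5_original_tf_checkpoint_to_pytorch.py \
#       --tf_checkpoint_path /path/to/byt5/model.ckpt \
#       --config_file /path/to/byt5/config.json \
#       --pytorch_dump_path /path/to/pytorch_dump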
.\models\byt5\tokenization_byt5.py
import warnings
from typing import List, Optional, Tuple
from ...tokenization_utils import AddedToken, PreTrainedTokenizer
class ByT5Tokenizer(PreTrainedTokenizer):
model_input_names = ["input_ids", "attention_mask"]
def __init__(
self,
eos_token="</s>",
unk_token="<unk>",
pad_token="<pad>",
extra_ids=125,
additional_special_tokens=None,
**kwargs,
) -> None:
if extra_ids > 0 and additional_special_tokens is None:
additional_special_tokens = [f"<extra_id_{i}>" for i in range(extra_ids)]
elif extra_ids > 0 and additional_special_tokens is not None and len(additional_special_tokens) > 0:
extra_tokens = len(set(filter(lambda x: bool("extra_id" in str(x)), additional_special_tokens)))
if extra_tokens != extra_ids:
raise ValueError(
f"Both extra_ids ({extra_ids}) and additional_special_tokens ({additional_special_tokens}) are"
" provided to ByT5Tokenizer. In this case the additional_special_tokens must include the"
" extra_ids tokens"
)
pad_token = AddedToken(pad_token, lstrip=True, rstrip=True) if isinstance(pad_token, str) else pad_token
eos_token = AddedToken(eos_token, lstrip=True, rstrip=True) if isinstance(eos_token, str) else eos_token
unk_token = AddedToken(unk_token, lstrip=True, rstrip=True) if isinstance(unk_token, str) else unk_token
self._added_tokens_decoder = {0: pad_token, 1: eos_token, 2: unk_token}
self.offset = len(self._added_tokens_decoder)
self._utf_vocab_size = 2**8
super().__init__(
eos_token=eos_token,
unk_token=unk_token,
pad_token=pad_token,
extra_ids=0,
additional_special_tokens=additional_special_tokens,
**kwargs,
)
@property
def vocab_size(self):
return self._utf_vocab_size
def get_vocab(self):
vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size + self.offset)}
vocab.update(self.added_tokens_encoder)
return vocab
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` method.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
return super().get_special_tokens_mask(
token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
)
if token_ids_1 is None:
return ([0] * len(token_ids_0)) + [1]
else:
return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
def _add_eos_if_not_present(self, token_ids: List[int]) -> List[int]:
"""Do not add eos again if user already added it."""
if len(token_ids) > 0 and token_ids[-1] == self.eos_token_id:
warnings.warn(
f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated"
" eos tokens being added."
)
return token_ids
else:
return token_ids + [self.eos_token_id]
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. ByT5 does not
make use of token type ids, therefore a list of zeros is returned.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of zeros.
"""
eos = [self.eos_token_id]
if token_ids_1 is None:
return len(token_ids_0 + eos) * [0]
else:
return len(token_ids_0 + eos + token_ids_1 + eos) * [0]
    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
        and adding special tokens. A sequence has the following format:
        - single sequence: `X </s>`
        - pair of sequences: `A </s> B </s>`
        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
        Returns:
            `List[int]`: List of input IDs with the appropriate special tokens.
        """
token_ids_0 = self._add_eos_if_not_present(token_ids_0)
if token_ids_1 is None:
return token_ids_0
else:
token_ids_1 = self._add_eos_if_not_present(token_ids_1)
return token_ids_0 + token_ids_1
def _tokenize(self, text: str) -> List[str]:
"""Take as input a string and return a list of strings (tokens) for words/sub-words"""
tokens = [chr(i) for i in text.encode("utf-8")]
return tokens
def _convert_token_to_id(self, token):
"""Converts a token (str) into an ID using the vocabulary."""
if len(token) != 1:
token_id = None
else:
token_id = ord(token) + self.offset
return token_id
def _convert_id_to_token(self, index):
"""Converts an index (integer) into a token (str) using the vocabulary."""
token = chr(index - self.offset)
return token
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (strings) into a single string."""
bstring = b""
for token in tokens:
if token in self.added_tokens_decoder:
tok_string = self.added_tokens_decoder[token].encode("utf-8")
elif token in self.added_tokens_encoder:
tok_string = token.encode("utf-8")
else:
tok_string = bytes([ord(token)])
bstring += tok_string
string = bstring.decode("utf-8", errors="ignore")
return string
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
return ()
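# Illustration (not part of the library source): the byte-level scheme above maps every UTF-8 byte to an
# id shifted by `offset` (3 special tokens: pad=0, eos=1, unk=2), so no vocabulary file is needed.
# A minimal round-trip sketch, independent of the tokenizer class:
text = "Héllo"
byte_tokens = [chr(b) for b in text.encode("utf-8")]      # what _tokenize produces
ids = [ord(tok) + 3 for tok in byte_tokens]               # _convert_token_to_id with offset=3
recovered = bytes(i - 3 for i in ids).decode("utf-8")     # inverse of _convert_id_to_token + join
assert recovered == text
print(byte_tokens)  # ['H', 'Ã', '©', 'l', 'l', 'o'] -- 'é' becomes two bytes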
.\models\byt5\__init__.py
from typing import TYPE_CHECKING
from ...utils import _LazyModule
_import_structure = {"tokenization_byt5": ["ByT5Tokenizer"]}
if TYPE_CHECKING:
from .tokenization_byt5 import ByT5Tokenizer
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\camembert\configuration_camembert.py
""" CamemBERT configuration"""
from collections import OrderedDict
from typing import Mapping
from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfig
from ...utils import logging
logger = logging.get_logger(__name__)
CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"almanach/camembert-base": "https://huggingface.co/almanach/camembert-base/resolve/main/config.json",
"umberto-commoncrawl-cased-v1": (
"https://huggingface.co/Musixmatch/umberto-commoncrawl-cased-v1/resolve/main/config.json"
),
"umberto-wikipedia-uncased-v1": (
"https://huggingface.co/Musixmatch/umberto-wikipedia-uncased-v1/resolve/main/config.json"
),
}
class CamembertConfig(PretrainedConfig):
"""
This is the configuration class to store the configuration of a [`CamembertModel`] or a [`TFCamembertModel`]. It is
used to instantiate a Camembert model according to the specified arguments, defining the model architecture.
Instantiating a configuration with the defaults will yield a similar configuration to that of the Camembert
[almanach/camembert-base](https://huggingface.co/almanach/camembert-base) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Example:
```
>>> from transformers import CamembertConfig, CamembertModel
>>> # Initializing a Camembert almanach/camembert-base style configuration
>>> configuration = CamembertConfig()
>>> # Initializing a model (with random weights) from the almanach/camembert-base style configuration
>>> model = CamembertModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
model_type = "camembert"
def __init__(
self,
vocab_size=30522,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=2,
initializer_range=0.02,
layer_norm_eps=1e-12,
pad_token_id=1,
bos_token_id=0,
eos_token_id=2,
position_embedding_type="absolute",
use_cache=True,
classifier_dropout=None,
**kwargs,
):
super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.hidden_act = hidden_act
self.intermediate_size = intermediate_size
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.type_vocab_size = type_vocab_size
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.position_embedding_type = position_embedding_type
self.use_cache = use_cache
self.classifier_dropout = classifier_dropout
class CamembertOnnxConfig(OnnxConfig):
@property
def inputs(self) -> Mapping[str, Mapping[int, str]]:
if self.task == "multiple-choice":
dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"}
else:
dynamic_axis = {0: "batch", 1: "sequence"}
return OrderedDict(
[
("input_ids", dynamic_axis),
("attention_mask", dynamic_axis),
]
)
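# Illustration (not part of the library source, assuming the ONNX export utilities import cleanly):
# the dynamic axes CamembertOnnxConfig declares for the default export task; for the "multiple-choice"
# task a `choice` axis is inserted at position 1 instead.
onnx_config = CamembertOnnxConfig(CamembertConfig())
print(onnx_config.inputs)
# OrderedDict([('input_ids', {0: 'batch', 1: 'sequence'}),
#              ('attention_mask', {0: 'batch', 1: 'sequence'})])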
.\models\camembert\modeling_camembert.py
"""PyTorch CamemBERT 模型。"""
import math
from typing import List, Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN, gelu
from ...modeling_outputs import (
BaseModelOutputWithPastAndCrossAttentions,
BaseModelOutputWithPoolingAndCrossAttentions,
CausalLMOutputWithCrossAttentions,
MaskedLMOutput,
MultipleChoiceModelOutput,
QuestionAnsweringModelOutput,
SequenceClassifierOutput,
TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import (
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_camembert import CamembertConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "almanach/camembert-base"
_CONFIG_FOR_DOC = "CamembertConfig"
CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"almanach/camembert-base",
"Musixmatch/umberto-commoncrawl-cased-v1",
"Musixmatch/umberto-wikipedia-uncased-v1",
]
CAMEMBERT_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`CamembertConfig`]): Model configuration class with all the parameters of the
model. Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
class CamembertEmbeddings(nn.Module):
"""
Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
"""
def __init__(self, config):
super().__init__()
self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
self.register_buffer(
"token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
)
self.padding_idx = config.pad_token_id
self.position_embeddings = nn.Embedding(
config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
)
def forward(
self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0
):
        if position_ids is None:
            if input_ids is not None:
                # Create the position ids from the input token ids. Any padded tokens remain padded.
                position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length)
            else:
                position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)
if input_ids is not None:
input_shape = input_ids.size()
else:
input_shape = inputs_embeds.size()[:-1]
seq_length = input_shape[1]
if token_type_ids is None:
if hasattr(self, "token_type_ids"):
buffered_token_type_ids = self.token_type_ids[:, :seq_length]
buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
token_type_ids = buffered_token_type_ids_expanded
else:
token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
if inputs_embeds is None:
inputs_embeds = self.word_embeddings(input_ids)
token_type_embeddings = self.token_type_embeddings(token_type_ids)
embeddings = inputs_embeds + token_type_embeddings
if self.position_embedding_type == "absolute":
position_embeddings = self.position_embeddings(position_ids)
embeddings += position_embeddings
embeddings = self.LayerNorm(embeddings)
embeddings = self.dropout(embeddings)
return embeddings
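# Illustration (not part of the class above): `create_position_ids_from_input_ids` (defined at module
# level in this file, as in modeling_roberta) builds padding-aware position ids; this is the "tiny tweak"
# mentioned in the CamembertEmbeddings docstring. Roughly:
def _create_position_ids_sketch(input_ids, padding_idx, past_key_values_length=0):
    # Non-padding symbols get consecutive positions starting at padding_idx + 1; padding stays at padding_idx.
    mask = input_ids.ne(padding_idx).int()
    incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
    return incremental_indices.long() + padding_idx

# e.g. with padding_idx=1: input_ids [[5, 6, 1, 1]] -> position ids [[2, 3, 1, 1]]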
class CamembertSelfAttention(nn.Module):
def __init__(self, config, position_embedding_type=None):
super().__init__()
if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
raise ValueError(
f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
f"heads ({config.num_attention_heads})"
)
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.query = nn.Linear(config.hidden_size, self.all_head_size)
self.key = nn.Linear(config.hidden_size, self.all_head_size)
self.value = nn.Linear(config.hidden_size, self.all_head_size)
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
self.position_embedding_type = position_embedding_type or getattr(
config, "position_embedding_type", "absolute"
)
if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
self.max_position_embeddings = config.max_position_embeddings
self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)
self.is_decoder = config.is_decoder
def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
x = x.view(new_x_shape)
return x.permute(0, 2, 1, 3)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        # The full attention computation is omitted from this excerpt; see the sketch after CamembertAttention below.
        ...
class CamembertSelfOutput(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
class CamembertAttention(nn.Module):
def __init__(self, config, position_embedding_type=None):
super().__init__()
self.self = CamembertSelfAttention(config, position_embedding_type=position_embedding_type)
self.output = CamembertSelfOutput(config)
self.pruned_heads = set()
def prune_heads(self, heads):
if len(heads) == 0:
return
heads, index = find_pruneable_heads_and_indices(
heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
)
self.self.query = prune_linear_layer(self.self.query, index)
self.self.key = prune_linear_layer(self.self.key, index)
self.self.value = prune_linear_layer(self.self.value, index)
self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
self.pruned_heads = self.pruned_heads.union(heads)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor]:
self_outputs = self.self(
hidden_states,
attention_mask,
head_mask,
encoder_hidden_states,
encoder_attention_mask,
past_key_value,
output_attentions,
)
attention_output = self.output(self_outputs[0], hidden_states)
outputs = (attention_output,) + self_outputs[1:]
return outputs
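# Illustration (not part of the library source): the body of CamembertSelfAttention.forward (elided in
# this excerpt) computes standard multi-head scaled dot-product attention. A simplified sketch ignoring
# caching, cross-attention, head masks and relative position embeddings:
def _self_attention_sketch(self_attn, hidden_states, attention_mask=None):
    query = self_attn.transpose_for_scores(self_attn.query(hidden_states))  # (batch, heads, seq, head_size)
    key = self_attn.transpose_for_scores(self_attn.key(hidden_states))
    value = self_attn.transpose_for_scores(self_attn.value(hidden_states))
    scores = torch.matmul(query, key.transpose(-1, -2)) / math.sqrt(self_attn.attention_head_size)
    if attention_mask is not None:  # additive mask: large negative values on padded positions
        scores = scores + attention_mask
    probs = self_attn.dropout(nn.functional.softmax(scores, dim=-1))
    context = torch.matmul(probs, value).permute(0, 2, 1, 3).contiguous()
    return context.view(context.size()[:-2] + (self_attn.all_head_size,))  # back to (batch, seq, hidden)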
class CamembertIntermediate(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states
class CamembertOutput(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
class CamembertLayer(nn.Module):
def __init__(self, config):
super().__init__()
self.chunk_size_feed_forward = config.chunk_size_feed_forward
self.seq_len_dim = 1
self.attention = CamembertAttention(config)
self.is_decoder = config.is_decoder
self.add_cross_attention = config.add_cross_attention
if self.add_cross_attention:
if not self.is_decoder:
raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
self.crossattention = CamembertAttention(config, position_embedding_type="absolute")
self.intermediate = CamembertIntermediate(config)
self.output = CamembertOutput(config)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor]:
self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
self_attention_outputs = self.attention(
hidden_states,
attention_mask,
head_mask,
output_attentions=output_attentions,
past_key_value=self_attn_past_key_value,
)
attention_output = self_attention_outputs[0]
if self.is_decoder:
outputs = self_attention_outputs[1:-1]
present_key_value = self_attention_outputs[-1]
else:
outputs = self_attention_outputs[1:]
cross_attn_present_key_value = None
if self.is_decoder and encoder_hidden_states is not None:
if not hasattr(self, "crossattention"):
raise ValueError(
f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers"
" by setting `config.add_cross_attention=True`"
)
cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
cross_attention_outputs = self.crossattention(
attention_output,
attention_mask,
head_mask,
encoder_hidden_states,
encoder_attention_mask,
cross_attn_past_key_value,
output_attentions,
)
attention_output = cross_attention_outputs[0]
outputs = outputs + cross_attention_outputs[1:-1]
cross_attn_present_key_value = cross_attention_outputs[-1]
present_key_value = present_key_value + cross_attn_present_key_value
layer_output = apply_chunking_to_forward(
self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
)
outputs = (layer_output,) + outputs
if self.is_decoder:
outputs = outputs + (present_key_value,)
return outputs
def feed_forward_chunk(self, attention_output):
intermediate_output = self.intermediate(attention_output)
layer_output = self.output(intermediate_output, attention_output)
return layer_output
class CamembertEncoder(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
self.layer = nn.ModuleList([CamembertLayer(config) for _ in range(config.num_hidden_layers)])
self.gradient_checkpointing = False
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = False,
output_hidden_states: Optional[bool] = False,
return_dict: Optional[bool] = True,
) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
all_hidden_states = () if output_hidden_states else None
all_self_attentions = () if output_attentions else None
all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
if self.gradient_checkpointing and self.training:
if use_cache:
logger.warning_once(
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
)
use_cache = False
next_decoder_cache = () if use_cache else None
for i, layer_module in enumerate(self.layer):
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
layer_head_mask = head_mask[i] if head_mask is not None else None
past_key_value = past_key_values[i] if past_key_values is not None else None
if self.gradient_checkpointing and self.training:
layer_outputs = self._gradient_checkpointing_func(
layer_module.__call__,
hidden_states,
attention_mask,
layer_head_mask,
encoder_hidden_states,
encoder_attention_mask,
past_key_value,
output_attentions,
)
else:
layer_outputs = layer_module(
hidden_states,
attention_mask,
layer_head_mask,
encoder_hidden_states,
encoder_attention_mask,
past_key_value,
output_attentions,
)
hidden_states = layer_outputs[0]
if use_cache:
next_decoder_cache += (layer_outputs[-1],)
if output_attentions:
all_self_attentions = all_self_attentions + (layer_outputs[1],)
if self.config.add_cross_attention:
all_cross_attentions = all_cross_attentions + (layer_outputs[2],)
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
return tuple(
v
for v in [
hidden_states,
next_decoder_cache,
all_hidden_states,
all_self_attentions,
all_cross_attentions,
]
if v is not None
)
return BaseModelOutputWithPastAndCrossAttentions(
last_hidden_state=hidden_states,
past_key_values=next_decoder_cache,
hidden_states=all_hidden_states,
attentions=all_self_attentions,
cross_attentions=all_cross_attentions,
)
class CamembertPooler(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.activation = nn.Tanh()
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
first_token_tensor = hidden_states[:, 0]
pooled_output = self.dense(first_token_tensor)
pooled_output = self.activation(pooled_output)
return pooled_output
class CamembertPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = CamembertConfig
base_model_prefix = "roberta"
supports_gradient_checkpointing = True
def _init_weights(self, module):
"""Initialize the weights"""
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
CAMEMBERT_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary.
            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.
            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
            [What are attention masks?](../glossary#attention-mask)
        token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in
            `[0, 1]`:
            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            Indices of positions of each input sequence token in the position embeddings. Selected in the range
            `[0, config.max_position_embeddings - 1]`.
            [What are position IDs?](../glossary#position-ids)
        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
            than the model's internal embedding lookup matrix.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
class CamembertClassificationHead(nn.Module):
"""用于句子级分类任务的头部模块。"""
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
classifier_dropout = (
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
self.dropout = nn.Dropout(classifier_dropout)
self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
def forward(self, features, **kwargs):
x = features[:, 0, :]
x = self.dropout(x)
x = self.dense(x)
x = torch.tanh(x)
x = self.dropout(x)
x = self.out_proj(x)
return x
class CamembertLMHead(nn.Module):
"""用于掩码语言建模的 Camembert 头部模块。"""
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.decoder = nn.Linear(config.hidden_size, config.vocab_size)
self.bias = nn.Parameter(torch.zeros(config.vocab_size))
self.decoder.bias = self.bias
def forward(self, features, **kwargs):
x = self.dense(features)
x = gelu(x)
x = self.layer_norm(x)
x = self.decoder(x)
return x
def _tie_weights(self):
if self.decoder.bias.device.type == "meta":
self.decoder.bias = self.bias
else:
self.bias = self.decoder.bias
@add_start_docstrings(
"The bare CamemBERT Model transformer outputting raw hidden-states without any specific head on top.",
CAMEMBERT_START_DOCSTRING,
)
class CamembertModel(CamembertPreTrainedModel):
"""
模型可以作为编码器(仅自注意力)或解码器使用,此时在自注意力层之间添加了一层交叉注意力层,遵循 *Attention is
all you need*_ 中描述的架构,作者是 Ashish Vaswani、Noam Shazeer、Niki Parmar、Jakob Uszkoreit、Llion
Jones、Aidan N. Gomez、Lukasz Kaiser 和 Illia Polosukhin。
要作为解码器使用,模型需要使用配置设置中的 `is_decoder` 参数初始化为 `True`。要用于 Seq2Seq 模型,
模型需要同时使用 `is_decoder` 参数和
```
"""
add_cross_attention 设置为 True;预期在前向传播中作为输入传入 encoder_hidden_states。
.. _*Attention is all you need*: https://arxiv.org/abs/1706.03762
"""
_no_split_modules = []
    # Copied from transformers.models.bert.modeling_bert.BertModel.__init__ with Bert->Camembert
def __init__(self, config, add_pooling_layer=True):
super().__init__(config)
self.config = config
        # Initialize the embeddings and the encoder
        self.embeddings = CamembertEmbeddings(config)
        self.encoder = CamembertEncoder(config)
        # Add a pooling layer only if requested
        self.pooler = CamembertPooler(config) if add_pooling_layer else None
        # Initialize weights and apply final processing
        self.post_init()
def get_input_embeddings(self):
return self.embeddings.word_embeddings
def set_input_embeddings(self, value):
self.embeddings.word_embeddings = value
def _prune_heads(self, heads_to_prune):
"""
对模型的注意力头进行修剪。heads_to_prune: {layer_num: 要在该层中修剪的头列表} 参见基类 PreTrainedModel
"""
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
@add_start_docstrings_to_model_forward(CAMEMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPoolingAndCrossAttentions,
config_class=_CONFIG_FOR_DOC,
)
    # Copied from transformers.models.bert.modeling_bert.BertModel.forward
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]:
        # The forward body is omitted from this excerpt.
        ...
# The decorator below adds docstrings to the class definition, describing it as a CamemBERT model with a language
# modeling head on top, appended to the shared CAMEMBERT_START_DOCSTRING.
@add_start_docstrings(
"""CamemBERT Model with a `language modeling` head on top.""",
CAMEMBERT_START_DOCSTRING,
)
# Copied from transformers.models.roberta.modeling_roberta.RobertaForMaskedLM with Roberta->Camembert, ROBERTA->CAMEMBERT
class CamembertForMaskedLM(CamembertPreTrainedModel):
    # The decoder weight and bias of the LM head are tied to the input embeddings.
    _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]
    def __init__(self, config):
        super().__init__(config)
        # Warn if the config is set up as a decoder: CamembertForMaskedLM expects bi-directional self-attention.
        if config.is_decoder:
            logger.warning(
                "If you want to use `CamembertForMaskedLM` make sure `config.is_decoder=False` for "
                "bi-directional self-attention."
            )
        # Instantiate the base CamembertModel without the pooling layer.
        self.roberta = CamembertModel(config, add_pooling_layer=False)
        # Language modeling head.
        self.lm_head = CamembertLMHead(config)
        # Initialize weights and apply final processing.
        self.post_init()
    # Return the output embeddings of the language modeling head.
    def get_output_embeddings(self):
        return self.lm_head.decoder
    # Replace the output embeddings of the language modeling head with new embeddings.
    def set_output_embeddings(self, new_embeddings):
        self.lm_head.decoder = new_embeddings
    # Forward pass; the decorators below attach the shared input docstring and a code sample.
@add_start_docstrings_to_model_forward(CAMEMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=MaskedLMOutput,
config_class=_CONFIG_FOR_DOC,
mask="<mask>",
expected_output="' Paris'",
expected_loss=0.1,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], MaskedLMOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
kwargs (`Dict[str, any]`, optional, defaults to *{}*):
Used to hide legacy arguments that have been deprecated.
"""
# Determine whether to use a return dictionary based on the provided argument or the default configuration
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Pass the input data through the Roberta model to obtain outputs
outputs = self.roberta(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# Retrieve the sequence output from the Roberta model's outputs
sequence_output = outputs[0]
# Generate prediction scores using the language modeling head
prediction_scores = self.lm_head(sequence_output)
# Initialize the masked language modeling loss variable
masked_lm_loss = None
# Calculate the masked language modeling loss if labels are provided
if labels is not None:
# Move labels to the device where prediction_scores tensor resides for model parallelism
labels = labels.to(prediction_scores.device)
# Define the loss function as Cross Entropy Loss
loss_fct = CrossEntropyLoss()
# Compute the masked LM loss based on prediction scores and labels
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
# If return_dict is False, prepare the output tuple with prediction scores and additional outputs
if not return_dict:
output = (prediction_scores,) + outputs[2:]
return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
# If return_dict is True, construct a MaskedLMOutput object with specific attributes
return MaskedLMOutput(
loss=masked_lm_loss,
logits=prediction_scores,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
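# Illustration (not part of the library source): a hedged fill-mask sketch with CamembertForMaskedLM,
# assuming the "almanach/camembert-base" checkpoint and its tokenizer can be downloaded.
import torch
from transformers import AutoTokenizer, CamembertForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("almanach/camembert-base")
model = CamembertForMaskedLM.from_pretrained("almanach/camembert-base")

inputs = tokenizer("La capitale de la France est <mask>.", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits  # (batch, seq_len, vocab_size)

mask_index = (inputs.input_ids == tokenizer.mask_token_id).nonzero(as_tuple=True)[1]
predicted_id = logits[0, mask_index].argmax(dim=-1)
print(tokenizer.decode(predicted_id))  # expected to be something like "Paris"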
@add_start_docstrings(
"""
CamemBERT Model transformer with a sequence classification/regression head on top (a linear layer on top of the
pooled output) e.g. for GLUE tasks.
""",
CAMEMBERT_START_DOCSTRING,
)
# Copied from transformers.models.roberta.modeling_roberta.RobertaForSequenceClassification with Roberta->Camembert, ROBERTA->CAMEMBERT
class CamembertForSequenceClassification(CamembertPreTrainedModel):
def __init__(self, config):
super().__init__(config)
        self.num_labels = config.num_labels  # Number of labels from the configuration
        self.config = config  # Keep a reference to the configuration
        self.roberta = CamembertModel(config, add_pooling_layer=False)  # Base Camembert model without pooling layer
        self.classifier = CamembertClassificationHead(config)  # Sentence-level classification head
        # Initialize weights and apply final processing
        self.post_init()
@add_start_docstrings_to_model_forward(CAMEMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint="cardiffnlp/twitter-roberta-base-emotion",
output_type=SequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
expected_output="'optimism'",
expected_loss=0.08,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
        # Use the provided return_dict if given, otherwise fall back to the model configuration
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        # Run the RoBERTa backbone
        outputs = self.roberta(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
        # Sequence output from the backbone
        sequence_output = outputs[0]
        # Compute logits with the classification head
        logits = self.classifier(sequence_output)
loss = None
if labels is not None:
            # Move labels to the same device as the logits to enable model parallelism
            labels = labels.to(logits.device)
            # Infer the problem type from num_labels and the label dtype
if self.config.problem_type is None:
if self.num_labels == 1:
self.config.problem_type = "regression"
elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
self.config.problem_type = "single_label_classification"
else:
self.config.problem_type = "multi_label_classification"
            # Pick the loss function according to the problem type
if self.config.problem_type == "regression":
loss_fct = MSELoss()
if self.num_labels == 1:
loss = loss_fct(logits.squeeze(), labels.squeeze())
else:
loss = loss_fct(logits, labels)
elif self.config.problem_type == "single_label_classification":
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(logits, labels)
        # If a plain tuple is requested, return the logits (and loss) plus the extra model outputs
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
        # Otherwise wrap everything in a SequenceClassifierOutput
return SequenceClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
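# Illustration (not part of the library source): how the loss above is dispatched. With num_labels == 1
# the head is treated as a regressor (MSE); with integer labels and num_labels > 1 it is single-label
# classification (cross entropy); with float multi-hot labels it is multi-label (BCE with logits).
# A toy sketch with hypothetical shapes:
batch_size, num_labels = 4, 3
logits = torch.randn(batch_size, num_labels)
single_label = torch.randint(0, num_labels, (batch_size,))            # -> CrossEntropyLoss
multi_label = torch.randint(0, 2, (batch_size, num_labels)).float()   # -> BCEWithLogitsLoss
print(CrossEntropyLoss()(logits.view(-1, num_labels), single_label.view(-1)))
print(BCEWithLogitsLoss()(logits, multi_label))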
# The decorator below documents this class as a CamemBERT-based multiple choice classifier, e.g. for RocStories/SWAG tasks.
@add_start_docstrings(
"""
CamemBERT Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
softmax) e.g. for RocStories/SWAG tasks.
""",
CAMEMBERT_START_DOCSTRING,
)
# Copied from transformers.models.roberta.modeling_roberta.RobertaForMultipleChoice with Roberta->Camembert, ROBERTA->CAMEMBERT
class CamembertForMultipleChoice(CamembertPreTrainedModel):
def __init__(self, config):
        # Initialize the parent class
        super().__init__(config)
        # Base Camembert model (with pooling layer)
        self.roberta = CamembertModel(config)
        # Dropout layer using hidden_dropout_prob from the configuration
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # Linear layer for multiple choice scoring: hidden_size -> 1
        self.classifier = nn.Linear(config.hidden_size, 1)
        # Initialize weights and apply final processing
        self.post_init()
@add_start_docstrings_to_model_forward(
CAMEMBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")
)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=MultipleChoiceModelOutput,
config_class=_CONFIG_FOR_DOC,
)
    # Forward pass; returns either a MultipleChoiceModelOutput or a plain tuple
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
        # Input shapes and meanings are documented by the decorators above
) -> Union[Tuple[torch.Tensor], MultipleChoiceModelOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
`input_ids` above)
"""
        # Use the provided return_dict if given, otherwise fall back to the model configuration
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        # The number of choices is the second dimension of the inputs
        num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
        # Flatten (batch_size, num_choices, ...) inputs to (batch_size * num_choices, ...)
        flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
        flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
        flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
        flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
        # Flatten inputs_embeds to (batch_size * num_choices, sequence_length, hidden_size)
        flat_inputs_embeds = (
            inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
            if inputs_embeds is not None
            else None
        )
        # Run the RoBERTa backbone on the flattened inputs
outputs = self.roberta(
flat_input_ids,
position_ids=flat_position_ids,
token_type_ids=flat_token_type_ids,
attention_mask=flat_attention_mask,
head_mask=head_mask,
inputs_embeds=flat_inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
        # Pooled output of the first token
        pooled_output = outputs[1]
        # Apply dropout to the pooled output
        pooled_output = self.dropout(pooled_output)
        # Score each flattened choice with the classifier
        logits = self.classifier(pooled_output)
        # Reshape the logits back to (batch_size, num_choices)
        reshaped_logits = logits.view(-1, num_choices)
        loss = None
        if labels is not None:
            # Move labels to the same device as the logits to enable model parallelism
            labels = labels.to(reshaped_logits.device)
            # Cross entropy over the choices
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(reshaped_logits, labels)
        # If a plain tuple is requested, return the reshaped logits plus the extra model outputs
        if not return_dict:
            output = (reshaped_logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output
        # Otherwise wrap everything in a MultipleChoiceModelOutput
return MultipleChoiceModelOutput(
loss=loss,
logits=reshaped_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
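# Illustration (not part of the library source): the flatten/reshape bookkeeping above, with toy shapes.
# Inputs of shape (batch_size, num_choices, seq_len) are flattened so the backbone sees ordinary
# (batch_size * num_choices, seq_len) sequences, and the per-sequence scores are folded back afterwards.
batch_size, num_choices, seq_len = 2, 4, 8
input_ids = torch.zeros(batch_size, num_choices, seq_len, dtype=torch.long)
flat = input_ids.view(-1, input_ids.size(-1))   # (8, 8): one row per (example, choice) pair
scores = torch.randn(flat.size(0), 1)           # classifier output: one score per flattened row
reshaped = scores.view(-1, num_choices)         # (2, 4): scores per choice, ready for CrossEntropyLoss
print(flat.shape, reshaped.shape)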
"""
CamemBERT Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g.
for Named-Entity-Recognition (NER) tasks.
"""
# 从transformers.models.roberta.modeling_roberta.RobertaForTokenClassification复制,将Roberta替换为Camembert,ROBERTA替换为CAMEMBERT
@add_start_docstrings(
"""
CamemBERT模型,顶部带有一个标记分类头(在隐藏状态输出的顶部增加了一个线性层),例如用于命名实体识别(NER)任务。
""",
CAMEMBERT_START_DOCSTRING,
)
class CamembertForTokenClassification(CamembertPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
        # Base Camembert model without the pooling layer
        self.roberta = CamembertModel(config, add_pooling_layer=False)
        # Classifier dropout rate; falls back to hidden_dropout_prob if not set
        classifier_dropout = (
            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(classifier_dropout)
        # Linear classifier mapping hidden states to the number of labels
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        # Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(CAMEMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint="Jean-Baptiste/roberta-large-ner-english",
output_type=TokenClassifierOutput,
config_class=_CONFIG_FOR_DOC,
expected_output="['O', 'ORG', 'ORG', 'O', 'O', 'O', 'O', 'O', 'LOC', 'O', 'LOC', 'LOC']",
expected_loss=0.01,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        """
        # Use the provided return_dict if given, otherwise fall back to the model configuration
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        # Run the RoBERTa backbone
        outputs = self.roberta(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
        # Per-token hidden states from the backbone
        sequence_output = outputs[0]
        # Apply dropout to the sequence output
        sequence_output = self.dropout(sequence_output)
        # Compute per-token logits with the classifier
        logits = self.classifier(sequence_output)
        loss = None
        # If labels are provided, compute the cross entropy loss
        if labels is not None:
            # Move labels to the same device as the logits to enable model parallelism
            labels = labels.to(logits.device)
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
        # If a plain tuple is requested, return the logits (and loss) plus the extra model outputs
        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output
        # Otherwise wrap loss, logits, hidden states and attentions in a TokenClassifierOutput
return TokenClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
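# Illustration (not part of the library source): shapes involved in the token classification head above.
# Each token gets its own logit vector, and label -100 is ignored by CrossEntropyLoss by default, which
# is how special or sub-word positions are usually masked out of the loss.
batch_size, seq_len, num_labels = 2, 6, 5
logits = torch.randn(batch_size, seq_len, num_labels)
labels = torch.randint(0, num_labels, (batch_size, seq_len))
labels[:, 0] = -100  # e.g. special tokens excluded from the loss
loss = CrossEntropyLoss()(logits.view(-1, num_labels), labels.view(-1))
print(loss)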
@add_start_docstrings(
"""
    CamemBERT Model with a span classification head on top for extractive question-answering tasks like SQuAD (a
    linear layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
CAMEMBERT_START_DOCSTRING,
)
# Copied from transformers.models.roberta.modeling_roberta.RobertaForQuestionAnswering with Roberta->Camembert, ROBERTA->CAMEMBERT
class CamembertForQuestionAnswering(CamembertPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
        # Base Camembert model without the pooling layer
        self.roberta = CamembertModel(config, add_pooling_layer=False)
        # Linear layer producing the span start/end logits
        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
        # Initialize weights and apply final processing
        self.post_init()
@add_start_docstrings_to_model_forward(CAMEMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint="deepset/roberta-base-squad2",
output_type=QuestionAnsweringModelOutput,
config_class=_CONFIG_FOR_DOC,
expected_output="' puppet'",
expected_loss=0.86,
)
    # Forward pass; accepts the standard inputs plus optional start/end positions
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
start_positions: Optional[torch.LongTensor] = None,
end_positions: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], QuestionAnsweringModelOutput]:
r"""
start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Run the inputs through the RoBERTa encoder
outputs = self.roberta(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# Take the sequence output from the model outputs
sequence_output = outputs[0]
# Project the sequence output to question-answering logits
logits = self.qa_outputs(sequence_output)
# Split the logits into start-position and end-position logits
start_logits, end_logits = logits.split(1, dim=-1)
start_logits = start_logits.squeeze(-1).contiguous()  # drop the trailing dimension of size 1 and make contiguous
end_logits = end_logits.squeeze(-1).contiguous()  # drop the trailing dimension of size 1 and make contiguous
total_loss = None
if start_positions is not None and end_positions is not None:
# If start_positions or end_positions carry an extra dimension (e.g. on multi-GPU), squeeze it away
if len(start_positions.size()) > 1:
start_positions = start_positions.squeeze(-1)
if len(end_positions.size()) > 1:
end_positions = end_positions.squeeze(-1)
# Positions outside the sequence are clamped to the ignored index
ignored_index = start_logits.size(1)
start_positions = start_positions.clamp(0, ignored_index)
end_positions = end_positions.clamp(0, ignored_index)
# Cross-entropy loss that ignores the clamped (out-of-sequence) positions
loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
# Compute the start- and end-position losses
start_loss = loss_fct(start_logits, start_positions)
end_loss = loss_fct(end_logits, end_positions)
# Average the two losses
total_loss = (start_loss + end_loss) / 2
# Without return_dict, return a plain tuple
if not return_dict:
output = (start_logits, end_logits) + outputs[2:]
return ((total_loss,) + output) if total_loss is not None else output
# Otherwise return a QuestionAnsweringModelOutput with loss, start/end logits, hidden states and attentions
return QuestionAnsweringModelOutput(
loss=total_loss,
start_logits=start_logits,
end_logits=end_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
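The QA head only produces per-token start and end scores; decoding an answer span is left to the caller. A small sketch of the greedy decoding step, assuming a SQuAD-style fine-tuned checkpoint (the checkpoint name below is illustrative, and an untrained head would give arbitrary spans):
# Sketch: turning start/end logits into an answer string (illustrative checkpoint).
import torch
from transformers import AutoTokenizer, CamembertForQuestionAnswering

tokenizer = AutoTokenizer.from_pretrained("almanach/camembert-base")
model = CamembertForQuestionAnswering.from_pretrained("almanach/camembert-base")

question, context = "Où est Paris ?", "Paris est la capitale de la France."
inputs = tokenizer(question, context, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

start = int(torch.argmax(outputs.start_logits, dim=-1))  # most likely start token
end = int(torch.argmax(outputs.end_logits, dim=-1))      # most likely end token
answer_ids = inputs["input_ids"][0, start : end + 1]
print(tokenizer.decode(answer_ids))  # meaningful only with a QA fine-tuned checkpoint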
# Add a docstring marking this as a CamemBERT model with a language-modeling head for causal LM (CLM) fine-tuning
@add_start_docstrings(
"""CamemBERT Model with a `language modeling` head on top for CLM fine-tuning.""", CAMEMBERT_START_DOCSTRING
)
# Copied from transformers.models.roberta.modeling_roberta.RobertaForCausalLM, with Roberta->Camembert
# and FacebookAI/roberta-base->almanach/camembert-base
class CamembertForCausalLM(CamembertPreTrainedModel):
# Keys whose weights are tied to the input embeddings (the lm_head.decoder weight and bias)
_tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]
def __init__(self, config):
super().__init__(config)
# Warn if the config is not a decoder; standalone use of CamembertLMHeadModel expects `is_decoder=True`
if not config.is_decoder:
logger.warning("If you want to use `CamembertLMHeadModel` as a standalone, add `is_decoder=True.`")
# Instantiate the Camembert encoder without the pooling layer
self.roberta = CamembertModel(config, add_pooling_layer=False)
# Language-modeling head
self.lm_head = CamembertLMHead(config)
# Initialize weights and apply final processing
self.post_init()
# Return the output embedding layer, i.e. the decoder of the LM head
def get_output_embeddings(self):
return self.lm_head.decoder
# Replace the output embedding layer with new embeddings
def set_output_embeddings(self, new_embeddings):
self.lm_head.decoder = new_embeddings
# Forward pass; the docstring decorators below document the inputs and the return type
@add_start_docstrings_to_model_forward(CAMEMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
past_key_values: Tuple[Tuple[torch.FloatTensor]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
# Prepare the inputs used during generation
def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, **model_kwargs):
# Shape of the current input ids
input_shape = input_ids.shape
# If no attention mask is given, attend to every position
if attention_mask is None:
attention_mask = input_ids.new_ones(input_shape)
# If cached key/values are provided, keep only the tokens that have not been processed yet
if past_key_values is not None:
# Length of the cached sequence
past_length = past_key_values[0][0].shape[2]
# If the input is longer than the cache, drop the prefix that is already cached
if input_ids.shape[1] > past_length:
remove_prefix_length = past_length
else:
# Otherwise keep only the final token (default behaviour)
remove_prefix_length = input_ids.shape[1] - 1
# Trim the input ids accordingly
input_ids = input_ids[:, remove_prefix_length:]
# Return the prepared inputs as a dictionary
return {"input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": past_key_values}
# Reorder the cached key/values according to the selected beam indices
def _reorder_cache(self, past_key_values, beam_idx):
# Accumulate the reordered cache layer by layer
reordered_past = ()
# Iterate over the cached key/values of each layer
for layer_past in past_key_values:
# Reorder every cached tensor along the batch dimension using the beam indices
reordered_past += (
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
)
# Return the reordered cache
return reordered_past
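The reordering can be checked in isolation with a toy one-layer cache; the shapes below are hypothetical:
# Toy illustration of _reorder_cache: reorder a one-layer cache along the beam dimension.
import torch

key = torch.randn(3, 2, 4, 8)       # (num_beams, num_heads, seq_len, head_dim)
value = torch.randn(3, 2, 4, 8)
past_key_values = ((key, value),)   # a single layer
beam_idx = torch.tensor([2, 2, 0])  # beams 0 and 1 become copies of beam 2, beam 2 becomes beam 0

reordered = tuple(
    tuple(state.index_select(0, beam_idx) for state in layer) for layer in past_key_values
)
assert torch.equal(reordered[0][0][0], key[2])  # the first beam now holds beam 2's cached keys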
# Create position ids from the input ids, used for the model's position embeddings
def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
"""
Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
are ignored. This is modified from fairseq's `utils.make_positions`.
Args:
input_ids: torch.Tensor with the input token ids
padding_idx: index of the padding token, used to detect padded positions
past_key_values_length: length of any cached past key/values, added to the incremental indices
Returns:
torch.Tensor: long tensor with the position id of every token
"""
# Mask that is 1 for real tokens and 0 for padding tokens
mask = input_ids.ne(padding_idx).int()
# Cumulative count of real tokens, shifted by the past length, with padding positions zeroed out
incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
# Convert to long and offset by padding_idx to obtain the final position ids
return incremental_indices.long() + padding_idx
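A concrete trace with padding_idx = 1 (the value CamemBERT uses) and past_key_values_length = 0 makes the offsetting easier to follow:
# Worked example of create_position_ids_from_input_ids with padding_idx = 1.
import torch

input_ids = torch.tensor([[5, 6, 7, 1, 1]])  # two trailing padding tokens
padding_idx = 1
mask = input_ids.ne(padding_idx).int()                        # [[1, 1, 1, 0, 0]]
incremental = torch.cumsum(mask, dim=1).type_as(mask) * mask  # [[1, 2, 3, 0, 0]]
position_ids = incremental.long() + padding_idx               # [[2, 3, 4, 1, 1]]
print(position_ids)  # real tokens get positions 2, 3, 4; padding positions stay at padding_idx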
.\models\camembert\modeling_tf_camembert.py
import math
import warnings
from typing import Optional, Tuple, Union
import numpy as np
import tensorflow as tf
from ...activations_tf import get_tf_activation
from ...modeling_tf_outputs import (
TFBaseModelOutputWithPastAndCrossAttentions,
TFBaseModelOutputWithPoolingAndCrossAttentions,
TFCausalLMOutputWithCrossAttentions,
TFMaskedLMOutput,
TFMultipleChoiceModelOutput,
TFQuestionAnsweringModelOutput,
TFSequenceClassifierOutput,
TFTokenClassifierOutput,
)
from ...modeling_tf_utils import (
TFCausalLanguageModelingLoss,
TFMaskedLanguageModelingLoss,
TFModelInputType,
TFMultipleChoiceLoss,
TFPreTrainedModel,
TFQuestionAnsweringLoss,
TFSequenceClassificationLoss,
TFTokenClassificationLoss,
get_initializer,
keras,
keras_serializable,
unpack_inputs,
)
from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax
from ...utils import (
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
)
from .configuration_camembert import CamembertConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "almanach/camembert-base"
_CONFIG_FOR_DOC = "CamembertConfig"
TF_CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
]
CAMEMBERT_START_DOCSTRING = r"""
This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
behavior.
<Tip>
TensorFlow models and layers in `transformers` accept two formats as input:
- having all inputs as keyword arguments (like PyTorch models), or
- having all inputs as a list, tuple or dict in the first positional argument.
"""
The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
positional argument:
- a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
- a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
`model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
- a dictionary with one or several input Tensors associated to the input names given in the docstring:
`model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
Note that when creating models and layers with
[subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
about any of this, as you can just pass inputs like you would to any other Python function!
"""
"""
Parameters:
config ([`CamembertConfig`]): Model configuration class with all the parameters of the
model. Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
"""
CAMEMBERT_INPUTS_DOCSTRING = r"""
"""
# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaEmbeddings
class TFCamembertEmbeddings(keras.layers.Layer):
"""
Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
"""
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.padding_idx = 1  # index used for padding positions
self.config = config  # keep the configuration object
self.hidden_size = config.hidden_size  # hidden size
self.max_position_embeddings = config.max_position_embeddings  # maximum number of position embeddings
self.initializer_range = config.initializer_range  # weight initializer range
self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")  # layer-normalization layer
self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)  # dropout layer
def build(self, input_shape=None):
with tf.name_scope("word_embeddings"):
# Word embedding matrix
self.weight = self.add_weight(
name="weight",
shape=[self.config.vocab_size, self.hidden_size],
initializer=get_initializer(self.initializer_range),
)
with tf.name_scope("token_type_embeddings"):
# Token-type embedding matrix
self.token_type_embeddings = self.add_weight(
name="embeddings",
shape=[self.config.type_vocab_size, self.hidden_size],
initializer=get_initializer(self.initializer_range),
)
with tf.name_scope("position_embeddings"):
# Position embedding matrix
self.position_embeddings = self.add_weight(
name="embeddings",
shape=[self.max_position_embeddings, self.hidden_size],
initializer=get_initializer(self.initializer_range),
)
if self.built:
return
self.built = True
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
# Build the layer-normalization layer
self.LayerNorm.build([None, None, self.config.hidden_size])
def create_position_ids_from_input_ids(self, input_ids, past_key_values_length=0):
"""
Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding
symbols are ignored. This is modified from fairseq's `utils.make_positions`.
Args:
input_ids: tf.Tensor with the input token ids
Returns: tf.Tensor with the position ids
"""
mask = tf.cast(tf.math.not_equal(input_ids, self.padding_idx), dtype=input_ids.dtype)
incremental_indices = (tf.math.cumsum(mask, axis=1) + past_key_values_length) * mask
return incremental_indices + self.padding_idx
def call(
self,
input_ids=None,
position_ids=None,
token_type_ids=None,
inputs_embeds=None,
past_key_values_length=0,
training=False,
):
"""
Applies embedding based on inputs tensor.
Returns:
final_embeddings (`tf.Tensor`): output embedding tensor.
"""
# Either input_ids or inputs_embeds must be provided
assert not (input_ids is None and inputs_embeds is None)
if input_ids is not None:
# Check that the ids are within the vocabulary bounds
check_embeddings_within_bounds(input_ids, self.config.vocab_size)
# Gather the word embeddings for the given ids
inputs_embeds = tf.gather(params=self.weight, indices=input_ids)
# Shape of the embedded input, without the hidden dimension
input_shape = shape_list(inputs_embeds)[:-1]
# Default the token-type ids to zeros if they were not provided
if token_type_ids is None:
token_type_ids = tf.fill(dims=input_shape, value=0)
# If no position ids were provided
if position_ids is None:
if input_ids is not None:
# Create position ids from the token ids; padded tokens keep the padding position
position_ids = self.create_position_ids_from_input_ids(
input_ids=input_ids, past_key_values_length=past_key_values_length
)
else:
# Fall back to default position ids, from padding_idx + 1 to input_shape[-1] + padding_idx + 1
position_ids = tf.expand_dims(
tf.range(start=self.padding_idx + 1, limit=input_shape[-1] + self.padding_idx + 1), axis=0
)
# Gather the position embeddings
position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids)
# Gather the token-type embeddings
token_type_embeds = tf.gather(params=self.token_type_embeddings, indices=token_type_ids)
# Sum word, position and token-type embeddings
final_embeddings = inputs_embeds + position_embeds + token_type_embeds
# Apply layer normalization
final_embeddings = self.LayerNorm(inputs=final_embeddings)
# Apply dropout
final_embeddings = self.dropout(inputs=final_embeddings, training=training)
# Return the final embeddings
return final_embeddings
# Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler with Bert->Camembert
class TFCamembertPooler(keras.layers.Layer):
def __init__(self, config: CamembertConfig, **kwargs):
super().__init__(**kwargs)
# Dense layer used to pool the hidden state of the first token
self.dense = keras.layers.Dense(
units=config.hidden_size,  # output size equals the hidden size from the config
kernel_initializer=get_initializer(config.initializer_range),  # weights initialized with the configured range
activation="tanh",  # tanh activation
name="dense",  # layer name
)
self.config = config  # keep the configuration
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
# The model is "pooled" by simply taking the hidden state of the first token
first_token_tensor = hidden_states[:, 0]  # hidden state of the first token of every sample
pooled_output = self.dense(inputs=first_token_tensor)  # project it through the dense layer
return pooled_output  # return the pooled output
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention with Bert->Camembert
class TFCamembertSelfAttention(keras.layers.Layer):
def __init__(self, config: CamembertConfig, **kwargs):
super().__init__(**kwargs)
# Check that the hidden size is divisible by the number of attention heads
if config.hidden_size % config.num_attention_heads != 0:
raise ValueError(
f"The hidden size ({config.hidden_size}) is not a multiple of the number "
f"of attention heads ({config.num_attention_heads})"
)
# Number of attention heads and size of each head
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.sqrt_att_head_size = math.sqrt(self.attention_head_size)
# Dense layers for the query, key and value projections, initialized with the configured range
self.query = keras.layers.Dense(
units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query"
)
self.key = keras.layers.Dense(
units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key"
)
self.value = keras.layers.Dense(
units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value"
)
self.dropout = keras.layers.Dropout(rate=config.attention_probs_dropout_prob)  # dropout on the attention probabilities
self.is_decoder = config.is_decoder  # whether this layer is used inside a decoder
self.config = config  # keep the configuration
def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor:
# Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size]
tensor = tf.reshape(tensor=tensor, shape=(batch_size, -1, self.num_attention_heads, self.attention_head_size))
# Transpose to [batch_size, num_attention_heads, seq_length, attention_head_size]
return tf.transpose(tensor, perm=[0, 2, 1, 3])
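The reshape/transpose pair above splits the projection into per-head slices; it can be verified with hypothetical sizes:
# Shape check for transpose_for_scores with hypothetical sizes.
import tensorflow as tf

batch_size, seq_len, num_heads, head_size = 2, 5, 12, 64
x = tf.random.normal((batch_size, seq_len, num_heads * head_size))  # [2, 5, 768]
x = tf.reshape(x, (batch_size, -1, num_heads, head_size))           # [2, 5, 12, 64]
x = tf.transpose(x, perm=[0, 2, 1, 3])                              # [2, 12, 5, 64]
print(x.shape)  # each head now sees its own (seq_len, head_size) slice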
# Attention forward pass: computes self-attention (and, for decoders, cross-attention) over the inputs
def call(
self,
hidden_states: tf.Tensor,
attention_mask: tf.Tensor,
head_mask: tf.Tensor,
encoder_hidden_states: tf.Tensor,
encoder_attention_mask: tf.Tensor,
past_key_value: Tuple[tf.Tensor],
output_attentions: bool,
training: bool = False,
):
# Called when the layer is built; sets up the query/key/value sub-layers
def build(self, input_shape=None):
# 如果已经构建过一次,直接返回
if self.built:
return
# 标记该层已构建
self.built = True
# 如果存在查询张量,构建查询张量的结构
if getattr(self, "query", None) is not None:
with tf.name_scope(self.query.name):
self.query.build([None, None, self.config.hidden_size])
# 如果存在键张量,构建键张量的结构
if getattr(self, "key", None) is not None:
with tf.name_scope(self.key.name):
self.key.build([None, None, self.config.hidden_size])
# 如果存在值张量,构建值张量的结构
if getattr(self, "value", None) is not None:
with tf.name_scope(self.value.name):
self.value.build([None, None, self.config.hidden_size])
# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput with Bert->Camembert
class TFCamembertSelfOutput(keras.layers.Layer):
def __init__(self, config: CamembertConfig, **kwargs):
super().__init__(**kwargs)
# 初始化一个全连接层,用于转换隐藏状态的维度
self.dense = keras.layers.Dense(
units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
# 初始化 LayerNormalization 层,用于归一化隐藏状态
self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
# 初始化 Dropout 层,用于在训练时随机置零输入张量的一部分
self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
# 将隐藏状态通过全连接层 dense 进行线性转换
hidden_states = self.dense(inputs=hidden_states)
# 在训练时应用 Dropout,随机置零一部分输入张量
hidden_states = self.dropout(inputs=hidden_states, training=training)
# 对转换后的隐藏状态应用 LayerNormalization,加上输入张量 input_tensor
hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
# 构建 dense 层,设置其输入维度为 config.hidden_size
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
# 构建 LayerNorm 层,设置其输入维度为 config.hidden_size
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
# Copied from transformers.models.bert.modeling_tf_bert.TFBertAttention with Bert->Camembert
class TFCamembertAttention(keras.layers.Layer):
def __init__(self, config: CamembertConfig, **kwargs):
super().__init__(**kwargs)
# 初始化自注意力层 TFCamembertSelfAttention
self.self_attention = TFCamembertSelfAttention(config, name="self")
# 初始化输出层 TFCamembertSelfOutput
self.dense_output = TFCamembertSelfOutput(config, name="output")
def prune_heads(self, heads):
raise NotImplementedError
def call(
self,
input_tensor: tf.Tensor,
attention_mask: tf.Tensor,
head_mask: tf.Tensor,
encoder_hidden_states: tf.Tensor,
encoder_attention_mask: tf.Tensor,
past_key_value: Tuple[tf.Tensor],
output_attentions: bool,
training: bool = False,
) -> Tuple[tf.Tensor]:
# 调用自注意力层进行注意力计算,返回自注意力层的输出
self_outputs = self.self_attention(
hidden_states=input_tensor,
attention_mask=attention_mask,
head_mask=head_mask,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
past_key_value=past_key_value,
output_attentions=output_attentions,
training=training,
)
# 将自注意力层的输出作为输入,通过输出层进行转换
attention_output = self.dense_output(
hidden_states=self_outputs[0], input_tensor=input_tensor, training=training
)
# 如果需要输出注意力值,则将其添加到输出元组中
outputs = (attention_output,) + self_outputs[1:]
return outputs
# 定义神经网络层的构建方法,用于在给定输入形状时构建层
def build(self, input_shape=None):
# 如果已经构建过,则直接返回,避免重复构建
if self.built:
return
# 设置标志位,表示该层已经构建完成
self.built = True
# 检查是否存在自注意力层,并构建其名称作用域下的层
if getattr(self, "self_attention", None) is not None:
with tf.name_scope(self.self_attention.name):
self.self_attention.build(None)
# 检查是否存在密集输出层,并构建其名称作用域下的层
if getattr(self, "dense_output", None) is not None:
with tf.name_scope(self.dense_output.name):
self.dense_output.build(None)
# Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->Camembert
class TFCamembertIntermediate(keras.layers.Layer):
def __init__(self, config: CamembertConfig, **kwargs):
super().__init__(**kwargs)
# 创建一个全连接层,用于中间状态转换,输出单元数由配置文件决定
self.dense = keras.layers.Dense(
units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
# 根据配置文件中指定的激活函数类型,获取对应的 TensorFlow 激活函数
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = get_tf_activation(config.hidden_act)
else:
self.intermediate_act_fn = config.hidden_act
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
# 将输入的隐藏状态通过全连接层处理
hidden_states = self.dense(inputs=hidden_states)
# 使用配置中指定的中间激活函数处理转换后的隐藏状态
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
# 构建层次结构,若已存在 dense 层则使用其名字的命名空间,构建时指定输入形状
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
# Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput with Bert->Camembert
class TFCamembertOutput(keras.layers.Layer):
def __init__(self, config: CamembertConfig, **kwargs):
super().__init__(**kwargs)
# 创建一个全连接层,用于输出层,输出单元数由配置文件决定
self.dense = keras.layers.Dense(
units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
# 创建一个 LayerNormalization 层,用于规范化层次,epsilon 值由配置文件决定
self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
# 创建一个 Dropout 层,用于在训练时进行随机失活,失活率由配置文件决定
self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
# 将输入的隐藏状态通过全连接层处理
hidden_states = self.dense(inputs=hidden_states)
# 若在训练状态下,对输出的隐藏状态进行随机失活处理
hidden_states = self.dropout(inputs=hidden_states, training=training)
# 将失活后的隐藏状态与输入张量进行加和,并通过 LayerNormalization 处理
hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
# 构建层次结构,若已存在 dense 层则使用其名字的命名空间,构建时指定输入形状
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.intermediate_size])
# 构建层次结构,若已存在 LayerNorm 层则使用其名字的命名空间,构建时指定输入形状
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
# Copied from transformers.models.bert.modeling_tf_bert.TFBertLayer with Bert->Camembert
class TFCamembertLayer(keras.layers.Layer):
# 使用指定的配置初始化 Camembert 模型
def __init__(self, config: CamembertConfig, **kwargs):
# 调用父类的初始化方法
super().__init__(**kwargs)
# 创建注意力层对象,使用给定的配置,并命名为"attention"
self.attention = TFCamembertAttention(config, name="attention")
# 设置是否为解码器的标志
self.is_decoder = config.is_decoder
# 设置是否添加交叉注意力的标志
self.add_cross_attention = config.add_cross_attention
# 如果要添加交叉注意力,需检查当前模型是否为解码器模型
if self.add_cross_attention:
if not self.is_decoder:
# 如果不是解码器模型且添加了交叉注意力,则引发错误
raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
# 创建交叉注意力层对象,使用给定的配置,并命名为"crossattention"
self.crossattention = TFCamembertAttention(config, name="crossattention")
# 创建中间层对象,使用给定的配置,并命名为"intermediate"
self.intermediate = TFCamembertIntermediate(config, name="intermediate")
# 创建输出层对象,使用给定的配置,并命名为"output"
self.bert_output = TFCamembertOutput(config, name="output")
# 定义模型的调用方法
def call(
self,
hidden_states: tf.Tensor, # 输入的隐藏状态张量
attention_mask: tf.Tensor, # 注意力掩码张量
head_mask: tf.Tensor, # 头部掩码张量
encoder_hidden_states: tf.Tensor | None, # 编码器的隐藏状态张量或空值
encoder_attention_mask: tf.Tensor | None, # 编码器的注意力掩码张量或空值
past_key_value: Tuple[tf.Tensor] | None, # 过去的键-值张量元组或空值
output_attentions: bool, # 是否输出注意力权重
training: bool = False, # 是否处于训练模式,默认为False
# Full definition of the call method; it returns a tuple of tf.Tensor
def call(
self,
hidden_states: tf.Tensor,
attention_mask: Optional[tf.Tensor] = None,
head_mask: Optional[tf.Tensor] = None,
encoder_hidden_states: Optional[tf.Tensor] = None,
encoder_attention_mask: Optional[tf.Tensor] = None,
past_key_value: Optional[Tuple[tf.Tensor]] = None,
output_attentions: Optional[bool] = False,
training: Optional[bool] = False,
) -> Tuple[tf.Tensor]:
# 如果 past_key_value 不为 None,则提取出 self-attention 的过去键/值缓存
self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
# 调用 self.attention 方法进行自注意力计算
self_attention_outputs = self.attention(
input_tensor=hidden_states,
attention_mask=attention_mask,
head_mask=head_mask,
encoder_hidden_states=None,
encoder_attention_mask=None,
past_key_value=self_attn_past_key_value,
output_attentions=output_attentions,
training=training,
)
# 获取 self-attention 的输出
attention_output = self_attention_outputs[0]
# 如果模型是解码器模型
if self.is_decoder:
# 输出中除了 self_attention_outputs 中的第一个元素之外的所有元素
outputs = self_attention_outputs[1:-1]
# 提取 self_attention_outputs 中的最后一个元素作为 present_key_value
present_key_value = self_attention_outputs[-1]
else:
# 输出中包含 self_attention_outputs 中除第一个元素外的所有元素(如果输出注意力权重的话)
outputs = self_attention_outputs[1:]
# 初始化 cross_attn_present_key_value 为 None
cross_attn_present_key_value = None
# 如果模型是解码器并且存在编码器的隐藏状态
if self.is_decoder and encoder_hidden_states is not None:
# 如果模型没有交叉注意力层,则引发 ValueError 异常
if not hasattr(self, "crossattention"):
raise ValueError(
f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers"
" by setting `config.add_cross_attention=True`"
)
# 如果 past_key_value 不为 None,则提取出交叉注意力的过去键/值缓存
cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
# 调用 self.crossattention 方法进行交叉注意力计算
cross_attention_outputs = self.crossattention(
input_tensor=attention_output,
attention_mask=attention_mask,
head_mask=head_mask,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
past_key_value=cross_attn_past_key_value,
output_attentions=output_attentions,
training=training,
)
# 获取交叉注意力的输出
attention_output = cross_attention_outputs[0]
# 将交叉注意力的输出中除了第一个和最后一个元素之外的所有元素添加到 outputs 中
outputs = outputs + cross_attention_outputs[1:-1]
# 将交叉注意力的输出中的最后一个元素添加到 present_key_value 中
cross_attn_present_key_value = cross_attention_outputs[-1]
present_key_value = present_key_value + cross_attn_present_key_value
# 对注意力输出进行中间层处理
intermediate_output = self.intermediate(hidden_states=attention_output)
# 对中间层输出进行最终的 Bert 输出层处理
layer_output = self.bert_output(
hidden_states=intermediate_output, input_tensor=attention_output, training=training
)
# 将最终的层输出添加到 outputs 中
outputs = (layer_output,) + outputs
# 如果模型是解码器,将注意力键/值作为最后的输出添加到 outputs 中
if self.is_decoder:
outputs = outputs + (present_key_value,)
# 返回最终的输出元组
return outputs
# 构建方法,用于构建模型的层次结构。如果已经构建过,则直接返回。
def build(self, input_shape=None):
# 如果已经构建过,直接返回,不再重复构建
if self.built:
return
# 将标志位设置为已构建
self.built = True
# 如果存在 self.attention 属性,则构建 self.attention 层次结构
if getattr(self, "attention", None) is not None:
# 使用 tf.name_scope 为 self.attention 层创建命名空间
with tf.name_scope(self.attention.name):
# 调用 self.attention 的 build 方法来构建该层
self.attention.build(None)
# 如果存在 self.intermediate 属性,则构建 self.intermediate 层次结构
if getattr(self, "intermediate", None) is not None:
# 使用 tf.name_scope 为 self.intermediate 层创建命名空间
with tf.name_scope(self.intermediate.name):
# 调用 self.intermediate 的 build 方法来构建该层
self.intermediate.build(None)
# 如果存在 self.bert_output 属性,则构建 self.bert_output 层次结构
if getattr(self, "bert_output", None) is not None:
# 使用 tf.name_scope 为 self.bert_output 层创建命名空间
with tf.name_scope(self.bert_output.name):
# 调用 self.bert_output 的 build 方法来构建该层
self.bert_output.build(None)
# 如果存在 self.crossattention 属性,则构建 self.crossattention 层次结构
if getattr(self, "crossattention", None) is not None:
# 使用 tf.name_scope 为 self.crossattention 层创建命名空间
with tf.name_scope(self.crossattention.name):
# 调用 self.crossattention 的 build 方法来构建该层
self.crossattention.build(None)
# 从 transformers.models.bert.modeling_tf_bert.TFBertEncoder 复制代码,将其中的 Bert 替换为 Camembert
class TFCamembertEncoder(keras.layers.Layer):
# 初始化函数,接收 CamembertConfig 对象作为参数
def __init__(self, config: CamembertConfig, **kwargs):
super().__init__(**kwargs)
self.config = config
# 创建 CamembertLayer 的列表,根据层数进行命名
self.layer = [TFCamembertLayer(config, name=f"layer_._{i}") for i in range(config.num_hidden_layers)]
# 前向传播函数,接收多个参数和返回类型的注解
def call(
self,
hidden_states: tf.Tensor,
attention_mask: tf.Tensor,
head_mask: tf.Tensor,
encoder_hidden_states: tf.Tensor | None,
encoder_attention_mask: tf.Tensor | None,
past_key_values: Tuple[Tuple[tf.Tensor]] | None,
use_cache: Optional[bool],
output_attentions: bool,
output_hidden_states: bool,
return_dict: bool,
training: bool = False,
) -> Union[TFBaseModelOutputWithPastAndCrossAttentions, Tuple[tf.Tensor]]:
# 初始化空元组或 None,用于存储中间结果
all_hidden_states = () if output_hidden_states else None
all_attentions = () if output_attentions else None
all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
# 若 use_cache 为 True,则初始化空元组用于存储下一层的缓存
next_decoder_cache = () if use_cache else None
# 遍历每一层的 CamembertLayer
for i, layer_module in enumerate(self.layer):
# 如果需要输出隐藏状态,则将当前隐藏状态添加到 all_hidden_states
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
# 获取当前层的过去键值对,如果 past_key_values 不为 None
past_key_value = past_key_values[i] if past_key_values is not None else None
# 调用当前层的前向传播函数,计算当前层的输出
layer_outputs = layer_module(
hidden_states=hidden_states,
attention_mask=attention_mask,
head_mask=head_mask[i],
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
past_key_value=past_key_value,
output_attentions=output_attentions,
training=training,
)
# 更新 hidden_states 为当前层的输出的第一个元素
hidden_states = layer_outputs[0]
# 如果 use_cache 为 True,则更新下一层的缓存
if use_cache:
next_decoder_cache += (layer_outputs[-1],)
# 如果 output_attentions 为 True,则将当前层的注意力加入 all_attentions
if output_attentions:
all_attentions = all_attentions + (layer_outputs[1],)
# 如果配置中包含交叉注意力,并且 encoder_hidden_states 不为 None,则将交叉注意力加入 all_cross_attentions
if self.config.add_cross_attention and encoder_hidden_states is not None:
all_cross_attentions = all_cross_attentions + (layer_outputs[2],)
# 添加最后一层的隐藏状态到 all_hidden_states
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
# 如果 return_dict 为 False,则返回非空的元组
if not return_dict:
return tuple(
v for v in [hidden_states, all_hidden_states, all_attentions, all_cross_attentions] if v is not None
)
# 返回 TFBaseModelOutputWithPastAndCrossAttentions 对象,包含各类输出结果
return TFBaseModelOutputWithPastAndCrossAttentions(
last_hidden_state=hidden_states,
past_key_values=next_decoder_cache,
hidden_states=all_hidden_states,
attentions=all_attentions,
cross_attentions=all_cross_attentions,
)
# 定义一个方法 `build`,用于构建神经网络模型的层次结构
def build(self, input_shape=None):
# 如果模型已经构建过,直接返回,避免重复构建
if self.built:
return
# 将模型标记为已构建状态
self.built = True
# 检查是否存在 `layer` 属性,并逐层构建每个子层
if getattr(self, "layer", None) is not None:
# 遍历每个子层
for layer in self.layer:
# 在 TensorFlow 中为每个层次设置命名空间,以层次的名字作为命名空间
with tf.name_scope(layer.name):
# 构建每个子层,此处传入 `None` 作为输入形状参数
layer.build(None)
@keras_serializable
# 从 transformers.models.roberta.modeling_tf_roberta.TFRobertaMainLayer 复制并修改为 Camembert
class TFCamembertMainLayer(keras.layers.Layer):
config_class = CamembertConfig
def __init__(self, config, add_pooling_layer=True, **kwargs):
super().__init__(**kwargs)
self.config = config # 设置配置对象
self.is_decoder = config.is_decoder # 是否为解码器
self.num_hidden_layers = config.num_hidden_layers # 隐藏层的数量
self.initializer_range = config.initializer_range # 初始化范围
self.output_attentions = config.output_attentions # 是否输出注意力权重
self.output_hidden_states = config.output_hidden_states # 是否输出隐藏状态
self.return_dict = config.use_return_dict # 是否返回字典格式的输出
self.encoder = TFCamembertEncoder(config, name="encoder") # Camembert 编码器
self.pooler = TFCamembertPooler(config, name="pooler") if add_pooling_layer else None # 可选的池化层
# embeddings 必须是最后声明的,以保持权重的顺序
self.embeddings = TFCamembertEmbeddings(config, name="embeddings") # Camembert embeddings
# 从 transformers.models.bert.modeling_tf_bert.TFBertMainLayer.get_input_embeddings 复制
def get_input_embeddings(self) -> keras.layers.Layer:
return self.embeddings # 获取输入 embeddings
# 从 transformers.models.bert.modeling_tf_bert.TFBertMainLayer.set_input_embeddings 复制
def set_input_embeddings(self, value: tf.Variable):
self.embeddings.weight = value # 设置 embeddings 的权重
self.embeddings.vocab_size = shape_list(value)[0] # 设置 embeddings 的词汇表大小
# 从 transformers.models.bert.modeling_tf_bert.TFBertMainLayer._prune_heads 复制
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
raise NotImplementedError # 未实现的方法,用于剪枝模型的注意力头部
@unpack_inputs
# 从 transformers.models.bert.modeling_tf_bert.TFBertMainLayer.call 复制
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
encoder_hidden_states: np.ndarray | tf.Tensor | None = None,
encoder_attention_mask: np.ndarray | tf.Tensor | None = None,
past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: bool = False,
def build(self, input_shape=None):
# If the layer has already been built, do nothing
if self.built:
return
# Mark the layer as built
self.built = True
# Build the encoder, if present
if getattr(self, "encoder", None) is not None:
# Use the encoder's name as the TensorFlow name scope
with tf.name_scope(self.encoder.name):
# Build the encoder with input_shape set to None
self.encoder.build(None)
# Build the pooler, if present
if getattr(self, "pooler", None) is not None:
# Use the pooler's name as the TensorFlow name scope
with tf.name_scope(self.pooler.name):
# Build the pooler with input_shape set to None
self.pooler.build(None)
# Build the embeddings, if present
if getattr(self, "embeddings", None) is not None:
# Use the embeddings' name as the TensorFlow name scope
with tf.name_scope(self.embeddings.name):
# Build the embeddings with input_shape set to None
self.embeddings.build(None)
# TFCamembertPreTrainedModel, derived from TFPreTrainedModel
class TFCamembertPreTrainedModel(TFPreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
# Configuration class for this model
config_class = CamembertConfig
# Prefix of the base model
base_model_prefix = "roberta"
# Add the bare-model docstring via the add_start_docstrings decorator, using CAMEMBERT_START_DOCSTRING
@add_start_docstrings(
"The bare CamemBERT Model transformer outputting raw hidden-states without any specific head on top.",
CAMEMBERT_START_DOCSTRING,
)
# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaModel with Roberta->Camembert, ROBERTA->CAMEMBERT
class TFCamembertModel(TFCamembertPreTrainedModel):
def __init__(self, config, *inputs, **kwargs):
# Call the parent constructor with the config and any extra arguments
super().__init__(config, *inputs, **kwargs)
# The main Camembert layer, named "roberta"
self.roberta = TFCamembertMainLayer(config, name="roberta")
# In the original source the call method is decorated with @unpack_inputs,
# @add_start_docstrings_to_model_forward(CAMEMBERT_INPUTS_DOCSTRING) and @add_code_sample_docstrings
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
encoder_hidden_states: np.ndarray | tf.Tensor | None = None,
encoder_attention_mask: np.ndarray | tf.Tensor | None = None,
past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: Optional[bool] = False,
# end of the parameter list
) -> Union[Tuple, TFBaseModelOutputWithPoolingAndCrossAttentions]:
r"""
encoder_hidden_states (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
the model is configured as a decoder.
encoder_attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
past_key_values (`Tuple[Tuple[tf.Tensor]]` of length `config.n_layers`)
contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
`decoder_input_ids` of shape `(batch_size, sequence_length)`.
use_cache (`bool`, *optional*, defaults to `True`):
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
`past_key_values`). Set to `False` during training, `True` during generation
"""
outputs = self.roberta(
input_ids=input_ids,  # input token ids
attention_mask=attention_mask,  # attention mask marking the real (non-padding) tokens
token_type_ids=token_type_ids,  # token-type ids distinguishing sentence A from sentence B
position_ids=position_ids,  # position ids of the tokens
head_mask=head_mask,  # mask selecting which attention heads to disable
inputs_embeds=inputs_embeds,  # precomputed input embeddings
encoder_hidden_states=encoder_hidden_states,  # hidden states of the encoder (for cross-attention)
encoder_attention_mask=encoder_attention_mask,  # attention mask of the encoder
past_key_values=past_key_values,  # precomputed key/value states used to speed up decoding
use_cache=use_cache,  # whether to return key/value states for faster decoding
output_attentions=output_attentions,  # whether to return the attention weights
output_hidden_states=output_hidden_states,  # whether to return the hidden states
return_dict=return_dict,  # whether to return a dict or a tuple
training=training,  # whether the model is in training mode
)
return outputs
def build(self, input_shape=None):
if self.built:
return  # nothing to do if the model was already built
self.built = True  # mark the model as built
if getattr(self, "roberta", None) is not None:
with tf.name_scope(self.roberta.name):  # use the layer's name as the TF name scope
self.roberta.build(None)  # build the main Camembert/RoBERTa layer
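A minimal TF usage sketch, not part of the library source; the checkpoint name is illustrative and, if only PyTorch weights are published for it, loading may additionally need `from_pt=True`:
# Usage sketch for TFCamembertModel (illustrative checkpoint).
from transformers import AutoTokenizer, TFCamembertModel

tokenizer = AutoTokenizer.from_pretrained("almanach/camembert-base")
model = TFCamembertModel.from_pretrained("almanach/camembert-base")

inputs = tokenizer("J'aime le camembert", return_tensors="tf")
outputs = model(**inputs)
print(outputs.last_hidden_state.shape)  # (batch_size, seq_len, hidden_size)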
# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaLMHead with Roberta->Camembert
class TFCamembertLMHead(keras.layers.Layer):
"""Camembert head for masked language modeling."""
def __init__(self, config, input_embeddings, **kwargs):
super().__init__(**kwargs)
self.config = config
self.hidden_size = config.hidden_size
self.dense = keras.layers.Dense(
config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
self.layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm")
self.act = get_tf_activation("gelu")
# The output weights are the same as the input embeddings, but there is an output-only bias for each token.
self.decoder = input_embeddings
def build(self, input_shape=None):
self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias")
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
if getattr(self, "layer_norm", None) is not None:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build([None, None, self.config.hidden_size])
def get_output_embeddings(self):
return self.decoder
def set_output_embeddings(self, value):
self.decoder.weight = value
self.decoder.vocab_size = shape_list(value)[0]
def get_bias(self):
return {"bias": self.bias}
def set_bias(self, value):
self.bias = value["bias"]
self.config.vocab_size = shape_list(value["bias"])[0]
def call(self, hidden_states):
hidden_states = self.dense(hidden_states)
hidden_states = self.act(hidden_states)
hidden_states = self.layer_norm(hidden_states)
# Project back to the vocabulary size, with a per-token bias
seq_length = shape_list(tensor=hidden_states)[1]
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.hidden_size])
hidden_states = tf.matmul(a=hidden_states, b=self.decoder.weight, transpose_b=True)
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.config.vocab_size])
hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias)
return hidden_states
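The head projects the transformed hidden states back to vocabulary size by multiplying with the (transposed) shared embedding matrix and adding a bias; the shapes can be checked in isolation with hypothetical sizes:
# Shape check of the tied-weight projection used by the LM head (hypothetical sizes, no real weights).
import tensorflow as tf

vocab_size, hidden_size, seq_len = 32005, 768, 6
embedding_weight = tf.random.normal((vocab_size, hidden_size))  # stands in for the shared input embedding matrix
bias = tf.zeros((vocab_size,))

hidden_states = tf.random.normal((1, seq_len, hidden_size))
flat = tf.reshape(hidden_states, (-1, hidden_size))
scores = tf.matmul(flat, embedding_weight, transpose_b=True)  # (seq_len, vocab_size)
scores = tf.reshape(scores, (-1, seq_len, vocab_size)) + bias
print(scores.shape)  # (1, 6, 32005): one score per position and vocabulary entry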
@add_start_docstrings(
"""CamemBERT Model with a `language modeling` head on top.""",
CAMEMBERT_START_DOCSTRING,
)
# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaForMaskedLM with Roberta->Camembert, ROBERTA->CAMEMBERT
class TFCamembertForMaskedLM(TFCamembertPreTrainedModel, TFMaskedLanguageModelingLoss):
# names with a '.' represent the authorized unexpected/missing layers when a TF model is loaded from a PT model
_keys_to_ignore_on_load_unexpected = [r"pooler", r"lm_head.decoder.weight"]
def __init__(self, config, *inputs, **kwargs):
# Call the parent constructor with the config and any extra arguments
super().__init__(config, *inputs, **kwargs)
# Main Camembert layer without the pooling layer, named "roberta"
self.roberta = TFCamembertMainLayer(config, add_pooling_layer=False, name="roberta")
# Language-modeling head tied to the embeddings of the main layer, named "lm_head"
self.lm_head = TFCamembertLMHead(config, self.roberta.embeddings, name="lm_head")
# Return the language-modeling head
def get_lm_head(self):
return self.lm_head
# Deprecated helper returning the prefixed bias name; emits a FutureWarning
def get_prefix_bias_name(self):
warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning)
# Return the model name and head name joined by "/"
return self.name + "/" + self.lm_head.name
# The forward pass below unpacks the inputs and adds the forward and code-sample docstrings
@unpack_inputs
@add_start_docstrings_to_model_forward(CAMEMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFMaskedLMOutput,
config_class=_CONFIG_FOR_DOC,
mask="<mask>",
expected_output="' Paris'",
expected_loss=0.1,
)
# 模型的前向传递方法,接受多个输入参数,并返回预测输出
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: np.ndarray | tf.Tensor | None = None,
training: Optional[bool] = False,
) -> Union[TFMaskedLMOutput, Tuple[tf.Tensor]]:
r"""
labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
"""
# 调用 RoBERTa 模型进行前向传播,获取模型的输出结果
outputs = self.roberta(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
# 从 RoBERTa 模型的输出中提取序列输出
sequence_output = outputs[0]
# 将序列输出送入语言模型头部,得到预测分数(logits)
prediction_scores = self.lm_head(sequence_output)
# 如果提供了标签,则计算损失;否则损失设为 None
loss = None if labels is None else self.hf_compute_loss(labels, prediction_scores)
# 如果不要求返回字典形式的输出,则按照元组形式返回结果
if not return_dict:
output = (prediction_scores,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
# 返回 TFMaskedLMOutput 类型的对象,包括损失、预测分数、隐藏状态和注意力权重
return TFMaskedLMOutput(
loss=loss,
logits=prediction_scores,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
# 如果已经构建过模型,则直接返回
if self.built:
return
# 标记模型已经构建
self.built = True
# 如果定义了 RoBERTa 模型,则构建 RoBERTa
if getattr(self, "roberta", None) is not None:
with tf.name_scope(self.roberta.name):
self.roberta.build(None)
# 如果定义了语言模型头部,则构建语言模型头部
if getattr(self, "lm_head", None) is not None:
with tf.name_scope(self.lm_head.name):
self.lm_head.build(None)
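A minimal fill-mask sketch, not part of the library source; the checkpoint name is illustrative and may require `from_pt=True` if only PyTorch weights are available for it:
# Fill-mask sketch for TFCamembertForMaskedLM (illustrative checkpoint).
import tensorflow as tf
from transformers import AutoTokenizer, TFCamembertForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("almanach/camembert-base")
model = TFCamembertForMaskedLM.from_pretrained("almanach/camembert-base")

inputs = tokenizer("La capitale de la France est <mask>.", return_tensors="tf")
logits = model(**inputs).logits
mask_position = tf.where(inputs["input_ids"][0] == tokenizer.mask_token_id)[0, 0]
predicted_id = tf.argmax(logits[0, mask_position])
print(tokenizer.decode([int(predicted_id)]))  # expected to be close to " Paris" for the pretrained checkpoint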
# 从transformers.models.roberta.modeling_tf_roberta.TFRobertaClassificationHead复制而来,定义了一个用于句子级别分类任务的头部。
class TFCamembertClassificationHead(keras.layers.Layer):
"""Head for sentence-level classification tasks."""
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
# 创建一个全连接层,输出维度为config.hidden_size,激活函数为tanh
self.dense = keras.layers.Dense(
config.hidden_size,
kernel_initializer=get_initializer(config.initializer_range),
activation="tanh",
name="dense",
)
# 根据config中的设置,选择分类器的dropout率,如果未指定则使用hidden_dropout_prob
classifier_dropout = (
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
# 定义一个Dropout层,应用于全连接层的输出
self.dropout = keras.layers.Dropout(classifier_dropout)
# 创建一个全连接层,输出维度为config.num_labels,用于输出分类任务的结果
self.out_proj = keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="out_proj"
)
self.config = config
def call(self, features, training=False):
# 取出features的第一个token的向量表示,通常代表<CLS> token
x = features[:, 0, :] # take <s> token (equiv. to [CLS])
# 应用dropout层到x上,用于训练时进行随机失活
x = self.dropout(x, training=training)
# 将x传入全连接层dense中进行线性变换并激活
x = self.dense(x)
# 再次应用dropout层到x上,用于训练时进行随机失活
x = self.dropout(x, training=training)
# 将x传入全连接层out_proj中,生成最终的分类结果
x = self.out_proj(x)
return x
def build(self, input_shape=None):
if self.built:
return
self.built = True
# 如果已经建立过网络,则直接返回,否则开始构建
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
# 构建全连接层dense,输入维度为config.hidden_size
self.dense.build([None, None, self.config.hidden_size])
if getattr(self, "out_proj", None) is not None:
with tf.name_scope(self.out_proj.name):
# 构建全连接层out_proj,输入维度为config.hidden_size
self.out_proj.build([None, None, self.config.hidden_size])
@add_start_docstrings(
"""
CamemBERT Model transformer with a sequence classification/regression head on top (a linear layer on top of the
pooled output) e.g. for GLUE tasks.
""",
CAMEMBERT_START_DOCSTRING,
)
# 从transformers.models.roberta.modeling_tf_roberta.TFRobertaForSequenceClassification中复制,仅将Roberta替换为Camembert,ROBERTA替换为CAMEMBERT
class TFCamembertForSequenceClassification(TFCamembertPreTrainedModel, TFSequenceClassificationLoss):
# names with a '.' represents the authorized unexpected/missing layers when a TF model is loaded from a PT model
# _keys_to_ignore_on_load_unexpected列出了在从PT模型加载TF模型时,可以忽略的意外/丢失的层的名称模式
_keys_to_ignore_on_load_unexpected = [r"pooler", r"lm_head"]
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
# 设置分类任务的标签数量
self.num_labels = config.num_labels
# 创建Camembert主体层,用于处理输入序列,不包含池化层
self.roberta = TFCamembertMainLayer(config, add_pooling_layer=False, name="roberta")
# 创建Camembert分类头部,用于生成分类任务的输出
self.classifier = TFCamembertClassificationHead(config, name="classifier")
@unpack_inputs
@add_start_docstrings_to_model_forward(CAMEMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
# 使用装饰器为方法添加文档字符串,指定模型和输出类型,以及配置类和预期输出和损失
@add_code_sample_docstrings(
checkpoint="cardiffnlp/twitter-roberta-base-emotion",
output_type=TFSequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
expected_output="'optimism'",
expected_loss=0.08,
)
# 定义模型的调用方法,接受多个输入参数和可选的标签,返回分类器输出或者元组包含 logits
def call(
self,
input_ids: TFModelInputType | None = None, # 输入文本的 token IDs
attention_mask: np.ndarray | tf.Tensor | None = None, # 表示输入文本中实际词汇的掩码
token_type_ids: np.ndarray | tf.Tensor | None = None, # 区分不同句子的标识符
position_ids: np.ndarray | tf.Tensor | None = None, # 表示输入中 token 的位置
head_mask: np.ndarray | tf.Tensor | None = None, # 多头注意力机制的掩码
inputs_embeds: np.ndarray | tf.Tensor | None = None, # 输入 token 的嵌入表示
output_attentions: Optional[bool] = None, # 是否返回注意力权重
output_hidden_states: Optional[bool] = None, # 是否返回隐藏状态
return_dict: Optional[bool] = None, # 是否返回 TFSequenceClassifierOutput 对象
labels: np.ndarray | tf.Tensor | None = None, # 计算序列分类/回归损失的标签
training: Optional[bool] = False, # 是否处于训练模式
) -> Union[TFSequenceClassifierOutput, Tuple[tf.Tensor]]:
"""
labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
# 使用 RoBERTa 模型处理输入数据,返回模型输出
outputs = self.roberta(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
# 获取序列输出
sequence_output = outputs[0]
# 使用分类器模型处理序列输出,得到 logits
logits = self.classifier(sequence_output, training=training)
# 如果标签为空,则损失也为空;否则计算标签和 logits 之间的损失
loss = None if labels is None else self.hf_compute_loss(labels, logits)
# 如果不返回字典,则按顺序返回 logits 和其他输出(如隐藏状态)
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
# 返回 TFSequenceClassifierOutput 对象,包括损失、logits、隐藏状态和注意力权重
return TFSequenceClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
# 构建模型,初始化 RoBERTa 和分类器层
def build(self, input_shape=None):
# 如果已经构建过,则直接返回
if self.built:
return
# 标记模型已经构建
self.built = True
# 如果 RoBERTa 模型存在,则构建 RoBERTa 层
if getattr(self, "roberta", None) is not None:
with tf.name_scope(self.roberta.name):
self.roberta.build(None)
# 如果分类器存在,则构建分类器层
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build(None)
# 使用装饰器将以下字符串添加到模型文档字符串的开头,描述了 CamemBERT 模型及其在命名实体识别 (NER) 任务中的用途
@add_start_docstrings(
"""
CamemBERT Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g.
for Named-Entity-Recognition (NER) tasks.
""",
CAMEMBERT_START_DOCSTRING,
)
# 从 transformers.models.roberta.modeling_tf_roberta.TFRobertaForTokenClassification 复制,将 Roberta 替换为 Camembert,ROBERTA 替换为 CAMEMBERT
class TFCamembertForTokenClassification(TFCamembertPreTrainedModel, TFTokenClassificationLoss):
# 在从 PyTorch 模型加载到 TensorFlow 模型时,这些键表示不希望或缺少的层
_keys_to_ignore_on_load_unexpected = [r"pooler", r"lm_head"]
_keys_to_ignore_on_load_missing = [r"dropout"]
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
# 初始化 Camembert 主层,排除添加池化层,命名为 "roberta"
self.roberta = TFCamembertMainLayer(config, add_pooling_layer=False, name="roberta")
# 设置分类器的 dropout 比例为 config.classifier_dropout,若未指定则使用 config.hidden_dropout_prob
classifier_dropout = (
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
self.dropout = keras.layers.Dropout(classifier_dropout)
# 定义分类器层,输出维度为 config.num_labels,使用给定范围内的初始化器进行初始化,命名为 "classifier"
self.classifier = keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
# 保存配置信息
self.config = config
# 使用装饰器解包输入参数,并添加模型前向传播的文档字符串,描述输入格式
@unpack_inputs
@add_start_docstrings_to_model_forward(CAMEMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint="ydshieh/roberta-large-ner-english",
output_type=TFTokenClassifierOutput,
config_class=_CONFIG_FOR_DOC,
expected_output="['O', 'ORG', 'ORG', 'O', 'O', 'O', 'O', 'O', 'LOC', 'O', 'LOC', 'LOC']",
expected_loss=0.01,
)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: np.ndarray | tf.Tensor | None = None,
training: Optional[bool] = False,
) -> Union[TFTokenClassifierOutput, Tuple[tf.Tensor]]:
r"""
labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
# The return type hint above allows either a TFTokenClassifierOutput or a tuple of tf.Tensor
outputs = self.roberta(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
# 获取 RoBERTa 模型的输出序列
sequence_output = outputs[0]
# 应用 dropout 操作,用于防止过拟合
sequence_output = self.dropout(sequence_output, training=training)
# 对输出序列进行分类器分类
logits = self.classifier(sequence_output)
# 如果提供了标签,计算损失函数
loss = None if labels is None else self.hf_compute_loss(labels, logits)
# 如果 return_dict 为 False,则返回不同的输出格式
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
# 如果 return_dict 为 True,则构建 TFTokenClassifierOutput 对象并返回
return TFTokenClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
# 如果模型已经构建,直接返回
if self.built:
return
# 设置模型已构建标志
self.built = True
# 如果存在 RoBERTa 模型,则构建 RoBERTa 模型
if getattr(self, "roberta", None) is not None:
with tf.name_scope(self.roberta.name):
self.roberta.build(None)
# 如果存在分类器模型,则构建分类器模型
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_size])
@add_start_docstrings(
"""
CamemBERT Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
softmax) e.g. for RocStories/SWAG tasks.
""",
CAMEMBERT_START_DOCSTRING,
)
# 从transformers.models.roberta.modeling_tf_roberta.TFRobertaForMultipleChoice复制过来,将Roberta改为Camembert,ROBERTA改为CAMEMBERT
class TFCamembertForMultipleChoice(TFCamembertPreTrainedModel, TFMultipleChoiceLoss):
# 当从PyTorch模型加载到TensorFlow模型时,以下带'.'的名称表示授权的意外/缺失层
_keys_to_ignore_on_load_unexpected = [r"lm_head"]
# 当从PyTorch模型加载到TensorFlow模型时,以下名称表示授权的缺失层
_keys_to_ignore_on_load_missing = [r"dropout"]
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
# 使用TFCamembertMainLayer初始化Camembert主层,并命名为"roberta"
self.roberta = TFCamembertMainLayer(config, name="roberta")
# 使用config.hidden_dropout_prob初始化Dropout层
self.dropout = keras.layers.Dropout(config.hidden_dropout_prob)
# 使用config.initializer_range初始化Dense层,用于分类
self.classifier = keras.layers.Dense(
1, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(
CAMEMBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")
)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFMultipleChoiceModelOutput,
config_class=_CONFIG_FOR_DOC,
)
# 模型的前向传播函数,接受多个输入参数并返回相应输出
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: np.ndarray | tf.Tensor | None = None,
training: Optional[bool] = False,
# `training` indicates whether the layer should run in training mode, defaults to False
) -> Union[TFMultipleChoiceModelOutput, Tuple[tf.Tensor]]:
r"""
labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices]`
where `num_choices` is the size of the second dimension of the input tensors. (See `input_ids` above)
"""
# Derive num_choices and seq_length from input_ids when available, otherwise from inputs_embeds
if input_ids is not None:
num_choices = shape_list(input_ids)[1]  # number of choices
seq_length = shape_list(input_ids)[2]  # sequence length
else:
num_choices = shape_list(inputs_embeds)[1]  # number of choices (from the embeddings)
seq_length = shape_list(inputs_embeds)[2]  # sequence length (from the embeddings)
# Flatten the choice dimension of every provided input tensor
flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) if input_ids is not None else None
flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None
flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None
flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None
# Run the flattened inputs through the shared encoder
outputs = self.roberta(
flat_input_ids,
flat_attention_mask,
flat_token_type_ids,
flat_position_ids,
head_mask,
inputs_embeds,
output_attentions,
output_hidden_states,
return_dict=return_dict,
training=training,
)
# Pooled output (usually the second element of the encoder outputs)
pooled_output = outputs[1]
pooled_output = self.dropout(pooled_output, training=training)  # apply dropout to the pooled output
logits = self.classifier(pooled_output)  # one score per flattened (example, choice) row
# Reshape the logits back to (batch_size, num_choices)
reshaped_logits = tf.reshape(logits, (-1, num_choices))
# Compute the loss when labels are provided
loss = None if labels is None else self.hf_compute_loss(labels, reshaped_logits)
# Return a plain tuple when return_dict is False
if not return_dict:
output = (reshaped_logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
# Otherwise return a TFMultipleChoiceModelOutput
return TFMultipleChoiceModelOutput(
loss=loss,
logits=reshaped_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
# Build self.roberta if it exists
if getattr(self, "roberta", None) is not None:
with tf.name_scope(self.roberta.name):
self.roberta.build(None)
# Build self.classifier if it exists
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_size])
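To make the reshaping logic above concrete, here is a minimal sketch with toy shapes and random stand-in tensors (not tied to any checkpoint or real classifier): a `(batch_size, num_choices, seq_length)` batch is flattened before the shared encoder, and the per-row scores are folded back into per-example choice logits.
```
import tensorflow as tf

batch_size, num_choices, seq_length = 2, 4, 8
# Hypothetical toy inputs; in practice these come from a tokenizer applied to each (context, choice) pair.
input_ids = tf.random.uniform((batch_size, num_choices, seq_length), maxval=100, dtype=tf.int32)

# The model flattens the choice dimension before calling the shared encoder ...
flat_input_ids = tf.reshape(input_ids, (-1, seq_length))            # (batch_size * num_choices, seq_length)

# ... produces one score per flattened row (random stand-in for classifier(pooled_output)) ...
per_row_logits = tf.random.normal((batch_size * num_choices, 1))

# ... and reshapes back so each example has `num_choices` competing scores.
reshaped_logits = tf.reshape(per_row_logits, (-1, num_choices))      # (batch_size, num_choices)
print(reshaped_logits.shape)
```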
@add_start_docstrings(
"""
CamemBERT Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
CAMEMBERT_START_DOCSTRING,
)
# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaForQuestionAnswering with Roberta->Camembert, ROBERTA->CAMEMBERT
class TFCamembertForQuestionAnswering(TFCamembertPreTrainedModel, TFQuestionAnsweringLoss):
# names with a '.' represents the authorized unexpected/missing layers when a TF model is loaded from a PT model
# 'pooler' and 'lm_head' are allowed to be unexpected/missing when loading a TF model from a PyTorch model
_keys_to_ignore_on_load_unexpected = [r"pooler", r"lm_head"]
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
# Camembert main layer without the pooling layer, named "roberta"
self.roberta = TFCamembertMainLayer(config, add_pooling_layer=False, name="roberta")
# QA output head: a dense layer with config.num_labels outputs, initialized with config.initializer_range
self.qa_outputs = keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(CAMEMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint="ydshieh/roberta-base-squad2",
output_type=TFQuestionAnsweringModelOutput,
config_class=_CONFIG_FOR_DOC,
expected_output="' puppet'",
expected_loss=0.86,
)
# Forward pass of the question-answering model
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
start_positions: np.ndarray | tf.Tensor | None = None,
end_positions: np.ndarray | tf.Tensor | None = None,
training: Optional[bool] = False,
) -> Union[TFQuestionAnsweringModelOutput, Tuple[tf.Tensor]]:
r"""
start_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
end_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
"""
# Run the RoBERTa main layer
outputs = self.roberta(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
# Sequence output of the encoder
sequence_output = outputs[0]
# Project the sequence output into start and end logits
logits = self.qa_outputs(sequence_output)
start_logits, end_logits = tf.split(logits, 2, axis=-1)
start_logits = tf.squeeze(start_logits, axis=-1)
end_logits = tf.squeeze(end_logits, axis=-1)
# Compute the loss when both start and end positions are provided
loss = None
if start_positions is not None and end_positions is not None:
labels = {"start_position": start_positions}
labels["end_position"] = end_positions
loss = self.hf_compute_loss(labels, (start_logits, end_logits))
# Return a plain tuple when return_dict is False
if not return_dict:
output = (start_logits, end_logits) + outputs[2:]
return ((loss,) + output) if loss is not None else output
# Otherwise return a TFQuestionAnsweringModelOutput with loss, logits, hidden states and attentions
return TFQuestionAnsweringModelOutput(
loss=loss,
start_logits=start_logits,
end_logits=end_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
# Build the RoBERTa main layer if it exists
if getattr(self, "roberta", None) is not None:
with tf.name_scope(self.roberta.name):
self.roberta.build(None)
# Build the QA output head if it exists
if getattr(self, "qa_outputs", None) is not None:
with tf.name_scope(self.qa_outputs.name):
self.qa_outputs.build([None, None, self.config.hidden_size])
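As a follow-up, a minimal decoding sketch for the start/end logits produced above; the tensors here are random stand-ins for real model outputs, and the greedy start/end rule is only one of several possible decoding strategies.
```
import tensorflow as tf

# Stand-ins for the model outputs; real values come from TFCamembertForQuestionAnswering.
seq_length = 10
start_logits = tf.random.normal((1, seq_length))
end_logits = tf.random.normal((1, seq_length))

start_index = int(tf.argmax(start_logits, axis=-1)[0])
end_index = int(tf.argmax(end_logits, axis=-1)[0])

# A minimal decoding rule: keep the span only if the start precedes the end.
if start_index <= end_index:
    answer_token_span = (start_index, end_index)
else:
    answer_token_span = None
print(answer_token_span)
```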
@add_start_docstrings(
"""CamemBERT Model with a `language modeling` head on top for CLM fine-tuning.""", CAMEMBERT_START_DOCSTRING
)
# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaForCausalLM with Roberta->Camembert, ROBERTA->CAMEMBERT
class TFCamembertForCausalLM(TFCamembertPreTrainedModel, TFCausalLanguageModelingLoss):
# Names with a '.' represent the authorized unexpected/missing layers when loading a TF model from a PyTorch model
_keys_to_ignore_on_load_unexpected = [r"pooler", r"lm_head.decoder.weight"]
def __init__(self, config: CamembertConfig, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
if not config.is_decoder:
logger.warning("If you want to use `TFCamembertLMHeadModel` as a standalone, add `is_decoder=True.`")
# Camembert main layer without the pooling layer, named "roberta"
self.roberta = TFCamembertMainLayer(config, add_pooling_layer=False, name="roberta")
# LM head tied to self.roberta.embeddings, named "lm_head"
self.lm_head = TFCamembertLMHead(config, input_embeddings=self.roberta.embeddings, name="lm_head")
def get_lm_head(self):
return self.lm_head
def get_prefix_bias_name(self):
warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning)
# Return the model name joined with the LM head name
return self.name + "/" + self.lm_head.name
# Copied from transformers.models.bert.modeling_tf_bert.TFBertLMHeadModel.prepare_inputs_for_generation
def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, **model_kwargs):
input_shape = input_ids.shape
# Create an all-ones attention mask when none is provided
if attention_mask is None:
attention_mask = tf.ones(input_shape)
# When a cache of past key values exists, only the last input ID needs to be fed
if past_key_values is not None:
input_ids = input_ids[:, -1:]
# Return the inputs expected by `call` during generation
return {"input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": past_key_values}
@unpack_inputs
@add_start_docstrings_to_model_forward(CAMEMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFCausalLMOutputWithCrossAttentions,
config_class=_CONFIG_FOR_DOC,
)
# Forward pass used for both inference and training of the causal LM
def call(
self,
input_ids: TFModelInputType | None = None,  # token IDs
attention_mask: np.ndarray | tf.Tensor | None = None,  # attention mask
token_type_ids: np.ndarray | tf.Tensor | None = None,  # token type IDs
position_ids: np.ndarray | tf.Tensor | None = None,  # position IDs
head_mask: np.ndarray | tf.Tensor | None = None,  # head mask
inputs_embeds: np.ndarray | tf.Tensor | None = None,  # pre-computed input embeddings
encoder_hidden_states: np.ndarray | tf.Tensor | None = None,  # encoder hidden states for cross-attention
encoder_attention_mask: np.ndarray | tf.Tensor | None = None,  # attention mask over the encoder inputs
past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None,  # cached key/value states
use_cache: Optional[bool] = None,  # whether to return a cache for fast decoding
output_attentions: Optional[bool] = None,  # whether to return attention weights
output_hidden_states: Optional[bool] = None,  # whether to return all hidden states
return_dict: Optional[bool] = None,  # whether to return a ModelOutput instead of a tuple
labels: np.ndarray | tf.Tensor | None = None,  # labels for the language modeling loss
training: Optional[bool] = False,  # whether the model is in training mode
# Build the model's submodules and their connections
def build(self, input_shape=None):
if self.built:
return  # already built, nothing to do
self.built = True  # mark the model as built
if getattr(self, "roberta", None) is not None:
# Build the "roberta" main layer under its own name scope
with tf.name_scope(self.roberta.name):
self.roberta.build(None)
if getattr(self, "lm_head", None) is not None:
# Build the "lm_head" under its own name scope
with tf.name_scope(self.lm_head.name):
self.lm_head.build(None)
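A small sketch of what `prepare_inputs_for_generation` above does across decoding steps, using toy tensors and a placeholder object standing in for the cache of key/value states.
```
import tensorflow as tf

input_ids = tf.constant([[5, 6, 7, 8]])
attention_mask = tf.ones(input_ids.shape)

# First step of generation: no cache yet, the full prompt is fed to the model.
step0 = {"input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": None}

# Later steps: with a cache, only the most recent token needs to be embedded again.
past = object()  # stand-in for the real cache of key/value tensors
step1 = {"input_ids": input_ids[:, -1:], "attention_mask": attention_mask, "past_key_values": past}
print(step0["input_ids"].shape, step1["input_ids"].shape)  # (1, 4) (1, 1)
```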
.\models\camembert\tokenization_camembert.py
""" Tokenization classes for Camembert model."""
import os
from shutil import copyfile
from typing import Any, Dict, List, Optional, Tuple
import sentencepiece as spm
from ...tokenization_utils import AddedToken, PreTrainedTokenizer
from ...utils import logging
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"almanach/camembert-base": "https://huggingface.co/almanach/camembert-base/resolve/main/sentencepiece.bpe.model",
}
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"almanach/camembert-base": 512,
}
SPIECE_UNDERLINE = "▁"
class CamembertTokenizer(PreTrainedTokenizer):
"""
Adapted from [`RobertaTokenizer`] and [`XLNetTokenizer`]. Construct a CamemBERT tokenizer. Based on
[SentencePiece](https://github.com/google/sentencepiece).
This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
this superclass for more information regarding those methods.
Attributes:
sp_model (`SentencePieceProcessor`):
The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["input_ids", "attention_mask"]
def __init__(
self,
vocab_file,
bos_token="<s>",
eos_token="</s>",
sep_token="</s>",
cls_token="<s>",
unk_token="<unk>",
pad_token="<pad>",
mask_token="<mask>",
additional_special_tokens=["<s>NOTUSED", "</s>NOTUSED", "<unk>NOTUSED"],
sp_model_kwargs: Optional[Dict[str, Any]] = None,
**kwargs,
) -> None:
mask_token = (
AddedToken(mask_token, lstrip=True, rstrip=False, normalized=False, special=True)
if isinstance(mask_token, str)
else mask_token
)
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(str(vocab_file))
self.vocab_file = vocab_file
self._added_tokens_decoder = {
0: AddedToken("<s>NOTUSED", special=True),
1: AddedToken(pad_token, special=True) if isinstance(pad_token, str) else pad_token,
2: AddedToken("</s>NOTUSED", special=True),
3: AddedToken(unk_token, special=True) if isinstance(unk_token, str) else unk_token,
4: AddedToken("<unk>NOTUSED", special=True),
}
self.fairseq_offset = 4
if "added_tokens_decoder" in kwargs:
kwargs["added_tokens_decoder"].update(self._added_tokens_decoder)
super().__init__(
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
sep_token=sep_token,
cls_token=cls_token,
pad_token=pad_token,
mask_token=mask_token,
additional_special_tokens=additional_special_tokens,
sp_model_kwargs=self.sp_model_kwargs,
**kwargs,
)
@property
def vocab_size(self):
return len(self.sp_model)
def get_vocab(self):
vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size + self.fairseq_offset)}
vocab.update(self.added_tokens_encoder)
return vocab
def _tokenize(self, text: str) -> List[str]:
return self.sp_model.encode(text, out_type=str)
def _convert_token_to_id(self, token):
"""将 token (str) 转换为对应的 id,使用词汇表."""
if self.sp_model.PieceToId(token) == 0:
return self.unk_token_id
return self.fairseq_offset + self.sp_model.PieceToId(token)
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
return self.sp_model.IdToPiece(index - self.fairseq_offset)
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (string) in a single string."""
current_sub_tokens = []
out_string = ""
prev_is_special = False
for token in tokens:
if token in self.all_special_tokens:
if not prev_is_special:
out_string += " "
out_string += self.sp_model.decode(current_sub_tokens) + token
prev_is_special = True
current_sub_tokens = []
else:
current_sub_tokens.append(token)
prev_is_special = False
out_string += self.sp_model.decode(current_sub_tokens)
return out_string.strip()
def __getstate__(self):
state = self.__dict__.copy()
state["sp_model"] = None
return state
def __setstate__(self, d):
self.__dict__ = d
if not hasattr(self, "sp_model_kwargs"):
self.sp_model_kwargs = {}
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(self.vocab_file)
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
out_vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
copyfile(self.vocab_file, out_vocab_file)
elif not os.path.isfile(self.vocab_file):
with open(out_vocab_file, "wb") as fi:
content_spiece_model = self.sp_model.serialized_model_proto()
fi.write(content_spiece_model)
return (out_vocab_file,)
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
adding special tokens. A CamemBERT sequence has the following format:
- single sequence: `<s> X </s>`
- pair of sequences: `<s> A </s></s> B </s>`
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of input IDs with the appropriate special tokens.
if token_ids_1 is None:
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
cls = [self.cls_token_id]
sep = [self.sep_token_id]
return cls + token_ids_0 + sep + sep + token_ids_1 + sep
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
从没有添加特殊标记的令牌列表中检索序列 ID。在使用分词器的 `prepare_for_model` 方法添加特殊标记时调用此方法。
Args:
token_ids_0 (`List[int]`):
ID 列表。
token_ids_1 (`List[int]`, *optional*):
第二个可选的序列对 ID 列表。
already_has_special_tokens (`bool`, *optional*, 默认为 `False`):
标记列表是否已经格式化为模型的特殊标记。
Returns:
`List[int]`: 一个整数列表,范围在 [0, 1]:1 表示特殊标记,0 表示序列标记。
"""
if already_has_special_tokens:
return super().get_special_tokens_mask(
token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
)
if token_ids_1 is None:
return [1] + ([0] * len(token_ids_0)) + [1]
return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
从序列对创建令牌类型 ID。这个方法用于创建用于区分不同序列的令牌类型 ID。
Args:
token_ids_0 (`List[int]`):
ID 列表。
token_ids_1 (`List[int]`, *optional*):
第二个可选的序列对 ID 列表。
Returns:
`List[int]`: 令牌类型 ID 列表。
"""
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. CamemBERT, like
RoBERTa, does not make use of token type ids, therefore a list of zeros is returned.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of zeros.
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
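To illustrate the fairseq-style id offset and the special-token layout used by the slow tokenizer above, here is a self-contained sketch; the numeric ids are toy values chosen for the example, not the ids of a real SentencePiece model.
```
# Toy stand-ins; a real CamembertTokenizer derives these from its SentencePiece model.
fairseq_offset = 4
cls_token_id, sep_token_id, unk_token_id = 5, 6, 3

def convert_sp_piece_id(sp_piece_id):
    # SentencePiece id 0 is <unk>; everything else is shifted to leave room
    # for the fairseq-style special tokens that occupy ids 0-4.
    if sp_piece_id == 0:
        return unk_token_id
    return sp_piece_id + fairseq_offset

def build_inputs_with_special_tokens(token_ids_0, token_ids_1=None):
    # Single sequence: <s> X </s>; pair: <s> A </s></s> B </s>
    if token_ids_1 is None:
        return [cls_token_id] + token_ids_0 + [sep_token_id]
    return [cls_token_id] + token_ids_0 + [sep_token_id, sep_token_id] + token_ids_1 + [sep_token_id]

print(build_inputs_with_special_tokens([convert_sp_piece_id(i) for i in (10, 11)]))
print(build_inputs_with_special_tokens([14], [15]))
```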
.\models\camembert\tokenization_camembert_fast.py
""" Fast tokenization classes for Camembert model."""
import os
from shutil import copyfile
from typing import List, Optional, Tuple
from ...tokenization_utils import AddedToken
from ...tokenization_utils_fast import PreTrainedTokenizerFast
from ...utils import is_sentencepiece_available, logging
if is_sentencepiece_available():
from .tokenization_camembert import CamembertTokenizer
else:
CamembertTokenizer = None
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model", "tokenizer_file": "tokenizer.json"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"almanach/camembert-base": "https://huggingface.co/almanach/camembert-base/resolve/main/sentencepiece.bpe.model",
},
"tokenizer_file": {
"almanach/camembert-base": "https://huggingface.co/almanach/camembert-base/resolve/main/tokenizer.json",
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"almanach/camembert-base": 512,
}
SPIECE_UNDERLINE = "▁"
class CamembertTokenizerFast(PreTrainedTokenizerFast):
"""
Construct a "fast" CamemBERT tokenizer (backed by HuggingFace's *tokenizers* library). Adapted from
[`RobertaTokenizer`] and [`XLNetTokenizer`]. Based on
[BPE](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=BPE#models).
This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
refer to this superclass for more information regarding those methods.
"""
Args:
vocab_file (`str`):
[SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
contains the vocabulary necessary to instantiate a tokenizer.
SentencePiece文件的路径,用于实例化分词器的词汇表文件。
bos_token (`str`, *optional*, defaults to `"<s>"`):
The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
序列的起始标记,用于预训练。可用作序列分类器的标记。
<Tip>
When building a sequence using special tokens, this is not the token that is used for the beginning of
sequence. The token used is the `cls_token`.
在使用特殊标记构建序列时,并非使用此标记作为序列的起始标记。实际使用的是 `cls_token`。
</Tip>
eos_token (`str`, *optional*, defaults to `"</s>"`):
The end of sequence token.
序列的结束标记。
<Tip>
When building a sequence using special tokens, this is not the token that is used for the end of sequence.
The token used is the `sep_token`.
在使用特殊标记构建序列时,并非使用此标记作为序列的结束标记。实际使用的是 `sep_token`。
</Tip>
sep_token (`str`, *optional*, defaults to `"</s>"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens.
分隔符标记,在构建来自多个序列的序列时使用,例如用于序列分类的两个序列,或用于问答中的文本和问题序列。也用作使用特殊标记构建的序列的最后一个标记。
cls_token (`str`, *optional*, defaults to `"<s>"`):
The classifier token which is used when doing sequence classification (classification of the whole sequence
instead of per-token classification). It is the first token of the sequence when built with special tokens.
在进行序列分类(整个序列而不是每个标记的分类)时使用的分类器标记。使用特殊标记构建序列时,它是序列的第一个标记。
unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
未知标记。不在词汇表中的标记无法转换为ID,因此将被设置为此标记。
pad_token (`str`, *optional*, defaults to `"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
用于填充的标记,例如在批处理不同长度的序列时使用。
mask_token (`str`, *optional*, defaults to `"<mask>"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
用于掩码值的标记。在进行掩码语言建模训练时使用的标记。这是模型将尝试预测的标记。
additional_special_tokens (`List[str]`, *optional*, defaults to `["<s>NOTUSED", "</s>NOTUSED"]`):
Additional special tokens used by the tokenizer.
分词器使用的额外特殊标记列表。
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["input_ids", "attention_mask"]
slow_tokenizer_class = CamembertTokenizer
def __init__(
self,
vocab_file=None,
tokenizer_file=None,
bos_token="<s>",
eos_token="</s>",
sep_token="</s>",
cls_token="<s>",
unk_token="<unk>",
pad_token="<pad>",
mask_token="<mask>",
additional_special_tokens=["<s>NOTUSED", "</s>NOTUSED", "<unk>NOTUSED"],
**kwargs,
):
# Mask token behavior is modified to strip left spaces and is marked as special
mask_token = AddedToken(mask_token, lstrip=True, special=True) if isinstance(mask_token, str) else mask_token
# Call the parent constructor to initialize the base class attributes
super().__init__(
vocab_file,
tokenizer_file=tokenizer_file,
bos_token=bos_token,
eos_token=eos_token,
sep_token=sep_token,
cls_token=cls_token,
unk_token=unk_token,
pad_token=pad_token,
mask_token=mask_token,
additional_special_tokens=additional_special_tokens,
**kwargs,
)
# Store the path to the vocabulary file on the instance
self.vocab_file = vocab_file
@property
def can_save_slow_tokenizer(self) -> bool:
# The slow tokenizer can only be saved when the SentencePiece vocabulary file is available
return os.path.isfile(self.vocab_file) if self.vocab_file else False
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. An CamemBERT sequence has the following format:
- single sequence: `<s> X </s>`
- pair of sequences: `<s> A </s></s> B </s>`
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of input IDs with the appropriate special tokens.
"""
if token_ids_1 is None:
# Single sequence: wrap it with the opening and closing special tokens
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
cls = [self.cls_token_id]
sep = [self.sep_token_id]
# Pair of sequences: add the special tokens around and between both sequences
return cls + token_ids_0 + sep + sep + token_ids_1 + sep
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. CamemBERT, like
RoBERTa, does not make use of token type ids, therefore a list of zeros is returned.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of zeros.
"""
# Separator token ID for separating sequences
sep = [self.sep_token_id]
# CLS token ID for start of sequence classification
cls = [self.cls_token_id]
# If only one sequence is provided
if token_ids_1 is None:
# Return a list of zeros with the length of cls + token_ids_0 + sep
return len(cls + token_ids_0 + sep) * [0]
# If two sequences are provided
# Return a list of zeros with the length of cls + token_ids_0 + 2 * sep + token_ids_1 + sep
return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
# Check if saving slow tokenizer vocabulary is possible
if not self.can_save_slow_tokenizer:
raise ValueError(
"Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
"tokenizer."
)
# Check if save_directory exists and is a directory; log error if not
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
# Define the output vocabulary file path
out_vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
# Copy the current vocabulary file to the specified directory if different
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
copyfile(self.vocab_file, out_vocab_file)
# Return the path to the saved vocabulary file
return (out_vocab_file,)
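A hedged usage sketch of the fast tokenizer with the `almanach/camembert-base` checkpoint referenced above, assuming the model files can be downloaded from the Hub; the exact ids depend on the checkpoint's vocabulary.
```
from transformers import CamembertTokenizerFast

tokenizer = CamembertTokenizerFast.from_pretrained("almanach/camembert-base")
# Single sequence -> `<s> X </s>`; a pair -> `<s> A </s></s> B </s>`
encoding = tokenizer("J'aime le camembert", "C'est délicieux")
print(encoding["input_ids"])
print(tokenizer.convert_ids_to_tokens(encoding["input_ids"]))
```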
.\models\camembert\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_sentencepiece_available,
is_tf_available,
is_tokenizers_available,
is_torch_available,
)
_import_structure = {
"configuration_camembert": ["CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "CamembertConfig", "CamembertOnnxConfig"],
}
try:
if not is_sentencepiece_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_camembert"] = ["CamembertTokenizer"]
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_camembert_fast"] = ["CamembertTokenizerFast"]
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_camembert"] = [
"CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_LIST",
"CamembertForCausalLM",
"CamembertForMaskedLM",
"CamembertForMultipleChoice",
"CamembertForQuestionAnswering",
"CamembertForSequenceClassification",
"CamembertForTokenClassification",
"CamembertModel",
"CamembertPreTrainedModel",
]
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_tf_camembert"] = [
"TF_CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_LIST",
"TFCamembertForCausalLM",
"TFCamembertForMaskedLM",
"TFCamembertForMultipleChoice",
"TFCamembertForQuestionAnswering",
"TFCamembertForSequenceClassification",
"TFCamembertForTokenClassification",
"TFCamembertModel",
"TFCamembertPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_camembert import CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, CamembertConfig, CamembertOnnxConfig
try:
if not is_sentencepiece_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_camembert import CamembertTokenizer
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_camembert_fast import CamembertTokenizerFast
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_camembert import (
CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
CamembertForCausalLM,
CamembertForMaskedLM,
CamembertForMultipleChoice,
CamembertForQuestionAnswering,
CamembertForSequenceClassification,
CamembertForTokenClassification,
CamembertModel,
CamembertPreTrainedModel,
)
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_tf_camembert import (
TF_CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
TFCamembertForCausalLM,
TFCamembertForMaskedLM,
TFCamembertForMultipleChoice,
TFCamembertForQuestionAnswering,
TFCamembertForSequenceClassification,
TFCamembertForTokenClassification,
TFCamembertModel,
TFCamembertPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
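The `_LazyModule` replacement above defers the heavy framework imports until an attribute is first requested. A stripped-down illustration of the idea follows; the demo class is not the real `_LazyModule`, just a sketch of the pattern.
```
import importlib
import types

class _DemoLazyModule(types.ModuleType):
    """A stripped-down illustration of the lazy-import pattern (not the real _LazyModule)."""

    def __init__(self, name, import_structure):
        super().__init__(name)
        self._name_to_module = {
            attr: module for module, attrs in import_structure.items() for attr in attrs
        }

    def __getattr__(self, name):
        # Only called when normal lookup fails, i.e. on first access to a lazy attribute.
        if name not in self._name_to_module:
            raise AttributeError(f"module {self.__name__} has no attribute {name}")
        module = importlib.import_module(self._name_to_module[name])
        return getattr(module, name)

# Toy structure: nothing is imported until an attribute is first accessed.
lazy = _DemoLazyModule("demo", {"json": ["dumps"], "math": ["sqrt"]})
print(lazy.sqrt(2.0))
```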
.\models\canine\configuration_canine.py
""" CANINE 模型配置"""
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
CANINE_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"google/canine-s": "https://huggingface.co/google/canine-s/resolve/main/config.json",
}
class CanineConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`CanineModel`]. It is used to instantiate a CANINE model according to the specified arguments, defining the model architecture.
Instantiating a configuration with the defaults will yield a configuration similar to that of the CANINE [google/canine-s](https://huggingface.co/google/canine-s) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information.
Example:
```
>>> from transformers import CanineConfig, CanineModel
>>> # Initializing a CANINE google/canine-s style configuration
>>> configuration = CanineConfig()
>>> # Initializing a model (with random weights) from the google/canine-s style configuration
>>> model = CanineModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
model_type = "canine"
def __init__(
self,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=16384,
type_vocab_size=16,
initializer_range=0.02,
layer_norm_eps=1e-12,
pad_token_id=0,
bos_token_id=0xE000,
eos_token_id=0xE001,
downsampling_rate=4,
upsampling_kernel_size=4,
num_hash_functions=8,
num_hash_buckets=16384,
local_transformer_stride=128,
**kwargs,
):
super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
self.max_position_embeddings = max_position_embeddings
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.hidden_act = hidden_act
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.initializer_range = initializer_range
self.type_vocab_size = type_vocab_size
self.layer_norm_eps = layer_norm_eps
self.downsampling_rate = downsampling_rate
self.upsampling_kernel_size = upsampling_kernel_size
self.num_hash_functions = num_hash_functions
self.num_hash_buckets = num_hash_buckets
self.local_transformer_stride = local_transformer_stride
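Because CANINE is tokenization-free, its inputs are plain Unicode code points. A short sketch of instantiating the default configuration and producing code-point ids; the `ord`-based "tokenizer" here is only for illustration (the library provides `CanineTokenizer` for real use).
```
from transformers import CanineConfig

config = CanineConfig()  # defaults mirror the google/canine-s architecture
text = "bonjour"
input_ids = [ord(char) for char in text]  # characters are fed as raw code points
print(input_ids)
print(config.downsampling_rate, config.num_hash_functions, config.num_hash_buckets)
```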
.\models\canine\convert_canine_original_tf_checkpoint_to_pytorch.py
"""Convert CANINE checkpoint."""
import argparse
from transformers import CanineConfig, CanineModel, CanineTokenizer, load_tf_weights_in_canine
from transformers.utils import logging
logging.set_verbosity_info()
def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, pytorch_dump_path):
config = CanineConfig()
model = CanineModel(config)
model.eval()
print(f"Building PyTorch model from configuration: {config}")
load_tf_weights_in_canine(model, config, tf_checkpoint_path)
print(f"Save PyTorch model to {pytorch_dump_path}")
model.save_pretrained(pytorch_dump_path)
tokenizer = CanineTokenizer()
print(f"Save tokenizer files to {pytorch_dump_path}")
tokenizer.save_pretrained(pytorch_dump_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--tf_checkpoint_path",
default=None,
type=str,
required=True,
help="Path to the TensorFlow checkpoint. Should end with model.ckpt",
)
parser.add_argument(
"--pytorch_dump_path",
default=None,
type=str,
required=True,
help="Path to a folder where the PyTorch model will be placed.",
)
args = parser.parse_args()
convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.pytorch_dump_path)
.\models\canine\modeling_canine.py
@dataclass
class CanineModelOutputWithPooling(ModelOutput):
"""
Output type of [`CanineModel`]. Based on [`~modeling_outputs.BaseModelOutputWithPooling`], but with slightly
different `hidden_states` and `attentions`, as these also include the hidden states and attentions of the shallow
Transformer encoders.
"""
Args:
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
模型最后一层的隐藏状态序列,是深度Transformer编码器的输出。
pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
序列中第一个标记(分类标记)在深度Transformer编码器最后一层的隐藏状态,经过线性层和Tanh激活函数进一步处理。
线性层的权重在预训练期间从下一个句子预测(分类)目标中训练得到。
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
元组类型,包含`torch.FloatTensor`类型的张量,每个编码器的输入和每个编码器每一层的输出。
第一个张量的形状为 `(batch_size, sequence_length, hidden_size)`,第二个张量的形状为
`(batch_size, sequence_length // config.downsampling_rate, hidden_size)`。
浅层编码器的隐藏状态长度为 `sequence_length`,深层编码器的隐藏状态长度为 `sequence_length // config.downsampling_rate`。
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
元组类型,包含`torch.FloatTensor`类型的张量,每个编码器的注意力权重。
第一个张量的形状为 `(batch_size, num_heads, sequence_length, sequence_length)`,
第二个张量的形状为 `(batch_size, num_heads, sequence_length // config.downsampling_rate, sequence_length // config.downsampling_rate)`。
在注意力softmax之后的注意力权重,用于计算自注意力头中的加权平均值。
"""
# 初始化函数参数的默认值
last_hidden_state: torch.FloatTensor = None
pooler_output: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
def load_tf_weights_in_canine(model, config, tf_checkpoint_path):
"""Load tf checkpoints in a pytorch model."""
try:
import re  # regular expressions for matching variable names
import numpy as np  # numerical arrays
import tensorflow as tf  # needed to read the TensorFlow checkpoint
except ImportError:
logger.error(
"Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
"https://www.tensorflow.org/install/ for installation instructions."
)
raise
tf_path = os.path.abspath(tf_checkpoint_path)  # absolute path to the TF checkpoint
logger.info(f"Converting TensorFlow checkpoint from {tf_path}")
# Load weights from TF model
init_vars = tf.train.list_variables(tf_path)  # (name, shape) of every variable in the checkpoint
names = []
arrays = []
for name, shape in init_vars:
logger.info(f"Loading TF weight {name} with shape {shape}")
array = tf.train.load_variable(tf_path, name)  # load the weight values for this variable
names.append(name)
arrays.append(array)
return model
class CanineEmbeddings(nn.Module):
"""Construct the character, position and token_type embeddings."""
def __init__(self, config):
super().__init__()
self.config = config
# character embeddings
shard_embedding_size = config.hidden_size // config.num_hash_functions
for i in range(config.num_hash_functions):
name = f"HashBucketCodepointEmbedder_{i}"
setattr(self, name, nn.Embedding(config.num_hash_buckets, shard_embedding_size))
# One nn.Embedding table per hash function, each covering a shard of the hidden size
self.char_position_embeddings = nn.Embedding(config.num_hash_buckets, config.hidden_size)
# Character position embeddings of dimension config.hidden_size
self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
# Token type embeddings of dimension config.hidden_size
# self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
# any TensorFlow checkpoint file
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
# Layer normalization over config.hidden_size with epsilon config.layer_norm_eps
self.dropout = nn.Dropout(config.hidden_dropout_prob)
# Dropout with probability config.hidden_dropout_prob
# position_ids (1, len position emb) is contiguous in memory and exported when serialized
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
# Non-persistent buffer holding position ids of length config.max_position_embeddings
self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
# Position embedding type, "absolute" unless the config overrides it
def _hash_bucket_tensors(self, input_ids, num_hashes: int, num_buckets: int):
"""
Converts ids to hash bucket ids via multiple hashing.
Args:
input_ids: The codepoints or other IDs to be hashed.
num_hashes: The number of hash functions to use.
num_buckets: The number of hash buckets (i.e. embeddings in each table).
Returns:
A list of tensors, each of which is the hash bucket IDs from one hash function.
"""
# Raise if `num_hashes` exceeds the number of predefined primes
if num_hashes > len(_PRIMES):
raise ValueError(f"`num_hashes` must be <= {len(_PRIMES)}")
# Use the first `num_hashes` primes as hash function parameters
primes = _PRIMES[:num_hashes]
result_tensors = []
# Compute one hash per prime
for prime in primes:
# Map the input IDs to hash bucket IDs for this hash function
hashed = ((input_ids + 1) * prime) % num_buckets
result_tensors.append(hashed)
return result_tensors
def _embed_hash_buckets(self, input_ids, embedding_size: int, num_hashes: int, num_buckets: int):
"""Converts IDs (e.g. codepoints) into embeddings via multiple hashing."""
# `embedding_size` must be divisible by `num_hashes`
if embedding_size % num_hashes != 0:
raise ValueError(f"Expected `embedding_size` ({embedding_size}) % `num_hashes` ({num_hashes}) == 0")
# Convert the input IDs into one tensor of hash bucket IDs per hash function
hash_bucket_tensors = self._hash_bucket_tensors(input_ids, num_hashes=num_hashes, num_buckets=num_buckets)
embedding_shards = []
# Embed each tensor of hash bucket IDs with its own embedding table
for i, hash_bucket_ids in enumerate(hash_bucket_tensors):
name = f"HashBucketCodepointEmbedder_{i}"
# Look up the shard embeddings for this hash function
shard_embeddings = getattr(self, name)(hash_bucket_ids)
embedding_shards.append(shard_embeddings)
# Concatenate the shards into a single embedding tensor
return torch.cat(embedding_shards, dim=-1)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
) -> torch.FloatTensor:
if input_ids is not None:
input_shape = input_ids.size()
else:
input_shape = inputs_embeds.size()[:-1]
# Sequence length of the inputs
seq_length = input_shape[1]
# Fall back to the registered position ids when none are provided
if position_ids is None:
position_ids = self.position_ids[:, :seq_length]
# Default the token type ids to zeros when none are provided
if token_type_ids is None:
token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
# Build the input embeddings via multi-hash bucket embedding when they are not given
if inputs_embeds is None:
inputs_embeds = self._embed_hash_buckets(
input_ids, self.config.hidden_size, self.config.num_hash_functions, self.config.num_hash_buckets
)
# Token type embeddings
token_type_embeddings = self.token_type_embeddings(token_type_ids)
# Sum the input embeddings and the token type embeddings
embeddings = inputs_embeds + token_type_embeddings
# Add absolute character position embeddings when configured
if self.position_embedding_type == "absolute":
position_embeddings = self.char_position_embeddings(position_ids)
embeddings += position_embeddings
# Layer normalization
embeddings = self.LayerNorm(embeddings)
# Dropout
embeddings = self.dropout(embeddings)
return embeddings
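A toy numeric illustration of the multi-hash bucketing performed by `_hash_bucket_tensors` above; the prime list here is a stand-in for the module's actual `_PRIMES`, and the input is a handful of raw code points.
```
import torch

# Each code point is mapped to one bucket per hash function; each hash function then
# owns an embedding table of size hidden_size // num_hash_functions.
_PRIMES = [31, 43, 59, 61]          # stand-in primes, not the real module-level list
num_buckets = 16384
input_ids = torch.tensor([[ord(c) for c in "été"]])

hash_bucket_ids = [((input_ids + 1) * prime) % num_buckets for prime in _PRIMES]
for prime, buckets in zip(_PRIMES, hash_bucket_ids):
    print(prime, buckets.tolist())
```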
class CharactersToMolecules(nn.Module):
"""Convert character sequence to initial molecule sequence (i.e. downsample) using strided convolutions."""
def __init__(self, config):
super().__init__()
# Define 1D convolutional layer for downsampling
self.conv = nn.Conv1d(
in_channels=config.hidden_size,
out_channels=config.hidden_size,
kernel_size=config.downsampling_rate,
stride=config.downsampling_rate,
)
# Activation function based on the configuration
self.activation = ACT2FN[config.hidden_act]
# Layer normalization to normalize outputs across the hidden_size dimension
# `self.LayerNorm` is kept as is to maintain compatibility with TensorFlow checkpoints
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
def forward(self, char_encoding: torch.Tensor) -> torch.Tensor:
# Extract the [CLS] token encoding: [batch, 1, hidden_size]
cls_encoding = char_encoding[:, 0:1, :]
# Transpose `char_encoding` to [batch, hidden_size, char_seq]
char_encoding = torch.transpose(char_encoding, 1, 2)
# Apply convolution for downsampling, then transpose back
downsampled = self.conv(char_encoding)
downsampled = torch.transpose(downsampled, 1, 2)
# Apply activation function to the downsampled sequence
downsampled = self.activation(downsampled)
# Remove the last molecule to reserve space for [CLS], maintaining alignment on TPUs
downsampled_truncated = downsampled[:, 0:-1, :]
# Concatenate [CLS] encoding with downsampled sequence
result = torch.cat([cls_encoding, downsampled_truncated], dim=1)
# Apply LayerNorm to the concatenated sequence
result = self.LayerNorm(result)
return result
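A shape-only sketch of the strided downsampling convolution above, using toy dimensions: with kernel size equal to the stride `r`, a character sequence of length L becomes roughly L // r "molecules", and the first slot is then re-occupied by the [CLS] encoding.
```
import torch
from torch import nn

batch, char_seq, hidden, rate = 2, 64, 8, 4
conv = nn.Conv1d(in_channels=hidden, out_channels=hidden, kernel_size=rate, stride=rate)

chars = torch.randn(batch, char_seq, hidden)
# Conv1d expects (batch, channels, length), so transpose in and out.
molecules = conv(chars.transpose(1, 2)).transpose(1, 2)
print(molecules.shape)   # torch.Size([2, 16, 8]) -> char_seq // rate molecules
```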
class ConvProjection(nn.Module):
"""
Project representations from hidden_size*2 back to hidden_size across a window of w = config.upsampling_kernel_size
characters.
"""
def __init__(self, config):
super().__init__()
self.config = config
# Define 1D convolutional layer for upsampling
self.conv = nn.Conv1d(
in_channels=config.hidden_size * 2,
out_channels=config.hidden_size,
kernel_size=config.upsampling_kernel_size,
stride=1,
)
# Activation function based on the configuration
self.activation = ACT2FN[config.hidden_act]
# Layer normalization to normalize outputs across the hidden_size dimension
# `self.LayerNorm` is kept as is to maintain compatibility with TensorFlow checkpoints
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
# Dropout layer for regularization
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(
self,
inputs: torch.Tensor,
final_seq_char_positions: Optional[torch.Tensor] = None,
# inputs has shape [batch, mol_seq, molecule_hidden_size+char_hidden_final]
# we transpose it to be [batch, molecule_hidden_size+char_hidden_final, mol_seq]
inputs = torch.transpose(inputs, 1, 2)
# PyTorch < 1.9 does not support padding="same" (which is used in the original implementation),
# so we pad the tensor manually before passing it to the conv layer
# based on https://github.com/google-research/big_transfer/blob/49afe42338b62af9fbe18f0258197a33ee578a6b/bit_tf2/models.py#L36-L38
# Calculate total padding needed to achieve 'same' padding
pad_total = self.config.upsampling_kernel_size - 1
pad_beg = pad_total // 2 # Calculate padding to be added at the beginning
pad_end = pad_total - pad_beg # Calculate padding to be added at the end
# Create a 1-dimensional constant padding layer for convolution
pad = nn.ConstantPad1d((pad_beg, pad_end), 0)
# Apply padding to inputs tensor before passing it through convolutional layer
padded_inputs = pad(inputs)
# Perform convolution operation on the padded inputs
# `result`: shape (batch_size, char_seq_len, hidden_size)
result = self.conv(padded_inputs)
# Transpose result tensor to revert to original shape [batch, mol_seq, hidden_size]
result = torch.transpose(result, 1, 2)
# Apply activation function (e.g., ReLU) to the convolved result
result = self.activation(result)
# Apply layer normalization to stabilize training
result = self.LayerNorm(result)
# Apply dropout for regularization
result = self.dropout(result)
# Store the processed character sequence as the final output
final_char_seq = result
if final_seq_char_positions is not None:
# Limit transformer query seq and attention mask to these character
# positions to greatly reduce the compute cost. Typically, this is just
# done for the MLM training task.
# TODO add support for MLM
raise NotImplementedError("CanineForMaskedLM is currently not supported")
else:
# If no specific character positions are provided, use the entire processed sequence
query_seq = final_char_seq
# Return the final processed query sequence
return query_seq
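The manual 'same' padding above can be checked with a small stand-alone computation (toy channel counts and lengths): for kernel size k and stride 1, padding a total of k - 1 positions split between the two ends keeps the sequence length unchanged.
```
import torch
from torch import nn

kernel_size = 4
pad_total = kernel_size - 1
pad_beg, pad_end = pad_total // 2, pad_total - pad_total // 2

x = torch.randn(1, 6, 10)                          # (batch, channels, seq_len)
padded = nn.ConstantPad1d((pad_beg, pad_end), 0)(x)
out = nn.Conv1d(6, 3, kernel_size=kernel_size, stride=1)(padded)
print(padded.shape, out.shape)                     # seq_len stays 10 after the convolution
```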
# CanineSelfOutput processes the output of the self-attention mechanism
class CanineSelfOutput(nn.Module):
def __init__(self, config):
super().__init__()
# Dense layer keeping the hidden size unchanged
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
# Layer normalization to stabilize the hidden states
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
# Dropout to reduce overfitting
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(
self, hidden_states: Tuple[torch.FloatTensor], input_tensor: torch.FloatTensor
) -> Tuple[torch.FloatTensor, torch.FloatTensor]:
# Dense projection of the attention output
hidden_states = self.dense(hidden_states)
# Dropout
hidden_states = self.dropout(hidden_states)
# Residual connection followed by layer normalization
hidden_states = self.LayerNorm(hidden_states + input_tensor)
# Return the normalized hidden states
return hidden_states
def __init__(
self,
config,
local=False,
always_attend_to_first_position: bool = False,
first_position_attends_to_all: bool = False,
attend_from_chunk_width: int = 128,
attend_from_chunk_stride: int = 128,
attend_to_chunk_width: int = 128,
attend_to_chunk_stride: int = 128,
):
super().__init__()
# Self-attention and its output projection
self.self = CanineSelfAttention(config)
self.output = CanineSelfOutput(config)
# Set of pruned attention heads, initially empty
self.pruned_heads = set()
# Whether local (chunked) attention is enabled
self.local = local
# Validate chunk widths against strides so that no sequence position is skipped
if attend_from_chunk_width < attend_from_chunk_stride:
raise ValueError(
"`attend_from_chunk_width` < `attend_from_chunk_stride` would cause sequence positions to get skipped."
)
if attend_to_chunk_width < attend_to_chunk_stride:
raise ValueError(
"`attend_to_chunk_width` < `attend_to_chunk_stride` would cause sequence positions to get skipped."
)
# Additional local-attention parameters
self.always_attend_to_first_position = always_attend_to_first_position
self.first_position_attends_to_all = first_position_attends_to_all
self.attend_from_chunk_width = attend_from_chunk_width
self.attend_from_chunk_stride = attend_from_chunk_stride
self.attend_to_chunk_width = attend_to_chunk_width
self.attend_to_chunk_stride = attend_to_chunk_stride
# Prune attention heads from the self-attention module
def prune_heads(self, heads):
# Nothing to do when the list of heads is empty
if len(heads) == 0:
return
# Find the prunable heads and their indices
heads, index = find_pruneable_heads_and_indices(
heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
)
# Prune the linear layers
self.self.query = prune_linear_layer(self.self.query, index)
self.self.key = prune_linear_layer(self.self.key, index)
self.self.value = prune_linear_layer(self.self.value, index)
self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
# Update the hyperparameters and record the pruned heads
self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
self.pruned_heads = self.pruned_heads.union(heads)
# Forward pass
def forward(
self,
hidden_states: Tuple[torch.FloatTensor],
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = False,
# CanineIntermediate: the first half of the feed-forward block
class CanineIntermediate(nn.Module):
# Constructor taking the model config
def __init__(self, config):
super().__init__()
# Dense layer from config.hidden_size to config.intermediate_size
self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
# Select the activation function from the config, by name or as a callable
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
# Forward pass: project the hidden states and apply the activation
def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor:
# Dense projection
hidden_states = self.dense(hidden_states)
# Activation
hidden_states = self.intermediate_act_fn(hidden_states)
# Return the transformed hidden states
return hidden_states
# CanineOutput: the second half of the feed-forward block with residual connection
class CanineOutput(nn.Module):
# Constructor taking the model config
def __init__(self, config):
super().__init__()
# Dense layer from config.intermediate_size back to config.hidden_size
self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
# Layer normalization over config.hidden_size
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
# Dropout with probability config.hidden_dropout_prob
self.dropout = nn.Dropout(config.hidden_dropout_prob)
# Forward pass: project, apply dropout, add the residual and normalize
def forward(self, hidden_states: Tuple[torch.FloatTensor], input_tensor: torch.FloatTensor) -> torch.FloatTensor:
# Dense projection
hidden_states = self.dense(hidden_states)
# Dropout
hidden_states = self.dropout(hidden_states)
# Residual connection followed by layer normalization
hidden_states = self.LayerNorm(hidden_states + input_tensor)
# Return the processed hidden states
return hidden_states
# CanineLayer: one transformer block (attention + feed-forward)
class CanineLayer(nn.Module):
# Constructor taking the config and the local-attention parameters
def __init__(
self,
config,
local,
always_attend_to_first_position,
first_position_attends_to_all,
attend_from_chunk_width,
attend_from_chunk_stride,
attend_to_chunk_width,
attend_to_chunk_stride,
):
super().__init__()
# Chunk size used for the feed-forward computation
self.chunk_size_feed_forward = config.chunk_size_feed_forward
# The sequence length dimension is 1
self.seq_len_dim = 1
# Attention sub-layer with the given parameters
self.attention = CanineAttention(
config,
local,
always_attend_to_first_position,
first_position_attends_to_all,
attend_from_chunk_width,
attend_from_chunk_stride,
attend_to_chunk_width,
attend_to_chunk_stride,
)
# Intermediate (feed-forward) sub-layer
self.intermediate = CanineIntermediate(config)
# Output sub-layer
self.output = CanineOutput(config)
# Forward pass: takes hidden_states, attention_mask, head_mask and output_attentions,
# and returns the layer output plus (optionally) the attention weights
def forward(
self,
hidden_states: Tuple[torch.FloatTensor],
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.FloatTensor, Optional[torch.FloatTensor]]:
# Run self-attention over the hidden states
self_attention_outputs = self.attention(
hidden_states,
attention_mask,
head_mask,
output_attentions=output_attentions,
)
# Output of the attention sub-layer
attention_output = self_attention_outputs[0]
# Keep the attention weights when they are requested
outputs = self_attention_outputs[1:] # add self attentions if we output attention weights
# Apply the feed-forward block in chunks along the sequence dimension
layer_output = apply_chunking_to_forward(
self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
)
# Prepend the layer output to the other outputs
outputs = (layer_output,) + outputs
# Return the outputs
return outputs
# Feed-forward chunk applied to a slice of the attention output
def feed_forward_chunk(self, attention_output):
# Intermediate projection and activation
intermediate_output = self.intermediate(attention_output)
# Output projection with residual connection to the attention output
layer_output = self.output(intermediate_output, attention_output)
# Return the layer output for this chunk
return layer_output
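A minimal re-implementation sketch of the chunking idea behind `apply_chunking_to_forward` as it is used in `feed_forward_chunk` above, simplified to a single input tensor (the real helper is more general): the sequence dimension is split into chunks, the feed-forward is run per chunk, and the results are concatenated to trade scheduling for lower peak memory.
```
import torch

def chunked_feed_forward(forward_fn, chunk_size, seq_len_dim, tensor):
    # chunk_size == 0 disables chunking, mirroring the library's convention.
    if chunk_size == 0:
        return forward_fn(tensor)
    chunks = tensor.split(chunk_size, dim=seq_len_dim)
    return torch.cat([forward_fn(chunk) for chunk in chunks], dim=seq_len_dim)

ff = torch.nn.Linear(8, 8)
hidden = torch.randn(2, 12, 8)
out = chunked_feed_forward(ff, chunk_size=4, seq_len_dim=1, tensor=hidden)
print(out.shape)  # torch.Size([2, 12, 8])
```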
class CanineEncoder(nn.Module):
# CanineEncoder stacks the CANINE transformer layers
def __init__(
self,
config,
local=False,
always_attend_to_first_position=False,
first_position_attends_to_all=False,
attend_from_chunk_width=128,
attend_from_chunk_stride=128,
attend_to_chunk_width=128,
attend_to_chunk_stride=128,
):
super().__init__()
self.config = config
# Stack of CanineLayer modules, one per hidden layer in the config
self.layer = nn.ModuleList(
[
CanineLayer(
config,
local,
always_attend_to_first_position,
first_position_attends_to_all,
attend_from_chunk_width,
attend_from_chunk_stride,
attend_to_chunk_width,
attend_to_chunk_stride,
)
for _ in range(config.num_hidden_layers)
]
)
self.gradient_checkpointing = False  # gradient checkpointing flag, disabled by default
def forward(
self,
hidden_states: Tuple[torch.FloatTensor],
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = False,
output_hidden_states: Optional[bool] = False,
return_dict: Optional[bool] = True,
) -> Union[Tuple, BaseModelOutput]:
# Containers for all hidden states and self-attention scores (when requested)
all_hidden_states = () if output_hidden_states else None
all_self_attentions = () if output_attentions else None
# Iterate over the layers and run the forward pass
for i, layer_module in enumerate(self.layer):
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
# Head mask for the current layer
layer_head_mask = head_mask[i] if head_mask is not None else None
if self.gradient_checkpointing and self.training:
# With gradient checkpointing enabled during training, recompute the layer inside a checkpoint
layer_outputs = self._gradient_checkpointing_func(
layer_module.__call__,
hidden_states,
attention_mask,
layer_head_mask,
output_attentions,
)
else:
# Otherwise call the layer directly
layer_outputs = layer_module(hidden_states, attention_mask, layer_head_mask, output_attentions)
# The first element of the layer outputs is the new hidden states
hidden_states = layer_outputs[0]
if output_attentions:
# Collect the self-attention scores of this layer
all_self_attentions = all_self_attentions + (layer_outputs[1],)
if output_hidden_states:
# Append the final hidden states
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
# Return a tuple of the non-empty results when a dict is not requested
return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
# Otherwise return a BaseModelOutput with the last hidden state, all hidden states and attentions
return BaseModelOutput(
last_hidden_state=hidden_states,
hidden_states=all_hidden_states,
attentions=all_self_attentions,
)
class CaninePooler(nn.Module):
# CaninePooler pools the sequence into a single vector
def __init__(self, config):
super().__init__()
# Dense layer mapping hidden_size to hidden_size
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
# Tanh activation
self.activation = nn.Tanh()
# Forward pass: takes `hidden_states` and returns a pooled `torch.FloatTensor`
def forward(self, hidden_states: Tuple[torch.FloatTensor]) -> torch.FloatTensor:
# "Pool" the model output by taking the hidden state of the first token
first_token_tensor = hidden_states[:, 0]
# Linear projection of the first token's hidden state
pooled_output = self.dense(first_token_tensor)
# Tanh activation
pooled_output = self.activation(pooled_output)
# Return the pooled output
return pooled_output
class CaninePredictionHeadTransform(nn.Module):
def __init__(self, config):
super().__init__()
# Dense layer keeping the hidden size unchanged
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
# Select the activation function from the config, by name or as a callable
if isinstance(config.hidden_act, str):
self.transform_act_fn = ACT2FN[config.hidden_act]
else:
self.transform_act_fn = config.hidden_act
# Layer normalization over config.hidden_size with epsilon config.layer_norm_eps
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
def forward(self, hidden_states: Tuple[torch.FloatTensor]) -> torch.FloatTensor:
# Dense projection
hidden_states = self.dense(hidden_states)
# Activation
hidden_states = self.transform_act_fn(hidden_states)
# Layer normalization
hidden_states = self.LayerNorm(hidden_states)
# Return the transformed hidden states
return hidden_states
class CanineLMPredictionHead(nn.Module):
def __init__(self, config):
super().__init__()
# Transform layer applied before decoding
self.transform = CaninePredictionHeadTransform(config)
# Decoder projecting hidden_size to vocab_size, without its own bias
self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
# Separate bias parameter of size vocab_size used as the decoder bias
self.bias = nn.Parameter(torch.zeros(config.vocab_size))
# Tie the decoder bias to the parameter above
self.decoder.bias = self.bias
def forward(self, hidden_states: Tuple[torch.FloatTensor]) -> torch.FloatTensor:
# Transform the hidden states
hidden_states = self.transform(hidden_states)
# Decode into vocabulary logits
hidden_states = self.decoder(hidden_states)
# Return the prediction scores
return hidden_states
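The `self.decoder.bias = self.bias` assignment ties the separately registered bias parameter to the bias-free `nn.Linear`, so the decoder applies it during its forward pass (and it can be resized together with the output embeddings). A small self-contained check of that mechanism, using arbitrary toy sizes:

```python
import torch
from torch import nn

vocab_size, hidden_size = 11, 8
decoder = nn.Linear(hidden_size, vocab_size, bias=False)
bias = nn.Parameter(torch.zeros(vocab_size))
decoder.bias = bias  # the Linear now adds this bias in its forward pass

x = torch.randn(2, hidden_size)
out = decoder(x)
print(out.shape)             # torch.Size([2, 11])
print(decoder.bias is bias)  # True: the two attributes share one Parameter
```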
class CanineOnlyMLMHead(nn.Module):
def __init__(self, config):
super().__init__()
# MLM head built on top of CanineLMPredictionHead
self.predictions = CanineLMPredictionHead(config)
def forward(
self,
sequence_output: Tuple[torch.Tensor],
) -> Tuple[torch.Tensor]:
# Run the sequence output through the prediction head
prediction_scores = self.predictions(sequence_output)
# Return the prediction scores
return prediction_scores
class CaninePreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
# Configuration class associated with this model
config_class = CanineConfig
# Function used to load TensorFlow weights
load_tf_weights = load_tf_weights_in_canine
# Name prefix of the base model
base_model_prefix = "canine"
# Gradient checkpointing is supported
supports_gradient_checkpointing = True
def _init_weights(self, module):
"""Initialize the weights"""
# If module is an nn.Linear or nn.Conv1d
if isinstance(module, (nn.Linear, nn.Conv1d)):
# Initialize the weights from a normal distribution with mean 0.0 and std self.config.initializer_range
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
# If the module has a bias, zero it
if module.bias is not None:
module.bias.data.zero_()
# If module is an nn.Embedding
elif isinstance(module, nn.Embedding):
# Initialize the weights from a normal distribution with mean 0.0 and std self.config.initializer_range
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
# If padding_idx is set, zero the corresponding embedding row
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
# If module is an nn.LayerNorm
elif isinstance(module, nn.LayerNorm):
# Zero the LayerNorm bias
module.bias.data.zero_()
# Fill the LayerNorm weight with 1.0
module.weight.data.fill_(1.0)
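`_init_weights` is applied module-by-module (via `nn.Module.apply`) when the model weights are initialized. A standalone sketch of the same pattern, using an arbitrary std of 0.02 as a stand-in for `config.initializer_range`:

```python
import torch
from torch import nn

def init_weights(module, initializer_range=0.02):
    if isinstance(module, (nn.Linear, nn.Conv1d)):
        module.weight.data.normal_(mean=0.0, std=initializer_range)
        if module.bias is not None:
            module.bias.data.zero_()
    elif isinstance(module, nn.Embedding):
        module.weight.data.normal_(mean=0.0, std=initializer_range)
        if module.padding_idx is not None:
            module.weight.data[module.padding_idx].zero_()
    elif isinstance(module, nn.LayerNorm):
        module.bias.data.zero_()
        module.weight.data.fill_(1.0)

model = nn.Sequential(nn.Embedding(100, 16, padding_idx=0), nn.Linear(16, 16), nn.LayerNorm(16))
model.apply(init_weights)  # recursively visits every sub-module
print(model[0].weight[0].abs().sum().item())  # 0.0 -> the padding row is zeroed
```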
CANINE_START_DOCSTRING = r"""
This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html) subclass. Use
it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage and
behavior.
Parameters:
config ([`CanineConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
CANINE_INPUTS_DOCSTRING = r"""
This string is intended to provide documentation about the expected inputs for the CANINE model. However, this section
currently lacks specific content and requires further completion to describe the inputs comprehensively.
"""
Args:
input_ids (`torch.LongTensor` of shape `({0})`):
# 输入序列中的标记索引,在词汇表中的位置
# 可以使用 AutoTokenizer 获取这些索引。参见 PreTrainedTokenizer.encode 和 PreTrainedTokenizer.__call__ 进行详细说明。
# 什么是输入 ID?请参见 ../glossary#input-ids
attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
# 遮罩,用于在填充标记索引上避免执行注意力操作
# 遮罩的值选择在 [0, 1] 范围内:
# - 1 表示 **未被遮罩** 的标记
# - 0 表示 **被遮罩** 的标记
# 什么是注意力遮罩?请参见 ../glossary#attention-mask
token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
# 段标记索引,用于指示输入的第一部分和第二部分
# 索引在 [0, 1] 范围内选择:
# - 0 对应 *句子 A* 的标记
# - 1 对应 *句子 B* 的标记
# 什么是标记类型 ID?请参见 ../glossary#token-type-ids
position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
# 每个输入序列标记在位置嵌入中的位置索引
# 选择范围在 [0, config.max_position_embeddings - 1] 内
# 什么是位置 ID?请参见 ../glossary#position-ids
head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
# 用于置空自注意力模块的选定头部的遮罩
# 遮罩的值选择在 [0, 1] 范围内:
# - 1 表示 **未被遮罩** 的头部
# - 0 表示 **被遮罩** 的头部
inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
# 可选项,可以直接传递嵌入表示而不是传递 input_ids
# 如果您想要更多控制如何将 input_ids 索引转换为相关联的向量,而不是使用模型内部的嵌入查找矩阵,则这很有用。
output_attentions (`bool`, *optional*):
# 是否返回所有注意力层的注意力张量。更多细节请参见返回张量中的 `attentions`。
output_hidden_states (`bool`, *optional*):
# 是否返回所有层的隐藏状态。更多细节请参见返回张量中的 `hidden_states`。
return_dict (`bool`, *optional*):
# 是否返回 `~utils.ModelOutput` 而不是普通元组。
"""
@add_start_docstrings(
"The bare CANINE Model transformer outputting raw hidden-states without any specific head on top.",
CANINE_START_DOCSTRING,
)
class CanineModel(CaninePreTrainedModel):
def __init__(self, config, add_pooling_layer=True):
super().__init__(config)
self.config = config
shallow_config = copy.deepcopy(config)
shallow_config.num_hidden_layers = 1
self.char_embeddings = CanineEmbeddings(config)
self.initial_char_encoder = CanineEncoder(
shallow_config,
local=True,
always_attend_to_first_position=False,
first_position_attends_to_all=False,
attend_from_chunk_width=config.local_transformer_stride,
attend_from_chunk_stride=config.local_transformer_stride,
attend_to_chunk_width=config.local_transformer_stride,
attend_to_chunk_stride=config.local_transformer_stride,
)
self.chars_to_molecules = CharactersToMolecules(config)
self.encoder = CanineEncoder(config)
self.projection = ConvProjection(config)
self.final_char_encoder = CanineEncoder(shallow_config)
self.pooler = CaninePooler(config) if add_pooling_layer else None
self.post_init()
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
def _create_3d_attention_mask_from_input_mask(self, from_tensor, to_mask):
"""
Create 3D attention mask from a 2D tensor mask.
Args:
from_tensor: 2D or 3D Tensor of shape [batch_size, from_seq_length, ...].
to_mask: int32 Tensor of shape [batch_size, to_seq_length].
Returns:
float Tensor of shape [batch_size, from_seq_length, to_seq_length].
"""
batch_size, from_seq_length = from_tensor.shape[0], from_tensor.shape[1]
to_seq_length = to_mask.shape[1]
to_mask = torch.reshape(to_mask, (batch_size, 1, to_seq_length)).float()
broadcast_ones = torch.ones(size=(batch_size, from_seq_length, 1), dtype=torch.float32, device=to_mask.device)
mask = broadcast_ones * to_mask
return mask
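The broadcast in `_create_3d_attention_mask_from_input_mask` simply copies the 2D `to_mask` along a new `from_seq_length` axis, producing one row of key-mask values per query position. A toy reproduction with illustrative shapes:

```python
import torch

batch_size, from_seq_length, to_seq_length = 2, 3, 4
to_mask = torch.tensor([[1, 1, 0, 0],
                        [1, 1, 1, 0]])                      # (batch, to_seq_length)

to_mask_3d = to_mask.reshape(batch_size, 1, to_seq_length).float()
broadcast_ones = torch.ones(batch_size, from_seq_length, 1)
mask = broadcast_ones * to_mask_3d                          # (batch, from_seq_length, to_seq_length)

print(mask.shape)   # torch.Size([2, 3, 4])
print(mask[0])      # every query row of example 0 sees the same [1, 1, 0, 0] key mask
```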
def _downsample_attention_mask(self, char_attention_mask: torch.Tensor, downsampling_rate: int):
"""Downsample 2D character attention mask to 2D molecule attention mask using MaxPool1d layer."""
batch_size, char_seq_len = char_attention_mask.shape
poolable_char_mask = torch.reshape(char_attention_mask, (batch_size, 1, char_seq_len))
pooled_molecule_mask = torch.nn.MaxPool1d(kernel_size=downsampling_rate, stride=downsampling_rate)(
poolable_char_mask.float()
)
molecule_attention_mask = torch.squeeze(pooled_molecule_mask, dim=-1)
return molecule_attention_mask
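Because the mask only contains 0s and 1s, max-pooling over windows of `downsampling_rate` characters marks a molecule position as valid whenever at least one character in its window is valid. A toy example with a rate of 4 (the default `downsampling_rate`):

```python
import torch

downsampling_rate = 4
char_attention_mask = torch.tensor([[1, 1, 1, 1, 1, 1, 0, 0],
                                    [1, 1, 0, 0, 0, 0, 0, 0]])   # (batch, char_seq_len)

poolable = char_attention_mask.reshape(2, 1, 8).float()
pooled = torch.nn.MaxPool1d(kernel_size=downsampling_rate, stride=downsampling_rate)(poolable)
print(pooled.squeeze(1))
# tensor([[1., 1.],
#         [1., 0.]])  -> one mask value per molecule (group of 4 characters)
```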
def _repeat_molecules(self, molecules: torch.Tensor, char_seq_length: torch.Tensor) -> torch.Tensor:
"""Repeats molecules to make them the same length as the char sequence."""
rate = self.config.downsampling_rate
molecules_without_extra_cls = molecules[:, 1:, :]
repeated = torch.repeat_interleave(molecules_without_extra_cls, repeats=rate, dim=-2)
last_molecule = molecules[:, -1:, :]
remainder_length = torch.fmod(torch.tensor(char_seq_length), torch.tensor(rate)).item()
remainder_repeated = torch.repeat_interleave(
last_molecule,
repeats=remainder_length + rate,
dim=-2,
)
return torch.cat([repeated, remainder_repeated], dim=-2)
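Conceptually, `_repeat_molecules` upsamples the molecule sequence back to character resolution: every molecule except the extra CLS molecule is repeated `downsampling_rate` times, and the last molecule gets `remainder + rate` extra repeats so the total length matches the character length. A shape-only sketch with arbitrary toy sizes:

```python
import torch

rate = 4                                   # downsampling_rate
char_seq_length = 12
molecules = torch.randn(1, 3, 8)           # (batch, mol_seq_len, hidden), position 0 is the CLS molecule

molecules_without_extra_cls = molecules[:, 1:, :]                                      # (1, 2, 8)
repeated = torch.repeat_interleave(molecules_without_extra_cls, repeats=rate, dim=-2)  # (1, 8, 8)

last_molecule = molecules[:, -1:, :]
remainder_length = char_seq_length % rate                                              # 12 % 4 = 0
remainder_repeated = torch.repeat_interleave(last_molecule, repeats=remainder_length + rate, dim=-2)

out = torch.cat([repeated, remainder_repeated], dim=-2)
print(out.shape)   # torch.Size([1, 12, 8]) -> back to character resolution
```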
@add_start_docstrings_to_model_forward(CANINE_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=CanineModelOutputWithPooling,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, CanineModelOutputWithPooling]:
@add_start_docstrings(
"""
CANINE Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
output) e.g. for GLUE tasks.
""",
CANINE_START_DOCSTRING,
)
class CanineForSequenceClassification(CaninePreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.canine = CanineModel(config)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.post_init()
@add_start_docstrings_to_model_forward(CANINE_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=SequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
"""
Forward pass: runs the model for sequence classification.
Args:
input_ids (Optional[torch.LongTensor]): Input token IDs.
attention_mask (Optional[torch.FloatTensor]): Attention mask indicating which positions are padding.
token_type_ids (Optional[torch.LongTensor]): Token type IDs, e.g. segment IDs as used by BERT.
position_ids (Optional[torch.LongTensor]): Position IDs giving the absolute position of each token.
head_mask (Optional[torch.FloatTensor]): Mask for the attention heads.
inputs_embeds (Optional[torch.FloatTensor]): Pre-computed input embeddings passed directly.
labels (Optional[torch.LongTensor]): Labels for the model.
output_attentions (Optional[bool]): Whether to return attention weights.
output_hidden_states (Optional[bool]): Whether to return all hidden states.
return_dict (Optional[bool]): Whether to return a dict-style output.
Returns:
SequenceClassifierOutput: The sequence classifier output, including predictions and additional metadata.
"""
pass
) -> Union[Tuple, SequenceClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.canine(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
pooled_output = outputs[1]
pooled_output = self.dropout(pooled_output)
logits = self.classifier(pooled_output)
loss = None
if labels is not None:
if self.config.problem_type is None:
if self.num_labels == 1:
self.config.problem_type = "regression"
elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
self.config.problem_type = "single_label_classification"
else:
self.config.problem_type = "multi_label_classification"
if self.config.problem_type == "regression":
loss_fct = MSELoss()
if self.num_labels == 1:
loss = loss_fct(logits.squeeze(), labels.squeeze())
else:
loss = loss_fct(logits, labels)
elif self.config.problem_type == "single_label_classification":
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(logits, labels)
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return SequenceClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
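The `problem_type` branch above selects between three losses depending on `num_labels` and the label dtype. A small standalone demonstration of the three cases with toy logits and labels (unrelated to CANINE weights):

```python
import torch
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

num_labels = 3
logits = torch.randn(4, num_labels)                 # (batch, num_labels)

# single_label_classification: integer class indices -> cross-entropy
labels_cls = torch.tensor([0, 2, 1, 2])
loss_cls = CrossEntropyLoss()(logits.view(-1, num_labels), labels_cls.view(-1))

# regression (num_labels == 1): float targets -> mean-squared error
logits_reg = torch.randn(4, 1)
labels_reg = torch.randn(4)
loss_reg = MSELoss()(logits_reg.squeeze(), labels_reg.squeeze())

# multi_label_classification: independent 0/1 targets per label -> BCE with logits
labels_multi = torch.tensor([[1., 0., 1.], [0., 0., 1.], [1., 1., 0.], [0., 1., 0.]])
loss_multi = BCEWithLogitsLoss()(logits, labels_multi)

print(loss_cls.item(), loss_reg.item(), loss_multi.item())
```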
@add_start_docstrings(
"""
CANINE Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
softmax) e.g. for RocStories/SWAG tasks.
""",
CANINE_START_DOCSTRING,
)
class CanineForMultipleChoice(CaninePreTrainedModel):
"""
CANINE模型,顶部带有多选分类头部(在汇总输出之上的线性层和softmax),例如用于RocStories/SWAG任务。
继承自CaninePreTrainedModel类。
"""
def __init__(self, config):
"""
初始化方法,设置模型结构。
Args:
config (CanineConfig): 模型配置对象,包含模型的各种参数设置。
"""
super().__init__(config)
self.canine = CanineModel(config)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, 1)
self.post_init()
@add_start_docstrings_to_model_forward(CANINE_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=MultipleChoiceModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
"""
前向传播方法,定义模型的数据流。
Args:
input_ids (Optional[torch.LongTensor]): 输入的token IDs张量。
attention_mask (Optional[torch.FloatTensor]): 注意力掩码张量,用于指定哪些位置是填充的。
token_type_ids (Optional[torch.LongTensor]): 分段类型IDs张量,用于区分不同句子的位置。
position_ids (Optional[torch.LongTensor]): 位置IDs张量,用于指定输入token的绝对位置。
head_mask (Optional[torch.FloatTensor]): 多头注意力机制的掩码张量,用于指定哪些头部是无效的。
inputs_embeds (Optional[torch.FloatTensor]): 嵌入向量的输入张量。
labels (Optional[torch.LongTensor]): 标签张量,用于多选分类任务的真实标签。
output_attentions (Optional[bool]): 是否输出注意力权重。
output_hidden_states (Optional[bool]): 是否输出隐藏状态。
return_dict (Optional[bool]): 是否返回字典格式的输出。
Returns:
MultipleChoiceModelOutput: 包含模型输出的对象,包括分类预测和其他可选的输出(如注意力权重、隐藏状态)。
"""
pass
) -> Union[Tuple, MultipleChoiceModelOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
`input_ids` above)
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
inputs_embeds = (
inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
if inputs_embeds is not None
else None
)
outputs = self.canine(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
pooled_output = outputs[1]
pooled_output = self.dropout(pooled_output)
logits = self.classifier(pooled_output)
reshaped_logits = logits.view(-1, num_choices)
loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
loss = loss_fct(reshaped_logits, labels)
if not return_dict:
output = (reshaped_logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return MultipleChoiceModelOutput(
loss=loss,
logits=reshaped_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
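The key trick for multiple choice is the flatten/unflatten: inputs arrive as `(batch, num_choices, seq_len)`, are flattened to `(batch * num_choices, seq_len)` so the encoder sees ordinary sequences, and the per-choice scores (one logit each) are reshaped back to `(batch, num_choices)` before a cross-entropy over choices. A shape-only sketch in which the random `pooled_output` stands in for the encoder:

```python
import torch
from torch.nn import CrossEntropyLoss

batch_size, num_choices, seq_len, hidden_size = 2, 4, 6, 8
input_ids = torch.randint(0, 100, (batch_size, num_choices, seq_len))

flat_input_ids = input_ids.view(-1, input_ids.size(-1))        # (8, 6): one row per (example, choice)

# stand-in for the pooled encoder output and the single-logit classifier
pooled_output = torch.randn(batch_size * num_choices, hidden_size)
classifier = torch.nn.Linear(hidden_size, 1)
logits = classifier(pooled_output)                              # (8, 1)

reshaped_logits = logits.view(-1, num_choices)                  # (2, 4): one score per choice
labels = torch.tensor([1, 3])                                   # index of the correct choice
loss = CrossEntropyLoss()(reshaped_logits, labels)
print(flat_input_ids.shape, reshaped_logits.shape, loss.item())
```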
@add_start_docstrings(
"""
CANINE Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
Named-Entity-Recognition (NER) tasks.
""",
CANINE_START_DOCSTRING,
)
class CanineForTokenClassification(CaninePreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.canine = CanineModel(config)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.post_init()
@add_start_docstrings_to_model_forward(CANINE_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=TokenClassifierOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, TokenClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
Returns:
Depending on `return_dict`:
- If `return_dict=True`, returns a `TokenClassifierOutput` containing `loss`, `logits`, `hidden_states`, and `attentions`.
- If `return_dict=False`, returns a tuple with `logits` followed by additional outputs.
Example:
```
>>> from transformers import AutoTokenizer, CanineForTokenClassification
>>> import torch
>>> tokenizer = AutoTokenizer.from_pretrained("google/canine-s")
>>> model = CanineForTokenClassification.from_pretrained("google/canine-s")
>>> inputs = tokenizer(
... "HuggingFace is a company based in Paris and New York", add_special_tokens=False, return_tensors="pt"
... )
>>> with torch.no_grad():
... logits = model(**inputs).logits
>>> predicted_token_class_ids = logits.argmax(-1)
>>> # Note that tokens are classified rather then input words which means that
>>> # there might be more predicted token classes than words.
>>> # Multiple token classes might account for the same word
>>> predicted_tokens_classes = [model.config.id2label[t.item()] for t in predicted_token_class_ids[0]]
>>> predicted_tokens_classes # doctest: +SKIP
```
```
>>> labels = predicted_token_class_ids
>>> loss = model(**inputs, labels=labels).loss
>>> round(loss.item(), 2) # doctest: +SKIP
```"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.canine(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
sequence_output = self.dropout(sequence_output)
logits = self.classifier(sequence_output)
loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return TokenClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
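For token classification the logits are flattened to `(batch * seq_len, num_labels)` and compared against flattened labels; positions that should not contribute to the loss (e.g. padding) are conventionally labelled `-100`, which `CrossEntropyLoss` ignores by default. A toy sketch:

```python
import torch
from torch.nn import CrossEntropyLoss

batch_size, seq_len, num_labels = 2, 5, 3
logits = torch.randn(batch_size, seq_len, num_labels)
labels = torch.tensor([[0, 1, 2, -100, -100],      # -100 marks positions to ignore (default ignore_index)
                       [2, 2, 0, 1, -100]])

loss = CrossEntropyLoss()(logits.view(-1, num_labels), labels.view(-1))
print(loss.item())
```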
@add_start_docstrings(
"""
CANINE Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
CANINE_START_DOCSTRING,
)
class CanineForQuestionAnswering(CaninePreTrainedModel):
"""
CANINE模型,顶部带有用于提取式问答任务(如SQuAD)的跨度分类头部(在隐藏状态输出之上的线性层,用于计算`span start logits`和`span end logits`)。
继承自CaninePreTrainedModel。
"""
def __init__(self, config):
"""
初始化方法,设置模型参数和各层。
Args:
config (CanineConfig): 模型的配置对象,包含模型的各种参数。
"""
super().__init__(config)
self.num_labels = config.num_labels
self.canine = CanineModel(config)
self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
self.post_init()
@add_start_docstrings_to_model_forward(CANINE_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint="Splend1dchan/canine-c-squad",
output_type=QuestionAnsweringModelOutput,
config_class=_CONFIG_FOR_DOC,
expected_output="'nice puppet'",
expected_loss=8.81,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
start_positions: Optional[torch.LongTensor] = None,
end_positions: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
"""
前向传播方法,执行模型的前向计算。
Args:
input_ids (Optional[torch.LongTensor]): 输入token的ids。
attention_mask (Optional[torch.FloatTensor]): 注意力掩码,指示哪些tokens需要注意,哪些不需要。
token_type_ids (Optional[torch.LongTensor]): token类型ids,如segment ids。
position_ids (Optional[torch.LongTensor]): token位置ids。
head_mask (Optional[torch.FloatTensor]): 头部掩码,用于指定哪些层的注意力是有效的。
inputs_embeds (Optional[torch.FloatTensor]): 嵌入的输入。
start_positions (Optional[torch.LongTensor]): 答案起始位置的ids。
end_positions (Optional[torch.LongTensor]): 答案结束位置的ids。
output_attentions (Optional[bool]): 是否返回注意力权重。
output_hidden_states (Optional[bool]): 是否返回隐藏状态。
return_dict (Optional[bool]): 是否返回字典格式的输出。
Returns:
QuestionAnsweringModelOutput: 包含模型预测结果的输出对象。
"""
pass
) -> Union[Tuple, QuestionAnsweringModelOutput]:
r"""
start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.canine(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
logits = self.qa_outputs(sequence_output)
start_logits, end_logits = logits.split(1, dim=-1)
start_logits = start_logits.squeeze(-1)
end_logits = end_logits.squeeze(-1)
total_loss = None
if start_positions is not None and end_positions is not None:
if len(start_positions.size()) > 1:
start_positions = start_positions.squeeze(-1)
if len(end_positions.size()) > 1:
end_positions = end_positions.squeeze(-1)
ignored_index = start_logits.size(1)
start_positions.clamp_(0, ignored_index)
end_positions.clamp_(0, ignored_index)
loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
start_loss = loss_fct(start_logits, start_positions)
end_loss = loss_fct(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2
if not return_dict:
output = (start_logits, end_logits) + outputs[2:]
return ((total_loss,) + output) if total_loss is not None else output
return QuestionAnsweringModelOutput(
loss=total_loss,
start_logits=start_logits,
end_logits=end_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
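For question answering, a single `qa_outputs` projection yields two logits per position, split into start and end scores; gold positions are clamped to the sequence length and that clamp value doubles as `ignore_index`, so answers that fall outside the (possibly truncated) sequence do not contribute to the loss. A standalone sketch of that computation with toy tensors:

```python
import torch
from torch.nn import CrossEntropyLoss

batch_size, seq_len, hidden_size = 2, 8, 16
sequence_output = torch.randn(batch_size, seq_len, hidden_size)
qa_outputs = torch.nn.Linear(hidden_size, 2)

logits = qa_outputs(sequence_output)                 # (batch, seq_len, 2)
start_logits, end_logits = logits.split(1, dim=-1)
start_logits = start_logits.squeeze(-1)              # (batch, seq_len)
end_logits = end_logits.squeeze(-1)

start_positions = torch.tensor([3, 100])             # 100 lies outside the sequence
end_positions = torch.tensor([5, 120])

ignored_index = start_logits.size(1)                 # = seq_len
start_positions = start_positions.clamp(0, ignored_index)
end_positions = end_positions.clamp(0, ignored_index)

loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
start_loss = loss_fct(start_logits, start_positions)
end_loss = loss_fct(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2
print(total_loss.item())
```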