Transformers Source Code Analysis (9)
.\models\albert\configuration_albert.py
from collections import OrderedDict
from typing import Mapping
from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfig
ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"albert/albert-base-v1": "https://huggingface.co/albert/albert-base-v1/resolve/main/config.json",
"albert/albert-large-v1": "https://huggingface.co/albert/albert-large-v1/resolve/main/config.json",
"albert/albert-xlarge-v1": "https://huggingface.co/albert/albert-xlarge-v1/resolve/main/config.json",
"albert/albert-xxlarge-v1": "https://huggingface.co/albert/albert-xxlarge-v1/resolve/main/config.json",
"albert/albert-base-v2": "https://huggingface.co/albert/albert-base-v2/resolve/main/config.json",
"albert/albert-large-v2": "https://huggingface.co/albert/albert-large-v2/resolve/main/config.json",
"albert/albert-xlarge-v2": "https://huggingface.co/albert/albert-xlarge-v2/resolve/main/config.json",
"albert/albert-xxlarge-v2": "https://huggingface.co/albert/albert-xxlarge-v2/resolve/main/config.json",
}
class AlbertConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of an [`AlbertModel`] or a [`TFAlbertModel`]. It is used to
instantiate an ALBERT model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a configuration similar to that of the ALBERT [albert/albert-xxlarge-v2] architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation of [`PretrainedConfig`] for more information.
Examples:
```
>>> from transformers import AlbertConfig, AlbertModel
>>> # Initializing an ALBERT-xxlarge style configuration
>>> albert_xxlarge_configuration = AlbertConfig()
>>> # Initializing an ALBERT-base style configuration
>>> albert_base_configuration = AlbertConfig(
...     hidden_size=768,
...     num_attention_heads=12,
...     intermediate_size=3072,
... )
>>> # Initializing a model (with random weights) from the ALBERT-xxlarge style configuration
>>> model = AlbertModel(albert_xxlarge_configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "albert"
def __init__(
self,
vocab_size=30000,
embedding_size=128,
hidden_size=4096,
num_hidden_layers=12,
num_hidden_groups=1,
num_attention_heads=64,
intermediate_size=16384,
inner_group_num=1,
hidden_act="gelu_new",
hidden_dropout_prob=0,
attention_probs_dropout_prob=0,
max_position_embeddings=512,
type_vocab_size=2,
initializer_range=0.02,
layer_norm_eps=1e-12,
classifier_dropout_prob=0.1,
position_embedding_type="absolute",
pad_token_id=0,
bos_token_id=2,
eos_token_id=3,
**kwargs,
):
# Call the parent constructor, passing along the special token id arguments
super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
# Store the configuration attributes of the ALBERT model
self.vocab_size = vocab_size
self.embedding_size = embedding_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_hidden_groups = num_hidden_groups
self.num_attention_heads = num_attention_heads
self.inner_group_num = inner_group_num
self.hidden_act = hidden_act
self.intermediate_size = intermediate_size
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.type_vocab_size = type_vocab_size
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.classifier_dropout_prob = classifier_dropout_prob
self.position_embedding_type = position_embedding_type
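The separate `embedding_size` and `hidden_size` above are ALBERT's factorized embedding parameterization, which accounts for most of its parameter savings over BERT. A rough back-of-the-envelope sketch with the default values (plain arithmetic, not from the source, for illustration only):
```
# V x H embedding as in BERT vs. the factorized V x E + E x H used by ALBERT
V, E, H = 30000, 128, 4096            # AlbertConfig defaults above
bert_style = V * H                    # 122,880,000 embedding parameters
albert_style = V * E + E * H          # 3,840,000 + 524,288 = 4,364,288 parameters
```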
# Copied from transformers.models.bert.configuration_bert.BertOnnxConfig and adapted as AlbertOnnxConfig for the ALBERT configuration
class AlbertOnnxConfig(OnnxConfig):
# The inputs property returns a mapping describing the dynamic axes of each input tensor
@property
def inputs(self) -> Mapping[str, Mapping[int, str]]:
# The dynamic axes depend on the task
if self.task == "multiple-choice":
dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"}
else:
dynamic_axis = {0: "batch", 1: "sequence"}
# Return an ordered dict mapping each input tensor name to its dynamic axes
return OrderedDict(
[
("input_ids", dynamic_axis),  # dynamic axes of input_ids
("attention_mask", dynamic_axis),  # dynamic axes of attention_mask
("token_type_ids", dynamic_axis),  # dynamic axes of token_type_ids
]
)
]
)
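A minimal sketch of how the dynamic axes declared above could be inspected; it assumes the generic `OnnxConfig(config, task=...)` constructor inherited from `OnnxConfig`:
```
config = AlbertConfig()
onnx_config = AlbertOnnxConfig(config, task="default")
print(onnx_config.inputs)
# OrderedDict([('input_ids', {0: 'batch', 1: 'sequence'}),
#              ('attention_mask', {0: 'batch', 1: 'sequence'}),
#              ('token_type_ids', {0: 'batch', 1: 'sequence'})])
```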
.\models\albert\convert_albert_original_tf_checkpoint_to_pytorch.py
"""Convert ALBERT checkpoint."""
import argparse
import torch
from ...utils import logging
from . import AlbertConfig, AlbertForPreTraining, load_tf_weights_in_albert
logging.set_verbosity_info()
def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, albert_config_file, pytorch_dump_path):
config = AlbertConfig.from_json_file(albert_config_file)
print(f"Building PyTorch model from configuration: {config}")
model = AlbertForPreTraining(config)
load_tf_weights_in_albert(model, config, tf_checkpoint_path)
print(f"Save PyTorch model to {pytorch_dump_path}")
torch.save(model.state_dict(), pytorch_dump_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path."
)
parser.add_argument(
"--albert_config_file",
default=None,
type=str,
required=True,
help=(
"The config json file corresponding to the pre-trained ALBERT model. \n"
"This specifies the model architecture."
),
)
parser.add_argument(
"--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
)
args = parser.parse_args()
convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.albert_config_file, args.pytorch_dump_path)
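A hedged usage sketch of the conversion function above; the checkpoint, config, and output paths are hypothetical placeholders for a locally downloaded official ALBERT TF checkpoint:
```
# hypothetical local paths, adjust to wherever the TF checkpoint actually lives
convert_tf_checkpoint_to_pytorch(
    tf_checkpoint_path="./albert_base/model.ckpt-best",
    albert_config_file="./albert_base/albert_config.json",
    pytorch_dump_path="./albert_base/pytorch_model.bin",
)
```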
.\models\albert\modeling_albert.py
"""PyTorch ALBERT model."""
import math
import os
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple, Union
import torch
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN
from ...modeling_outputs import (
BaseModelOutput,
BaseModelOutputWithPooling,
MaskedLMOutput,
MultipleChoiceModelOutput,
QuestionAnsweringModelOutput,
SequenceClassifierOutput,
TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import (
apply_chunking_to_forward,
find_pruneable_heads_and_indices,
prune_linear_layer,
)
from ...utils import (
ModelOutput,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_albert import AlbertConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "albert/albert-base-v2"
_CONFIG_FOR_DOC = "AlbertConfig"
ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"albert/albert-base-v1",
"albert/albert-large-v1",
"albert/albert-xlarge-v1",
"albert/albert-xxlarge-v1",
"albert/albert-base-v2",
"albert/albert-large-v2",
"albert/albert-xlarge-v2",
"albert/albert-xxlarge-v2",
]
def load_tf_weights_in_albert(model, config, tf_checkpoint_path):
"""Load tf checkpoints in a pytorch model."""
try:
import re
import numpy as np
import tensorflow as tf
except ImportError:
logger.error(
"Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
"https://www.tensorflow.org/install/ for installation instructions."
)
raise
tf_path = os.path.abspath(tf_checkpoint_path)
logger.info(f"Converting TensorFlow checkpoint from {tf_path}")
init_vars = tf.train.list_variables(tf_path)
names = []
arrays = []
for name, shape in init_vars:
logger.info(f"Loading TF weight {name} with shape {shape}")
array = tf.train.load_variable(tf_path, name)
names.append(name)
arrays.append(array)
for name, array in zip(names, arrays):
print(name)
# ... (the remaining name-to-parameter mapping and weight assignment logic is omitted in this excerpt)
return model
"""
Construct the embeddings from word, position and token_type embeddings.
"""
def __init__(self, config: AlbertConfig):
super().__init__()
self.word_embeddings = nn.Embedding(config.vocab_size, config.embedding_size, padding_idx=config.pad_token_id)
self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.embedding_size)
self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.embedding_size)
self.LayerNorm = nn.LayerNorm(config.embedding_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
self.register_buffer(
"token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
past_key_values_length: int = 0,
) -> torch.Tensor:
if input_ids is not None:
input_shape = input_ids.size()
else:
input_shape = inputs_embeds.size()[:-1]
seq_length = input_shape[1]
if position_ids is None:
position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]
if token_type_ids is None:
if hasattr(self, "token_type_ids"):
buffered_token_type_ids = self.token_type_ids[:, :seq_length]
buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
token_type_ids = buffered_token_type_ids_expanded
else:
token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
if inputs_embeds is None:
inputs_embeds = self.word_embeddings(input_ids)
token_type_embeddings = self.token_type_embeddings(token_type_ids)
embeddings = inputs_embeds + token_type_embeddings
if self.position_embedding_type == "absolute":
position_embeddings = self.position_embeddings(position_ids)
embeddings += position_embeddings
embeddings = self.LayerNorm(embeddings)
embeddings = self.dropout(embeddings)
return embeddings
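A small sketch (with made-up shapes) of what the embedding module above produces; note that the output lives in `embedding_size`, not `hidden_size`, and is only projected up later by `AlbertTransformer.embedding_hidden_mapping_in`:
```
config = AlbertConfig()                      # embedding_size=128, hidden_size=4096 by default
embeddings = AlbertEmbeddings(config)
input_ids = torch.randint(0, config.vocab_size, (2, 10))
out = embeddings(input_ids=input_ids)
print(out.shape)                             # torch.Size([2, 10, 128])
```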
class AlbertAttention(nn.Module):
def __init__(self, config: AlbertConfig):
super().__init__()
if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
raise ValueError(
f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
f"heads ({config.num_attention_heads}"
)
self.num_attention_heads = config.num_attention_heads
self.hidden_size = config.hidden_size
self.attention_head_size = config.hidden_size // config.num_attention_heads
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.query = nn.Linear(config.hidden_size, self.all_head_size)
self.key = nn.Linear(config.hidden_size, self.all_head_size)
self.value = nn.Linear(config.hidden_size, self.all_head_size)
self.attention_dropout = nn.Dropout(config.attention_probs_dropout_prob)
self.output_dropout = nn.Dropout(config.hidden_dropout_prob)
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.pruned_heads = set()
self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
self.max_position_embeddings = config.max_position_embeddings
self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)
def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
x = x.view(new_x_shape)
return x.permute(0, 2, 1, 3)
def prune_heads(self, heads: List[int]) -> None:
if len(heads) == 0:
return
heads, index = find_pruneable_heads_and_indices(
heads, self.num_attention_heads, self.attention_head_size, self.pruned_heads
)
self.query = prune_linear_layer(self.query, index)
self.key = prune_linear_layer(self.key, index)
self.value = prune_linear_layer(self.value, index)
self.dense = prune_linear_layer(self.dense, index, dim=1)
self.num_attention_heads = self.num_attention_heads - len(heads)
self.all_head_size = self.attention_head_size * self.num_attention_heads
self.pruned_heads = self.pruned_heads.union(heads)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
output_attentions: bool = False,
):
...  # the self-attention computation body is omitted in this excerpt
class AlbertLayer(nn.Module):
def __init__(self, config: AlbertConfig):
super().__init__()
self.config = config
self.chunk_size_feed_forward = config.chunk_size_feed_forward
self.seq_len_dim = 1
self.full_layer_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.attention = AlbertAttention(config)
self.ffn = nn.Linear(config.hidden_size, config.intermediate_size)
self.ffn_output = nn.Linear(config.intermediate_size, config.hidden_size)
self.activation = ACT2FN[config.hidden_act]
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
output_attentions: bool = False,
output_hidden_states: bool = False,
) -> Tuple[torch.Tensor, torch.Tensor]:
attention_output = self.attention(hidden_states, attention_mask, head_mask, output_attentions)
ffn_output = apply_chunking_to_forward(
self.ff_chunk,
self.chunk_size_feed_forward,
self.seq_len_dim,
attention_output[0],
)
hidden_states = self.full_layer_layer_norm(ffn_output + attention_output[0])
return (hidden_states,) + attention_output[1:]
def ff_chunk(self, attention_output: torch.Tensor) -> torch.Tensor:
ffn_output = self.ffn(attention_output)
ffn_output = self.activation(ffn_output)
ffn_output = self.ffn_output(ffn_output)
return ffn_output
class AlbertLayerGroup(nn.Module):
def __init__(self, config: AlbertConfig):
super().__init__()
self.albert_layers = nn.ModuleList([AlbertLayer(config) for _ in range(config.inner_group_num)])
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
output_attentions: bool = False,
output_hidden_states: bool = False,
) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]], ...]:
layer_hidden_states = ()
layer_attentions = ()
for layer_index, albert_layer in enumerate(self.albert_layers):
layer_output = albert_layer(hidden_states, attention_mask, head_mask[layer_index], output_attentions)
hidden_states = layer_output[0]
if output_attentions:
layer_attentions = layer_attentions + (layer_output[1],)
if output_hidden_states:
layer_hidden_states = layer_hidden_states + (hidden_states,)
outputs = (hidden_states,)
if output_hidden_states:
outputs = outputs + (layer_hidden_states,)
if output_attentions:
outputs = outputs + (layer_attentions,)
return outputs
class AlbertTransformer(nn.Module):
def __init__(self, config: AlbertConfig):
super().__init__()
self.config = config
self.embedding_hidden_mapping_in = nn.Linear(config.embedding_size, config.hidden_size)
self.albert_layer_groups = nn.ModuleList([AlbertLayerGroup(config) for _ in range(config.num_hidden_groups)])
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
) -> Union[BaseModelOutput, Tuple]:
hidden_states = self.embedding_hidden_mapping_in(hidden_states)
all_hidden_states = (hidden_states,) if output_hidden_states else None
all_attentions = () if output_attentions else None
head_mask = [None] * self.config.num_hidden_layers if head_mask is None else head_mask
for i in range(self.config.num_hidden_layers):
layers_per_group = int(self.config.num_hidden_layers / self.config.num_hidden_groups)
group_idx = int(i / (self.config.num_hidden_layers / self.config.num_hidden_groups))
layer_group_output = self.albert_layer_groups[group_idx](
hidden_states,
attention_mask,
head_mask[group_idx * layers_per_group : (group_idx + 1) * layers_per_group],
output_attentions,
output_hidden_states,
)
hidden_states = layer_group_output[0]
if output_attentions:
all_attentions = all_attentions + layer_group_output[-1]
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None)
return BaseModelOutput(
last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions
)
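The loop above reuses the same `AlbertLayerGroup` for several consecutive layers, which is ALBERT's cross-layer parameter sharing. A short sketch of the index arithmetic, assuming 12 hidden layers and 2 hidden groups purely for illustration:
```
num_hidden_layers, num_hidden_groups = 12, 2
layers_per_group = num_hidden_layers // num_hidden_groups   # 6
for i in range(num_hidden_layers):
    group_idx = i // layers_per_group                       # 0 for layers 0-5, 1 for layers 6-11
    print(i, "->", group_idx)
```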
class AlbertPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = AlbertConfig
load_tf_weights = load_tf_weights_in_albert
base_model_prefix = "albert"
def _init_weights(self, module):
"""Initialize the weights."""
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
@dataclass
class AlbertForPreTrainingOutput(ModelOutput):
"""
Output type of [`AlbertForPreTraining`].
Args:
loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
Total loss as the sum of the masked language modeling loss and the next sequence prediction
(classification) loss.
prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
sop_logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
before SoftMax).
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
loss: Optional[torch.FloatTensor] = None
prediction_logits: torch.FloatTensor = None
sop_logits: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
ALBERT_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Args:
config ([`AlbertConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
ALBERT_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `({0})`):
Indices of input sequence tokens in the vocabulary.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and
[`PreTrainedTokenizer.encode`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
1]`:
- 0 corresponds to a *sentence A* token,
- 1 corresponds to a *sentence B* token.
[What are token type IDs?](../glossary#token-type-ids)
position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
config.max_position_embeddings - 1]`.
[What are position IDs?](../glossary#position-ids)
head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
model's internal embedding lookup matrix.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
The AlbertModel class defines the bare ALBERT transformer model used to encode text.
@add_start_docstrings(
"The bare ALBERT Model transformer outputting raw hidden-states without any specific head on top.",
ALBERT_START_DOCSTRING,
)
class AlbertModel(AlbertPreTrainedModel):
config_class = AlbertConfig
base_model_prefix = "albert"
def __init__(self, config: AlbertConfig, add_pooling_layer: bool = True):
super().__init__(config)
self.config = config
self.embeddings = AlbertEmbeddings(config)  # the ALBERT embedding layer
self.encoder = AlbertTransformer(config)  # the ALBERT transformer encoder
if add_pooling_layer:
self.pooler = nn.Linear(config.hidden_size, config.hidden_size)  # linear projection of the pooling layer
self.pooler_activation = nn.Tanh()  # Tanh activation of the pooling layer
else:
self.pooler = None
self.pooler_activation = None
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self) -> nn.Embedding:
return self.embeddings.word_embeddings  # return the word embeddings of the embedding layer
def set_input_embeddings(self, value: nn.Embedding) -> None:
self.embeddings.word_embeddings = value  # replace the word embeddings with the given value
def _prune_heads(self, heads_to_prune: Dict[int, List[int]]) -> None:
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} ALBERT has
a different architecture in that its layers are shared across groups, which then has inner groups. If an ALBERT
model has 12 hidden layers and 2 hidden groups, with two inner groups, there is a total of 4 different layers.
These layers are flattened: the indices [0,1] correspond to the two inner groups of the first hidden layer,
while [2,3] correspond to the two inner groups of the second hidden layer.
Any layer with an index other than [0,1,2,3] will result in an error. See base class PreTrainedModel for more
information about head pruning
"""
for layer, heads in heads_to_prune.items():
group_idx = int(layer / self.config.inner_group_num)
inner_group_idx = int(layer - group_idx * self.config.inner_group_num)
# Prune the attention heads of the specified layer
self.encoder.albert_layer_groups[group_idx].albert_layers[inner_group_idx].attention.prune_heads(heads)
@add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPooling,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[BaseModelOutputWithPooling, Tuple]:
# Fall back to the config value when output_attentions is not explicitly set
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
# Fall back to the config value when output_hidden_states is not explicitly set
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
# Fall back to the config value when return_dict is not explicitly set
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Validate the inputs: input_ids and inputs_embeds must not be given together
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
elif input_ids is not None:
# When input_ids is given, warn if padding tokens are present without an attention mask
self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
input_shape = input_ids.size()
elif inputs_embeds is not None:
# When inputs_embeds is given, take its shape without the last (hidden) dimension
input_shape = inputs_embeds.size()[:-1]
else:
# Neither input_ids nor inputs_embeds was given
raise ValueError("You have to specify either input_ids or inputs_embeds")
# Unpack the batch size and sequence length
batch_size, seq_length = input_shape
# Run on the same device as the inputs (GPU or CPU)
device = input_ids.device if input_ids is not None else inputs_embeds.device
# Default to an all-ones attention mask when none is provided
if attention_mask is None:
attention_mask = torch.ones(input_shape, device=device)
# Build token_type_ids from the registered buffer when none is provided
if token_type_ids is None:
if hasattr(self.embeddings, "token_type_ids"):
buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length]
buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length)
token_type_ids = buffered_token_type_ids_expanded
else:
token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
# Broadcast the attention mask to the shape expected by the encoder
extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
extended_attention_mask = extended_attention_mask.to(dtype=self.dtype)  # fp16 compatibility
# Masked positions get a large negative value so they vanish after the softmax
extended_attention_mask = (1.0 - extended_attention_mask) * torch.finfo(self.dtype).min
# Prepare the head mask deciding which attention heads are disabled
head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
# Compute the embedding output
embedding_output = self.embeddings(
input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds
)
# Run the encoder on the embedding output
encoder_outputs = self.encoder(
embedding_output,
extended_attention_mask,
head_mask=head_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# The sequence output is the first element of the encoder outputs
sequence_output = encoder_outputs[0]
# If a pooler is present, pool the first token of the sequence output and apply the activation
pooled_output = self.pooler_activation(self.pooler(sequence_output[:, 0])) if self.pooler is not None else None
# Return a plain tuple when return_dict is disabled
if not return_dict:
return (sequence_output, pooled_output) + encoder_outputs[1:]
# Otherwise return a BaseModelOutputWithPooling with sequence output, pooled output, hidden states and attentions
return BaseModelOutputWithPooling(
last_hidden_state=sequence_output,
pooler_output=pooled_output,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
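A minimal usage sketch of the bare model defined above, using a small randomly initialized configuration rather than a pretrained checkpoint:
```
config = AlbertConfig(hidden_size=768, num_attention_heads=12, intermediate_size=3072)
model = AlbertModel(config)
input_ids = torch.randint(0, config.vocab_size, (1, 8))
outputs = model(input_ids=input_ids)
print(outputs.last_hidden_state.shape)   # torch.Size([1, 8, 768])
print(outputs.pooler_output.shape)       # torch.Size([1, 768])
```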
"""
Albert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a
`sentence order prediction (classification)` head.
"""
# Note: AlbertForPreTraining combines the base AlbertModel defined above with the two pretraining heads
# (AlbertMLMHead and AlbertSOPHead) defined further down in this file.
@add_start_docstrings(
"""
Albert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a
`sentence order prediction (classification)` head.
""",
ALBERT_START_DOCSTRING,
)
class AlbertForPreTraining(AlbertPreTrainedModel):
_tied_weights_keys = ["predictions.decoder.bias", "predictions.decoder.weight"]
def __init__(self, config: AlbertConfig):
super().__init__(config)
# Instantiate the base ALBERT model and the two pretraining heads
self.albert = AlbertModel(config)
self.predictions = AlbertMLMHead(config)
self.sop_classifier = AlbertSOPHead(config)
# Initialize weights and apply final processing
self.post_init()
def get_output_embeddings(self) -> nn.Linear:
return self.predictions.decoder
def set_output_embeddings(self, new_embeddings: nn.Linear) -> None:
self.predictions.decoder = new_embeddings
def get_input_embeddings(self) -> nn.Embedding:
return self.albert.embeddings.word_embeddings
@add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=AlbertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
sentence_order_label: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
...  # the forward body (computing the MLM and SOP predictions and losses) is omitted in this excerpt
class AlbertMLMHead(nn.Module):
def __init__(self, config: AlbertConfig):
super().__init__()
# Initialize the components of the MLM head
self.LayerNorm = nn.LayerNorm(config.embedding_size, eps=config.layer_norm_eps)
self.bias = nn.Parameter(torch.zeros(config.vocab_size))
self.dense = nn.Linear(config.hidden_size, config.embedding_size)
self.decoder = nn.Linear(config.embedding_size, config.vocab_size)
self.activation = ACT2FN[config.hidden_act]
self.decoder.bias = self.bias
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
# Forward pass of the MLM head
hidden_states = self.dense(hidden_states)
hidden_states = self.activation(hidden_states)
hidden_states = self.LayerNorm(hidden_states)
hidden_states = self.decoder(hidden_states)
prediction_scores = hidden_states
return prediction_scores
def _tie_weights(self) -> None:
# If the two weights get disconnected (on TPU or when the bias is resized), tie them together again
self.bias = self.decoder.bias
class AlbertSOPHead(nn.Module):
def __init__(self, config: AlbertConfig):
super().__init__()
# Dropout with the classifier_dropout_prob from the config
self.dropout = nn.Dropout(config.classifier_dropout_prob)
# Linear classifier mapping the pooled hidden state to num_labels scores
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
def forward(self, pooled_output: torch.Tensor) -> torch.Tensor:
# Apply dropout to the pooled output, then the classifier to obtain the SOP logits
dropout_pooled_output = self.dropout(pooled_output)
logits = self.classifier(dropout_pooled_output)
return logits
# Add the class docstring describing the ALBERT model with a language modeling head on top
@add_start_docstrings(
"Albert Model with a `language modeling` head on top.",
ALBERT_START_DOCSTRING,
)
class AlbertForMaskedLM(AlbertPreTrainedModel):
# Keys of the weights that are tied to the input embeddings
_tied_weights_keys = ["predictions.decoder.bias", "predictions.decoder.weight"]
def __init__(self, config):
super().__init__(config)
# Base ALBERT model without the pooling layer
self.albert = AlbertModel(config, add_pooling_layer=False)
# MLM head used for the predictions
self.predictions = AlbertMLMHead(config)
# Initialize weights and apply final processing
self.post_init()
# Return the decoder of the prediction head as the output embeddings
def get_output_embeddings(self) -> nn.Linear:
return self.predictions.decoder
# Replace the decoder of the prediction head with the given linear layer
def set_output_embeddings(self, new_embeddings: nn.Linear) -> None:
self.predictions.decoder = new_embeddings
# Return the word embeddings of the base ALBERT model as the input embeddings
def get_input_embeddings(self) -> nn.Embedding:
return self.albert.embeddings.word_embeddings
# Forward pass returning the masked language modeling output
@add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=MaskedLMOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[MaskedLMOutput, Tuple]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
"""
# Fall back to the config value when return_dict is not explicitly set
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Run the base ALBERT model
outputs = self.albert(
input_ids=input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# Take the sequence output from the model outputs
sequence_outputs = outputs[0]
# Compute the prediction scores over the vocabulary
prediction_scores = self.predictions(sequence_outputs)
masked_lm_loss = None
# Compute the masked language modeling loss when labels are provided
if labels is not None:
loss_fct = CrossEntropyLoss()
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
# When return_dict is False, return a plain tuple with the prediction scores and the remaining outputs
if not return_dict:
output = (prediction_scores,) + outputs[2:]
return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
# Otherwise return a MaskedLMOutput with loss, logits, hidden states and attentions
return MaskedLMOutput(
loss=masked_lm_loss,
logits=prediction_scores,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
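A hedged usage sketch of the masked LM head above, assuming the `albert/albert-base-v2` checkpoint and its tokenizer can be downloaded or are available locally:
```
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("albert/albert-base-v2")
mlm_model = AlbertForMaskedLM.from_pretrained("albert/albert-base-v2")
inputs = tokenizer("The capital of France is [MASK].", return_tensors="pt")
with torch.no_grad():
    logits = mlm_model(**inputs).logits
# position of the [MASK] token and the most likely vocabulary entry at that position
mask_pos = (inputs.input_ids == tokenizer.mask_token_id).nonzero(as_tuple=True)[1]
print(tokenizer.decode(logits[0, mask_pos].argmax(-1)))
```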
# ALBERT model transformer with a sequence classification/regression head on top (a linear layer on top of the
# pooled output), e.g. for GLUE tasks.
@add_start_docstrings(
"""
Albert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
output) e.g. for GLUE tasks.
""",
ALBERT_START_DOCSTRING,  # the generic ALBERT docstring
)
class AlbertForSequenceClassification(AlbertPreTrainedModel):
def __init__(self, config: AlbertConfig):
super().__init__(config)
self.num_labels = config.num_labels  # number of labels from the config
self.config = config
self.albert = AlbertModel(config)  # base ALBERT model
self.dropout = nn.Dropout(config.classifier_dropout_prob)  # classifier dropout
self.classifier = nn.Linear(config.hidden_size, self.config.num_labels)  # classification head
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint="textattack/albert-base-v2-imdb",  # checkpoint used for the code sample
output_type=SequenceClassifierOutput,  # expected output type
config_class=_CONFIG_FOR_DOC,  # config class used in the docs
expected_output="'LABEL_1'",  # expected output of the sample
expected_loss=0.12,  # expected loss of the sample
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[SequenceClassifierOutput, Tuple]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
# Fall back to the config value when return_dict is not explicitly set
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Run the base ALBERT model
outputs = self.albert(
input_ids=input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# Take the pooled output of the ALBERT model
pooled_output = outputs[1]
# Apply dropout to the pooled output
pooled_output = self.dropout(pooled_output)
# Classify the pooled output to obtain the logits
logits = self.classifier(pooled_output)
loss = None
if labels is not None:
# Infer the problem type from the label dtype and the number of labels when it is not set
if self.config.problem_type is None:
if self.num_labels == 1:
self.config.problem_type = "regression"  # a single label means regression
elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
self.config.problem_type = "single_label_classification"  # integer labels mean single-label classification
else:
self.config.problem_type = "multi_label_classification"  # otherwise multi-label classification
# Pick the loss function matching the problem type
if self.config.problem_type == "regression":
loss_fct = MSELoss()  # mean squared error
if self.num_labels == 1:
loss = loss_fct(logits.squeeze(), labels.squeeze())
else:
loss = loss_fct(logits, labels)
elif self.config.problem_type == "single_label_classification":
loss_fct = CrossEntropyLoss()  # cross entropy
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
loss_fct = BCEWithLogitsLoss()  # binary cross entropy with logits
loss = loss_fct(logits, labels)
# When return_dict is False, return a plain tuple
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
# Otherwise return a SequenceClassifierOutput
return SequenceClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
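A small sketch (with made-up tensors) of the three `problem_type` branches above:
```
logits = torch.randn(4, 3)                               # batch_size=4, num_labels=3
labels = torch.tensor([0, 2, 1, 1])
single_label_loss = CrossEntropyLoss()(logits.view(-1, 3), labels.view(-1))

multi_labels = torch.randint(0, 2, (4, 3)).float()       # multi-hot targets
multi_label_loss = BCEWithLogitsLoss()(logits, multi_labels)

reg_logits = torch.randn(4, 1)                           # num_labels == 1 -> regression
reg_labels = torch.randn(4)
regression_loss = MSELoss()(reg_logits.squeeze(), reg_labels.squeeze())
```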
"""
Albert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
Named-Entity-Recognition (NER) tasks.
""" # 描述 Albert 模型,在隐藏状态输出之上添加了一个用于标记分类(例如命名实体识别)的线性层的头部。
# 使用 ALBERT_START_DOCSTRING 和额外提供的描述来为类添加文档字符串
@add_start_docstrings(
ALBERT_START_DOCSTRING,
"""
Albert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
Named-Entity-Recognition (NER) tasks.
"""
)
class AlbertForTokenClassification(AlbertPreTrainedModel):
def __init__(self, config: AlbertConfig):
super().__init__(config)
self.num_labels = config.num_labels
# Base ALBERT model without the pooling layer
self.albert = AlbertModel(config, add_pooling_layer=False)
# Use classifier_dropout_prob when set, otherwise fall back to hidden_dropout_prob
classifier_dropout_prob = (
config.classifier_dropout_prob
if config.classifier_dropout_prob is not None
else config.hidden_dropout_prob
)
self.dropout = nn.Dropout(classifier_dropout_prob)
# Linear layer mapping the hidden states to the label space
self.classifier = nn.Linear(config.hidden_size, self.config.num_labels)
# Initialize weights and apply final processing
self.post_init()
# Document the expected inputs and the code sample of the forward method
@add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TokenClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[TokenClassifierOutput, Tuple]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
# Fall back to the config value when return_dict is not explicitly set
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Run the base ALBERT model
outputs = self.albert(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# Take the sequence output from the model outputs
sequence_output = outputs[0]
# Apply dropout to the sequence output
sequence_output = self.dropout(sequence_output)
# Classify each token to obtain the logits
logits = self.classifier(sequence_output)
loss = None
# Compute the cross-entropy loss when labels are provided
if labels is not None:
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
# When return_dict is False, return a plain tuple
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
# Otherwise return a TokenClassifierOutput with loss, logits, hidden states and attentions
return TokenClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
"""
Albert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
"""
@add_start_docstrings(
"""
Albert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
ALBERT_START_DOCSTRING,  # the generic ALBERT docstring
)
class AlbertForQuestionAnswering(AlbertPreTrainedModel):
def __init__(self, config: AlbertConfig):
super().__init__(config)  # call the parent constructor
self.num_labels = config.num_labels  # number of labels
self.albert = AlbertModel(config, add_pooling_layer=False)  # base ALBERT model without the pooling layer
self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)  # linear layer producing the span logits
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint="twmkn9/albert-base-v2-squad2",
output_type=QuestionAnsweringModelOutput,
config_class=_CONFIG_FOR_DOC,
qa_target_start_index=12,
qa_target_end_index=13,
expected_output="'a nice puppet'",
expected_loss=7.36,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
start_positions: Optional[torch.LongTensor] = None,
end_positions: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[AlbertForPreTrainingOutput, Tuple]:
r"""
start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
"""
# Determine whether to use the return_dict from function arguments or default configuration
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Pass inputs to the Albert model and obtain outputs
outputs = self.albert(
input_ids=input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# Extract sequence output from the model outputs
sequence_output = outputs[0]
# Generate logits by passing sequence output through the QA output layer
logits: torch.Tensor = self.qa_outputs(sequence_output)
# Split logits into start and end logits
start_logits, end_logits = logits.split(1, dim=-1)
start_logits = start_logits.squeeze(-1).contiguous()
end_logits = end_logits.squeeze(-1).contiguous()
total_loss = None
if start_positions is not None and end_positions is not None:
# Ensure start_positions and end_positions are properly shaped for processing
if len(start_positions.size()) > 1:
start_positions = start_positions.squeeze(-1)
if len(end_positions.size()) > 1:
end_positions = end_positions.squeeze(-1)
# Clamp positions within valid range
ignored_index = start_logits.size(1)
start_positions = start_positions.clamp(0, ignored_index)
end_positions = end_positions.clamp(0, ignored_index)
# Define loss function and compute start_loss and end_loss
loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
start_loss = loss_fct(start_logits, start_positions)
end_loss = loss_fct(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2
# If return_dict is False, return output as tuple
if not return_dict:
output = (start_logits, end_logits) + outputs[2:]
return ((total_loss,) + output) if total_loss is not None else output
# If return_dict is True, return structured output
return QuestionAnsweringModelOutput(
loss=total_loss,
start_logits=start_logits,
end_logits=end_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
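A hedged sketch of decoding an answer span from the start/end logits above, assuming the `twmkn9/albert-base-v2-squad2` checkpoint referenced in the docstring sample is available:
```
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("twmkn9/albert-base-v2-squad2")
qa_model = AlbertForQuestionAnswering.from_pretrained("twmkn9/albert-base-v2-squad2")
question, context = "Who was Jim Henson?", "Jim Henson was a nice puppet"
inputs = tokenizer(question, context, return_tensors="pt")
with torch.no_grad():
    qa_outputs = qa_model(**inputs)
# greedy span decoding: take the argmax of the start and end logits
start = qa_outputs.start_logits.argmax(-1).item()
end = qa_outputs.end_logits.argmax(-1).item()
print(tokenizer.decode(inputs.input_ids[0, start : end + 1]))
```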
# ALBERT model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
# softmax), e.g. for RocStories/SWAG tasks.
@add_start_docstrings(
"""
Albert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
softmax) e.g. for RocStories/SWAG tasks.
""",
ALBERT_START_DOCSTRING,  # the generic ALBERT docstring
)
class AlbertForMultipleChoice(AlbertPreTrainedModel):
def __init__(self, config: AlbertConfig):
super().__init__(config)
# Base ALBERT model
self.albert = AlbertModel(config)
# Dropout with the classifier_dropout_prob from the config
self.dropout = nn.Dropout(config.classifier_dropout_prob)
# Linear classifier producing a single score per choice
self.classifier = nn.Linear(config.hidden_size, 1)
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=MultipleChoiceModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
# forward pass returning the multiple choice predictions
) -> Union[AlbertForPreTrainingOutput, Tuple]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
num_choices-1]` where *num_choices* is the size of the second dimension of the input tensors. (see
*input_ids* above)
"""
# Fall back to the config value when return_dict is not explicitly set
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# The number of choices is the second dimension of the inputs
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
# Flatten (batch_size, num_choices, sequence_length) inputs to (batch_size * num_choices, sequence_length)
input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
# inputs_embeds is flattened to (batch_size * num_choices, sequence_length, hidden_size)
inputs_embeds = (
inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
if inputs_embeds is not None
else None
)
# Run the base ALBERT model on the flattened inputs
outputs = self.albert(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# Take the pooled output
pooled_output = outputs[1]
# Apply dropout to the pooled output
pooled_output = self.dropout(pooled_output)
# Classify the pooled output to obtain one logit per choice
logits: torch.Tensor = self.classifier(pooled_output)
# Reshape the logits back to (batch_size, num_choices)
reshaped_logits = logits.view(-1, num_choices)
loss = None
# Compute the cross-entropy loss when labels are provided
if labels is not None:
loss_fct = CrossEntropyLoss()
loss = loss_fct(reshaped_logits, labels)
# When return_dict is False, return a plain tuple
if not return_dict:
output = (reshaped_logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
# Otherwise return a MultipleChoiceModelOutput
return MultipleChoiceModelOutput(
loss=loss,
logits=reshaped_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
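A shape-only sketch (made-up sizes) of the flatten/reshape dance performed above for multiple choice inputs:
```
batch_size, num_choices, seq_len = 2, 4, 16
input_ids = torch.randint(0, 30000, (batch_size, num_choices, seq_len))
flat_ids = input_ids.view(-1, input_ids.size(-1))    # (8, 16): every choice is encoded independently
logits = torch.randn(batch_size * num_choices, 1)    # one score per (example, choice) pair
reshaped_logits = logits.view(-1, num_choices)       # (2, 4): scores regrouped per example for the loss
```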
.\models\albert\modeling_flax_albert.py
from typing import Callable, Optional, Tuple
import flax
import flax.linen as nn
import jax
import jax.numpy as jnp
import numpy as np
from flax.core.frozen_dict import FrozenDict, freeze, unfreeze
from flax.linen.attention import dot_product_attention_weights
from flax.traverse_util import flatten_dict, unflatten_dict
from jax import lax
from ...modeling_flax_outputs import (
FlaxBaseModelOutput,
FlaxBaseModelOutputWithPooling,
FlaxMaskedLMOutput,
FlaxMultipleChoiceModelOutput,
FlaxQuestionAnsweringModelOutput,
FlaxSequenceClassifierOutput,
FlaxTokenClassifierOutput,
)
from ...modeling_flax_utils import (
ACT2FN,
FlaxPreTrainedModel,
append_call_sample_docstring,
append_replace_return_docstrings,
overwrite_call_docstring,
)
from ...utils import ModelOutput, add_start_docstrings, add_start_docstrings_to_model_forward, logging
from .configuration_albert import AlbertConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "albert/albert-base-v2"
_CONFIG_FOR_DOC = "AlbertConfig"
@flax.struct.dataclass
class FlaxAlbertForPreTrainingOutput(ModelOutput):
"""
Output type of [`FlaxAlbertForPreTraining`].
用于 [`FlaxAlbertForPreTraining`] 的输出类型。
"""
Args:
prediction_logits (`jnp.ndarray` of shape `(batch_size, sequence_length, config.vocab_size)`):
语言建模头部的预测分数(在 SoftMax 之前的每个词汇标记的分数)。
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
sop_logits (`jnp.ndarray` of shape `(batch_size, 2)`):
下一个序列预测(分类)头部的预测分数(在 SoftMax 之前的 True/False 连续性的分数)。
Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax).
hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
模型隐藏状态的元组(jnp.ndarray),形状为 `(batch_size, sequence_length, hidden_size)`。
每个层的输出和初始嵌入输出的隐藏状态。
Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape
`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
模型自注意力层的注意力权重的元组(jnp.ndarray),形状为 `(batch_size, num_heads, sequence_length, sequence_length)`。
在注意力 SoftMax 之后的注意力权重,用于计算自注意力头部中的加权平均值。
Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
prediction_logits: jnp.ndarray = None
sop_logits: jnp.ndarray = None
hidden_states: Optional[Tuple[jnp.ndarray]] = None
attentions: Optional[Tuple[jnp.ndarray]] = None
# ALBERT_START_DOCSTRING is a raw string constant holding the generic model docstring
ALBERT_START_DOCSTRING = r"""
This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading, saving and converting weights from PyTorch models)
This model is also a
[flax.linen.Module](https://flax.readthedocs.io/en/latest/api_reference/flax.linen/module.html) subclass. Use it as
a regular Flax linen Module and refer to the Flax documentation for all matter related to general usage and
behavior.
Finally, this model supports inherent JAX features such as:
- [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
- [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
- [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
- [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)
Parameters:
config ([`AlbertConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights.
dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and
`jax.numpy.bfloat16` (on TPUs).
This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If
specified all the computation will be performed with the given `dtype`.
**Note that this only specifies the dtype of the computation and does not influence the dtype of model
parameters.**
If you wish to change the dtype of the model parameters, see [`~FlaxPreTrainedModel.to_fp16`] and
[`~FlaxPreTrainedModel.to_bf16`].
"""
# ALBERT_INPUTS_DOCSTRING is a raw string constant describing the model inputs (the per-argument descriptions are stripped in this excerpt)
ALBERT_INPUTS_DOCSTRING = r"""
Args:
input_ids (`numpy.ndarray` of shape `({0})`):
attention_mask (`numpy.ndarray` of shape `({0})`, *optional*):
token_type_ids (`numpy.ndarray` of shape `({0})`, *optional*):
position_ids (`numpy.ndarray` of shape `({0})`, *optional*):
return_dict (`bool`, *optional*):
"""
class FlaxAlbertEmbeddings(nn.Module):
"""Construct the embeddings from word, position and token_type embeddings."""
config: AlbertConfig # 定义配置对象的类型
dtype: jnp.dtype = jnp.float32 # 计算过程中使用的数据类型
def setup(self):
# 初始化词嵌入层,使用正态分布初始化器
self.word_embeddings = nn.Embed(
self.config.vocab_size,
self.config.embedding_size,
embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
)
# 初始化位置嵌入层,使用正态分布初始化器
self.position_embeddings = nn.Embed(
self.config.max_position_embeddings,
self.config.embedding_size,
embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
)
# 初始化标记类型嵌入层,使用正态分布初始化器
self.token_type_embeddings = nn.Embed(
self.config.type_vocab_size,
self.config.embedding_size,
embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
)
# 初始化层归一化层,使用给定的 epsilon 和数据类型
self.LayerNorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype)
# 初始化 dropout 层,使用给定的 dropout 概率
self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob)
def __call__(self, input_ids, token_type_ids, position_ids, deterministic: bool = True):
# 嵌入输入 ID,转换为指定数据类型的张量
inputs_embeds = self.word_embeddings(input_ids.astype("i4"))
# 嵌入位置 ID,转换为指定数据类型的张量
position_embeds = self.position_embeddings(position_ids.astype("i4"))
# 嵌入标记类型 ID,转换为指定数据类型的张量
token_type_embeddings = self.token_type_embeddings(token_type_ids.astype("i4"))
# 将所有嵌入相加
hidden_states = inputs_embeds + token_type_embeddings + position_embeds
# 应用层归一化
hidden_states = self.LayerNorm(hidden_states)
# 应用 dropout
hidden_states = self.dropout(hidden_states, deterministic=deterministic)
return hidden_states
class FlaxAlbertSelfAttention(nn.Module):
config: AlbertConfig # 定义配置对象的类型
dtype: jnp.dtype = jnp.float32 # 计算过程中使用的数据类型
# 在设置阶段验证隐藏层大小是否可以被注意力头数整除
def setup(self):
if self.config.hidden_size % self.config.num_attention_heads != 0:
raise ValueError(
"`config.hidden_size`: {self.config.hidden_size} has to be a multiple of `config.num_attention_heads` "
" : {self.config.num_attention_heads}"
)
# 创建用于查询的全连接层,输入大小为隐藏层大小,使用指定的数据类型和正态分布初始化
self.query = nn.Dense(
self.config.hidden_size,
dtype=self.dtype,
kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
)
# 创建用于键的全连接层,输入大小为隐藏层大小,使用指定的数据类型和正态分布初始化
self.key = nn.Dense(
self.config.hidden_size,
dtype=self.dtype,
kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
)
# 创建用于值的全连接层,输入大小为隐藏层大小,使用指定的数据类型和正态分布初始化
self.value = nn.Dense(
self.config.hidden_size,
dtype=self.dtype,
kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
)
# 创建用于最终输出的全连接层,输入大小为隐藏层大小,使用指定的正态分布初始化
self.dense = nn.Dense(
self.config.hidden_size,
kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
dtype=self.dtype,
)
# 创建层归一化对象,使用指定的 epsilon 值和数据类型
self.LayerNorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype)
# 创建用于dropout的对象,指定丢弃率
self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob)
# 定义一个调用方法,接受隐藏状态、注意力掩码等参数,返回注意力层的输出
def __call__(self, hidden_states, attention_mask, deterministic=True, output_attentions: bool = False):
# 计算每个注意力头的维度
head_dim = self.config.hidden_size // self.config.num_attention_heads
# 使用 query 网络处理隐藏状态,然后重塑为 (batch_size, seq_length, num_attention_heads, head_dim)
query_states = self.query(hidden_states).reshape(
hidden_states.shape[:2] + (self.config.num_attention_heads, head_dim)
)
# 使用 value 网络处理隐藏状态,然后重塑为 (batch_size, seq_length, num_attention_heads, head_dim)
value_states = self.value(hidden_states).reshape(
hidden_states.shape[:2] + (self.config.num_attention_heads, head_dim)
)
# 使用 key 网络处理隐藏状态,然后重塑为 (batch_size, seq_length, num_attention_heads, head_dim)
key_states = self.key(hidden_states).reshape(
hidden_states.shape[:2] + (self.config.num_attention_heads, head_dim)
)
# 将布尔类型的注意力掩码转换为注意力偏置
if attention_mask is not None:
# 将注意力掩码扩展维度以匹配 query 和 key 张量的维度
attention_mask = jnp.expand_dims(attention_mask, axis=(-3, -2))
# 根据注意力掩码的值生成注意力偏置,使用 lax.select 来根据条件选择不同的值
attention_bias = lax.select(
attention_mask > 0,
jnp.full(attention_mask.shape, 0.0).astype(self.dtype),
jnp.full(attention_mask.shape, jnp.finfo(self.dtype).min).astype(self.dtype),
)
else:
attention_bias = None
# 初始化 dropout RNG
dropout_rng = None
# 如果不是确定性计算并且设置了注意力概率的 dropout,则生成一个 dropout RNG
if not deterministic and self.config.attention_probs_dropout_prob > 0.0:
dropout_rng = self.make_rng("dropout")
# 计算注意力权重,使用 dot_product_attention_weights 函数
attn_weights = dot_product_attention_weights(
query_states,
key_states,
bias=attention_bias,
dropout_rng=dropout_rng,
dropout_rate=self.config.attention_probs_dropout_prob,
broadcast_dropout=True,
deterministic=deterministic,
dtype=self.dtype,
precision=None,
)
# 使用 einsum 函数计算注意力输出,将注意力权重应用到 value 状态上
attn_output = jnp.einsum("...hqk,...khd->...qhd", attn_weights, value_states)
# 重塑注意力输出的形状为 (batch_size, seq_length, hidden_size)
attn_output = attn_output.reshape(attn_output.shape[:2] + (-1,))
# 将注意力输出投影到相同维度空间
projected_attn_output = self.dense(attn_output)
# 如果使用 dropout,则对投影后的注意力输出应用 dropout
projected_attn_output = self.dropout(projected_attn_output, deterministic=deterministic)
# 使用 LayerNorm 对投影后的注意力输出进行规范化,并与原始隐藏状态相加
layernormed_attn_output = self.LayerNorm(projected_attn_output + hidden_states)
# 根据需求决定是否输出注意力权重
outputs = (layernormed_attn_output, attn_weights) if output_attentions else (layernormed_attn_output,)
# 返回最终的输出
return outputs
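The mask-to-bias conversion above is easy to verify in isolation: positions where the mask is 1 receive a bias of 0, while masked positions receive the most negative representable value so they vanish after the softmax. A standalone sketch (toy mask, not part of the model code):
```
>>> import jax.numpy as jnp
>>> from jax import lax

>>> attention_mask = jnp.array([[1, 1, 0]])            # last position is padding
>>> mask = jnp.expand_dims(attention_mask, axis=(-3, -2))
>>> bias = lax.select(
...     mask > 0,
...     jnp.full(mask.shape, 0.0).astype(jnp.float32),
...     jnp.full(mask.shape, jnp.finfo(jnp.float32).min).astype(jnp.float32),
... )
>>> bias.shape   # broadcastable against (batch, heads, q_len, kv_len) attention scores
(1, 1, 1, 3)
```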
# 定义 FlaxAlbertLayer 类,继承自 nn.Module
class FlaxAlbertLayer(nn.Module):
# 保存 AlbertConfig 类型的配置信息
config: AlbertConfig
# 指定计算中使用的数据类型为 jnp.float32
dtype: jnp.dtype = jnp.float32 # the dtype of the computation
# 初始化方法,设置层的组件
def setup(self):
# 创建 self.attention 属性,使用 FlaxAlbertSelfAttention 类处理注意力机制
self.attention = FlaxAlbertSelfAttention(self.config, dtype=self.dtype)
# 创建 self.ffn 属性,使用 nn.Dense 层作为前馈神经网络
self.ffn = nn.Dense(
self.config.intermediate_size,
kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
dtype=self.dtype,
)
# 根据配置选择激活函数
self.activation = ACT2FN[self.config.hidden_act]
# 创建 self.ffn_output 属性,使用 nn.Dense 层作为前馈神经网络输出层
self.ffn_output = nn.Dense(
self.config.hidden_size,
kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
dtype=self.dtype,
)
# 创建 self.full_layer_layer_norm 属性,使用 nn.LayerNorm 层进行层归一化
self.full_layer_layer_norm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype)
# 创建 self.dropout 属性,使用 nn.Dropout 层进行随机失活
self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob)
# 调用方法,定义前向传播逻辑
def __call__(
self,
hidden_states,
attention_mask,
deterministic: bool = True,
output_attentions: bool = False,
):
# 使用 self.attention 处理注意力输出
attention_outputs = self.attention(
hidden_states, attention_mask, deterministic=deterministic, output_attentions=output_attentions
)
# 获取注意力输出的第一个元素作为 attention_output
attention_output = attention_outputs[0]
# 使用 self.ffn 处理 attention_output 得到前馈神经网络的输出
ffn_output = self.ffn(attention_output)
# 使用 self.activation 应用激活函数
ffn_output = self.activation(ffn_output)
# 使用 self.ffn_output 处理前馈神经网络输出得到最终输出
ffn_output = self.ffn_output(ffn_output)
# 使用 self.dropout 进行随机失活处理最终输出
ffn_output = self.dropout(ffn_output, deterministic=deterministic)
# 将前馈神经网络的输出与注意力输出相加,然后进行层归一化得到 hidden_states
hidden_states = self.full_layer_layer_norm(ffn_output + attention_output)
# 将 hidden_states 存入 outputs 元组中
outputs = (hidden_states,)
# 如果需要输出注意力权重,则将 attention_outputs[1] 也加入 outputs 中
if output_attentions:
outputs += (attention_outputs[1],)
# 返回 outputs 元组作为最终的输出结果
return outputs
# 定义 FlaxAlbertLayerCollection 类,继承自 nn.Module
class FlaxAlbertLayerCollection(nn.Module):
# 保存 AlbertConfig 类型的配置信息
config: AlbertConfig
# 指定计算中使用的数据类型为 jnp.float32
dtype: jnp.dtype = jnp.float32 # the dtype of the computation
# 初始化方法,设置层的组件
def setup(self):
# 创建 self.layers 属性,包含多个 FlaxAlbertLayer 层组成的列表
self.layers = [
FlaxAlbertLayer(self.config, name=str(i), dtype=self.dtype) for i in range(self.config.inner_group_num)
]
# 调用方法,定义前向传播逻辑
def __call__(
self,
hidden_states,
attention_mask,
deterministic: bool = True,
output_attentions: bool = False,
output_hidden_states: bool = False,
):
# 初始化空元组,用于存储各层的隐藏状态和注意力分布
layer_hidden_states = ()
layer_attentions = ()
# 遍历模型的每一层
for layer_index, albert_layer in enumerate(self.layers):
# 调用当前层的前向传播方法,获取该层的输出
layer_output = albert_layer(
hidden_states,
attention_mask,
deterministic=deterministic,
output_attentions=output_attentions,
)
# 更新隐藏状态为当前层的输出的第一个元素
hidden_states = layer_output[0]
# 如果需要输出注意力分布,将当前层的注意力分布添加到layer_attentions元组中
if output_attentions:
layer_attentions = layer_attentions + (layer_output[1],)
# 如果需要输出隐藏状态,将当前层的隐藏状态添加到layer_hidden_states元组中
if output_hidden_states:
layer_hidden_states = layer_hidden_states + (hidden_states,)
# 构建输出元组,包括最终的隐藏状态
outputs = (hidden_states,)
# 如果需要输出每层的隐藏状态,将其添加到输出元组中
if output_hidden_states:
outputs = outputs + (layer_hidden_states,)
# 如果需要输出每层的注意力分布,将其添加到输出元组中
if output_attentions:
outputs = outputs + (layer_attentions,)
# 返回模型的输出,包括最后一层的隐藏状态,可选的每层隐藏状态和每层注意力分布
return outputs # 最后一层的隐藏状态,(每层隐藏状态),(每层注意力)
class FlaxAlbertLayerCollections(nn.Module):
config: AlbertConfig
dtype: jnp.dtype = jnp.float32 # 计算所用的数据类型
layer_index: Optional[str] = None
def setup(self):
self.albert_layers = FlaxAlbertLayerCollection(self.config, dtype=self.dtype)
# 初始化 Albert 层集合
def __call__(
self,
hidden_states,
attention_mask,
deterministic: bool = True,
output_attentions: bool = False,
output_hidden_states: bool = False,
):
outputs = self.albert_layers(
hidden_states,
attention_mask,
deterministic=deterministic,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
)
return outputs
# 调用 Albert 层集合并返回输出结果
class FlaxAlbertLayerGroups(nn.Module):
config: AlbertConfig
dtype: jnp.dtype = jnp.float32 # 计算所用的数据类型
def setup(self):
self.layers = [
FlaxAlbertLayerCollections(self.config, name=str(i), layer_index=str(i), dtype=self.dtype)
for i in range(self.config.num_hidden_groups)
]
# 初始化 Albert 层组
def __call__(
self,
hidden_states,
attention_mask,
deterministic: bool = True,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
):
all_attentions = () if output_attentions else None
all_hidden_states = (hidden_states,) if output_hidden_states else None
for i in range(self.config.num_hidden_layers):
# 计算当前层所属的隐藏组索引
group_idx = int(i / (self.config.num_hidden_layers / self.config.num_hidden_groups))
layer_group_output = self.layers[group_idx](
hidden_states,
attention_mask,
deterministic=deterministic,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
)
hidden_states = layer_group_output[0]
if output_attentions:
all_attentions = all_attentions + layer_group_output[-1]
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None)
return FlaxBaseModelOutput(
last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions
)
# When return_dict is False, the branch above returns the corresponding output tuple instead
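The `group_idx` arithmetic in the loop above is what implements ALBERT's cross-layer parameter sharing: `num_hidden_layers` forward passes are routed through only `num_hidden_groups` sets of weights. A standalone illustration of the mapping (the 12-layer/3-group split is chosen for the example, not taken from a real checkpoint config):
```
>>> num_hidden_layers, num_hidden_groups = 12, 1
>>> [int(i / (num_hidden_layers / num_hidden_groups)) for i in range(num_hidden_layers)]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

>>> # With 3 groups, layers 0-3 share one set of weights, 4-7 another, 8-11 the last
>>> num_hidden_layers, num_hidden_groups = 12, 3
>>> [int(i / (num_hidden_layers / num_hidden_groups)) for i in range(num_hidden_layers)]
[0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]
```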
class FlaxAlbertEncoder(nn.Module):
config: AlbertConfig
dtype: jnp.dtype = jnp.float32 # 计算所用的数据类型
def setup(self):
self.embedding_hidden_mapping_in = nn.Dense(
self.config.hidden_size,
kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
dtype=self.dtype,
)
self.albert_layer_groups = FlaxAlbertLayerGroups(self.config, dtype=self.dtype)
# 初始化 Albert 编码器
# 定义一个特殊方法 __call__,使得对象可以像函数一样被调用
def __call__(
self,
hidden_states,
attention_mask,
deterministic: bool = True,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
):
# 将输入的隐藏状态通过 embedding_hidden_mapping_in 方法映射转换
hidden_states = self.embedding_hidden_mapping_in(hidden_states)
# 调用 albert_layer_groups 方法处理映射后的隐藏状态和注意力掩码,
# 可选参数包括 deterministic(是否确定性计算)、output_attentions(是否输出注意力权重)、
# output_hidden_states(是否输出每层的隐藏状态),返回结果根据 return_dict 决定是否返回字典形式
return self.albert_layer_groups(
hidden_states,
attention_mask,
deterministic=deterministic,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
)
# 定义一个名为 FlaxAlbertOnlyMLMHead 的类,继承自 nn.Module
class FlaxAlbertOnlyMLMHead(nn.Module):
# 配置属性,指定为 AlbertConfig 类型
config: AlbertConfig
# 数据类型,默认为 jnp.float32
dtype: jnp.dtype = jnp.float32
# 偏置初始化函数,默认为零初始化
bias_init: Callable[..., np.ndarray] = jax.nn.initializers.zeros
# 初始化方法
def setup(self):
# 创建一个全连接层,输出维度为 config.embedding_size
self.dense = nn.Dense(self.config.embedding_size, dtype=self.dtype)
# 激活函数,根据配置选择 ACT2FN 中的激活函数
self.activation = ACT2FN[self.config.hidden_act]
# LayerNorm 层,使用 config.layer_norm_eps 作为 epsilon 参数
self.LayerNorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype)
# 输出层,输出维度为 config.vocab_size,不使用偏置
self.decoder = nn.Dense(self.config.vocab_size, dtype=self.dtype, use_bias=False)
# 初始化偏置参数,维度为 (config.vocab_size,)
self.bias = self.param("bias", self.bias_init, (self.config.vocab_size,))
# 前向传播方法
def __call__(self, hidden_states, shared_embedding=None):
# 全连接层
hidden_states = self.dense(hidden_states)
# 激活函数
hidden_states = self.activation(hidden_states)
# LayerNorm 层
hidden_states = self.LayerNorm(hidden_states)
# 如果传入了 shared_embedding 参数,则使用 decoder 层进行解码
if shared_embedding is not None:
hidden_states = self.decoder.apply({"params": {"kernel": shared_embedding.T}}, hidden_states)
else:
# 否则直接使用 decoder 层
hidden_states = self.decoder(hidden_states)
# 加上偏置
hidden_states += self.bias
# 返回最终的隐藏状态
return hidden_states
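When `tie_word_embeddings` is enabled, the `shared_embedding` passed in here is the word-embedding table of shape `(vocab_size, embedding_size)`; applying the `decoder` with `shared_embedding.T` as its kernel is therefore the same as multiplying the hidden states by the transposed embedding table. A quick numerical check of that equivalence, with toy shapes rather than the real config:
```
>>> import numpy as np
>>> import jax.numpy as jnp
>>> import flax.linen as nn

>>> vocab_size, embedding_size, seq_len = 10, 4, 3
>>> rng = np.random.default_rng(0)
>>> embedding = jnp.asarray(rng.normal(size=(vocab_size, embedding_size)), dtype=jnp.float32)
>>> hidden = jnp.asarray(rng.normal(size=(seq_len, embedding_size)), dtype=jnp.float32)

>>> # A Dense(vocab_size, use_bias=False) whose kernel is embedding.T is the tied output projection
>>> decoder = nn.Dense(vocab_size, use_bias=False)
>>> tied_logits = decoder.apply({"params": {"kernel": embedding.T}}, hidden)
>>> bool(jnp.allclose(tied_logits, hidden @ embedding.T))
True
```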
# 定义一个名为 FlaxAlbertSOPHead 的类,继承自 nn.Module
class FlaxAlbertSOPHead(nn.Module):
# 配置属性,指定为 AlbertConfig 类型
config: AlbertConfig
# 数据类型,默认为 jnp.float32
dtype: jnp.dtype = jnp.float32
# 初始化方法
def setup(self):
# Dropout 层,使用配置中的 classifier_dropout_prob
self.dropout = nn.Dropout(self.config.classifier_dropout_prob)
# 全连接层,输出维度为 2
self.classifier = nn.Dense(2, dtype=self.dtype)
# 前向传播方法
def __call__(self, pooled_output, deterministic=True):
# 应用 Dropout
pooled_output = self.dropout(pooled_output, deterministic=deterministic)
# 分类器层,得到 logits
logits = self.classifier(pooled_output)
# 返回 logits
return logits
# 定义一个名为 FlaxAlbertPreTrainedModel 的类,继承自 FlaxPreTrainedModel
class FlaxAlbertPreTrainedModel(FlaxPreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models.
"""
# 配置类,指定为 AlbertConfig
config_class = AlbertConfig
# 基础模型前缀名称为 "albert"
base_model_prefix = "albert"
# 模块类,默认为 None,需要在子类中指定具体的模块类
module_class: nn.Module = None
# 初始化方法
def __init__(
self,
config: AlbertConfig,
input_shape: Tuple = (1, 1),
seed: int = 0,
dtype: jnp.dtype = jnp.float32,
_do_init: bool = True,
**kwargs,
):
# 创建模块实例,传入配置和其他参数
module = self.module_class(config=config, dtype=dtype, **kwargs)
# 调用父类的初始化方法
super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init)
# 初始化权重函数,使用随机数种子和输入形状,返回参数字典
def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple, params: FrozenDict = None) -> FrozenDict:
# 初始化输入张量,创建一个全零张量,数据类型为整数4位
input_ids = jnp.zeros(input_shape, dtype="i4")
# 创建与 input_ids 相同形状的全零张量,作为 token 类型标识
token_type_ids = jnp.zeros_like(input_ids)
# 根据 input_ids 的最后一个维度生成位置 ID,使用广播到输入形状
position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_shape)
# 创建一个与 input_ids 相同形状的全一张量,作为注意力掩码
attention_mask = jnp.ones_like(input_ids)
# 分割随机数种子为参数随机种子和 dropout 随机种子
params_rng, dropout_rng = jax.random.split(rng)
rngs = {"params": params_rng, "dropout": dropout_rng}
# 初始化模型参数,调用模块的 init 方法,返回随机参数
random_params = self.module.init(
rngs, input_ids, attention_mask, token_type_ids, position_ids, return_dict=False
)["params"]
# 如果传入了已有参数,合并随机参数和已有参数,并补全缺失的参数
if params is not None:
random_params = flatten_dict(unfreeze(random_params))
params = flatten_dict(unfreeze(params))
for missing_key in self._missing_keys:
params[missing_key] = random_params[missing_key]
self._missing_keys = set()
# 返回合并后的参数字典,进行冻结
return freeze(unflatten_dict(params))
else:
# 返回随机初始化的参数
return random_params
# 添加文档字符串,定义模型前向传播方法的文档
@add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
def __call__(
self,
input_ids,
attention_mask=None,
token_type_ids=None,
position_ids=None,
params: dict = None,
dropout_rng: jax.random.PRNGKey = None,
train: bool = False,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
# 根据配置,设置输出注意力和隐藏状态的默认值
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.return_dict
# 如果未传入 token_type_ids,初始化为与 input_ids 相同的全零张量
if token_type_ids is None:
token_type_ids = jnp.zeros_like(input_ids)
# 如果未传入 position_ids,初始化为根据 input_ids 的最后一个维度生成的广播张量
if position_ids is None:
position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape)
# 如果未传入 attention_mask,初始化为与 input_ids 相同的全一张量
if attention_mask is None:
attention_mask = jnp.ones_like(input_ids)
# 初始化随机数种子字典
rngs = {}
if dropout_rng is not None:
rngs["dropout"] = dropout_rng
# 调用模型的 apply 方法,传入参数进行前向计算
return self.module.apply(
{"params": params or self.params},
jnp.array(input_ids, dtype="i4"),
jnp.array(attention_mask, dtype="i4"),
jnp.array(token_type_ids, dtype="i4"),
jnp.array(position_ids, dtype="i4"),
not train,  # positional `deterministic` argument: True for inference, False during training (enables dropout)
output_attentions,
output_hidden_states,
return_dict,
rngs=rngs,
)
# 定义一个继承自`nn.Module`的类,用于实现Flax版本的Albert模型
class FlaxAlbertModule(nn.Module):
# 类型注解,指定`config`为AlbertConfig类型
config: AlbertConfig
# 指定`dtype`为jnp.float32,用于计算的数据类型
dtype: jnp.dtype = jnp.float32 # 计算时的数据类型
# 是否添加池化层的标志,默认为True
add_pooling_layer: bool = True
# 模块初始化函数
def setup(self):
# 初始化嵌入层`embeddings`,使用FlaxAlbertEmbeddings类
self.embeddings = FlaxAlbertEmbeddings(self.config, dtype=self.dtype)
# 初始化编码器`encoder`,使用FlaxAlbertEncoder类
self.encoder = FlaxAlbertEncoder(self.config, dtype=self.dtype)
# 如果设置添加池化层,则初始化`pooler`为全连接层,并指定激活函数为tanh
if self.add_pooling_layer:
self.pooler = nn.Dense(
self.config.hidden_size,
kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
dtype=self.dtype,
name="pooler",
)
self.pooler_activation = nn.tanh
else:
# 如果不添加池化层,则将`pooler`和`pooler_activation`设置为None
self.pooler = None
self.pooler_activation = None
# 对象调用函数,实现模型的前向传播
def __call__(
self,
input_ids,
attention_mask,
token_type_ids: Optional[np.ndarray] = None,
position_ids: Optional[np.ndarray] = None,
deterministic: bool = True,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
):
# 当未传入`token_type_ids`时,初始化为与`input_ids`相同形状的全零数组
if token_type_ids is None:
token_type_ids = jnp.zeros_like(input_ids)
# 当未传入`position_ids`时,初始化为广播形式的序列长度数组
if position_ids is None:
position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape)
# 使用嵌入层`embeddings`计算输入数据的隐状态表示
hidden_states = self.embeddings(input_ids, token_type_ids, position_ids, deterministic=deterministic)
# 将隐状态表示输入编码器`encoder`,获取模型输出
outputs = self.encoder(
hidden_states,
attention_mask,
deterministic=deterministic,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# 获取编码器输出的隐状态表示
hidden_states = outputs[0]
# 如果设置添加池化层,则对第一个时间步的隐状态进行池化操作
if self.add_pooling_layer:
pooled = self.pooler(hidden_states[:, 0])
pooled = self.pooler_activation(pooled)
else:
# 如果不添加池化层,则将`pooled`设置为None
pooled = None
# 如果不返回字典形式的输出,则根据`return_dict`的设置返回相应结果
if not return_dict:
if pooled is None:
# 如果`pooled`为None,则不返回它
return (hidden_states,) + outputs[1:]
return (hidden_states, pooled) + outputs[1:]
# 返回包含池化输出和其他模型输出的字典形式结果
return FlaxBaseModelOutputWithPooling(
last_hidden_state=hidden_states,
pooler_output=pooled,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
# 使用装饰器`add_start_docstrings`为`FlaxAlbertModel`类添加注释文档
@add_start_docstrings(
"The bare Albert Model transformer outputting raw hidden-states without any specific head on top.",
ALBERT_START_DOCSTRING,
)
# `FlaxAlbertModel`类继承自`FlaxAlbertPreTrainedModel`,指定使用的模块类为`FlaxAlbertModule`
class FlaxAlbertModel(FlaxAlbertPreTrainedModel):
module_class = FlaxAlbertModule
# 调用`append_call_sample_docstring`函数,为`FlaxAlbertModel`类添加调用示例注释
append_call_sample_docstring(FlaxAlbertModel, _CHECKPOINT_FOR_DOC, FlaxBaseModelOutputWithPooling, _CONFIG_FOR_DOC)
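Since `append_call_sample_docstring` only attaches an example to the generated documentation, a short hedged usage sketch of the bare model may help here (assuming the `albert/albert-base-v2` checkpoint; the shapes in the comments are illustrative):
```
>>> from transformers import AutoTokenizer, FlaxAlbertModel

>>> tokenizer = AutoTokenizer.from_pretrained("albert/albert-base-v2")
>>> model = FlaxAlbertModel.from_pretrained("albert/albert-base-v2")

>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="np")
>>> outputs = model(**inputs)

>>> print(outputs.last_hidden_state.shape)  # (batch_size, sequence_length, hidden_size), e.g. (1, 8, 768)
>>> print(outputs.pooler_output.shape)      # (batch_size, hidden_size), e.g. (1, 768)
```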
# 定义一个继承自`nn.Module`的类,用于实现Albert预训练模型
class FlaxAlbertForPreTrainingModule(nn.Module):
# 类型注解,指定`config`为AlbertConfig类型
config: AlbertConfig
# 定义默认的数据类型为 jnp.float32,使用了 jax.numpy 的数据类型
dtype: jnp.dtype = jnp.float32
# 初始化模型的方法,创建了 Albert 模型、MLM 头部和 SOP 分类器
def setup(self):
# 使用给定的配置和数据类型创建 Albert 模型
self.albert = FlaxAlbertModule(config=self.config, dtype=self.dtype)
# 使用给定的配置和数据类型创建只有 MLM 头部的模型
self.predictions = FlaxAlbertOnlyMLMHead(config=self.config, dtype=self.dtype)
# 使用给定的配置和数据类型创建 SOP 分类器
self.sop_classifier = FlaxAlbertSOPHead(config=self.config, dtype=self.dtype)
# 调用模型时的方法,接收多个输入参数和几个布尔型选项
def __call__(
self,
input_ids,
attention_mask,
token_type_ids,
position_ids,
deterministic: bool = True,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
):
# 调用 Albert 模型进行前向传播,获取输出
outputs = self.albert(
input_ids,
attention_mask,
token_type_ids,
position_ids,
deterministic=deterministic,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# 如果配置要求共享词嵌入,则获取共享的词嵌入
if self.config.tie_word_embeddings:
shared_embedding = self.albert.variables["params"]["embeddings"]["word_embeddings"]["embedding"]
else:
shared_embedding = None
# 从 Albert 模型的输出中提取隐藏状态和汇聚输出
hidden_states = outputs[0]
pooled_output = outputs[1]
# 使用 MLM 头部对隐藏状态进行预测
prediction_scores = self.predictions(hidden_states, shared_embedding=shared_embedding)
# 使用 SOP 分类器对汇聚输出进行预测
sop_scores = self.sop_classifier(pooled_output, deterministic=deterministic)
# 如果不要求返回字典形式的结果,则返回元组形式的结果
if not return_dict:
return (prediction_scores, sop_scores) + outputs[2:]
# 返回预训练 Albert 模型的输出结果,包括预测 logits、SOP logits、隐藏状态和注意力权重
return FlaxAlbertForPreTrainingOutput(
prediction_logits=prediction_scores,
sop_logits=sop_scores,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
@add_start_docstrings(
"""
Albert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a
`sentence order prediction (classification)` head.
""",
ALBERT_START_DOCSTRING,
)
class FlaxAlbertForPreTraining(FlaxAlbertPreTrainedModel):
module_class = FlaxAlbertForPreTrainingModule
FLAX_ALBERT_FOR_PRETRAINING_DOCSTRING = """
Returns:
Example:
```
>>> from transformers import AutoTokenizer, FlaxAlbertForPreTraining
>>> tokenizer = AutoTokenizer.from_pretrained("albert/albert-base-v2")
>>> model = FlaxAlbertForPreTraining.from_pretrained("albert/albert-base-v2")
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="np")
>>> outputs = model(**inputs)
>>> prediction_logits = outputs.prediction_logits
>>> seq_relationship_logits = outputs.sop_logits
```
"""
# Overwrite the docstring of FlaxAlbertForPreTraining to include input docstring and predefined docstring
overwrite_call_docstring(
FlaxAlbertForPreTraining,
ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length") + FLAX_ALBERT_FOR_PRETRAINING_DOCSTRING,
)
# Append and replace return docstrings for FlaxAlbertForPreTraining
append_replace_return_docstrings(
FlaxAlbertForPreTraining, output_type=FlaxAlbertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC
)
class FlaxAlbertForMaskedLMModule(nn.Module):
config: AlbertConfig
dtype: jnp.dtype = jnp.float32
def setup(self):
# Set up Albert model without pooling layer
self.albert = FlaxAlbertModule(config=self.config, add_pooling_layer=False, dtype=self.dtype)
# Set up Masked LM head for predictions
self.predictions = FlaxAlbertOnlyMLMHead(config=self.config, dtype=self.dtype)
def __call__(
self,
input_ids,
attention_mask,
token_type_ids,
position_ids,
deterministic: bool = True,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
):
# Forward pass through Albert model
outputs = self.albert(
input_ids,
attention_mask,
token_type_ids,
position_ids,
deterministic=deterministic,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# Extract hidden states from model outputs
hidden_states = outputs[0]
# Determine if word embeddings are tied
if self.config.tie_word_embeddings:
shared_embedding = self.albert.variables["params"]["embeddings"]["word_embeddings"]["embedding"]
else:
shared_embedding = None
# Compute masked language modeling logits
logits = self.predictions(hidden_states, shared_embedding=shared_embedding)
# Return either a tuple or a named tuple depending on return_dict
if not return_dict:
return (logits,) + outputs[1:]
return FlaxMaskedLMOutput(
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
@add_start_docstrings("""Albert Model with a `language modeling` head on top.""", ALBERT_START_DOCSTRING)
class FlaxAlbertForMaskedLM(FlaxAlbertPreTrainedModel):
module_class = FlaxAlbertForMaskedLMModule
# Append call sample docstring for FlaxAlbertForMaskedLM
append_call_sample_docstring(
FlaxAlbertForMaskedLM, _CHECKPOINT_FOR_DOC, FlaxMaskedLMOutput, _CONFIG_FOR_DOC, revision="refs/pr/11"
)
# Flax module (not PyTorch) for sequence classification tasks built on top of the Albert encoder
class FlaxAlbertForSequenceClassificationModule(nn.Module):
# 类属性,存储 Albert 模型的配置
config: AlbertConfig
# 类属性,默认数据类型为 jnp.float32
dtype: jnp.dtype = jnp.float32
# 模块初始化方法
def setup(self):
# 根据配置创建一个 FlaxAlbertModule 实例
self.albert = FlaxAlbertModule(config=self.config, dtype=self.dtype)
# 根据配置中的 dropout 概率创建一个 dropout 层
classifier_dropout = (
self.config.classifier_dropout_prob
if self.config.classifier_dropout_prob is not None
else self.config.hidden_dropout_prob
)
self.dropout = nn.Dropout(rate=classifier_dropout)
# 创建一个全连接层作为分类器,输出维度为 config.num_labels
self.classifier = nn.Dense(
self.config.num_labels,
dtype=self.dtype,
)
# 模块调用方法,用于模型推断
def __call__(
self,
input_ids,
attention_mask,
token_type_ids,
position_ids,
deterministic: bool = True,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
):
# 调用 Albert 模型进行前向传播
outputs = self.albert(
input_ids,
attention_mask,
token_type_ids,
position_ids,
deterministic=deterministic,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# 从 Albert 模型的输出中获取池化后的输出
pooled_output = outputs[1]
# 应用 dropout 层到池化后的输出上
pooled_output = self.dropout(pooled_output, deterministic=deterministic)
# 将处理后的输出传入分类器得到最终的 logits
logits = self.classifier(pooled_output)
# 如果不要求返回字典,则返回 logits 和额外的隐藏状态
if not return_dict:
return (logits,) + outputs[2:]
# 如果要求返回字典,则构建一个 FlaxSequenceClassifierOutput 对象并返回
return FlaxSequenceClassifierOutput(
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
# 使用装饰器为 FlaxAlbertForSequenceClassification 类添加文档字符串
@add_start_docstrings(
"""
Albert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
output) e.g. for GLUE tasks.
""",
ALBERT_START_DOCSTRING,
)
# 继承自 FlaxAlbertPreTrainedModel 类的子类
class FlaxAlbertForSequenceClassification(FlaxAlbertPreTrainedModel):
# 指定该模型使用的模块类为 FlaxAlbertForSequenceClassificationModule
module_class = FlaxAlbertForSequenceClassificationModule
# 为 FlaxAlbertForSequenceClassification 类添加调用示例的文档字符串
append_call_sample_docstring(
FlaxAlbertForSequenceClassification,
_CHECKPOINT_FOR_DOC,
FlaxSequenceClassifierOutput,
_CONFIG_FOR_DOC,
)
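As with the other heads, the number of output classes comes from `config.num_labels`. A hedged fine-tuning-style sketch (the three-label setup is illustrative; the base checkpoint's classification head is randomly initialized until trained):
```
>>> from transformers import AutoTokenizer, FlaxAlbertForSequenceClassification

>>> tokenizer = AutoTokenizer.from_pretrained("albert/albert-base-v2")
>>> model = FlaxAlbertForSequenceClassification.from_pretrained("albert/albert-base-v2", num_labels=3)

>>> inputs = tokenizer("This movie was great!", return_tensors="np")
>>> logits = model(**inputs).logits  # shape (batch_size, num_labels) == (1, 3)
```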
# Flax module (not PyTorch) for multiple-choice tasks built on top of the Albert encoder
class FlaxAlbertForMultipleChoiceModule(nn.Module):
# 类属性,存储 Albert 模型的配置
config: AlbertConfig
# 类属性,默认数据类型为 jnp.float32
dtype: jnp.dtype = jnp.float32
# 模块初始化方法
def setup(self):
# 根据配置创建一个 FlaxAlbertModule 实例
self.albert = FlaxAlbertModule(config=self.config, dtype=self.dtype)
# 创建一个 dropout 层,dropout 率为 config.hidden_dropout_prob
self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob)
# 创建一个全连接层作为分类器,输出维度为 1
self.classifier = nn.Dense(1, dtype=self.dtype)
# 模块调用方法,用于模型推断
def __call__(
self,
input_ids,
attention_mask,
token_type_ids,
position_ids,
deterministic: bool = True,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
):
# 获取输入张量的第二维大小,即选项的数量
num_choices = input_ids.shape[1]
# 如果输入张量不为空,则重塑为二维张量
input_ids = input_ids.reshape(-1, input_ids.shape[-1]) if input_ids is not None else None
attention_mask = attention_mask.reshape(-1, attention_mask.shape[-1]) if attention_mask is not None else None
token_type_ids = token_type_ids.reshape(-1, token_type_ids.shape[-1]) if token_type_ids is not None else None
position_ids = position_ids.reshape(-1, position_ids.shape[-1]) if position_ids is not None else None
# 使用 ALBERT 模型进行前向推断
outputs = self.albert(
input_ids,
attention_mask,
token_type_ids,
position_ids,
deterministic=deterministic,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# 获取池化后的输出
pooled_output = outputs[1]
# 使用 dropout 进行池化后输出的处理
pooled_output = self.dropout(pooled_output, deterministic=deterministic)
# 使用分类器进行分类预测
logits = self.classifier(pooled_output)
# 将 logits 重塑为二维张量,以便与选项数量对应
reshaped_logits = logits.reshape(-1, num_choices)
# 如果不返回字典形式的结果,则返回 logits 以及额外的输出
if not return_dict:
return (reshaped_logits,) + outputs[2:]
# 返回多选题模型的输出结果,包括 logits、隐藏状态和注意力
return FlaxMultipleChoiceModelOutput(
logits=reshaped_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
@add_start_docstrings(
"""
Albert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
softmax) e.g. for RocStories/SWAG tasks.
""",
ALBERT_START_DOCSTRING,
)
class FlaxAlbertForMultipleChoice(FlaxAlbertPreTrainedModel):
module_class = FlaxAlbertForMultipleChoiceModule
overwrite_call_docstring(
FlaxAlbertForMultipleChoice, ALBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")
)
append_call_sample_docstring(
FlaxAlbertForMultipleChoice,
_CHECKPOINT_FOR_DOC,
FlaxMultipleChoiceModelOutput,
_CONFIG_FOR_DOC,
)
This part of the code defines an ALBERT-based multiple-choice classification model: a linear layer (with a softmax over the choices) on top of the pooled output, suitable for tasks such as RocStories/SWAG. A usage sketch follows below.
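The reshaping inside the module expects inputs of shape `(batch_size, num_choices, sequence_length)`, so the tokenizer has to produce one encoded sequence per candidate. A hedged sketch (prompt and choices are made up for the example):
```
>>> from transformers import AutoTokenizer, FlaxAlbertForMultipleChoice

>>> tokenizer = AutoTokenizer.from_pretrained("albert/albert-base-v2")
>>> model = FlaxAlbertForMultipleChoice.from_pretrained("albert/albert-base-v2")

>>> prompt = "In Italy, pizza served in formal settings is presented unsliced."
>>> choice0 = "It is eaten with a fork and a knife."
>>> choice1 = "It is eaten while held in the hand."

>>> encoding = tokenizer([prompt, prompt], [choice0, choice1], return_tensors="np", padding=True)
>>> # Add the num_choices dimension: (batch_size=1, num_choices=2, sequence_length)
>>> inputs = {k: v[None, :] for k, v in encoding.items()}
>>> logits = model(**inputs).logits  # shape (1, 2), one score per choice
```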
class FlaxAlbertForTokenClassificationModule(nn.Module):
config: AlbertConfig
dtype: jnp.dtype = jnp.float32
def setup(self):
self.albert = FlaxAlbertModule(config=self.config, dtype=self.dtype, add_pooling_layer=False)
classifier_dropout = (
self.config.classifier_dropout_prob
if self.config.classifier_dropout_prob is not None
else self.config.hidden_dropout_prob
)
self.dropout = nn.Dropout(rate=classifier_dropout)
self.classifier = nn.Dense(self.config.num_labels, dtype=self.dtype)
def __call__(
self,
input_ids,
attention_mask,
token_type_ids,
position_ids,
deterministic: bool = True,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
):
# Model
outputs = self.albert(
input_ids,
attention_mask,
token_type_ids,
position_ids,
deterministic=deterministic,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
hidden_states = outputs[0]
hidden_states = self.dropout(hidden_states, deterministic=deterministic)
logits = self.classifier(hidden_states)
if not return_dict:
return (logits,) + outputs[1:]
return FlaxTokenClassifierOutput(
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
This code defines an ALBERT-based token classification model: a linear layer on top of the hidden-states output, suitable for token-level tasks such as named-entity recognition. It includes the `setup` initialization and the `__call__` method that runs the forward pass.
@add_start_docstrings(
"""
Albert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
Named-Entity-Recognition (NER) tasks.
""",
ALBERT_START_DOCSTRING,
)
class FlaxAlbertForTokenClassification(FlaxAlbertPreTrainedModel):
module_class = FlaxAlbertForTokenClassificationModule
append_call_sample_docstring(
FlaxAlbertForTokenClassification,
_CHECKPOINT_FOR_DOC,
FlaxTokenClassifierOutput,
_CONFIG_FOR_DOC,
)
The wrapper class above exposes this token classification module as `FlaxAlbertForTokenClassification` (a linear layer on top of the hidden-states output, e.g. for NER), with the usual docstring and call-sample registration; a hedged usage sketch follows.
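Assuming a hypothetical five-label tagging scheme (illustrative only; the base checkpoint is not fine-tuned for NER, so the predictions are untrained):
```
>>> from transformers import AutoTokenizer, FlaxAlbertForTokenClassification

>>> tokenizer = AutoTokenizer.from_pretrained("albert/albert-base-v2")
>>> model = FlaxAlbertForTokenClassification.from_pretrained("albert/albert-base-v2", num_labels=5)

>>> inputs = tokenizer("HuggingFace is based in New York City", return_tensors="np")
>>> logits = model(**inputs).logits          # (batch_size, sequence_length, num_labels)
>>> predicted_ids = logits.argmax(axis=-1)   # one label id per token
```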
class FlaxAlbertForQuestionAnsweringModule(nn.Module):
config: AlbertConfig
dtype: jnp.dtype = jnp.float32
This begins the ALBERT module for question-answering tasks; its `setup` and `__call__` implementations follow directly below.
# 初始化模型设置
def setup(self):
# 使用配置和数据类型创建一个 FlaxAlbertModule 实例,不添加池化层
self.albert = FlaxAlbertModule(config=self.config, dtype=self.dtype, add_pooling_layer=False)
# 创建一个全连接层 nn.Dense,输出维度为配置中指定的标签数
self.qa_outputs = nn.Dense(self.config.num_labels, dtype=self.dtype)
# 模型调用函数,接受多个输入和一些可选参数
def __call__(
self,
input_ids,
attention_mask,
token_type_ids,
position_ids,
deterministic: bool = True,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
):
# 调用 self.albert 模型进行前向传播
outputs = self.albert(
input_ids,
attention_mask,
token_type_ids,
position_ids,
deterministic=deterministic,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# 从模型输出中获取隐藏状态
hidden_states = outputs[0]
# 使用 self.qa_outputs 对隐藏状态进行线性变换得到预测 logits
logits = self.qa_outputs(hidden_states)
# 将 logits 按最后一个维度分割成起始和结束 logits
start_logits, end_logits = logits.split(self.config.num_labels, axis=-1)
# 去除多余的维度,将 logits 压缩成一维张量
start_logits = start_logits.squeeze(-1)
end_logits = end_logits.squeeze(-1)
# 如果 return_dict 为 False,则返回一个元组,包含 logits 和额外的模型输出
if not return_dict:
return (start_logits, end_logits) + outputs[1:]
# 如果 return_dict 为 True,则封装输出成 FlaxQuestionAnsweringModelOutput 类型
return FlaxQuestionAnsweringModelOutput(
start_logits=start_logits,
end_logits=end_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
@add_start_docstrings(
"""
Albert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
ALBERT_START_DOCSTRING,  # prepend the shared ALBERT docstring
)
class FlaxAlbertForQuestionAnswering(FlaxAlbertPreTrainedModel):
module_class = FlaxAlbertForQuestionAnsweringModule
append_call_sample_docstring(
FlaxAlbertForQuestionAnswering,  # attach a call example to the class docstring
_CHECKPOINT_FOR_DOC,
FlaxQuestionAnsweringModelOutput,
_CONFIG_FOR_DOC,
)
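A hedged end-to-end sketch of the question-answering head: the predicted span is recovered by taking the argmax of the start and end logits and decoding the tokens in between. The question/context pair is made up, and without a fine-tuned checkpoint the predicted span is of course not meaningful:
```
>>> from transformers import AutoTokenizer, FlaxAlbertForQuestionAnswering

>>> tokenizer = AutoTokenizer.from_pretrained("albert/albert-base-v2")
>>> model = FlaxAlbertForQuestionAnswering.from_pretrained("albert/albert-base-v2")

>>> question = "Where do ALBERT models share parameters?"
>>> context = "ALBERT shares parameters across layers, which keeps the model size small."
>>> inputs = tokenizer(question, context, return_tensors="np")

>>> outputs = model(**inputs)
>>> start = int(outputs.start_logits.argmax(axis=-1)[0])
>>> end = int(outputs.end_logits.argmax(axis=-1)[0])
>>> answer = tokenizer.decode(inputs["input_ids"][0, start : end + 1])
```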
.\models\albert\modeling_tf_albert.py
""" TF 2.0 ALBERT model."""
from __future__ import annotations
import math
from dataclasses import dataclass
from typing import Dict, Optional, Tuple, Union
import numpy as np
import tensorflow as tf
from ...activations_tf import get_tf_activation
from ...modeling_tf_outputs import (
TFBaseModelOutput,
TFBaseModelOutputWithPooling,
TFMaskedLMOutput,
TFMultipleChoiceModelOutput,
TFQuestionAnsweringModelOutput,
TFSequenceClassifierOutput,
TFTokenClassifierOutput,
)
from ...modeling_tf_utils import (
TFMaskedLanguageModelingLoss,
TFModelInputType,
TFMultipleChoiceLoss,
TFPreTrainedModel,
TFQuestionAnsweringLoss,
TFSequenceClassificationLoss,
TFTokenClassificationLoss,
get_initializer,
keras,
keras_serializable,
unpack_inputs,
)
from ...tf_utils import (
check_embeddings_within_bounds,
shape_list,
stable_softmax,
)
from ...utils import (
ModelOutput,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_albert import AlbertConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "albert/albert-base-v2"
_CONFIG_FOR_DOC = "AlbertConfig"
TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"albert/albert-base-v1",
"albert/albert-large-v1",
"albert/albert-xlarge-v1",
"albert/albert-xxlarge-v1",
"albert/albert-base-v2",
"albert/albert-large-v2",
"albert/albert-xlarge-v2",
"albert/albert-xxlarge-v2",
]
class TFAlbertPreTrainingLoss:
"""
Loss function suitable for ALBERT pretraining, i.e. the task of pretraining a language model by combining SOP + MLM.
.. note:: Any label of -100 will be ignored (along with the corresponding logits) in the loss computation.
"""
def hf_compute_loss(self, labels: tf.Tensor, logits: tf.Tensor) -> tf.Tensor:
loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction=keras.losses.Reduction.NONE)
if self.config.tf_legacy_loss:
masked_lm_active_loss = tf.not_equal(tf.reshape(tensor=labels["labels"], shape=(-1,)), -100)
masked_lm_reduced_logits = tf.boolean_mask(
tensor=tf.reshape(tensor=logits[0], shape=(-1, shape_list(logits[0])[2])),
mask=masked_lm_active_loss,
)
masked_lm_labels = tf.boolean_mask(
tensor=tf.reshape(tensor=labels["labels"], shape=(-1,)), mask=masked_lm_active_loss
)
sentence_order_active_loss = tf.not_equal(
tf.reshape(tensor=labels["sentence_order_label"], shape=(-1,)), -100
)
sentence_order_reduced_logits = tf.boolean_mask(
tensor=tf.reshape(tensor=logits[1], shape=(-1, 2)), mask=sentence_order_active_loss
)
sentence_order_label = tf.boolean_mask(
tensor=tf.reshape(tensor=labels["sentence_order_label"], shape=(-1,)), mask=sentence_order_active_loss
)
masked_lm_loss = loss_fn(y_true=masked_lm_labels, y_pred=masked_lm_reduced_logits)
sentence_order_loss = loss_fn(y_true=sentence_order_label, y_pred=sentence_order_reduced_logits)
masked_lm_loss = tf.reshape(tensor=masked_lm_loss, shape=(-1, shape_list(sentence_order_loss)[0]))
masked_lm_loss = tf.reduce_mean(input_tensor=masked_lm_loss, axis=0)
return masked_lm_loss + sentence_order_loss
unmasked_lm_losses = loss_fn(y_true=tf.nn.relu(labels["labels"]), y_pred=logits[0])
lm_loss_mask = tf.cast(labels["labels"] != -100, dtype=unmasked_lm_losses.dtype)
masked_lm_losses = unmasked_lm_losses * lm_loss_mask
reduced_masked_lm_loss = tf.reduce_sum(masked_lm_losses) / tf.reduce_sum(lm_loss_mask)
sop_logits = tf.reshape(logits[1], (-1, 2))
unmasked_sop_loss = loss_fn(y_true=tf.nn.relu(labels["sentence_order_label"]), y_pred=sop_logits)
sop_loss_mask = tf.cast(labels["sentence_order_label"] != -100, dtype=unmasked_sop_loss.dtype)
masked_sop_loss = unmasked_sop_loss * sop_loss_mask
reduced_masked_sop_loss = tf.reduce_sum(masked_sop_loss) / tf.reduce_sum(sop_loss_mask)
return tf.reshape(reduced_masked_lm_loss + reduced_masked_sop_loss, (1,))
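The non-legacy branch above masks ignored positions by first clamping labels with `tf.nn.relu` (so `-100` becomes a valid index of 0), computing the per-token loss, and then zeroing those positions with the `!= -100` mask before averaging. A minimal standalone sketch of that masking pattern (toy labels and random logits, not real model outputs):
```
>>> import tensorflow as tf

>>> labels = tf.constant([2, -100, 1])            # second token is ignored
>>> logits = tf.random.normal((3, 5))             # (sequence_length, vocab_size)

>>> loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
...     from_logits=True, reduction=tf.keras.losses.Reduction.NONE
... )
>>> per_token = loss_fn(y_true=tf.nn.relu(labels), y_pred=logits)   # -100 clamped to index 0 here
>>> mask = tf.cast(labels != -100, dtype=per_token.dtype)
>>> loss = tf.reduce_sum(per_token * mask) / tf.reduce_sum(mask)    # mean over real tokens only
```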
"""Construct the embeddings from word, position and token_type embeddings."""
def __init__(self, config: AlbertConfig, **kwargs):
super().__init__(**kwargs)
self.config = config
self.embedding_size = config.embedding_size
self.max_position_embeddings = config.max_position_embeddings
self.initializer_range = config.initializer_range
self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
def build(self, input_shape=None):
with tf.name_scope("word_embeddings"):
self.weight = self.add_weight(
name="weight",
shape=[self.config.vocab_size, self.embedding_size],
initializer=get_initializer(self.initializer_range),
)
with tf.name_scope("token_type_embeddings"):
self.token_type_embeddings = self.add_weight(
name="embeddings",
shape=[self.config.type_vocab_size, self.embedding_size],
initializer=get_initializer(self.initializer_range),
)
with tf.name_scope("position_embeddings"):
self.position_embeddings = self.add_weight(
name="embeddings",
shape=[self.max_position_embeddings, self.embedding_size],
initializer=get_initializer(self.initializer_range),
)
if self.built:
return
self.built = True
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.embedding_size])
def call(
self,
input_ids: tf.Tensor = None,
position_ids: tf.Tensor = None,
token_type_ids: tf.Tensor = None,
inputs_embeds: tf.Tensor = None,
past_key_values_length=0,
training: bool = False,
) -> tf.Tensor:
"""
Applies embedding based on inputs tensor.
Returns:
final_embeddings (`tf.Tensor`): output embedding tensor.
"""
if input_ids is None and inputs_embeds is None:
raise ValueError("Need to provide either `input_ids` or `input_embeds`.")
if input_ids is not None:
check_embeddings_within_bounds(input_ids, self.config.vocab_size)
inputs_embeds = tf.gather(params=self.weight, indices=input_ids)
input_shape = shape_list(inputs_embeds)[:-1]
if token_type_ids is None:
token_type_ids = tf.fill(dims=input_shape, value=0)
if position_ids is None:
position_ids = tf.expand_dims(
tf.range(start=past_key_values_length, limit=input_shape[1] + past_key_values_length), axis=0
)
position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids)
token_type_embeds = tf.gather(params=self.token_type_embeddings, indices=token_type_ids)
final_embeddings = inputs_embeds + position_embeds + token_type_embeds
final_embeddings = self.LayerNorm(inputs=final_embeddings)
final_embeddings = self.dropout(inputs=final_embeddings, training=training)
return final_embeddings
"""Contains the complete attention sublayer, including both dropouts and layer norm."""
def __init__(self, config: AlbertConfig, **kwargs):
super().__init__(**kwargs)
if config.hidden_size % config.num_attention_heads != 0:
raise ValueError(
f"The hidden size ({config.hidden_size}) is not a multiple of the number "
f"of attention heads ({config.num_attention_heads})"
)
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.sqrt_att_head_size = math.sqrt(self.attention_head_size)
self.output_attentions = config.output_attentions
self.query = keras.layers.Dense(
units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query"
)
self.key = keras.layers.Dense(
units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key"
)
self.value = keras.layers.Dense(
units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value"
)
self.dense = keras.layers.Dense(
units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.attention_dropout = keras.layers.Dropout(rate=config.attention_probs_dropout_prob)
self.output_dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config
def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor:
tensor = tf.reshape(tensor=tensor, shape=(batch_size, -1, self.num_attention_heads, self.attention_head_size))
return tf.transpose(tensor, perm=[0, 2, 1, 3])
def call(
self,
input_tensor: tf.Tensor,
attention_mask: tf.Tensor,
head_mask: tf.Tensor,
output_attentions: bool,
training: bool = False,
) -> Tuple[tf.Tensor]:
batch_size = shape_list(input_tensor)[0]
mixed_query_layer = self.query(inputs=input_tensor)
mixed_key_layer = self.key(inputs=input_tensor)
mixed_value_layer = self.value(inputs=input_tensor)
query_layer = self.transpose_for_scores(mixed_query_layer, batch_size)
key_layer = self.transpose_for_scores(mixed_key_layer, batch_size)
value_layer = self.transpose_for_scores(mixed_value_layer, batch_size)
attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
dk = tf.cast(self.sqrt_att_head_size, dtype=attention_scores.dtype)
attention_scores = tf.divide(attention_scores, dk)
if attention_mask is not None:
attention_scores = tf.add(attention_scores, attention_mask)
attention_probs = stable_softmax(logits=attention_scores, axis=-1)
attention_probs = self.attention_dropout(inputs=attention_probs, training=training)
if head_mask is not None:
attention_probs = tf.multiply(attention_probs, head_mask)
context_layer = tf.matmul(attention_probs, value_layer)
context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3])
context_layer = tf.reshape(tensor=context_layer, shape=(batch_size, -1, self.all_head_size))
self_outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
hidden_states = self_outputs[0]
hidden_states = self.dense(inputs=hidden_states)
hidden_states = self.output_dropout(inputs=hidden_states, training=training)
attention_output = self.LayerNorm(inputs=hidden_states + input_tensor)
outputs = (attention_output,) + self_outputs[1:]
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "query", None) is not None:
with tf.name_scope(self.query.name):
self.query.build([None, None, self.config.hidden_size])
if getattr(self, "key", None) is not None:
with tf.name_scope(self.key.name):
self.key.build([None, None, self.config.hidden_size])
if getattr(self, "value", None) is not None:
with tf.name_scope(self.value.name):
self.value.build([None, None, self.config.hidden_size])
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
class TFAlbertLayer(keras.layers.Layer):
def __init__(self, config: AlbertConfig, **kwargs):
super().__init__(**kwargs)
self.attention = TFAlbertAttention(config, name="attention")
self.ffn = keras.layers.Dense(
units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="ffn"
)
if isinstance(config.hidden_act, str):
self.activation = get_tf_activation(config.hidden_act)
else:
self.activation = config.hidden_act
self.ffn_output = keras.layers.Dense(
units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="ffn_output"
)
self.full_layer_layer_norm = keras.layers.LayerNormalization(
epsilon=config.layer_norm_eps, name="full_layer_layer_norm"
)
self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config
def call(
self,
hidden_states: tf.Tensor,
attention_mask: tf.Tensor,
head_mask: tf.Tensor,
output_attentions: bool,
training: bool = False,
) -> Tuple[tf.Tensor]:
attention_outputs = self.attention(
input_tensor=hidden_states,
attention_mask=attention_mask,
head_mask=head_mask,
output_attentions=output_attentions,
training=training,
)
ffn_output = self.ffn(inputs=attention_outputs[0])
ffn_output = self.activation(ffn_output)
ffn_output = self.ffn_output(inputs=ffn_output)
ffn_output = self.dropout(inputs=ffn_output, training=training)
hidden_states = self.full_layer_layer_norm(inputs=ffn_output + attention_outputs[0])
outputs = (hidden_states,) + attention_outputs[1:]
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "attention", None) is not None:
with tf.name_scope(self.attention.name):
self.attention.build(None)
if getattr(self, "ffn", None) is not None:
with tf.name_scope(self.ffn.name):
self.ffn.build([None, None, self.config.hidden_size])
if getattr(self, "ffn_output", None) is not None:
with tf.name_scope(self.ffn_output.name):
self.ffn_output.build([None, None, self.config.intermediate_size])
if getattr(self, "full_layer_layer_norm", None) is not None:
with tf.name_scope(self.full_layer_layer_norm.name):
self.full_layer_layer_norm.build([None, None, self.config.hidden_size])
class TFAlbertLayerGroup(keras.layers.Layer):
def __init__(self, config: AlbertConfig, **kwargs):
super().__init__(**kwargs)
self.albert_layers = [
TFAlbertLayer(config, name=f"albert_layers_._{i}") for i in range(config.inner_group_num)
]
def call(
self,
hidden_states: tf.Tensor,
attention_mask: tf.Tensor,
head_mask: tf.Tensor,
output_attentions: bool,
output_hidden_states: bool,
training: bool = False,
) -> Union[TFBaseModelOutput, Tuple[tf.Tensor]]:
layer_hidden_states = () if output_hidden_states else None
layer_attentions = () if output_attentions else None
for layer_index, albert_layer in enumerate(self.albert_layers):
if output_hidden_states:
layer_hidden_states = layer_hidden_states + (hidden_states,)
layer_output = albert_layer(
hidden_states=hidden_states,
attention_mask=attention_mask,
head_mask=head_mask[layer_index],
output_attentions=output_attentions,
training=training,
)
hidden_states = layer_output[0]
if output_attentions:
layer_attentions = layer_attentions + (layer_output[1],)
if output_hidden_states:
layer_hidden_states = layer_hidden_states + (hidden_states,)
return tuple(v for v in [hidden_states, layer_hidden_states, layer_attentions] if v is not None)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "albert_layers", None) is not None:
for layer in self.albert_layers:
with tf.name_scope(layer.name):
layer.build(None)
class TFAlbertTransformer(keras.layers.Layer):
def __init__(self, config: AlbertConfig, **kwargs):
super().__init__(**kwargs)
self.num_hidden_layers = config.num_hidden_layers
self.num_hidden_groups = config.num_hidden_groups
self.layers_per_group = int(config.num_hidden_layers / config.num_hidden_groups)
self.embedding_hidden_mapping_in = keras.layers.Dense(
units=config.hidden_size,
kernel_initializer=get_initializer(config.initializer_range),
name="embedding_hidden_mapping_in",
)
self.albert_layer_groups = [
TFAlbertLayerGroup(config, name=f"albert_layer_groups_._{i}") for i in range(config.num_hidden_groups)
]
self.config = config
def call(
self,
hidden_states: tf.Tensor,
attention_mask: tf.Tensor,
head_mask: tf.Tensor,
output_attentions: bool,
output_hidden_states: bool,
return_dict: bool,
training: bool = False,
) -> Union[TFBaseModelOutput, Tuple[tf.Tensor]]:
hidden_states = self.embedding_hidden_mapping_in(inputs=hidden_states)
all_attentions = () if output_attentions else None
all_hidden_states = (hidden_states,) if output_hidden_states else None
for i in range(self.num_hidden_layers):
group_idx = int(i / (self.num_hidden_layers / self.num_hidden_groups))
layer_group_output = self.albert_layer_groups[group_idx](
hidden_states=hidden_states,
attention_mask=attention_mask,
head_mask=head_mask[group_idx * self.layers_per_group : (group_idx + 1) * self.layers_per_group],
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
training=training,
)
hidden_states = layer_group_output[0]
if output_attentions:
all_attentions = all_attentions + layer_group_output[-1]
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None)
return TFBaseModelOutput(
last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embedding_hidden_mapping_in", None) is not None:
with tf.name_scope(self.embedding_hidden_mapping_in.name):
self.embedding_hidden_mapping_in.build([None, None, self.config.embedding_size])
if getattr(self, "albert_layer_groups", None) is not None:
for layer in self.albert_layer_groups:
with tf.name_scope(layer.name):
layer.build(None)
"""
处理权重初始化、预训练模型下载和加载的抽象类。
"""
config_class = AlbertConfig
base_model_prefix = "albert"
class TFAlbertMLMHead(keras.layers.Layer):
def __init__(self, config: AlbertConfig, input_embeddings: keras.layers.Layer, **kwargs):
super().__init__(**kwargs)
self.config = config
self.embedding_size = config.embedding_size
self.dense = keras.layers.Dense(
config.embedding_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
if isinstance(config.hidden_act, str):
self.activation = get_tf_activation(config.hidden_act)
else:
self.activation = config.hidden_act
self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.decoder = input_embeddings
def build(self, input_shape=None):
self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias")
self.decoder_bias = self.add_weight(
shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="decoder/bias"
)
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.embedding_size])
def get_output_embeddings(self) -> keras.layers.Layer:
return self.decoder
def set_output_embeddings(self, value: tf.Variable):
self.decoder.weight = value
self.decoder.vocab_size = shape_list(value)[0]
def get_bias(self) -> Dict[str, tf.Variable]:
return {"bias": self.bias, "decoder_bias": self.decoder_bias}
def set_bias(self, value: tf.Variable):
self.bias = value["bias"]
self.decoder_bias = value["decoder_bias"]
self.config.vocab_size = shape_list(value["bias"])[0]
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
hidden_states = self.activation(hidden_states)
hidden_states = self.LayerNorm(inputs=hidden_states)
seq_length = shape_list(tensor=hidden_states)[1]
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.embedding_size])
hidden_states = tf.matmul(a=hidden_states, b=self.decoder.weight, transpose_b=True)
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.config.vocab_size])
hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.decoder_bias)
return hidden_states
@keras_serializable
class TFAlbertMainLayer(keras.layers.Layer):
config_class = AlbertConfig
def __init__(self, config: AlbertConfig, add_pooling_layer: bool = True, **kwargs):
super().__init__(**kwargs)
self.config = config
self.embeddings = TFAlbertEmbeddings(config, name="embeddings")
self.encoder = TFAlbertTransformer(config, name="encoder")
self.pooler = (
keras.layers.Dense(
units=config.hidden_size,
kernel_initializer=get_initializer(config.initializer_range),
activation="tanh",
name="pooler",
)
if add_pooling_layer
else None
)
def get_input_embeddings(self) -> keras.layers.Layer:
return self.embeddings
def set_input_embeddings(self, value: tf.Variable):
self.embeddings.weight = value
self.embeddings.vocab_size = shape_list(value)[0]
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
raise NotImplementedError
@unpack_inputs
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: bool = False,
):
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embeddings", None) is not None:
with tf.name_scope(self.embeddings.name):
self.embeddings.build(None)
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
if getattr(self, "pooler", None) is not None:
with tf.name_scope(self.pooler.name):
self.pooler.build([None, None, self.config.hidden_size])
@dataclass
class TFAlbertForPreTrainingOutput(ModelOutput):
"""
Output type of [`TFAlbertForPreTraining`].
"""
loss: tf.Tensor = None
prediction_logits: tf.Tensor = None
sop_logits: tf.Tensor = None
hidden_states: Tuple[tf.Tensor] | None = None
attentions: Tuple[tf.Tensor] | None = None
"""
这个模型继承自 `TFPreTrainedModel`。查看超类文档以获取库实现的通用方法,比如下载或保存模型、调整输入嵌入大小、修剪头等。
这个模型也是 [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) 的子类。将其用作常规的 TF 2.0 Keras 模型,并参考 TF 2.0 文档,以获取所有与一般使用和行为相关的信息。
<Tip>
`transformers` 中的 TensorFlow 模型和层接受两种输入格式:
- 将所有输入作为关键字参数(类似于 PyTorch 模型),或者
- 将所有输入作为列表、元组或字典传递给第一个位置参数。
支持第二种格式的原因在于,Keras 方法在将输入传递给模型和层时更喜欢这种格式。由于这种支持,当使用诸如 `model.fit()` 这样的方法时,只需传递模型支持的任何格式的输入和标签即可!然而,如果您想在 Keras 方法之外使用第二种格式,比如在使用 Keras `Functional` API 创建自己的层或模型时,可以使用三种可能性来收集第一个位置参数中的所有输入张量:
- 只有 `input_ids` 的单个张量:`model(input_ids)`
- 长度可变的列表,按照文档字符串中给定的顺序包含一个或多个输入张量:`model([input_ids, attention_mask])` 或 `model([input_ids, attention_mask, token_type_ids])`
- 一个字典,将一个或多个输入张量与文档字符串中给定的输入名称相关联:`model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
请注意,当使用 [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) 创建模型和层时,您无需担心这些问题,因为可以像将输入传递给任何其他 Python 函数一样传递输入!
</Tip>
Args:
config ([`AlbertConfig`]): 包含模型所有参数的模型配置类。
使用配置文件初始化不会加载与模型关联的权重,仅加载配置。查看 [`~PreTrainedModel.from_pretrained`] 方法以加载模型权重。
"""
@add_start_docstrings(
"不带任何特定头部的裸 Albert 模型变压器输出原始隐藏状态。",
ALBERT_START_DOCSTRING,
)
class TFAlbertModel(TFAlbertPreTrainedModel):
def __init__(self, config: AlbertConfig, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.albert = TFAlbertMainLayer(config, name="albert")
@unpack_inputs
@add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFBaseModelOutputWithPooling,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: Optional[bool] = False,
) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]:
outputs = self.albert(
input_ids=input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "albert", None) is not None:
with tf.name_scope(self.albert.name):
self.albert.build(None)
"""
Albert Model with two heads on top for pretraining: a `masked language modeling` head and a `sentence order
prediction` (classification) head.
"""
@add_start_docstrings(
"""
Albert Model with two heads on top for pretraining: a `masked language modeling` head and a `sentence order
prediction` (classification) head.
""",
ALBERT_START_DOCSTRING,
)
class TFAlbertForPreTraining(TFAlbertPreTrainedModel, TFAlbertPreTrainingLoss):
_keys_to_ignore_on_load_unexpected = [r"predictions.decoder.weight"]
def __init__(self, config: AlbertConfig, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
self.albert = TFAlbertMainLayer(config, name="albert")
self.predictions = TFAlbertMLMHead(config, input_embeddings=self.albert.embeddings, name="predictions")
self.sop_classifier = TFAlbertSOPHead(config, name="sop_classifier")
def get_lm_head(self) -> keras.layers.Layer:
return self.predictions
@unpack_inputs
@add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=TFAlbertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: np.ndarray | tf.Tensor | None = None,
sentence_order_label: np.ndarray | tf.Tensor | None = None,
training: Optional[bool] = False,
**kwargs,
    ) -> Union[TFAlbertForPreTrainingOutput, Tuple[tf.Tensor]]:
r"""
Return:
Example:
```
>>> import tensorflow as tf
>>> from transformers import AutoTokenizer, TFAlbertForPreTraining
>>> tokenizer = AutoTokenizer.from_pretrained("albert/albert-base-v2")
>>> model = TFAlbertForPreTraining.from_pretrained("albert/albert-base-v2")
>>> input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]
>>> # Batch size 1
>>> outputs = model(input_ids)
>>> prediction_logits = outputs.prediction_logits
>>> sop_logits = outputs.sop_logits
```"""
outputs = self.albert(
input_ids=input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
sequence_output, pooled_output = outputs[:2]
prediction_scores = self.predictions(hidden_states=sequence_output)
sop_scores = self.sop_classifier(pooled_output=pooled_output, training=training)
total_loss = None
if labels is not None and sentence_order_label is not None:
d_labels = {"labels": labels}
d_labels["sentence_order_label"] = sentence_order_label
total_loss = self.hf_compute_loss(labels=d_labels, logits=(prediction_scores, sop_scores))
if not return_dict:
output = (prediction_scores, sop_scores) + outputs[2:]
return ((total_loss,) + output) if total_loss is not None else output
return TFAlbertForPreTrainingOutput(
loss=total_loss,
prediction_logits=prediction_scores,
sop_logits=sop_scores,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "albert", None) is not None:
with tf.name_scope(self.albert.name):
self.albert.build(None)
if getattr(self, "predictions", None) is not None:
with tf.name_scope(self.predictions.name):
self.predictions.build(None)
if getattr(self, "sop_classifier", None) is not None:
with tf.name_scope(self.sop_classifier.name):
self.sop_classifier.build(None)
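The loss branch in `call` only fires when both `labels` (masked-LM targets) and `sentence_order_label` (SOP targets) are provided, in which case `hf_compute_loss` combines the two objectives. Below is a hedged sketch of feeding both label tensors; the labels are toy values (in real pretraining, non-masked positions would be set to -100 so they are ignored by the loss):

```python
import tensorflow as tf
from transformers import AutoTokenizer, TFAlbertForPreTraining

tokenizer = AutoTokenizer.from_pretrained("albert/albert-base-v2")
model = TFAlbertForPreTraining.from_pretrained("albert/albert-base-v2")

enc = tokenizer("The quick brown fox jumps over the lazy dog", return_tensors="tf")
labels = enc["input_ids"]                # toy MLM labels: predict every token
sentence_order_label = tf.constant([0])  # 0 = sentences are in the correct order

outputs = model(
    input_ids=enc["input_ids"],
    attention_mask=enc["attention_mask"],
    labels=labels,
    sentence_order_label=sentence_order_label,
)
print(outputs.loss, outputs.prediction_logits.shape, outputs.sop_logits.shape)
```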
class TFAlbertSOPHead(keras.layers.Layer):
def __init__(self, config: AlbertConfig, **kwargs):
super().__init__(**kwargs)
self.dropout = keras.layers.Dropout(rate=config.classifier_dropout_prob)
self.classifier = keras.layers.Dense(
units=config.num_labels,
kernel_initializer=get_initializer(config.initializer_range),
name="classifier",
)
self.config = config
def call(self, pooled_output: tf.Tensor, training: bool) -> tf.Tensor:
dropout_pooled_output = self.dropout(inputs=pooled_output, training=training)
logits = self.classifier(inputs=dropout_pooled_output)
return logits
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_size])
@add_start_docstrings("""Albert Model with a `language modeling` head on top.""", ALBERT_START_DOCSTRING)
class TFAlbertForMaskedLM(TFAlbertPreTrainedModel, TFMaskedLanguageModelingLoss):
_keys_to_ignore_on_load_unexpected = [r"pooler", r"predictions.decoder.weight"]
def __init__(self, config: AlbertConfig, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.albert = TFAlbertMainLayer(config, add_pooling_layer=False, name="albert")
self.predictions = TFAlbertMLMHead(config, input_embeddings=self.albert.embeddings, name="predictions")
def get_lm_head(self) -> keras.layers.Layer:
return self.predictions
@unpack_inputs
@add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=TFMaskedLMOutput, config_class=_CONFIG_FOR_DOC)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: np.ndarray | tf.Tensor | None = None,
        training: Optional[bool] = False,
    ) -> Union[TFMaskedLMOutput, Tuple[tf.Tensor]]:
outputs = self.albert(
input_ids=input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
sequence_output = outputs[0]
prediction_scores = self.predictions(hidden_states=sequence_output, training=training)
loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=prediction_scores)
if not return_dict:
output = (prediction_scores,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return TFMaskedLMOutput(
loss=loss,
logits=prediction_scores,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "albert", None) is not None:
with tf.name_scope(self.albert.name):
self.albert.build(None)
if getattr(self, "predictions", None) is not None:
with tf.name_scope(self.predictions.name):
self.predictions.build(None)
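A hedged usage sketch for the masked-LM head: mask one token, read the logits at the `[MASK]` position, and decode the top-scoring prediction (assumes the `albert/albert-base-v2` checkpoint is available):

```python
import tensorflow as tf
from transformers import AutoTokenizer, TFAlbertForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("albert/albert-base-v2")
model = TFAlbertForMaskedLM.from_pretrained("albert/albert-base-v2")

inputs = tokenizer("The capital of France is [MASK].", return_tensors="tf")
logits = model(**inputs).logits

# Locate the [MASK] position and take the highest-scoring vocabulary id there.
mask_index = tf.where(inputs["input_ids"][0] == tokenizer.mask_token_id)[0][0]
predicted_id = tf.argmax(logits[0, mask_index])
print(tokenizer.decode([int(predicted_id)]))
```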
@add_start_docstrings(
"""
Albert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
output) e.g. for GLUE tasks.
""",
ALBERT_START_DOCSTRING,
)
class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel, TFSequenceClassificationLoss):
_keys_to_ignore_on_load_unexpected = [r"predictions"]
_keys_to_ignore_on_load_missing = [r"dropout"]
def __init__(self, config: AlbertConfig, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
self.albert = TFAlbertMainLayer(config, name="albert")
self.dropout = keras.layers.Dropout(rate=config.classifier_dropout_prob)
self.classifier = keras.layers.Dense(
units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint="vumichien/albert-base-v2-imdb",
output_type=TFSequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
expected_output="'LABEL_1'",
expected_loss=0.12,
)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: np.ndarray | tf.Tensor | None = None,
training: Optional[bool] = False,
) -> Union[TFSequenceClassifierOutput, Tuple[tf.Tensor]]:
r"""
labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
outputs = self.albert(
input_ids=input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
pooled_output = outputs[1]
pooled_output = self.dropout(inputs=pooled_output, training=training)
logits = self.classifier(inputs=pooled_output)
loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=logits)
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return TFSequenceClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "albert", None) is not None:
with tf.name_scope(self.albert.name):
self.albert.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_size])
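A minimal inference sketch for the sequence-classification head, assuming the fine-tuned `vumichien/albert-base-v2-imdb` checkpoint named in the decorator above is available:

```python
import tensorflow as tf
from transformers import AutoTokenizer, TFAlbertForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("vumichien/albert-base-v2-imdb")
model = TFAlbertForSequenceClassification.from_pretrained("vumichien/albert-base-v2-imdb")

inputs = tokenizer("A genuinely moving film with superb performances.", return_tensors="tf")
logits = model(**inputs).logits  # shape (1, num_labels)

predicted_class = int(tf.argmax(logits, axis=-1)[0])
print(model.config.id2label[predicted_class])
```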
@add_start_docstrings(
"""
Albert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
Named-Entity-Recognition (NER) tasks.
""",
ALBERT_START_DOCSTRING,
)
class TFAlbertForTokenClassification(TFAlbertPreTrainedModel, TFTokenClassificationLoss):
_keys_to_ignore_on_load_unexpected = [r"pooler", r"predictions"]
_keys_to_ignore_on_load_missing = [r"dropout"]
def __init__(self, config: AlbertConfig, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
self.albert = TFAlbertMainLayer(config, add_pooling_layer=False, name="albert")
classifier_dropout_prob = (
config.classifier_dropout_prob
if config.classifier_dropout_prob is not None
else config.hidden_dropout_prob
)
self.dropout = keras.layers.Dropout(rate=classifier_dropout_prob)
self.classifier = keras.layers.Dense(
units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFTokenClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: np.ndarray | tf.Tensor | None = None,
training: Optional[bool] = False,
    ) -> Union[TFTokenClassifierOutput, Tuple[tf.Tensor]]:
r"""
labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
outputs = self.albert(
input_ids=input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
sequence_output = outputs[0]
sequence_output = self.dropout(inputs=sequence_output, training=training)
logits = self.classifier(inputs=sequence_output)
loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=logits)
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return TFTokenClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "albert", None) is not None:
with tf.name_scope(self.albert.name):
self.albert.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_size])
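For the token-classification head, the per-token logits have to be mapped back to label names. The sketch below loads the base `albert/albert-base-v2` checkpoint with a hypothetical `num_labels=5` head; that classifier is randomly initialized, so the printed labels are only illustrative of the API:

```python
import tensorflow as tf
from transformers import AutoTokenizer, TFAlbertForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("albert/albert-base-v2")
model = TFAlbertForTokenClassification.from_pretrained("albert/albert-base-v2", num_labels=5)

inputs = tokenizer("HuggingFace is based in New York City", return_tensors="tf")
logits = model(**inputs).logits  # shape (1, sequence_length, num_labels)

predicted_ids = tf.argmax(logits, axis=-1)[0]
tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0].numpy().tolist())
for token, label_id in zip(tokens, predicted_ids.numpy()):
    print(token, model.config.id2label[int(label_id)])
```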
"""
Albert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
"""
@add_start_docstrings(ALBERT_START_DOCSTRING)
class TFAlbertForQuestionAnswering(TFAlbertPreTrainedModel, TFQuestionAnsweringLoss):
_keys_to_ignore_on_load_unexpected = [r"pooler", r"predictions"]
def __init__(self, config: AlbertConfig, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
self.albert = TFAlbertMainLayer(config, add_pooling_layer=False, name="albert")
self.qa_outputs = keras.layers.Dense(
units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint="vumichien/albert-base-v2-squad2",
output_type=TFQuestionAnsweringModelOutput,
config_class=_CONFIG_FOR_DOC,
qa_target_start_index=12,
qa_target_end_index=13,
expected_output="'a nice puppet'",
expected_loss=7.36,
)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
start_positions: np.ndarray | tf.Tensor | None = None,
end_positions: np.ndarray | tf.Tensor | None = None,
training: Optional[bool] = False,
) -> Union[TFQuestionAnsweringModelOutput, Tuple[tf.Tensor]]:
r"""
start_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
end_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
"""
outputs = self.albert(
input_ids=input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
sequence_output = outputs[0]
logits = self.qa_outputs(inputs=sequence_output)
start_logits, end_logits = tf.split(value=logits, num_or_size_splits=2, axis=-1)
start_logits = tf.squeeze(input=start_logits, axis=-1)
end_logits = tf.squeeze(input=end_logits, axis=-1)
loss = None
if start_positions is not None and end_positions is not None:
labels = {"start_position": start_positions}
labels["end_position"] = end_positions
loss = self.hf_compute_loss(labels=labels, logits=(start_logits, end_logits))
if not return_dict:
output = (start_logits, end_logits) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return TFQuestionAnsweringModelOutput(
loss=loss,
start_logits=start_logits,
end_logits=end_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "albert", None) is not None:
with tf.name_scope(self.albert.name):
self.albert.build(None)
if getattr(self, "qa_outputs", None) is not None:
with tf.name_scope(self.qa_outputs.name):
self.qa_outputs.build([None, None, self.config.hidden_size])
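Because `qa_outputs` produces two logits per token that are split into start and end scores, extracting an answer amounts to taking the argmax of each and decoding the tokens in between. A hedged sketch using the `vumichien/albert-base-v2-squad2` checkpoint named in the decorator above:

```python
import tensorflow as tf
from transformers import AutoTokenizer, TFAlbertForQuestionAnswering

tokenizer = AutoTokenizer.from_pretrained("vumichien/albert-base-v2-squad2")
model = TFAlbertForQuestionAnswering.from_pretrained("vumichien/albert-base-v2-squad2")

question = "Who was Jim Henson?"
context = "Jim Henson was a nice puppet"
inputs = tokenizer(question, context, return_tensors="tf")

outputs = model(**inputs)
start = int(tf.argmax(outputs.start_logits, axis=-1)[0])
end = int(tf.argmax(outputs.end_logits, axis=-1)[0])

answer_ids = inputs["input_ids"][0, start : end + 1]
print(tokenizer.decode(answer_ids))
```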
"""
Albert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
softmax) e.g. for RocStories/SWAG tasks.
"""
@add_start_docstrings(
"""
    Albert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
    softmax) e.g. for RocStories/SWAG tasks.
""",
ALBERT_START_DOCSTRING,
)
class TFAlbertForMultipleChoice(TFAlbertPreTrainedModel, TFMultipleChoiceLoss):
"""
    names with a '.' represent the authorized unexpected/missing layers when a TF model is loaded from a PT model
"""
_keys_to_ignore_on_load_unexpected = [r"pooler", r"predictions"]
_keys_to_ignore_on_load_missing = [r"dropout"]
def __init__(self, config: AlbertConfig, *inputs, **kwargs):
"""
Initialize TFAlbertForMultipleChoice model
"""
super().__init__(config, *inputs, **kwargs)
self.albert = TFAlbertMainLayer(config, name="albert")
self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.classifier = keras.layers.Dense(
units=1, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFMultipleChoiceModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: np.ndarray | tf.Tensor | None = None,
training: Optional[bool] = False,
    ) -> Union[TFMultipleChoiceModelOutput, Tuple[tf.Tensor]]:
r"""
labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices]`
where `num_choices` is the size of the second dimension of the input tensors. (See `input_ids` above)
"""
if input_ids is not None:
num_choices = shape_list(input_ids)[1]
seq_length = shape_list(input_ids)[2]
else:
num_choices = shape_list(inputs_embeds)[1]
seq_length = shape_list(inputs_embeds)[2]
flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) if input_ids is not None else None
flat_attention_mask = (
tf.reshape(tensor=attention_mask, shape=(-1, seq_length)) if attention_mask is not None else None
)
flat_token_type_ids = (
tf.reshape(tensor=token_type_ids, shape=(-1, seq_length)) if token_type_ids is not None else None
)
flat_position_ids = (
tf.reshape(tensor=position_ids, shape=(-1, seq_length)) if position_ids is not None else None
)
flat_inputs_embeds = (
tf.reshape(tensor=inputs_embeds, shape=(-1, seq_length, shape_list(inputs_embeds)[3]))
if inputs_embeds is not None
else None
)
outputs = self.albert(
input_ids=flat_input_ids,
attention_mask=flat_attention_mask,
token_type_ids=flat_token_type_ids,
position_ids=flat_position_ids,
head_mask=head_mask,
inputs_embeds=flat_inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
pooled_output = outputs[1]
pooled_output = self.dropout(inputs=pooled_output, training=training)
logits = self.classifier(inputs=pooled_output)
reshaped_logits = tf.reshape(tensor=logits, shape=(-1, num_choices))
loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=reshaped_logits)
if not return_dict:
output = (reshaped_logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return TFMultipleChoiceModelOutput(
loss=loss,
logits=reshaped_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
    def build(self, input_shape=None):
        if self.built:
return
self.built = True
if getattr(self, "albert", None) is not None:
with tf.name_scope(self.albert.name):
self.albert.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_size])
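The multiple-choice head expects inputs of shape `(batch_size, num_choices, sequence_length)`; `call` flattens them to `(batch_size * num_choices, sequence_length)` before running the shared encoder, then reshapes the one-unit classifier output back to `(batch_size, num_choices)`. A hedged sketch of preparing such inputs with the tokenizer; note that the choice classifier on the base `albert/albert-base-v2` checkpoint is randomly initialized, so the selected index is only illustrative:

```python
import tensorflow as tf
from transformers import AutoTokenizer, TFAlbertForMultipleChoice

tokenizer = AutoTokenizer.from_pretrained("albert/albert-base-v2")
model = TFAlbertForMultipleChoice.from_pretrained("albert/albert-base-v2")

prompt = "The weather turned cold, so she"
choices = ["put on a warm coat.", "went for a swim in the sea."]

# Tokenize (prompt, choice) pairs together, then add the num_choices axis in front.
enc = tokenizer([prompt, prompt], choices, return_tensors="tf", padding=True)
inputs = {k: tf.expand_dims(v, 0) for k, v in enc.items()}  # (1, num_choices, seq_len)

logits = model(**inputs).logits  # (1, num_choices)
print(int(tf.argmax(logits, axis=-1)[0]))
```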
.\models\albert\tokenization_albert.py
""" Tokenization classes for ALBERT model."""
import os
import unicodedata
from shutil import copyfile
from typing import Any, Dict, List, Optional, Tuple
import sentencepiece as spm
from ...tokenization_utils import AddedToken, PreTrainedTokenizer
from ...utils import logging
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"albert/albert-base-v1": "https://huggingface.co/albert/albert-base-v1/resolve/main/spiece.model",
"albert/albert-large-v1": "https://huggingface.co/albert/albert-large-v1/resolve/main/spiece.model",
"albert/albert-xlarge-v1": "https://huggingface.co/albert/albert-xlarge-v1/resolve/main/spiece.model",
"albert/albert-xxlarge-v1": "https://huggingface.co/albert/albert-xxlarge-v1/resolve/main/spiece.model",
"albert/albert-base-v2": "https://huggingface.co/albert/albert-base-v2/resolve/main/spiece.model",
"albert/albert-large-v2": "https://huggingface.co/albert/albert-large-v2/resolve/main/spiece.model",
"albert/albert-xlarge-v2": "https://huggingface.co/albert/albert-xlarge-v2/resolve/main/spiece.model",
"albert/albert-xxlarge-v2": "https://huggingface.co/albert/albert-xxlarge-v2/resolve/main/spiece.model",
}
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"albert/albert-base-v1": 512,
"albert/albert-large-v1": 512,
"albert/albert-xlarge-v1": 512,
"albert/albert-xxlarge-v1": 512,
"albert/albert-base-v2": 512,
"albert/albert-large-v2": 512,
"albert/albert-xlarge-v2": 512,
"albert/albert-xxlarge-v2": 512,
}
SPIECE_UNDERLINE = "▁"
class AlbertTokenizer(PreTrainedTokenizer):
"""
Construct an ALBERT tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).
This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
this superclass for more information regarding those methods.
Attributes:
sp_model (`SentencePieceProcessor`):
The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
def __init__(
self,
vocab_file,
do_lower_case=True,
remove_space=True,
keep_accents=False,
bos_token="[CLS]",
eos_token="[SEP]",
unk_token="<unk>",
sep_token="[SEP]",
pad_token="<pad>",
cls_token="[CLS]",
mask_token="[MASK]",
sp_model_kwargs: Optional[Dict[str, Any]] = None,
**kwargs,
) -> None:
mask_token = (
AddedToken(mask_token, lstrip=True, rstrip=False, normalized=False)
if isinstance(mask_token, str)
else mask_token
)
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
self.do_lower_case = do_lower_case
self.remove_space = remove_space
self.keep_accents = keep_accents
self.vocab_file = vocab_file
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(vocab_file)
super().__init__(
do_lower_case=do_lower_case,
remove_space=remove_space,
keep_accents=keep_accents,
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
sep_token=sep_token,
pad_token=pad_token,
cls_token=cls_token,
mask_token=mask_token,
sp_model_kwargs=self.sp_model_kwargs,
**kwargs,
)
@property
def vocab_size(self) -> int:
return len(self.sp_model)
def get_vocab(self) -> Dict[str, int]:
vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
vocab.update(self.added_tokens_encoder)
return vocab
def __getstate__(self):
state = self.__dict__.copy()
state["sp_model"] = None
return state
def __setstate__(self, d):
self.__dict__ = d
if not hasattr(self, "sp_model_kwargs"):
self.sp_model_kwargs = {}
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(self.vocab_file)
def preprocess_text(self, inputs):
if self.remove_space:
outputs = " ".join(inputs.strip().split())
else:
outputs = inputs
outputs = outputs.replace("``", '"').replace("''", '"')
if not self.keep_accents:
outputs = unicodedata.normalize("NFKD", outputs)
outputs = "".join([c for c in outputs if not unicodedata.combining(c)])
if self.do_lower_case:
outputs = outputs.lower()
return outputs
def _tokenize(self, text: str) -> List[str]:
"""Tokenize a string."""
text = self.preprocess_text(text)
pieces = self.sp_model.encode(text, out_type=str)
new_pieces = []
for piece in pieces:
if len(piece) > 1 and piece[-1] == str(",") and piece[-2].isdigit():
cur_pieces = self.sp_model.EncodeAsPieces(piece[:-1].replace(SPIECE_UNDERLINE, ""))
if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE:
if len(cur_pieces[0]) == 1:
cur_pieces = cur_pieces[1:]
else:
cur_pieces[0] = cur_pieces[0][1:]
cur_pieces.append(piece[-1])
new_pieces.extend(cur_pieces)
else:
new_pieces.append(piece)
return new_pieces
def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
return self.sp_model.PieceToId(token)
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
return self.sp_model.IdToPiece(index)
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (string) in a single string."""
current_sub_tokens = []
out_string = ""
prev_is_special = False
for token in tokens:
if token in self.all_special_tokens:
if not prev_is_special:
out_string += " "
out_string += self.sp_model.decode(current_sub_tokens) + token
prev_is_special = True
current_sub_tokens = []
else:
current_sub_tokens.append(token)
prev_is_special = False
out_string += self.sp_model.decode(current_sub_tokens)
return out_string.strip()
    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """Build model inputs from a sequence or a pair of sequences for ALBERT by adding `[CLS]` and `[SEP]` tokens."""
        sep, cls = [self.sep_token_id], [self.cls_token_id]
        if token_ids_1 is None:
            return cls + token_ids_0 + sep
        return cls + token_ids_0 + sep + token_ids_1 + sep
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create token type IDs tensor from token list indices.
Args:
token_ids_0 (`List[int]`):
List of IDs for the first sequence.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: A list of token type IDs (0 or 1) corresponding to each token in the input sequences.
"""
token_type_ids = [0] * len(token_ids_0)
if token_ids_1 is not None:
token_type_ids += [1] * len(token_ids_1)
return token_type_ids
    def create_mask(self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT
sequence pair mask has the following format:
```
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | second sequence |
```
If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
Args:
token_ids_0 (`List[int]`):
List of token IDs for the first sequence.
token_ids_1 (`List[int]`, *optional*):
Optional second list of token IDs for sequence pairs.
Returns:
`List[int]`: List of token type IDs according to the given sequence(s).
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
"""
Save the vocabulary files to the specified directory.
Args:
save_directory (`str`):
Directory path where the vocabulary files will be saved.
filename_prefix (`str`, *optional*):
Optional prefix to prepend to the vocabulary file names.
Returns:
`Tuple[str]`: Tuple containing the path of the saved vocabulary file.
"""
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
out_vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
copyfile(self.vocab_file, out_vocab_file)
elif not os.path.isfile(self.vocab_file):
with open(out_vocab_file, "wb") as fi:
content_spiece_model = self.sp_model.serialized_model_proto()
fi.write(content_spiece_model)
return (out_vocab_file,)
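To close out the tokenizer, a short usage sketch (it assumes the `sentencepiece` package and the `albert/albert-base-v2` vocabulary are available) showing the SentencePiece pieces, the `[CLS] ... [SEP]` layout added around a single sequence, and a round trip back to text:

```python
from transformers import AlbertTokenizer

tokenizer = AlbertTokenizer.from_pretrained("albert/albert-base-v2")

text = "ALBERT shares parameters across layers."
print(tokenizer.tokenize(text))   # SentencePiece pieces, e.g. ['▁albert', '▁shares', ...]

encoded = tokenizer(text)         # adds [CLS] ... [SEP] around the pieces
print(encoded["input_ids"])
print(encoded["token_type_ids"])  # all zeros for a single sequence

print(tokenizer.decode(encoded["input_ids"], skip_special_tokens=True))
```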