Transformers 源码解析（九）

`.\models\albert\configuration_albert.py`

# 引入 OrderedDict 用于有序字典，Mapping 用于类型提示
from collections import OrderedDict
from typing import Mapping

# 引入预训练配置工具类 PretrainedConfig 和 OnnxConfig
from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfig

# ALBERT 预训练模型配置文件映射字典，将模型名称映射到对应的配置文件 URL
ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "albert/albert-base-v1": "https://huggingface.co/albert/albert-base-v1/resolve/main/config.json",
    "albert/albert-large-v1": "https://huggingface.co/albert/albert-large-v1/resolve/main/config.json",
    "albert/albert-xlarge-v1": "https://huggingface.co/albert/albert-xlarge-v1/resolve/main/config.json",
    "albert/albert-xxlarge-v1": "https://huggingface.co/albert/albert-xxlarge-v1/resolve/main/config.json",
    "albert/albert-base-v2": "https://huggingface.co/albert/albert-base-v2/resolve/main/config.json",
    "albert/albert-large-v2": "https://huggingface.co/albert/albert-large-v2/resolve/main/config.json",
    "albert/albert-xlarge-v2": "https://huggingface.co/albert/albert-xlarge-v2/resolve/main/config.json",
    "albert/albert-xxlarge-v2": "https://huggingface.co/albert/albert-xxlarge-v2/resolve/main/config.json",
}

# AlbertConfig 类，继承自 PretrainedConfig，用于存储 ALBERT 模型的配置信息
class AlbertConfig(PretrainedConfig):
    r"""
    这是一个配置类，用于存储 [`AlbertModel`] 或 [`TFAlbertModel`] 的配置。根据指定的参数实例化一个 ALBERT 模型配置，
    定义模型的架构。使用默认参数实例化配置将得到与 ALBERT [albert/albert-xxlarge-v2] 相似的配置。

    配置对象继承自 [`PretrainedConfig`]，可用于控制模型的输出。阅读 [`PretrainedConfig`] 的文档获取更多信息。

    示例：

    ```
    >>> from transformers import AlbertConfig, AlbertModel

    >>> # 初始化 ALBERT-xxlarge 风格的配置
    >>> albert_xxlarge_configuration = AlbertConfig()

    >>> # 初始化 ALBERT-base 风格的配置
    >>> albert_base_configuration = AlbertConfig(
    ...     hidden_size=768,
    ...     num_attention_heads=12,
    ...     intermediate_size=3072,
    ... )

    >>> # 使用 ALBERT-base 风格的配置初始化一个模型（带有随机权重）

    ```
    ```
    >>> model = AlbertModel(albert_xxlarge_configuration)
    
    
    # 创建一个 AlbertModel 的实例，使用给定的配置 albert_xxlarge_configuration
    model = AlbertModel(albert_xxlarge_configuration)
    
    
    
    >>> # Accessing the model configuration
    >>> configuration = model.config
    
    
    # 获取模型的配置信息并赋值给 configuration 变量
    configuration = model.config
    
    
    
    model_type = "albert"
    
    
    
    def __init__(
        self,
        vocab_size=30000,
        embedding_size=128,
        hidden_size=4096,
        num_hidden_layers=12,
        num_hidden_groups=1,
        num_attention_heads=64,
        intermediate_size=16384,
        inner_group_num=1,
        hidden_act="gelu_new",
        hidden_dropout_prob=0,
        attention_probs_dropout_prob=0,
        max_position_embeddings=512,
        type_vocab_size=2,
        initializer_range=0.02,
        layer_norm_eps=1e-12,
        classifier_dropout_prob=0.1,
        position_embedding_type="absolute",
        pad_token_id=0,
        bos_token_id=2,
        eos_token_id=3,
        **kwargs,
    ):
        # 调用父类的构造函数，并传入相关的特殊 token id 参数
        super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
    
        # 初始化 AlbertModel 的各种参数
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_hidden_groups = num_hidden_groups
        self.num_attention_heads = num_attention_heads
        self.inner_group_num = inner_group_num
        self.hidden_act = hidden_act
        self.intermediate_size = intermediate_size
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.max_position_embeddings = max_position_embeddings
        self.type_vocab_size = type_vocab_size
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
        self.classifier_dropout_prob = classifier_dropout_prob
        self.position_embedding_type = position_embedding_type
# 从 transformers.models.bert.configuration_bert.BertOnnxConfig 复制并修改为 AlbertOnnxConfig 类，用于处理 Albert 模型的配置
class AlbertOnnxConfig(OnnxConfig):
    
    # 定义 inputs 属性，返回一个映射，表示输入张量的动态轴
    @property
    def inputs(self) -> Mapping[str, Mapping[int, str]]:
        # 根据任务类型设定动态轴的不同设置
        if self.task == "multiple-choice":
            dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"}
        else:
            dynamic_axis = {0: "batch", 1: "sequence"}
        
        # 返回一个有序字典，指定模型输入张量名称与对应的动态轴
        return OrderedDict(
            [
                ("input_ids", dynamic_axis),        # 模型输入张量 input_ids 对应的动态轴
                ("attention_mask", dynamic_axis),  # 模型输入张量 attention_mask 对应的动态轴
                ("token_type_ids", dynamic_axis),  # 模型输入张量 token_type_ids 对应的动态轴
            ]
        )

`.\models\albert\convert_albert_original_tf_checkpoint_to_pytorch.py`

# coding=utf-8
# Copyright 2018 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convert ALBERT checkpoint."""

import argparse  # 导入解析命令行参数的模块

import torch  # 导入PyTorch库

from ...utils import logging  # 导入日志模块
from . import AlbertConfig, AlbertForPreTraining, load_tf_weights_in_albert  # 导入Albert相关模块

logging.set_verbosity_info()  # 设置日志输出级别为INFO


def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, albert_config_file, pytorch_dump_path):
    # Initialise PyTorch model 初始化PyTorch模型
    config = AlbertConfig.from_json_file(albert_config_file)  # 从JSON配置文件加载Albert模型配置
    print(f"Building PyTorch model from configuration: {config}")  # 打印正在构建的PyTorch模型的配置信息
    model = AlbertForPreTraining(config)  # 根据配置创建Albert预训练模型对象

    # Load weights from tf checkpoint 从TensorFlow的checkpoint加载权重
    load_tf_weights_in_albert(model, config, tf_checkpoint_path)

    # Save pytorch-model 保存PyTorch模型
    print(f"Save PyTorch model to {pytorch_dump_path}")  # 打印正在保存的PyTorch模型的路径
    torch.save(model.state_dict(), pytorch_dump_path)  # 将模型的状态字典保存到指定路径


if __name__ == "__main__":
    parser = argparse.ArgumentParser()  # 创建命令行参数解析器
    # Required parameters 必需的命令行参数
    parser.add_argument(
        "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path."
    )
    parser.add_argument(
        "--albert_config_file",
        default=None,
        type=str,
        required=True,
        help=(
            "The config json file corresponding to the pre-trained ALBERT model. \n"
            "This specifies the model architecture."
        ),
    )
    parser.add_argument(
        "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
    )
    args = parser.parse_args()  # 解析命令行参数
    convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.albert_config_file, args.pytorch_dump_path)  # 调用转换函数进行转换

`D:\Python310\Lib\site-packages\transformers\models\albert\modeling_albert.py`

# coding=utf-8
# 上面的注释指定了源代码文件的编码格式为UTF-8

# 版权声明和许可证信息，这里使用Apache License 2.0
# 允许根据此许可证使用和分发代码
# 详细信息可参考 http://www.apache.org/licenses/LICENSE-2.0

"""PyTorch ALBERT model."""
# 引入标准库和第三方库
import math  # 导入数学库
import os  # 导入操作系统相关的功能
from dataclasses import dataclass  # 从标准库导入 dataclass 类型装饰器
from typing import Dict, List, Optional, Tuple, Union  # 导入类型提示相关的功能

import torch  # 导入PyTorch库
from torch import nn  # 导入PyTorch的神经网络模块
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss  # 导入损失函数类

# 导入子模块和函数
from ...activations import ACT2FN  # 从上层包中导入 ACT2FN 激活函数
from ...modeling_outputs import (  # 导入模型输出相关的类
    BaseModelOutput,
    BaseModelOutputWithPooling,
    MaskedLMOutput,
    MultipleChoiceModelOutput,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutput,
    TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel  # 导入预训练模型基类
from ...pytorch_utils import (  # 导入PyTorch工具函数
    apply_chunking_to_forward,
    find_pruneable_heads_and_indices,
    prune_linear_layer,
)
from ...utils import (  # 导入实用函数和类
    ModelOutput,
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
from .configuration_albert import AlbertConfig  # 导入ALBERT配置类

# 获取日志记录器
logger = logging.get_logger(__name__)

# 文档用的模型检查点和配置文件路径
_CHECKPOINT_FOR_DOC = "albert/albert-base-v2"
_CONFIG_FOR_DOC = "AlbertConfig"

# 预训练模型存档列表，包含多个ALBERT模型
ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "albert/albert-base-v1",
    "albert/albert-large-v1",
    "albert/albert-xlarge-v1",
    "albert/albert-xxlarge-v1",
    "albert/albert-base-v2",
    "albert/albert-large-v2",
    "albert/albert-xlarge-v2",
    "albert/albert-xxlarge-v2",
    # 可以在 https://huggingface.co/models?filter=albert 查看所有的ALBERT模型
]

# 加载TensorFlow模型权重到PyTorch模型
def load_tf_weights_in_albert(model, config, tf_checkpoint_path):
    """Load tf checkpoints in a pytorch model."""
    try:
        import re  # 导入正则表达式模块
        import numpy as np  # 导入NumPy库
        import tensorflow as tf  # 导入TensorFlow库
    except ImportError:
        logger.error(
            "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
            "https://www.tensorflow.org/install/ for installation instructions."
        )
        raise

    # 获取TensorFlow模型检查点的绝对路径
    tf_path = os.path.abspath(tf_checkpoint_path)
    logger.info(f"Converting TensorFlow checkpoint from {tf_path}")

    # 从TensorFlow模型加载权重
    init_vars = tf.train.list_variables(tf_path)
    names = []
    arrays = []

    # 遍历TensorFlow模型的所有变量
    for name, shape in init_vars:
        logger.info(f"Loading TF weight {name} with shape {shape}")
        array = tf.train.load_variable(tf_path, name)
        names.append(name)
        arrays.append(array)

    # 打印加载的变量名
    for name, array in zip(names, arrays):
        print(name)

    return model
    """
    Construct the embeddings from word, position and token_type embeddings.
    """

    def __init__(self, config: AlbertConfig):
        super().__init__()
        # 初始化词嵌入层，将词汇表大小、嵌入维度和填充标记索引传入
        self.word_embeddings = nn.Embedding(config.vocab_size, config.embedding_size, padding_idx=config.pad_token_id)
        # 初始化位置嵌入层，将最大位置嵌入数和嵌入维度传入
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.embedding_size)
        # 初始化标记类型嵌入层，将标记类型词汇表大小和嵌入维度传入
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.embedding_size)

        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
        # any TensorFlow checkpoint file
        # 初始化 LayerNorm 层，传入嵌入维度和层归一化 epsilon 值
        self.LayerNorm = nn.LayerNorm(config.embedding_size, eps=config.layer_norm_eps)
        # 初始化 Dropout 层，传入隐藏单元的丢弃概率
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
        # 注册 position_ids 缓冲区，这是一个持久化的张量，表示位置 ID 序列
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )
        # 设置位置嵌入类型，默认为绝对位置编码
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
        # 注册 token_type_ids 缓冲区，表示标记类型 ID 序列，初始化为全零张量
        self.register_buffer(
            "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
        )

    # Copied from transformers.models.bert.modeling_bert.BertEmbeddings.forward
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        past_key_values_length: int = 0,
    # 定义函数，输入参数为input_ids（输入的标识符序列）、inputs_embeds（嵌入的输入序列）、position_ids（位置标识符序列）、
    # token_type_ids（标记类型序列）以及past_key_values_length（过去的键值对长度），返回torch.Tensor类型的张量
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        past_key_values_length: int = 0,
    ) -> torch.Tensor:
        # 如果给定input_ids，则获取其形状
        if input_ids is not None:
            input_shape = input_ids.size()
        else:
            # 否则获取inputs_embeds的形状，但不包括最后一维（通常是批处理维度）
            input_shape = inputs_embeds.size()[:-1]

        # 获取序列长度，假设为输入形状的第二个维度
        seq_length = input_shape[1]

        # 如果未提供position_ids，则从self.position_ids中获取一个子集，其范围从past_key_values_length到seq_length + past_key_values_length
        if position_ids is None:
            position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]

        # 设置token_type_ids为在构造函数中注册的缓冲区，通常是全零。这在模型跟踪时帮助用户，避免传递token_type_ids时的问题
        if token_type_ids is None:
            if hasattr(self, "token_type_ids"):
                buffered_token_type_ids = self.token_type_ids[:, :seq_length]
                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
                token_type_ids = buffered_token_type_ids_expanded
            else:
                # 如果没有已注册的token_type_ids，则创建一个全零的张量，形状与输入形状相同，类型为torch.long，设备为self.position_ids的设备
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)

        # 如果未提供inputs_embeds，则使用self.word_embeddings对input_ids进行嵌入
        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)
        
        # 根据token_type_ids获取token类型的嵌入
        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        # 将输入嵌入和token类型嵌入相加得到最终嵌入
        embeddings = inputs_embeds + token_type_embeddings
        
        # 如果位置嵌入类型为"absolute"，则加上位置嵌入
        if self.position_embedding_type == "absolute":
            position_embeddings = self.position_embeddings(position_ids)
            embeddings += position_embeddings
        
        # 对嵌入进行LayerNorm归一化
        embeddings = self.LayerNorm(embeddings)
        
        # 对嵌入进行dropout处理
        embeddings = self.dropout(embeddings)
        
        # 返回处理后的嵌入张量作为结果
        return embeddings
    # 定义 AlbertAttention 类，继承自 nn.Module
    class AlbertAttention(nn.Module):
        def __init__(self, config: AlbertConfig):
            # 调用父类构造函数
            super().__init__()
            # 检查隐藏层大小是否是注意力头数的整数倍，如果不是且没有嵌入大小的属性，则抛出错误
            if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
                raise ValueError(
                    f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
                    f"heads ({config.num_attention_heads}"
                )

            # 初始化类的属性
            self.num_attention_heads = config.num_attention_heads
            self.hidden_size = config.hidden_size
            self.attention_head_size = config.hidden_size // config.num_attention_heads
            self.all_head_size = self.num_attention_heads * self.attention_head_size

            # 定义查询、键、值的线性层
            self.query = nn.Linear(config.hidden_size, self.all_head_size)
            self.key = nn.Linear(config.hidden_size, self.all_head_size)
            self.value = nn.Linear(config.hidden_size, self.all_head_size)

            # 定义注意力机制的 dropout
            self.attention_dropout = nn.Dropout(config.attention_probs_dropout_prob)
            # 定义输出的 dropout
            self.output_dropout = nn.Dropout(config.hidden_dropout_prob)
            # 定义最终输出的线性层
            self.dense = nn.Linear(config.hidden_size, config.hidden_size)
            # 定义 LayerNorm 层
            self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
            # 初始化被修剪的注意力头集合
            self.pruned_heads = set()

            # 设置位置嵌入的类型，默认为绝对位置嵌入
            self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
            # 如果位置嵌入类型是相对键或相对键查询，则需要额外的距离嵌入层
            if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
                self.max_position_embeddings = config.max_position_embeddings
                self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)

        # 定义转置操作函数，用于将输入张量 x 转置成适合多头注意力的形状
        def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
            new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
            x = x.view(new_x_shape)
            return x.permute(0, 2, 1, 3)

        # 定义修剪注意力头的方法
        def prune_heads(self, heads: List[int]) -> None:
            if len(heads) == 0:
                return
            # 查找可修剪的注意力头并获取其索引
            heads, index = find_pruneable_heads_and_indices(
                heads, self.num_attention_heads, self.attention_head_size, self.pruned_heads
            )

            # 对线性层进行修剪
            self.query = prune_linear_layer(self.query, index)
            self.key = prune_linear_layer(self.key, index)
            self.value = prune_linear_layer(self.value, index)
            self.dense = prune_linear_layer(self.dense, index, dim=1)

            # 更新超参数并记录修剪的头
            self.num_attention_heads = self.num_attention_heads - len(heads)
            self.all_head_size = self.attention_head_size * self.num_attention_heads
            self.pruned_heads = self.pruned_heads.union(heads)

        # 定义前向传播函数
        def forward(
            self,
            hidden_states: torch.Tensor,
            attention_mask: Optional[torch.FloatTensor] = None,
            head_mask: Optional[torch.FloatTensor] = None,
            output_attentions: bool = False,
# 定义一个 Albert 模型的层
class AlbertLayer(nn.Module):
    def __init__(self, config: AlbertConfig):
        super().__init__()

        # 初始化层的配置信息
        self.config = config
        # 设置前馈网络分块大小
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        # 序列长度的维度
        self.seq_len_dim = 1
        # 全连接层的 LayerNorm
        self.full_layer_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        # 注意力机制模块
        self.attention = AlbertAttention(config)
        # 前馈网络的线性层
        self.ffn = nn.Linear(config.hidden_size, config.intermediate_size)
        # 前馈网络输出层
        self.ffn_output = nn.Linear(config.intermediate_size, config.hidden_size)
        # 激活函数
        self.activation = ACT2FN[config.hidden_act]
        # Dropout 层
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    # 前向传播函数定义
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        # 使用注意力机制模块处理隐藏状态
        attention_output = self.attention(hidden_states, attention_mask, head_mask, output_attentions)

        # 应用分块机制到前馈网络
        ffn_output = apply_chunking_to_forward(
            self.ff_chunk,
            self.chunk_size_feed_forward,
            self.seq_len_dim,
            attention_output[0],
        )
        # 对前馈网络的输出进行 LayerNorm 处理
        hidden_states = self.full_layer_layer_norm(ffn_output + attention_output[0])

        # 返回处理后的隐藏状态和注意力机制输出
        return (hidden_states,) + attention_output[1:]  # 如果有输出注意力机制，则返回它们

    # 前馈网络分块处理函数定义
    def ff_chunk(self, attention_output: torch.Tensor) -> torch.Tensor:
        # 前馈网络的线性层
        ffn_output = self.ffn(attention_output)
        # 应用激活函数
        ffn_output = self.activation(ffn_output)
        # 前馈网络输出层
        ffn_output = self.ffn_output(ffn_output)
        # 返回前馈网络的输出
        return ffn_output


# 定义 Albert 模型的层组
class AlbertLayerGroup(nn.Module):
    def __init__(self, config: AlbertConfig):
        super().__init__()

        # 创建多个 AlbertLayer 组成的层列表
        self.albert_layers = nn.ModuleList([AlbertLayer(config) for _ in range(config.inner_group_num)])

    # 层组的前向传播函数定义
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        # 对每一层的 AlbertLayer 进行前向传播
        for layer_module in self.albert_layers:
            # 使用当前层的前向传播函数处理隐藏状态
            hidden_states = layer_module(
                hidden_states,
                attention_mask,
                head_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
            )[0]

        # 返回处理后的隐藏状态和可能的输出
        return hidden_states,  # 注意返回值是一个元组
    # 定义函数签名，指定返回类型为元组，元组中包含 torch.Tensor 或元组的联合类型
    -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]], ...]:
        # 初始化空元组用于存储每个层的隐藏状态
        layer_hidden_states = ()
        # 初始化空元组用于存储每个层的注意力权重
        layer_attentions = ()

        # 遍历 self.albert_layers 中的每一层
        for layer_index, albert_layer in enumerate(self.albert_layers):
            # 调用当前层的前向传播方法，计算当前层的输出
            layer_output = albert_layer(hidden_states, attention_mask, head_mask[layer_index], output_attentions)
            # 更新 hidden_states 为当前层的输出
            hidden_states = layer_output[0]

            # 如果需要输出注意力权重
            if output_attentions:
                # 将当前层的注意力权重添加到 layer_attentions 中
                layer_attentions = layer_attentions + (layer_output[1],)

            # 如果需要输出每层的隐藏状态
            if output_hidden_states:
                # 将当前层的隐藏状态添加到 layer_hidden_states 中
                layer_hidden_states = layer_hidden_states + (hidden_states,)

        # 构建最终的输出元组，包含最后一层的隐藏状态
        outputs = (hidden_states,)
        # 如果需要输出每层的隐藏状态，则将其添加到 outputs 中
        if output_hidden_states:
            outputs = outputs + (layer_hidden_states,)
        # 如果需要输出每层的注意力权重，则将其添加到 outputs 中
        if output_attentions:
            outputs = outputs + (layer_attentions,)
        # 返回最终的 outputs，包括最后一层的隐藏状态、每层的隐藏状态和每层的注意力权重（根据需要）
        return outputs  # 最后一层的隐藏状态，(每层隐藏状态)，(每层注意力权重)
class AlbertTransformer(nn.Module):
    # AlbertTransformer 类定义，继承自 nn.Module
    def __init__(self, config: AlbertConfig):
        # 初始化方法，接收 AlbertConfig 类型的 config 参数
        super().__init__()

        # 将传入的配置信息保存到对象的 config 属性中
        self.config = config
        # 创建一个线性层，将输入的 embedding_size 维度映射到 hidden_size 维度
        self.embedding_hidden_mapping_in = nn.Linear(config.embedding_size, config.hidden_size)
        # 创建包含多个 AlbertLayerGroup 对象的 ModuleList，数量为 config.num_hidden_groups
        self.albert_layer_groups = nn.ModuleList([AlbertLayerGroup(config) for _ in range(config.num_hidden_groups)])

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ) -> Union[BaseModelOutput, Tuple]:
        # 将输入的 hidden_states 通过线性映射层 embedding_hidden_mapping_in 进行维度映射
        hidden_states = self.embedding_hidden_mapping_in(hidden_states)

        # 初始化用于存储所有隐藏状态和注意力的空元组
        all_hidden_states = (hidden_states,) if output_hidden_states else None
        all_attentions = () if output_attentions else None

        # 如果未提供头部遮罩 head_mask，则创建一个与层数相同长度的 None 列表
        head_mask = [None] * self.config.num_hidden_layers if head_mask is None else head_mask

        # 遍历每个隐藏层
        for i in range(self.config.num_hidden_layers):
            # 计算每个隐藏组中的层数
            layers_per_group = int(self.config.num_hidden_layers / self.config.num_hidden_groups)
            # 确定当前隐藏层所在的隐藏组索引
            group_idx = int(i / (self.config.num_hidden_layers / self.config.num_hidden_groups))

            # 调用对应隐藏组的 AlbertLayerGroup 进行前向传播
            layer_group_output = self.albert_layer_groups[group_idx](
                hidden_states,
                attention_mask,
                head_mask[group_idx * layers_per_group : (group_idx + 1) * layers_per_group],
                output_attentions,
                output_hidden_states,
            )
            # 更新 hidden_states 为当前隐藏组的输出的第一个元素（通常是隐藏状态）
            hidden_states = layer_group_output[0]

            # 如果需要输出注意力，将当前隐藏组的注意力信息添加到 all_attentions 中
            if output_attentions:
                all_attentions = all_attentions + layer_group_output[-1]

            # 如果需要输出隐藏状态，将当前隐藏组的隐藏状态添加到 all_hidden_states 中
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

        # 如果不需要以字典形式返回结果，则按顺序返回 hidden_states, all_hidden_states, all_attentions 中非 None 的元素
        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None)
        # 否则，以 BaseModelOutput 形式返回结果，包括最后的隐藏状态、所有隐藏状态和所有注意力信息
        return BaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions
        )


class AlbertPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """
    # AlbertPreTrainedModel 类，继承自 PreTrainedModel，是用于处理权重初始化和预训练模型下载加载的抽象类

    # 配置类的引用
    config_class = AlbertConfig
    # 加载 TensorFlow 权重的方法引用
    load_tf_weights = load_tf_weights_in_albert
    # 基础模型前缀
    base_model_prefix = "albert"
    def _init_weights(self, module):
        """Initialize the weights."""
        # 如果 module 是 nn.Linear 类型
        if isinstance(module, nn.Linear):
            # 使用正态分布初始化权重，均值为0.0，标准差为模型配置中的初始化范围
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            # 如果存在偏置项，则将其初始化为零
            if module.bias is not None:
                module.bias.data.zero_()
        # 如果 module 是 nn.Embedding 类型
        elif isinstance(module, nn.Embedding):
            # 使用正态分布初始化权重，均值为0.0，标准差为模型配置中的初始化范围
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            # 如果指定了填充索引，将对应位置的权重初始化为零
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        # 如果 module 是 nn.LayerNorm 类型
        elif isinstance(module, nn.LayerNorm):
            # 将偏置项初始化为零
            module.bias.data.zero_()
            # 将权重初始化为全1
            module.weight.data.fill_(1.0)
@dataclass
class AlbertForPreTrainingOutput(ModelOutput):
    """
    Output type of [`AlbertForPreTraining`].

    Args:
        loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
            Total loss as the sum of the masked language modeling loss and the next sequence prediction
            (classification) loss.
        prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        sop_logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
            Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
            before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    loss: Optional[torch.FloatTensor] = None  # 损失值，当提供`labels`时返回，为浮点数张量，形状为`(1,)`
    prediction_logits: torch.FloatTensor = None  # 语言建模头部的预测分数，softmax之前每个词汇标记的分数，形状为`(batch_size, sequence_length, config.vocab_size)`
    sop_logits: torch.FloatTensor = None  # 下一个序列预测（分类）头部的预测分数，softmax之前True/False延续的分数，形状为`(batch_size, 2)`
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None  # 模型每层输出的隐藏状态，包括初始嵌入输出，形状为`(batch_size, sequence_length, hidden_size)`的浮点数张量元组，当`output_hidden_states=True`时返回
    attentions: Optional[Tuple[torch.FloatTensor]] = None  # 注意力权重，用于计算自注意力头部的加权平均值，形状为`(batch_size, num_heads, sequence_length, sequence_length)`的浮点数张量元组，当`output_attentions=True`时返回


ALBERT_START_DOCSTRING = r"""

    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Args:
        config ([`AlbertConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

ALBERT_INPUTS_DOCSTRING = r"""
"""
        Args:
            input_ids (`torch.LongTensor` of shape `({0})`):
                # 输入序列标记在词汇表中的索引

                Indices of input sequence tokens in the vocabulary.

                Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and
                [`PreTrainedTokenizer.encode`] for details.

                [What are input IDs?](../glossary#input-ids)
            attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
                # 注意力掩码，避免在填充标记上执行注意力计算

                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
                # 分段标记索引，指示输入的第一部分和第二部分

                Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
                1]`:

                - 0 corresponds to a *sentence A* token,
                - 1 corresponds to a *sentence B* token.

                [What are token type IDs?](../glossary#token-type-ids)
            position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
                # 每个输入序列标记在位置嵌入中的位置索引

                Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
                config.max_position_embeddings - 1]`.

                [What are position IDs?](../glossary#position-ids)
            head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
                # 自注意力模块中选择性屏蔽的头部掩码

                Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
                # 可选参数，直接传入嵌入表示而不是输入标记索引

                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
                is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
                model's internal embedding lookup matrix.
            output_attentions (`bool`, *optional*):
                # 是否返回所有注意力层的注意力张量

                Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
                tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                # 是否返回所有层的隐藏状态

                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
                more detail.
            return_dict (`bool`, *optional*):
                # 是否返回一个 `~utils.ModelOutput` 而不是一个普通元组

                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
AlbertModel 类定义了 ALBERT 模型，用于处理文本数据的转换器模型。

@add_start_docstrings(
    "The bare ALBERT Model transformer outputting raw hidden-states without any specific head on top.",
    ALBERT_START_DOCSTRING,
)
class AlbertModel(AlbertPreTrainedModel):
    config_class = AlbertConfig
    base_model_prefix = "albert"

    def __init__(self, config: AlbertConfig, add_pooling_layer: bool = True):
        super().__init__(config)

        self.config = config
        self.embeddings = AlbertEmbeddings(config)  # 初始化 ALBERT 的嵌入层
        self.encoder = AlbertTransformer(config)  # 初始化 ALBERT 的 transformer 编码器

        if add_pooling_layer:
            self.pooler = nn.Linear(config.hidden_size, config.hidden_size)  # 添加池化层的线性变换
            self.pooler_activation = nn.Tanh()  # 池化层的激活函数为 Tanh
        else:
            self.pooler = None
            self.pooler_activation = None

        # 初始化权重并应用最终处理
        self.post_init()

    def get_input_embeddings(self) -> nn.Embedding:
        return self.embeddings.word_embeddings  # 返回嵌入层的词嵌入

    def set_input_embeddings(self, value: nn.Embedding) -> None:
        self.embeddings.word_embeddings = value  # 设置嵌入层的词嵌入为指定值

    def _prune_heads(self, heads_to_prune: Dict[int, List[int]]) -> None:
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} ALBERT has
        a different architecture in that its layers are shared across groups, which then has inner groups. If an ALBERT
        model has 12 hidden layers and 2 hidden groups, with two inner groups, there is a total of 4 different layers.

        These layers are flattened: the indices [0,1] correspond to the two inner groups of the first hidden layer,
        while [2,3] correspond to the two inner groups of the second hidden layer.

        Any layer with in index other than [0,1,2,3] will result in an error. See base class PreTrainedModel for more
        information about head pruning
        """
        for layer, heads in heads_to_prune.items():
            group_idx = int(layer / self.config.inner_group_num)
            inner_group_idx = int(layer - group_idx * self.config.inner_group_num)
            # 剪枝指定层的注意力头部
            self.encoder.albert_layer_groups[group_idx].albert_layers[inner_group_idx].attention.prune_heads(heads)

    @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=BaseModelOutputWithPooling,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        ) -> Union[BaseModelOutputWithPooling, Tuple]:
        # 如果未显式指定，使用配置中的值来确定是否输出注意力权重
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        # 如果未显式指定，使用配置中的值来确定是否输出隐藏状态
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        # 如果未显式指定，使用配置中的值来确定是否返回字典格式的输出
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # 检查输入参数，确保不同时指定 input_ids 和 inputs_embeds
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            # 如果指定了 input_ids，则检查是否需要警告关于填充和注意力蒙版的使用
            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
            input_shape = input_ids.size()
        elif inputs_embeds is not None:
            # 如果指定了 inputs_embeds，则获取其形状（除最后一个维度）
            input_shape = inputs_embeds.size()[:-1]
        else:
            # 如果既未指定 input_ids 也未指定 inputs_embeds，则抛出错误
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        # 获取批次大小和序列长度
        batch_size, seq_length = input_shape
        # 获取设备信息，用于在 GPU 或 CPU 上执行操作
        device = input_ids.device if input_ids is not None else inputs_embeds.device

        # 如果未提供 attention_mask，则创建一个全为1的张量
        if attention_mask is None:
            attention_mask = torch.ones(input_shape, device=device)
        # 如果未提供 token_type_ids，则根据嵌入层的特性来决定是否需要创建
        if token_type_ids is None:
            if hasattr(self.embeddings, "token_type_ids"):
                buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length]
                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length)
                token_type_ids = buffered_token_type_ids_expanded
            else:
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

        # 扩展注意力蒙版的维度以匹配编码器的期望形状
        extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
        extended_attention_mask = extended_attention_mask.to(dtype=self.dtype)  # 为了fp16的兼容性
        # 对扩展的注意力蒙版进行填充，使未考虑部分的权重为负无穷
        extended_attention_mask = (1.0 - extended_attention_mask) * torch.finfo(self.dtype).min
        # 获取头部遮罩，用于确定哪些层的注意力应被屏蔽
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        # 通过嵌入层获取嵌入输出
        embedding_output = self.embeddings(
            input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds
        )
        # 通过编码器处理嵌入输出，获取编码器的输出
        encoder_outputs = self.encoder(
            embedding_output,
            extended_attention_mask,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # 从编码器的输出中获取序列输出
        sequence_output = encoder_outputs[0]

        # 如果存在池化器，则对序列输出的首个位置进行池化操作并激活
        pooled_output = self.pooler_activation(self.pooler(sequence_output[:, 0])) if self.pooler is not None else None

        # 如果不返回字典格式的输出，则返回元组形式的输出
        if not return_dict:
            return (sequence_output, pooled_output) + encoder_outputs[1:]

        # 返回字典格式的输出，包括序列输出、池化输出、隐藏状态和注意力权重
        return BaseModelOutputWithPooling(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )
"""
Albert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a
`sentence order prediction (classification)` head.
"""
# 导入所需的模块和库
from transformers import AlbertPreTrainedModel, AlbertModel, AlbertConfig
from transformers.modeling_outputs import AlbertForPreTrainingOutput
from transformers.activations import ACT2FN
from transformers.file_utils import add_start_docstrings, add_start_docstrings_to_model_forward, replace_return_docstrings
import torch
import torch.nn as nn

# ALBERT_START_DOCSTRING 定义在引入的模块中，这里假设是常量或全局变量

@add_start_docstrings(
    """
    Albert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a
    `sentence order prediction (classification)` head.
    """,
    ALBERT_START_DOCSTRING,
)
class AlbertForPreTraining(AlbertPreTrainedModel):
    _tied_weights_keys = ["predictions.decoder.bias", "predictions.decoder.weight"]

    def __init__(self, config: AlbertConfig):
        super().__init__(config)

        # 初始化 Albert 模型和两个头部
        self.albert = AlbertModel(config)
        self.predictions = AlbertMLMHead(config)
        self.sop_classifier = AlbertSOPHead(config)

        # 初始化权重并应用最终处理
        self.post_init()

    def get_output_embeddings(self) -> nn.Linear:
        return self.predictions.decoder

    def set_output_embeddings(self, new_embeddings: nn.Linear) -> None:
        self.predictions.decoder = new_embeddings

    def get_input_embeddings(self) -> nn.Embedding:
        return self.albert.embeddings.word_embeddings

    @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @replace_return_docstrings(output_type=AlbertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        sentence_order_label: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
):
        # 省略了 forward 方法的实现，由 AlbertPreTrainedModel 提供

class AlbertMLMHead(nn.Module):
    def __init__(self, config: AlbertConfig):
        super().__init__()

        # 初始化 MLM 头部的各个组件
        self.LayerNorm = nn.LayerNorm(config.embedding_size, eps=config.layer_norm_eps)
        self.bias = nn.Parameter(torch.zeros(config.vocab_size))
        self.dense = nn.Linear(config.hidden_size, config.embedding_size)
        self.decoder = nn.Linear(config.embedding_size, config.vocab_size)
        self.activation = ACT2FN[config.hidden_act]
        self.decoder.bias = self.bias

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # MLM 头部的前向传播
        hidden_states = self.dense(hidden_states)
        hidden_states = self.activation(hidden_states)
        hidden_states = self.LayerNorm(hidden_states)
        hidden_states = self.decoder(hidden_states)

        prediction_scores = hidden_states

        return prediction_scores

    def _tie_weights(self) -> None:
        # 如果这两个权重被分离（在 TPU 上或当偏置被调整大小时），将它们绑定起来
        self.bias = self.decoder.bias


class AlbertSOPHead(nn.Module):
    # 初始化方法，接受一个 AlbertConfig 类型的参数 config
    def __init__(self, config: AlbertConfig):
        # 调用父类（nn.Module）的初始化方法
        super().__init__()

        # 创建一个 dropout 层，使用 config 中的 classifier_dropout_prob 参数作为丢弃概率
        self.dropout = nn.Dropout(config.classifier_dropout_prob)
        
        # 创建一个全连接层（线性变换），输入大小为 config.hidden_size，输出大小为 config.num_labels
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

    # 前向传播方法，输入一个 torch.Tensor 类型的 pooled_output，返回一个 torch.Tensor 类型的 logits
    def forward(self, pooled_output: torch.Tensor) -> torch.Tensor:
        # 对 pooled_output 应用 dropout 操作
        dropout_pooled_output = self.dropout(pooled_output)
        
        # 将 dropout 后的 pooled_output 输入到全连接层中进行线性变换，得到 logits
        logits = self.classifier(dropout_pooled_output)
        
        # 返回 logits 作为输出
        return logits
# 使用装饰器添加文档字符串，描述该类是带有顶层语言建模头的Albert模型。
@add_start_docstrings(
    "Albert Model with a `language modeling` head on top.",
    ALBERT_START_DOCSTRING,
)
# 定义Albert模型的具体实现类，继承自AlbertPreTrainedModel。
class AlbertForMaskedLM(AlbertPreTrainedModel):
    # 定义需要共享权重的关键键列表。
    _tied_weights_keys = ["predictions.decoder.bias", "predictions.decoder.weight"]

    # 初始化方法，接收配置参数config并调用父类的初始化方法。
    def __init__(self, config):
        super().__init__(config)

        # 创建Albert模型实例，不添加池化层。
        self.albert = AlbertModel(config, add_pooling_layer=False)
        # 创建AlbertMLMHead实例，用于预测。
        self.predictions = AlbertMLMHead(config)

        # 初始化权重并进行最终处理。
        self.post_init()

    # 获取输出嵌入层的方法，返回预测层的解码器。
    def get_output_embeddings(self) -> nn.Linear:
        return self.predictions.decoder

    # 设置输出嵌入层的方法，用新的线性层替换预测层的解码器。
    def set_output_embeddings(self, new_embeddings: nn.Linear) -> None:
        self.predictions.decoder = new_embeddings

    # 获取输入嵌入层的方法，返回Albert模型的词嵌入。
    def get_input_embeddings(self) -> nn.Embedding:
        return self.albert.embeddings.word_embeddings

    # 前向传播方法，接收多个输入参数，返回掩码语言模型的输出。
    @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @replace_return_docstrings(output_type=MaskedLMOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,

        # 方法的具体实现在AlbertPreTrainedModel中被覆盖。
        # 根据输入参数执行Albert模型的前向传播，返回预测的掩码语言模型输出。
        pass  # 这里只是声明方法，实现在父类中


这样的注释能够清晰地解释每个方法和类的作用及其关键细节，帮助他人理解代码的功能和实现方式。
        ) -> Union[MaskedLMOutput, Tuple]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        """

        # 根据 return_dict 是否为 None 决定是否使用 self.config.use_return_dict
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # 调用 Albert 模型，传入各种参数，获取模型输出
        outputs = self.albert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        # 从模型输出中取出序列输出
        sequence_outputs = outputs[0]

        # 使用预测器预测序列输出中的预测分数
        prediction_scores = self.predictions(sequence_outputs)

        masked_lm_loss = None
        # 如果 labels 不为 None，则计算 masked language modeling 的损失
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))

        # 如果 return_dict 为 False，则返回完整的输出，包括预测分数和其他输出状态
        if not return_dict:
            output = (prediction_scores,) + outputs[2:]
            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

        # 如果 return_dict 为 True，则返回 MaskedLMOutput 对象，包括损失、预测分数、隐藏状态和注意力权重
        return MaskedLMOutput(
            loss=masked_lm_loss,
            logits=prediction_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
# 定义一个带有顶部序列分类/回归头部的 Albert 模型转换器，例如用于 GLUE 任务（该头部是在汇总输出之上的线性层）。
# 这是一个使用 ALBERT 的特定文档字符串的装饰器。
@add_start_docstrings(
    """
    Albert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
    output) e.g. for GLUE tasks.
    """,
    ALBERT_START_DOCSTRING,  # 引入 ALBERT 模型的通用文档字符串
)
class AlbertForSequenceClassification(AlbertPreTrainedModel):
    
    def __init__(self, config: AlbertConfig):
        super().__init__(config)
        self.num_labels = config.num_labels  # 从配置中获取标签数量
        self.config = config

        self.albert = AlbertModel(config)  # 创建 ALBERT 模型
        self.dropout = nn.Dropout(config.classifier_dropout_prob)  # 应用分类器的 dropout
        self.classifier = nn.Linear(config.hidden_size, self.config.num_labels)  # 创建分类器的线性层

        # 初始化权重并应用最终处理
        self.post_init()

    @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint="textattack/albert-base-v2-imdb",  # 提供了一个代码示例的检查点
        output_type=SequenceClassifierOutput,  # 预期的输出类型
        config_class=_CONFIG_FOR_DOC,  # 用于文档的配置类
        expected_output="'LABEL_1'",  # 预期的输出
        expected_loss=0.12,  # 预期的损失
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        # 上述参数用于模型前向传播，描述了每个参数的类型和作用

        # 以下是参数的描述：
        # - input_ids: 输入的token IDs
        # - attention_mask: 注意力掩码，用于指示哪些元素需要被注意力机制忽略
        # - token_type_ids: 标识不同序列的token类型，如segment A/B用于BERT等模型
        # - position_ids: 标识每个token在序列中的位置ID
        # - head_mask: 用于控制多头注意力机制中每个头部的掩码
        # - inputs_embeds: 可选的嵌入表示输入
        # - labels: 对应于每个输入的标签，用于训练
        # - output_attentions: 是否返回注意力权重
        # - output_hidden_states: 是否返回所有隐藏状态
        # - return_dict: 是否返回字典类型的输出

        # 下面的装饰器为模型的前向方法添加了文档字符串，描述了输入参数的具体形状和含义
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        # 确定是否使用返回字典，如果未指定，则使用配置中的默认设置
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # 使用 ALBERT 模型进行前向传播计算
        outputs = self.albert(
            input_ids=input_ids,                      # 输入的词索引序列
            attention_mask=attention_mask,            # 输入的注意力掩码，指示哪些标记是实际输入，哪些是填充
            token_type_ids=token_type_ids,            # 输入的标记类型 IDs，用于区分不同句子或片段
            position_ids=position_ids,                # 输入的位置 IDs，指示每个位置在输入中的位置
            head_mask=head_mask,                      # 多头注意力机制的掩码，用于控制每个头的权重
            inputs_embeds=inputs_embeds,              # 替代输入的嵌入表示
            output_attentions=output_attentions,      # 是否输出注意力权重
            output_hidden_states=output_hidden_states,  # 是否输出隐藏状态
            return_dict=return_dict,                  # 是否返回字典格式的输出
        )

        # 获取 ALBERT 模型的汇聚输出
        pooled_output = outputs[1]

        # 对汇聚输出应用 dropout
        pooled_output = self.dropout(pooled_output)
        
        # 使用分类器对汇聚输出进行分类得到 logits
        logits = self.classifier(pooled_output)

        # 初始化损失为 None
        loss = None
        # 如果提供了标签
        if labels is not None:
            # 如果问题类型未指定，则根据标签类型和标签数量推断问题类型
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"  # 如果标签数量为 1，则为回归问题
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"  # 如果标签数量大于 1 且标签类型为 long 或 int，则为单标签分类问题
                else:
                    self.config.problem_type = "multi_label_classification"  # 否则为多标签分类问题

            # 根据问题类型选择相应的损失函数
            if self.config.problem_type == "regression":
                loss_fct = MSELoss()  # 均方误差损失函数
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())  # 对于回归问题，计算损失
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()  # 交叉熵损失函数
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))  # 对于单标签分类问题，计算损失
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()  # 二元交叉熵损失函数
                loss = loss_fct(logits, labels)  # 对于多标签分类问题，计算损失

        # 如果不需要返回字典格式的输出
        if not return_dict:
            output = (logits,) + outputs[2:]  # 输出结果包括 logits 和额外的输出信息
            return ((loss,) + output) if loss is not None else output  # 如果有损失，则加入到输出结果中

        # 如果需要返回字典格式的输出，构造 SequenceClassifierOutput 对象
        return SequenceClassifierOutput(
            loss=loss,                               # 损失值
            logits=logits,                           # 模型输出的 logits
            hidden_states=outputs.hidden_states,     # 隐藏状态
            attentions=outputs.attentions,           # 注意力权重
        )
"""
Albert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
Named-Entity-Recognition (NER) tasks.
"""  # 描述 Albert 模型，在隐藏状态输出之上添加了一个用于标记分类（例如命名实体识别）的线性层的头部。

# 使用 ALBERT_START_DOCSTRING 和额外提供的描述来为类添加文档字符串
@add_start_docstrings(
    ALBERT_START_DOCSTRING,
    """
    Albert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
    Named-Entity-Recognition (NER) tasks.
    """
)
class AlbertForTokenClassification(AlbertPreTrainedModel):
    def __init__(self, config: AlbertConfig):
        super().__init__(config)
        self.num_labels = config.num_labels

        # 初始化 ALBERT 模型，不包括池化层
        self.albert = AlbertModel(config, add_pooling_layer=False)

        # 根据配置设置分类器的 dropout 概率
        classifier_dropout_prob = (
            config.classifier_dropout_prob
            if config.classifier_dropout_prob is not None
            else config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(classifier_dropout_prob)

        # 线性层，将 ALBERT 隐藏层的输出映射到标签数量的空间
        self.classifier = nn.Linear(config.hidden_size, self.config.num_labels)

        # 初始化权重并应用最终处理
        self.post_init()

    # 为 forward 方法添加文档字符串，描述输入和输出的格式
    @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    # 添加代码示例的文档字符串，指定了模型使用的检查点、输出类型和配置类
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TokenClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
):
        ) -> Union[TokenClassifierOutput, Tuple]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        """
        # 如果 return_dict 为 None，则根据配置决定是否使用 return_dict
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # 调用 ALBERT 模型进行前向传播，获取输出结果
        outputs = self.albert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # 从 ALBERT 模型输出中获取序列输出
        sequence_output = outputs[0]

        # 对序列输出进行 dropout 处理
        sequence_output = self.dropout(sequence_output)
        
        # 将 dropout 后的输出送入分类器得到 logits
        logits = self.classifier(sequence_output)

        # 初始化损失值为 None
        loss = None
        
        # 如果提供了标签，则计算交叉熵损失
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        # 如果不使用 return_dict，则返回完整的输出元组
        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        # 使用 TokenClassifierOutput 类构建返回结果，包括损失、logits、隐藏状态和注意力权重
        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
"""
Albert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
"""
@add_start_docstrings(
    """
    Albert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
    layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
    """,
    ALBERT_START_DOCSTRING,  # 添加 Albert 模型的文档字符串和 Albert 的开始文档字符串
)
class AlbertForQuestionAnswering(AlbertPreTrainedModel):
    def __init__(self, config: AlbertConfig):
        super().__init__(config)  # 调用父类的初始化方法
        self.num_labels = config.num_labels  # 设置标签数量

        self.albert = AlbertModel(config, add_pooling_layer=False)  # 初始化 Albert 模型，不加汇聚层
        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)  # 使用线性层进行输出

        # Initialize weights and apply final processing
        self.post_init()  # 调用自定义的初始化方法

    @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint="twmkn9/albert-base-v2-squad2",
        output_type=QuestionAnsweringModelOutput,
        config_class=_CONFIG_FOR_DOC,
        qa_target_start_index=12,
        qa_target_end_index=13,
        expected_output="'a nice puppet'",
        expected_loss=7.36,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        start_positions: Optional[torch.LongTensor] = None,
        end_positions: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[AlbertForPreTrainingOutput, Tuple]:
        r"""
        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        """
        # Determine whether to use the return_dict from function arguments or default configuration
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Pass inputs to the Albert model and obtain outputs
        outputs = self.albert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Extract sequence output from the model outputs
        sequence_output = outputs[0]

        # Generate logits by passing sequence output through the QA output layer
        logits: torch.Tensor = self.qa_outputs(sequence_output)

        # Split logits into start and end logits
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        total_loss = None
        if start_positions is not None and end_positions is not None:
            # Ensure start_positions and end_positions are properly shaped for processing
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # Clamp positions within valid range
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            # Define loss function and compute start_loss and end_loss
            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        # If return_dict is False, return output as tuple
        if not return_dict:
            output = (start_logits, end_logits) + outputs[2:]
            return ((total_loss,) + output) if total_loss is not None else output

        # If return_dict is True, return structured output
        return QuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
# 定义一个 Albert 模型，该模型在顶部具有用于多选分类任务的分类头部（一个线性层叠加在汇总输出和 softmax 上），例如用于 RocStories/SWAG 任务。
@add_start_docstrings(
    """
    Albert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
    softmax) e.g. for RocStories/SWAG tasks.
    """,
    ALBERT_START_DOCSTRING,  # 添加起始注释，描述 Albert 模型的概述
)
class AlbertForMultipleChoice(AlbertPreTrainedModel):
    
    def __init__(self, config: AlbertConfig):
        super().__init__(config)

        # 初始化 Albert 模型
        self.albert = AlbertModel(config)
        # 使用配置中的 dropout 概率初始化 dropout 层
        self.dropout = nn.Dropout(config.classifier_dropout_prob)
        # 使用隐藏层大小初始化分类器线性层，输出维度为1（用于多选任务）
        self.classifier = nn.Linear(config.hidden_size, 1)

        # 初始化权重并应用最终处理
        self.post_init()  # 执行后期初始化操作，可能包括权重初始化等

    @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=MultipleChoiceModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        # 定义前向传播函数，接受一系列输入参数，并输出多选模型的预测结果
        ) -> Union[AlbertForPreTrainingOutput, Tuple]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where *num_choices* is the size of the second dimension of the input tensors. (see
            *input_ids* above)
        """
        # 根据是否指定返回字典的选项来确定是否返回字典形式的输出
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        # 获取输入的选项数目
        num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]

        # 如果传入了input_ids，则将其视作(batch_size, num_choices, sequence_length)的形状
        input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
        # 如果传入了attention_mask，则将其视作(batch_size, num_choices, sequence_length)的形状
        attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
        # 如果传入了token_type_ids，则将其视作(batch_size, num_choices, sequence_length)的形状
        token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
        # 如果传入了position_ids，则将其视作(batch_size, num_choices, sequence_length)的形状
        position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
        # 如果传入了inputs_embeds，则将其视作(batch_size, num_choices, sequence_length, hidden_size)的形状
        inputs_embeds = (
            inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
            if inputs_embeds is not None
            else None
        )
        # 调用Albert模型进行前向传播，获取输出
        outputs = self.albert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # 从模型输出中获取汇聚的输出表示
        pooled_output = outputs[1]

        # 对汇聚的输出表示应用dropout
        pooled_output = self.dropout(pooled_output)
        # 将处理后的汇聚输出表示输入分类器，得到logits
        logits: torch.Tensor = self.classifier(pooled_output)
        # 将logits变形为(batch_size * num_choices, -1)的形状
        reshaped_logits = logits.view(-1, num_choices)

        loss = None
        # 如果给定了标签，则计算交叉熵损失
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(reshaped_logits, labels)

        # 如果不要求返回字典形式的输出，则按照元组的形式返回结果
        if not return_dict:
            output = (reshaped_logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        # 如果要求返回字典形式的输出，则构建MultipleChoiceModelOutput对象并返回
        return MultipleChoiceModelOutput(
            loss=loss,
            logits=reshaped_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

`.\models\albert\modeling_flax_albert.py`

# coding=utf-8
# 上面是指定代码文件的字符编码格式

# 版权声明，指出代码的版权归属于 Google AI, Google Brain 和 HuggingFace Inc. 团队
# 根据 Apache 许可证 2.0 版本授权，只有符合许可证条件才能使用该文件
# 可以通过指定的链接获取许可证的副本
# 除非适用法律要求或书面同意，否则按“原样”分发此软件
# 没有任何明示或暗示的保证或条件
# 有关更多详细信息，请参阅许可证

# 引入必要的类型声明
from typing import Callable, Optional, Tuple

# 引入 Flax 框架及其组件
import flax
import flax.linen as nn
import jax
import jax.numpy as jnp
import numpy as np

# 引入 Flax 相关模块
from flax.core.frozen_dict import FrozenDict, freeze, unfreeze
from flax.linen.attention import dot_product_attention_weights
from flax.traverse_util import flatten_dict, unflatten_dict
from jax import lax

# 引入模型输出类和实用函数
from ...modeling_flax_outputs import (
    FlaxBaseModelOutput,
    FlaxBaseModelOutputWithPooling,
    FlaxMaskedLMOutput,
    FlaxMultipleChoiceModelOutput,
    FlaxQuestionAnsweringModelOutput,
    FlaxSequenceClassifierOutput,
    FlaxTokenClassifierOutput,
)
from ...modeling_flax_utils import (
    ACT2FN,
    FlaxPreTrainedModel,
    append_call_sample_docstring,
    append_replace_return_docstrings,
    overwrite_call_docstring,
)
# 引入通用实用函数和配置
from ...utils import ModelOutput, add_start_docstrings, add_start_docstrings_to_model_forward, logging
from .configuration_albert import AlbertConfig

# 获取日志记录器
logger = logging.get_logger(__name__)

# 以下两行用于文档化，指定相关内容为文档中的关键检查点和配置信息
_CHECKPOINT_FOR_DOC = "albert/albert-base-v2"
_CONFIG_FOR_DOC = "AlbertConfig"

# 定义 FlaxAlbertForPreTrainingOutput 类，用于表示 FlaxAlbertForPreTraining 模型的输出类型
@flax.struct.dataclass
class FlaxAlbertForPreTrainingOutput(ModelOutput):
    """
    Output type of [`FlaxAlbertForPreTraining`].
    用于 [`FlaxAlbertForPreTraining`] 的输出类型。
    """
    # 定义函数参数及其类型注解，用于描述模型预测的输出结果
    
    Args:
        prediction_logits (`jnp.ndarray` of shape `(batch_size, sequence_length, config.vocab_size)`):
            语言建模头部的预测分数（在 SoftMax 之前的每个词汇标记的分数）。
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        sop_logits (`jnp.ndarray` of shape `(batch_size, 2)`):
            下一个序列预测（分类）头部的预测分数（在 SoftMax 之前的 True/False 连续性的分数）。
            Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax).
        hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            模型隐藏状态的元组（jnp.ndarray），形状为 `(batch_size, sequence_length, hidden_size)`。
            
            每个层的输出和初始嵌入输出的隐藏状态。
            Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size, sequence_length, hidden_size)`.
    
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            模型自注意力层的注意力权重的元组（jnp.ndarray），形状为 `(batch_size, num_heads, sequence_length, sequence_length)`。
            
            在注意力 SoftMax 之后的注意力权重，用于计算自注意力头部中的加权平均值。
            Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
    
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """
    
    prediction_logits: jnp.ndarray = None
    sop_logits: jnp.ndarray = None
    hidden_states: Optional[Tuple[jnp.ndarray]] = None
    attentions: Optional[Tuple[jnp.ndarray]] = None
# ALBERT_START_DOCSTRING 是一个包含模型文档字符串的原始字符串常量
ALBERT_START_DOCSTRING = r"""

    This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading, saving and converting weights from PyTorch models)

    This model is also a
    [flax.linen.Module](https://flax.readthedocs.io/en/latest/api_reference/flax.linen/module.html) subclass. Use it as
    a regular Flax linen Module and refer to the Flax documentation for all matter related to general usage and
    behavior.

    Finally, this model supports inherent JAX features such as:

    - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
    - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
    - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
    - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)

    Parameters:
        config ([`AlbertConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights.
        dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
            The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and
            `jax.numpy.bfloat16` (on TPUs).

            This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If
            specified all the computation will be performed with the given `dtype`.

            **Note that this only specifies the dtype of the computation and does not influence the dtype of model
            parameters.**

            If you wish to change the dtype of the model parameters, see [`~FlaxPreTrainedModel.to_fp16`] and
            [`~FlaxPreTrainedModel.to_bf16`].
"""

# ALBERT_INPUTS_DOCSTRING 是一个包含输入文档字符串的原始字符串常量，目前为空
ALBERT_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`numpy.ndarray` of shape `({0})`):
            # 输入序列中词汇表中的索引列表。

            # 使用 `AutoTokenizer` 可获取这些索引。参见 `PreTrainedTokenizer.encode` 和 `PreTrainedTokenizer.__call__`。

            # 更多信息请查看 [什么是输入 ID？](../glossary#input-ids)
        attention_mask (`numpy.ndarray` of shape `({0})`, *optional*):
            # 遮罩，用于在填充的令牌索引上避免执行注意力操作。遮罩值在 `[0, 1]` 范围内：

            # - 对于 **未遮罩** 的令牌，值为 1，
            # - 对于 **已遮罩** 的令牌，值为 0。

            # 更多信息请查看 [什么是注意力遮罩？](../glossary#attention-mask)
        token_type_ids (`numpy.ndarray` of shape `({0})`, *optional*):
            # 分段令牌索引，用于指示输入的第一部分和第二部分。索引在 `[0, 1]` 范围内：

            # - 0 对应 *句子 A* 的令牌，
            # - 1 对应 *句子 B* 的令牌。

            # 更多信息请查看 [什么是令牌类型 ID？](../glossary#token-type-ids)
        position_ids (`numpy.ndarray` of shape `({0})`, *optional*):
            # 每个输入序列令牌在位置嵌入中的位置索引。选择范围为 `[0, config.max_position_embeddings - 1]`。
        return_dict (`bool`, *optional*):
            # 是否返回一个 [`~utils.ModelOutput`] 而不是普通的元组。
"""
class FlaxAlbertEmbeddings(nn.Module):
    """Construct the embeddings from word, position and token_type embeddings."""

    config: AlbertConfig  # 定义配置对象的类型
    dtype: jnp.dtype = jnp.float32  # 计算过程中使用的数据类型

    def setup(self):
        # 初始化词嵌入层，使用正态分布初始化器
        self.word_embeddings = nn.Embed(
            self.config.vocab_size,
            self.config.embedding_size,
            embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
        )
        # 初始化位置嵌入层，使用正态分布初始化器
        self.position_embeddings = nn.Embed(
            self.config.max_position_embeddings,
            self.config.embedding_size,
            embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
        )
        # 初始化标记类型嵌入层，使用正态分布初始化器
        self.token_type_embeddings = nn.Embed(
            self.config.type_vocab_size,
            self.config.embedding_size,
            embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
        )
        # 初始化层归一化层，使用给定的 epsilon 和数据类型
        self.LayerNorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype)
        # 初始化 dropout 层，使用给定的 dropout 概率
        self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob)

    def __call__(self, input_ids, token_type_ids, position_ids, deterministic: bool = True):
        # 嵌入输入 ID，转换为指定数据类型的张量
        inputs_embeds = self.word_embeddings(input_ids.astype("i4"))
        # 嵌入位置 ID，转换为指定数据类型的张量
        position_embeds = self.position_embeddings(position_ids.astype("i4"))
        # 嵌入标记类型 ID，转换为指定数据类型的张量
        token_type_embeddings = self.token_type_embeddings(token_type_ids.astype("i4"))

        # 将所有嵌入相加
        hidden_states = inputs_embeds + token_type_embeddings + position_embeds

        # 应用层归一化
        hidden_states = self.LayerNorm(hidden_states)
        # 应用 dropout
        hidden_states = self.dropout(hidden_states, deterministic=deterministic)
        return hidden_states


class FlaxAlbertSelfAttention(nn.Module):
    config: AlbertConfig  # 定义配置对象的类型
    dtype: jnp.dtype = jnp.float32  # 计算过程中使用的数据类型
    # 在设置阶段验证隐藏层大小是否可以被注意力头数整除
    def setup(self):
        if self.config.hidden_size % self.config.num_attention_heads != 0:
            raise ValueError(
                "`config.hidden_size`: {self.config.hidden_size} has to be a multiple of `config.num_attention_heads` "
                "                   : {self.config.num_attention_heads}"
            )

        # 创建用于查询的全连接层，输入大小为隐藏层大小，使用指定的数据类型和正态分布初始化
        self.query = nn.Dense(
            self.config.hidden_size,
            dtype=self.dtype,
            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
        )
        # 创建用于键的全连接层，输入大小为隐藏层大小，使用指定的数据类型和正态分布初始化
        self.key = nn.Dense(
            self.config.hidden_size,
            dtype=self.dtype,
            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
        )
        # 创建用于值的全连接层，输入大小为隐藏层大小，使用指定的数据类型和正态分布初始化
        self.value = nn.Dense(
            self.config.hidden_size,
            dtype=self.dtype,
            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
        )
        # 创建用于最终输出的全连接层，输入大小为隐藏层大小，使用指定的正态分布初始化
        self.dense = nn.Dense(
            self.config.hidden_size,
            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
            dtype=self.dtype,
        )
        # 创建层归一化对象，使用指定的 epsilon 值和数据类型
        self.LayerNorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype)
        # 创建用于dropout的对象，指定丢弃率
        self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob)
    # 定义一个调用方法，接受隐藏状态、注意力掩码等参数，返回注意力层的输出
    def __call__(self, hidden_states, attention_mask, deterministic=True, output_attentions: bool = False):
        # 计算每个注意力头的维度
        head_dim = self.config.hidden_size // self.config.num_attention_heads

        # 使用 query 网络处理隐藏状态，然后重塑为 (batch_size, seq_length, num_attention_heads, head_dim)
        query_states = self.query(hidden_states).reshape(
            hidden_states.shape[:2] + (self.config.num_attention_heads, head_dim)
        )
        # 使用 value 网络处理隐藏状态，然后重塑为 (batch_size, seq_length, num_attention_heads, head_dim)
        value_states = self.value(hidden_states).reshape(
            hidden_states.shape[:2] + (self.config.num_attention_heads, head_dim)
        )
        # 使用 key 网络处理隐藏状态，然后重塑为 (batch_size, seq_length, num_attention_heads, head_dim)
        key_states = self.key(hidden_states).reshape(
            hidden_states.shape[:2] + (self.config.num_attention_heads, head_dim)
        )

        # 将布尔类型的注意力掩码转换为注意力偏置
        if attention_mask is not None:
            # 将注意力掩码扩展维度以匹配 query 和 key 张量的维度
            attention_mask = jnp.expand_dims(attention_mask, axis=(-3, -2))
            # 根据注意力掩码的值生成注意力偏置，使用 lax.select 来根据条件选择不同的值
            attention_bias = lax.select(
                attention_mask > 0,
                jnp.full(attention_mask.shape, 0.0).astype(self.dtype),
                jnp.full(attention_mask.shape, jnp.finfo(self.dtype).min).astype(self.dtype),
            )
        else:
            attention_bias = None

        # 初始化 dropout RNG
        dropout_rng = None
        # 如果不是确定性计算并且设置了注意力概率的 dropout，则生成一个 dropout RNG
        if not deterministic and self.config.attention_probs_dropout_prob > 0.0:
            dropout_rng = self.make_rng("dropout")

        # 计算注意力权重，使用 dot_product_attention_weights 函数
        attn_weights = dot_product_attention_weights(
            query_states,
            key_states,
            bias=attention_bias,
            dropout_rng=dropout_rng,
            dropout_rate=self.config.attention_probs_dropout_prob,
            broadcast_dropout=True,
            deterministic=deterministic,
            dtype=self.dtype,
            precision=None,
        )

        # 使用 einsum 函数计算注意力输出，将注意力权重应用到 value 状态上
        attn_output = jnp.einsum("...hqk,...khd->...qhd", attn_weights, value_states)
        # 重塑注意力输出的形状为 (batch_size, seq_length, hidden_size)
        attn_output = attn_output.reshape(attn_output.shape[:2] + (-1,))

        # 将注意力输出投影到相同维度空间
        projected_attn_output = self.dense(attn_output)
        # 如果使用 dropout，则对投影后的注意力输出应用 dropout
        projected_attn_output = self.dropout(projected_attn_output, deterministic=deterministic)
        # 使用 LayerNorm 对投影后的注意力输出进行规范化，并与原始隐藏状态相加
        layernormed_attn_output = self.LayerNorm(projected_attn_output + hidden_states)
        # 根据需求决定是否输出注意力权重
        outputs = (layernormed_attn_output, attn_weights) if output_attentions else (layernormed_attn_output,)
        # 返回最终的输出
        return outputs
# 定义 FlaxAlbertLayer 类，继承自 nn.Module
class FlaxAlbertLayer(nn.Module):
    # 保存 AlbertConfig 类型的配置信息
    config: AlbertConfig
    # 指定计算中使用的数据类型为 jnp.float32
    dtype: jnp.dtype = jnp.float32  # the dtype of the computation

    # 初始化方法，设置层的组件
    def setup(self):
        # 创建 self.attention 属性，使用 FlaxAlbertSelfAttention 类处理注意力机制
        self.attention = FlaxAlbertSelfAttention(self.config, dtype=self.dtype)
        # 创建 self.ffn 属性，使用 nn.Dense 层作为前馈神经网络
        self.ffn = nn.Dense(
            self.config.intermediate_size,
            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
            dtype=self.dtype,
        )
        # 根据配置选择激活函数
        self.activation = ACT2FN[self.config.hidden_act]
        # 创建 self.ffn_output 属性，使用 nn.Dense 层作为前馈神经网络输出层
        self.ffn_output = nn.Dense(
            self.config.hidden_size,
            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
            dtype=self.dtype,
        )
        # 创建 self.full_layer_layer_norm 属性，使用 nn.LayerNorm 层进行层归一化
        self.full_layer_layer_norm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype)
        # 创建 self.dropout 属性，使用 nn.Dropout 层进行随机失活
        self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob)

    # 调用方法，定义前向传播逻辑
    def __call__(
        self,
        hidden_states,
        attention_mask,
        deterministic: bool = True,
        output_attentions: bool = False,
    ):
        # 使用 self.attention 处理注意力输出
        attention_outputs = self.attention(
            hidden_states, attention_mask, deterministic=deterministic, output_attentions=output_attentions
        )
        # 获取注意力输出的第一个元素作为 attention_output
        attention_output = attention_outputs[0]
        # 使用 self.ffn 处理 attention_output 得到前馈神经网络的输出
        ffn_output = self.ffn(attention_output)
        # 使用 self.activation 应用激活函数
        ffn_output = self.activation(ffn_output)
        # 使用 self.ffn_output 处理前馈神经网络输出得到最终输出
        ffn_output = self.ffn_output(ffn_output)
        # 使用 self.dropout 进行随机失活处理最终输出
        ffn_output = self.dropout(ffn_output, deterministic=deterministic)
        # 将前馈神经网络的输出与注意力输出相加，然后进行层归一化得到 hidden_states
        hidden_states = self.full_layer_layer_norm(ffn_output + attention_output)

        # 将 hidden_states 存入 outputs 元组中
        outputs = (hidden_states,)

        # 如果需要输出注意力权重，则将 attention_outputs[1] 也加入 outputs 中
        if output_attentions:
            outputs += (attention_outputs[1],)

        # 返回 outputs 元组作为最终的输出结果
        return outputs


# 定义 FlaxAlbertLayerCollection 类，继承自 nn.Module
class FlaxAlbertLayerCollection(nn.Module):
    # 保存 AlbertConfig 类型的配置信息
    config: AlbertConfig
    # 指定计算中使用的数据类型为 jnp.float32
    dtype: jnp.dtype = jnp.float32  # the dtype of the computation

    # 初始化方法，设置层的组件
    def setup(self):
        # 创建 self.layers 属性，包含多个 FlaxAlbertLayer 层组成的列表
        self.layers = [
            FlaxAlbertLayer(self.config, name=str(i), dtype=self.dtype) for i in range(self.config.inner_group_num)
        ]

    # 调用方法，定义前向传播逻辑
    def __call__(
        self,
        hidden_states,
        attention_mask,
        deterministic: bool = True,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        # 注意：此处代码未完整
        ):
            # 初始化空元组，用于存储各层的隐藏状态和注意力分布
            layer_hidden_states = ()
            layer_attentions = ()

            # 遍历模型的每一层
            for layer_index, albert_layer in enumerate(self.layers):
                # 调用当前层的前向传播方法，获取该层的输出
                layer_output = albert_layer(
                    hidden_states,
                    attention_mask,
                    deterministic=deterministic,
                    output_attentions=output_attentions,
                )
                # 更新隐藏状态为当前层的输出的第一个元素
                hidden_states = layer_output[0]

                # 如果需要输出注意力分布，将当前层的注意力分布添加到layer_attentions元组中
                if output_attentions:
                    layer_attentions = layer_attentions + (layer_output[1],)

                # 如果需要输出隐藏状态，将当前层的隐藏状态添加到layer_hidden_states元组中
                if output_hidden_states:
                    layer_hidden_states = layer_hidden_states + (hidden_states,)

            # 构建输出元组，包括最终的隐藏状态
            outputs = (hidden_states,)
            # 如果需要输出每层的隐藏状态，将其添加到输出元组中
            if output_hidden_states:
                outputs = outputs + (layer_hidden_states,)
            # 如果需要输出每层的注意力分布，将其添加到输出元组中
            if output_attentions:
                outputs = outputs + (layer_attentions,)
            # 返回模型的输出，包括最后一层的隐藏状态，可选的每层隐藏状态和每层注意力分布
            return outputs  # 最后一层的隐藏状态，(每层隐藏状态)，(每层注意力)
class FlaxAlbertLayerCollections(nn.Module):
    config: AlbertConfig
    dtype: jnp.dtype = jnp.float32  # 计算所用的数据类型
    layer_index: Optional[str] = None

    def setup(self):
        self.albert_layers = FlaxAlbertLayerCollection(self.config, dtype=self.dtype)
        # 初始化 Albert 层集合

    def __call__(
        self,
        hidden_states,
        attention_mask,
        deterministic: bool = True,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
    ):
        outputs = self.albert_layers(
            hidden_states,
            attention_mask,
            deterministic=deterministic,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )
        return outputs
        # 调用 Albert 层集合并返回输出结果


class FlaxAlbertLayerGroups(nn.Module):
    config: AlbertConfig
    dtype: jnp.dtype = jnp.float32  # 计算所用的数据类型

    def setup(self):
        self.layers = [
            FlaxAlbertLayerCollections(self.config, name=str(i), layer_index=str(i), dtype=self.dtype)
            for i in range(self.config.num_hidden_groups)
        ]
        # 初始化 Albert 层组

    def __call__(
        self,
        hidden_states,
        attention_mask,
        deterministic: bool = True,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ):
        all_attentions = () if output_attentions else None
        all_hidden_states = (hidden_states,) if output_hidden_states else None

        for i in range(self.config.num_hidden_layers):
            # 计算当前层所属的隐藏组索引
            group_idx = int(i / (self.config.num_hidden_layers / self.config.num_hidden_groups))
            layer_group_output = self.layers[group_idx](
                hidden_states,
                attention_mask,
                deterministic=deterministic,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
            )
            hidden_states = layer_group_output[0]

            if output_attentions:
                all_attentions = all_attentions + layer_group_output[-1]

            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None)
        return FlaxBaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions
        )
        # 如果不返回字典，则返回相应的输出元组


class FlaxAlbertEncoder(nn.Module):
    config: AlbertConfig
    dtype: jnp.dtype = jnp.float32  # 计算所用的数据类型

    def setup(self):
        self.embedding_hidden_mapping_in = nn.Dense(
            self.config.hidden_size,
            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
            dtype=self.dtype,
        )
        self.albert_layer_groups = FlaxAlbertLayerGroups(self.config, dtype=self.dtype)
        # 初始化 Albert 编码器
    # 定义一个特殊方法 __call__，使得对象可以像函数一样被调用
    def __call__(
        self,
        hidden_states,
        attention_mask,
        deterministic: bool = True,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ):
        # 将输入的隐藏状态通过 embedding_hidden_mapping_in 方法映射转换
        hidden_states = self.embedding_hidden_mapping_in(hidden_states)
        # 调用 albert_layer_groups 方法处理映射后的隐藏状态和注意力掩码，
        # 可选参数包括 deterministic（是否确定性计算）、output_attentions（是否输出注意力权重）、
        # output_hidden_states（是否输出每层的隐藏状态），返回结果根据 return_dict 决定是否返回字典形式
        return self.albert_layer_groups(
            hidden_states,
            attention_mask,
            deterministic=deterministic,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )
# 定义一个名为 FlaxAlbertOnlyMLMHead 的类，继承自 nn.Module
class FlaxAlbertOnlyMLMHead(nn.Module):
    # 配置属性，指定为 AlbertConfig 类型
    config: AlbertConfig
    # 数据类型，默认为 jnp.float32
    dtype: jnp.dtype = jnp.float32
    # 偏置初始化函数，默认为零初始化
    bias_init: Callable[..., np.ndarray] = jax.nn.initializers.zeros

    # 初始化方法
    def setup(self):
        # 创建一个全连接层，输出维度为 config.embedding_size
        self.dense = nn.Dense(self.config.embedding_size, dtype=self.dtype)
        # 激活函数，根据配置选择 ACT2FN 中的激活函数
        self.activation = ACT2FN[self.config.hidden_act]
        # LayerNorm 层，使用 config.layer_norm_eps 作为 epsilon 参数
        self.LayerNorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype)
        # 输出层，输出维度为 config.vocab_size，不使用偏置
        self.decoder = nn.Dense(self.config.vocab_size, dtype=self.dtype, use_bias=False)
        # 初始化偏置参数，维度为 (config.vocab_size,)
        self.bias = self.param("bias", self.bias_init, (self.config.vocab_size,))

    # 前向传播方法
    def __call__(self, hidden_states, shared_embedding=None):
        # 全连接层
        hidden_states = self.dense(hidden_states)
        # 激活函数
        hidden_states = self.activation(hidden_states)
        # LayerNorm 层
        hidden_states = self.LayerNorm(hidden_states)

        # 如果传入了 shared_embedding 参数，则使用 decoder 层进行解码
        if shared_embedding is not None:
            hidden_states = self.decoder.apply({"params": {"kernel": shared_embedding.T}}, hidden_states)
        else:
            # 否则直接使用 decoder 层
            hidden_states = self.decoder(hidden_states)

        # 加上偏置
        hidden_states += self.bias
        # 返回最终的隐藏状态
        return hidden_states


# 定义一个名为 FlaxAlbertSOPHead 的类，继承自 nn.Module
class FlaxAlbertSOPHead(nn.Module):
    # 配置属性，指定为 AlbertConfig 类型
    config: AlbertConfig
    # 数据类型，默认为 jnp.float32
    dtype: jnp.dtype = jnp.float32

    # 初始化方法
    def setup(self):
        # Dropout 层，使用配置中的 classifier_dropout_prob
        self.dropout = nn.Dropout(self.config.classifier_dropout_prob)
        # 全连接层，输出维度为 2
        self.classifier = nn.Dense(2, dtype=self.dtype)

    # 前向传播方法
    def __call__(self, pooled_output, deterministic=True):
        # 应用 Dropout
        pooled_output = self.dropout(pooled_output, deterministic=deterministic)
        # 分类器层，得到 logits
        logits = self.classifier(pooled_output)
        # 返回 logits
        return logits


# 定义一个名为 FlaxAlbertPreTrainedModel 的类，继承自 FlaxPreTrainedModel
class FlaxAlbertPreTrainedModel(FlaxPreTrainedModel):
    """
    一个处理权重初始化、预训练模型下载和加载的抽象类。
    """

    # 配置类，指定为 AlbertConfig
    config_class = AlbertConfig
    # 基础模型前缀名称为 "albert"
    base_model_prefix = "albert"
    # 模块类，默认为 None，需要在子类中指定具体的模块类
    module_class: nn.Module = None

    # 初始化方法
    def __init__(
        self,
        config: AlbertConfig,
        input_shape: Tuple = (1, 1),
        seed: int = 0,
        dtype: jnp.dtype = jnp.float32,
        _do_init: bool = True,
        **kwargs,
    ):
        # 创建模块实例，传入配置和其他参数
        module = self.module_class(config=config, dtype=dtype, **kwargs)
        # 调用父类的初始化方法
        super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init)
    ````
        # 初始化权重函数，使用随机数种子和输入形状，返回参数字典
        def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple, params: FrozenDict = None) -> FrozenDict:
            # 初始化输入张量，创建一个全零张量，数据类型为整数4位
            input_ids = jnp.zeros(input_shape, dtype="i4")
            # 创建与 input_ids 相同形状的全零张量，作为 token 类型标识
            token_type_ids = jnp.zeros_like(input_ids)
            # 根据 input_ids 的最后一个维度生成位置 ID，使用广播到输入形状
            position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_shape)
            # 创建一个与 input_ids 相同形状的全一张量，作为注意力掩码
            attention_mask = jnp.ones_like(input_ids)
    
            # 分割随机数种子为参数随机种子和 dropout 随机种子
            params_rng, dropout_rng = jax.random.split(rng)
            rngs = {"params": params_rng, "dropout": dropout_rng}
    
            # 初始化模型参数，调用模块的 init 方法，返回随机参数
            random_params = self.module.init(
                rngs, input_ids, attention_mask, token_type_ids, position_ids, return_dict=False
            )["params"]
    
            # 如果传入了已有参数，合并随机参数和已有参数，并补全缺失的参数
            if params is not None:
                random_params = flatten_dict(unfreeze(random_params))
                params = flatten_dict(unfreeze(params))
                for missing_key in self._missing_keys:
                    params[missing_key] = random_params[missing_key]
                self._missing_keys = set()
                # 返回合并后的参数字典，进行冻结
                return freeze(unflatten_dict(params))
            else:
                # 返回随机初始化的参数
                return random_params
    
        # 添加文档字符串，定义模型前向传播方法的文档
        @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
        def __call__(
            self,
            input_ids,
            attention_mask=None,
            token_type_ids=None,
            position_ids=None,
            params: dict = None,
            dropout_rng: jax.random.PRNGKey = None,
            train: bool = False,
            output_attentions: Optional[bool] = None,
            output_hidden_states: Optional[bool] = None,
            return_dict: Optional[bool] = None,
        ):
            # 根据配置，设置输出注意力和隐藏状态的默认值
            output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
            output_hidden_states = (
                output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
            )
            return_dict = return_dict if return_dict is not None else self.config.return_dict
    
            # 如果未传入 token_type_ids，初始化为与 input_ids 相同的全零张量
            if token_type_ids is None:
                token_type_ids = jnp.zeros_like(input_ids)
    
            # 如果未传入 position_ids，初始化为根据 input_ids 的最后一个维度生成的广播张量
            if position_ids is None:
                position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape)
    
            # 如果未传入 attention_mask，初始化为与 input_ids 相同的全一张量
            if attention_mask is None:
                attention_mask = jnp.ones_like(input_ids)
    
            # 初始化随机数种子字典
            rngs = {}
            if dropout_rng is not None:
                rngs["dropout"] = dropout_rng
    
            # 调用模型的 apply 方法，传入参数进行前向计算
            return self.module.apply(
                {"params": params or self.params},
                jnp.array(input_ids, dtype="i4"),
                jnp.array(attention_mask, dtype="i4"),
                jnp.array(token_type_ids, dtype="i4"),
                jnp.array(position_ids, dtype="i4"),
                not train,  # 训练模式为 False，推理模式为 True
                output_attentions,
                output_hidden_states,
                return_dict,
                rngs=rngs,
            )
# 定义一个继承自`nn.Module`的类，用于实现Flax版本的Albert模型
class FlaxAlbertModule(nn.Module):
    # 类型注解，指定`config`为AlbertConfig类型
    config: AlbertConfig
    # 指定`dtype`为jnp.float32，用于计算的数据类型
    dtype: jnp.dtype = jnp.float32  # 计算时的数据类型
    # 是否添加池化层的标志，默认为True
    add_pooling_layer: bool = True

    # 模块初始化函数
    def setup(self):
        # 初始化嵌入层`embeddings`，使用FlaxAlbertEmbeddings类
        self.embeddings = FlaxAlbertEmbeddings(self.config, dtype=self.dtype)
        # 初始化编码器`encoder`，使用FlaxAlbertEncoder类
        self.encoder = FlaxAlbertEncoder(self.config, dtype=self.dtype)
        # 如果设置添加池化层，则初始化`pooler`为全连接层，并指定激活函数为tanh
        if self.add_pooling_layer:
            self.pooler = nn.Dense(
                self.config.hidden_size,
                kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
                dtype=self.dtype,
                name="pooler",
            )
            self.pooler_activation = nn.tanh
        else:
            # 如果不添加池化层，则将`pooler`和`pooler_activation`设置为None
            self.pooler = None
            self.pooler_activation = None

    # 对象调用函数，实现模型的前向传播
    def __call__(
        self,
        input_ids,
        attention_mask,
        token_type_ids: Optional[np.ndarray] = None,
        position_ids: Optional[np.ndarray] = None,
        deterministic: bool = True,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ):
        # 当未传入`token_type_ids`时，初始化为与`input_ids`相同形状的全零数组
        if token_type_ids is None:
            token_type_ids = jnp.zeros_like(input_ids)

        # 当未传入`position_ids`时，初始化为广播形式的序列长度数组
        if position_ids is None:
            position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape)

        # 使用嵌入层`embeddings`计算输入数据的隐状态表示
        hidden_states = self.embeddings(input_ids, token_type_ids, position_ids, deterministic=deterministic)

        # 将隐状态表示输入编码器`encoder`，获取模型输出
        outputs = self.encoder(
            hidden_states,
            attention_mask,
            deterministic=deterministic,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        
        # 获取编码器输出的隐状态表示
        hidden_states = outputs[0]

        # 如果设置添加池化层，则对第一个时间步的隐状态进行池化操作
        if self.add_pooling_layer:
            pooled = self.pooler(hidden_states[:, 0])
            pooled = self.pooler_activation(pooled)
        else:
            # 如果不添加池化层，则将`pooled`设置为None
            pooled = None

        # 如果不返回字典形式的输出，则根据`return_dict`的设置返回相应结果
        if not return_dict:
            if pooled is None:
                # 如果`pooled`为None，则不返回它
                return (hidden_states,) + outputs[1:]
            return (hidden_states, pooled) + outputs[1:]

        # 返回包含池化输出和其他模型输出的字典形式结果
        return FlaxBaseModelOutputWithPooling(
            last_hidden_state=hidden_states,
            pooler_output=pooled,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


# 使用装饰器`add_start_docstrings`为`FlaxAlbertModel`类添加注释文档
@add_start_docstrings(
    "The bare Albert Model transformer outputting raw hidden-states without any specific head on top.",
    ALBERT_START_DOCSTRING,
)
# `FlaxAlbertModel`类继承自`FlaxAlbertPreTrainedModel`，指定使用的模块类为`FlaxAlbertModule`
class FlaxAlbertModel(FlaxAlbertPreTrainedModel):
    module_class = FlaxAlbertModule


# 调用`append_call_sample_docstring`函数，为`FlaxAlbertModel`类添加调用示例注释
append_call_sample_docstring(FlaxAlbertModel, _CHECKPOINT_FOR_DOC, FlaxBaseModelOutputWithPooling, _CONFIG_FOR_DOC)


# 定义一个继承自`nn.Module`的类，用于实现Albert预训练模型
class FlaxAlbertForPreTrainingModule(nn.Module):
    # 类型注解，指定`config`为AlbertConfig类型
    config: AlbertConfig
    # 定义默认的数据类型为 jnp.float32，使用了 jax.numpy 的数据类型
    dtype: jnp.dtype = jnp.float32

    # 初始化模型的方法，创建了 Albert 模型、MLM 头部和 SOP 分类器
    def setup(self):
        # 使用给定的配置和数据类型创建 Albert 模型
        self.albert = FlaxAlbertModule(config=self.config, dtype=self.dtype)
        # 使用给定的配置和数据类型创建只有 MLM 头部的模型
        self.predictions = FlaxAlbertOnlyMLMHead(config=self.config, dtype=self.dtype)
        # 使用给定的配置和数据类型创建 SOP 分类器
        self.sop_classifier = FlaxAlbertSOPHead(config=self.config, dtype=self.dtype)

    # 调用模型时的方法，接收多个输入参数和几个布尔型选项
    def __call__(
        self,
        input_ids,
        attention_mask,
        token_type_ids,
        position_ids,
        deterministic: bool = True,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ):
        # 调用 Albert 模型进行前向传播，获取输出
        outputs = self.albert(
            input_ids,
            attention_mask,
            token_type_ids,
            position_ids,
            deterministic=deterministic,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # 如果配置要求共享词嵌入，则获取共享的词嵌入
        if self.config.tie_word_embeddings:
            shared_embedding = self.albert.variables["params"]["embeddings"]["word_embeddings"]["embedding"]
        else:
            shared_embedding = None

        # 从 Albert 模型的输出中提取隐藏状态和汇聚输出
        hidden_states = outputs[0]
        pooled_output = outputs[1]

        # 使用 MLM 头部对隐藏状态进行预测
        prediction_scores = self.predictions(hidden_states, shared_embedding=shared_embedding)
        # 使用 SOP 分类器对汇聚输出进行预测
        sop_scores = self.sop_classifier(pooled_output, deterministic=deterministic)

        # 如果不要求返回字典形式的结果，则返回元组形式的结果
        if not return_dict:
            return (prediction_scores, sop_scores) + outputs[2:]

        # 返回预训练 Albert 模型的输出结果，包括预测 logits、SOP logits、隐藏状态和注意力权重
        return FlaxAlbertForPreTrainingOutput(
            prediction_logits=prediction_scores,
            sop_logits=sop_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
@add_start_docstrings(
    """
    Albert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a
    `sentence order prediction (classification)` head.
    """,
    ALBERT_START_DOCSTRING,
)
class FlaxAlbertForPreTraining(FlaxAlbertPreTrainedModel):
    module_class = FlaxAlbertForPreTrainingModule



FLAX_ALBERT_FOR_PRETRAINING_DOCSTRING = """
    Returns:

    Example:

    ```
    >>> from transformers import AutoTokenizer, FlaxAlbertForPreTraining

    >>> tokenizer = AutoTokenizer.from_pretrained("albert/albert-base-v2")
    >>> model = FlaxAlbertForPreTraining.from_pretrained("albert/albert-base-v2")

    >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="np")
    >>> outputs = model(**inputs)

    >>> prediction_logits = outputs.prediction_logits
    >>> seq_relationship_logits = outputs.sop_logits
    ```
"""

# Overwrite the docstring of FlaxAlbertForPreTraining to include input docstring and predefined docstring
overwrite_call_docstring(
    FlaxAlbertForPreTraining,
    ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length") + FLAX_ALBERT_FOR_PRETRAINING_DOCSTRING,
)



# Append and replace return docstrings for FlaxAlbertForPreTraining
append_replace_return_docstrings(
    FlaxAlbertForPreTraining, output_type=FlaxAlbertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC
)



class FlaxAlbertForMaskedLMModule(nn.Module):
    config: AlbertConfig
    dtype: jnp.dtype = jnp.float32

    def setup(self):
        # Set up Albert model without pooling layer
        self.albert = FlaxAlbertModule(config=self.config, add_pooling_layer=False, dtype=self.dtype)
        # Set up Masked LM head for predictions
        self.predictions = FlaxAlbertOnlyMLMHead(config=self.config, dtype=self.dtype)

    def __call__(
        self,
        input_ids,
        attention_mask,
        token_type_ids,
        position_ids,
        deterministic: bool = True,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ):
        # Forward pass through Albert model
        outputs = self.albert(
            input_ids,
            attention_mask,
            token_type_ids,
            position_ids,
            deterministic=deterministic,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Extract hidden states from model outputs
        hidden_states = outputs[0]

        # Determine if word embeddings are tied
        if self.config.tie_word_embeddings:
            shared_embedding = self.albert.variables["params"]["embeddings"]["word_embeddings"]["embedding"]
        else:
            shared_embedding = None

        # Compute masked language modeling logits
        logits = self.predictions(hidden_states, shared_embedding=shared_embedding)

        # Return either a tuple or a named tuple depending on return_dict
        if not return_dict:
            return (logits,) + outputs[1:]

        return FlaxMaskedLMOutput(
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )



@add_start_docstrings("""Albert Model with a `language modeling` head on top.""", ALBERT_START_DOCSTRING)
class FlaxAlbertForMaskedLM(FlaxAlbertPreTrainedModel):
    module_class = FlaxAlbertForMaskedLMModule



# Append call sample docstring for FlaxAlbertForMaskedLM
append_call_sample_docstring(
    # Import the specific classes and variables from the module for Flax-based Albert model for Masked Language Modeling
    FlaxAlbertForMaskedLM, _CHECKPOINT_FOR_DOC, FlaxMaskedLMOutput, _CONFIG_FOR_DOC, revision="refs/pr/11"
# 定义一个名为 FlaxAlbertForSequenceClassificationModule 的 PyTorch 模块，用于序列分类任务
class FlaxAlbertForSequenceClassificationModule(nn.Module):
    # 类属性，存储 Albert 模型的配置
    config: AlbertConfig
    # 类属性，默认数据类型为 jnp.float32
    dtype: jnp.dtype = jnp.float32

    # 模块初始化方法
    def setup(self):
        # 根据配置创建一个 FlaxAlbertModule 实例
        self.albert = FlaxAlbertModule(config=self.config, dtype=self.dtype)
        # 根据配置中的 dropout 概率创建一个 dropout 层
        classifier_dropout = (
            self.config.classifier_dropout_prob
            if self.config.classifier_dropout_prob is not None
            else self.config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(rate=classifier_dropout)
        # 创建一个全连接层作为分类器，输出维度为 config.num_labels
        self.classifier = nn.Dense(
            self.config.num_labels,
            dtype=self.dtype,
        )

    # 模块调用方法，用于模型推断
    def __call__(
        self,
        input_ids,
        attention_mask,
        token_type_ids,
        position_ids,
        deterministic: bool = True,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ):
        # 调用 Albert 模型进行前向传播
        outputs = self.albert(
            input_ids,
            attention_mask,
            token_type_ids,
            position_ids,
            deterministic=deterministic,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # 从 Albert 模型的输出中获取池化后的输出
        pooled_output = outputs[1]
        # 应用 dropout 层到池化后的输出上
        pooled_output = self.dropout(pooled_output, deterministic=deterministic)
        # 将处理后的输出传入分类器得到最终的 logits
        logits = self.classifier(pooled_output)

        # 如果不要求返回字典，则返回 logits 和额外的隐藏状态
        if not return_dict:
            return (logits,) + outputs[2:]

        # 如果要求返回字典，则构建一个 FlaxSequenceClassifierOutput 对象并返回
        return FlaxSequenceClassifierOutput(
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


# 使用装饰器为 FlaxAlbertForSequenceClassification 类添加文档字符串
@add_start_docstrings(
    """
    Albert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
    output) e.g. for GLUE tasks.
    """,
    ALBERT_START_DOCSTRING,
)
# 继承自 FlaxAlbertPreTrainedModel 类的子类
class FlaxAlbertForSequenceClassification(FlaxAlbertPreTrainedModel):
    # 指定该模型使用的模块类为 FlaxAlbertForSequenceClassificationModule
    module_class = FlaxAlbertForSequenceClassificationModule


# 为 FlaxAlbertForSequenceClassification 类添加调用示例的文档字符串
append_call_sample_docstring(
    FlaxAlbertForSequenceClassification,
    _CHECKPOINT_FOR_DOC,
    FlaxSequenceClassifierOutput,
    _CONFIG_FOR_DOC,
)


# 定义一个名为 FlaxAlbertForMultipleChoiceModule 的 PyTorch 模块，用于多项选择任务
class FlaxAlbertForMultipleChoiceModule(nn.Module):
    # 类属性，存储 Albert 模型的配置
    config: AlbertConfig
    # 类属性，默认数据类型为 jnp.float32
    dtype: jnp.dtype = jnp.float32

    # 模块初始化方法
    def setup(self):
        # 根据配置创建一个 FlaxAlbertModule 实例
        self.albert = FlaxAlbertModule(config=self.config, dtype=self.dtype)
        # 创建一个 dropout 层，dropout 率为 config.hidden_dropout_prob
        self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob)
        # 创建一个全连接层作为分类器，输出维度为 1
        self.classifier = nn.Dense(1, dtype=self.dtype)

    # 模块调用方法，用于模型推断
    def __call__(
        self,
        input_ids,
        attention_mask,
        token_type_ids,
        position_ids,
        deterministic: bool = True,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
        ):
            # 获取输入张量的第二维大小，即选项的数量
            num_choices = input_ids.shape[1]
            # 如果输入张量不为空，则重塑为二维张量
            input_ids = input_ids.reshape(-1, input_ids.shape[-1]) if input_ids is not None else None
            attention_mask = attention_mask.reshape(-1, attention_mask.shape[-1]) if attention_mask is not None else None
            token_type_ids = token_type_ids.reshape(-1, token_type_ids.shape[-1]) if token_type_ids is not None else None
            position_ids = position_ids.reshape(-1, position_ids.shape[-1]) if position_ids is not None else None

            # 使用 ALBERT 模型进行前向推断
            outputs = self.albert(
                input_ids,
                attention_mask,
                token_type_ids,
                position_ids,
                deterministic=deterministic,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )

            # 获取池化后的输出
            pooled_output = outputs[1]
            # 使用 dropout 进行池化后输出的处理
            pooled_output = self.dropout(pooled_output, deterministic=deterministic)
            # 使用分类器进行分类预测
            logits = self.classifier(pooled_output)

            # 将 logits 重塑为二维张量，以便与选项数量对应
            reshaped_logits = logits.reshape(-1, num_choices)

            # 如果不返回字典形式的结果，则返回 logits 以及额外的输出
            if not return_dict:
                return (reshaped_logits,) + outputs[2:]

            # 返回多选题模型的输出结果，包括 logits、隐藏状态和注意力
            return FlaxMultipleChoiceModelOutput(
                logits=reshaped_logits,
                hidden_states=outputs.hidden_states,
                attentions=outputs.attentions,
            )
@add_start_docstrings(
    """
    Albert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
    softmax) e.g. for RocStories/SWAG tasks.
    """,
    ALBERT_START_DOCSTRING,
)
class FlaxAlbertForMultipleChoice(FlaxAlbertPreTrainedModel):
    module_class = FlaxAlbertForMultipleChoiceModule


overwrite_call_docstring(
    FlaxAlbertForMultipleChoice, ALBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")
)
append_call_sample_docstring(
    FlaxAlbertForMultipleChoice,
    _CHECKPOINT_FOR_DOC,
    FlaxMultipleChoiceModelOutput,
    _CONFIG_FOR_DOC,
)

这部分代码定义了一个基于Albert模型的多选题分类模型，包括一个线性层和softmax输出，适用于多选题任务，例如RocStories/SWAG任务。


class FlaxAlbertForTokenClassificationModule(nn.Module):
    config: AlbertConfig
    dtype: jnp.dtype = jnp.float32

    def setup(self):
        self.albert = FlaxAlbertModule(config=self.config, dtype=self.dtype, add_pooling_layer=False)
        classifier_dropout = (
            self.config.classifier_dropout_prob
            if self.config.classifier_dropout_prob is not None
            else self.config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(rate=classifier_dropout)
        self.classifier = nn.Dense(self.config.num_labels, dtype=self.dtype)

    def __call__(
        self,
        input_ids,
        attention_mask,
        token_type_ids,
        position_ids,
        deterministic: bool = True,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ):
        # Model
        outputs = self.albert(
            input_ids,
            attention_mask,
            token_type_ids,
            position_ids,
            deterministic=deterministic,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = outputs[0]
        hidden_states = self.dropout(hidden_states, deterministic=deterministic)
        logits = self.classifier(hidden_states)

        if not return_dict:
            return (logits,) + outputs[1:]

        return FlaxTokenClassifierOutput(
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

这段代码定义了一个基于Albert模型的Token分类模型，该模型在隐藏状态输出的基础上增加了线性层，适用于词元分类任务（例如命名实体识别）。其中包括了用于构建模型的初始化设置和__call__方法用于执行前向传播。


@add_start_docstrings(
    """
    Albert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
    Named-Entity-Recognition (NER) tasks.
    """,
    ALBERT_START_DOCSTRING,
)
class FlaxAlbertForTokenClassification(FlaxAlbertPreTrainedModel):
    module_class = FlaxAlbertForTokenClassificationModule


append_call_sample_docstring(
    FlaxAlbertForTokenClassification,
    _CHECKPOINT_FOR_DOC,
    FlaxTokenClassifierOutput,
    _CONFIG_FOR_DOC,
)

这部分代码定义了另一个基于Albert模型的Token分类模型，用于词元分类任务（如命名实体识别），包括一个线性层作为隐藏状态输出的顶部层。


class FlaxAlbertForQuestionAnsweringModule(nn.Module):
    config: AlbertConfig
    dtype: jnp.dtype = jnp.float32

这段代码定义了一个用于问答任务的Albert模型模块，但是在这段代码中缺少进一步的实现和注释。
    # 初始化模型设置
    def setup(self):
        # 使用配置和数据类型创建一个 FlaxAlbertModule 实例，不添加池化层
        self.albert = FlaxAlbertModule(config=self.config, dtype=self.dtype, add_pooling_layer=False)
        # 创建一个全连接层 nn.Dense，输出维度为配置中指定的标签数
        self.qa_outputs = nn.Dense(self.config.num_labels, dtype=self.dtype)

    # 模型调用函数，接受多个输入和一些可选参数
    def __call__(
        self,
        input_ids,
        attention_mask,
        token_type_ids,
        position_ids,
        deterministic: bool = True,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ):
        # 调用 self.albert 模型进行前向传播
        outputs = self.albert(
            input_ids,
            attention_mask,
            token_type_ids,
            position_ids,
            deterministic=deterministic,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # 从模型输出中获取隐藏状态
        hidden_states = outputs[0]

        # 使用 self.qa_outputs 对隐藏状态进行线性变换得到预测 logits
        logits = self.qa_outputs(hidden_states)

        # 将 logits 按最后一个维度分割成起始和结束 logits
        start_logits, end_logits = logits.split(self.config.num_labels, axis=-1)

        # 去除多余的维度，将 logits 压缩成一维张量
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)

        # 如果 return_dict 为 False，则返回一个元组，包含 logits 和额外的模型输出
        if not return_dict:
            return (start_logits, end_logits) + outputs[1:]

        # 如果 return_dict 为 True，则封装输出成 FlaxQuestionAnsweringModelOutput 类型
        return FlaxQuestionAnsweringModelOutput(
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
@add_start_docstrings(
    """
    Albert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
    layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
    """,
    ALBERT_START_DOCSTRING,  # 添加起始文档字符串，描述了在Albert模型中加入用于抽取式问答任务的span分类头的结构和功能。
)
class FlaxAlbertForQuestionAnswering(FlaxAlbertPreTrainedModel):
    module_class = FlaxAlbertForQuestionAnsweringModule


append_call_sample_docstring(
    FlaxAlbertForQuestionAnswering,  # 将示例调用添加到文档字符串，展示如何使用FlaxAlbertForQuestionAnswering类的示例。
    _CHECKPOINT_FOR_DOC,
    FlaxQuestionAnsweringModelOutput,  # 调用样本文档字符串附上了模型输出的检查点。
    _CONFIG_FOR_DOC,
)

`.\models\albert\modeling_tf_albert.py`

# 设置编码格式为 UTF-8，确保代码中可以正确处理各种字符
# 版权声明，版权归 OpenAI Team Authors 和 HuggingFace Inc. team 所有
# 版权声明，版权归 NVIDIA CORPORATION 所有，保留所有权利
#
# 根据 Apache 许可证 2.0 版本使用本文件
# 除非符合许可证的相关法律要求或书面同意，否则不得使用本文件
# 您可以在以下网址获取许可证的副本：
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# 除非适用法律要求或书面同意，否则本软件按“原样”分发，不附带任何明示或暗示的担保或条件。
# 有关详细信息，请参阅许可证。

""" TF 2.0 ALBERT model."""

from __future__ import annotations  # 允许在类型注释中使用未定义的类型名称

import math  # 导入数学库，用于执行数学运算
from dataclasses import dataclass  # 导入 dataclass 用于创建不可变对象
from typing import Dict, Optional, Tuple, Union  # 导入类型注释支持的类型

import numpy as np  # 导入 NumPy 库，用于数值计算
import tensorflow as tf  # 导入 TensorFlow 库，用于构建和训练深度学习模型

from ...activations_tf import get_tf_activation  # 导入自定义函数，用于获取 TensorFlow 激活函数
from ...modeling_tf_outputs import (  # 导入模型输出相关类
    TFBaseModelOutput,
    TFBaseModelOutputWithPooling,
    TFMaskedLMOutput,
    TFMultipleChoiceModelOutput,
    TFQuestionAnsweringModelOutput,
    TFSequenceClassifierOutput,
    TFTokenClassifierOutput,
)
from ...modeling_tf_utils import (  # 导入模型工具函数
    TFMaskedLanguageModelingLoss,
    TFModelInputType,
    TFMultipleChoiceLoss,
    TFPreTrainedModel,
    TFQuestionAnsweringLoss,
    TFSequenceClassificationLoss,
    TFTokenClassificationLoss,
    get_initializer,
    keras,  # TensorFlow 的高级 API
    keras_serializable,  # 可序列化的 Keras
    unpack_inputs,  # 解包输入数据的函数
)
from ...tf_utils import (  # 导入 TensorFlow 实用函数
    check_embeddings_within_bounds,  # 检查嵌入向量是否在边界范围内
    shape_list,  # 获取张量的形状列表
    stable_softmax,  # 稳定的 softmax 函数
)
from ...utils import (  # 导入实用工具函数
    ModelOutput,  # 模型输出类
    add_code_sample_docstrings,  # 添加代码示例的文档字符串
    add_start_docstrings,  # 添加函数的起始文档字符串
    add_start_docstrings_to_model_forward,  # 添加模型前向传播的起始文档字符串
    logging,  # 日志记录工具
    replace_return_docstrings,  # 替换返回值的文档字符串
)
from .configuration_albert import AlbertConfig  # 导入 ALBERT 模型配置类

logger = logging.get_logger(__name__)  # 获取当前模块的日志记录器

_CHECKPOINT_FOR_DOC = "albert/albert-base-v2"  # 文档中使用的模型检查点名称
_CONFIG_FOR_DOC = "AlbertConfig"  # 文档中使用的配置名称

TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [  # ALBERT 预训练模型的列表
    "albert/albert-base-v1",
    "albert/albert-large-v1",
    "albert/albert-xlarge-v1",
    "albert/albert-xxlarge-v1",
    "albert/albert-base-v2",
    "albert/albert-large-v2",
    "albert/albert-xlarge-v2",
    "albert/albert-xxlarge-v2",
    # 查看所有 ALBERT 模型：https://huggingface.co/models?filter=albert
]


class TFAlbertPreTrainingLoss:
    """
    适用于 ALBERT 预训练的损失函数，即通过结合 SOP + MLM 的语言模型预训练任务。
    .. 注意:: 在损失计算中，任何标签为 -100 的样本将被忽略（以及对应的 logits）。
    """
    def hf_compute_loss(self, labels: tf.Tensor, logits: tf.Tensor) -> tf.Tensor:
        # 定义损失函数为稀疏分类交叉熵，从 logits 计算，不进行降维
        loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction=keras.losses.Reduction.NONE)
        
        if self.config.tf_legacy_loss:
            # 确保只有标签不等于 -100 的位置会计算损失
            masked_lm_active_loss = tf.not_equal(tf.reshape(tensor=labels["labels"], shape=(-1,)), -100)
            # 使用布尔掩码从 logits 中筛选出有效位置的预测值
            masked_lm_reduced_logits = tf.boolean_mask(
                tensor=tf.reshape(tensor=logits[0], shape=(-1, shape_list(logits[0])[2])),
                mask=masked_lm_active_loss,
            )
            # 使用布尔掩码从标签中筛选出有效位置的真实值
            masked_lm_labels = tf.boolean_mask(
                tensor=tf.reshape(tensor=labels["labels"], shape=(-1,)), mask=masked_lm_active_loss
            )
            # 确保只有标签不等于 -100 的位置会计算损失
            sentence_order_active_loss = tf.not_equal(
                tf.reshape(tensor=labels["sentence_order_label"], shape=(-1,)), -100
            )
            # 使用布尔掩码从 logits 中筛选出有效位置的预测值
            sentence_order_reduced_logits = tf.boolean_mask(
                tensor=tf.reshape(tensor=logits[1], shape=(-1, 2)), mask=sentence_order_active_loss
            )
            # 使用布尔掩码从标签中筛选出有效位置的真实值
            sentence_order_label = tf.boolean_mask(
                tensor=tf.reshape(tensor=labels["sentence_order_label"], shape=(-1,)), mask=sentence_order_active_loss
            )
            # 计算掩码语言模型的损失
            masked_lm_loss = loss_fn(y_true=masked_lm_labels, y_pred=masked_lm_reduced_logits)
            # 计算序列顺序预测的损失
            sentence_order_loss = loss_fn(y_true=sentence_order_label, y_pred=sentence_order_reduced_logits)
            # 将掩码语言模型损失按照序列顺序预测的数量均匀化
            masked_lm_loss = tf.reshape(tensor=masked_lm_loss, shape=(-1, shape_list(sentence_order_loss)[0]))
            masked_lm_loss = tf.reduce_mean(input_tensor=masked_lm_loss, axis=0)

            return masked_lm_loss + sentence_order_loss

        # 将负标签裁剪为零，避免 NaN 和错误，这些位置后续将被掩盖
        unmasked_lm_losses = loss_fn(y_true=tf.nn.relu(labels["labels"]), y_pred=logits[0])
        # 确保只有标签不等于 -100 的位置会计算损失
        lm_loss_mask = tf.cast(labels["labels"] != -100, dtype=unmasked_lm_losses.dtype)
        masked_lm_losses = unmasked_lm_losses * lm_loss_mask
        reduced_masked_lm_loss = tf.reduce_sum(masked_lm_losses) / tf.reduce_sum(lm_loss_mask)

        sop_logits = tf.reshape(logits[1], (-1, 2))
        # 将负标签裁剪为零，避免 NaN 和错误，这些位置后续将被掩盖
        unmasked_sop_loss = loss_fn(y_true=tf.nn.relu(labels["sentence_order_label"]), y_pred=sop_logits)
        sop_loss_mask = tf.cast(labels["sentence_order_label"] != -100, dtype=unmasked_sop_loss.dtype)

        masked_sop_loss = unmasked_sop_loss * sop_loss_mask
        reduced_masked_sop_loss = tf.reduce_sum(masked_sop_loss) / tf.reduce_sum(sop_loss_mask)

        return tf.reshape(reduced_masked_lm_loss + reduced_masked_sop_loss, (1,))
    """Construct the embeddings from word, position and token_type embeddings."""

    def __init__(self, config: AlbertConfig, **kwargs):
        super().__init__(**kwargs)

        # 初始化层的配置和参数
        self.config = config
        self.embedding_size = config.embedding_size  # 嵌入向量的维度大小
        self.max_position_embeddings = config.max_position_embeddings  # 最大位置嵌入数量
        self.initializer_range = config.initializer_range  # 初始化范围
        # 层归一化操作，使用配置中的 epsilon 参数
        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
        # Dropout 操作，使用配置中的 dropout 比率
        self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)

    def build(self, input_shape=None):
        with tf.name_scope("word_embeddings"):
            # 添加词嵌入权重矩阵，形状为 [词汇表大小, 嵌入维度大小]
            self.weight = self.add_weight(
                name="weight",
                shape=[self.config.vocab_size, self.embedding_size],
                initializer=get_initializer(self.initializer_range),
            )

        with tf.name_scope("token_type_embeddings"):
            # 添加 token_type 嵌入权重矩阵，形状为 [token_type 数量, 嵌入维度大小]
            self.token_type_embeddings = self.add_weight(
                name="embeddings",
                shape=[self.config.type_vocab_size, self.embedding_size],
                initializer=get_initializer(self.initializer_range),
            )

        with tf.name_scope("position_embeddings"):
            # 添加位置嵌入权重矩阵，形状为 [最大位置嵌入数量, 嵌入维度大小]
            self.position_embeddings = self.add_weight(
                name="embeddings",
                shape=[self.max_position_embeddings, self.embedding_size],
                initializer=get_initializer(self.initializer_range),
            )

        if self.built:
            return
        self.built = True
        if getattr(self, "LayerNorm", None) is not None:
            with tf.name_scope(self.LayerNorm.name):
                # 构建层归一化的结构，输入形状为 [None, None, 嵌入维度大小]
                self.LayerNorm.build([None, None, self.config.embedding_size])

    # Copied from transformers.models.bert.modeling_tf_bert.TFBertEmbeddings.call
    def call(
        self,
        input_ids: tf.Tensor = None,
        position_ids: tf.Tensor = None,
        token_type_ids: tf.Tensor = None,
        inputs_embeds: tf.Tensor = None,
        past_key_values_length=0,
        training: bool = False,
    ) -> tf.Tensor:
        """
        Applies embedding based on inputs tensor.

        Returns:
            final_embeddings (`tf.Tensor`): output embedding tensor.
        """
        # 检查输入参数，确保至少提供了 `input_ids` 或 `inputs_embeds`
        if input_ids is None and inputs_embeds is None:
            raise ValueError("Need to provide either `input_ids` or `input_embeds`.")

        # 如果提供了 `input_ids`，从权重矩阵中根据索引收集对应的嵌入向量
        if input_ids is not None:
            check_embeddings_within_bounds(input_ids, self.config.vocab_size)
            inputs_embeds = tf.gather(params=self.weight, indices=input_ids)

        # 获取输入嵌入张量的形状
        input_shape = shape_list(inputs_embeds)[:-1]

        # 如果未提供 `token_type_ids`，则创建一个形状与输入嵌入张量相同的张量，并用0填充
        if token_type_ids is None:
            token_type_ids = tf.fill(dims=input_shape, value=0)

        # 如果未提供 `position_ids`，则根据序列长度和历史键值长度生成位置索引张量
        if position_ids is None:
            position_ids = tf.expand_dims(
                tf.range(start=past_key_values_length, limit=input_shape[1] + past_key_values_length), axis=0
            )

        # 根据位置索引从位置嵌入矩阵中收集位置嵌入向量
        position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids)
        # 根据 token_type_ids 从 token_type_embeddings 中收集 token 类型嵌入向量
        token_type_embeds = tf.gather(params=self.token_type_embeddings, indices=token_type_ids)
        # 将输入嵌入向量、位置嵌入向量和 token 类型嵌入向量相加，得到最终的嵌入向量
        final_embeddings = inputs_embeds + position_embeds + token_type_embeds
        # 对最终的嵌入向量进行 LayerNormalization 处理
        final_embeddings = self.LayerNorm(inputs=final_embeddings)
        # 在训练模式下对最终嵌入向量进行 dropout 处理
        final_embeddings = self.dropout(inputs=final_embeddings, training=training)

        # 返回最终的嵌入向量作为输出
        return final_embeddings
    """Contains the complete attention sublayer, including both dropouts and layer norm."""

    def __init__(self, config: AlbertConfig, **kwargs):
        super().__init__(**kwargs)

        if config.hidden_size % config.num_attention_heads != 0:
            raise ValueError(
                f"The hidden size ({config.hidden_size}) is not a multiple of the number "
                f"of attention heads ({config.num_attention_heads})"
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size
        self.sqrt_att_head_size = math.sqrt(self.attention_head_size)
        self.output_attentions = config.output_attentions

        # Initialize Dense layers for query, key, value, and dense transformations
        self.query = keras.layers.Dense(
            units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query"
        )
        self.key = keras.layers.Dense(
            units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key"
        )
        self.value = keras.layers.Dense(
            units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value"
        )
        self.dense = keras.layers.Dense(
            units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
        )

        # Layer normalization for post-attention processing
        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")

        # Dropout layers with specified dropout rates
        self.attention_dropout = keras.layers.Dropout(rate=config.attention_probs_dropout_prob)
        self.output_dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)

        self.config = config

    def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor:
        # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size]
        tensor = tf.reshape(tensor=tensor, shape=(batch_size, -1, self.num_attention_heads, self.attention_head_size))

        # Transpose the tensor from [batch_size, seq_length, num_attention_heads, attention_head_size] to [batch_size, num_attention_heads, seq_length, attention_head_size]
        return tf.transpose(tensor, perm=[0, 2, 1, 3])

    def call(
        self,
        input_tensor: tf.Tensor,
        attention_mask: tf.Tensor,
        head_mask: tf.Tensor,
        output_attentions: bool,
        training: bool = False,
    ) -> Tuple[tf.Tensor]:
        # 获取输入张量的批量大小
        batch_size = shape_list(input_tensor)[0]
        # 调用 self.query 方法，生成混合查询层张量
        mixed_query_layer = self.query(inputs=input_tensor)
        # 调用 self.key 方法，生成混合键层张量
        mixed_key_layer = self.key(inputs=input_tensor)
        # 调用 self.value 方法，生成混合值层张量
        mixed_value_layer = self.value(inputs=input_tensor)
        # 将混合查询层张量转置以适应注意力分数计算的形状
        query_layer = self.transpose_for_scores(mixed_query_layer, batch_size)
        # 将混合键层张量转置以适应注意力分数计算的形状
        key_layer = self.transpose_for_scores(mixed_key_layer, batch_size)
        # 将混合值层张量转置以适应注意力分数计算的形状
        value_layer = self.transpose_for_scores(mixed_value_layer, batch_size)

        # 计算查询和键之间的点积，得到原始的注意力分数
        # 形状为 (batch size, num_heads, seq_len_q, seq_len_k)
        attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
        # 计算缩放因子 dk，并将注意力分数除以 dk
        dk = tf.cast(self.sqrt_att_head_size, dtype=attention_scores.dtype)
        attention_scores = tf.divide(attention_scores, dk)

        if attention_mask is not None:
            # 如果存在注意力掩码，应用注意力掩码
            attention_scores = tf.add(attention_scores, attention_mask)

        # 将注意力分数归一化为概率值
        attention_probs = stable_softmax(logits=attention_scores, axis=-1)

        # 对注意力概率值进行 dropout 处理
        attention_probs = self.attention_dropout(inputs=attention_probs, training=training)

        # 如果存在头部掩码，将注意力概率值与头部掩码相乘
        if head_mask is not None:
            attention_probs = tf.multiply(attention_probs, head_mask)

        # 计算上下文张量，将注意力概率值与值层张量相乘
        context_layer = tf.matmul(attention_probs, value_layer)
        # 对上下文张量进行转置操作，调整其形状
        context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3])

        # 调整上下文张量的形状，以适应下一层网络的输入要求
        # 形状为 (batch_size, seq_len_q, all_head_size)
        context_layer = tf.reshape(tensor=context_layer, shape=(batch_size, -1, self.all_head_size))
        # 将上下文张量作为 self_outputs 的第一个元素
        self_outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
        # 获取 self_outputs 的第一个元素作为隐藏状态张量
        hidden_states = self_outputs[0]
        # 将隐藏状态张量传递给全连接层进行线性变换
        hidden_states = self.dense(inputs=hidden_states)
        # 对线性变换后的隐藏状态进行 dropout 处理
        hidden_states = self.output_dropout(inputs=hidden_states, training=training)
        # 将 dropout 后的隐藏状态与输入张量相加，并应用 LayerNorm
        attention_output = self.LayerNorm(inputs=hidden_states + input_tensor)

        # 如果需要输出注意力分数，则将注意力分数添加到输出中
        outputs = (attention_output,) + self_outputs[1:]

        # 返回最终的输出
        return outputs
    # 如果已经构建过网络结构，则直接返回，不再重复构建
    if self.built:
        return
    # 将标记置为已构建
    self.built = True
    
    # 如果存在查询（query）模块，根据其名称创建作用域，并构建其形状
    if getattr(self, "query", None) is not None:
        with tf.name_scope(self.query.name):
            self.query.build([None, None, self.config.hidden_size])
    
    # 如果存在键（key）模块，根据其名称创建作用域，并构建其形状
    if getattr(self, "key", None) is not None:
        with tf.name_scope(self.key.name):
            self.key.build([None, None, self.config.hidden_size])
    
    # 如果存在值（value）模块，根据其名称创建作用域，并构建其形状
    if getattr(self, "value", None) is not None:
        with tf.name_scope(self.value.name):
            self.value.build([None, None, self.config.hidden_size])
    
    # 如果存在密集层（dense），根据其名称创建作用域，并构建其形状
    if getattr(self, "dense", None) is not None:
        with tf.name_scope(self.dense.name):
            self.dense.build([None, None, self.config.hidden_size])
    
    # 如果存在层归一化（LayerNorm），根据其名称创建作用域，并构建其形状
    if getattr(self, "LayerNorm", None) is not None:
        with tf.name_scope(self.LayerNorm.name):
            self.LayerNorm.build([None, None, self.config.hidden_size])
class TFAlbertLayer(keras.layers.Layer):
    def __init__(self, config: AlbertConfig, **kwargs):
        super().__init__(**kwargs)

        # 初始化注意力层，使用给定的配置
        self.attention = TFAlbertAttention(config, name="attention")
        
        # 初始化前馈神经网络层，使用给定的中间大小和初始化器范围
        self.ffn = keras.layers.Dense(
            units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="ffn"
        )

        # 根据配置获取激活函数，或者使用默认的激活函数
        if isinstance(config.hidden_act, str):
            self.activation = get_tf_activation(config.hidden_act)
        else:
            self.activation = config.hidden_act

        # 初始化前馈神经网络输出层，使用给定的隐藏大小和初始化器范围
        self.ffn_output = keras.layers.Dense(
            units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="ffn_output"
        )
        
        # 初始化全层标准化层，使用给定的 epsilon 参数
        self.full_layer_layer_norm = keras.layers.LayerNormalization(
            epsilon=config.layer_norm_eps, name="full_layer_layer_norm"
        )
        
        # 初始化 dropout 层，使用给定的隐藏 dropout 概率
        self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
        self.config = config

    def call(
        self,
        hidden_states: tf.Tensor,
        attention_mask: tf.Tensor,
        head_mask: tf.Tensor,
        output_attentions: bool,
        training: bool = False,
    ) -> Tuple[tf.Tensor]:
        # 调用注意力层，获取注意力输出
        attention_outputs = self.attention(
            input_tensor=hidden_states,
            attention_mask=attention_mask,
            head_mask=head_mask,
            output_attentions=output_attentions,
            training=training,
        )
        
        # 前馈神经网络计算过程
        ffn_output = self.ffn(inputs=attention_outputs[0])  # 使用注意力输出作为输入
        ffn_output = self.activation(ffn_output)  # 应用激活函数
        ffn_output = self.ffn_output(inputs=ffn_output)  # 再次使用前馈神经网络输出层
        
        # 应用 dropout 操作
        ffn_output = self.dropout(inputs=ffn_output, training=training)
        
        # 添加全层标准化层，结合注意力输出和前馈神经网络输出
        hidden_states = self.full_layer_layer_norm(inputs=ffn_output + attention_outputs[0])

        # 如果需要输出注意力，则将注意力输出包含在结果中
        outputs = (hidden_states,) + attention_outputs[1:]

        return outputs

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        
        # 构建注意力层
        if getattr(self, "attention", None) is not None:
            with tf.name_scope(self.attention.name):
                self.attention.build(None)
        
        # 构建前馈神经网络层
        if getattr(self, "ffn", None) is not None:
            with tf.name_scope(self.ffn.name):
                self.ffn.build([None, None, self.config.hidden_size])
        
        # 构建前馈神经网络输出层
        if getattr(self, "ffn_output", None) is not None:
            with tf.name_scope(self.ffn_output.name):
                self.ffn_output.build([None, None, self.config.intermediate_size])
        
        # 构建全层标准化层
        if getattr(self, "full_layer_layer_norm", None) is not None:
            with tf.name_scope(self.full_layer_layer_norm.name):
                self.full_layer_layer_norm.build([None, None, self.config.hidden_size])
    # 使用传入的 AlbertConfig 对象初始化模型，调用父类的初始化方法
    def __init__(self, config: AlbertConfig, **kwargs):
        super().__init__(**kwargs)

        # 创建多个 AlbertLayer 层组成的列表，每个层有一个唯一的名称
        self.albert_layers = [
            TFAlbertLayer(config, name=f"albert_layers_._{i}") for i in range(config.inner_group_num)
        ]

    # 模型的调用方法，接收隐藏状态、注意力掩码、头部掩码等输入，输出模型的隐藏状态、层的隐藏状态和注意力分数（如果有的话）
    def call(
        self,
        hidden_states: tf.Tensor,
        attention_mask: tf.Tensor,
        head_mask: tf.Tensor,
        output_attentions: bool,
        output_hidden_states: bool,
        training: bool = False,
    ) -> Union[TFBaseModelOutput, Tuple[tf.Tensor]]:
        # 如果需要输出隐藏状态，则创建一个空元组用于存储每个层的隐藏状态
        layer_hidden_states = () if output_hidden_states else None
        # 如果需要输出注意力分数，则创建一个空元组用于存储每个层的注意力分数
        layer_attentions = () if output_attentions else None

        # 遍历所有 AlbertLayer 层
        for layer_index, albert_layer in enumerate(self.albert_layers):
            # 如果需要输出隐藏状态，则将当前层的隐藏状态添加到存储中
            if output_hidden_states:
                layer_hidden_states = layer_hidden_states + (hidden_states,)

            # 调用当前 AlbertLayer 层的处理方法，更新隐藏状态
            layer_output = albert_layer(
                hidden_states=hidden_states,
                attention_mask=attention_mask,
                head_mask=head_mask[layer_index],
                output_attentions=output_attentions,
                training=training,
            )
            # 更新主要的隐藏状态为当前层的输出隐藏状态
            hidden_states = layer_output[0]

            # 如果需要输出注意力分数，则将当前层的注意力分数添加到存储中
            if output_attentions:
                layer_attentions = layer_attentions + (layer_output[1],)

        # 添加最后一层的隐藏状态到存储中（如果需要输出隐藏状态）
        if output_hidden_states:
            layer_hidden_states = layer_hidden_states + (hidden_states,)

        # 返回隐藏状态、层的隐藏状态和注意力分数的元组，去除其中为 None 的部分
        return tuple(v for v in [hidden_states, layer_hidden_states, layer_attentions] if v is not None)

    # 构建模型，在第一次调用前进行模型的构建
    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        # 如果已经创建了 AlbertLayer 层，则依次构建每一层
        if getattr(self, "albert_layers", None) is not None:
            for layer in self.albert_layers:
                # 使用每个层的名称创建一个命名空间，并调用层的构建方法
                with tf.name_scope(layer.name):
                    layer.build(None)
# 定义一个名为TFAlbertTransformer的类，继承自keras的Layer类
class TFAlbertTransformer(keras.layers.Layer):
    # 初始化方法，接受config和其他参数
    def __init__(self, config: AlbertConfig, **kwargs):
        # 调用父类的初始化方法
        super().__init__(**kwargs)

        # 从config中获取隐藏层的数量和组数
        self.num_hidden_layers = config.num_hidden_layers
        self.num_hidden_groups = config.num_hidden_groups
        # 计算每个隐藏组中的层的数量
        self.layers_per_group = int(config.num_hidden_layers / config.num_hidden_groups)
        # 创建一个Dense层来映射嵌入的隐藏状态
        self.embedding_hidden_mapping_in = keras.layers.Dense(
            units=config.hidden_size,
            kernel_initializer=get_initializer(config.initializer_range),
            name="embedding_hidden_mapping_in",
        )
        # 创建多个ALBERT层组，数量等于隐藏组的数量
        self.albert_layer_groups = [
            TFAlbertLayerGroup(config, name=f"albert_layer_groups_._{i}") for i in range(config.num_hidden_groups)
        ]
        # 保存config
        self.config = config

    # 定义处理输入数据的方法
    def call(
        self,
        hidden_states: tf.Tensor,
        attention_mask: tf.Tensor,
        head_mask: tf.Tensor,
        output_attentions: bool,
        output_hidden_states: bool,
        return_dict: bool,
        training: bool = False,
    ) -> Union[TFBaseModelOutput, Tuple[tf.Tensor]]:
        # 使用Dense层处理输入的隐藏状态
        hidden_states = self.embedding_hidden_mapping_in(inputs=hidden_states)
        # 初始化存储注意力权重和隐藏状态的变量
        all_attentions = () if output_attentions else None
        all_hidden_states = (hidden_states,) if output_hidden_states else None

        # 循环遍历每个隐藏层
        for i in range(self.num_hidden_layers):
            # 计算当前层所在的隐藏组的索引
            group_idx = int(i / (self.num_hidden_layers / self.num_hidden_groups))
            # 调用对应的ALBERT层组，处理隐藏状态
            layer_group_output = self.albert_layer_groups[group_idx](
                hidden_states=hidden_states,
                attention_mask=attention_mask,
                head_mask=head_mask[group_idx * self.layers_per_group : (group_idx + 1) * self.layers_per_group],
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                training=training,
            )
            # 更新隐藏状态
            hidden_states = layer_group_output[0]

            # 如果需要输出注意力权重，则将当前层的注意力权重添加到存储变量中
            if output_attentions:
                all_attentions = all_attentions + layer_group_output[-1]

            # 如果需要输出隐藏状态，则将当前层的隐藏状态添加到存储变量中
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

        # 如果不需要返回字典形式的结果，则将所有结果组合成一个元组返回
        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None)

        # 如果需要返回字典形式的结果，则创建一个TFBaseModelOutput对象并返回
        return TFBaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions
        )
    # 定义 build 方法，用于构建模型的结构
    def build(self, input_shape=None):
        # 如果模型已经构建过，则直接返回，避免重复构建
        if self.built:
            return
        # 将模型标记为已构建状态
        self.built = True
        
        # 如果存在 embedding_hidden_mapping_in 属性
        if getattr(self, "embedding_hidden_mapping_in", None) is not None:
            # 使用 tf.name_scope 来限定命名空间，命名为 embedding_hidden_mapping_in 的名称
            with tf.name_scope(self.embedding_hidden_mapping_in.name):
                # 使用 embedding_hidden_mapping_in 属性构建层，输入形状为 [None, None, self.config.embedding_size]
                self.embedding_hidden_mapping_in.build([None, None, self.config.embedding_size])
        
        # 如果存在 albert_layer_groups 属性
        if getattr(self, "albert_layer_groups", None) is not None:
            # 遍历 albert_layer_groups 中的每个层
            for layer in self.albert_layer_groups:
                # 使用 tf.name_scope 来限定命名空间，命名为 layer 的名称
                with tf.name_scope(layer.name):
                    # 构建当前层，输入形状为 None（未指定特定输入形状）
                    layer.build(None)
    """
    处理权重初始化、预训练模型下载和加载的抽象类。
    """
    
    # 配置类为 AlbertConfig
    config_class = AlbertConfig
    # 基础模型前缀为 "albert"
    base_model_prefix = "albert"

class TFAlbertMLMHead(keras.layers.Layer):
    def __init__(self, config: AlbertConfig, input_embeddings: keras.layers.Layer, **kwargs):
        super().__init__(**kwargs)

        self.config = config
        self.embedding_size = config.embedding_size
        
        # 创建一个全连接层，用于预测下一个词的特征
        self.dense = keras.layers.Dense(
            config.embedding_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
        )
        
        # 根据配置中的激活函数类型，获取激活函数
        if isinstance(config.hidden_act, str):
            self.activation = get_tf_activation(config.hidden_act)
        else:
            self.activation = config.hidden_act
        
        # LayerNormalization 层，用于归一化输入的词嵌入
        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")

        # 输入词嵌入层，用于解码器的输出权重
        self.decoder = input_embeddings

    def build(self, input_shape=None):
        # 增加偏置项，用于每个词汇的输出偏置
        self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias")
        
        # 解码器的偏置项，用于每个词汇的解码偏置
        self.decoder_bias = self.add_weight(
            shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="decoder/bias"
        )

        # 如果已经构建，则直接返回
        if self.built:
            return
        self.built = True
        
        # 构建全连接层
        if getattr(self, "dense", None) is not None:
            with tf.name_scope(self.dense.name):
                self.dense.build([None, None, self.config.hidden_size])
        
        # 构建 LayerNormalization 层
        if getattr(self, "LayerNorm", None) is not None:
            with tf.name_scope(self.LayerNorm.name):
                self.LayerNorm.build([None, None, self.config.embedding_size])

    def get_output_embeddings(self) -> keras.layers.Layer:
        # 返回解码器的词嵌入层
        return self.decoder

    def set_output_embeddings(self, value: tf.Variable):
        # 设置解码器的权重
        self.decoder.weight = value
        self.decoder.vocab_size = shape_list(value)[0]

    def get_bias(self) -> Dict[str, tf.Variable]:
        # 返回偏置项字典
        return {"bias": self.bias, "decoder_bias": self.decoder_bias}

    def set_bias(self, value: tf.Variable):
        # 设置偏置项的值
        self.bias = value["bias"]
        self.decoder_bias = value["decoder_bias"]
        self.config.vocab_size = shape_list(value["bias"])[0]
    # 使用全连接层对隐藏状态进行线性变换
    hidden_states = self.dense(inputs=hidden_states)
    # 应用激活函数对线性变换后的隐藏状态进行非线性变换
    hidden_states = self.activation(hidden_states)
    # 应用层归一化操作对隐藏状态进行归一化处理
    hidden_states = self.LayerNorm(inputs=hidden_states)
    # 获取隐藏状态张量的第二个维度，即序列长度
    seq_length = shape_list(tensor=hidden_states)[1]
    # 对隐藏状态张量进行形状重塑，将其转换为二维张量
    hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.embedding_size])
    # 对重塑后的隐藏状态张量与解码器权重矩阵进行矩阵乘法运算
    hidden_states = tf.matmul(a=hidden_states, b=self.decoder.weight, transpose_b=True)
    # 将矩阵乘法结果的张量形状重塑为三维张量，恢复为序列长度相关的形状
    hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.config.vocab_size])
    # 对矩阵乘法结果张量添加偏置项
    hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.decoder_bias)

    # 返回经过线性变换、激活函数、归一化、矩阵乘法、偏置项处理后的隐藏状态张量
    return hidden_states
# 使用 keras_serializable 装饰器将类 TFAlbertMainLayer 序列化为 Keras 层
@keras_serializable
class TFAlbertMainLayer(keras.layers.Layer):
    # 设置配置类为 AlbertConfig
    config_class = AlbertConfig

    # 初始化函数，接受 AlbertConfig 类型的 config 和一个布尔值 add_pooling_layer
    def __init__(self, config: AlbertConfig, add_pooling_layer: bool = True, **kwargs):
        # 调用父类的初始化方法
        super().__init__(**kwargs)

        # 保存传入的配置对象
        self.config = config

        # 创建 TFAlbertEmbeddings 层，并命名为 "embeddings"
        self.embeddings = TFAlbertEmbeddings(config, name="embeddings")

        # 创建 TFAlbertTransformer 层，并命名为 "encoder"
        self.encoder = TFAlbertTransformer(config, name="encoder")

        # 如果 add_pooling_layer 为 True，则创建一个 Dense 层作为池化层，否则为 None
        self.pooler = (
            keras.layers.Dense(
                units=config.hidden_size,
                kernel_initializer=get_initializer(config.initializer_range),
                activation="tanh",
                name="pooler",
            )
            if add_pooling_layer
            else None
        )

    # 返回输入嵌入层 embeddings
    def get_input_embeddings(self) -> keras.layers.Layer:
        return self.embeddings

    # 设置输入嵌入层的权重值和词汇大小
    def set_input_embeddings(self, value: tf.Variable):
        self.embeddings.weight = value
        self.embeddings.vocab_size = shape_list(value)[0]

    # 未实现的方法，用于剪枝模型中的头部
    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        raise NotImplementedError

    # 使用 unpack_inputs 装饰器，处理输入的各种参数
    @unpack_inputs
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        position_ids: np.ndarray | tf.Tensor | None = None,
        head_mask: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: bool = False,
    ):
        # 函数内容未提供

    # 构建层，如果已经构建则直接返回
    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        
        # 如果存在嵌入层 embeddings，则构建其内部结构
        if getattr(self, "embeddings", None) is not None:
            with tf.name_scope(self.embeddings.name):
                self.embeddings.build(None)
        
        # 如果存在编码器 encoder，则构建其内部结构
        if getattr(self, "encoder", None) is not None:
            with tf.name_scope(self.encoder.name):
                self.encoder.build(None)
        
        # 如果存在池化层 pooler，则构建其内部结构，输入形状为 [None, None, self.config.hidden_size]
        if getattr(self, "pooler", None) is not None:
            with tf.name_scope(self.pooler.name):
                self.pooler.build([None, None, self.config.hidden_size])


# 使用 dataclass 装饰器创建 TFAlbertForPreTrainingOutput 类，继承自 ModelOutput
@dataclass
class TFAlbertForPreTrainingOutput(ModelOutput):
    """
    Output type of [`TFAlbertForPreTraining`].
    """
    loss: tf.Tensor = None
    # 损失值，初始化为 None

    prediction_logits: tf.Tensor = None
    # 语言建模头部的预测分数张量，形状为 `(batch_size, sequence_length, config.vocab_size)`，在 SoftMax 之前的分数。

    sop_logits: tf.Tensor = None
    # 下一个序列预测（分类）头部的预测分数张量，形状为 `(batch_size, 2)`，在 SoftMax 之前的分数，表示 True/False 的延续。

    hidden_states: Tuple[tf.Tensor] | None = None
    # 当 `output_hidden_states=True` 或 `config.output_hidden_states=True` 时返回的隐藏状态元组，
    # 包含每个层的输出张量和初始嵌入输出，形状为 `(batch_size, sequence_length, hidden_size)`。

    attentions: Tuple[tf.Tensor] | None = None
    # 当 `output_attentions=True` 或 `config.output_attentions=True` 时返回的注意力张量元组，
    # 包含每个层的注意力权重张量，形状为 `(batch_size, num_heads, sequence_length, sequence_length)`。
"""
    这个模型继承自 `TFPreTrainedModel`。查看超类文档以获取库实现的通用方法，比如下载或保存模型、调整输入嵌入大小、修剪头等。

    这个模型也是 [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) 的子类。将其用作常规的 TF 2.0 Keras 模型，并参考 TF 2.0 文档，以获取所有与一般使用和行为相关的信息。

    <Tip>

    `transformers` 中的 TensorFlow 模型和层接受两种输入格式：

    - 将所有输入作为关键字参数（类似于 PyTorch 模型），或者
    - 将所有输入作为列表、元组或字典传递给第一个位置参数。

    支持第二种格式的原因在于，Keras 方法在将输入传递给模型和层时更喜欢这种格式。由于这种支持，当使用诸如 `model.fit()` 这样的方法时，只需传递模型支持的任何格式的输入和标签即可！然而，如果您想在 Keras 方法之外使用第二种格式，比如在使用 Keras `Functional` API 创建自己的层或模型时，可以使用三种可能性来收集第一个位置参数中的所有输入张量：

    - 只有 `input_ids` 的单个张量：`model(input_ids)`
    - 长度可变的列表，按照文档字符串中给定的顺序包含一个或多个输入张量：`model([input_ids, attention_mask])` 或 `model([input_ids, attention_mask, token_type_ids])`
    - 一个字典，将一个或多个输入张量与文档字符串中给定的输入名称相关联：`model({"input_ids": input_ids, "token_type_ids": token_type_ids})`

    请注意，当使用 [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) 创建模型和层时，您无需担心这些问题，因为可以像将输入传递给任何其他 Python 函数一样传递输入！

    </Tip>

    Args:
        config ([`AlbertConfig`]): 包含模型所有参数的模型配置类。
            使用配置文件初始化不会加载与模型关联的权重，仅加载配置。查看 [`~PreTrainedModel.from_pretrained`] 方法以加载模型权重。
"""

@add_start_docstrings(
    "不带任何特定头部的裸 Albert 模型变压器输出原始隐藏状态。",
    ALBERT_START_DOCSTRING,
)
class TFAlbertModel(TFAlbertPreTrainedModel):
    def __init__(self, config: AlbertConfig, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)

        self.albert = TFAlbertMainLayer(config, name="albert")

    @unpack_inputs
    # 使用装饰器添加模型前向传播的文档字符串，指定ALBERT模型输入的批次大小和序列长度
    @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    # 使用装饰器添加代码示例的文档字符串，包括检查点、输出类型、配置类等信息
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TFBaseModelOutputWithPooling,
        config_class=_CONFIG_FOR_DOC,
    )
    # 定义模型的前向传播方法，接收多个可能为None的输入参数，并返回模型输出
    def call(
        self,
        input_ids: TFModelInputType | None = None,  # 输入的token ID序列，可以为None
        attention_mask: np.ndarray | tf.Tensor | None = None,  # 注意力遮罩，可以为None
        token_type_ids: np.ndarray | tf.Tensor | None = None,  # token类型ID，可以为None
        position_ids: np.ndarray | tf.Tensor | None = None,  # 位置ID，可以为None
        head_mask: np.ndarray | tf.Tensor | None = None,  # 头部遮罩，可以为None
        inputs_embeds: np.ndarray | tf.Tensor | None = None,  # 输入的嵌入表示，可以为None
        output_attentions: Optional[bool] = None,  # 是否输出注意力权重，默认为None
        output_hidden_states: Optional[bool] = None,  # 是否输出隐藏状态，默认为None
        return_dict: Optional[bool] = None,  # 是否以字典形式返回输出，默认为None
        training: Optional[bool] = False,  # 是否处于训练模式，默认为False
    ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]:
        # 调用ALBERT模型的前向传播方法，传递所有参数，并接收输出
        outputs = self.albert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
        # 返回ALBERT模型的输出
        return outputs

    # 构建方法，用于构建模型
    def build(self, input_shape=None):
        # 如果模型已经构建完成，直接返回
        if self.built:
            return
        # 标记模型已经构建
        self.built = True
        # 如果模型的albert属性存在
        if getattr(self, "albert", None) is not None:
            # 在albert的命名空间内构建albert模型
            with tf.name_scope(self.albert.name):
                # 构建albert模型，不需要输入形状参数
                self.albert.build(None)
"""
Albert Model with two heads on top for pretraining: a `masked language modeling` head and a `sentence order
prediction` (classification) head.
"""
# 继承 TFAlbertPreTrainedModel 和 TFAlbertPreTrainingLoss 类，实现预训练模型
@add_start_docstrings(
    """
    Albert Model with two heads on top for pretraining: a `masked language modeling` head and a `sentence order
    prediction` (classification) head.
    """,
    ALBERT_START_DOCSTRING,  # 添加 Albert 模型的起始文档字符串
)
class TFAlbertForPreTraining(TFAlbertPreTrainedModel, TFAlbertPreTrainingLoss):
    # names with a '.' represents the authorized unexpected/missing layers when a TF model is loaded from a PT model
    # 在从 PyTorch 模型加载 TF 模型时，带 '.' 的名称表示被授权的意外/缺失的层
    _keys_to_ignore_on_load_unexpected = [r"predictions.decoder.weight"]

    def __init__(self, config: AlbertConfig, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)

        # 初始化 Albert 预训练模型，设定标签数
        self.num_labels = config.num_labels

        # Albert 主层，使用 TFAlbertMainLayer 初始化，命名为 "albert"
        self.albert = TFAlbertMainLayer(config, name="albert")

        # Albert MLM 头部，使用 TFAlbertMLMHead 初始化，输入嵌入使用 self.albert.embeddings，命名为 "predictions"
        self.predictions = TFAlbertMLMHead(config, input_embeddings=self.albert.embeddings, name="predictions")

        # Albert SOP 分类头部，使用 TFAlbertSOPHead 初始化，命名为 "sop_classifier"
        self.sop_classifier = TFAlbertSOPHead(config, name="sop_classifier")

    # 返回 MLM 头部
    def get_lm_head(self) -> keras.layers.Layer:
        return self.predictions

    # 模型的前向传播函数，接受一系列输入，参照 ALBERT_INPUTS_DOCSTRING 添加起始文档字符串
    @unpack_inputs
    @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @replace_return_docstrings(output_type=TFAlbertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        position_ids: np.ndarray | tf.Tensor | None = None,
        head_mask: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: np.ndarray | tf.Tensor | None = None,
        sentence_order_label: np.ndarray | tf.Tensor | None = None,
        training: Optional[bool] = False,
        **kwargs,
    ) -> Union[TFAlbertForPreTrainingOutput, Tuple[tf.Tensor]]:
        # 省略的部分对于模型前向传播的具体实现，输出和配置信息
        ) -> Union[TFAlbertForPreTrainingOutput, Tuple[tf.Tensor]]:
        r"""
        Return:

        Example:

        ```
        >>> import tensorflow as tf
        >>> from transformers import AutoTokenizer, TFAlbertForPreTraining

        >>> tokenizer = AutoTokenizer.from_pretrained("albert/albert-base-v2")
        >>> model = TFAlbertForPreTraining.from_pretrained("albert/albert-base-v2")

        >>> input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]
        >>> # Batch size 1
        >>> outputs = model(input_ids)

        >>> prediction_logits = outputs.prediction_logits
        >>> sop_logits = outputs.sop_logits
        ```"""

        # 调用 self.albert 模型进行预测
        outputs = self.albert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
        # 提取模型输出的序列输出和池化输出
        sequence_output, pooled_output = outputs[:2]
        # 使用 predictions 层生成预测分数
        prediction_scores = self.predictions(hidden_states=sequence_output)
        # 使用 sop_classifier 层生成 SOP 分类分数
        sop_scores = self.sop_classifier(pooled_output=pooled_output, training=training)
        # 初始化总损失
        total_loss = None

        # 如果有标签和句子顺序标签，则计算损失
        if labels is not None and sentence_order_label is not None:
            # 构建标签字典
            d_labels = {"labels": labels}
            d_labels["sentence_order_label"] = sentence_order_label
            # 使用 hf_compute_loss 计算总损失
            total_loss = self.hf_compute_loss(labels=d_labels, logits=(prediction_scores, sop_scores))

        # 如果 return_dict 为 False，则返回扁平化的输出元组
        if not return_dict:
            output = (prediction_scores, sop_scores) + outputs[2:]
            return ((total_loss,) + output) if total_loss is not None else output

        # 如果 return_dict 为 True，则返回 TFAlbertForPreTrainingOutput 对象
        return TFAlbertForPreTrainingOutput(
            loss=total_loss,
            prediction_logits=prediction_scores,
            sop_logits=sop_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def build(self, input_shape=None):
        # 如果已经构建过，则直接返回
        if self.built:
            return
        # 设置构建标志为 True
        self.built = True
        # 如果存在 self.albert 属性，则构建 self.albert 模型
        if getattr(self, "albert", None) is not None:
            with tf.name_scope(self.albert.name):
                self.albert.build(None)
        # 如果存在 self.predictions 属性，则构建 self.predictions 层
        if getattr(self, "predictions", None) is not None:
            with tf.name_scope(self.predictions.name):
                self.predictions.build(None)
        # 如果存在 self.sop_classifier 属性，则构建 self.sop_classifier 层
        if getattr(self, "sop_classifier", None) is not None:
            with tf.name_scope(self.sop_classifier.name):
                self.sop_classifier.build(None)
# 定义 TFAlbertSOPHead 类，继承自 keras 的 Layer 类
class TFAlbertSOPHead(keras.layers.Layer):
    
    # 初始化方法，接受 AlbertConfig 类型的 config 参数和其他关键字参数
    def __init__(self, config: AlbertConfig, **kwargs):
        super().__init__(**kwargs)
        
        # 使用 config 的 classifier_dropout_prob 属性创建一个 Dropout 层
        self.dropout = keras.layers.Dropout(rate=config.classifier_dropout_prob)
        
        # 使用 config 的 num_labels 属性和 initializer_range 属性创建一个全连接 Dense 层
        self.classifier = keras.layers.Dense(
            units=config.num_labels,
            kernel_initializer=get_initializer(config.initializer_range),
            name="classifier",
        )
        
        # 将 config 参数存储在实例变量中
        self.config = config

    # call 方法用于定义层的前向传播逻辑
    def call(self, pooled_output: tf.Tensor, training: bool) -> tf.Tensor:
        # 对输入的 pooled_output 应用 Dropout 操作
        dropout_pooled_output = self.dropout(inputs=pooled_output, training=training)
        
        # 将 Dropout 后的输出传递给全连接 Dense 层，得到 logits
        logits = self.classifier(inputs=dropout_pooled_output)

        # 返回 logits
        return logits

    # build 方法用于构建层，在此方法中创建层的变量
    def build(self, input_shape=None):
        # 如果层已经构建过，直接返回
        if self.built:
            return
        
        # 将层标记为已构建
        self.built = True
        
        # 如果 self.classifier 存在，则在名为 self.classifier 的命名作用域下构建全连接层
        if getattr(self, "classifier", None) is not None:
            with tf.name_scope(self.classifier.name):
                self.classifier.build([None, None, self.config.hidden_size])


# 使用装饰器 @add_start_docstrings 添加文档字符串描述 Albert Model 的语言建模头部
@add_start_docstrings("""Albert Model with a `language modeling` head on top.""", ALBERT_START_DOCSTRING)
class TFAlbertForMaskedLM(TFAlbertPreTrainedModel, TFMaskedLanguageModelingLoss):
    
    # 定义 _keys_to_ignore_on_load_unexpected 属性，用于在加载 TF 模型时忽略指定的层
    # 名称中带有 '.' 表示的是从 PT 模型加载 TF 模型时可能会出现的未预期的或丢失的层
    _keys_to_ignore_on_load_unexpected = [r"pooler", r"predictions.decoder.weight"]

    # 初始化方法，接受 AlbertConfig 类型的 config 参数和其他位置和关键字参数
    def __init__(self, config: AlbertConfig, *inputs, **kwargs):
        # 调用父类 TFAlbertPreTrainedModel 的初始化方法
        super().__init__(config, *inputs, **kwargs)
        
        # 创建 TFAlbertMainLayer 类的实例 albert，设置 add_pooling_layer=False，命名为 "albert"
        self.albert = TFAlbertMainLayer(config, add_pooling_layer=False, name="albert")
        
        # 创建 TFAlbertMLMHead 类的实例 predictions，设置 input_embeddings 为 self.albert.embeddings，命名为 "predictions"
        self.predictions = TFAlbertMLMHead(config, input_embeddings=self.albert.embeddings, name="predictions")

    # 返回预测头部的方法，返回 self.predictions
    def get_lm_head(self) -> keras.layers.Layer:
        return self.predictions

    # 使用装饰器 @unpack_inputs、@add_start_docstrings_to_model_forward 和 @replace_return_docstrings
    # 添加文档字符串描述 call 方法的输入和输出
    @unpack_inputs
    @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @replace_return_docstrings(output_type=TFMaskedLMOutput, config_class=_CONFIG_FOR_DOC)
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        position_ids: np.ndarray | tf.Tensor | None = None,
        head_mask: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: np.ndarray | tf.Tensor | None = None,
        training: Optional[bool] = False,
        outputs = self.albert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
        # 从 ALBERT 模型中获取输出结果，包括序列输出和其他选项
        sequence_output = outputs[0]
        # 使用序列输出计算预测得分
        prediction_scores = self.predictions(hidden_states=sequence_output, training=training)
        # 如果提供了标签，计算损失；否则损失为 None
        loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=prediction_scores)

        # 如果不要求返回字典形式的结果，按顺序返回预测得分和其他输出
        if not return_dict:
            output = (prediction_scores,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        # 否则，返回 TFMaskedLMOutput 对象，包括损失、预测得分、隐藏状态和注意力权重
        return TFMaskedLMOutput(
            loss=loss,
            logits=prediction_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
    # 定义 build 方法，用于构建模型
    def build(self, input_shape=None):
        # 如果已经构建过，直接返回，避免重复构建
        if self.built:
            return
        # 设置标记表示已经构建
        self.built = True
        
        # 如果模型中存在名为 albert 的属性，开始构建 albert 部分
        if getattr(self, "albert", None) is not None:
            # 使用 albert 的名称作为命名空间，开始构建 albert
            with tf.name_scope(self.albert.name):
                self.albert.build(None)
        
        # 如果模型中存在名为 predictions 的属性，开始构建 predictions 部分
        if getattr(self, "predictions", None) is not None:
            # 使用 predictions 的名称作为命名空间，开始构建 predictions
            with tf.name_scope(self.predictions.name):
                self.predictions.build(None)
# 使用装饰器添加文档字符串，描述了这个类的用途和结构
@add_start_docstrings(
    """
    Albert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
    output) e.g. for GLUE tasks.
    """,
    ALBERT_START_DOCSTRING,
)
# 定义 TFAlbertForSequenceClassification 类，继承自 TFAlbertPreTrainedModel 和 TFSequenceClassificationLoss
class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel, TFSequenceClassificationLoss):
    # 在加载过程中忽略的不期望/缺失的层名称列表
    _keys_to_ignore_on_load_unexpected = [r"predictions"]
    # 在加载过程中忽略的缺失的层名称列表
    _keys_to_ignore_on_load_missing = [r"dropout"]

    # 构造方法，初始化类的实例
    def __init__(self, config: AlbertConfig, *inputs, **kwargs):
        # 调用父类的构造方法
        super().__init__(config, *inputs, **kwargs)

        # 设置类别数目
        self.num_labels = config.num_labels

        # 创建 Albert 主层，使用给定的配置和名称
        self.albert = TFAlbertMainLayer(config, name="albert")
        # 添加 Dropout 层，使用给定的分类器 dropout 概率
        self.dropout = keras.layers.Dropout(rate=config.classifier_dropout_prob)
        # 添加 Dense 层作为分类器，设置输出单元数为类别数目，使用给定的初始化器范围和名称
        self.classifier = keras.layers.Dense(
            units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
        )
        # 保存配置对象
        self.config = config

    # 使用装饰器添加文档字符串，描述了 call 方法的输入和输出
    @unpack_inputs
    @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint="vumichien/albert-base-v2-imdb",
        output_type=TFSequenceClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
        expected_output="'LABEL_1'",
        expected_loss=0.12,
    )
    # 定义 call 方法，实现模型的前向传播
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        position_ids: np.ndarray | tf.Tensor | None = None,
        head_mask: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: np.ndarray | tf.Tensor | None = None,
        training: Optional[bool] = False,
        # 可选的输入参数，用于自动解包输入
    ) -> Union[TFSequenceClassifierOutput, Tuple[tf.Tensor]]:
        r"""
        labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        # 调用 ALBERT 模型进行前向传播，获取模型输出
        outputs = self.albert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
        # 从 ALBERT 模型输出中获取汇聚输出
        pooled_output = outputs[1]
        # 对汇聚输出应用 dropout，以防止过拟合
        pooled_output = self.dropout(inputs=pooled_output, training=training)
        # 使用分类器模型对汇聚输出进行分类，得到预测 logits
        logits = self.classifier(inputs=pooled_output)
        # 如果提供了标签，则计算损失值
        loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=logits)

        # 如果不要求返回字典，则按顺序返回 logits 和可能的额外输出
        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        # 返回 TFSequenceClassifierOutput 对象，包括损失、logits、隐藏状态和注意力权重
        return TFSequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        # 如果模型已经构建过，则直接返回
        if getattr(self, "albert", None) is not None:
            # 使用 ALBERT 模型的名字空间构建模型
            with tf.name_scope(self.albert.name):
                self.albert.build(None)
        # 如果存在分类器模型，则使用分类器的名字空间构建模型
        if getattr(self, "classifier", None) is not None:
            with tf.name_scope(self.classifier.name):
                self.classifier.build([None, None, self.config.hidden_size])
@add_start_docstrings(
    """
    Albert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
    Named-Entity-Recognition (NER) tasks.
    """,
    ALBERT_START_DOCSTRING,
)
class TFAlbertForTokenClassification(TFAlbertPreTrainedModel, TFTokenClassificationLoss):
    # names with a '.' represents the authorized unexpected/missing layers when a TF model is loaded from a PT model
    # 定义在从 PyTorch 模型加载 TF 模型时，可以忽略的意外/缺失层的名称列表
    _keys_to_ignore_on_load_unexpected = [r"pooler", r"predictions"]
    # 定义在从 PyTorch 模型加载 TF 模型时，可以忽略的缺失层的名称列表
    _keys_to_ignore_on_load_missing = [r"dropout"]

    def __init__(self, config: AlbertConfig, *inputs, **kwargs):
        # 调用父类构造函数初始化模型
        super().__init__(config, *inputs, **kwargs)

        # 从配置中获取标签数量
        self.num_labels = config.num_labels

        # 创建 Albert 主层对象，不添加池化层，并命名为 "albert"
        self.albert = TFAlbertMainLayer(config, add_pooling_layer=False, name="albert")

        # 根据配置中的分类器丢弃率或者隐藏层丢弃率，创建 Dropout 层
        classifier_dropout_prob = (
            config.classifier_dropout_prob
            if config.classifier_dropout_prob is not None
            else config.hidden_dropout_prob
        )
        self.dropout = keras.layers.Dropout(rate=classifier_dropout_prob)

        # 创建分类器 Dense 层，用于标签分类，初始化方式使用配置中的范围初始化
        self.classifier = keras.layers.Dense(
            units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
        )

        # 将配置对象保存到模型中
        self.config = config

    @unpack_inputs
    @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TFTokenClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        position_ids: np.ndarray | tf.Tensor | None = None,
        head_mask: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: np.ndarray | tf.Tensor | None = None,
        training: Optional[bool] = False,
        **kwargs,
    ):
        """
        Performs forward pass of the model.
        """
        # 调用父类的 `call` 方法，执行模型的前向传播
        return super().call(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            labels=labels,
            training=training,
            **kwargs,
        )
    ) -> Union[TFTokenClassifierOutput, Tuple[tf.Tensor]]:
        r"""
        labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        """
        # 调用 ALBERT 模型，获取模型的输出结果
        outputs = self.albert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
        # 从 ALBERT 模型的输出中获取序列输出
        sequence_output = outputs[0]
        # 对序列输出应用 dropout 操作，用于防止过拟合
        sequence_output = self.dropout(inputs=sequence_output, training=training)
        # 将 dropout 后的序列输出输入分类器，得到 logits（预测结果）
        logits = self.classifier(inputs=sequence_output)
        # 如果提供了标签，则计算损失值
        loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=logits)

        # 如果不需要返回字典形式的输出
        if not return_dict:
            # 组装输出元组
            output = (logits,) + outputs[2:]
            # 如果有损失值，则将损失值作为输出的第一个元素
            return ((loss,) + output) if loss is not None else output

        # 返回 TFTokenClassifierOutput 对象，包含损失值、logits、隐藏状态和注意力权重
        return TFTokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def build(self, input_shape=None):
        # 如果模型已经构建完成，则直接返回
        if self.built:
            return
        # 将模型标记为已构建
        self.built = True
        # 如果存在 ALBERT 模型，则构建 ALBERT 模型
        if getattr(self, "albert", None) is not None:
            with tf.name_scope(self.albert.name):
                self.albert.build(None)
        # 如果存在分类器模型，则构建分类器模型
        if getattr(self, "classifier", None) is not None:
            with tf.name_scope(self.classifier.name):
                # 构建分类器模型，输入形状为 [None, None, self.config.hidden_size]
                self.classifier.build([None, None, self.config.hidden_size])
"""
Albert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
"""
# 使用 Albert 模型，添加一个用于抽取式问答任务（如 SQuAD）的跨度分类头部（在隐藏状态输出顶部的线性层，用于计算“跨度起始对数”和“跨度终止对数”）。

# 导入 ALBERT_START_DOCSTRING 作为注释的一部分
@add_start_docstrings(ALBERT_START_DOCSTRING)

# 定义 TFAlbertForQuestionAnswering 类，继承自 TFAlbertPreTrainedModel 和 TFQuestionAnsweringLoss
class TFAlbertForQuestionAnswering(TFAlbertPreTrainedModel, TFQuestionAnsweringLoss):

    # names with a '.' represents the authorized unexpected/missing layers when a TF model is loaded from a PT model
    # _keys_to_ignore_on_load_unexpected 是在从 PT 模型加载 TF 模型时允许的未预期/丢失的层的名称列表
    _keys_to_ignore_on_load_unexpected = [r"pooler", r"predictions"]

    # 初始化方法，接收一个 AlbertConfig 类型的 config 对象和其他位置参数
    def __init__(self, config: AlbertConfig, *inputs, **kwargs):
        # 调用父类的初始化方法
        super().__init__(config, *inputs, **kwargs)

        # 设置类属性 num_labels 等于 config 中的 num_labels
        self.num_labels = config.num_labels

        # 初始化 Albert 主层对象，设置不添加池化层，并命名为 "albert"
        self.albert = TFAlbertMainLayer(config, add_pooling_layer=False, name="albert")

        # 初始化 QA 输出层，使用 Dense 层，单元数为 config 中的 num_labels，使用指定的初始化器范围初始化权重，命名为 "qa_outputs"
        self.qa_outputs = keras.layers.Dense(
            units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
        )

        # 将 config 对象保存为类属性
        self.config = config

    # 调用方法，用装饰器添加了多个文档字符串，说明了输入和输出的详细信息，以及模型的用法示例
    @unpack_inputs
    @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint="vumichien/albert-base-v2-squad2",
        output_type=TFQuestionAnsweringModelOutput,
        config_class=_CONFIG_FOR_DOC,
        qa_target_start_index=12,
        qa_target_end_index=13,
        expected_output="'a nice puppet'",
        expected_loss=7.36,
    )
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        position_ids: np.ndarray | tf.Tensor | None = None,
        head_mask: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        start_positions: np.ndarray | tf.Tensor | None = None,
        end_positions: np.ndarray | tf.Tensor | None = None,
        training: Optional[bool] = False,
    ) -> Union[TFQuestionAnsweringModelOutput, Tuple[tf.Tensor]]:
        r"""
        start_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        """
        outputs = self.albert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
        # 获取 ALBERT 模型的输出，包括序列输出、注意力权重等

        sequence_output = outputs[0]
        # 从 ALBERT 输出中提取序列输出

        logits = self.qa_outputs(inputs=sequence_output)
        # 将序列输出传递给 QA 输出层，得到预测的开始和结束位置的 logits
        
        start_logits, end_logits = tf.split(value=logits, num_or_size_splits=2, axis=-1)
        # 将 logits 沿最后一个维度分割为开始和结束 logits
        
        start_logits = tf.squeeze(input=start_logits, axis=-1)
        end_logits = tf.squeeze(input=end_logits, axis=-1)
        # 移除 logits 的单维度，以匹配预期的形状

        loss = None

        if start_positions is not None and end_positions is not None:
            labels = {"start_position": start_positions}
            labels["end_position"] = end_positions
            # 构建标签字典，包含开始和结束位置的真实标签

            loss = self.hf_compute_loss(labels=labels, logits=(start_logits, end_logits))
            # 使用损失计算函数计算开始和结束位置的损失

        if not return_dict:
            output = (start_logits, end_logits) + outputs[2:]
            # 如果不返回字典形式的结果，构建输出元组

            return ((loss,) + output) if loss is not None else output
            # 如果有损失，则在输出元组前添加损失；否则只返回输出元组

        return TFQuestionAnsweringModelOutput(
            loss=loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
        # 返回 TFQuestionAnsweringModelOutput 对象，包含损失、开始和结束 logits、隐藏状态和注意力权重

    def build(self, input_shape=None):
        if self.built:
            return
        # 如果模型已经建立过，则直接返回

        self.built = True
        # 将模型标记为已建立

        if getattr(self, "albert", None) is not None:
            with tf.name_scope(self.albert.name):
                self.albert.build(None)
        # 如果存在 ALBERT 模型，使用其名称作为作用域，构建 ALBERT 模型

        if getattr(self, "qa_outputs", None) is not None:
            with tf.name_scope(self.qa_outputs.name):
                self.qa_outputs.build([None, None, self.config.hidden_size])
        # 如果存在 QA 输出层，使用其名称作为作用域，构建 QA 输出层
"""
Albert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
softmax) e.g. for RocStories/SWAG tasks.
"""
@add_start_docstrings(
    """
    Albert 模型，顶部带有一个多选分类头部（在汇总输出的基础上添加一个线性层和 softmax），例如 RocStories/SWAG 任务。
    """,
    ALBERT_START_DOCSTRING,
)
class TFAlbertForMultipleChoice(TFAlbertPreTrainedModel, TFMultipleChoiceLoss):
    """
    names with a '.' represents the authorized unexpected/missing layers when a TF model is loaded from a PT model
    """
    _keys_to_ignore_on_load_unexpected = [r"pooler", r"predictions"]
    # List of keys ignored when certain layers are missing during TF model loading from PT model
    _keys_to_ignore_on_load_missing = [r"dropout"]

    def __init__(self, config: AlbertConfig, *inputs, **kwargs):
        """
        Initialize TFAlbertForMultipleChoice model
        """
        super().__init__(config, *inputs, **kwargs)

        # Initialize Albert main layer with provided configuration
        self.albert = TFAlbertMainLayer(config, name="albert")
        # Dropout layer with dropout rate set from configuration
        self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
        # Classifier dense layer initialized with specific initializer range from configuration
        self.classifier = keras.layers.Dense(
            units=1, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
        )
        # Store the configuration object for reference
        self.config = config

    @unpack_inputs
    @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TFMultipleChoiceModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        position_ids: np.ndarray | tf.Tensor | None = None,
        head_mask: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: np.ndarray | tf.Tensor | None = None,
        training: Optional[bool] = False,
        ):
        """
        Perform forward pass of TFAlbertForMultipleChoice model.
        """
    ) -> Union[TFMultipleChoiceModelOutput, Tuple[tf.Tensor]]:
        r"""
        labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices]`
            where `num_choices` is the size of the second dimension of the input tensors. (See `input_ids` above)
        """

        # 如果提供了 input_ids，则获取 num_choices 和 seq_length
        if input_ids is not None:
            num_choices = shape_list(input_ids)[1]
            seq_length = shape_list(input_ids)[2]
        # 否则，从 inputs_embeds 获取 num_choices 和 seq_length
        else:
            num_choices = shape_list(inputs_embeds)[1]
            seq_length = shape_list(inputs_embeds)[2]

        # 将 input_ids 展平成二维张量，如果 input_ids 不为 None
        flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) if input_ids is not None else None
        # 将 attention_mask 展平成二维张量，如果 attention_mask 不为 None
        flat_attention_mask = (
            tf.reshape(tensor=attention_mask, shape=(-1, seq_length)) if attention_mask is not None else None
        )
        # 将 token_type_ids 展平成二维张量，如果 token_type_ids 不为 None
        flat_token_type_ids = (
            tf.reshape(tensor=token_type_ids, shape=(-1, seq_length)) if token_type_ids is not None else None
        )
        # 将 position_ids 展平成二维张量，如果 position_ids 不为 None
        flat_position_ids = (
            tf.reshape(tensor=position_ids, shape=(-1, seq_length)) if position_ids is not None else None
        )
        # 将 inputs_embeds 展平成三维张量，如果 inputs_embeds 不为 None
        flat_inputs_embeds = (
            tf.reshape(tensor=inputs_embeds, shape=(-1, seq_length, shape_list(inputs_embeds)[3]))
            if inputs_embeds is not None
            else None
        )
        # 使用 ALBERT 模型进行推断
        outputs = self.albert(
            input_ids=flat_input_ids,
            attention_mask=flat_attention_mask,
            token_type_ids=flat_token_type_ids,
            position_ids=flat_position_ids,
            head_mask=head_mask,
            inputs_embeds=flat_inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
        # 获取池化后的输出
        pooled_output = outputs[1]
        # 应用 dropout
        pooled_output = self.dropout(inputs=pooled_output, training=training)
        # 使用分类器进行预测
        logits = self.classifier(inputs=pooled_output)
        # 将 logits 重新形状为二维张量
        reshaped_logits = tf.reshape(tensor=logits, shape=(-1, num_choices))
        # 如果提供了 labels，则计算损失
        loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=reshaped_logits)

        # 如果不需要返回字典格式的输出，则返回相应的结果元组
        if not return_dict:
            output = (reshaped_logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        # 如果需要返回字典格式的输出，则构造 TFMultipleChoiceModelOutput 对象并返回
        return TFMultipleChoiceModelOutput(
            loss=loss,
            logits=reshaped_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
    # 如果模型已经构建，直接返回，避免重复构建
    if self.built:
        return
    # 标记模型已经构建
    self.built = True
    
    # 如果存在属性 self.albert，则构建 self.albert 模型
    if getattr(self, "albert", None) is not None:
        # 使用 self.albert 的名称作为命名空间，并构建该模型
        with tf.name_scope(self.albert.name):
            self.albert.build(None)
    
    # 如果存在属性 self.classifier，则构建 self.classifier 模型
    if getattr(self, "classifier", None) is not None:
        # 使用 self.classifier 的名称作为命名空间，并构建该模型
        with tf.name_scope(self.classifier.name):
            # 构建 classifier 模型，传入输入形状 [None, None, self.config.hidden_size]
            self.classifier.build([None, None, self.config.hidden_size])

`.\models\albert\tokenization_albert.py`

# coding=utf-8
# Copyright 2018 Google AI, Google Brain and the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Tokenization classes for ALBERT model."""

# 引入必要的库和模块
import os
import unicodedata
from shutil import copyfile
from typing import Any, Dict, List, Optional, Tuple

import sentencepiece as spm  # 导入句子分词模块

from ...tokenization_utils import AddedToken, PreTrainedTokenizer  # 导入自定义的分词工具类
from ...utils import logging  # 导入日志工具

# 获取当前模块的日志记录器
logger = logging.get_logger(__name__)
# 定义 ALBERT 模型的词汇文件名称
VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"}

# 预训练模型的词汇文件映射
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "albert/albert-base-v1": "https://huggingface.co/albert/albert-base-v1/resolve/main/spiece.model",
        "albert/albert-large-v1": "https://huggingface.co/albert/albert-large-v1/resolve/main/spiece.model",
        "albert/albert-xlarge-v1": "https://huggingface.co/albert/albert-xlarge-v1/resolve/main/spiece.model",
        "albert/albert-xxlarge-v1": "https://huggingface.co/albert/albert-xxlarge-v1/resolve/main/spiece.model",
        "albert/albert-base-v2": "https://huggingface.co/albert/albert-base-v2/resolve/main/spiece.model",
        "albert/albert-large-v2": "https://huggingface.co/albert/albert-large-v2/resolve/main/spiece.model",
        "albert/albert-xlarge-v2": "https://huggingface.co/albert/albert-xlarge-v2/resolve/main/spiece.model",
        "albert/albert-xxlarge-v2": "https://huggingface.co/albert/albert-xxlarge-v2/resolve/main/spiece.model",
    }
}

# 预训练模型的位置嵌入尺寸
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "albert/albert-base-v1": 512,
    "albert/albert-large-v1": 512,
    "albert/albert-xlarge-v1": 512,
    "albert/albert-xxlarge-v1": 512,
    "albert/albert-base-v2": 512,
    "albert/albert-large-v2": 512,
    "albert/albert-xlarge-v2": 512,
    "albert/albert-xxlarge-v2": 512,
}

# SentencePiece 分词器特有的下划线符号
SPIECE_UNDERLINE = "▁"

# ALBERT 模型的分词器类，继承自 PreTrainedTokenizer 类
class AlbertTokenizer(PreTrainedTokenizer):
    """
    Construct an ALBERT tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Attributes:
        sp_model (`SentencePieceProcessor`):
            The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
    """

    # 词汇文件名字典
    vocab_files_names = VOCAB_FILES_NAMES
    # 预训练模型的词汇文件映射
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    # 预训练模型的最大输入尺寸
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES

    # sp_model 是 SentencePieceProcessor 对象，用于字符串、token 和 ID 的转换
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.sp_model = None  # 初始化 sp_model 为空
    # 初始化函数，用于初始化一个新的对象实例
    def __init__(
        self,
        vocab_file,  # 词汇文件的路径
        do_lower_case=True,  # 是否将输入文本转换为小写，默认为True
        remove_space=True,  # 是否移除输入文本中的空格，默认为True
        keep_accents=False,  # 是否保留输入文本中的重音符号，默认为False
        bos_token="[CLS]",  # 开始标记（Beginning of Sentence），默认为"[CLS]"
        eos_token="[SEP]",  # 结束标记（End of Sentence），默认为"[SEP]"
        unk_token="<unk>",  # 未知标记（Unknown Token），默认为"<unk>"
        sep_token="[SEP]",  # 分隔标记（Separator Token），默认为"[SEP]"
        pad_token="<pad>",  # 填充标记（Padding Token），默认为"<pad>"
        cls_token="[CLS]",  # 类别标记（Class Token），默认为"[CLS]"
        mask_token="[MASK]",  # 掩码标记（Mask Token），默认为"[MASK]"
        sp_model_kwargs: Optional[Dict[str, Any]] = None,  # SentencePiece 模型的参数，可选字典类型，默认为None
        **kwargs,  # 其他额外的关键字参数
    ) -> None:
        # 将掩码标记（mask_token）处理成一个 AddedToken 对象，具有特定的处理属性
        mask_token = (
            AddedToken(mask_token, lstrip=True, rstrip=False, normalized=False)
            if isinstance(mask_token, str)
            else mask_token
        )

        # 如果未提供 sp_model_kwargs，则初始化为空字典
        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs

        # 设置对象的各种属性值
        self.do_lower_case = do_lower_case
        self.remove_space = remove_space
        self.keep_accents = keep_accents
        self.vocab_file = vocab_file

        # 使用 SentencePieceProcessor 初始化一个 sp_model 对象，并加载词汇文件
        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(vocab_file)

        # 调用父类的初始化方法，传递参数和额外的关键字参数
        super().__init__(
            do_lower_case=do_lower_case,
            remove_space=remove_space,
            keep_accents=keep_accents,
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            sp_model_kwargs=self.sp_model_kwargs,
            **kwargs,
        )

    # vocab_size 属性，返回 sp_model 中词汇的数量
    @property
    def vocab_size(self) -> int:
        return len(self.sp_model)

    # 获取词汇表的方法，返回词汇到索引的字典
    def get_vocab(self) -> Dict[str, int]:
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    # __getstate__ 方法，用于对象的序列化状态，排除 sp_model 以防止对象过大
    def __getstate__(self):
        state = self.__dict__.copy()
        state["sp_model"] = None  # 将 sp_model 设置为 None，不包含在序列化状态中
        return state

    # __setstate__ 方法，用于对象的反序列化，重新初始化 sp_model
    def __setstate__(self, d):
        self.__dict__ = d

        # 为了向后兼容，如果对象没有 sp_model_kwargs 属性，则设置为空字典
        if not hasattr(self, "sp_model_kwargs"):
            self.sp_model_kwargs = {}

        # 使用 SentencePieceProcessor 重新初始化 sp_model 并加载 vocab_file
        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(self.vocab_file)

    # 文本预处理方法，根据对象的属性对输入文本进行处理并返回处理后的文本
    def preprocess_text(self, inputs):
        if self.remove_space:
            outputs = " ".join(inputs.strip().split())  # 移除多余的空格
        else:
            outputs = inputs

        outputs = outputs.replace("``", '"').replace("''", '"')  # 替换双引号

        if not self.keep_accents:
            outputs = unicodedata.normalize("NFKD", outputs)  # 标准化 unicode 字符串
            outputs = "".join([c for c in outputs if not unicodedata.combining(c)])  # 移除重音符号

        if self.do_lower_case:
            outputs = outputs.lower()  # 将文本转换为小写

        return outputs
    def _tokenize(self, text: str) -> List[str]:
        """Tokenize a string."""
        # 对输入文本进行预处理
        text = self.preprocess_text(text)
        # 使用句子片段模型对文本进行编码，输出为字符串列表
        pieces = self.sp_model.encode(text, out_type=str)
        new_pieces = []
        for piece in pieces:
            if len(piece) > 1 and piece[-1] == str(",") and piece[-2].isdigit():
                # 处理特殊情况的逻辑，参见 https://github.com/google-research/bert/blob/master/README.md#tokenization
                # 当遇到形如 `9,9` 的情况时，确保正确分割为 ['▁9', ',', '9']，而非 [`_9,`, '9']
                cur_pieces = self.sp_model.EncodeAsPieces(piece[:-1].replace(SPIECE_UNDERLINE, ""))
                if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE:
                    if len(cur_pieces[0]) == 1:
                        cur_pieces = cur_pieces[1:]
                    else:
                        cur_pieces[0] = cur_pieces[0][1:]
                cur_pieces.append(piece[-1])
                new_pieces.extend(cur_pieces)
            else:
                new_pieces.append(piece)

        return new_pieces

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        # 使用词汇表将token转换为对应的ID
        return self.sp_model.PieceToId(token)

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        # 使用词汇表将ID转换为对应的token
        return self.sp_model.IdToPiece(index)

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) in a single string."""
        current_sub_tokens = []
        out_string = ""
        prev_is_special = False
        for token in tokens:
            # 确保特殊的token不被句子片段模型解码
            if token in self.all_special_tokens:
                if not prev_is_special:
                    out_string += " "
                out_string += self.sp_model.decode(current_sub_tokens) + token
                prev_is_special = True
                current_sub_tokens = []
            else:
                current_sub_tokens.append(token)
                prev_is_special = False
        out_string += self.sp_model.decode(current_sub_tokens)
        return out_string.strip()

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
        ):
        """Build model inputs from a sequence or a pair of sequence for BERT."""
        # 实现构建适用于BERT模型的特殊token输入
    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create token type IDs tensor from token list indices.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs for the first sequence.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: A list of token type IDs (0 or 1) corresponding to each token in the input sequences.
        """
        # Define token type ID for the first sequence (0)
        token_type_ids = [0] * len(token_ids_0)
        
        if token_ids_1 is not None:
            # Define token type ID for the second sequence (1)
            token_type_ids += [1] * len(token_ids_1)
        
        return token_type_ids
    def create_mask(self, token_ids_0: List[int], token_ids_1: Optional[List[int]]) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT
        sequence pair mask has the following format:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (`List[int]`):
                List of token IDs for the first sequence.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of token IDs for sequence pairs.

        Returns:
            `List[int]`: List of token type IDs according to the given sequence(s).
        """
        # Define separation and classification tokens
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]

        # If only one sequence is provided (token_ids_1 is None), return a mask with 0s for the first sequence
        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]

        # Otherwise, concatenate both sequences and return a mask with 0s for the first sequence and 1s for the second
        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Save the vocabulary files to the specified directory.

        Args:
            save_directory (`str`):
                Directory path where the vocabulary files will be saved.
            filename_prefix (`str`, *optional*):
                Optional prefix to prepend to the vocabulary file names.

        Returns:
            `Tuple[str]`: Tuple containing the path of the saved vocabulary file.
        """
        # Check if the save directory exists; if not, log an error and return
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return

        # Determine the output vocabulary file path
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        # If the current vocabulary file is not the same as the output file and exists, copy it to the output location
        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
            copyfile(self.vocab_file, out_vocab_file)
        # If the current vocabulary file doesn't exist, write the serialized model to the output file
        elif not os.path.isfile(self.vocab_file):
            with open(out_vocab_file, "wb") as fi:
                content_spiece_model = self.sp_model.serialized_model_proto()
                fi.write(content_spiece_model)

        # Return the path of the saved vocabulary file
        return (out_vocab_file,)

Transformers-源码解析-九-

Transformers 源码解析（九）

.\models\albert\configuration_albert.py

.\models\albert\convert_albert_original_tf_checkpoint_to_pytorch.py

D:\Python310\Lib\site-packages\transformers\models\albert\modeling_albert.py

.\models\albert\modeling_flax_albert.py

.\models\albert\modeling_tf_albert.py

.\models\albert\tokenization_albert.py

`.\models\albert\configuration_albert.py`

`.\models\albert\convert_albert_original_tf_checkpoint_to_pytorch.py`

`D:\Python310\Lib\site-packages\transformers\models\albert\modeling_albert.py`

`.\models\albert\modeling_flax_albert.py`

`.\models\albert\modeling_tf_albert.py`

`.\models\albert\tokenization_albert.py`