Transformers Source Code Walkthrough (95)
.\models\roberta\convert_roberta_original_pytorch_checkpoint_to_pytorch.py
import argparse
import pathlib
import fairseq
import torch
from fairseq.models.roberta import RobertaModel as FairseqRobertaModel
from fairseq.modules import TransformerSentenceEncoderLayer
from packaging import version
from transformers import RobertaConfig, RobertaForMaskedLM, RobertaForSequenceClassification
from transformers.models.bert.modeling_bert import (
BertIntermediate,
BertLayer,
BertOutput,
BertSelfAttention,
BertSelfOutput,
)
from transformers.utils import logging
if version.parse(fairseq.__version__) < version.parse("0.9.0"):
raise Exception("requires fairseq >= 0.9.0")
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
SAMPLE_TEXT = "Hello world! cécé herlolip"
def convert_roberta_checkpoint_to_pytorch(
roberta_checkpoint_path: str, pytorch_dump_folder_path: str, classification_head: bool
):
"""
复制/粘贴/调整 RoBERTa 的权重以适应我们的 BERT 结构。
"""
roberta = FairseqRobertaModel.from_pretrained(roberta_checkpoint_path)
roberta.eval()
roberta_sent_encoder = roberta.model.encoder.sentence_encoder
config = RobertaConfig(
vocab_size=roberta_sent_encoder.embed_tokens.num_embeddings,
hidden_size=roberta.args.encoder_embed_dim,
num_hidden_layers=roberta.args.encoder_layers,
num_attention_heads=roberta.args.encoder_attention_heads,
intermediate_size=roberta.args.encoder_ffn_embed_dim,
max_position_embeddings=514,
type_vocab_size=1,
layer_norm_eps=1e-5,
)
if classification_head:
config.num_labels = roberta.model.classification_heads["mnli"].out_proj.weight.shape[0]
print("Our BERT config:", config)
model = RobertaForSequenceClassification(config) if classification_head else RobertaForMaskedLM(config)
model.eval()
model.roberta.embeddings.word_embeddings.weight = roberta_sent_encoder.embed_tokens.weight
model.roberta.embeddings.position_embeddings.weight = roberta_sent_encoder.embed_positions.weight
model.roberta.embeddings.token_type_embeddings.weight.data = torch.zeros_like(
model.roberta.embeddings.token_type_embeddings.weight
)
model.roberta.embeddings.LayerNorm.weight = roberta_sent_encoder.emb_layer_norm.weight
model.roberta.embeddings.LayerNorm.bias = roberta_sent_encoder.emb_layer_norm.bias
for i in range(config.num_hidden_layers):
layer: BertLayer = model.roberta.encoder.layer[i]
roberta_layer: TransformerSentenceEncoderLayer = roberta_sent_encoder.layers[i]
self_attn: BertSelfAttention = layer.attention.self
assert (
roberta_layer.self_attn.k_proj.weight.data.shape
== roberta_layer.self_attn.q_proj.weight.data.shape
== roberta_layer.self_attn.v_proj.weight.data.shape
== torch.Size((config.hidden_size, config.hidden_size))
)
self_attn.query.weight.data = roberta_layer.self_attn.q_proj.weight
self_attn.query.bias.data = roberta_layer.self_attn.q_proj.bias
self_attn.key.weight.data = roberta_layer.self_attn.k_proj.weight
self_attn.key.bias.data = roberta_layer.self_attn.k_proj.bias
self_attn.value.weight.data = roberta_layer.self_attn.v_proj.weight
self_attn.value.bias.data = roberta_layer.self_attn.v_proj.bias
self_output: BertSelfOutput = layer.attention.output
assert self_output.dense.weight.shape == roberta_layer.self_attn.out_proj.weight.shape
self_output.dense.weight = roberta_layer.self_attn.out_proj.weight
self_output.dense.bias = roberta_layer.self_attn.out_proj.bias
self_output.LayerNorm.weight = roberta_layer.self_attn_layer_norm.weight
self_output.LayerNorm.bias = roberta_layer.self_attn_layer_norm.bias
intermediate: BertIntermediate = layer.intermediate
assert intermediate.dense.weight.shape == roberta_layer.fc1.weight.shape
intermediate.dense.weight = roberta_layer.fc1.weight
intermediate.dense.bias = roberta_layer.fc1.bias
bert_output: BertOutput = layer.output
assert bert_output.dense.weight.shape == roberta_layer.fc2.weight.shape
bert_output.dense.weight = roberta_layer.fc2.weight
bert_output.dense.bias = roberta_layer.fc2.bias
bert_output.LayerNorm.weight = roberta_layer.final_layer_norm.weight
bert_output.LayerNorm.bias = roberta_layer.final_layer_norm.bias
if classification_head:
model.classifier.dense.weight = roberta.model.classification_heads["mnli"].dense.weight
model.classifier.dense.bias = roberta.model.classification_heads["mnli"].dense.bias
model.classifier.out_proj.weight = roberta.model.classification_heads["mnli"].out_proj.weight
model.classifier.out_proj.bias = roberta.model.classification_heads["mnli"].out_proj.bias
else:
model.lm_head.dense.weight = roberta.model.encoder.lm_head.dense.weight
model.lm_head.dense.bias = roberta.model.encoder.lm_head.dense.bias
model.lm_head.layer_norm.weight = roberta.model.encoder.lm_head.layer_norm.weight
model.lm_head.layer_norm.bias = roberta.model.encoder.lm_head.layer_norm.bias
model.lm_head.decoder.weight = roberta.model.encoder.lm_head.weight
model.lm_head.decoder.bias = roberta.model.encoder.lm_head.bias
input_ids: torch.Tensor = roberta.encode(SAMPLE_TEXT).unsqueeze(0)
our_output = model(input_ids)[0]
if classification_head:
their_output = roberta.model.classification_heads["mnli"](roberta.extract_features(input_ids))
else:
their_output = roberta.model(input_ids)[0]
print(our_output.shape, their_output.shape)
max_absolute_diff = torch.max(torch.abs(our_output - their_output)).item()
print(f"max_absolute_diff = {max_absolute_diff}")
success = torch.allclose(our_output, their_output, atol=1e-3)
print("Do both models output the same tensors?", "🔥" if success else "💩")
if not success:
raise Exception("Something went wRoNg")
pathlib.Path(pytorch_dump_folder_path).mkdir(parents=True, exist_ok=True)
print(f"Saving model to {pytorch_dump_folder_path}")
model.save_pretrained(pytorch_dump_folder_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--roberta_checkpoint_path", default=None, type=str, required=True, help="Path the official PyTorch dump."
)
parser.add_argument(
"--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
)
parser.add_argument(
"--classification_head", action="store_true", help="Whether to convert a final classification head."
)
args = parser.parse_args()
convert_roberta_checkpoint_to_pytorch(
args.roberta_checkpoint_path, args.pytorch_dump_folder_path, args.classification_head
)
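For reference, here is a hedged sketch of how this conversion script is typically driven. It is not part of the file above; the paths are placeholders for a locally downloaded fairseq checkpoint directory and an output folder, and only the CLI flags defined above are real.
# Illustrative usage only (paths are hypothetical):
#
#   python convert_roberta_original_pytorch_checkpoint_to_pytorch.py \
#       --roberta_checkpoint_path ./roberta.base \
#       --pytorch_dump_folder_path ./converted-roberta-base
#
# Afterwards the converted folder can be loaded with the regular transformers API:
from transformers import RobertaForMaskedLM, RobertaTokenizer
model = RobertaForMaskedLM.from_pretrained("./converted-roberta-base")
tokenizer = RobertaTokenizer.from_pretrained("FacebookAI/roberta-base")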
.\models\roberta\modeling_flax_roberta.py
ROBERTA_START_DOCSTRING = r"""
This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading, saving and converting weights from PyTorch models)
This model is also a
[flax.linen.Module](https://flax.readthedocs.io/en/latest/api_reference/flax.linen/module.html) subclass. Use it as
a regular Flax linen Module and refer to the Flax documentation for all matter related to general usage and
behavior.
Finally, this model supports inherent JAX features such as:
- [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
- [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
- [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
- [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)
Parameters:
config ([`RobertaConfig`]): Model configuration class with all the parameters of the
model. Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights.
# Long docstring describing the input arguments accepted by the RoBERTa models
ROBERTA_INPUTS_DOCSTRING = r"""
Args:
input_ids (`numpy.ndarray` of shape `({0})`):
Indices of input sequence tokens in the vocabulary.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary
attention_mask (`numpy.ndarray` of shape `({0})`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary
token_type_ids (`numpy.ndarray` of shape `({0})`, *optional*):
Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
1]`:
- 0 corresponds to a *sentence A* token,
- 1 corresponds to a *sentence B* token.
[What are token type IDs?](../glossary
position_ids (`numpy.ndarray` of shape `({0})`, *optional*):
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
config.max_position_embeddings - 1]`.
head_mask (`numpy.ndarray` of shape `({0})`, *optional*):
Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
# FlaxRobertaEmbeddings: an nn.Module that builds the model's input embeddings
class FlaxRobertaEmbeddings(nn.Module):
"""Construct the embeddings from word, position and token_type embeddings."""
# the model configuration, typed as RobertaConfig
config: RobertaConfig
# dtype used for the computation, jnp.float32 by default
dtype: jnp.dtype = jnp.float32  # the dtype of the computation
def setup(self):
# word embedding table mapping vocabulary ids to hidden-size vectors
self.word_embeddings = nn.Embed(
self.config.vocab_size,
self.config.hidden_size,
embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
dtype=self.dtype,
)
# position embedding table mapping position ids to hidden-size vectors
self.position_embeddings = nn.Embed(
self.config.max_position_embeddings,
self.config.hidden_size,
embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
dtype=self.dtype,
)
# token-type (segment) embedding table mapping type ids to hidden-size vectors
self.token_type_embeddings = nn.Embed(
self.config.type_vocab_size,
self.config.hidden_size,
embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
dtype=self.dtype,
)
# layer normalization with the configured epsilon
self.LayerNorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype)
# dropout with the configured hidden dropout rate
self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob)
def __call__(self, input_ids, token_type_ids, position_ids, attention_mask, deterministic: bool = True):
# look up word embeddings for the input ids
inputs_embeds = self.word_embeddings(input_ids.astype("i4"))
# look up position embeddings for the position ids
position_embeds = self.position_embeddings(position_ids.astype("i4"))
# look up token-type embeddings for the segment ids
token_type_embeddings = self.token_type_embeddings(token_type_ids.astype("i4"))
# sum the three embedding contributions
hidden_states = inputs_embeds + token_type_embeddings + position_embeds
# apply layer normalization
hidden_states = self.LayerNorm(hidden_states)
# apply dropout
hidden_states = self.dropout(hidden_states, deterministic=deterministic)
# return the final hidden representation
return hidden_states
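Note that RoBERTa computes position ids differently from BERT: positions start at padding_idx + 1 and pad tokens keep the padding position, which is why max_position_embeddings is 514 rather than 512. The snippet below is a standalone, simplified 2-D sketch of the idea behind create_position_ids_from_input_ids (used further down in this file); the helper name is mine, not the library's.
import jax.numpy as jnp

def roberta_position_ids(input_ids, padding_idx=1):
    # non-pad tokens get consecutive positions starting at padding_idx + 1;
    # pad tokens keep position == padding_idx
    mask = (input_ids != padding_idx).astype("i4")
    return jnp.cumsum(mask, axis=1) * mask + padding_idx

ids = jnp.array([[0, 31414, 232, 2, 1, 1]])  # <s> ... </s> <pad> <pad>
print(roberta_position_ids(ids))             # [[2 3 4 5 1 1]]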
# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertSelfAttention with Bert -> Roberta
class FlaxRobertaSelfAttention(nn.Module):
# class attributes: the model config, `causal` toggling causal (decoder-style) attention,
# and `dtype`, the data type used for the computation (jnp.float32 by default)
config: RobertaConfig
causal: bool = False
dtype: jnp.dtype = jnp.float32  # the dtype of the computation
def setup(self):
# dimensionality of each attention head
self.head_dim = self.config.hidden_size // self.config.num_attention_heads
# hidden_size must be divisible by num_attention_heads, otherwise raise a ValueError
if self.config.hidden_size % self.config.num_attention_heads != 0:
raise ValueError(
f"`config.hidden_size`: {self.config.hidden_size} has to be a multiple of "
f"`config.num_attention_heads`: {self.config.num_attention_heads}"
)
# query, key and value projection layers, each of output size hidden_size, with normal-initialized weights
self.query = nn.Dense(
self.config.hidden_size,
dtype=self.dtype,
kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
)
self.key = nn.Dense(
self.config.hidden_size,
dtype=self.dtype,
kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
)
self.value = nn.Dense(
self.config.hidden_size,
dtype=self.dtype,
kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
)
# when causal=True, build a causal mask so that self-attention cannot look at future positions
if self.causal:
self.causal_mask = make_causal_mask(
jnp.ones((1, self.config.max_position_embeddings), dtype="bool"), dtype="bool"
)
# split the hidden states into attention heads, giving shape (batch_size, seq_length, num_attention_heads, head_dim)
def _split_heads(self, hidden_states):
return hidden_states.reshape(hidden_states.shape[:2] + (self.config.num_attention_heads, self.head_dim))
# merge the attention heads back into the hidden states, giving shape (batch_size, seq_length, hidden_size)
def _merge_heads(self, hidden_states):
return hidden_states.reshape(hidden_states.shape[:2] + (self.config.hidden_size,))
@nn.compact
# Copied from transformers.models.bart.modeling_flax_bart.FlaxBartAttention._concatenate_to_cache
def _concatenate_to_cache(self, key, value, query, attention_mask):
"""
This function takes projected key, value states from a single input token and concatenates the states to cached
states from previous steps. This function is slightly adapted from the official Flax repository:
https://github.com/google/flax/blob/491ce18759622506588784b4fca0e4bf05f8c8cd/flax/linen/attention.py
"""
# check whether the cache has already been initialized
is_initialized = self.has_variable("cache", "cached_key")
# fetch or create the cached keys and values, zero-initialized to match the input shape and dtype
cached_key = self.variable("cache", "cached_key", jnp.zeros, key.shape, key.dtype)
cached_value = self.variable("cache", "cached_value", jnp.zeros, value.shape, value.dtype)
# fetch or create the cache index, starting at integer 0
cache_index = self.variable("cache", "cache_index", lambda: jnp.array(0, dtype=jnp.int32))
if is_initialized:
# unpack the batch dims, maximum length, number of heads and per-head depth
*batch_dims, max_length, num_heads, depth_per_head = cached_key.value.shape
# update the key and value caches with the new 1d spatial slices
cur_index = cache_index.value
indices = (0,) * len(batch_dims) + (cur_index, 0, 0)
key = lax.dynamic_update_slice(cached_key.value, key, indices)
value = lax.dynamic_update_slice(cached_value.value, value, indices)
# write the updated keys and values back into the cache
cached_key.value = key
cached_value.value = value
# advance the cache index by the number of cache vectors just written
num_updated_cache_vectors = query.shape[1]
cache_index.value = cache_index.value + num_updated_cache_vectors
# causal mask for the cached decoder self-attention: a query position should only attend to
# key positions that have already been generated and cached, not the remaining zero entries
pad_mask = jnp.broadcast_to(
jnp.arange(max_length) < cur_index + num_updated_cache_vectors,
tuple(batch_dims) + (1, num_updated_cache_vectors, max_length),
)
# combine the causal pad mask with the incoming attention mask
attention_mask = combine_masks(pad_mask, attention_mask)
# return the updated key, value and attention mask
return key, value, attention_mask
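To make the cache bookkeeping above concrete, here is a small standalone toy example (not part of the model code) of how lax.dynamic_update_slice writes the key of the current decoding step into a preallocated cache buffer at the current cache index.
import jax.numpy as jnp
from jax import lax

# toy cache of shape (batch=1, max_length=4, num_heads=1, head_dim=2), already holding 2 steps
cached_key = jnp.zeros((1, 4, 1, 2)).at[:, :2].set(1.0)
new_key = jnp.full((1, 1, 1, 2), 7.0)   # projected key of the single current token
cur_index = 2                            # current value of the "cache_index" variable

updated = lax.dynamic_update_slice(cached_key, new_key, (0, cur_index, 0, 0))
# positions 0-1 keep the old keys, position 2 now holds new_key, position 3 is still empty;
# the pad_mask above would be jnp.arange(4) < 3, i.e. [True, True, True, False]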
# output projection applied after RoBERTa's self-attention
class FlaxRobertaSelfOutput(nn.Module):
config: RobertaConfig  # the RoBERTa model configuration
dtype: jnp.dtype = jnp.float32  # the dtype of the computation
def setup(self):
# dense layer projecting back to hidden_size, with normal-initialized weights
self.dense = nn.Dense(
self.config.hidden_size,
kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
dtype=self.dtype,
)
# LayerNorm used to normalize the hidden states
self.LayerNorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype)
# dropout layer for regularization
self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob)
def __call__(self, hidden_states, input_tensor, deterministic: bool = True):
# project the hidden states
hidden_states = self.dense(hidden_states)
# apply dropout
hidden_states = self.dropout(hidden_states, deterministic=deterministic)
# add the residual connection and apply LayerNorm
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
# RoBERTa attention block: self-attention followed by its output projection
class FlaxRobertaAttention(nn.Module):
config: RobertaConfig  # the RoBERTa model configuration
causal: bool = False  # whether to use causal attention
dtype: jnp.dtype = jnp.float32  # the dtype of the computation
def setup(self):
# the self-attention layer
self.self = FlaxRobertaSelfAttention(self.config, causal=self.causal, dtype=self.dtype)
# the output projection applied to the attention results
self.output = FlaxRobertaSelfOutput(self.config, dtype=self.dtype)
def __call__(
self,
hidden_states,
attention_mask,
layer_head_mask,
key_value_states=None,
init_cache=False,
deterministic=True,
output_attentions: bool = False,
):
# run self-attention (or cross-attention when key_value_states is given)
attn_outputs = self.self(
hidden_states,
attention_mask,
layer_head_mask=layer_head_mask,
key_value_states=key_value_states,
init_cache=init_cache,
deterministic=deterministic,
output_attentions=output_attentions,
)
# the attention output is the first element
attn_output = attn_outputs[0]
# project the attention output and combine it with the residual hidden states
hidden_states = self.output(attn_output, hidden_states, deterministic=deterministic)
outputs = (hidden_states,)
if output_attentions:
outputs += (attn_outputs[1],)  # also return the attention weights when requested
return outputs
# 定义一个用于Roberta模型中间层的类
class FlaxRobertaIntermediate(nn.Module):
config: RobertaConfig # Roberta模型的配置信息
dtype: jnp.dtype = jnp.float32 # 计算时使用的数据类型
def setup(self):
# 定义一个全连接层,将隐藏状态映射到中间大小,使用正态分布初始化权重
self.dense = nn.Dense(
self.config.intermediate_size,
kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
dtype=self.dtype,
)
# 激活函数,根据配置选择激活函数类型
self.activation = ACT2FN[self.config.hidden_act]
# 定义类的方法 __call__,用于对输入的 hidden_states 进行处理并返回结果
def __call__(self, hidden_states):
# 将输入的 hidden_states 通过全连接层 dense 进行线性变换
hidden_states = self.dense(hidden_states)
# 将线性变换后的结果通过激活函数 activation 进行非线性变换
hidden_states = self.activation(hidden_states)
# 返回经过线性变换和激活函数处理后的 hidden_states 结果
return hidden_states
# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertOutput with Bert -> Roberta
class FlaxRobertaOutput(nn.Module):
config: RobertaConfig # Roberta 模型的配置对象
dtype: jnp.dtype = jnp.float32 # 计算过程中使用的数据类型
def setup(self):
# 定义全连接层,将隐藏状态映射到指定大小的输出空间
self.dense = nn.Dense(
self.config.hidden_size,
kernel_init=jax.nn.initializers.normal(self.config.initializer_range), # 使用正态分布初始化权重
dtype=self.dtype,
)
# 定义 Dropout 层,用于随机屏蔽神经元,防止过拟合
self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob)
# 定义 LayerNorm 层,用于归一化隐藏状态
self.LayerNorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype)
def __call__(self, hidden_states, attention_output, deterministic: bool = True):
# 全连接层处理隐藏状态
hidden_states = self.dense(hidden_states)
# 应用 Dropout 层
hidden_states = self.dropout(hidden_states, deterministic=deterministic)
# 应用 LayerNorm 层,并将注意力输出加到处理后的隐藏状态上
hidden_states = self.LayerNorm(hidden_states + attention_output)
return hidden_states
# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertLayer with Bert -> Roberta
class FlaxRobertaLayer(nn.Module):
config: RobertaConfig # Roberta 模型的配置对象
dtype: jnp.dtype = jnp.float32 # 计算过程中使用的数据类型
def setup(self):
# 定义 Roberta 自注意力层
self.attention = FlaxRobertaAttention(self.config, causal=self.config.is_decoder, dtype=self.dtype)
# 定义 Roberta 中间层
self.intermediate = FlaxRobertaIntermediate(self.config, dtype=self.dtype)
# 定义 Roberta 输出层
self.output = FlaxRobertaOutput(self.config, dtype=self.dtype)
# 如果配置中包含跨注意力机制,则定义 Roberta 交叉注意力层
if self.config.add_cross_attention:
self.crossattention = FlaxRobertaAttention(self.config, causal=False, dtype=self.dtype)
def __call__(
self,
hidden_states,
attention_mask,
layer_head_mask,
encoder_hidden_states: Optional[jnp.ndarray] = None,
encoder_attention_mask: Optional[jnp.ndarray] = None,
init_cache: bool = False,
deterministic: bool = True,
output_attentions: bool = False,
):
# Self Attention
# run self-attention over the hidden states, passing the attention mask and related arguments
attention_outputs = self.attention(
hidden_states,
attention_mask,
layer_head_mask=layer_head_mask,
init_cache=init_cache,
deterministic=deterministic,
output_attentions=output_attentions,
)
# 获取注意力计算的输出
attention_output = attention_outputs[0]
# Cross-Attention Block
# 如果存在编码器的隐藏状态,执行交叉注意力计算
if encoder_hidden_states is not None:
# 使用 self.crossattention 方法进行交叉注意力计算,传入自注意力输出、编码器注意力掩码、编码器隐藏状态等参数
cross_attention_outputs = self.crossattention(
attention_output,
attention_mask=encoder_attention_mask,
layer_head_mask=layer_head_mask,
key_value_states=encoder_hidden_states,
deterministic=deterministic,
output_attentions=output_attentions,
)
# 获取交叉注意力计算的输出作为最终的注意力输出
attention_output = cross_attention_outputs[0]
# 使用 self.intermediate 方法进行隐藏状态的中间层处理
hidden_states = self.intermediate(attention_output)
# 使用 self.output 方法生成最终的输出隐藏状态
hidden_states = self.output(hidden_states, attention_output, deterministic=deterministic)
# 输出为隐藏状态的元组
outputs = (hidden_states,)
# 如果需要输出注意力权重信息
if output_attentions:
# 将自注意力的注意力权重添加到输出元组中
outputs += (attention_outputs[1],)
# 如果存在编码器的隐藏状态,将交叉注意力的注意力权重也添加到输出元组中
if encoder_hidden_states is not None:
outputs += (cross_attention_outputs[1],)
# 返回最终的输出元组
return outputs
# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertLayerCollection with Bert -> Roberta
class FlaxRobertaLayerCollection(nn.Module):
config: RobertaConfig  # configuration of type RobertaConfig
dtype: jnp.dtype = jnp.float32  # the dtype of the computation
gradient_checkpointing: bool = False  # whether to use gradient checkpointing
def setup(self):
if self.gradient_checkpointing:
# with gradient checkpointing enabled, wrap FlaxRobertaLayer with remat
FlaxRobertaCheckpointLayer = remat(FlaxRobertaLayer, static_argnums=(5, 6, 7))
# build the list of checkpointed layers, each named by its string index
self.layers = [
FlaxRobertaCheckpointLayer(self.config, name=str(i), dtype=self.dtype)
for i in range(self.config.num_hidden_layers)
]
else:
# otherwise build plain FlaxRobertaLayer instances, each named by its string index
self.layers = [
FlaxRobertaLayer(self.config, name=str(i), dtype=self.dtype)
for i in range(self.config.num_hidden_layers)
]
def __call__(
self,
hidden_states,
attention_mask,
head_mask,
encoder_hidden_states: Optional[jnp.ndarray] = None,
encoder_attention_mask: Optional[jnp.ndarray] = None,
init_cache: bool = False,
deterministic: bool = True,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
):
# 初始化空元组或 None,根据 output_attentions 的值确定是否返回注意力信息
all_attentions = () if output_attentions else None
# 初始化空元组或 None,根据 output_hidden_states 的值确定是否返回隐藏状态信息
all_hidden_states = () if output_hidden_states else None
# 初始化空元组或 None,根据 output_attentions 和 encoder_hidden_states 的值确定是否返回交叉注意力信息
all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
# 检查 head_mask 是否正确指定了每层的屏蔽信息
if head_mask is not None:
if head_mask.shape[0] != (len(self.layers)):
# 抛出异常,提示 head_mask 应该对应于 self.layers 的层数
raise ValueError(
f"The head_mask should be specified for {len(self.layers)} layers, but it is for "
f"{head_mask.shape[0]}."
)
# 遍历所有层,进行前向传播计算
for i, layer in enumerate(self.layers):
# 如果输出隐藏状态信息,则将当前隐藏状态加入到 all_hidden_states 中
if output_hidden_states:
all_hidden_states += (hidden_states,)
# 调用当前层的前向传播函数
layer_outputs = layer(
hidden_states,
attention_mask,
head_mask[i] if head_mask is not None else None,
encoder_hidden_states,
encoder_attention_mask,
init_cache,
deterministic,
output_attentions,
)
# 更新当前隐藏状态为当前层的输出的第一个元素(通常是隐藏状态)
hidden_states = layer_outputs[0]
# 如果输出注意力信息,则将当前层的注意力加入到 all_attentions 中
if output_attentions:
all_attentions += (layer_outputs[1],)
# 如果同时存在 encoder_hidden_states,则将当前层的交叉注意力加入到 all_cross_attentions 中
if encoder_hidden_states is not None:
all_cross_attentions += (layer_outputs[2],)
# 如果输出隐藏状态信息,则将最终的隐藏状态加入到 all_hidden_states 中
if output_hidden_states:
all_hidden_states += (hidden_states,)
# 整理最终的输出结果
outputs = (hidden_states, all_hidden_states, all_attentions, all_cross_attentions)
# 如果 return_dict 为 False,则返回 outputs 中非 None 的部分作为元组
if not return_dict:
return tuple(v for v in outputs if v is not None)
# 如果 return_dict 为 True,则将输出整理成 FlaxBaseModelOutputWithPastAndCrossAttentions 类的对象返回
return FlaxBaseModelOutputWithPastAndCrossAttentions(
last_hidden_state=hidden_states,
hidden_states=all_hidden_states,
attentions=all_attentions,
cross_attentions=all_cross_attentions,
)
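The remat wrapping in the setup above trades compute for memory: a rematerialized layer recomputes its intermediate activations during the backward pass instead of storing them, and static_argnums marks the plain-Python arguments (here init_cache, deterministic, output_attentions) that must stay static. Below is a minimal standalone sketch with names of my own choosing, not library code.
import jax
import jax.numpy as jnp
import flax.linen as nn

class Block(nn.Module):
    features: int = 4

    @nn.compact
    def __call__(self, x, scale, use_bias: bool):
        # use_bias is a plain Python bool, so it must be declared static for remat
        return nn.Dense(self.features, use_bias=use_bias)(x) * scale

# argument index 2 (use_bias) is static, mirroring static_argnums=(5, 6, 7) above
RematBlock = nn.remat(Block, static_argnums=(2,))
x = jnp.ones((2, 4))
params = RematBlock(features=4).init(jax.random.PRNGKey(0), x, 1.0, True)
y = RematBlock(features=4).apply(params, x, 1.0, True)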
# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertEncoder with Bert -> Roberta
class FlaxRobertaEncoder(nn.Module):
config: RobertaConfig  # configuration of type RobertaConfig
dtype: jnp.dtype = jnp.float32  # the dtype of the computation
gradient_checkpointing: bool = False  # whether to use gradient checkpointing
def setup(self):
self.layer = FlaxRobertaLayerCollection( # 初始化 FlaxRobertaLayerCollection 实例
self.config, # 使用给定的配置信息
dtype=self.dtype, # 使用指定的数据类型
gradient_checkpointing=self.gradient_checkpointing, # 梯度检查点设置
)
def __call__( # 定义对象调用时的行为
self,
hidden_states, # 输入的隐藏状态张量
attention_mask, # 注意力掩码张量
head_mask, # 头部掩码张量
encoder_hidden_states: Optional[jnp.ndarray] = None, # 编码器隐藏状态(可选)
encoder_attention_mask: Optional[jnp.ndarray] = None, # 编码器注意力掩码(可选)
init_cache: bool = False, # 是否初始化缓存
deterministic: bool = True, # 是否确定性计算
output_attentions: bool = False, # 是否输出注意力权重
output_hidden_states: bool = False, # 是否输出隐藏状态
return_dict: bool = True, # 是否以字典形式返回结果
):
return self.layer( # 调用 FlaxRobertaLayerCollection 的前向传播
hidden_states,
attention_mask,
head_mask=head_mask,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
init_cache=init_cache,
deterministic=deterministic,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertPooler with Bert -> Roberta
class FlaxRobertaPooler(nn.Module):
config: RobertaConfig  # configuration of type RobertaConfig
dtype: jnp.dtype = jnp.float32  # the dtype of the computation
def setup(self):
self.dense = nn.Dense(  # dense projection used for pooling
self.config.hidden_size,  # output size equals the configured hidden size
kernel_init=jax.nn.initializers.normal(self.config.initializer_range),  # normal-initialized weights
dtype=self.dtype,  # computation dtype
)
def __call__(self, hidden_states):
cls_hidden_state = hidden_states[:, 0]  # take the hidden state of the first (<s>) token of each sample
cls_hidden_state = self.dense(cls_hidden_state)  # project it through the dense layer
return nn.tanh(cls_hidden_state)  # return the tanh-activated result
class FlaxRobertaLMHead(nn.Module):
config: RobertaConfig # 使用 RobertaConfig 类型的配置信息
dtype: jnp.dtype = jnp.float32 # 计算时的数据类型为 jnp.float32
bias_init: Callable[..., np.ndarray] = jax.nn.initializers.zeros # 偏置初始化函数为零初始化
def setup(self):
self.dense = nn.Dense( # 初始化第一个密集连接层
self.config.hidden_size, # 输出大小为配置中的隐藏大小
dtype=self.dtype, # 使用指定的数据类型
kernel_init=jax.nn.initializers.normal(self.config.initializer_range), # 使用正态分布初始化权重
)
self.layer_norm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype) # 初始化 LayerNorm 层
self.decoder = nn.Dense( # 初始化解码器密集连接层
self.config.vocab_size, # 输出大小为词汇表大小
dtype=self.dtype, # 使用指定的数据类型
use_bias=False, # 不使用偏置
kernel_init=jax.nn.initializers.normal(self.config.initializer_range), # 使用正态分布初始化权重
)
self.bias = self.param("bias", self.bias_init, (self.config.vocab_size,)) # 初始化偏置参数
# forward pass: takes the hidden states and an optional shared embedding matrix
def __call__(self, hidden_states, shared_embedding=None):
# linear transformation of the hidden states
hidden_states = self.dense(hidden_states)
# GELU activation
hidden_states = ACT2FN["gelu"](hidden_states)
# layer normalization
hidden_states = self.layer_norm(hidden_states)
# if a shared embedding matrix is provided, use it as the decoder's kernel (weight tying)
if shared_embedding is not None:
# call the decoder with the transposed shared embedding injected as its kernel parameter
hidden_states = self.decoder.apply({"params": {"kernel": shared_embedding.T}}, hidden_states)
else:
# otherwise use the decoder's own parameters
hidden_states = self.decoder(hidden_states)
# cast the bias to the module's dtype
bias = jnp.asarray(self.bias, self.dtype)
# add the bias to the logits
hidden_states += bias
# return the final hidden states (vocabulary logits)
return hidden_states
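The shared_embedding branch above is how weight tying is implemented in Flax: instead of binding its own kernel, the decoder Dense is applied with the transposed word-embedding matrix injected as its parameter. Below is a standalone toy illustration of that mechanism; the shapes and values are made up.
import jax.numpy as jnp
import flax.linen as nn

vocab_size, hidden = 10, 4
decoder = nn.Dense(vocab_size, use_bias=False)
shared_embedding = jnp.ones((vocab_size, hidden))   # stands in for word_embeddings.embedding
hidden_states = jnp.ones((1, 3, hidden))

# the kernel of a Dense layer has shape (in_features, out_features), hence the transpose
logits = decoder.apply({"params": {"kernel": shared_embedding.T}}, hidden_states)
print(logits.shape)  # (1, 3, 10)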
class FlaxRobertaClassificationHead(nn.Module):
config: RobertaConfig # 定义一个属性 config,类型为 RobertaConfig,用于存储配置信息
dtype: jnp.dtype = jnp.float32 # 定义一个属性 dtype,默认为 jnp.float32
def setup(self):
self.dense = nn.Dense(
self.config.hidden_size, # 使用 config 中的 hidden_size 初始化一个全连接层
dtype=self.dtype, # 指定数据类型为 dtype
kernel_init=jax.nn.initializers.normal(self.config.initializer_range), # 使用正态分布初始化权重
)
classifier_dropout = (
self.config.classifier_dropout # 获取 config 中的 classifier_dropout 属性
if self.config.classifier_dropout is not None # 如果不为 None,则使用该值
else self.config.hidden_dropout_prob # 否则使用 config 中的 hidden_dropout_prob 属性的值
)
self.dropout = nn.Dropout(rate=classifier_dropout) # 使用 classifier_dropout 率初始化一个 Dropout 层
self.out_proj = nn.Dense(
self.config.num_labels, # 使用 config 中的 num_labels 初始化一个全连接层
dtype=self.dtype, # 指定数据类型为 dtype
kernel_init=jax.nn.initializers.normal(self.config.initializer_range), # 使用正态分布初始化权重
)
def __call__(self, hidden_states, deterministic=True):
hidden_states = hidden_states[:, 0, :] # 仅保留每个样本的第一个 token 的隐藏状态
hidden_states = self.dropout(hidden_states, deterministic=deterministic) # 应用 Dropout
hidden_states = self.dense(hidden_states) # 全连接层处理隐藏状态
hidden_states = nn.tanh(hidden_states) # 使用双曲正切作为激活函数
hidden_states = self.dropout(hidden_states, deterministic=deterministic) # 再次应用 Dropout
hidden_states = self.out_proj(hidden_states) # 使用最后一个全连接层进行最终的分类预测
return hidden_states
class FlaxRobertaPreTrainedModel(FlaxPreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = RobertaConfig # 类属性,指定配置类为 RobertaConfig
base_model_prefix = "roberta" # 类属性,指定基础模型前缀为 "roberta"
module_class: nn.Module = None # 类属性,用于存储模块类,默认为 None
def __init__(
self,
config: RobertaConfig,
input_shape: Tuple = (1, 1),
seed: int = 0,
dtype: jnp.dtype = jnp.float32,
_do_init: bool = True,
gradient_checkpointing: bool = False,
**kwargs,
):
module = self.module_class(config=config, dtype=dtype, gradient_checkpointing=gradient_checkpointing, **kwargs)
super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init)
# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertPreTrainedModel.enable_gradient_checkpointing
def enable_gradient_checkpointing(self):
self._module = self.module_class(
config=self.config, # 使用当前实例的 config 属性初始化模块
dtype=self.dtype, # 使用当前实例的 dtype 属性指定数据类型
gradient_checkpointing=True, # 启用梯度检查点
)
# 初始化权重方法,用于模型参数初始化
def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple, params: FrozenDict = None) -> FrozenDict:
# 初始化输入张量
input_ids = jnp.zeros(input_shape, dtype="i4")
# token_type_ids初始化为与input_ids相同形状的全1张量
token_type_ids = jnp.ones_like(input_ids)
# 根据input_ids创建position_ids,并用config中的pad_token_id进行填充
position_ids = create_position_ids_from_input_ids(input_ids, self.config.pad_token_id)
# attention_mask初始化为与input_ids相同形状的全1张量
attention_mask = jnp.ones_like(input_ids)
# head_mask初始化为形状为(config.num_hidden_layers, config.num_attention_heads)的全1张量
head_mask = jnp.ones((self.config.num_hidden_layers, self.config.num_attention_heads))
# 划分随机数生成器rng,分为params和dropout两个部分
params_rng, dropout_rng = jax.random.split(rng)
rngs = {"params": params_rng, "dropout": dropout_rng}
# 如果config中包含cross-attention,初始化encoder_hidden_states和encoder_attention_mask
if self.config.add_cross_attention:
encoder_hidden_states = jnp.zeros(input_shape + (self.config.hidden_size,))
encoder_attention_mask = attention_mask
# 使用module的init方法初始化模块,传入必要参数,不返回字典
module_init_outputs = self.module.init(
rngs,
input_ids,
attention_mask,
token_type_ids,
position_ids,
head_mask,
encoder_hidden_states,
encoder_attention_mask,
return_dict=False,
)
else:
# 使用module的init方法初始化模块,传入必要参数,不返回字典
module_init_outputs = self.module.init(
rngs, input_ids, attention_mask, token_type_ids, position_ids, head_mask, return_dict=False
)
# 从初始化输出中获取随机参数
random_params = module_init_outputs["params"]
# 如果传入了params,则将随机参数与params进行融合
if params is not None:
random_params = flatten_dict(unfreeze(random_params))
params = flatten_dict(unfreeze(params))
for missing_key in self._missing_keys:
# 将随机参数中缺失的键添加到params中
params[missing_key] = random_params[missing_key]
self._missing_keys = set()
# 返回融合后的冻结params
return freeze(unflatten_dict(params))
else:
# 否则直接返回随机参数
return random_params
# cache-initialization helper, copied from the corresponding method elsewhere in the transformers library
def init_cache(self, batch_size, max_length):
r"""
Args:
batch_size (`int`):
batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache.
max_length (`int`):
maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized cache.
"""
# 初始化用于检索缓存的输入变量
input_ids = jnp.ones((batch_size, max_length), dtype="i4")
# attention_mask初始化为与input_ids相同形状的全1张量
attention_mask = jnp.ones_like(input_ids, dtype="i4")
# position_ids根据input_ids广播而来,形状与input_ids相同
position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape)
# 使用module的init方法初始化模块,传入必要参数,返回不包含字典的初始化变量
init_variables = self.module.init(
jax.random.PRNGKey(0), input_ids, attention_mask, position_ids, return_dict=False, init_cache=True
)
# 返回解冻的初始化变量中的cache
return unfreeze(init_variables["cache"])
# add the inputs docstring to the model's forward (__call__) method
@add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
# __call__ makes the model instance callable like a function
def __call__(
self,
# input token IDs fed to the model
input_ids,
# attention mask marking which positions the model should attend to
attention_mask=None,
# token type IDs distinguishing tokens of different segments
token_type_ids=None,
# position IDs giving each token's position in the sequence
position_ids=None,
# head mask controlling which attention heads are active
head_mask=None,
# encoder hidden states, used when the model acts as a decoder
encoder_hidden_states=None,
# encoder attention mask marking which encoder positions to attend to
encoder_attention_mask=None,
# optional dict of model parameters
params: dict = None,
# PRNG key used for dropout
dropout_rng: jax.random.PRNGKey = None,
# whether we are in training mode (controls dropout)
train: bool = False,
# whether to return the attention matrices
output_attentions: Optional[bool] = None,
# whether to return the hidden states
output_hidden_states: Optional[bool] = None,
# whether to return a ModelOutput dict instead of a plain tuple
return_dict: Optional[bool] = None,
# cached key/value states used for fast auto-regressive decoding
past_key_values: dict = None,
):
# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertModule with Bert -> Roberta
class FlaxRobertaModule(nn.Module):
config: RobertaConfig
dtype: jnp.dtype = jnp.float32 # 计算中的数据类型
add_pooling_layer: bool = True # 是否添加池化层
gradient_checkpointing: bool = False # 是否使用梯度检查点
def setup(self):
self.embeddings = FlaxRobertaEmbeddings(self.config, dtype=self.dtype) # 初始化Roberta模型的嵌入层
self.encoder = FlaxRobertaEncoder(
self.config,
dtype=self.dtype,
gradient_checkpointing=self.gradient_checkpointing,
) # 初始化Roberta模型的编码器
self.pooler = FlaxRobertaPooler(self.config, dtype=self.dtype) # 初始化Roberta模型的池化层
def __call__(
self,
input_ids,
attention_mask,
token_type_ids: Optional[jnp.ndarray] = None,
position_ids: Optional[jnp.ndarray] = None,
head_mask: Optional[jnp.ndarray] = None,
encoder_hidden_states: Optional[jnp.ndarray] = None,
encoder_attention_mask: Optional[jnp.ndarray] = None,
init_cache: bool = False,
deterministic: bool = True,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
):
# 当token_type_ids未传递时,确保其正确初始化为零数组
if token_type_ids is None:
token_type_ids = jnp.zeros_like(input_ids)
# 当position_ids未传递时,确保其正确初始化为广播到适当形状的数组
if position_ids is None:
position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape)
# 使用嵌入层处理输入,生成隐藏状态
hidden_states = self.embeddings(
input_ids, token_type_ids, position_ids, attention_mask, deterministic=deterministic
)
# 使用编码器处理隐藏状态,返回模型输出
outputs = self.encoder(
hidden_states,
attention_mask,
head_mask=head_mask,
deterministic=deterministic,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
init_cache=init_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
hidden_states = outputs[0]  # hidden states returned by the encoder
# apply the pooling layer when one was requested, otherwise leave `pooled` as None
pooled = self.pooler(hidden_states) if self.add_pooling_layer else None
# if a plain tuple is requested instead of a dict
if not return_dict:
# don't include `pooled` when it is None
if pooled is None:
return (hidden_states,) + outputs[1:]
return (hidden_states, pooled) + outputs[1:]
# return the base model output with pooling and cross attentions
return FlaxBaseModelOutputWithPoolingAndCrossAttentions(
last_hidden_state=hidden_states,
pooler_output=pooled,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
cross_attentions=outputs.cross_attentions,
)
@add_start_docstrings(
"The bare RoBERTa Model transformer outputting raw hidden-states without any specific head on top.",
ROBERTA_START_DOCSTRING,
)
class FlaxRobertaModel(FlaxRobertaPreTrainedModel):
module_class = FlaxRobertaModule
# attach an example call docstring to the model class
append_call_sample_docstring(FlaxRobertaModel, _CHECKPOINT_FOR_DOC, FlaxBaseModelOutputWithPooling, _CONFIG_FOR_DOC)
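As a quick sanity check of the bare model class, the usual usage pattern looks roughly like this (the checkpoint name is the one used by this file's docstring constants); this snippet is illustrative and not part of the library source.
from transformers import AutoTokenizer, FlaxRobertaModel

tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
model = FlaxRobertaModel.from_pretrained("FacebookAI/roberta-base")

inputs = tokenizer("Hello world!", return_tensors="np")
outputs = model(**inputs)
print(outputs.last_hidden_state.shape)  # (1, sequence_length, 768)
print(outputs.pooler_output.shape)      # (1, 768)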
class FlaxRobertaForMaskedLMModule(nn.Module):
config: RobertaConfig
dtype: jnp.dtype = jnp.float32
gradient_checkpointing: bool = False
def setup(self):
# 初始化 RoBERTa 模型
self.roberta = FlaxRobertaModule(
config=self.config,
add_pooling_layer=False,
dtype=self.dtype,
gradient_checkpointing=self.gradient_checkpointing,
)
# 初始化 RoBERTa 语言模型头部
self.lm_head = FlaxRobertaLMHead(config=self.config, dtype=self.dtype)
def __call__(
self,
input_ids,
attention_mask,
token_type_ids,
position_ids,
head_mask,
deterministic: bool = True,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
):
# 调用 RoBERTa 模型,获取模型输出
outputs = self.roberta(
input_ids,
attention_mask,
token_type_ids,
position_ids,
head_mask,
deterministic=deterministic,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
hidden_states = outputs[0]
# fetch the shared word embedding matrix when the config ties input and output embeddings
if self.config.tie_word_embeddings:
shared_embedding = self.roberta.variables["params"]["embeddings"]["word_embeddings"]["embedding"]
else:
shared_embedding = None
# compute the prediction scores (logits) over the vocabulary
logits = self.lm_head(hidden_states, shared_embedding=shared_embedding)
# 根据 return_dict 决定返回的格式
if not return_dict:
return (logits,) + outputs[1:]
# 返回 RoBERTa 对象的输出作为 MaskedLM 的输出
return FlaxMaskedLMOutput(
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
@add_start_docstrings("""RoBERTa Model with a `language modeling` head on top.""", ROBERTA_START_DOCSTRING)
class FlaxRobertaForMaskedLM(FlaxRobertaPreTrainedModel):
module_class = FlaxRobertaForMaskedLMModule
# 调用函数向模型类添加示例文档字符串
append_call_sample_docstring(
FlaxRobertaForMaskedLM,
_CHECKPOINT_FOR_DOC,
FlaxBaseModelOutputWithPooling,
_CONFIG_FOR_DOC,
mask="<mask>",
)
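Here is a rough fill-mask example for the masked-LM head defined above; it is a standalone sketch that simply picks the highest-scoring token at the <mask> position, without beam search or top-k filtering.
from transformers import AutoTokenizer, FlaxRobertaForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
model = FlaxRobertaForMaskedLM.from_pretrained("FacebookAI/roberta-base")

inputs = tokenizer("The capital of France is <mask>.", return_tensors="np")
logits = model(**inputs).logits

mask_index = int((inputs["input_ids"][0] == tokenizer.mask_token_id).argmax())
predicted_id = int(logits[0, mask_index].argmax(-1))
print(tokenizer.decode([predicted_id]))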
class FlaxRobertaForSequenceClassificationModule(nn.Module):
config: RobertaConfig
dtype: jnp.dtype = jnp.float32
gradient_checkpointing: bool = False
def setup(self):
# 初始化 RoBERTa 模型
self.roberta = FlaxRobertaModule(
config=self.config,
dtype=self.dtype,
add_pooling_layer=False,
gradient_checkpointing=self.gradient_checkpointing,
)
# 初始化 RoBERTa 序列分类头部
self.classifier = FlaxRobertaClassificationHead(config=self.config, dtype=self.dtype)
# 定义一个方法,使得对象可以像函数一样被调用,接受多个输入参数和多个关键字参数
def __call__(
self,
input_ids,
attention_mask,
token_type_ids,
position_ids,
head_mask,
deterministic: bool = True, # 控制模型行为的布尔参数,默认为True
output_attentions: bool = False, # 是否输出注意力权重的布尔参数,默认为False
output_hidden_states: bool = False, # 是否输出隐藏状态的布尔参数,默认为False
return_dict: bool = True, # 是否返回结果字典的布尔参数,默认为True
):
# 使用预训练模型进行前向传播
outputs = self.roberta(
input_ids,
attention_mask,
token_type_ids,
position_ids,
head_mask,
deterministic=deterministic,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# 提取模型输出的序列输出(通常是最后一层隐藏状态)
sequence_output = outputs[0]
# 使用分类器对序列输出进行分类,得到预测的逻辑回归结果
logits = self.classifier(sequence_output, deterministic=deterministic)
# 如果不要求返回结果字典,则返回一个元组,包含逻辑回归结果和其他输出
if not return_dict:
return (logits,) + outputs[1:]
# 否则,返回一个FlaxSequenceClassifierOutput对象,包含逻辑回归结果、隐藏状态和注意力权重
return FlaxSequenceClassifierOutput(
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
@add_start_docstrings(
"""
Roberta Model transformer with a sequence classification/regression head on top (a linear layer on top of the
pooled output) e.g. for GLUE tasks.
""",
ROBERTA_START_DOCSTRING,
)
# 使用装饰器添加文档字符串,描述这是一个在Roberta模型基础上构建的序列分类/回归模型,顶部有一个线性层作为池化输出的一部分,例如用于GLUE任务。
class FlaxRobertaForSequenceClassification(FlaxRobertaPreTrainedModel):
module_class = FlaxRobertaForSequenceClassificationModule
append_call_sample_docstring(
FlaxRobertaForSequenceClassification,
_CHECKPOINT_FOR_DOC,
FlaxSequenceClassifierOutput,
_CONFIG_FOR_DOC,
)
# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertForMultipleChoiceModule with Bert->Roberta, self.bert->self.roberta
class FlaxRobertaForMultipleChoiceModule(nn.Module):
config: RobertaConfig
dtype: jnp.dtype = jnp.float32
gradient_checkpointing: bool = False
def setup(self):
# 初始化Roberta模块,包括配置、数据类型和梯度检查点设置
self.roberta = FlaxRobertaModule(
config=self.config,
dtype=self.dtype,
gradient_checkpointing=self.gradient_checkpointing,
)
# 使用配置中的隐藏层dropout率初始化dropout层
self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob)
# 初始化分类器层,输出为1,数据类型与配置中的隐藏层一致
self.classifier = nn.Dense(1, dtype=self.dtype)
def __call__(
self,
input_ids,
attention_mask,
token_type_ids,
position_ids,
head_mask,
deterministic: bool = True,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
):
# number of answer choices per example
num_choices = input_ids.shape[1]
# flatten the (batch_size, num_choices, seq_len) inputs so every choice is scored as its own sequence
input_ids = input_ids.reshape(-1, input_ids.shape[-1]) if input_ids is not None else None
attention_mask = attention_mask.reshape(-1, attention_mask.shape[-1]) if attention_mask is not None else None
token_type_ids = token_type_ids.reshape(-1, token_type_ids.shape[-1]) if token_type_ids is not None else None
position_ids = position_ids.reshape(-1, position_ids.shape[-1]) if position_ids is not None else None
# 调用Roberta模型
outputs = self.roberta(
input_ids,
attention_mask,
token_type_ids,
position_ids,
head_mask,
deterministic=deterministic,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# take the pooled output
pooled_output = outputs[1]
# apply dropout to the pooled output (behavior controlled by `deterministic`)
pooled_output = self.dropout(pooled_output, deterministic=deterministic)
# pass the result through the classifier to get one logit per flattened sequence
logits = self.classifier(pooled_output)
# reshape the logits back to (batch_size, num_choices)
reshaped_logits = logits.reshape(-1, num_choices)
if not return_dict:
# return a plain tuple when no dict output is requested
return (reshaped_logits,) + outputs[2:]
# return the multiple-choice output, including logits, hidden states and attentions
return FlaxMultipleChoiceModelOutput(
logits=reshaped_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
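The reshape dance in the module above is easiest to see on toy shapes: every (example, choice) pair is scored as an independent sequence and the per-sequence scores are folded back into one row per example. A standalone illustration, with arbitrary numbers:
import jax.numpy as jnp

batch_size, num_choices, seq_len = 2, 3, 5
input_ids = jnp.arange(batch_size * num_choices * seq_len).reshape(batch_size, num_choices, seq_len)

flat = input_ids.reshape(-1, input_ids.shape[-1])   # (6, 5): fed through RoBERTa in one batch
logits = jnp.zeros((flat.shape[0], 1))              # the classifier yields one score per flattened sequence
reshaped_logits = logits.reshape(-1, num_choices)   # (2, 3): one score per candidate choice
print(flat.shape, reshaped_logits.shape)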
@add_start_docstrings(
"""
Roberta Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
softmax) e.g. for RocStories/SWAG tasks.
""",
# ROBERTA_START_DOCSTRING supplies the shared model-level documentation
ROBERTA_START_DOCSTRING,
)
# FlaxRobertaForMultipleChoice just points its module_class at FlaxRobertaForMultipleChoiceModule
class FlaxRobertaForMultipleChoice(FlaxRobertaPreTrainedModel):
module_class = FlaxRobertaForMultipleChoiceModule
# overwrite the call docstring of FlaxRobertaForMultipleChoice, formatting ROBERTA_INPUTS_DOCSTRING for multiple-choice inputs
overwrite_call_docstring(
FlaxRobertaForMultipleChoice, ROBERTA_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")
)
# append an example call docstring to FlaxRobertaForMultipleChoice
append_call_sample_docstring(
FlaxRobertaForMultipleChoice,
_CHECKPOINT_FOR_DOC,
FlaxMultipleChoiceModelOutput,
_CONFIG_FOR_DOC,
)
# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertForTokenClassificationModule with Bert->Roberta, self.bert->self.roberta
class FlaxRobertaForTokenClassificationModule(nn.Module):
config: RobertaConfig
dtype: jnp.dtype = jnp.float32
gradient_checkpointing: bool = False
def setup(self):
# 使用 FlaxRobertaModule 创建 self.roberta,配置为不添加池化层,是否进行梯度检查点由 self.gradient_checkpointing 控制
self.roberta = FlaxRobertaModule(
config=self.config,
dtype=self.dtype,
add_pooling_layer=False,
gradient_checkpointing=self.gradient_checkpointing,
)
# 根据配置设置分类器的 dropout 率
classifier_dropout = (
self.config.classifier_dropout
if self.config.classifier_dropout is not None
else self.config.hidden_dropout_prob
)
# 创建 dropout 层
self.dropout = nn.Dropout(rate=classifier_dropout)
# 创建分类器,输出维度为 self.config.num_labels
self.classifier = nn.Dense(self.config.num_labels, dtype=self.dtype)
def __call__(
self,
input_ids,
attention_mask,
token_type_ids,
position_ids,
head_mask,
deterministic: bool = True,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
):
# 调用 self.roberta 进行模型前向传播
outputs = self.roberta(
input_ids,
attention_mask,
token_type_ids,
position_ids,
head_mask,
deterministic=deterministic,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# 获取模型的隐藏状态
hidden_states = outputs[0]
# 对隐藏状态应用 dropout
hidden_states = self.dropout(hidden_states, deterministic=deterministic)
# 对应用 dropout 后的隐藏状态应用分类器,得到 logits
logits = self.classifier(hidden_states)
# 如果不返回字典,则返回 logits 以及 outputs 中的其余部分
if not return_dict:
return (logits,) + outputs[1:]
# 返回 FlaxTokenClassifierOutput 对象,包含 logits、隐藏状态和注意力权重
return FlaxTokenClassifierOutput(
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
# 为 FlaxRobertaForTokenClassification 类添加起始文档字符串,描述其为在隐藏状态之上具有标记分类头的 Roberta 模型,例如用于命名实体识别 (NER) 任务
@add_start_docstrings(
"""
Roberta Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
Named-Entity-Recognition (NER) tasks.
""",
ROBERTA_START_DOCSTRING,
)
class FlaxRobertaForTokenClassification(FlaxRobertaPreTrainedModel):
module_class = FlaxRobertaForTokenClassificationModule
# 向 FlaxRobertaForTokenClassification 类的示例调用文档字符串附加示例代码和相关说明
append_call_sample_docstring(
FlaxRobertaForTokenClassification,
_CHECKPOINT_FOR_DOC,
FlaxTokenClassifierOutput,
_CONFIG_FOR_DOC,
)
# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertForQuestionAnsweringModule with Bert->Roberta, self.bert->self.roberta
class FlaxRobertaForQuestionAnsweringModule(nn.Module):
config: RobertaConfig
dtype: jnp.dtype = jnp.float32
gradient_checkpointing: bool = False
def setup(self):
# 初始化Roberta模型作为self.roberta,不包含池化层,支持梯度检查点
self.roberta = FlaxRobertaModule(
config=self.config,
dtype=self.dtype,
add_pooling_layer=False,
gradient_checkpointing=self.gradient_checkpointing,
)
# 初始化用于QA任务的输出层self.qa_outputs,包含num_labels个输出单元
self.qa_outputs = nn.Dense(self.config.num_labels, dtype=self.dtype)
def __call__(
self,
input_ids,
attention_mask,
token_type_ids,
position_ids,
head_mask,
deterministic: bool = True,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
):
# 调用Roberta模型self.roberta进行前向传播
outputs = self.roberta(
input_ids,
attention_mask,
token_type_ids,
position_ids,
head_mask,
deterministic=deterministic,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# take the hidden states from the outputs
hidden_states = outputs[0]
# run the hidden states through self.qa_outputs to get start- and end-position logits
logits = self.qa_outputs(hidden_states)
start_logits, end_logits = jnp.split(logits, self.config.num_labels, axis=-1)
start_logits = start_logits.squeeze(-1)
end_logits = end_logits.squeeze(-1)
# if no dict is requested, return the logits together with any extra outputs
if not return_dict:
return (start_logits, end_logits) + outputs[1:]
# return the QA model output with start/end logits, hidden states and attentions
return FlaxQuestionAnsweringModelOutput(
start_logits=start_logits,
end_logits=end_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
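Downstream, the start/end logits produced above are usually turned into an answer span. Below is a deliberately naive standalone sketch of that post-processing; it picks the argmax of each head independently and ignores invalid or out-of-context spans.
import jax.numpy as jnp

start_logits = jnp.array([[0.1, 2.0, 0.3, 0.1]])
end_logits = jnp.array([[0.0, 0.2, 0.1, 3.0]])

start_index = int(start_logits[0].argmax())
end_index = int(end_logits[0].argmax())
answer_token_span = (start_index, end_index)  # would be mapped back to text with the tokenizer's offsets
print(answer_token_span)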
@add_start_docstrings(
"""
为抽取式问答任务(如SQuAD)设计的Roberta模型,顶部有一个用于span分类的线性层,
用于计算`span start logits`和`span end logits`的隐藏状态输出。
""",
ROBERTA_START_DOCSTRING,
)
class FlaxRobertaForQuestionAnswering(FlaxRobertaPreTrainedModel):
# 使用FlaxRobertaForQuestionAnsweringModule作为模型类
module_class = FlaxRobertaForQuestionAnsweringModule
# 添加调用示例的文档字符串
append_call_sample_docstring(
FlaxRobertaForQuestionAnswering,
_CHECKPOINT_FOR_DOC,
FlaxQuestionAnsweringModelOutput,
_CONFIG_FOR_DOC,
)
class FlaxRobertaForCausalLMModule(nn.Module):
config: RobertaConfig
dtype: jnp.dtype = jnp.float32
gradient_checkpointing: bool = False
def setup(self):
# 初始化Roberta模型作为self.roberta,不包含池化层,支持梯度检查点
self.roberta = FlaxRobertaModule(
config=self.config,
add_pooling_layer=False,
dtype=self.dtype,
gradient_checkpointing=self.gradient_checkpointing,
)
# 初始化用于Causal LM任务的LM头部self.lm_head
self.lm_head = FlaxRobertaLMHead(config=self.config, dtype=self.dtype)
# 定义一个特殊方法 __call__,用于实现对象的可调用行为
def __call__(
self,
input_ids,
attention_mask,
position_ids,
token_type_ids: Optional[jnp.ndarray] = None,
head_mask: Optional[jnp.ndarray] = None,
encoder_hidden_states: Optional[jnp.ndarray] = None,
encoder_attention_mask: Optional[jnp.ndarray] = None,
init_cache: bool = False,
deterministic: bool = True,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
):
# 调用模型的主体部分
outputs = self.roberta(
input_ids,
attention_mask,
token_type_ids,
position_ids,
head_mask,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
init_cache=init_cache,
deterministic=deterministic,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# 从模型输出中获取隐藏状态
hidden_states = outputs[0]
# 如果配置允许词嵌入共享
if self.config.tie_word_embeddings:
# 获取共享的词嵌入层
shared_embedding = self.roberta.variables["params"]["embeddings"]["word_embeddings"]["embedding"]
else:
shared_embedding = None
# 计算预测分数(logits)
logits = self.lm_head(hidden_states, shared_embedding=shared_embedding)
# 如果不要求返回字典形式的结果,则返回元组形式的输出
if not return_dict:
return (logits,) + outputs[1:]
# 返回带有交叉注意力的因果语言模型输出对象
return FlaxCausalLMOutputWithCrossAttentions(
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
cross_attentions=outputs.cross_attentions,
)
"""
Roberta Model with a language modeling head on top (a linear layer on top of the hidden-states output) e.g for
autoregressive tasks.
"""
# FlaxRobertaForCausalLM: RoBERTa with a language-modeling head on top, intended for autoregressive (causal) generation tasks
@add_start_docstrings(
"""
Roberta Model with a language modeling head on top (a linear layer on top of the hidden-states output) e.g for
autoregressive tasks.
""",
ROBERTA_START_DOCSTRING,
)
class FlaxRobertaForCausalLM(FlaxRobertaPreTrainedModel):
# 设置模型的主体类为 FlaxRobertaForCausalLMModule
module_class = FlaxRobertaForCausalLMModule
# prepare the inputs used during generation, including the initialized cache and attention mask
def prepare_inputs_for_generation(self, input_ids, max_length, attention_mask: Optional[jax.Array] = None):
# read off the batch size and sequence length of the inputs
batch_size, seq_length = input_ids.shape
# initialize the past_key_values cache
past_key_values = self.init_cache(batch_size, max_length)
# Note: usually one would have to put 0s in the attention_mask for positions beyond input_ids.shape[-1]
# and before cache_length. But since the decoder uses a causal mask, those positions are masked anyway.
# So a single static attention_mask can be created here, which is more efficient to compile.
extended_attention_mask = jnp.ones((batch_size, max_length), dtype="i4")
if attention_mask is not None:
# derive the position IDs from the attention mask
position_ids = attention_mask.cumsum(axis=-1) - 1
# copy the provided attention_mask into the static extended_attention_mask
extended_attention_mask = lax.dynamic_update_slice(extended_attention_mask, attention_mask, (0, 0))
else:
# when no attention_mask is given, broadcast sequential position IDs
position_ids = jnp.broadcast_to(jnp.arange(seq_length, dtype="i4")[None, :], (batch_size, seq_length))
return {
"past_key_values": past_key_values,
"attention_mask": extended_attention_mask,
"position_ids": position_ids,
}
# update the generation inputs between steps: carry over past_key_values and advance position_ids
def update_inputs_for_generation(self, model_outputs, model_kwargs):
model_kwargs["past_key_values"] = model_outputs.past_key_values
model_kwargs["position_ids"] = model_kwargs["position_ids"][:, -1:] + 1
return model_kwargs
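These two hooks are driven by generate() from the Flax generation mixin. Below is a heavily hedged sketch of the plumbing: FacebookAI/roberta-base is not trained autoregressively, so the generated text itself is meaningless, and loading it into the causal-LM class will emit weight-initialization warnings; the point is only to show where prepare_inputs_for_generation and update_inputs_for_generation come into play.
from transformers import AutoTokenizer, FlaxRobertaForCausalLM

tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
model = FlaxRobertaForCausalLM.from_pretrained("FacebookAI/roberta-base", is_decoder=True)

inputs = tokenizer("Hello", return_tensors="np")
# generate() calls prepare_inputs_for_generation once, then update_inputs_for_generation per step
outputs = model.generate(inputs["input_ids"], max_length=10, pad_token_id=tokenizer.pad_token_id)
print(tokenizer.decode(outputs.sequences[0]))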
# append an example call docstring, specifying the model class, checkpoint, output class and config
append_call_sample_docstring(
FlaxRobertaForCausalLM,
_CHECKPOINT_FOR_DOC,
FlaxCausalLMOutputWithCrossAttentions,
_CONFIG_FOR_DOC,
)
.\models\roberta\modeling_roberta.py
"""PyTorch RoBERTa model."""
import math
from typing import List, Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN, gelu
from ...modeling_outputs import (
BaseModelOutputWithPastAndCrossAttentions,
BaseModelOutputWithPoolingAndCrossAttentions,
CausalLMOutputWithCrossAttentions,
MaskedLMOutput,
MultipleChoiceModelOutput,
QuestionAnsweringModelOutput,
SequenceClassifierOutput,
TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import (
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_roberta import RobertaConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "FacebookAI/roberta-base"
_CONFIG_FOR_DOC = "RobertaConfig"
ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST = [
"FacebookAI/roberta-base",
"FacebookAI/roberta-large",
"FacebookAI/roberta-large-mnli",
"distilbert/distilroberta-base",
"openai-community/roberta-base-openai-detector",
"openai-community/roberta-large-openai-detector",
]
class RobertaEmbeddings(nn.Module):
"""
Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
"""
def __init__(self, config):
super().__init__()
self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
self.register_buffer(
"token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
)
self.padding_idx = config.pad_token_id
self.position_embeddings = nn.Embedding(
config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
)
def forward(
self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0
):
if position_ids is None:
if input_ids is not None:
position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length)
else:
position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)
if input_ids is not None:
input_shape = input_ids.size()
else:
input_shape = inputs_embeds.size()[:-1]
seq_length = input_shape[1]
if token_type_ids is None:
if hasattr(self, "token_type_ids"):
buffered_token_type_ids = self.token_type_ids[:, :seq_length]
buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
token_type_ids = buffered_token_type_ids_expanded
else:
token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
if inputs_embeds is None:
inputs_embeds = self.word_embeddings(input_ids)
token_type_embeddings = self.token_type_embeddings(token_type_ids)
embeddings = inputs_embeds + token_type_embeddings
if self.position_embedding_type == "absolute":
position_embeddings = self.position_embeddings(position_ids)
embeddings += position_embeddings
embeddings = self.LayerNorm(embeddings)
embeddings = self.dropout(embeddings)
return embeddings
def create_position_ids_from_inputs_embeds(self, inputs_embeds):
"""
We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.
Args:
inputs_embeds: torch.Tensor
Returns: torch.Tensor
"""
input_shape = inputs_embeds.size()[:-1]
sequence_length = input_shape[1]
position_ids = torch.arange(
self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
)
return position_ids.unsqueeze(0).expand(input_shape)
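create_position_ids_from_input_ids, referenced in the forward pass above but not shown in this excerpt, is the padding-aware counterpart of the method we just saw. Below is a hedged sketch of what it does; the underscored helper name stresses that this is an illustration, not the library function itself.
import torch

def _position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
    # non-pad tokens get consecutive positions starting after padding_idx; pad tokens stay at padding_idx
    mask = input_ids.ne(padding_idx).int()
    incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
    return incremental_indices.long() + padding_idx

ids = torch.tensor([[0, 31414, 232, 2, 1, 1]])           # 1 is RoBERTa's pad_token_id
print(_position_ids_from_input_ids(ids, padding_idx=1))  # tensor([[2, 3, 4, 5, 1, 1]])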
class RobertaSelfAttention(nn.Module):
def __init__(self, config, position_embedding_type=None):
super().__init__()
if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
raise ValueError(
f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
f"heads ({config.num_attention_heads})"
)
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.query = nn.Linear(config.hidden_size, self.all_head_size)
self.key = nn.Linear(config.hidden_size, self.all_head_size)
self.value = nn.Linear(config.hidden_size, self.all_head_size)
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
self.position_embedding_type = position_embedding_type or getattr(
config, "position_embedding_type", "absolute"
)
if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
self.max_position_embeddings = config.max_position_embeddings
self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)
self.is_decoder = config.is_decoder
def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
x = x.view(new_x_shape)
return x.permute(0, 2, 1, 3)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
output_attentions: Optional[bool] = False,
class RobertaSelfOutput(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
class RobertaAttention(nn.Module):
def __init__(self, config, position_embedding_type=None):
super().__init__()
self.self = RobertaSelfAttention(config, position_embedding_type=position_embedding_type)
self.output = RobertaSelfOutput(config)
self.pruned_heads = set()
def prune_heads(self, heads):
if len(heads) == 0:
return
heads, index = find_pruneable_heads_and_indices(
heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
)
self.self.query = prune_linear_layer(self.self.query, index)
self.self.key = prune_linear_layer(self.self.key, index)
self.self.value = prune_linear_layer(self.self.value, index)
self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
self.pruned_heads = self.pruned_heads.union(heads)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor]:
self_outputs = self.self(
hidden_states,
attention_mask,
head_mask,
encoder_hidden_states,
encoder_attention_mask,
past_key_value,
output_attentions,
)
attention_output = self.output(self_outputs[0], hidden_states)
outputs = (attention_output,) + self_outputs[1:]
return outputs
class RobertaIntermediate(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states
class RobertaOutput(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
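Taken together, `RobertaIntermediate` and `RobertaOutput` form the position-wise feed-forward block. The sketch below re-implements that data flow with plain `torch.nn` layers (toy sizes, GELU assumed as the activation, as in roberta-base):

```python
import torch
from torch import nn

hidden_size, intermediate_size = 768, 3072
fc1 = nn.Linear(hidden_size, intermediate_size)   # RobertaIntermediate.dense
fc2 = nn.Linear(intermediate_size, hidden_size)   # RobertaOutput.dense
layer_norm = nn.LayerNorm(hidden_size, eps=1e-5)
dropout = nn.Dropout(0.1)

attention_output = torch.randn(2, 10, hidden_size)
x = nn.functional.gelu(fc1(attention_output))     # expand + activation
x = dropout(fc2(x))                               # project back + dropout
ffn_output = layer_norm(x + attention_output)     # residual add, then LayerNorm

print(ffn_output.shape)  # torch.Size([2, 10, 768])
```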
class RobertaLayer(nn.Module):
def __init__(self, config):
super().__init__()
self.chunk_size_feed_forward = config.chunk_size_feed_forward
self.seq_len_dim = 1
self.attention = RobertaAttention(config)
self.is_decoder = config.is_decoder
self.add_cross_attention = config.add_cross_attention
if self.add_cross_attention:
if not self.is_decoder:
raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
self.crossattention = RobertaAttention(config, position_embedding_type="absolute")
self.intermediate = RobertaIntermediate(config)
self.output = RobertaOutput(config)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor]:
self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
self_attention_outputs = self.attention(
hidden_states,
attention_mask,
head_mask,
output_attentions=output_attentions,
past_key_value=self_attn_past_key_value,
)
attention_output = self_attention_outputs[0]
if self.is_decoder:
outputs = self_attention_outputs[1:-1]
present_key_value = self_attention_outputs[-1]
else:
outputs = self_attention_outputs[1:]
cross_attn_present_key_value = None
if self.is_decoder and encoder_hidden_states is not None:
if not hasattr(self, "crossattention"):
raise ValueError(
f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers"
" by setting `config.add_cross_attention=True`"
)
cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
cross_attention_outputs = self.crossattention(
attention_output,
attention_mask,
head_mask,
encoder_hidden_states,
encoder_attention_mask,
cross_attn_past_key_value,
output_attentions,
)
attention_output = cross_attention_outputs[0]
outputs = outputs + cross_attention_outputs[1:-1]
cross_attn_present_key_value = cross_attention_outputs[-1]
present_key_value = present_key_value + cross_attn_present_key_value
layer_output = apply_chunking_to_forward(
self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
)
outputs = (layer_output,) + outputs
if self.is_decoder:
outputs = outputs + (present_key_value,)
return outputs
def feed_forward_chunk(self, attention_output):
intermediate_output = self.intermediate(attention_output)
layer_output = self.output(intermediate_output, attention_output)
return layer_output
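`apply_chunking_to_forward` only changes how the feed-forward block is evaluated, not its result: it slices the input along the sequence dimension and concatenates the chunked outputs. A minimal sketch, assuming a recent transformers version where the helper is importable from `transformers.pytorch_utils` (the same place this file imports it from):

```python
import torch
from transformers.pytorch_utils import apply_chunking_to_forward

def feed_forward(x):
    # stand-in for intermediate + output; any per-position function works here
    return x * 2.0

hidden = torch.randn(2, 8, 16)                               # (batch, seq_len, hidden)
out_full = apply_chunking_to_forward(feed_forward, 0, 1, hidden)      # chunk_size=0: plain call
out_chunked = apply_chunking_to_forward(feed_forward, 4, 1, hidden)   # chunks of 4 along dim 1

print(torch.allclose(out_full, out_chunked))                 # True: chunking trades memory for compute
```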
class RobertaEncoder(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
self.layer = nn.ModuleList([RobertaLayer(config) for _ in range(config.num_hidden_layers)])
self.gradient_checkpointing = False
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = False,
output_hidden_states: Optional[bool] = False,
return_dict: Optional[bool] = True,
) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
all_hidden_states = () if output_hidden_states else None
all_self_attentions = () if output_attentions else None
all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
if self.gradient_checkpointing and self.training:
if use_cache:
logger.warning_once(
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
)
use_cache = False
next_decoder_cache = () if use_cache else None
for i, layer_module in enumerate(self.layer):
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
layer_head_mask = head_mask[i] if head_mask is not None else None
past_key_value = past_key_values[i] if past_key_values is not None else None
if self.gradient_checkpointing and self.training:
layer_outputs = self._gradient_checkpointing_func(
layer_module.__call__,
hidden_states,
attention_mask,
layer_head_mask,
encoder_hidden_states,
encoder_attention_mask,
past_key_value,
output_attentions,
)
else:
layer_outputs = layer_module(
hidden_states,
attention_mask,
layer_head_mask,
encoder_hidden_states,
encoder_attention_mask,
past_key_value,
output_attentions,
)
hidden_states = layer_outputs[0]
if use_cache:
next_decoder_cache += (layer_outputs[-1],)
if output_attentions:
all_self_attentions = all_self_attentions + (layer_outputs[1],)
if self.config.add_cross_attention:
all_cross_attentions = all_cross_attentions + (layer_outputs[2],)
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
return tuple(
v
for v in [
hidden_states,
next_decoder_cache,
all_hidden_states,
all_self_attentions,
all_cross_attentions,
]
if v is not None
)
return BaseModelOutputWithPastAndCrossAttentions(
last_hidden_state=hidden_states,
past_key_values=next_decoder_cache,
hidden_states=all_hidden_states,
attentions=all_self_attentions,
cross_attentions=all_cross_attentions,
)
class RobertaPooler(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.activation = nn.Tanh()
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
first_token_tensor = hidden_states[:, 0]
pooled_output = self.dense(first_token_tensor)
pooled_output = self.activation(pooled_output)
return pooled_output
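`RobertaPooler` simply applies a dense layer plus `tanh` to the hidden state of the first token (`<s>`, the analogue of BERT's `[CLS]`). A standalone sketch with toy tensors:

```python
import torch
from torch import nn

hidden_size = 768
dense = nn.Linear(hidden_size, hidden_size)
hidden_states = torch.randn(2, 10, hidden_size)   # encoder output, (batch, seq_len, hidden)

first_token = hidden_states[:, 0]                 # <s> token
pooled = torch.tanh(dense(first_token))

print(pooled.shape)  # torch.Size([2, 768])
```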
class RobertaPreTrainedModel(PreTrainedModel):
"""
一个抽象类,用于处理权重初始化以及一个简单的接口,用于下载和加载预训练模型。
"""
config_class = RobertaConfig
base_model_prefix = "roberta"
supports_gradient_checkpointing = True
_no_split_modules = ["RobertaEmbeddings", "RobertaSelfAttention"]
def _init_weights(self, module):
"""初始化权重"""
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
ROBERTA_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its models (such as downloading or saving the model, resizing the input embeddings, pruning heads etc.).
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and behavior.
Parameters:
config ([`RobertaConfig`]): Model configuration class with all the parameters of the model. Initializing with a
config file does not load the weights associated with the model, only the configuration.
Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
ROBERTA_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `({0})`):
# 输入序列标记的索引,对应于词汇表中的标记
# 可以使用 [`AutoTokenizer`] 获取这些索引。参见 [`PreTrainedTokenizer.encode`] 和
# [`PreTrainedTokenizer.__call__`] 获取更多详情。
# [什么是输入 ID?](../glossary#input-ids)
attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
# 遮罩,用于在填充的标记索引上避免执行注意力计算。遮罩的值选在 `[0, 1]`:
# - 对于 **未遮罩的** 标记,值为 1,
# - 对于 **遮罩的** 标记,值为 0。
# [什么是注意力遮罩?](../glossary#attention-mask)
token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
# 段标记索引,指示输入的第一部分和第二部分。索引值选在 `[0,1]`:
# - 0 对应 *句子 A* 的标记,
# - 1 对应 *句子 B* 的标记。
# 当模型用 `type_vocab_size` 参数初始化且值 >= 2 时才能使用此参数。此张量中的所有值应始终 < type_vocab_size。
# [什么是标记类型 ID?](../glossary#token-type-ids)
position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
# 输入序列标记在位置嵌入中的位置索引。选择范围在 `[0, config.max_position_embeddings - 1]`。
# [什么是位置 ID?](../glossary#position-ids)
head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
# 用于屏蔽自注意力模块中选定头部的遮罩。遮罩的值选在 `[0, 1]`:
# - 1 表示头部 **未被屏蔽**,
# - 0 表示头部 **被屏蔽**。
inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
# 可选地,您可以直接传递嵌入表示而不是传递 `input_ids`。这在您想要更精确地控制如何将 `input_ids` 索引转换为相关向量时很有用,而不是使用模型内部的嵌入查找矩阵。
output_attentions (`bool`, *optional*):
# 是否返回所有注意力层的注意力张量。有关更多详细信息,请参见返回的张量中的 `attentions`。
output_hidden_states (`bool`, *optional*):
# 是否返回所有层的隐藏状态。有关更多详细信息,请参见返回的张量中的 `hidden_states`。
return_dict (`bool`, *optional*):
# 是否返回 [`~utils.ModelOutput`] 而不是普通的元组。
"""
@add_start_docstrings(
"The bare RoBERTa Model transformer outputting raw hidden-states without any specific head on top.",
ROBERTA_START_DOCSTRING,
)
"""
"""
class RobertaModel(RobertaPreTrainedModel):
"""
RoBERTa 模型可以作为编码器(只有自注意力)或解码器使用,后者在自注意力层之间增加了一个交叉注意力层,
遵循 *Attention is all you need*_ 中描述的架构,由 Ashish Vaswani、Noam Shazeer、Niki Parmar、Jakob Uszkoreit、
Llion Jones、Aidan N. Gomez、Lukasz Kaiser 和 Illia Polosukhin 提出。
若要作为解码器使用,模型需要使用配置中设置 `is_decoder` 参数为 `True` 进行初始化。
若要在 Seq2Seq 模型中使用,模型需要同时设置 `is_decoder` 参数和 `add_cross_attention` 参数为 `True`,
并期望在前向传播中输入 `encoder_hidden_states`。
.. _*Attention is all you need*: https://arxiv.org/abs/1706.03762
"""
def __init__(self, config, add_pooling_layer=True):
super().__init__(config)
self.config = config
self.embeddings = RobertaEmbeddings(config)
self.encoder = RobertaEncoder(config)
self.pooler = RobertaPooler(config) if add_pooling_layer else None
self.post_init()
def get_input_embeddings(self):
return self.embeddings.word_embeddings
def set_input_embeddings(self, value):
self.embeddings.word_embeddings = value
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
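A hedged usage sketch of head pruning through the public `prune_heads` API; the checkpoint name and head choices below are just examples, and downloading the weights requires network access:

```python
from transformers import RobertaModel

model = RobertaModel.from_pretrained("FacebookAI/roberta-base")
# remove heads 0 and 1 of layer 0, and head 2 of layer 2
model.prune_heads({0: [0, 1], 2: [2]})

# the query/key/value projections of the affected layers shrink accordingly
print(model.encoder.layer[0].attention.self.num_attention_heads)  # 10 (out of 12)
print(model.encoder.layer[2].attention.self.num_attention_heads)  # 11
```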
@add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPoolingAndCrossAttentions,
config_class=_CONFIG_FOR_DOC,
)
"""
# 定义 Transformer 模型的前向传播方法,接受多个输入参数
def forward(
self,
input_ids: Optional[torch.Tensor] = None, # 输入的 token IDs 张量,可选
attention_mask: Optional[torch.Tensor] = None, # 注意力掩码张量,可选
token_type_ids: Optional[torch.Tensor] = None, # token 类型 IDs 张量,可选
position_ids: Optional[torch.Tensor] = None, # 位置 IDs 张量,可选
head_mask: Optional[torch.Tensor] = None, # 头部掩码张量,可选
inputs_embeds: Optional[torch.Tensor] = None, # 输入的嵌入张量,可选
encoder_hidden_states: Optional[torch.Tensor] = None, # 编码器隐藏状态张量,可选
encoder_attention_mask: Optional[torch.Tensor] = None, # 编码器注意力掩码张量,可选
past_key_values: Optional[List[torch.FloatTensor]] = None, # 过去的键值对列表,可选
use_cache: Optional[bool] = None, # 是否使用缓存,可选
output_attentions: Optional[bool] = None, # 是否输出注意力张量,可选
output_hidden_states: Optional[bool] = None, # 是否输出隐藏状态,可选
return_dict: Optional[bool] = None, # 是否以字典形式返回结果,可选
) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]:
# 定义 RoBERTa 语言模型,用于条件语言建模 fine-tuning
@add_start_docstrings(
"""RoBERTa Model with a `language modeling` head on top for CLM fine-tuning.""", ROBERTA_START_DOCSTRING
)
class RobertaForCausalLM(RobertaPreTrainedModel):
# 共享权重的键名列表
_tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]
# 初始化函数,接受一个配置对象 config
def __init__(self, config):
# 调用父类的初始化方法
super().__init__(config)
# 如果配置中指定不是解码器,则记录警告日志
if not config.is_decoder:
logger.warning("If you want to use `RobertaLMHeadModel` as a standalone, add `is_decoder=True.`")
# 初始化 RoBERTa 模型,不包含池化层
self.roberta = RobertaModel(config, add_pooling_layer=False)
# 初始化 RoBERTa 语言建模头部
self.lm_head = RobertaLMHead(config)
# 执行初始化权重并应用最终处理
self.post_init()
# 返回语言建模头部的输出嵌入层
def get_output_embeddings(self):
return self.lm_head.decoder
# 设置语言建模头部的输出嵌入层
def set_output_embeddings(self, new_embeddings):
self.lm_head.decoder = new_embeddings
# 前向传播函数,接受多个输入参数并返回预测结果
@add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
past_key_values: Tuple[Tuple[torch.FloatTensor]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
# 准备生成的输入数据,根据需要动态创建解码器的注意力遮罩
def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, **model_kwargs):
input_shape = input_ids.shape
# 如果未提供注意力遮罩,则使用全为1的遮罩
if attention_mask is None:
attention_mask = input_ids.new_ones(input_shape)
# 如果存在过去键值,则截取解码器输入的 ID
if past_key_values is not None:
past_length = past_key_values[0][0].shape[2]
# 一些生成方法可能只传递最后一个输入 ID
if input_ids.shape[1] > past_length:
remove_prefix_length = past_length
else:
# 默认行为:保留最后一个 ID
remove_prefix_length = input_ids.shape[1] - 1
input_ids = input_ids[:, remove_prefix_length:]
return {"input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": past_key_values}
# 重新排序缓存中的过去键值对,以适应束搜索的索引顺序
def _reorder_cache(self, past_key_values, beam_idx):
# 初始化一个空的重排序后的过去键值对元组
reordered_past = ()
# 遍历每一层的过去键值对
for layer_past in past_key_values:
# 对于每个层的过去状态,根据束搜索的索引重新选择对应的过去状态
reordered_past += (
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
)
# 返回重新排序后的过去键值对元组
return reordered_past
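A standalone sketch of what `_reorder_cache` does during beam search: every cached key/value tensor is re-indexed along the batch (beam) dimension with `index_select` (toy shapes assumed):

```python
import torch

beam_idx = torch.tensor([2, 0, 0, 1])            # which old beam each new slot comes from
layer_past = (
    torch.randn(4, 12, 5, 64),                   # cached keys   (beams, heads, seq, head_size)
    torch.randn(4, 12, 5, 64),                   # cached values
)

reordered = tuple(past_state.index_select(0, beam_idx) for past_state in layer_past)
print(reordered[0].shape)                               # torch.Size([4, 12, 5, 64])
print(torch.equal(reordered[0][1], layer_past[0][0]))   # True: slot 1 now holds old beam 0
```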
# 使用自定义的文档字符串注释 RoBERTa 模型,包含了一个顶部的语言建模头部
@add_start_docstrings("""RoBERTa Model with a `language modeling` head on top.""", ROBERTA_START_DOCSTRING)
class RobertaForMaskedLM(RobertaPreTrainedModel):
# 定义一个列表,包含了需要共享权重的键名
_tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]
# 初始化方法,接受一个配置对象作为参数
def __init__(self, config):
# 调用父类的初始化方法
super().__init__(config)
# 如果配置中指定了 is_decoder 为 True,发出警告信息
if config.is_decoder:
logger.warning(
"If you want to use `RobertaForMaskedLM` make sure `config.is_decoder=False` for "
"bi-directional self-attention."
)
# 创建一个 RoBERTa 模型,不包含池化层
self.roberta = RobertaModel(config, add_pooling_layer=False)
# 创建一个 RoBERTa 语言建模头部
self.lm_head = RobertaLMHead(config)
# 执行额外的初始化操作和最终处理
self.post_init()
# 返回语言建模头部的输出嵌入
def get_output_embeddings(self):
return self.lm_head.decoder
# 设置新的输出嵌入到语言建模头部
def set_output_embeddings(self, new_embeddings):
self.lm_head.decoder = new_embeddings
# 使用文档字符串和代码示例的注释来注释 forward 方法
@add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=MaskedLMOutput,
config_class=_CONFIG_FOR_DOC,
mask="<mask>",
expected_output="' Paris'",
expected_loss=0.1,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], MaskedLMOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
kwargs (`Dict[str, any]`, optional, defaults to *{}*):
Used to hide legacy arguments that have been deprecated.
"""
# Determine if the output should be returned as a dictionary or not
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Forward pass through the Roberta model
outputs = self.roberta(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# Extract the sequence output from the model outputs
sequence_output = outputs[0]
# Generate prediction scores using the language model head
prediction_scores = self.lm_head(sequence_output)
masked_lm_loss = None
if labels is not None:
# Move labels tensor to the device used for prediction_scores
labels = labels.to(prediction_scores.device)
# Define the loss function for masked language modeling
loss_fct = CrossEntropyLoss()
# Compute masked language modeling loss
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
if not return_dict:
# Prepare output tuple without returning a dictionary
output = (prediction_scores,) + outputs[2:]
return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
# Return MaskedLMOutput named tuple if return_dict is True
return MaskedLMOutput(
loss=masked_lm_loss,
logits=prediction_scores,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
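A hedged end-to-end usage sketch for `RobertaForMaskedLM` (assumes network access to the `FacebookAI/roberta-base` checkpoint; the predicted token is typically " Paris" for this prompt, but that is not guaranteed):

```python
import torch
from transformers import AutoTokenizer, RobertaForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
model = RobertaForMaskedLM.from_pretrained("FacebookAI/roberta-base")

inputs = tokenizer("The capital of France is <mask>.", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits

# locate the <mask> position and take the highest-scoring vocabulary id there
mask_pos = (inputs.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]
predicted_id = logits[0, mask_pos].argmax(-1)
print(tokenizer.decode(predicted_id))
```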
# 定义一个用于 RoBERTa 的语言模型头部的类
class RobertaLMHead(nn.Module):
"""Roberta Head for masked language modeling."""
def __init__(self, config):
super().__init__()
# 创建一个线性层,将输入大小映射到隐藏大小
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
# 创建一个层归一化层,用于标准化隐藏状态
self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
# 创建一个线性层,将隐藏大小映射到词汇表大小
self.decoder = nn.Linear(config.hidden_size, config.vocab_size)
# 创建一个偏置参数,用于解码器线性层
self.bias = nn.Parameter(torch.zeros(config.vocab_size))
# 将解码器的偏置设置为自定义的偏置参数
self.decoder.bias = self.bias
def forward(self, features, **kwargs):
# 输入特征经过线性层变换
x = self.dense(features)
# 使用 GELU 激活函数进行非线性变换
x = gelu(x)
# 输入经过层归一化处理
x = self.layer_norm(x)
# 将隐藏状态映射回词汇表大小,并加上偏置
x = self.decoder(x)
return x
def _tie_weights(self):
# 如果解码器的偏置参数设备类型是 "meta",则将其与自定义偏置参数绑定
# 这是为了加速兼容性和保持向后兼容性
if self.decoder.bias.device.type == "meta":
self.decoder.bias = self.bias
else:
# 否则,将自定义偏置参数与解码器的偏置参数绑定
self.bias = self.decoder.bias
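Because `lm_head.decoder` is returned by `get_output_embeddings`, its weight is tied to the input word embeddings when the model is loaded. A hedged check (assumes network access and the default `tie_word_embeddings=True`):

```python
from transformers import RobertaForMaskedLM

model = RobertaForMaskedLM.from_pretrained("FacebookAI/roberta-base")

# after tie_weights() (run automatically inside from_pretrained), the decoder and the
# input word embeddings share the same underlying storage
same_storage = (
    model.lm_head.decoder.weight.data_ptr()
    == model.roberta.embeddings.word_embeddings.weight.data_ptr()
)
print(same_storage)  # True
```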
@add_start_docstrings(
"""
RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the
pooled output) e.g. for GLUE tasks.
""",
ROBERTA_START_DOCSTRING,
)
class RobertaForSequenceClassification(RobertaPreTrainedModel):
def __init__(self, config):
super().__init__(config)
# 初始化 RoBERTa 模型的分类/回归头部
self.num_labels = config.num_labels
self.config = config
# 创建 RoBERTa 模型,不包含池化层
self.roberta = RobertaModel(config, add_pooling_layer=False)
# 创建 RoBERTa 分类头部
self.classifier = RobertaClassificationHead(config)
# 初始化权重并应用最终处理
self.post_init()
@add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint="cardiffnlp/twitter-roberta-base-emotion",
output_type=SequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
expected_output="'optimism'",
expected_loss=0.08,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
# 如果 return_dict 不为 None,则使用其值;否则使用 self.config.use_return_dict 的值
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# 将输入传递给 RoBERTa 模型进行处理,并获取输出结果
outputs = self.roberta(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# 从 RoBERTa 模型的输出中获取序列输出
sequence_output = outputs[0]
# 将序列输出传递给分类器模型获取 logits
logits = self.classifier(sequence_output)
loss = None
if labels is not None:
# 将 labels 移动到正确的设备以启用模型并行处理
labels = labels.to(logits.device)
# 根据问题类型自动推断配置中的 problem_type
if self.config.problem_type is None:
if self.num_labels == 1:
self.config.problem_type = "regression"
elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
self.config.problem_type = "single_label_classification"
else:
self.config.problem_type = "multi_label_classification"
# 根据 problem_type 计算损失函数
if self.config.problem_type == "regression":
loss_fct = MSELoss()
if self.num_labels == 1:
loss = loss_fct(logits.squeeze(), labels.squeeze())
else:
loss = loss_fct(logits, labels)
elif self.config.problem_type == "single_label_classification":
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(logits, labels)
# 如果 return_dict 为 False,则返回 logits 和可能的额外输出
if not return_dict:
output = (logits,) + outputs[2:] # 保留 logits 和额外的 hidden states
return ((loss,) + output) if loss is not None else output
# 返回一个 SequenceClassifierOutput 对象,包含 loss、logits、hidden states 和 attentions
return SequenceClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
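A hedged usage sketch for `RobertaForSequenceClassification`, using the same emotion checkpoint referenced in the docstring decorator above (assumes network access; the label set comes from that checkpoint's config):

```python
import torch
from transformers import AutoTokenizer, RobertaForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-emotion")
model = RobertaForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-emotion")

inputs = tokenizer("I can't wait for the concert tonight!", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits

predicted_class = logits.argmax(-1).item()
print(model.config.id2label[predicted_class])   # e.g. 'joy' or 'optimism', depending on the input
```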
# 使用多项选择分类头部的 RoBERTa 模型(在汇总输出之上有一个线性层和 softmax),例如用于 RocStories/SWAG 任务
@add_start_docstrings(
"""
Roberta Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
softmax) e.g. for RocStories/SWAG tasks.
""",
ROBERTA_START_DOCSTRING,
)
class RobertaForMultipleChoice(RobertaPreTrainedModel):
def __init__(self, config):
super().__init__(config)
# 初始化 RoBERTa 模型
self.roberta = RobertaModel(config)
# Dropout 层
self.dropout = nn.Dropout(config.hidden_dropout_prob)
# 分类器线性层,输出维度为1(用于多项选择任务)
self.classifier = nn.Linear(config.hidden_size, 1)
# 初始化权重并应用最终处理
self.post_init()
@add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=MultipleChoiceModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], MultipleChoiceModelOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
`input_ids` above)
"""
# 根据返回值字典是否为空来确定是否使用预设的返回值设置
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# 计算输入张量的第二维大小,即选择数量
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
# 如果存在input_ids,则将其展平为二维张量,否则为None
flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
# 如果存在position_ids,则将其展平为二维张量,否则为None
flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
# 如果存在token_type_ids,则将其展平为二维张量,否则为None
flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
# 如果存在attention_mask,则将其展平为二维张量,否则为None
flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
# 如果存在inputs_embeds,则将其展平为三维张量,否则为None
flat_inputs_embeds = (
inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
if inputs_embeds is not None
else None
)
# 调用RoBERTa模型进行前向传播
outputs = self.roberta(
flat_input_ids,
position_ids=flat_position_ids,
token_type_ids=flat_token_type_ids,
attention_mask=flat_attention_mask,
head_mask=head_mask,
inputs_embeds=flat_inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# 提取汇总输出
pooled_output = outputs[1]
# 对汇总输出应用dropout
pooled_output = self.dropout(pooled_output)
# 使用分类器得到logits
logits = self.classifier(pooled_output)
# 将logits重新调整形状为(batch_size, num_choices)
reshaped_logits = logits.view(-1, num_choices)
loss = None
# 如果存在标签,则计算交叉熵损失
if labels is not None:
# 将标签移动到正确的设备上以实现模型并行计算
labels = labels.to(reshaped_logits.device)
loss_fct = CrossEntropyLoss()
loss = loss_fct(reshaped_logits, labels)
# 如果不需要返回字典,则返回输出元组
if not return_dict:
output = (reshaped_logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
# 如果需要返回字典,则返回多选模型输出
return MultipleChoiceModelOutput(
loss=loss,
logits=reshaped_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
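The core of the multiple-choice head is the reshape dance around the shared encoder. A standalone shape sketch (toy sizes assumed):

```python
import torch

# (batch, num_choices, seq_len) inputs are collapsed to (batch * num_choices, seq_len) before the
# shared encoder, and the per-choice scores are reshaped back to (batch, num_choices) for the loss.
batch, num_choices, seq_len = 2, 4, 16
input_ids = torch.randint(0, 50265, (batch, num_choices, seq_len))

flat_input_ids = input_ids.view(-1, input_ids.size(-1))
print(flat_input_ids.shape)                      # torch.Size([8, 16])

logits = torch.randn(batch * num_choices, 1)     # classifier output: one score per choice
reshaped_logits = logits.view(-1, num_choices)
print(reshaped_logits.shape)                     # torch.Size([2, 4])
```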
# 定义一个带有标记分类头部的 RoBERTa 模型类,用于例如命名实体识别(NER)任务
@add_start_docstrings(
"""
Roberta Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
Named-Entity-Recognition (NER) tasks.
""",
ROBERTA_START_DOCSTRING,
)
class RobertaForTokenClassification(RobertaPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
# 初始化 RoBERTa 模型,不包括汇聚层
self.roberta = RobertaModel(config, add_pooling_layer=False)
# 根据配置设定分类器的 dropout 率,若未设置则使用隐藏层的 dropout 率
classifier_dropout = (
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
self.dropout = nn.Dropout(classifier_dropout)
# 创建一个线性层,将 RoBERTa 隐藏层的输出转换为分类标签
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
# 初始化权重并进行最终处理
self.post_init()
@add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint="Jean-Baptiste/roberta-large-ner-english",
output_type=TokenClassifierOutput,
config_class=_CONFIG_FOR_DOC,
expected_output="['O', 'ORG', 'ORG', 'O', 'O', 'O', 'O', 'O', 'LOC', 'O', 'LOC', 'LOC']",
expected_loss=0.01,
)
# 前向传播函数,接受 RoBERTa 的输入并输出分类结果
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
# 如果没有指定 return_dict,则根据配置决定是否使用返回字典
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# 使用 RoBERTa 模型处理输入数据,并获取输出结果
outputs = self.roberta(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# 从 RoBERTa 模型输出中获取序列输出
sequence_output = outputs[0]
# 对序列输出进行 dropout 处理
sequence_output = self.dropout(sequence_output)
# 将 dropout 后的序列输出送入分类器得到 logits
logits = self.classifier(sequence_output)
# 初始化损失值
loss = None
# 如果给定了标签,则计算交叉熵损失
if labels is not None:
# 将标签移到正确的设备上以支持模型并行计算
labels = labels.to(logits.device)
loss_fct = CrossEntropyLoss()
# 计算损失值
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
# 如果不要求返回字典,则组织输出结果
if not return_dict:
output = (logits,) + outputs[2:] # 这里的 outputs[2:] 包含额外的隐藏状态
return ((loss,) + output) if loss is not None else output
# 构造 TokenClassifierOutput 对象用于返回结果
return TokenClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
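A hedged usage sketch for `RobertaForTokenClassification`, using the NER checkpoint referenced in the docstring decorator above (assumes network access; the exact tags depend on the checkpoint):

```python
import torch
from transformers import AutoTokenizer, RobertaForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("Jean-Baptiste/roberta-large-ner-english")
model = RobertaForTokenClassification.from_pretrained("Jean-Baptiste/roberta-large-ner-english")

inputs = tokenizer("HuggingFace is based in New York City", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits               # (batch, seq_len, num_labels)

predictions = logits.argmax(-1)[0]
print([model.config.id2label[p.item()] for p in predictions])
```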
class RobertaClassificationHead(nn.Module):
"""Head for sentence-level classification tasks."""
def __init__(self, config):
super().__init__()
# 定义一个全连接层,输入和输出维度均为 config.hidden_size
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
# 确定分类器的 dropout 率,如果未提供,则使用隐藏层 dropout 率
classifier_dropout = (
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
# 定义一个 dropout 层,应用上述确定的 dropout 率
self.dropout = nn.Dropout(classifier_dropout)
# 定义一个全连接层,将隐藏状态映射到类别数 config.num_labels
self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
def forward(self, features, **kwargs):
# 从 features 中选择第一个 token 的隐藏状态作为输出,类似于取 [CLS] token
x = features[:, 0, :] # take <s> token (equiv. to [CLS])
x = self.dropout(x) # 应用 dropout
x = self.dense(x) # 应用全连接层
x = torch.tanh(x) # 应用 tanh 激活函数
x = self.dropout(x) # 再次应用 dropout
x = self.out_proj(x) # 应用最终的全连接层映射到输出类别数
return x
@add_start_docstrings(
"""
Roberta Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
ROBERTA_START_DOCSTRING,
)
class RobertaForQuestionAnswering(RobertaPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
# 初始化 RoBERTa 模型,不包含池化层
self.roberta = RobertaModel(config, add_pooling_layer=False)
# 定义一个全连接层,将 RoBERTa 的隐藏状态映射到类别数 config.num_labels
self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
# 初始化权重并进行最终处理
self.post_init()
@add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint="deepset/roberta-base-squad2",
output_type=QuestionAnsweringModelOutput,
config_class=_CONFIG_FOR_DOC,
expected_output="' puppet'",
expected_loss=0.86,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
start_positions: Optional[torch.LongTensor] = None,
end_positions: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], QuestionAnsweringModelOutput]:
r"""
start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
"""
# Determine whether to use return_dict or default based on configuration
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Pass input data to the Roberta model for processing
outputs = self.roberta(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# Extract the sequence output from the Roberta model's outputs
sequence_output = outputs[0]
# Compute logits for question answering start and end positions
logits = self.qa_outputs(sequence_output)
start_logits, end_logits = logits.split(1, dim=-1)
start_logits = start_logits.squeeze(-1).contiguous()
end_logits = end_logits.squeeze(-1).contiguous()
total_loss = None
if start_positions is not None and end_positions is not None:
# Handle multi-GPU scenarios by adjusting tensor dimensions
if len(start_positions.size()) > 1:
start_positions = start_positions.squeeze(-1)
if len(end_positions.size()) > 1:
end_positions = end_positions.squeeze(-1)
# Clamp positions to ensure they are within valid range
ignored_index = start_logits.size(1)
start_positions = start_positions.clamp(0, ignored_index)
end_positions = end_positions.clamp(0, ignored_index)
# Define loss function and compute start/end losses
loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
start_loss = loss_fct(start_logits, start_positions)
end_loss = loss_fct(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2
if not return_dict:
# Prepare output tuple without using return_dict
output = (start_logits, end_logits) + outputs[2:]
return ((total_loss,) + output) if total_loss is not None else output
# Return structured output using QuestionAnsweringModelOutput class
return QuestionAnsweringModelOutput(
loss=total_loss,
start_logits=start_logits,
end_logits=end_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
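A hedged usage sketch for `RobertaForQuestionAnswering`, using the SQuAD2 checkpoint referenced above (assumes network access): the answer span is read off the argmax of the start/end logits.

```python
import torch
from transformers import AutoTokenizer, RobertaForQuestionAnswering

tokenizer = AutoTokenizer.from_pretrained("deepset/roberta-base-squad2")
model = RobertaForQuestionAnswering.from_pretrained("deepset/roberta-base-squad2")

question, context = "Who was Jim Henson?", "Jim Henson was a nice puppet."
inputs = tokenizer(question, context, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

start = outputs.start_logits.argmax()
end = outputs.end_logits.argmax()
answer_ids = inputs.input_ids[0, start : end + 1]
print(tokenizer.decode(answer_ids))              # typically ' puppet' or a short span around it
```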
def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
"""
Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
are ignored. This is modified from fairseq's `utils.make_positions`.
Args:
input_ids: torch.Tensor, input tensor containing token ids
padding_idx: int, index of padding token in the vocabulary
past_key_values_length: int, length of past key values to consider for incremental indexing
Returns:
torch.Tensor, tensor of position ids corresponding to input_ids
"""
# 创建一个掩码,标记非填充符号的位置为1,填充符号的位置为0
mask = input_ids.ne(padding_idx).int()
# 根据掩码累积计算位置索引,加上过去关键值长度,然后乘以掩码以忽略填充符号
incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
# 返回最终的位置 ids,加上填充索引以确保填充符号仍然为填充索引
return incremental_indices.long() + padding_idx
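A numeric walk-through of `create_position_ids_from_input_ids` with `padding_idx = 1` (RoBERTa's `<pad>` id) and `past_key_values_length = 0`: real tokens are numbered from 2 upward, padding keeps position 1.

```python
import torch

padding_idx = 1
input_ids = torch.tensor([[0, 31414, 232, 2, 1, 1]])   # last two tokens are padding

mask = input_ids.ne(padding_idx).int()                  # [[1, 1, 1, 1, 0, 0]]
incremental_indices = torch.cumsum(mask, dim=1).type_as(mask) * mask
position_ids = incremental_indices.long() + padding_idx

print(position_ids)                                     # tensor([[2, 3, 4, 5, 1, 1]])
```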
.\models\roberta\modeling_tf_roberta.py
""" TF 2.0 RoBERTa 模型。"""
from __future__ import annotations
import math
import warnings
from typing import Optional, Tuple, Union
import numpy as np
import tensorflow as tf
from ...activations_tf import get_tf_activation
from ...modeling_tf_outputs import (
TFBaseModelOutputWithPastAndCrossAttentions,
TFBaseModelOutputWithPoolingAndCrossAttentions,
TFCausalLMOutputWithCrossAttentions,
TFMaskedLMOutput,
TFMultipleChoiceModelOutput,
TFQuestionAnsweringModelOutput,
TFSequenceClassifierOutput,
TFTokenClassifierOutput,
)
from ...modeling_tf_utils import (
TFCausalLanguageModelingLoss,
TFMaskedLanguageModelingLoss,
TFModelInputType,
TFMultipleChoiceLoss,
TFPreTrainedModel,
TFQuestionAnsweringLoss,
TFSequenceClassificationLoss,
TFTokenClassificationLoss,
get_initializer,
keras,
keras_serializable,
unpack_inputs,
)
from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax
from ...utils import (
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
)
from .configuration_roberta import RobertaConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "FacebookAI/roberta-base"
_CONFIG_FOR_DOC = "RobertaConfig"
TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST = [
"FacebookAI/roberta-base",
"FacebookAI/roberta-large",
"FacebookAI/roberta-large-mnli",
"distilbert/distilroberta-base",
]
class TFRobertaEmbeddings(keras.layers.Layer):
"""
BertEmbeddings 的变种,用于处理位置编码索引的微小调整。
"""
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.padding_idx = 1
self.config = config
self.hidden_size = config.hidden_size
self.max_position_embeddings = config.max_position_embeddings
self.initializer_range = config.initializer_range
self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
def build(self, input_shape=None):
with tf.name_scope("word_embeddings"):
self.weight = self.add_weight(
name="weight",
shape=[self.config.vocab_size, self.hidden_size],
initializer=get_initializer(self.initializer_range),
)
with tf.name_scope("token_type_embeddings"):
self.token_type_embeddings = self.add_weight(
name="embeddings",
shape=[self.config.type_vocab_size, self.hidden_size],
initializer=get_initializer(self.initializer_range),
)
with tf.name_scope("position_embeddings"):
self.position_embeddings = self.add_weight(
name="embeddings",
shape=[self.max_position_embeddings, self.hidden_size],
initializer=get_initializer(self.initializer_range),
)
if self.built:
return
self.built = True
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
def create_position_ids_from_input_ids(self, input_ids, past_key_values_length=0):
"""
Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding
symbols are ignored. This is modified from fairseq's `utils.make_positions`.
Args:
input_ids: tf.Tensor
Returns: tf.Tensor
"""
mask = tf.cast(tf.math.not_equal(input_ids, self.padding_idx), dtype=input_ids.dtype)
incremental_indices = (tf.math.cumsum(mask, axis=1) + past_key_values_length) * mask
return incremental_indices + self.padding_idx
def call(
self,
input_ids=None,
position_ids=None,
token_type_ids=None,
inputs_embeds=None,
past_key_values_length=0,
training=False,
):
"""
Applies embedding based on inputs tensor.
Returns:
final_embeddings (`tf.Tensor`): output embedding tensor.
"""
assert not (input_ids is None and inputs_embeds is None)
if input_ids is not None:
check_embeddings_within_bounds(input_ids, self.config.vocab_size)
inputs_embeds = tf.gather(params=self.weight, indices=input_ids)
input_shape = shape_list(inputs_embeds)[:-1]
if token_type_ids is None:
token_type_ids = tf.fill(dims=input_shape, value=0)
if position_ids is None:
if input_ids is not None:
position_ids = self.create_position_ids_from_input_ids(
input_ids=input_ids, past_key_values_length=past_key_values_length
)
else:
position_ids = tf.expand_dims(
tf.range(start=self.padding_idx + 1, limit=input_shape[-1] + self.padding_idx + 1), axis=0
)
position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids)
token_type_embeds = tf.gather(params=self.token_type_embeddings, indices=token_type_ids)
final_embeddings = inputs_embeds + position_embeds + token_type_embeds
final_embeddings = self.LayerNorm(inputs=final_embeddings)
final_embeddings = self.dropout(inputs=final_embeddings, training=training)
return final_embeddings
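A pure-TensorFlow sketch of the lookup-and-sum performed in `TFRobertaEmbeddings.call` (tiny toy tables; LayerNorm and dropout omitted for brevity):

```python
import tensorflow as tf

vocab_size, hidden = 10, 4
word_table = tf.random.normal((vocab_size, hidden))
pos_table = tf.random.normal((8, hidden))
type_table = tf.zeros((1, hidden))               # RoBERTa uses a single token type

input_ids = tf.constant([[0, 5, 7, 2]])
padding_idx = 1
mask = tf.cast(tf.math.not_equal(input_ids, padding_idx), dtype=input_ids.dtype)
position_ids = tf.math.cumsum(mask, axis=1) * mask + padding_idx   # [[2, 3, 4, 5]]

embeddings = (
    tf.gather(word_table, input_ids)
    + tf.gather(pos_table, position_ids)
    + tf.gather(type_table, tf.zeros_like(input_ids))
)
print(embeddings.shape)                          # (1, 4, 4)
```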
class TFRobertaPooler(keras.layers.Layer):
def __init__(self, config: RobertaConfig, **kwargs):
super().__init__(**kwargs)
self.dense = keras.layers.Dense(
units=config.hidden_size,
kernel_initializer=get_initializer(config.initializer_range),
activation="tanh",
name="dense",
)
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
first_token_tensor = hidden_states[:, 0]
pooled_output = self.dense(inputs=first_token_tensor)
return pooled_output
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
class TFRobertaSelfAttention(keras.layers.Layer):
def __init__(self, config: RobertaConfig, **kwargs):
super().__init__(**kwargs)
if config.hidden_size % config.num_attention_heads != 0:
raise ValueError(
f"The hidden size ({config.hidden_size}) is not a multiple of the number "
f"of attention heads ({config.num_attention_heads})"
)
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.sqrt_att_head_size = math.sqrt(self.attention_head_size)
self.query = keras.layers.Dense(
units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query"
)
self.key = keras.layers.Dense(
units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key"
)
self.value = keras.layers.Dense(
units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value"
)
self.dropout = keras.layers.Dropout(rate=config.attention_probs_dropout_prob)
self.is_decoder = config.is_decoder
self.config = config
def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor:
tensor = tf.reshape(tensor=tensor, shape=(batch_size, -1, self.num_attention_heads, self.attention_head_size))
return tf.transpose(tensor, perm=[0, 2, 1, 3])
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "query", None) is not None:
with tf.name_scope(self.query.name):
self.query.build([None, None, self.config.hidden_size])
if getattr(self, "key", None) is not None:
with tf.name_scope(self.key.name):
self.key.build([None, None, self.config.hidden_size])
if getattr(self, "value", None) is not None:
with tf.name_scope(self.value.name):
self.value.build([None, None, self.config.hidden_size])
class TFRobertaSelfOutput(keras.layers.Layer):
def __init__(self, config: RobertaConfig, **kwargs):
super().__init__(**kwargs)
self.dense = keras.layers.Dense(
units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
hidden_states = self.dropout(inputs=hidden_states, training=training)
hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
class TFRobertaAttention(keras.layers.Layer):
def __init__(self, config: RobertaConfig, **kwargs):
super().__init__(**kwargs)
self.self_attention = TFRobertaSelfAttention(config, name="self")
self.dense_output = TFRobertaSelfOutput(config, name="output")
def prune_heads(self, heads):
raise NotImplementedError
def call(
self,
input_tensor: tf.Tensor,
attention_mask: tf.Tensor,
head_mask: tf.Tensor,
encoder_hidden_states: tf.Tensor,
encoder_attention_mask: tf.Tensor,
past_key_value: Tuple[tf.Tensor],
output_attentions: bool,
training: bool = False,
) -> Tuple[tf.Tensor]:
self_outputs = self.self_attention(
hidden_states=input_tensor,
attention_mask=attention_mask,
head_mask=head_mask,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
past_key_value=past_key_value,
output_attentions=output_attentions,
training=training,
)
attention_output = self.dense_output(
hidden_states=self_outputs[0], input_tensor=input_tensor, training=training
)
outputs = (attention_output,) + self_outputs[1:]
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "self_attention", None) is not None:
with tf.name_scope(self.self_attention.name):
self.self_attention.build(None)
if getattr(self, "dense_output", None) is not None:
with tf.name_scope(self.dense_output.name):
self.dense_output.build(None)
class TFRobertaIntermediate(keras.layers.Layer):
def __init__(self, config: RobertaConfig, **kwargs):
super().__init__(**kwargs)
self.dense = keras.layers.Dense(
units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = get_tf_activation(config.hidden_act)
else:
self.intermediate_act_fn = config.hidden_act
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
class TFRobertaOutput(keras.layers.Layer):
def __init__(self, config: RobertaConfig, **kwargs):
super().__init__(**kwargs)
self.dense = keras.layers.Dense(
units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
hidden_states = self.dropout(inputs=hidden_states, training=training)
hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.intermediate_size])
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
class TFRobertaLayer(keras.layers.Layer):
def __init__(self, config: RobertaConfig, **kwargs):
super().__init__(**kwargs)
self.attention = TFRobertaAttention(config, name="attention")
self.is_decoder = config.is_decoder
self.add_cross_attention = config.add_cross_attention
if self.add_cross_attention:
if not self.is_decoder:
raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
self.crossattention = TFRobertaAttention(config, name="crossattention")
self.intermediate = TFRobertaIntermediate(config, name="intermediate")
self.bert_output = TFRobertaOutput(config, name="output")
def call(
self,
hidden_states: tf.Tensor,
attention_mask: tf.Tensor,
head_mask: tf.Tensor,
encoder_hidden_states: tf.Tensor | None,
encoder_attention_mask: tf.Tensor | None,
past_key_value: Tuple[tf.Tensor] | None,
output_attentions: bool,
training: bool = False,
) -> Tuple[tf.Tensor]:
self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
self_attention_outputs = self.attention(
input_tensor=hidden_states,
attention_mask=attention_mask,
head_mask=head_mask,
encoder_hidden_states=None,
encoder_attention_mask=None,
past_key_value=self_attn_past_key_value,
output_attentions=output_attentions,
training=training,
)
attention_output = self_attention_outputs[0]
if self.is_decoder:
outputs = self_attention_outputs[1:-1]
present_key_value = self_attention_outputs[-1]
else:
outputs = self_attention_outputs[1:]
cross_attn_present_key_value = None
if self.is_decoder and encoder_hidden_states is not None:
if not hasattr(self, "crossattention"):
raise ValueError(
f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers"
" by setting `config.add_cross_attention=True`"
)
cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
cross_attention_outputs = self.crossattention(
input_tensor=attention_output,
attention_mask=attention_mask,
head_mask=head_mask,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
past_key_value=cross_attn_past_key_value,
output_attentions=output_attentions,
training=training,
)
attention_output = cross_attention_outputs[0]
outputs = outputs + cross_attention_outputs[1:-1]
cross_attn_present_key_value = cross_attention_outputs[-1]
present_key_value = present_key_value + cross_attn_present_key_value
intermediate_output = self.intermediate(hidden_states=attention_output)
layer_output = self.bert_output(
hidden_states=intermediate_output,
input_tensor=attention_output,
training=training
)
outputs = (layer_output,) + outputs
if self.is_decoder:
outputs = outputs + (present_key_value,)
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "attention", None) is not None:
with tf.name_scope(self.attention.name):
self.attention.build(None)
if getattr(self, "intermediate", None) is not None:
with tf.name_scope(self.intermediate.name):
self.intermediate.build(None)
if getattr(self, "bert_output", None) is not None:
with tf.name_scope(self.bert_output.name):
self.bert_output.build(None)
if getattr(self, "crossattention", None) is not None:
with tf.name_scope(self.crossattention.name):
self.crossattention.build(None)
class TFRobertaEncoder(keras.layers.Layer):
def __init__(self, config: RobertaConfig, **kwargs):
super().__init__(**kwargs)
self.config = config
self.layer = [TFRobertaLayer(config, name=f"layer_._{i}") for i in range(config.num_hidden_layers)]
def call(
self,
hidden_states: tf.Tensor,
attention_mask: tf.Tensor,
head_mask: tf.Tensor,
encoder_hidden_states: tf.Tensor | None,
encoder_attention_mask: tf.Tensor | None,
past_key_values: Tuple[Tuple[tf.Tensor]] | None,
use_cache: Optional[bool],
output_attentions: bool,
output_hidden_states: bool,
return_dict: bool,
training: bool = False,
) -> Union[TFBaseModelOutputWithPastAndCrossAttentions, Tuple[tf.Tensor]]:
all_hidden_states = () if output_hidden_states else None
all_attentions = () if output_attentions else None
all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
next_decoder_cache = () if use_cache else None
for i, layer_module in enumerate(self.layer):
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
past_key_value = past_key_values[i] if past_key_values is not None else None
layer_outputs = layer_module(
hidden_states=hidden_states,
attention_mask=attention_mask,
head_mask=head_mask[i],
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
past_key_value=past_key_value,
output_attentions=output_attentions,
training=training,
)
hidden_states = layer_outputs[0]
if use_cache:
next_decoder_cache += (layer_outputs[-1],)
if output_attentions:
all_attentions = all_attentions + (layer_outputs[1],)
if self.config.add_cross_attention and encoder_hidden_states is not None:
all_cross_attentions = all_cross_attentions + (layer_outputs[2],)
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
return tuple(
v for v in [hidden_states, all_hidden_states, all_attentions, all_cross_attentions] if v is not None
)
return TFBaseModelOutputWithPastAndCrossAttentions(
last_hidden_state=hidden_states,
past_key_values=next_decoder_cache,
hidden_states=all_hidden_states,
attentions=all_attentions,
cross_attentions=all_cross_attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "layer", None) is not None:
for layer in self.layer:
with tf.name_scope(layer.name):
layer.build(None)
@keras_serializable
class TFRobertaMainLayer(keras.layers.Layer):
config_class = RobertaConfig
def __init__(self, config, add_pooling_layer=True, **kwargs):
super().__init__(**kwargs)
self.config = config
self.is_decoder = config.is_decoder
self.num_hidden_layers = config.num_hidden_layers
self.initializer_range = config.initializer_range
self.output_attentions = config.output_attentions
self.output_hidden_states = config.output_hidden_states
self.return_dict = config.use_return_dict
self.encoder = TFRobertaEncoder(config, name="encoder")
self.pooler = TFRobertaPooler(config, name="pooler") if add_pooling_layer else None
self.embeddings = TFRobertaEmbeddings(config, name="embeddings")
def get_input_embeddings(self) -> keras.layers.Layer:
return self.embeddings
def set_input_embeddings(self, value: tf.Variable):
self.embeddings.weight = value
self.embeddings.vocab_size = shape_list(value)[0]
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
raise NotImplementedError
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
encoder_hidden_states: np.ndarray | tf.Tensor | None = None,
encoder_attention_mask: np.ndarray | tf.Tensor | None = None,
past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: bool = False,
):
        # The full forward pass (embedding lookup, extended attention mask, encoder
        # call and optional pooling) is elided in this excerpt.
        ...

    def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
if getattr(self, "pooler", None) is not None:
with tf.name_scope(self.pooler.name):
self.pooler.build(None)
if getattr(self, "embeddings", None) is not None:
with tf.name_scope(self.embeddings.name):
self.embeddings.build(None)
class TFRobertaPreTrainedModel(TFPreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = RobertaConfig
base_model_prefix = "roberta"
ROBERTA_START_DOCSTRING = r"""
This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
behavior.
<Tip>
TensorFlow models and layers in `transformers` accept two formats as input:
- having all inputs as keyword arguments (like PyTorch models), or
- having all inputs as a list, tuple or dict in the first positional argument.
The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
positional argument:
- a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
- a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
`model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
- a dictionary with one or several input Tensors associated to the input names given in the docstring:
`model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
Note that when creating models and layers with
[subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
about any of this, as you can just pass inputs like you would to any other Python function!
</Tip>
Parameters:
config ([`RobertaConfig`]): Model configuration class with all the parameters of the
model. Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
ROBERTA_INPUTS_DOCSTRING = r"""
"""
@add_start_docstrings(
"The bare RoBERTa Model transformer outputting raw hidden-states without any specific head on top.",
ROBERTA_START_DOCSTRING,
)
class TFRobertaModel(TFRobertaPreTrainedModel):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.roberta = TFRobertaMainLayer(config, name="roberta")
@unpack_inputs
@add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFBaseModelOutputWithPoolingAndCrossAttentions,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
encoder_hidden_states: np.ndarray | tf.Tensor | None = None,
encoder_attention_mask: np.ndarray | tf.Tensor | None = None,
past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: Optional[bool] = False,
) -> Union[Tuple, TFBaseModelOutputWithPoolingAndCrossAttentions]:
r"""
encoder_hidden_states (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
the model is configured as a decoder.
encoder_attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
past_key_values (`Tuple[Tuple[tf.Tensor]]` of length `config.n_layers`)
contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
`decoder_input_ids` of shape `(batch_size, sequence_length)`.
use_cache (`bool`, *optional*, defaults to `True`):
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
`past_key_values`). Set to `False` during training, `True` during generation
"""
outputs = self.roberta(
input_ids=input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
past_key_values=past_key_values,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roberta", None) is not None:
with tf.name_scope(self.roberta.name):
self.roberta.build(None)
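The `past_key_values`/`use_cache` arguments documented in `call` above enable incremental decoding when the model is configured as a decoder. A hedged sketch, assuming a config with `is_decoder=True` (public RoBERTa checkpoints are not trained this way, so only the shapes are meaningful here):

import tensorflow as tf
from transformers import AutoTokenizer, RobertaConfig, TFRobertaModel

tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
config = RobertaConfig.from_pretrained("FacebookAI/roberta-base", is_decoder=True)
model = TFRobertaModel.from_pretrained("FacebookAI/roberta-base", config=config)

ids = tokenizer("Hello world", return_tensors="tf")["input_ids"]
step1 = model(ids, use_cache=True)                 # full pass, fills the cache
next_id = tf.constant([[tokenizer.eos_token_id]])  # pretend this is the next generated token
step2 = model(next_id, past_key_values=step1.past_key_values, use_cache=True)
print(step2.last_hidden_state.shape)               # (1, 1, hidden_size): only the new token is processed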
class TFRobertaLMHead(keras.layers.Layer):
"""Roberta Head for masked language modeling."""
def __init__(self, config, input_embeddings, **kwargs):
super().__init__(**kwargs)
self.config = config
self.hidden_size = config.hidden_size
self.dense = keras.layers.Dense(
config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
self.layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm")
self.act = get_tf_activation("gelu")
self.decoder = input_embeddings
def build(self, input_shape=None):
self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias")
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
if getattr(self, "layer_norm", None) is not None:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build([None, None, self.config.hidden_size])
def get_output_embeddings(self):
return self.decoder
def set_output_embeddings(self, value):
self.decoder.weight = value
self.decoder.vocab_size = shape_list(value)[0]
def get_bias(self):
return {"bias": self.bias}
def set_bias(self, value):
self.bias = value["bias"]
self.config.vocab_size = shape_list(value["bias"])[0]
def call(self, hidden_states):
hidden_states = self.dense(hidden_states)
hidden_states = self.act(hidden_states)
hidden_states = self.layer_norm(hidden_states)
seq_length = shape_list(tensor=hidden_states)[1]
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.hidden_size])
hidden_states = tf.matmul(a=hidden_states, b=self.decoder.weight, transpose_b=True)
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.config.vocab_size])
hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias)
return hidden_states
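`TFRobertaLMHead` ties the output projection to the input embedding matrix: after the dense/GELU/LayerNorm transform, logits are obtained by multiplying with the transposed embedding table and adding a vocabulary-sized bias. A small numerical sketch of that final step (random data, shapes only):

import tensorflow as tf

hidden_size, vocab_size, seq_len = 8, 12, 3
embedding_matrix = tf.random.normal((vocab_size, hidden_size))  # shared with the input embeddings
bias = tf.zeros((vocab_size,))
h = tf.random.normal((1, seq_len, hidden_size))                 # output of dense -> gelu -> layer_norm

flat = tf.reshape(h, (-1, hidden_size))
logits = tf.matmul(flat, embedding_matrix, transpose_b=True)    # same matmul as in `call` above
logits = tf.reshape(logits, (-1, seq_len, vocab_size)) + bias
print(logits.shape)  # (1, 3, 12)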
@add_start_docstrings("""RoBERTa Model with a `language modeling` head on top.""", ROBERTA_START_DOCSTRING)
class TFRobertaForMaskedLM(TFRobertaPreTrainedModel, TFMaskedLanguageModelingLoss):
    _keys_to_ignore_on_load_unexpected = [r"pooler", r"lm_head.decoder.weight"]

    def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.roberta = TFRobertaMainLayer(config, add_pooling_layer=False, name="roberta")
self.lm_head = TFRobertaLMHead(config, self.roberta.embeddings, name="lm_head")
def get_lm_head(self):
return self.lm_head
def get_prefix_bias_name(self):
warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning)
return self.name + "/" + self.lm_head.name
@unpack_inputs
@add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFMaskedLMOutput,
config_class=_CONFIG_FOR_DOC,
mask="<mask>",
expected_output="' Paris'",
expected_loss=0.1,
)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: np.ndarray | tf.Tensor | None = None,
training: Optional[bool] = False,
) -> Union[TFMaskedLMOutput, Tuple[tf.Tensor]]:
r"""
labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
"""
outputs = self.roberta(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
sequence_output = outputs[0]
prediction_scores = self.lm_head(sequence_output)
loss = None if labels is None else self.hf_compute_loss(labels, prediction_scores)
if not return_dict:
output = (prediction_scores,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return TFMaskedLMOutput(
loss=loss,
logits=prediction_scores,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roberta", None) is not None:
with tf.name_scope(self.roberta.name):
self.roberta.build(None)
if getattr(self, "lm_head", None) is not None:
with tf.name_scope(self.lm_head.name):
self.lm_head.build(None)
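A minimal usage sketch for the masked-LM head; it mirrors the `expected_output` declared in the code-sample decorator above, so the decoded token should be something like " Paris", but treat the result as illustrative:

import tensorflow as tf
from transformers import AutoTokenizer, TFRobertaForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
model = TFRobertaForMaskedLM.from_pretrained("FacebookAI/roberta-base")

inputs = tokenizer("The capital of France is <mask>.", return_tensors="tf")
logits = model(**inputs).logits
mask_index = int(tf.where(inputs["input_ids"][0] == tokenizer.mask_token_id)[0, 0])
predicted_id = int(tf.argmax(logits[0, mask_index]))
print(tokenizer.decode([predicted_id]))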
class TFRobertaForCausalLM(TFRobertaPreTrainedModel, TFCausalLanguageModelingLoss):
_keys_to_ignore_on_load_unexpected = [r"pooler", r"lm_head.decoder.weight"]
def __init__(self, config: RobertaConfig, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
if not config.is_decoder:
logger.warning("If you want to use `TFRobertaLMHeadModel` as a standalone, add `is_decoder=True.`")
self.roberta = TFRobertaMainLayer(config, add_pooling_layer=False, name="roberta")
self.lm_head = TFRobertaLMHead(config, input_embeddings=self.roberta.embeddings, name="lm_head")
def get_lm_head(self):
return self.lm_head
def get_prefix_bias_name(self):
warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning)
return self.name + "/" + self.lm_head.name
def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, **model_kwargs):
input_shape = input_ids.shape
if attention_mask is None:
attention_mask = tf.ones(input_shape)
if past_key_values is not None:
input_ids = input_ids[:, -1:]
return {"input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": past_key_values}
@unpack_inputs
@add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFCausalLMOutputWithCrossAttentions,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
encoder_hidden_states: np.ndarray | tf.Tensor | None = None,
encoder_attention_mask: np.ndarray | tf.Tensor | None = None,
past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: np.ndarray | tf.Tensor | None = None,
training: Optional[bool] = False,
    ) -> Union[TFCausalLMOutputWithCrossAttentions, Tuple[tf.Tensor]]:
        # The decoder forward pass (cross-attention inputs, cache handling, LM head
        # projection and shifted-label loss) is elided in this excerpt.
        ...

    def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roberta", None) is not None:
with tf.name_scope(self.roberta.name):
self.roberta.build(None)
if getattr(self, "lm_head", None) is not None:
with tf.name_scope(self.lm_head.name):
self.lm_head.build(None)
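`prepare_inputs_for_generation` above is what lets `generate()` feed only the newest token once a cache exists. A hedged generation sketch; RoBERTa was not pretrained as a decoder, so the continuation will be meaningless and serves only to show the API:

from transformers import AutoTokenizer, TFRobertaForCausalLM

tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
model = TFRobertaForCausalLM.from_pretrained("FacebookAI/roberta-base", is_decoder=True)

inputs = tokenizer("Hello, my dog is", return_tensors="tf")
generated = model.generate(inputs["input_ids"], max_new_tokens=5)
print(tokenizer.decode(generated[0]))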
class TFRobertaClassificationHead(keras.layers.Layer):
"""Head for sentence-level classification tasks."""
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.dense = keras.layers.Dense(
config.hidden_size,
kernel_initializer=get_initializer(config.initializer_range),
activation="tanh",
name="dense",
)
classifier_dropout = (
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
self.dropout = keras.layers.Dropout(classifier_dropout)
self.out_proj = keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="out_proj"
)
self.config = config
def call(self, features, training=False):
x = features[:, 0, :]
x = self.dropout(x, training=training)
x = self.dense(x)
x = self.dropout(x, training=training)
x = self.out_proj(x)
return x
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
if getattr(self, "out_proj", None) is not None:
with tf.name_scope(self.out_proj.name):
self.out_proj.build([None, None, self.config.hidden_size])
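Unlike BERT's pooler, this head reads the hidden state of the first token (`<s>`, RoBERTa's `[CLS]` equivalent) directly from the sequence output; that is what `features[:, 0, :]` selects. A toy sketch of the slicing:

import tensorflow as tf

batch_size, seq_len, hidden_size = 2, 7, 16
sequence_output = tf.random.normal((batch_size, seq_len, hidden_size))
cls_states = sequence_output[:, 0, :]  # the <s> token representation fed to the head
print(cls_states.shape)                # (2, 16)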
@add_start_docstrings(
"""
RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the
pooled output) e.g. for GLUE tasks.
""",
ROBERTA_START_DOCSTRING,
)
class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel, TFSequenceClassificationLoss):
_keys_to_ignore_on_load_unexpected = [r"pooler", r"lm_head"]
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
self.roberta = TFRobertaMainLayer(config, add_pooling_layer=False, name="roberta")
self.classifier = TFRobertaClassificationHead(config, name="classifier")
@unpack_inputs
@add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint="cardiffnlp/twitter-roberta-base-emotion",
output_type=TFSequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
expected_output="'optimism'",
expected_loss=0.08,
)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: np.ndarray | tf.Tensor | None = None,
training: Optional[bool] = False,
) -> Union[TFSequenceClassifierOutput, Tuple[tf.Tensor]]:
r"""
labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss). If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
outputs = self.roberta(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
sequence_output = outputs[0]
logits = self.classifier(sequence_output, training=training)
loss = None if labels is None else self.hf_compute_loss(labels, logits)
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return TFSequenceClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roberta", None) is not None:
with tf.name_scope(self.roberta.name):
self.roberta.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build(None)
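A usage sketch with the emotion checkpoint referenced in the code-sample decorator above (assuming it ships TF weights; otherwise `from_pt=True` can be passed to `from_pretrained`):

import tensorflow as tf
from transformers import AutoTokenizer, TFRobertaForSequenceClassification

name = "cardiffnlp/twitter-roberta-base-emotion"
tokenizer = AutoTokenizer.from_pretrained(name)
model = TFRobertaForSequenceClassification.from_pretrained(name)

inputs = tokenizer("What a wonderful day!", return_tensors="tf")
logits = model(**inputs).logits
print(model.config.id2label[int(tf.argmax(logits, axis=-1)[0])])  # e.g. "optimism"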
@add_start_docstrings(
"""
Roberta Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
softmax) e.g. for RocStories/SWAG tasks.
""",
ROBERTA_START_DOCSTRING,
)
class TFRobertaForMultipleChoice(TFRobertaPreTrainedModel, TFMultipleChoiceLoss):
_keys_to_ignore_on_load_unexpected = [r"lm_head"]
_keys_to_ignore_on_load_missing = [r"dropout"]
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.roberta = TFRobertaMainLayer(config, name="roberta")
self.dropout = keras.layers.Dropout(config.hidden_dropout_prob)
self.classifier = keras.layers.Dense(
1, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFMultipleChoiceModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: np.ndarray | tf.Tensor | None = None,
training: Optional[bool] = False,
) -> Union[TFMultipleChoiceModelOutput, Tuple[tf.Tensor]]:
r"""
labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices]`
where `num_choices` is the size of the second dimension of the input tensors. (See `input_ids` above)
"""
if input_ids is not None:
num_choices = shape_list(input_ids)[1]
seq_length = shape_list(input_ids)[2]
else:
num_choices = shape_list(inputs_embeds)[1]
seq_length = shape_list(inputs_embeds)[2]
flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) if input_ids is not None else None
flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None
flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None
flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None
outputs = self.roberta(
flat_input_ids,
flat_attention_mask,
flat_token_type_ids,
flat_position_ids,
head_mask,
inputs_embeds,
output_attentions,
output_hidden_states,
return_dict=return_dict,
training=training,
)
pooled_output = outputs[1]
pooled_output = self.dropout(pooled_output, training=training)
logits = self.classifier(pooled_output)
reshaped_logits = tf.reshape(logits, (-1, num_choices))
loss = None if labels is None else self.hf_compute_loss(labels, reshaped_logits)
if not return_dict:
output = (reshaped_logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return TFMultipleChoiceModelOutput(
loss=loss,
logits=reshaped_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roberta", None) is not None:
with tf.name_scope(self.roberta.name):
self.roberta.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_size])
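The multiple-choice head expects inputs of shape `(batch_size, num_choices, seq_len)` and flattens them before the shared encoder, as the reshapes in `call` show. A hedged sketch of preparing such a batch:

import tensorflow as tf
from transformers import AutoTokenizer, TFRobertaForMultipleChoice

tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
model = TFRobertaForMultipleChoice.from_pretrained("FacebookAI/roberta-base")

prompt = "The glass fell off the table and"
choices = ["it broke into pieces.", "it started to sing."]
enc = tokenizer([prompt, prompt], choices, return_tensors="tf", padding=True)
inputs = {k: tf.expand_dims(v, 0) for k, v in enc.items()}  # (2, seq_len) -> (1, 2, seq_len)
logits = model(**inputs).logits
print(logits.shape)  # (1, 2): one score per choice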
"""
RoBERTa Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
Named-Entity-Recognition (NER) tasks.
"""
class TFRobertaForTokenClassification(TFRobertaPreTrainedModel, TFTokenClassificationLoss):
_keys_to_ignore_on_load_unexpected = [r"pooler", r"lm_head"]
_keys_to_ignore_on_load_missing = [r"dropout"]
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
self.roberta = TFRobertaMainLayer(config, add_pooling_layer=False, name="roberta")
classifier_dropout = (
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
self.dropout = keras.layers.Dropout(classifier_dropout)
self.classifier = keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint="ydshieh/roberta-large-ner-english",
output_type=TFTokenClassifierOutput,
config_class=_CONFIG_FOR_DOC,
expected_output="['O', 'ORG', 'ORG', 'O', 'O', 'O', 'O', 'O', 'LOC', 'O', 'LOC', 'LOC']",
expected_loss=0.01,
)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: np.ndarray | tf.Tensor | None = None,
training: Optional[bool] = False,
    ) -> Union[TFTokenClassifierOutput, Tuple[tf.Tensor]]:
r"""
labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
outputs = self.roberta(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
sequence_output = outputs[0]
sequence_output = self.dropout(sequence_output, training=training)
logits = self.classifier(sequence_output)
loss = None if labels is None else self.hf_compute_loss(labels, logits)
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return TFTokenClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roberta", None) is not None:
with tf.name_scope(self.roberta.name):
self.roberta.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_size])
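A short usage sketch with the NER checkpoint referenced in the code-sample decorator above (illustrative only):

import tensorflow as tf
from transformers import AutoTokenizer, TFRobertaForTokenClassification

name = "ydshieh/roberta-large-ner-english"
tokenizer = AutoTokenizer.from_pretrained(name)
model = TFRobertaForTokenClassification.from_pretrained(name)

inputs = tokenizer("HuggingFace is based in New York City", return_tensors="tf")
logits = model(**inputs).logits
predictions = tf.argmax(logits, axis=-1)[0]
print([model.config.id2label[int(i)] for i in predictions])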
"""
RoBERTa Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
"""
class TFRobertaForQuestionAnswering(TFRobertaPreTrainedModel, TFQuestionAnsweringLoss):
_keys_to_ignore_on_load_unexpected = [r"pooler", r"lm_head"]
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
self.roberta = TFRobertaMainLayer(config, add_pooling_layer=False, name="roberta")
self.qa_outputs = keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint="ydshieh/roberta-base-squad2",
output_type=TFQuestionAnsweringModelOutput,
config_class=_CONFIG_FOR_DOC,
expected_output="' puppet'",
expected_loss=0.86,
)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
start_positions: np.ndarray | tf.Tensor | None = None,
end_positions: np.ndarray | tf.Tensor | None = None,
training: Optional[bool] = False,
) -> Union[TFQuestionAnsweringModelOutput, Tuple[tf.Tensor]]:
r"""
start_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
are not taken into account for computing the loss.
end_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
are not taken into account for computing the loss.
"""
outputs = self.roberta(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
sequence_output = outputs[0]
logits = self.qa_outputs(sequence_output)
start_logits, end_logits = tf.split(logits, 2, axis=-1)
start_logits = tf.squeeze(start_logits, axis=-1)
end_logits = tf.squeeze(end_logits, axis=-1)
loss = None
if start_positions is not None and end_positions is not None:
labels = {"start_position": start_positions}
labels["end_position"] = end_positions
loss = self.hf_compute_loss(labels, (start_logits, end_logits))
if not return_dict:
output = (start_logits, end_logits) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return TFQuestionAnsweringModelOutput(
loss=loss,
start_logits=start_logits,
end_logits=end_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roberta", None) is not None:
with tf.name_scope(self.roberta.name):
self.roberta.build(None)
if getattr(self, "qa_outputs", None) is not None:
with tf.name_scope(self.qa_outputs.name):
self.qa_outputs.build([None, None, self.config.hidden_size])
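A usage sketch for span extraction with the SQuAD2 checkpoint referenced in the code-sample decorator above; taking the argmax of the start/end logits is the simplest possible decoding and is only illustrative:

import tensorflow as tf
from transformers import AutoTokenizer, TFRobertaForQuestionAnswering

name = "ydshieh/roberta-base-squad2"
tokenizer = AutoTokenizer.from_pretrained(name)
model = TFRobertaForQuestionAnswering.from_pretrained(name)

question, context = "Where do I live?", "My name is Tim and I live in Sweden."
inputs = tokenizer(question, context, return_tensors="tf")
outputs = model(**inputs)
start = int(tf.argmax(outputs.start_logits, axis=-1)[0])
end = int(tf.argmax(outputs.end_logits, axis=-1)[0])
print(tokenizer.decode(inputs["input_ids"][0][start : end + 1]))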
.\models\roberta\tokenization_roberta.py
"""RoBERTa 的分词类。"""
import json
import os
from functools import lru_cache
from typing import List, Optional, Tuple
import regex as re
from ...tokenization_utils import AddedToken, PreTrainedTokenizer
from ...utils import logging
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {
"vocab_file": "vocab.json",
"merges_file": "merges.txt",
}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"FacebookAI/roberta-base": "https://huggingface.co/FacebookAI/roberta-base/resolve/main/vocab.json",
"FacebookAI/roberta-large": "https://huggingface.co/FacebookAI/roberta-large/resolve/main/vocab.json",
"FacebookAI/roberta-large-mnli": "https://huggingface.co/FacebookAI/roberta-large-mnli/resolve/main/vocab.json",
"distilbert/distilroberta-base": "https://huggingface.co/distilbert/distilroberta-base/resolve/main/vocab.json",
"openai-community/roberta-base-openai-detector": "https://huggingface.co/openai-community/roberta-base-openai-detector/resolve/main/vocab.json",
"openai-community/roberta-large-openai-detector": (
"https://huggingface.co/openai-community/roberta-large-openai-detector/resolve/main/vocab.json"
),
},
"merges_file": {
"FacebookAI/roberta-base": "https://huggingface.co/FacebookAI/roberta-base/resolve/main/merges.txt",
"FacebookAI/roberta-large": "https://huggingface.co/FacebookAI/roberta-large/resolve/main/merges.txt",
"FacebookAI/roberta-large-mnli": "https://huggingface.co/FacebookAI/roberta-large-mnli/resolve/main/merges.txt",
"distilbert/distilroberta-base": "https://huggingface.co/distilbert/distilroberta-base/resolve/main/merges.txt",
"openai-community/roberta-base-openai-detector": "https://huggingface.co/openai-community/roberta-base-openai-detector/resolve/main/merges.txt",
"openai-community/roberta-large-openai-detector": (
"https://huggingface.co/openai-community/roberta-large-openai-detector/resolve/main/merges.txt"
),
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"FacebookAI/roberta-base": 512,
"FacebookAI/roberta-large": 512,
"FacebookAI/roberta-large-mnli": 512,
"distilbert/distilroberta-base": 512,
"openai-community/roberta-base-openai-detector": 512,
"openai-community/roberta-large-openai-detector": 512,
}
@lru_cache()
def bytes_to_unicode():
"""
    Returns a list of UTF-8 bytes and a mapping to unicode strings, specifically avoiding the whitespace and control
    characters that the BPE (byte-pair encoding) code cannot handle.
    The reversible BPE codes work on unicode strings, which means you need a large number of unicode characters in
    your vocab if you want to avoid UNK tokens. For something like a 10B-token dataset you end up needing around 5K
    characters for decent coverage, a significant share of a normal ~32K BPE vocab. To avoid that, we keep a lookup
    table between UTF-8 bytes and unicode strings.
"""
bs = (
list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
)
cs = bs[:]
n = 0
for b in range(2**8):
if b not in bs:
bs.append(b)
cs.append(2**8 + n)
n += 1
cs = [chr(n) for n in cs]
return dict(zip(bs, cs))
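A quick illustration of the table this function builds: printable bytes map to themselves, while bytes that would trip up the BPE code (such as the space byte 0x20) are shifted into unused code points, which is why word-initial RoBERTa/GPT-2 vocabulary entries start with "Ġ":

table = bytes_to_unicode()
print(table[ord("A")])  # 'A'
print(table[ord(" ")])  # 'Ġ'  (space is remapped to chr(256 + 32))
print(len(table))       # 256: every byte value gets a unique unicode character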
def get_pairs(word):
"""
    Return the set of symbol pairs in a word.
    A word is represented as a tuple of symbols (symbols being variable-length strings).
"""
pairs = set()
prev_char = word[0]
for char in word[1:]:
pairs.add((prev_char, char))
prev_char = char
return pairs
class RobertaTokenizer(PreTrainedTokenizer):
"""
    Constructs a RoBERTa tokenizer, derived from the GPT-2 tokenizer, using byte-level Byte-Pair-Encoding.

    This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece), so a word
    will be encoded differently depending on whether or not it is at the beginning of the sentence (without a
    leading space).

    You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer or when
    calling it on some text, but since the model was not pretrained this way, it might yield a decrease in
    performance.

    <Tip>

    When used with `is_split_into_words=True`, this tokenizer will add a space before each word (even the first
    one).

    </Tip>

    This tokenizer inherits from [`PreTrainedTokenizer`], which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.
    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        merges_file (`str`):
            Path to the merges file.
        errors (`str`, *optional*, defaults to `"replace"`):
            Paradigm to follow when decoding bytes to UTF-8. See
            [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
        bos_token (`str`, *optional*, defaults to `"<s>"`):
            The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier
            token.
            <Tip>
            When building a sequence using special tokens, this is not the token that is used for the beginning of
            sequence. The token used is the `cls_token`.
            </Tip>
        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.
            <Tip>
            When building a sequence using special tokens, this is not the token that is used for the end of
            sequence. The token used is the `sep_token`.
            </Tip>
        sep_token (`str`, *optional*, defaults to `"</s>"`):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
            for sequence classification or for a text and a question for question answering. It is also used as the
            last token of a sequence built with special tokens.
        cls_token (`str`, *optional*, defaults to `"<s>"`):
            The classifier token which is used when doing sequence classification (classification of the whole
            sequence instead of per-token classification). It is the first token of the sequence when built with
            special tokens.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be
            this token instead.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        mask_token (`str`, *optional*, defaults to `"<mask>"`):
            The token used for masking values. This is the token used when training this model with masked language
            modeling. This is the token which the model will try to predict.
        add_prefix_space (`bool`, *optional*, defaults to `False`):
            Whether or not to add an initial space to the input. This allows to treat the leading word just as any
            other word. (The RoBERTa tokenizer detects the beginning of a word by the preceding space.)
    """
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["input_ids", "attention_mask"]
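A quick illustration of the prefix-space behaviour described in the class docstring: the same word gets different ids depending on whether it is preceded by a space (the exact ids depend on the vocabulary and are shown here only as an example):

from transformers import RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained("FacebookAI/roberta-base")
print(tokenizer("Hello world")["input_ids"])   # e.g. [0, 31414, 232, 2]
print(tokenizer(" Hello world")["input_ids"])  # e.g. [0, 20920, 232, 2]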
def __init__(
self,
vocab_file,
merges_file,
errors="replace",
bos_token="<s>",
eos_token="</s>",
sep_token="</s>",
cls_token="<s>",
unk_token="<unk>",
pad_token="<pad>",
mask_token="<mask>",
add_prefix_space=False,
**kwargs,
):
bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token
cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token
mask_token = (
AddedToken(mask_token, lstrip=True, rstrip=False, normalized=False)
if isinstance(mask_token, str)
else mask_token
)
with open(vocab_file, encoding="utf-8") as vocab_handle:
self.encoder = json.load(vocab_handle)
self.decoder = {v: k for k, v in self.encoder.items()}
self.errors = errors
self.byte_encoder = bytes_to_unicode()
self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
with open(merges_file, encoding="utf-8") as merges_handle:
bpe_merges = merges_handle.read().split("\n")[1:-1]
bpe_merges = [tuple(merge.split()) for merge in bpe_merges]
self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
self.cache = {}
self.add_prefix_space = add_prefix_space
self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
super().__init__(
errors=errors,
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
sep_token=sep_token,
cls_token=cls_token,
pad_token=pad_token,
mask_token=mask_token,
add_prefix_space=add_prefix_space,
**kwargs,
)
@property
def vocab_size(self):
return len(self.encoder)
def get_vocab(self):
vocab = dict(self.encoder).copy()
vocab.update(self.added_tokens_encoder)
return vocab
def _tokenize(self, text):
"""Tokenize a string."""
bpe_tokens = []
for token in re.findall(self.pat, text):
token = "".join(
self.byte_encoder[b] for b in token.encode("utf-8")
)
bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
return bpe_tokens
def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
return self.encoder.get(token, self.encoder.get(self.unk_token))
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
return self.decoder.get(index)
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (string) in a single string."""
text = "".join(tokens)
text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
return text
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
merge_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
)
with open(vocab_file, "w", encoding="utf-8") as f:
f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
index = 0
with open(merge_file, "w", encoding="utf-8") as writer:
writer.write("#version: 0.2\n")
for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
if index != token_index:
logger.warning(
f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
" Please check that the tokenizer is not corrupted!"
)
index = token_index
writer.write(" ".join(bpe_tokens) + "\n")
index += 1
return vocab_file, merge_file
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. A RoBERTa sequence has the following format:
- single sequence: `<s> X </s>`
- pair of sequences: `<s> A </s></s> B </s>`
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
if token_ids_1 is None:
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
cls = [self.cls_token_id]
sep = [self.sep_token_id]
return cls + token_ids_0 + sep + sep + token_ids_1 + sep
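For roberta-base, `<s>` has id 0 and `</s>` has id 2, so the layouts described in the docstring look like this (the 100/200/300 ids are placeholders):

from transformers import RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained("FacebookAI/roberta-base")
print(tokenizer.build_inputs_with_special_tokens([100, 200]))         # [0, 100, 200, 2]            -> <s> A </s>
print(tokenizer.build_inputs_with_special_tokens([100], [200, 300]))  # [0, 100, 2, 2, 200, 300, 2] -> <s> A </s></s> B </s>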
    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` method.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
return super().get_special_tokens_mask(
token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
)
if token_ids_1 is None:
return [1] + ([0] * len(token_ids_0)) + [1]
return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. RoBERTa does not
make use of token type ids, therefore a list of zeros is returned.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of zeros.
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
"""
Prepare text for tokenization, potentially adding a prefix space if required.
Args:
text (str): The input text to be tokenized.
is_split_into_words (bool, optional): Whether the text is already split into words.
**kwargs: Additional keyword arguments.
Returns:
tuple: A tuple containing the modified text and remaining keyword arguments.
"""
add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space)
if (is_split_into_words or add_prefix_space) and (len(text) > 0 and not text[0].isspace()):
text = " " + text
return (text, kwargs)