Transformers Source Code Analysis (96)
.\models\roberta\tokenization_roberta_fast.py
"""Fast Tokenization classes for RoBERTa."""
import json
from typing import List, Optional, Tuple
from tokenizers import pre_tokenizers, processors
from ...tokenization_utils_base import AddedToken, BatchEncoding
from ...tokenization_utils_fast import PreTrainedTokenizerFast
from ...utils import logging
from .tokenization_roberta import RobertaTokenizer
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"FacebookAI/roberta-base": "https://huggingface.co/FacebookAI/roberta-base/resolve/main/vocab.json",
"FacebookAI/roberta-large": "https://huggingface.co/FacebookAI/roberta-large/resolve/main/vocab.json",
"FacebookAI/roberta-large-mnli": "https://huggingface.co/FacebookAI/roberta-large-mnli/resolve/main/vocab.json",
"distilbert/distilroberta-base": "https://huggingface.co/distilbert/distilroberta-base/resolve/main/vocab.json",
"openai-community/roberta-base-openai-detector": "https://huggingface.co/openai-community/roberta-base-openai-detector/resolve/main/vocab.json",
"openai-community/roberta-large-openai-detector": (
"https://huggingface.co/openai-community/roberta-large-openai-detector/resolve/main/vocab.json"
),
},
"merges_file": {
"FacebookAI/roberta-base": "https://huggingface.co/FacebookAI/roberta-base/resolve/main/merges.txt",
"FacebookAI/roberta-large": "https://huggingface.co/FacebookAI/roberta-large/resolve/main/merges.txt",
"FacebookAI/roberta-large-mnli": "https://huggingface.co/FacebookAI/roberta-large-mnli/resolve/main/merges.txt",
"distilbert/distilroberta-base": "https://huggingface.co/distilbert/distilroberta-base/resolve/main/merges.txt",
"openai-community/roberta-base-openai-detector": "https://huggingface.co/openai-community/roberta-base-openai-detector/resolve/main/merges.txt",
"openai-community/roberta-large-openai-detector": (
"https://huggingface.co/openai-community/roberta-large-openai-detector/resolve/main/merges.txt"
),
},
"tokenizer_file": {
"FacebookAI/roberta-base": "https://huggingface.co/FacebookAI/roberta-base/resolve/main/tokenizer.json",
"FacebookAI/roberta-large": "https://huggingface.co/FacebookAI/roberta-large/resolve/main/tokenizer.json",
"FacebookAI/roberta-large-mnli": "https://huggingface.co/FacebookAI/roberta-large-mnli/resolve/main/tokenizer.json",
"distilbert/distilroberta-base": "https://huggingface.co/distilbert/distilroberta-base/resolve/main/tokenizer.json",
"openai-community/roberta-base-openai-detector": (
"https://huggingface.co/openai-community/roberta-base-openai-detector/resolve/main/tokenizer.json"
),
"openai-community/roberta-large-openai-detector": (
"https://huggingface.co/openai-community/roberta-large-openai-detector/resolve/main/tokenizer.json"
),
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"FacebookAI/roberta-base": 512,
"FacebookAI/roberta-large": 512,
"FacebookAI/roberta-large-mnli": 512,
"distilbert/distilroberta-base": 512,
"openai-community/roberta-base-openai-detector": 512,
"openai-community/roberta-large-openai-detector": 512,
}
class RobertaTokenizerFast(PreTrainedTokenizerFast):
"""
Construct a "fast" RoBERTa tokenizer (backed by HuggingFace's *tokenizers* library), derived from the GPT-2
tokenizer, using byte-level Byte-Pair-Encoding.
This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece), so a word will
be encoded differently depending on whether it is at the beginning of the sentence (without a preceding space) or not:
```
>>> from transformers import RobertaTokenizerFast
>>> tokenizer = RobertaTokenizerFast.from_pretrained("FacebookAI/roberta-base")
>>> tokenizer("Hello world")["input_ids"]
[0, 31414, 232, 2]
>>> tokenizer(" Hello world")["input_ids"]
[0, 20920, 232, 2]
```
You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer or when calling
it on text, but since the model was not pretrained this way, it may yield a decrease in performance.
<Tip>
When used with `is_split_into_words=True`, this tokenizer needs to be instantiated with `add_prefix_space=True`.
</Tip>
This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
refer to this superclass for more information regarding those methods.
Args:
vocab_file (`str`):
Path to the vocabulary file.
merges_file (`str`):
Path to the merges file.
errors (`str`, *optional*, defaults to `"replace"`):
Paradigm to follow when decoding bytes to UTF-8. See
[bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
bos_token (`str`, *optional*, defaults to `"<s>"`):
The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.
<Tip>
When building a sequence using special tokens, this is not the token that is used for the beginning of
sequence. The token used is the `cls_token`.
</Tip>
eos_token (`str`, *optional*, defaults to `"</s>"`):
The end of sequence token.
<Tip>
When building a sequence using special tokens, this is not the token that is used for the end of sequence.
The token used is the `sep_token`.
</Tip>
sep_token (`str`, *optional*, defaults to `"</s>"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens.
cls_token (`str`, *optional*, defaults to `"<s>"`):
The classifier token which is used when doing sequence classification (classification of the whole sequence
instead of per-token classification). It is the first token of the sequence when built with special tokens.
unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
pad_token (`str`, *optional*, defaults to `"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
mask_token (`str`, *optional*, defaults to `"<mask>"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
add_prefix_space (`bool`, *optional*, defaults to `False`):
Whether or not to add an initial space to the input. This allows treating the leading word just like any
other word. (The RoBERTa tokenizer detects the beginning of words by the preceding space.)
trim_offsets (`bool`, *optional*, defaults to `True`):
Whether the post processing step should trim offsets to avoid including whitespaces.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["input_ids", "attention_mask"]
slow_tokenizer_class = RobertaTokenizer
def __init__(
self,
vocab_file=None,
merges_file=None,
tokenizer_file=None,
errors="replace",
bos_token="<s>",
eos_token="</s>",
sep_token="</s>",
cls_token="<s>",
unk_token="<unk>",
pad_token="<pad>",
mask_token="<mask>",
add_prefix_space=False,
trim_offsets=True,
**kwargs,
):
mask_token = (
AddedToken(mask_token, lstrip=True, rstrip=False, normalized=False)
if isinstance(mask_token, str)
else mask_token
)
super().__init__(
vocab_file,
merges_file,
tokenizer_file=tokenizer_file,
errors=errors,
bos_token=bos_token,
eos_token=eos_token,
sep_token=sep_token,
cls_token=cls_token,
unk_token=unk_token,
pad_token=pad_token,
mask_token=mask_token,
add_prefix_space=add_prefix_space,
trim_offsets=trim_offsets,
**kwargs,
)
pre_tok_state = json.loads(self.backend_tokenizer.pre_tokenizer.__getstate__())
if pre_tok_state.get("add_prefix_space", add_prefix_space) != add_prefix_space:
pre_tok_class = getattr(pre_tokenizers, pre_tok_state.pop("type"))
pre_tok_state["add_prefix_space"] = add_prefix_space
self.backend_tokenizer.pre_tokenizer = pre_tok_class(**pre_tok_state)
self.add_prefix_space = add_prefix_space
tokenizer_component = "post_processor"
tokenizer_component_instance = getattr(self.backend_tokenizer, tokenizer_component, None)
if tokenizer_component_instance:
state = json.loads(tokenizer_component_instance.__getstate__())
if "sep" in state:
state["sep"] = tuple(state["sep"])
if "cls" in state:
state["cls"] = tuple(state["cls"])
changes_to_apply = False
if state.get("add_prefix_space", add_prefix_space) != add_prefix_space:
state["add_prefix_space"] = add_prefix_space
changes_to_apply = True
if state.get("trim_offsets", trim_offsets) != trim_offsets:
state["trim_offsets"] = trim_offsets
changes_to_apply = True
if changes_to_apply:
component_class = getattr(processors, state.pop("type"))
new_value = component_class(**state)
setattr(self.backend_tokenizer, tokenizer_component, new_value)
@property
def mask_token(self) -> str:
"""
`str`: Mask token, to use when training a model with masked language modeling. Logs an error if used while not
having been set.
The RoBERTa tokenizer has a special mask token to be usable in the fill-mask pipeline. The mask token will
greedily comprise the space before the *<mask>*.
"""
if self._mask_token is None:
if self.verbose:
logger.error("Using mask_token, but it is not set yet.")
return None
return str(self._mask_token)
@mask_token.setter
def mask_token(self, value):
"""
Overriding the default behavior of the mask token to have it eat the space before it.
This is needed to preserve backward compatibility with all the previously used models based on RoBERTa.
"""
value = AddedToken(value, lstrip=True, rstrip=False) if isinstance(value, str) else value
self._mask_token = value
def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding:
is_split_into_words = kwargs.get("is_split_into_words", False)
assert self.add_prefix_space or not is_split_into_words, (
f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True "
"to use it with pretokenized inputs."
)
return super()._batch_encode_plus(*args, **kwargs)
def _encode_plus(self, *args, **kwargs) -> BatchEncoding:
is_split_into_words = kwargs.get("is_split_into_words", False)
assert self.add_prefix_space or not is_split_into_words, (
f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True "
"to use it with pretokenized inputs."
)
return super()._encode_plus(*args, **kwargs)
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
"""
Save the vocabulary files to `save_directory` using the backend tokenizer's model, optionally prefixing the
file names with `filename_prefix`.
"""
files = self._tokenizer.model.save(save_directory, name=filename_prefix)
return tuple(files)
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
"""
Build model inputs with special tokens: prepend `bos_token_id` and append `eos_token_id` to `token_ids_0`;
if `token_ids_1` is provided, it is likewise wrapped with `eos_token_id` on both sides.
"""
output = [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
if token_ids_1 is None:
return output
return output + [self.eos_token_id] + token_ids_1 + [self.eos_token_id]
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. RoBERTa does not
make use of token type ids, therefore a list of zeros is returned.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of zeros.
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
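# --- Usage sketch (not part of the original file) ---
# A minimal example of the special-token logic above; assumes the
# `transformers` package is installed and the checkpoint can be downloaded.
from transformers import RobertaTokenizerFast

tokenizer = RobertaTokenizerFast.from_pretrained("FacebookAI/roberta-base")

# Single sequence: <s> ... </s>
print(tokenizer.build_inputs_with_special_tokens([31414, 232]))  # [0, 31414, 232, 2]

# Sequence pair: <s> A </s></s> B </s>
print(tokenizer.build_inputs_with_special_tokens([31414], [232]))  # [0, 31414, 2, 2, 232, 2]

# RoBERTa does not use token type ids, so the mask is all zeros:
print(tokenizer.create_token_type_ids_from_sequences([31414], [232]))  # [0, 0, 0, 0, 0, 0]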
.\models\roberta\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_flax_available,
is_tf_available,
is_tokenizers_available,
is_torch_available,
)
_import_structure = {
"configuration_roberta": ["ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP", "RobertaConfig", "RobertaOnnxConfig"],
"tokenization_roberta": ["RobertaTokenizer"],
}
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_roberta_fast"] = ["RobertaTokenizerFast"]
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_roberta"] = [
"ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST",
"RobertaForCausalLM",
"RobertaForMaskedLM",
"RobertaForMultipleChoice",
"RobertaForQuestionAnswering",
"RobertaForSequenceClassification",
"RobertaForTokenClassification",
"RobertaModel",
"RobertaPreTrainedModel",
]
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_tf_roberta"] = [
"TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST",
"TFRobertaForCausalLM",
"TFRobertaForMaskedLM",
"TFRobertaForMultipleChoice",
"TFRobertaForQuestionAnswering",
"TFRobertaForSequenceClassification",
"TFRobertaForTokenClassification",
"TFRobertaMainLayer",
"TFRobertaModel",
"TFRobertaPreTrainedModel",
]
try:
if not is_flax_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_flax_roberta"] = [
"FlaxRobertaForCausalLM",
"FlaxRobertaForMaskedLM",
"FlaxRobertaForMultipleChoice",
"FlaxRobertaForQuestionAnswering",
"FlaxRobertaForSequenceClassification",
"FlaxRobertaForTokenClassification",
"FlaxRobertaModel",
"FlaxRobertaPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_roberta import ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, RobertaConfig, RobertaOnnxConfig
from .tokenization_roberta import RobertaTokenizer
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_roberta_fast import RobertaTokenizerFast
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_roberta import (
ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST,
RobertaForCausalLM,
RobertaForMaskedLM,
RobertaForMultipleChoice,
RobertaForQuestionAnswering,
RobertaForSequenceClassification,
RobertaForTokenClassification,
RobertaModel,
RobertaPreTrainedModel,
)
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_tf_roberta import (
TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST,
TFRobertaForCausalLM,
TFRobertaForMaskedLM,
TFRobertaForMultipleChoice,
TFRobertaForQuestionAnswering,
TFRobertaForSequenceClassification,
TFRobertaForTokenClassification,
TFRobertaMainLayer,
TFRobertaModel,
TFRobertaPreTrainedModel,
)
try:
if not is_flax_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_flax_roberta import (
FlaxRobertaForCausalLM,
FlaxRobertaForMaskedLM,
FlaxRobertaForMultipleChoice,
FlaxRobertaForQuestionAnswering,
FlaxRobertaForSequenceClassification,
FlaxRobertaForTokenClassification,
FlaxRobertaModel,
FlaxRobertaPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
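# --- Behavior sketch (not part of the original file) ---
# The _LazyModule registered above defers heavy imports: importing the package
# is cheap, and a submodule (e.g. modeling_roberta, which pulls in torch) is
# only imported when one of its attributes is first accessed.
import transformers.models.roberta as roberta_pkg

config_cls = roberta_pkg.RobertaConfig  # first access triggers the real import
print(config_cls.model_type)  # "roberta"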
.\models\roberta_prelayernorm\configuration_roberta_prelayernorm.py
""" RoBERTa-PreLayerNorm configuration"""
from collections import OrderedDict
from typing import Mapping
from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfig
from ...utils import logging
logger = logging.get_logger(__name__)
ROBERTA_PRELAYERNORM_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"andreasmadsen/efficient_mlm_m0.40": (
"https://huggingface.co/andreasmadsen/efficient_mlm_m0.40/resolve/main/config.json"
),
}
class RobertaPreLayerNormConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`RobertaPreLayerNormModel`] or a [`TFRobertaPreLayerNormModel`]. It is
used to instantiate a RoBERTa-PreLayerNorm model according to the specified arguments, defining the model architecture.
Instantiating a configuration with the defaults will yield a similar configuration to that of the RoBERTa-PreLayerNorm
[andreasmadsen/efficient_mlm_m0.40](https://huggingface.co/andreasmadsen/efficient_mlm_m0.40) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Examples:
```
>>> from transformers import RobertaPreLayerNormConfig, RobertaPreLayerNormModel
>>> # Initializing a RoBERTa-PreLayerNorm configuration
>>> configuration = RobertaPreLayerNormConfig()
>>> # Initializing a model (with random weights) from the configuration
>>> model = RobertaPreLayerNormModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "roberta-prelayernorm"
def __init__(
self,
vocab_size=50265,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=2,
initializer_range=0.02,
layer_norm_eps=1e-12,
pad_token_id=1,
bos_token_id=0,
eos_token_id=2,
position_embedding_type="absolute",
use_cache=True,
classifier_dropout=None,
**kwargs,
):
super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.hidden_act = hidden_act
self.intermediate_size = intermediate_size
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.type_vocab_size = type_vocab_size
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.position_embedding_type = position_embedding_type
self.use_cache = use_cache
self.classifier_dropout = classifier_dropout
class RobertaPreLayerNormOnnxConfig(OnnxConfig):
@property
def inputs(self) -> Mapping[str, Mapping[int, str]]:
if self.task == "multiple-choice":
dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"}
else:
dynamic_axis = {0: "batch", 1: "sequence"}
return OrderedDict(
[
("input_ids", dynamic_axis),
("attention_mask", dynamic_axis),
]
)
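# --- Usage sketch (not part of the original file) ---
# Instantiating the config and inspecting the dynamic ONNX axes defined above;
# this assumes the generic OnnxConfig(config, task=...) constructor signature.
from transformers import RobertaPreLayerNormConfig
from transformers.models.roberta_prelayernorm.configuration_roberta_prelayernorm import (
    RobertaPreLayerNormOnnxConfig,
)

config = RobertaPreLayerNormConfig()  # defaults mirror the roberta-base sizes
onnx_config = RobertaPreLayerNormOnnxConfig(config, task="default")
print(onnx_config.inputs)
# OrderedDict([('input_ids', {0: 'batch', 1: 'sequence'}),
#              ('attention_mask', {0: 'batch', 1: 'sequence'})])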
.\models\roberta_prelayernorm\convert_roberta_prelayernorm_original_pytorch_checkpoint_to_pytorch.py
"""Convert RoBERTa-PreLayerNorm checkpoint."""
import argparse
import torch
from huggingface_hub import hf_hub_download
from transformers import AutoTokenizer, RobertaPreLayerNormConfig, RobertaPreLayerNormForMaskedLM
from transformers.utils import logging
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
def convert_roberta_prelayernorm_checkpoint_to_pytorch(checkpoint_repo: str, pytorch_dump_folder_path: str):
"""
Copy/paste/tweak roberta_prelayernorm's weights into our BERT structure.
"""
config = RobertaPreLayerNormConfig.from_pretrained(
checkpoint_repo, architectures=["RobertaPreLayerNormForMaskedLM"]
)
original_state_dict = torch.load(hf_hub_download(repo_id=checkpoint_repo, filename="pytorch_model.bin"))
state_dict = {}
for tensor_key, tensor_value in original_state_dict.items():
if tensor_key.startswith("roberta."):
tensor_key = "roberta_prelayernorm." + tensor_key[len("roberta.") :]
if tensor_key.endswith(".self.LayerNorm.weight") or tensor_key.endswith(".self.LayerNorm.bias"):
continue
state_dict[tensor_key] = tensor_value
model = RobertaPreLayerNormForMaskedLM.from_pretrained(
pretrained_model_name_or_path=None, config=config, state_dict=state_dict
)
model.save_pretrained(pytorch_dump_folder_path)
tokenizer = AutoTokenizer.from_pretrained(checkpoint_repo)
tokenizer.save_pretrained(pytorch_dump_folder_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--checkpoint-repo",
default=None,
type=str,
required=True,
help="Path the official PyTorch dump, e.g. 'andreasmadsen/efficient_mlm_m0.40'.",
)
parser.add_argument(
"--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
)
args = parser.parse_args()
convert_roberta_prelayernorm_checkpoint_to_pytorch(args.checkpoint_repo, args.pytorch_dump_folder_path)
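# --- Invocation sketch (not part of the original file) ---
# The script is normally run from the command line; the output folder name
# below is illustrative:
#
#   python convert_roberta_prelayernorm_original_pytorch_checkpoint_to_pytorch.py \
#       --checkpoint-repo andreasmadsen/efficient_mlm_m0.40 \
#       --pytorch_dump_folder_path ./efficient_mlm_m0.40-hf
#
# Equivalently, the function can be called directly (it downloads the original
# checkpoint from the Hub):
#
#   convert_roberta_prelayernorm_checkpoint_to_pytorch(
#       "andreasmadsen/efficient_mlm_m0.40", "./efficient_mlm_m0.40-hf"
#   )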
.\models\roberta_prelayernorm\modeling_flax_roberta_prelayernorm.py
""" Flax RoBERTa-PreLayerNorm model."""
from typing import Callable, Optional, Tuple
import flax.linen as nn
import jax
import jax.numpy as jnp
import numpy as np
from flax.core.frozen_dict import FrozenDict, freeze, unfreeze
from flax.linen import combine_masks, make_causal_mask
from flax.linen import partitioning as nn_partitioning
from flax.linen.attention import dot_product_attention_weights
from flax.traverse_util import flatten_dict, unflatten_dict
from jax import lax
from ...modeling_flax_outputs import (
FlaxBaseModelOutputWithPastAndCrossAttentions,
FlaxBaseModelOutputWithPooling,
FlaxBaseModelOutputWithPoolingAndCrossAttentions,
FlaxCausalLMOutputWithCrossAttentions,
FlaxMaskedLMOutput,
FlaxMultipleChoiceModelOutput,
FlaxQuestionAnsweringModelOutput,
FlaxSequenceClassifierOutput,
FlaxTokenClassifierOutput,
)
from ...modeling_flax_utils import ACT2FN, FlaxPreTrainedModel, append_call_sample_docstring, overwrite_call_docstring
from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging
from .configuration_roberta_prelayernorm import RobertaPreLayerNormConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "andreasmadsen/efficient_mlm_m0.40"
_CONFIG_FOR_DOC = "RobertaPreLayerNormConfig"
remat = nn_partitioning.remat
def create_position_ids_from_input_ids(input_ids, padding_idx):
"""
Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
are ignored. This is modified from fairseq's `utils.make_positions`.
Args:
input_ids: jnp.ndarray
padding_idx: int
Returns: jnp.ndarray
"""
mask = (input_ids != padding_idx).astype("i4")
if mask.ndim > 2:
mask = mask.reshape((-1, mask.shape[-1]))
incremental_indices = jnp.cumsum(mask, axis=1).astype("i4") * mask
incremental_indices = incremental_indices.reshape(input_ids.shape)
return incremental_indices.astype("i4") + padding_idx
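# --- Worked example (not part of the original file) ---
# With padding_idx = 1 (RoBERTa's pad token id), padding positions stay at
# padding_idx while real tokens count up from padding_idx + 1:
#
#   input_ids = jnp.array([[0, 31414, 232, 2, 1, 1]])  # trailing 1s are <pad>
#   create_position_ids_from_input_ids(input_ids, padding_idx=1)
#   # -> [[2, 3, 4, 5, 1, 1]]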
ROBERTA_PRELAYERNORM_START_DOCSTRING = r"""
This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its models (such as downloading, saving and converting weights from PyTorch models).
This model is also a
[flax.linen.Module](https://flax.readthedocs.io/en/latest/api_reference/flax.linen/module.html) subclass. Use it as
a regular Flax linen Module and refer to the Flax documentation for all matter related to general usage and
behavior.
Finally, this model supports inherent JAX features such as:
- [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
- [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
- [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
- [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)
Parameters:
config ([`RobertaPreLayerNormConfig`]): Model configuration class with all the parameters of the
model. Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights.
"""
ROBERTA_PRELAYERNORM_INPUTS_DOCSTRING = r"""
Args:
input_ids (`numpy.ndarray` of shape `({0})`):
Indices of input sequence tokens in the vocabulary.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`numpy.ndarray` of shape `({0})`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
token_type_ids (`numpy.ndarray` of shape `({0})`, *optional*):
Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
1]`:
- 0 corresponds to a *sentence A* token,
- 1 corresponds to a *sentence B* token.
[What are token type IDs?](../glossary#token-type-ids)
position_ids (`numpy.ndarray` of shape `({0})`, *optional*):
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
config.max_position_embeddings - 1]`.
head_mask (`numpy.ndarray` of shape `({0})`, *optional*):
Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
class FlaxRobertaPreLayerNormEmbeddings(nn.Module):
"""从单词、位置和标记类型嵌入构建嵌入。"""
config: RobertaPreLayerNormConfig
dtype: jnp.dtype = jnp.float32
def setup(self):
self.word_embeddings = nn.Embed(
self.config.vocab_size,
self.config.hidden_size,
embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
dtype=self.dtype,
)
self.position_embeddings = nn.Embed(
self.config.max_position_embeddings,
self.config.hidden_size,
embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
dtype=self.dtype,
)
self.token_type_embeddings = nn.Embed(
self.config.type_vocab_size,
self.config.hidden_size,
embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
dtype=self.dtype,
)
self.LayerNorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype)
self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob)
def __call__(self, input_ids, token_type_ids, position_ids, attention_mask, deterministic: bool = True):
inputs_embeds = self.word_embeddings(input_ids.astype("i4"))
position_embeds = self.position_embeddings(position_ids.astype("i4"))
token_type_embeddings = self.token_type_embeddings(token_type_ids.astype("i4"))
hidden_states = inputs_embeds + token_type_embeddings + position_embeds
hidden_states = self.LayerNorm(hidden_states)
hidden_states = self.dropout(hidden_states, deterministic=deterministic)
return hidden_states
class FlaxRobertaPreLayerNormSelfAttention(nn.Module):
config: RobertaPreLayerNormConfig
causal: bool = False
dtype: jnp.dtype = jnp.float32
def setup(self):
self.head_dim = self.config.hidden_size // self.config.num_attention_heads
if self.config.hidden_size % self.config.num_attention_heads != 0:
raise ValueError(
"`config.hidden_size`: {self.config.hidden_size} has to be a multiple of `config.num_attention_heads` "
" : {self.config.num_attention_heads}"
)
self.query = nn.Dense(
self.config.hidden_size,
dtype=self.dtype,
kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
)
self.key = nn.Dense(
self.config.hidden_size,
dtype=self.dtype,
kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
)
self.value = nn.Dense(
self.config.hidden_size,
dtype=self.dtype,
kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
)
if self.causal:
self.causal_mask = make_causal_mask(
jnp.ones((1, self.config.max_position_embeddings), dtype="bool"), dtype="bool"
)
def _split_heads(self, hidden_states):
return hidden_states.reshape(hidden_states.shape[:2] + (self.config.num_attention_heads, self.head_dim))
def _merge_heads(self, hidden_states):
return hidden_states.reshape(hidden_states.shape[:2] + (self.config.hidden_size,))
@nn.compact
def _concatenate_to_cache(self, key, value, query, attention_mask):
"""
This function takes projected key, value states from a single input token and concatenates the states to cached
states from previous steps. This function is slightly adapted from the official Flax repository:
https://github.com/google/flax/blob/491ce18759622506588784b4fca0e4bf05f8c8cd/flax/linen/attention.py#L252
"""
is_initialized = self.has_variable("cache", "cached_key")
cached_key = self.variable("cache", "cached_key", jnp.zeros, key.shape, key.dtype)
cached_value = self.variable("cache", "cached_value", jnp.zeros, value.shape, value.dtype)
cache_index = self.variable("cache", "cache_index", lambda: jnp.array(0, dtype=jnp.int32))
if is_initialized:
*batch_dims, max_length, num_heads, depth_per_head = cached_key.value.shape
cur_index = cache_index.value
indices = (0,) * len(batch_dims) + (cur_index, 0, 0)
key = lax.dynamic_update_slice(cached_key.value, key, indices)
value = lax.dynamic_update_slice(cached_value.value, value, indices)
cached_key.value = key
cached_value.value = value
num_updated_cache_vectors = query.shape[1]
cache_index.value = cache_index.value + num_updated_cache_vectors
pad_mask = jnp.broadcast_to(
jnp.arange(max_length) < cur_index + num_updated_cache_vectors,
tuple(batch_dims) + (1, num_updated_cache_vectors, max_length),
)
attention_mask = combine_masks(pad_mask, attention_mask)
return key, value, attention_mask
class FlaxRobertaPreLayerNormSelfOutput(nn.Module):
config: RobertaPreLayerNormConfig
dtype: jnp.dtype = jnp.float32
def setup(self):
self.dense = nn.Dense(
self.config.hidden_size,
kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
dtype=self.dtype,
)
self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob)
def __call__(self, hidden_states, input_tensor, deterministic: bool = True):
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states, deterministic=deterministic)
hidden_states = hidden_states + input_tensor
return hidden_states
class FlaxRobertaPreLayerNormAttention(nn.Module):
config: RobertaPreLayerNormConfig
causal: bool = False
dtype: jnp.dtype = jnp.float32
def setup(self):
self.self = FlaxRobertaPreLayerNormSelfAttention(self.config, causal=self.causal, dtype=self.dtype)
self.output = FlaxRobertaPreLayerNormSelfOutput(self.config, dtype=self.dtype)
self.LayerNorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype)
def __call__(
self,
hidden_states,
attention_mask,
layer_head_mask,
key_value_states=None,
init_cache=False,
deterministic=True,
output_attentions: bool = False,
):
hidden_states_pre_layer_norm = self.LayerNorm(hidden_states)
attn_outputs = self.self(
hidden_states_pre_layer_norm,
attention_mask,
layer_head_mask=layer_head_mask,
key_value_states=key_value_states,
init_cache=init_cache,
deterministic=deterministic,
output_attentions=output_attentions,
)
attn_output = attn_outputs[0]
hidden_states = self.output(attn_output, hidden_states, deterministic=deterministic)
outputs = (hidden_states,)
if output_attentions:
outputs += (attn_outputs[1],)
return outputs
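# --- Note (not part of the original file) ---
# This attention block is where the "pre-LayerNorm" variant departs from
# standard RoBERTa: LayerNorm is applied to the input *before* self-attention,
# and the raw (un-normalized) input is added back as the residual:
#
#   post-LN (standard RoBERTa):  h = LayerNorm(h + Attention(h))
#   pre-LN  (this model):        h = h + Attention(LayerNorm(h))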
class FlaxRobertaPreLayerNormIntermediate(nn.Module):
config: RobertaPreLayerNormConfig
dtype: jnp.dtype = jnp.float32
def setup(self):
self.LayerNorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype)
self.dense = nn.Dense(
self.config.intermediate_size,
kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
dtype=self.dtype,
)
self.activation = ACT2FN[self.config.hidden_act]
def __call__(self, hidden_states):
hidden_states = self.LayerNorm(hidden_states)
hidden_states = self.dense(hidden_states)
hidden_states = self.activation(hidden_states)
return hidden_states
class FlaxRobertaPreLayerNormOutput(nn.Module):
config: RobertaPreLayerNormConfig
dtype: jnp.dtype = jnp.float32
def setup(self):
self.dense = nn.Dense(
self.config.hidden_size,
kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
dtype=self.dtype,
)
self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob)
def __call__(self, hidden_states, attention_output, deterministic: bool = True):
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states, deterministic=deterministic)
hidden_states = hidden_states + attention_output
return hidden_states
class FlaxRobertaPreLayerNormLayer(nn.Module):
config: RobertaPreLayerNormConfig
dtype: jnp.dtype = jnp.float32
def setup(self):
self.attention = FlaxRobertaPreLayerNormAttention(self.config, causal=self.config.is_decoder, dtype=self.dtype)
self.intermediate = FlaxRobertaPreLayerNormIntermediate(self.config, dtype=self.dtype)
self.output = FlaxRobertaPreLayerNormOutput(self.config, dtype=self.dtype)
if self.config.add_cross_attention:
self.crossattention = FlaxRobertaPreLayerNormAttention(self.config, causal=False, dtype=self.dtype)
def __call__(
self,
hidden_states,
attention_mask,
layer_head_mask,
encoder_hidden_states: Optional[jnp.ndarray] = None,
encoder_attention_mask: Optional[jnp.ndarray] = None,
init_cache: bool = False,
deterministic: bool = True,
output_attentions: bool = False,
):
attention_outputs = self.attention(
hidden_states,
attention_mask,
layer_head_mask=layer_head_mask,
init_cache=init_cache,
deterministic=deterministic,
output_attentions=output_attentions,
)
attention_output = attention_outputs[0]
if encoder_hidden_states is not None:
cross_attention_outputs = self.crossattention(
attention_output,
attention_mask=encoder_attention_mask,
layer_head_mask=layer_head_mask,
key_value_states=encoder_hidden_states,
deterministic=deterministic,
output_attentions=output_attentions,
)
attention_output = cross_attention_outputs[0]
hidden_states = self.intermediate(attention_output)
hidden_states = self.output(hidden_states, attention_output, deterministic=deterministic)
outputs = (hidden_states,)
if output_attentions:
outputs += (attention_outputs[1],)
if encoder_hidden_states is not None:
outputs += (cross_attention_outputs[1],)
return outputs
class FlaxRobertaPreLayerNormLayerCollection(nn.Module):
config: RobertaPreLayerNormConfig
dtype: jnp.dtype = jnp.float32
gradient_checkpointing: bool = False
def setup(self):
if self.gradient_checkpointing:
FlaxRobertaPreLayerNormCheckpointLayer = remat(FlaxRobertaPreLayerNormLayer, static_argnums=(5, 6, 7))
self.layers = [
FlaxRobertaPreLayerNormCheckpointLayer(self.config, name=str(i), dtype=self.dtype)
for i in range(self.config.num_hidden_layers)
]
else:
self.layers = [
FlaxRobertaPreLayerNormLayer(self.config, name=str(i), dtype=self.dtype)
for i in range(self.config.num_hidden_layers)
]
def __call__(
self,
hidden_states,
attention_mask,
head_mask,
encoder_hidden_states: Optional[jnp.ndarray] = None,
encoder_attention_mask: Optional[jnp.ndarray] = None,
init_cache: bool = False,
deterministic: bool = True,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
):
all_attentions = () if output_attentions else None
all_hidden_states = () if output_hidden_states else None
all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
if head_mask is not None:
if head_mask.shape[0] != (len(self.layers)):
raise ValueError(
f"The head_mask should be specified for {len(self.layers)} layers, but it is for "
f" {head_mask.shape[0]}."
)
for i, layer in enumerate(self.layers):
if output_hidden_states:
all_hidden_states += (hidden_states,)
layer_outputs = layer(
hidden_states,
attention_mask,
head_mask[i] if head_mask is not None else None,
encoder_hidden_states,
encoder_attention_mask,
init_cache,
deterministic,
output_attentions,
)
hidden_states = layer_outputs[0]
if output_attentions:
all_attentions += (layer_outputs[1],)
if encoder_hidden_states is not None:
all_cross_attentions += (layer_outputs[2],)
if output_hidden_states:
all_hidden_states += (hidden_states,)
outputs = (hidden_states, all_hidden_states, all_attentions, all_cross_attentions)
if not return_dict:
return tuple(v for v in outputs if v is not None)
return FlaxBaseModelOutputWithPastAndCrossAttentions(
last_hidden_state=hidden_states,
hidden_states=all_hidden_states,
attentions=all_attentions,
cross_attentions=all_cross_attentions,
)
class FlaxRobertaPreLayerNormEncoder(nn.Module):
config: RobertaPreLayerNormConfig
dtype: jnp.dtype = jnp.float32
gradient_checkpointing: bool = False
def setup(self):
self.layer = FlaxRobertaPreLayerNormLayerCollection(
self.config,
dtype=self.dtype,
gradient_checkpointing=self.gradient_checkpointing,
)
def __call__(
self,
hidden_states,
attention_mask,
head_mask,
encoder_hidden_states: Optional[jnp.ndarray] = None,
encoder_attention_mask: Optional[jnp.ndarray] = None,
init_cache: bool = False,
deterministic: bool = True,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
):
return self.layer(
hidden_states,
attention_mask,
head_mask=head_mask,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
init_cache=init_cache,
deterministic=deterministic,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
class FlaxRobertaPreLayerNormPooler(nn.Module):
config: RobertaPreLayerNormConfig
dtype: jnp.dtype = jnp.float32
def setup(self):
self.dense = nn.Dense(
self.config.hidden_size,
kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
dtype=self.dtype,
)
def __call__(self, hidden_states):
cls_hidden_state = hidden_states[:, 0]
cls_hidden_state = self.dense(cls_hidden_state)
return nn.tanh(cls_hidden_state)
class FlaxRobertaPreLayerNormLMHead(nn.Module):
config: RobertaPreLayerNormConfig
dtype: jnp.dtype = jnp.float32
bias_init: Callable[..., np.ndarray] = jax.nn.initializers.zeros
def setup(self):
self.dense = nn.Dense(
self.config.hidden_size,
dtype=self.dtype,
kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
)
self.layer_norm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype)
self.decoder = nn.Dense(
self.config.vocab_size,
dtype=self.dtype,
use_bias=False,
kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
)
self.bias = self.param("bias", self.bias_init, (self.config.vocab_size,))
def __call__(self, hidden_states, shared_embedding=None):
hidden_states = self.dense(hidden_states)
hidden_states = ACT2FN["gelu"](hidden_states)
hidden_states = self.layer_norm(hidden_states)
if shared_embedding is not None:
hidden_states = self.decoder.apply({"params": {"kernel": shared_embedding.T}}, hidden_states)
else:
hidden_states = self.decoder(hidden_states)
bias = jnp.asarray(self.bias, self.dtype)
hidden_states += bias
return hidden_states
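# --- Weight-tying sketch (not part of the original file) ---
# When `tie_word_embeddings` is set, the caller passes the embedding table in
# as `shared_embedding`, and the decoder Dense runs with that matrix
# (transposed) supplied as its kernel instead of its own learned parameters:
#
#   vocab, hidden = 8, 4
#   table = jnp.ones((vocab, hidden))            # stand-in embedding table
#   dec = nn.Dense(vocab, use_bias=False)
#   out = dec.apply({"params": {"kernel": table.T}}, jnp.ones((1, 2, hidden)))
#   out.shape  # (1, 2, 8)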
class FlaxRobertaPreLayerNormClassificationHead(nn.Module):
config: RobertaPreLayerNormConfig
dtype: jnp.dtype = jnp.float32
def setup(self):
self.dense = nn.Dense(
self.config.hidden_size,
dtype=self.dtype,
kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
)
classifier_dropout = (
self.config.classifier_dropout
if self.config.classifier_dropout is not None
else self.config.hidden_dropout_prob
)
self.dropout = nn.Dropout(rate=classifier_dropout)
self.out_proj = nn.Dense(
self.config.num_labels,
dtype=self.dtype,
kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
)
def __call__(self, hidden_states, deterministic=True):
hidden_states = hidden_states[:, 0, :]
hidden_states = self.dropout(hidden_states, deterministic=deterministic)
hidden_states = self.dense(hidden_states)
hidden_states = nn.tanh(hidden_states)
hidden_states = self.dropout(hidden_states, deterministic=deterministic)
hidden_states = self.out_proj(hidden_states)
return hidden_states
class FlaxRobertaPreLayerNormPreTrainedModel(FlaxPreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = RobertaPreLayerNormConfig
base_model_prefix = "roberta_prelayernorm"
module_class: nn.Module = None
def __init__(
self,
config: RobertaPreLayerNormConfig,
input_shape: Tuple = (1, 1),
seed: int = 0,
dtype: jnp.dtype = jnp.float32,
_do_init: bool = True,
gradient_checkpointing: bool = False,
**kwargs,
):
module = self.module_class(config=config, dtype=dtype, gradient_checkpointing=gradient_checkpointing, **kwargs)
super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init)
def enable_gradient_checkpointing(self):
self._module = self.module_class(
config=self.config,
dtype=self.dtype,
gradient_checkpointing=True,
)
def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple, params: FrozenDict = None) -> FrozenDict:
input_ids = jnp.zeros(input_shape, dtype="i4")
token_type_ids = jnp.ones_like(input_ids)
position_ids = create_position_ids_from_input_ids(input_ids, self.config.pad_token_id)
attention_mask = jnp.ones_like(input_ids)
head_mask = jnp.ones((self.config.num_hidden_layers, self.config.num_attention_heads))
params_rng, dropout_rng = jax.random.split(rng)
rngs = {"params": params_rng, "dropout": dropout_rng}
if self.config.add_cross_attention:
encoder_hidden_states = jnp.zeros(input_shape + (self.config.hidden_size,))
encoder_attention_mask = attention_mask
module_init_outputs = self.module.init(
rngs,
input_ids,
attention_mask,
token_type_ids,
position_ids,
head_mask,
encoder_hidden_states,
encoder_attention_mask,
return_dict=False,
)
else:
module_init_outputs = self.module.init(
rngs, input_ids, attention_mask, token_type_ids, position_ids, head_mask, return_dict=False
)
random_params = module_init_outputs["params"]
if params is not None:
random_params = flatten_dict(unfreeze(random_params))
params = flatten_dict(unfreeze(params))
for missing_key in self._missing_keys:
params[missing_key] = random_params[missing_key]
self._missing_keys = set()
return freeze(unflatten_dict(params))
else:
return random_params
def init_cache(self, batch_size, max_length):
r"""
Args:
batch_size (`int`):
batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache.
max_length (`int`):
maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized cache.
"""
input_ids = jnp.ones((batch_size, max_length), dtype="i4")
attention_mask = jnp.ones_like(input_ids, dtype="i4")
position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape)
init_variables = self.module.init(
jax.random.PRNGKey(0), input_ids, attention_mask, position_ids, return_dict=False, init_cache=True
)
return unfreeze(init_variables["cache"])
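# --- Usage sketch (not part of the original file) ---
# The cache enables fast auto-regressive decoding with a decoder-style config
# (is_decoder=True). A hypothetical call pattern:
#
#   past_key_values = model.init_cache(batch_size=1, max_length=32)
#   outputs = model(input_ids, attention_mask=mask, past_key_values=past_key_values)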
def __call__(
self,
input_ids,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
params: dict = None,
dropout_rng: jax.random.PRNGKey = None,
train: bool = False,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
past_key_values: dict = None,
):
...  # method body elided in this excerpt
class FlaxRobertaPreLayerNormModule(nn.Module):
config: RobertaPreLayerNormConfig
dtype: jnp.dtype = jnp.float32
add_pooling_layer: bool = True
gradient_checkpointing: bool = False
def setup(self):
self.embeddings = FlaxRobertaPreLayerNormEmbeddings(self.config, dtype=self.dtype)
self.encoder = FlaxRobertaPreLayerNormEncoder(
self.config,
dtype=self.dtype,
gradient_checkpointing=self.gradient_checkpointing,
)
self.LayerNorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype)
self.pooler = FlaxRobertaPreLayerNormPooler(self.config, dtype=self.dtype)
def __call__(
self,
input_ids,
attention_mask,
token_type_ids: Optional[jnp.ndarray] = None,
position_ids: Optional[jnp.ndarray] = None,
head_mask: Optional[jnp.ndarray] = None,
encoder_hidden_states: Optional[jnp.ndarray] = None,
encoder_attention_mask: Optional[jnp.ndarray] = None,
init_cache: bool = False,
deterministic: bool = True,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
):
if token_type_ids is None:
token_type_ids = jnp.zeros_like(input_ids)
if position_ids is None:
position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape)
hidden_states = self.embeddings(
input_ids, token_type_ids, position_ids, attention_mask, deterministic=deterministic
)
outputs = self.encoder(
hidden_states,
attention_mask,
head_mask=head_mask,
deterministic=deterministic,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
init_cache=init_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
hidden_states = outputs[0]
hidden_states = self.LayerNorm(hidden_states)
pooled = self.pooler(hidden_states) if self.add_pooling_layer else None
if not return_dict:
if pooled is None:
return (hidden_states,) + outputs[1:]
return (hidden_states, pooled) + outputs[1:]
return FlaxBaseModelOutputWithPoolingAndCrossAttentions(
last_hidden_state=hidden_states,
pooler_output=pooled,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
cross_attentions=outputs.cross_attentions,
)
@add_start_docstrings(
"The bare RoBERTa-PreLayerNorm Model transformer outputting raw hidden-states without any specific head on top.",
ROBERTA_PRELAYERNORM_START_DOCSTRING,
)
class FlaxRobertaPreLayerNormModel(FlaxRobertaPreLayerNormPreTrainedModel):
module_class = FlaxRobertaPreLayerNormModule
append_call_sample_docstring(
FlaxRobertaPreLayerNormModel,
_CHECKPOINT_FOR_DOC,
FlaxBaseModelOutputWithPooling,
_CONFIG_FOR_DOC,
)
class FlaxRobertaPreLayerNormForMaskedLMModule(nn.Module):
config: RobertaPreLayerNormConfig
dtype: jnp.dtype = jnp.float32
gradient_checkpointing: bool = False
def setup(self):
self.roberta_prelayernorm = FlaxRobertaPreLayerNormModule(
config=self.config,
add_pooling_layer=False,
dtype=self.dtype,
gradient_checkpointing=self.gradient_checkpointing,
)
self.lm_head = FlaxRobertaPreLayerNormLMHead(config=self.config, dtype=self.dtype)
def __call__(
self,
input_ids,
attention_mask,
token_type_ids,
position_ids,
head_mask,
deterministic: bool = True,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
):
outputs = self.roberta_prelayernorm(
input_ids,
attention_mask,
token_type_ids,
position_ids,
head_mask,
deterministic=deterministic,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
hidden_states = outputs[0]
if self.config.tie_word_embeddings:
shared_embedding = self.roberta_prelayernorm.variables["params"]["embeddings"]["word_embeddings"][
"embedding"
]
else:
shared_embedding = None
logits = self.lm_head(hidden_states, shared_embedding=shared_embedding)
if not return_dict:
return (logits,) + outputs[1:]
return FlaxMaskedLMOutput(
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
@add_start_docstrings(
"""RoBERTa-PreLayerNorm 模型,顶部带有 `语言建模` 头.""", ROBERTA_PRELAYERNORM_START_DOCSTRING
)
class FlaxRobertaPreLayerNormForMaskedLM(FlaxRobertaPreLayerNormPreTrainedModel):
module_class = FlaxRobertaPreLayerNormForMaskedLMModule
append_call_sample_docstring(
FlaxRobertaPreLayerNormForMaskedLM,
_CHECKPOINT_FOR_DOC,
FlaxBaseModelOutputWithPooling,
_CONFIG_FOR_DOC,
mask="<mask>",
)
class FlaxRobertaPreLayerNormForSequenceClassificationModule(nn.Module):
config: RobertaPreLayerNormConfig
dtype: jnp.dtype = jnp.float32
gradient_checkpointing: bool = False
def setup(self):
self.roberta_prelayernorm = FlaxRobertaPreLayerNormModule(
config=self.config,
dtype=self.dtype,
add_pooling_layer=False,
gradient_checkpointing=self.gradient_checkpointing,
)
self.classifier = FlaxRobertaPreLayerNormClassificationHead(config=self.config, dtype=self.dtype)
def __call__(
self,
input_ids,
attention_mask,
token_type_ids,
position_ids,
head_mask,
deterministic: bool = True,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
):
outputs = self.roberta_prelayernorm(
input_ids,
attention_mask,
token_type_ids,
position_ids,
head_mask,
deterministic=deterministic,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
logits = self.classifier(sequence_output, deterministic=deterministic)
if not return_dict:
return (logits,) + outputs[1:]
return FlaxSequenceClassifierOutput(
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
@add_start_docstrings(
"""
RobertaPreLayerNorm Model transformer with a sequence classification/regression head on top (a linear layer on
top of the pooled output) e.g. for GLUE tasks.
""",
ROBERTA_PRELAYERNORM_START_DOCSTRING,
)
class FlaxRobertaPreLayerNormForSequenceClassification(FlaxRobertaPreLayerNormPreTrainedModel):
module_class = FlaxRobertaPreLayerNormForSequenceClassificationModule
append_call_sample_docstring(
FlaxRobertaPreLayerNormForSequenceClassification,
_CHECKPOINT_FOR_DOC,
FlaxSequenceClassifierOutput,
_CONFIG_FOR_DOC,
)
class FlaxRobertaPreLayerNormForMultipleChoiceModule(nn.Module):
config: RobertaPreLayerNormConfig
dtype: jnp.dtype = jnp.float32
gradient_checkpointing: bool = False
def setup(self):
self.roberta_prelayernorm = FlaxRobertaPreLayerNormModule(
config=self.config,
dtype=self.dtype,
gradient_checkpointing=self.gradient_checkpointing,
)
self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob)
self.classifier = nn.Dense(1, dtype=self.dtype)
def __call__(
self,
input_ids,
attention_mask,
token_type_ids,
position_ids,
head_mask,
deterministic: bool = True,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
):
num_choices = input_ids.shape[1]
input_ids = input_ids.reshape(-1, input_ids.shape[-1]) if input_ids is not None else None
attention_mask = attention_mask.reshape(-1, attention_mask.shape[-1]) if attention_mask is not None else None
token_type_ids = token_type_ids.reshape(-1, token_type_ids.shape[-1]) if token_type_ids is not None else None
position_ids = position_ids.reshape(-1, position_ids.shape[-1]) if position_ids is not None else None
outputs = self.roberta_prelayernorm(
input_ids,
attention_mask,
token_type_ids,
position_ids,
head_mask,
deterministic=deterministic,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
pooled_output = outputs[1]
pooled_output = self.dropout(pooled_output, deterministic=deterministic)
logits = self.classifier(pooled_output)
reshaped_logits = logits.reshape(-1, num_choices)
if not return_dict:
return (reshaped_logits,) + outputs[2:]
return FlaxMultipleChoiceModelOutput(
logits=reshaped_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
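# --- Shape walkthrough (not part of the original file) ---
# Multiple-choice inputs arrive as (batch, num_choices, seq_len), are flattened
# to (batch * num_choices, seq_len) for the encoder, and the single score per
# flattened row is folded back into `num_choices` logits per example:
#
#   (2, 4, 16) --reshape--> (8, 16) --encoder + classifier--> (8, 1)
#   --reshape--> (2, 4), over which a softmax picks the choice.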
"""
RobertaPreLayerNorm Model with a multiple choice classification head on top (a linear layer on top of the pooled
output and a softmax) e.g. for RocStories/SWAG tasks.
"""
ROBERTA_PRELAYERNORM_START_DOCSTRING,
)
class FlaxRobertaPreLayerNormForMultipleChoice(FlaxRobertaPreLayerNormPreTrainedModel):
module_class = FlaxRobertaPreLayerNormForMultipleChoiceModule
overwrite_call_docstring(
FlaxRobertaPreLayerNormForMultipleChoice,
ROBERTA_PRELAYERNORM_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"),
)
append_call_sample_docstring(
FlaxRobertaPreLayerNormForMultipleChoice,
_CHECKPOINT_FOR_DOC,
FlaxMultipleChoiceModelOutput,
_CONFIG_FOR_DOC,
)
class FlaxRobertaPreLayerNormForTokenClassificationModule(nn.Module):
config: RobertaPreLayerNormConfig
dtype: jnp.dtype = jnp.float32
gradient_checkpointing: bool = False
def setup(self):
self.roberta_prelayernorm = FlaxRobertaPreLayerNormModule(
config=self.config,
dtype=self.dtype,
add_pooling_layer=False,
gradient_checkpointing=self.gradient_checkpointing,
)
classifier_dropout = (
self.config.classifier_dropout
if self.config.classifier_dropout is not None
else self.config.hidden_dropout_prob
)
self.dropout = nn.Dropout(rate=classifier_dropout)
self.classifier = nn.Dense(self.config.num_labels, dtype=self.dtype)
def __call__(
self,
input_ids,
attention_mask,
token_type_ids,
position_ids,
head_mask,
deterministic: bool = True,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
):
outputs = self.roberta_prelayernorm(
input_ids,
attention_mask,
token_type_ids,
position_ids,
head_mask,
deterministic=deterministic,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
hidden_states = outputs[0]
hidden_states = self.dropout(hidden_states, deterministic=deterministic)
logits = self.classifier(hidden_states)
if not return_dict:
return (logits,) + outputs[1:]
return FlaxTokenClassifierOutput(
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
@add_start_docstrings(
"""
RobertaPreLayerNorm Model with a token classification head on top (a linear layer on top of the hidden-states
output) e.g. for Named-Entity-Recognition (NER) tasks.
""",
ROBERTA_PRELAYERNORM_START_DOCSTRING,
)
# Copied from transformers.models.roberta.modeling_flax_roberta.FlaxRobertaForTokenClassification with Roberta->RobertaPreLayerNorm
class FlaxRobertaPreLayerNormForTokenClassification(FlaxRobertaPreLayerNormPreTrainedModel):
# The underlying module is FlaxRobertaPreLayerNormForTokenClassificationModule
module_class = FlaxRobertaPreLayerNormForTokenClassificationModule
# Attach an example docstring (checkpoint, expected output class and config) to the model's __call__
append_call_sample_docstring(
FlaxRobertaPreLayerNormForTokenClassification,
_CHECKPOINT_FOR_DOC,
FlaxTokenClassifierOutput,
_CONFIG_FOR_DOC,
)
# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertForQuestionAnsweringModule with Bert->RobertaPreLayerNorm, self.bert->self.roberta_prelayernorm
class FlaxRobertaPreLayerNormForQuestionAnsweringModule(nn.Module):
config: RobertaPreLayerNormConfig
dtype: jnp.dtype = jnp.float32
gradient_checkpointing: bool = False
def setup(self):
# Backbone without the pooling layer: QA needs per-token hidden states
self.roberta_prelayernorm = FlaxRobertaPreLayerNormModule(
config=self.config,
dtype=self.dtype,
add_pooling_layer=False,
gradient_checkpointing=self.gradient_checkpointing,
)
# Linear head projecting each hidden state to `num_labels` (start/end) logits
self.qa_outputs = nn.Dense(self.config.num_labels, dtype=self.dtype)
def __call__(
self,
input_ids,
attention_mask,
token_type_ids,
position_ids,
head_mask,
deterministic: bool = True,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
):
# Run the backbone over the inputs
outputs = self.roberta_prelayernorm(
input_ids,
attention_mask,
token_type_ids,
position_ids,
head_mask,
deterministic=deterministic,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
hidden_states = outputs[0]
# Project to span logits and split into start and end along the last axis
logits = self.qa_outputs(hidden_states)
start_logits, end_logits = jnp.split(logits, self.config.num_labels, axis=-1)
start_logits = start_logits.squeeze(-1)
end_logits = end_logits.squeeze(-1)
if not return_dict:
return (start_logits, end_logits) + outputs[1:]
return FlaxQuestionAnsweringModelOutput(
start_logits=start_logits,
end_logits=end_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
@add_start_docstrings(
"""
RobertaPreLayerNorm Model with a span classification head on top for extractive question-answering tasks like SQuAD
(a linear layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
ROBERTA_PRELAYERNORM_START_DOCSTRING,
)
# Copied from transformers.models.roberta.modeling_flax_roberta.FlaxRobertaForQuestionAnswering with Roberta->RobertaPreLayerNorm
class FlaxRobertaPreLayerNormForQuestionAnswering(FlaxRobertaPreLayerNormPreTrainedModel):
module_class = FlaxRobertaPreLayerNormForQuestionAnsweringModule
# Attach an example docstring (checkpoint, expected output class and config)
append_call_sample_docstring(
FlaxRobertaPreLayerNormForQuestionAnswering,
_CHECKPOINT_FOR_DOC,
FlaxQuestionAnsweringModelOutput,
_CONFIG_FOR_DOC,
)
# Copied from transformers.models.roberta.modeling_flax_roberta.FlaxRobertaForCausalLMModule with Roberta->RobertaPreLayerNorm
class FlaxRobertaPreLayerNormForCausalLMModule(nn.Module):
config: RobertaPreLayerNormConfig
dtype: jnp.dtype = jnp.float32
gradient_checkpointing: bool = False
def setup(self):
# Backbone without the pooling layer
self.roberta_prelayernorm = FlaxRobertaPreLayerNormModule(
config=self.config,
add_pooling_layer=False,
dtype=self.dtype,
gradient_checkpointing=self.gradient_checkpointing,
)
# Language-modeling head sharing the config and dtype with the backbone
self.lm_head = FlaxRobertaPreLayerNormLMHead(config=self.config, dtype=self.dtype)
def __call__(
self,
input_ids,
attention_mask,
position_ids,
token_type_ids: Optional[jnp.ndarray] = None,
head_mask: Optional[jnp.ndarray] = None,
encoder_hidden_states: Optional[jnp.ndarray] = None,
encoder_attention_mask: Optional[jnp.ndarray] = None,
init_cache: bool = False,
deterministic: bool = True,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
):
# 执行模型前向传播
outputs = self.roberta_prelayernorm(
input_ids,
attention_mask,
token_type_ids,
position_ids,
head_mask,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
init_cache=init_cache,
deterministic=deterministic,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
hidden_states = outputs[0] # 提取隐藏状态作为输出的第一部分
if self.config.tie_word_embeddings:
# When word embeddings are tied, reuse the input embedding matrix in the LM head
shared_embedding = self.roberta_prelayernorm.variables["params"]["embeddings"]["word_embeddings"][
"embedding"
]
else:
shared_embedding = None  # no weight sharing
# Compute the prediction scores
logits = self.lm_head(hidden_states, shared_embedding=shared_embedding)
if not return_dict:
# Return a plain tuple when return_dict is disabled
return (logits,) + outputs[1:]
# Return the causal LM output with cross-attentions
return FlaxCausalLMOutputWithCrossAttentions(
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
cross_attentions=outputs.cross_attentions,
)
@add_start_docstrings(
"""
RobertaPreLayerNorm Model with a `language modeling` head on top (a linear layer on top of the
hidden-states output), e.g. for autoregressive tasks.
""",
ROBERTA_PRELAYERNORM_START_DOCSTRING,  # shared start docstring for RobertaPreLayerNorm models
)
# Copied from transformers.models.roberta.modeling_flax_roberta.FlaxRobertaForCausalLM with Roberta->RobertaPreLayerNorm
class FlaxRobertaPreLayerNormForCausalLM(FlaxRobertaPreLayerNormPreTrainedModel):
# The inner module class backing this model
module_class = FlaxRobertaPreLayerNormForCausalLMModule
def prepare_inputs_for_generation(self, input_ids, max_length, attention_mask: Optional[jax.Array] = None):
# Initialize the cache
batch_size, seq_length = input_ids.shape
# Use self.init_cache to create past_key_values up to max_length
past_key_values = self.init_cache(batch_size, max_length)
# The decoder uses a causal mask, so a static extended attention mask of length max_length works
extended_attention_mask = jnp.ones((batch_size, max_length), dtype="i4")
# If an attention_mask is given, derive position ids from its cumulative sum and copy it into the extended mask
if attention_mask is not None:
position_ids = attention_mask.cumsum(axis=-1) - 1
extended_attention_mask = lax.dynamic_update_slice(extended_attention_mask, attention_mask, (0, 0))
else:
# Otherwise broadcast sequential position ids over the batch
position_ids = jnp.broadcast_to(jnp.arange(seq_length, dtype="i4")[None, :], (batch_size, seq_length))
# Return the inputs prepared for generation
return {
"past_key_values": past_key_values,
"attention_mask": extended_attention_mask,
"position_ids": position_ids,
}
def update_inputs_for_generation(self, model_outputs, model_kwargs):
# After each generation step, carry the cache forward and advance the position ids
model_kwargs["past_key_values"] = model_outputs.past_key_values
model_kwargs["position_ids"] = model_kwargs["position_ids"][:, -1:] + 1
return model_kwargs
append_call_sample_docstring(
FlaxRobertaPreLayerNormForCausalLM,
_CHECKPOINT_FOR_DOC,
FlaxCausalLMOutputWithCrossAttentions,
_CONFIG_FOR_DOC,
)
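The cumulative-sum trick in `prepare_inputs_for_generation` is worth seeing on concrete values: with left padding, `attention_mask.cumsum(-1) - 1` makes real tokens count from 0 while pad positions get -1, and `lax.dynamic_update_slice` copies the current mask into a statically sized mask of length `max_length`. A minimal sketch (toy values, not library code):

```
import jax.numpy as jnp
from jax import lax

attention_mask = jnp.array([[0, 0, 1, 1]], dtype=jnp.int32)  # a left-padded prompt
position_ids = attention_mask.cumsum(axis=-1) - 1            # [[-1 -1  0  1]]
max_length = 6
extended = jnp.ones((1, max_length), dtype=jnp.int32)
extended = lax.dynamic_update_slice(extended, attention_mask, (0, 0))
print(position_ids)  # real tokens count from 0, pad positions get -1
print(extended)      # [[0 0 1 1 1 1]]: future slots stay 1 because the causal mask guards them
```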
.\models\roberta_prelayernorm\modeling_roberta_prelayernorm.py
"""PyTorch RoBERTa-PreLayerNorm 模型。"""
import math
from typing import List, Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN, gelu
from ...modeling_outputs import (
BaseModelOutputWithPastAndCrossAttentions,
BaseModelOutputWithPoolingAndCrossAttentions,
CausalLMOutputWithCrossAttentions,
MaskedLMOutput,
MultipleChoiceModelOutput,
QuestionAnsweringModelOutput,
SequenceClassifierOutput,
TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import (
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_roberta_prelayernorm import RobertaPreLayerNormConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "andreasmadsen/efficient_mlm_m0.40"
_CONFIG_FOR_DOC = "RobertaPreLayerNormConfig"
ROBERTA_PRELAYERNORM_PRETRAINED_MODEL_ARCHIVE_LIST = [
"andreasmadsen/efficient_mlm_m0.15",
"andreasmadsen/efficient_mlm_m0.20",
"andreasmadsen/efficient_mlm_m0.30",
"andreasmadsen/efficient_mlm_m0.40",
"andreasmadsen/efficient_mlm_m0.50",
"andreasmadsen/efficient_mlm_m0.60",
"andreasmadsen/efficient_mlm_m0.70",
"andreasmadsen/efficient_mlm_m0.80",
]
class RobertaPreLayerNormEmbeddings(nn.Module):
"""
Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
"""
def __init__(self, config):
super().__init__()
self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
self.register_buffer(
"token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
)
self.padding_idx = config.pad_token_id
self.position_embeddings = nn.Embedding(
config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
)
def forward(
self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0
):
if position_ids is None:
if input_ids is not None:
position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length)
else:
position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)
if input_ids is not None:
input_shape = input_ids.size()
else:
input_shape = inputs_embeds.size()[:-1]
seq_length = input_shape[1]
if token_type_ids is None:
if hasattr(self, "token_type_ids"):
buffered_token_type_ids = self.token_type_ids[:, :seq_length]
buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
token_type_ids = buffered_token_type_ids_expanded
else:
token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
if inputs_embeds is None:
inputs_embeds = self.word_embeddings(input_ids)
token_type_embeddings = self.token_type_embeddings(token_type_ids)
embeddings = inputs_embeds + token_type_embeddings
if self.position_embedding_type == "absolute":
position_embeddings = self.position_embeddings(position_ids)
embeddings += position_embeddings
embeddings = self.LayerNorm(embeddings)
embeddings = self.dropout(embeddings)
return embeddings
def create_position_ids_from_inputs_embeds(self, inputs_embeds):
"""
We are provided embeddings directly, so we cannot infer which tokens are padded; just generate sequential position ids.
Args:
inputs_embeds: torch.Tensor
Returns: torch.Tensor
"""
input_shape = inputs_embeds.size()[:-1]
sequence_length = input_shape[1]
position_ids = torch.arange(
self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
)
return position_ids.unsqueeze(0).expand(input_shape)
class RobertaPreLayerNormSelfAttention(nn.Module):
def __init__(self, config, position_embedding_type=None):
super().__init__()
if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
raise ValueError(
f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
f"heads ({config.num_attention_heads})"
)
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.query = nn.Linear(config.hidden_size, self.all_head_size)
self.key = nn.Linear(config.hidden_size, self.all_head_size)
self.value = nn.Linear(config.hidden_size, self.all_head_size)
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
self.position_embedding_type = position_embedding_type or getattr(
config, "position_embedding_type", "absolute"
)
if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
self.max_position_embeddings = config.max_position_embeddings
self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)
self.is_decoder = config.is_decoder
def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
x = x.view(new_x_shape)
return x.permute(0, 2, 1, 3)
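The reshaping done by `transpose_for_scores` is easier to follow on concrete shapes; a small sketch with toy dimensions (illustrative only, not library code):

```
import torch

batch, seq, heads, head_dim = 2, 5, 12, 64
x = torch.randn(batch, seq, heads * head_dim)  # (2, 5, 768): fused Q/K/V projection output
x = x.view(batch, seq, heads, head_dim)        # (2, 5, 12, 64): split out the head dimension
x = x.permute(0, 2, 1, 3)                      # (2, 12, 5, 64): heads act like a batch dim
print(x.shape)
```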
class RobertaPreLayerNormSelfOutput(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = hidden_states + input_tensor
return hidden_states
class RobertaPreLayerNormAttention(nn.Module):
def __init__(self, config, position_embedding_type=None):
super().__init__()
self.self = RobertaPreLayerNormSelfAttention(config, position_embedding_type=position_embedding_type)
self.output = RobertaPreLayerNormSelfOutput(config)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.pruned_heads = set()
def prune_heads(self, heads):
if len(heads) == 0:
return
heads, index = find_pruneable_heads_and_indices(
heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
)
self.self.query = prune_linear_layer(self.self.query, index)
self.self.key = prune_linear_layer(self.self.key, index)
self.self.value = prune_linear_layer(self.self.value, index)
self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
self.pruned_heads = self.pruned_heads.union(heads)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor]:
hidden_states_pre_layer_norm = self.LayerNorm(hidden_states)
self_outputs = self.self(
hidden_states_pre_layer_norm,
attention_mask,
head_mask,
encoder_hidden_states,
encoder_attention_mask,
past_key_value,
output_attentions,
)
attention_output = self.output(self_outputs[0], hidden_states)
outputs = (attention_output,) + self_outputs[1:]
return outputs
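This `forward` is what gives the model its name: LayerNorm is applied *before* the attention sub-layer, and the residual connection adds back the *un-normalized* input. A minimal sketch of the pattern, with a `Linear` standing in for attention (illustrative only):

```
import torch
from torch import nn

hidden = torch.randn(2, 5, 8)
layer_norm = nn.LayerNorm(8)
sublayer = nn.Linear(8, 8)  # stand-in for the attention block
pre_ln = hidden + sublayer(layer_norm(hidden))   # normalize, transform, then residual
post_ln = layer_norm(hidden + sublayer(hidden))  # classic BERT/RoBERTa ordering, for contrast
print(pre_ln.shape, post_ln.shape)
```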
class RobertaPreLayerNormIntermediate(nn.Module):
def __init__(self, config):
super().__init__()
# Pre-layer-norm: normalize before the feed-forward up-projection
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.LayerNorm(hidden_states)
hidden_states = self.dense(hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states
class RobertaPreLayerNormOutput(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = hidden_states + input_tensor
return hidden_states
class RobertaPreLayerNormLayer(nn.Module):
def __init__(self, config):
super().__init__()
self.chunk_size_feed_forward = config.chunk_size_feed_forward
self.seq_len_dim = 1
self.attention = RobertaPreLayerNormAttention(config)
self.is_decoder = config.is_decoder
self.add_cross_attention = config.add_cross_attention
if self.add_cross_attention:
if not self.is_decoder:
raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
self.crossattention = RobertaPreLayerNormAttention(config, position_embedding_type="absolute")
self.intermediate = RobertaPreLayerNormIntermediate(config)
self.output = RobertaPreLayerNormOutput(config)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor]:
self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
self_attention_outputs = self.attention(
hidden_states,
attention_mask,
head_mask,
output_attentions=output_attentions,
past_key_value=self_attn_past_key_value,
)
attention_output = self_attention_outputs[0]
if self.is_decoder:
outputs = self_attention_outputs[1:-1]
present_key_value = self_attention_outputs[-1]
else:
outputs = self_attention_outputs[1:]
cross_attn_present_key_value = None
if self.is_decoder and encoder_hidden_states is not None:
if not hasattr(self, "crossattention"):
raise ValueError(
f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers"
" by setting `config.add_cross_attention=True`"
)
cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
cross_attention_outputs = self.crossattention(
attention_output,
attention_mask,
head_mask,
encoder_hidden_states,
encoder_attention_mask,
cross_attn_past_key_value,
output_attentions,
)
attention_output = cross_attention_outputs[0]
outputs = outputs + cross_attention_outputs[1:-1]
cross_attn_present_key_value = cross_attention_outputs[-1]
present_key_value = present_key_value + cross_attn_present_key_value
layer_output = apply_chunking_to_forward(
self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
)
outputs = (layer_output,) + outputs
if self.is_decoder:
outputs = outputs + (present_key_value,)
return outputs
def feed_forward_chunk(self, attention_output):
intermediate_output = self.intermediate(attention_output)
layer_output = self.output(intermediate_output, attention_output)
return layer_output
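`apply_chunking_to_forward` in `feed_forward_chunk` slices the sequence dimension into chunks of `chunk_size_feed_forward` and runs the feed-forward on each slice, trading extra calls for lower peak memory. A quick equivalence check with a toy function (assumes a recent `transformers` install):

```
import torch
from transformers.pytorch_utils import apply_chunking_to_forward

def ff(x):  # stand-in for intermediate + output
    return x * 2 + 1

x = torch.randn(1, 6, 4)
full = ff(x)
chunked = apply_chunking_to_forward(ff, 2, 1, x)  # chunk_size=2 over seq dim 1
print(torch.allclose(full, chunked))  # True: same result, smaller intermediate tensors
```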
class RobertaPreLayerNormEncoder(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
self.layer = nn.ModuleList([RobertaPreLayerNormLayer(config) for _ in range(config.num_hidden_layers)])
self.gradient_checkpointing = False
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = False,
output_hidden_states: Optional[bool] = False,
return_dict: Optional[bool] = True,
) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
all_hidden_states = () if output_hidden_states else None
all_self_attentions = () if output_attentions else None
all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
if self.gradient_checkpointing and self.training:
if use_cache:
logger.warning_once(
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
)
use_cache = False
next_decoder_cache = () if use_cache else None
for i, layer_module in enumerate(self.layer):
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
layer_head_mask = head_mask[i] if head_mask is not None else None
past_key_value = past_key_values[i] if past_key_values is not None else None
if self.gradient_checkpointing and self.training:
layer_outputs = self._gradient_checkpointing_func(
layer_module.__call__,
hidden_states,
attention_mask,
layer_head_mask,
encoder_hidden_states,
encoder_attention_mask,
past_key_value,
output_attentions,
)
else:
layer_outputs = layer_module(
hidden_states,
attention_mask,
layer_head_mask,
encoder_hidden_states,
encoder_attention_mask,
past_key_value,
output_attentions,
)
hidden_states = layer_outputs[0]
if use_cache:
next_decoder_cache += (layer_outputs[-1],)
if output_attentions:
all_self_attentions = all_self_attentions + (layer_outputs[1],)
if self.config.add_cross_attention:
all_cross_attentions = all_cross_attentions + (layer_outputs[2],)
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
return tuple(
v
for v in [
hidden_states,
next_decoder_cache,
all_hidden_states,
all_self_attentions,
all_cross_attentions,
]
if v is not None
)
return BaseModelOutputWithPastAndCrossAttentions(
last_hidden_state=hidden_states,
past_key_values=next_decoder_cache,
hidden_states=all_hidden_states,
attentions=all_self_attentions,
cross_attentions=all_cross_attentions,
)
class RobertaPreLayerNormPooler(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.activation = nn.Tanh()
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
first_token_tensor = hidden_states[:, 0]
pooled_output = self.dense(first_token_tensor)
pooled_output = self.activation(pooled_output)
return pooled_output
class RobertaPreLayerNormPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models.
"""
config_class = RobertaPreLayerNormConfig
base_model_prefix = "roberta_prelayernorm"
supports_gradient_checkpointing = True
_no_split_modules = ["RobertaPreLayerNormEmbeddings", "RobertaPreLayerNormSelfAttention"]
def _init_weights(self, module):
"""初始化权重"""
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
ROBERTA_PRELAYERNORM_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`RobertaPreLayerNormConfig`]): Model configuration class with all the parameters of the
model. Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
ROBERTA_PRELAYERNORM_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `({0})`):
Indices of input sequence tokens in the vocabulary.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
- 0 corresponds to a *sentence A* token,
- 1 corresponds to a *sentence B* token.
This parameter can only be used when the model is initialized with `type_vocab_size` >= 2. All values in
this tensor should always be < `type_vocab_size`.
[What are token type IDs?](../glossary#token-type-ids)
position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
config.max_position_embeddings - 1]`.
[What are position IDs?](../glossary#position-ids)
head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
model's internal embedding lookup matrix.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@add_start_docstrings(
"The bare RoBERTa-PreLayerNorm Model transformer outputting raw hidden-states without any specific head on top.",
ROBERTA_PRELAYERNORM_START_DOCSTRING,
)
class RobertaPreLayerNormModel(RobertaPreLayerNormPreTrainedModel):
"""
The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
cross-attention is added between the self-attention layers, following the architecture described in the paper
*Attention is all you need* by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones,
Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
To behave as a decoder, the model needs to be initialized with `is_decoder` set to `True`. To be used in a
Seq2Seq model, it needs both `is_decoder` and `add_cross_attention` set to `True`; `encoder_hidden_states` is
then expected as an input to the forward pass.
.. _*Attention is all you need*: https://arxiv.org/abs/1706.03762
"""
def __init__(self, config, add_pooling_layer=True):
super().__init__(config)
self.config = config
self.embeddings = RobertaPreLayerNormEmbeddings(config)
self.encoder = RobertaPreLayerNormEncoder(config)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.pooler = RobertaPreLayerNormPooler(config) if add_pooling_layer else None
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
"""
Return the input word embedding layer.
"""
return self.embeddings.word_embeddings
def set_input_embeddings(self, value):
"""
Set the input word embedding layer.
"""
self.embeddings.word_embeddings = value
def _prune_heads(self, heads_to_prune):
"""
Prune attention heads. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}; see the base class PreTrainedModel.
"""
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
@add_start_docstrings_to_model_forward(ROBERTA_PRELAYERNORM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPoolingAndCrossAttentions,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
"""
Forward pass; see the parameter documentation referenced in the decorators above.
"""
@add_start_docstrings(
"""
RoBERTa-PreLayerNorm Model with a `language modeling` head on top for CLM fine-tuning.
"""
# 将 RoBERTa-PreLayerNorm 模型与 CLM 微调的语言建模头部组合在一起
ROBERTA_PRELAYERNORM_START_DOCSTRING,
# Copied from transformers.models.roberta.modeling_roberta.RobertaForCausalLM with modifications to support a different model configuration
class RobertaPreLayerNormForCausalLM(RobertaPreLayerNormPreTrainedModel):
# Define keys for tied weights in the model
_tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]
def __init__(self, config):
super().__init__(config)
# Warn if the model is not configured as a decoder
if not config.is_decoder:
logger.warning(
"If you want to use `RobertaPreLayerNormLMHeadModel` as a standalone, add `is_decoder=True.`"
)
# Initialize the Roberta model with pre-layer normalization and without pooling layer
self.roberta_prelayernorm = RobertaPreLayerNormModel(config, add_pooling_layer=False)
# Initialize the language model head for pre-layer normalization
self.lm_head = RobertaPreLayerNormLMHead(config)
# Initialize weights and apply final processing
self.post_init()
def get_output_embeddings(self):
# Return the decoder part of the language model head
return self.lm_head.decoder
def set_output_embeddings(self, new_embeddings):
# Set new embeddings for the decoder part of the language model head
self.lm_head.decoder = new_embeddings
@add_start_docstrings_to_model_forward(ROBERTA_PRELAYERNORM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
past_key_values: Tuple[Tuple[torch.FloatTensor]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
# Prepare inputs for generation, used during the generation loop
def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, **model_kwargs):
# Shape of the incoming ids
input_shape = input_ids.shape
# Without an attention mask, default to attending to every position
if attention_mask is None:
attention_mask = input_ids.new_ones(input_shape)
# With a cache (past_key_values), only the not-yet-processed ids need to be fed
if past_key_values is not None:
# Length already covered by the cache (usually the processed sequence length)
past_length = past_key_values[0][0].shape[2]
# If the input is longer than the cache, drop the prefix the cache already covers
if input_ids.shape[1] > past_length:
remove_prefix_length = past_length
else:
# Otherwise, default to keeping only the final input id
remove_prefix_length = input_ids.shape[1] - 1
input_ids = input_ids[:, remove_prefix_length:]
# Return the input ids, attention mask and cache
return {"input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": past_key_values}
# Reorder the cached states to match the beam order during beam search
def _reorder_cache(self, past_key_values, beam_idx):
reordered_past = ()
# Reorder each layer's past states
for layer_past in past_key_values:
reordered_past += (
# Select the surviving beams with beam_idx, moved to each past state's device
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
)
# Return the reordered tuple of past states
return reordered_past
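What `_reorder_cache` does is easiest to see on a toy cache: after each beam-search step, the batch rows of every cached key/value tensor must be remapped to the beams that survived. A small sketch (toy shapes, not library code):

```
import torch

# One cached key tensor: (num_beams=3, heads=2, seq=1, head_dim=4)
past_state = torch.arange(24, dtype=torch.float32).reshape(3, 2, 1, 4)
beam_idx = torch.tensor([2, 0, 2])  # beams 2, 0, 2 survived this step
reordered = past_state.index_select(0, beam_idx)
print(torch.equal(reordered[0], past_state[2]))  # True: row 0 now holds beam 2's cache
```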
# RoBERTa-PreLayerNorm model with a language modeling head on top
@add_start_docstrings(
"""RoBERTa-PreLayerNorm Model with a `language modeling` head on top.""", ROBERTA_PRELAYERNORM_START_DOCSTRING
)
class RobertaPreLayerNormForMaskedLM(RobertaPreLayerNormPreTrainedModel):
# Keys of weights tied to the input embeddings
_tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]
# Copied from transformers.models.roberta.modeling_roberta.RobertaForMaskedLM.__init__ with ROBERTA->ROBERTA_PRELAYERNORM, Roberta->RobertaPreLayerNorm, roberta->roberta_prelayernorm
def __init__(self, config):
super().__init__(config)
# Warn if the model is configured as a decoder
if config.is_decoder:
logger.warning(
"If you want to use `RobertaPreLayerNormForMaskedLM` make sure `config.is_decoder=False` for "
"bi-directional self-attention."
)
# Initialize the RoBERTa-PreLayerNorm backbone and the LM head
self.roberta_prelayernorm = RobertaPreLayerNormModel(config, add_pooling_layer=False)
self.lm_head = RobertaPreLayerNormLMHead(config)
# Initialize weights and apply final processing
self.post_init()
# Return the output embeddings of the LM head
def get_output_embeddings(self):
return self.lm_head.decoder
# Set the output embeddings of the LM head
def set_output_embeddings(self, new_embeddings):
self.lm_head.decoder = new_embeddings
# Copied from transformers.models.roberta.modeling_roberta.RobertaForMaskedLM.forward with ROBERTA->ROBERTA_PRELAYERNORM, Roberta->RobertaPreLayerNorm, roberta->roberta_prelayernorm
@add_start_docstrings_to_model_forward(ROBERTA_PRELAYERNORM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=MaskedLMOutput,
config_class=_CONFIG_FOR_DOC,
mask="<mask>",
expected_output="' Paris'",
expected_loss=0.69,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
**kwargs,
) -> Union[Tuple[torch.Tensor], MaskedLMOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
kwargs (`Dict[str, any]`, optional, defaults to *{}*):
Used to hide legacy arguments that have been deprecated.
"""
# Decide whether to return a dictionary format based on the provided `return_dict` parameter or the model's configuration
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.roberta_prelayernorm(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
prediction_scores = self.lm_head(sequence_output)
masked_lm_loss = None
if labels is not None:
labels = labels.to(prediction_scores.device)
loss_fct = CrossEntropyLoss()
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
if not return_dict:
output = (prediction_scores,) + outputs[2:]
return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
return MaskedLMOutput(
loss=masked_lm_loss,
logits=prediction_scores,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
class RobertaPreLayerNormLMHead(nn.Module):
"""RobertaPreLayerNorm Head for masked language modeling."""
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.decoder = nn.Linear(config.hidden_size, config.vocab_size)
self.bias = nn.Parameter(torch.zeros(config.vocab_size))
self.decoder.bias = self.bias
def forward(self, features, **kwargs):
x = self.dense(features)
x = gelu(x)
x = self.layer_norm(x)
x = self.decoder(x)
return x
def _tie_weights(self):
if self.decoder.bias.device.type == "meta":
self.decoder.bias = self.bias
else:
self.bias = self.decoder.bias
@add_start_docstrings(
"""
RoBERTa-PreLayerNorm Model transformer with a sequence classification/regression head on top (a linear layer on top
of the pooled output) e.g. for GLUE tasks.
""",
ROBERTA_PRELAYERNORM_START_DOCSTRING,
)
class RobertaPreLayerNormForSequenceClassification(RobertaPreLayerNormPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.config = config
self.roberta_prelayernorm = RobertaPreLayerNormModel(config, add_pooling_layer=False)
self.classifier = RobertaPreLayerNormClassificationHead(config)
self.post_init()
@add_start_docstrings_to_model_forward(ROBERTA_PRELAYERNORM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=SequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.roberta_prelayernorm(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
logits = self.classifier(sequence_output)
loss = None
if labels is not None:
labels = labels.to(logits.device)
if self.config.problem_type is None:
if self.num_labels == 1:
self.config.problem_type = "regression"
elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
self.config.problem_type = "single_label_classification"
else:
self.config.problem_type = "multi_label_classification"
if self.config.problem_type == "regression":
loss_fct = MSELoss()
if self.num_labels == 1:
loss = loss_fct(logits.squeeze(), labels.squeeze())
else:
loss = loss_fct(logits, labels)
elif self.config.problem_type == "single_label_classification":
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(logits, labels)
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return SequenceClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
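When `config.problem_type` is unset, the branch above infers it from `num_labels` and the label dtype: a single label means regression, integer labels mean single-label classification, float labels mean multi-label classification. A sketch of the three loss paths with toy tensors (illustrative only):

```
import torch
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

logits = torch.randn(4, 3)
# single_label_classification: integer class ids
ce = CrossEntropyLoss()(logits.view(-1, 3), torch.tensor([0, 2, 1, 1]))
# multi_label_classification: independent 0/1 float targets per class
bce = BCEWithLogitsLoss()(logits, torch.tensor([[1.0, 0.0, 1.0]] * 4))
# regression (num_labels == 1): squeeze the logits and compare to float targets
mse = MSELoss()(torch.randn(4, 1).squeeze(), torch.randn(4))
print(ce.item(), bce.item(), mse.item())
```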
@add_start_docstrings(
"""
RobertaPreLayerNorm Model with a multiple choice classification head on top (a linear layer on top of the pooled
output and a softmax) e.g. for RocStories/SWAG tasks.
""",
ROBERTA_PRELAYERNORM_START_DOCSTRING,
)
class RobertaPreLayerNormForMultipleChoice(RobertaPreLayerNormPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.roberta_prelayernorm = RobertaPreLayerNormModel(config)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, 1)
self.post_init()
@add_start_docstrings_to_model_forward(
ROBERTA_PRELAYERNORM_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")
)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=MultipleChoiceModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], MultipleChoiceModelOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
`input_ids` above)
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
flat_inputs_embeds = (
inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
if inputs_embeds is not None
else None
)
outputs = self.roberta_prelayernorm(
flat_input_ids,
position_ids=flat_position_ids,
token_type_ids=flat_token_type_ids,
attention_mask=flat_attention_mask,
head_mask=head_mask,
inputs_embeds=flat_inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
pooled_output = outputs[1]
pooled_output = self.dropout(pooled_output)
logits = self.classifier(pooled_output)
reshaped_logits = logits.view(-1, num_choices)
loss = None
if labels is not None:
labels = labels.to(reshaped_logits.device)
loss_fct = CrossEntropyLoss()
loss = loss_fct(reshaped_logits, labels)
if not return_dict:
output = (reshaped_logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return MultipleChoiceModelOutput(
loss=loss,
logits=reshaped_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
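The multiple-choice head relies on a flatten/score/reshape pattern: choices are folded into the batch dimension so the backbone sees ordinary sequences, and the per-choice scores are reshaped back before the cross-entropy over choices. A shape-only sketch (toy values):

```
import torch

batch_size, num_choices, seq_len = 2, 4, 7
input_ids = torch.randint(0, 100, (batch_size, num_choices, seq_len))
flat_input_ids = input_ids.view(-1, input_ids.size(-1))  # (8, 7): each choice is its own row
logits = torch.randn(flat_input_ids.size(0), 1)          # one score per (example, choice)
reshaped_logits = logits.view(-1, num_choices)           # (2, 4): cross-entropy picks a choice
print(flat_input_ids.shape, reshaped_logits.shape)
```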
class RobertaPreLayerNormForTokenClassification(RobertaPreLayerNormPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.roberta_prelayernorm = RobertaPreLayerNormModel(config, add_pooling_layer=False)
classifier_dropout = (
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
self.dropout = nn.Dropout(classifier_dropout)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.post_init()
@add_start_docstrings_to_model_forward(ROBERTA_PRELAYERNORM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TokenClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.roberta_prelayernorm(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
sequence_output = self.dropout(sequence_output)
logits = self.classifier(sequence_output)
loss = None
if labels is not None:
labels = labels.to(logits.device)
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return TokenClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
class RobertaPreLayerNormClassificationHead(nn.Module):
"""用于句子级分类任务的头部模块。"""
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
classifier_dropout = (
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
self.dropout = nn.Dropout(classifier_dropout)
self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
def forward(self, features, **kwargs):
x = features[:, 0, :]
x = self.dropout(x)
x = self.dense(x)
x = torch.tanh(x)
x = self.dropout(x)
x = self.out_proj(x)
return x
@add_start_docstrings(
"""
RobertaPreLayerNorm Model with a span classification head on top for extractive question-answering tasks like
SQuAD (linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
ROBERTA_PRELAYERNORM_START_DOCSTRING,
)
class RobertaPreLayerNormForQuestionAnswering(RobertaPreLayerNormPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.roberta_prelayernorm = RobertaPreLayerNormModel(config, add_pooling_layer=False)
self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
self.post_init()
@add_start_docstrings_to_model_forward(ROBERTA_PRELAYERNORM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=QuestionAnsweringModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
start_positions: Optional[torch.LongTensor] = None,
end_positions: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], QuestionAnsweringModelOutput]:
r"""
start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.roberta_prelayernorm(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
logits = self.qa_outputs(sequence_output)
start_logits, end_logits = logits.split(1, dim=-1)
start_logits = start_logits.squeeze(-1).contiguous()
end_logits = end_logits.squeeze(-1).contiguous()
total_loss = None
if start_positions is not None and end_positions is not None:
if len(start_positions.size()) > 1:
start_positions = start_positions.squeeze(-1)
if len(end_positions.size()) > 1:
end_positions = end_positions.squeeze(-1)
ignored_index = start_logits.size(1)
start_positions = start_positions.clamp(0, ignored_index)
end_positions = end_positions.clamp(0, ignored_index)
loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
start_loss = loss_fct(start_logits, start_positions)
end_loss = loss_fct(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2
if not return_dict:
output = (start_logits, end_logits) + outputs[2:]
return ((total_loss,) + output) if total_loss is not None else output
return QuestionAnsweringModelOutput(
loss=total_loss,
start_logits=start_logits,
end_logits=end_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
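The clamping above is a small but important trick: labels that fall outside the (possibly truncated) sequence are clamped to `sequence_length`, and that same index is passed as `ignore_index`, so those examples simply drop out of the loss. A sketch with toy tensors:

```
import torch
from torch.nn import CrossEntropyLoss

start_logits = torch.randn(2, 10)                           # seq_len = 10
start_positions = torch.tensor([3, 25])                     # 25 lies outside the sequence
ignored_index = start_logits.size(1)                        # 10
start_positions = start_positions.clamp(0, ignored_index)   # -> [3, 10]
loss = CrossEntropyLoss(ignore_index=ignored_index)(start_logits, start_positions)
print(loss)  # only the first example contributes to the loss
```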
def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
"""
Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
are ignored. This is modified from fairseq's `utils.make_positions`.
Args:
input_ids: torch.Tensor of input sequence token ids
padding_idx: index of the padding symbol, whose positions are ignored
past_key_values_length: length of cached key/values, used to offset the incremental indices
Returns:
torch.Tensor of position ids
"""
mask = input_ids.ne(padding_idx).int()
incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
return incremental_indices.long() + padding_idx
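A worked example of this function (RoBERTa reserves `padding_idx = 1`, so real positions start at `padding_idx + 1 = 2`; toy ids, `past_key_values_length = 0`):

```
import torch

padding_idx = 1
input_ids = torch.tensor([[0, 31414, 232, 2, 1, 1]])  # two trailing pads
mask = input_ids.ne(padding_idx).int()                # [[1, 1, 1, 1, 0, 0]]
incremental = torch.cumsum(mask, dim=1).type_as(mask) * mask
print(incremental.long() + padding_idx)  # tensor([[2, 3, 4, 5, 1, 1]]): pads stay at padding_idx
```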
.\models\roberta_prelayernorm\modeling_tf_roberta_prelayernorm.py
""" TF 2.0 RoBERTa-PreLayerNorm 模型。"""
from __future__ import annotations
import math
import warnings
from typing import Optional, Tuple, Union
import numpy as np
import tensorflow as tf
from ...activations_tf import get_tf_activation
from ...modeling_tf_outputs import (
TFBaseModelOutputWithPastAndCrossAttentions,
TFBaseModelOutputWithPoolingAndCrossAttentions,
TFCausalLMOutputWithCrossAttentions,
TFMaskedLMOutput,
TFMultipleChoiceModelOutput,
TFQuestionAnsweringModelOutput,
TFSequenceClassifierOutput,
TFTokenClassifierOutput,
)
from ...modeling_tf_utils import (
TFCausalLanguageModelingLoss,
TFMaskedLanguageModelingLoss,
TFModelInputType,
TFMultipleChoiceLoss,
TFPreTrainedModel,
TFQuestionAnsweringLoss,
TFSequenceClassificationLoss,
TFTokenClassificationLoss,
get_initializer,
keras,
keras_serializable,
unpack_inputs,
)
from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax
from ...utils import (
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
)
from .configuration_roberta_prelayernorm import RobertaPreLayerNormConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "andreasmadsen/efficient_mlm_m0.40"
_CONFIG_FOR_DOC = "RobertaPreLayerNormConfig"
TF_ROBERTA_PRELAYERNORM_PRETRAINED_MODEL_ARCHIVE_LIST = [
"andreasmadsen/efficient_mlm_m0.15",
"andreasmadsen/efficient_mlm_m0.20",
"andreasmadsen/efficient_mlm_m0.30",
"andreasmadsen/efficient_mlm_m0.40",
"andreasmadsen/efficient_mlm_m0.50",
"andreasmadsen/efficient_mlm_m0.60",
"andreasmadsen/efficient_mlm_m0.70",
"andreasmadsen/efficient_mlm_m0.80",
]
class TFRobertaPreLayerNormEmbeddings(keras.layers.Layer):
"""
Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
"""
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.padding_idx = 1
self.config = config
self.hidden_size = config.hidden_size
self.max_position_embeddings = config.max_position_embeddings
self.initializer_range = config.initializer_range
self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
def build(self, input_shape=None):
with tf.name_scope("word_embeddings"):
self.weight = self.add_weight(
name="weight",
shape=[self.config.vocab_size, self.hidden_size],
initializer=get_initializer(self.initializer_range),
)
with tf.name_scope("token_type_embeddings"):
self.token_type_embeddings = self.add_weight(
name="embeddings",
shape=[self.config.type_vocab_size, self.hidden_size],
initializer=get_initializer(self.initializer_range),
)
with tf.name_scope("position_embeddings"):
self.position_embeddings = self.add_weight(
name="embeddings",
shape=[self.max_position_embeddings, self.hidden_size],
initializer=get_initializer(self.initializer_range),
)
if self.built:
return
self.built = True
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
def create_position_ids_from_input_ids(self, input_ids, past_key_values_length=0):
"""
Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding
symbols are ignored. This is modified from fairseq's `utils.make_positions`.
Args:
input_ids: tf.Tensor of input token ids
Returns: tf.Tensor of generated position ids
"""
mask = tf.cast(tf.math.not_equal(input_ids, self.padding_idx), dtype=input_ids.dtype)
incremental_indices = (tf.math.cumsum(mask, axis=1) + past_key_values_length) * mask
return incremental_indices + self.padding_idx
def call(
self,
input_ids=None,
position_ids=None,
token_type_ids=None,
inputs_embeds=None,
past_key_values_length=0,
training=False,
):
"""
Applies embedding based on inputs tensor.
Returns:
final_embeddings (`tf.Tensor`): output embedding tensor.
"""
assert not (input_ids is None and inputs_embeds is None)
if input_ids is not None:
check_embeddings_within_bounds(input_ids, self.config.vocab_size)
inputs_embeds = tf.gather(params=self.weight, indices=input_ids)
input_shape = shape_list(inputs_embeds)[:-1]
if token_type_ids is None:
token_type_ids = tf.fill(dims=input_shape, value=0)
if position_ids is None:
if input_ids is not None:
position_ids = self.create_position_ids_from_input_ids(
input_ids=input_ids, past_key_values_length=past_key_values_length
)
else:
position_ids = tf.expand_dims(
tf.range(start=self.padding_idx + 1, limit=input_shape[-1] + self.padding_idx + 1), axis=0
)
position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids)
token_type_embeds = tf.gather(params=self.token_type_embeddings, indices=token_type_ids)
final_embeddings = inputs_embeds + position_embeds + token_type_embeds
final_embeddings = self.LayerNorm(inputs=final_embeddings)
final_embeddings = self.dropout(inputs=final_embeddings, training=training)
return final_embeddings
class TFRobertaPreLayerNormPooler(keras.layers.Layer):
def __init__(self, config: RobertaPreLayerNormConfig, **kwargs):
super().__init__(**kwargs)
self.dense = keras.layers.Dense(
units=config.hidden_size,
kernel_initializer=get_initializer(config.initializer_range),
activation="tanh",
name="dense",
)
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
first_token_tensor = hidden_states[:, 0]
pooled_output = self.dense(inputs=first_token_tensor)
return pooled_output
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
class TFRobertaPreLayerNormSelfAttention(keras.layers.Layer):
def __init__(self, config: RobertaPreLayerNormConfig, **kwargs):
super().__init__(**kwargs)
if config.hidden_size % config.num_attention_heads != 0:
raise ValueError(
f"The hidden size ({config.hidden_size}) is not a multiple of the number "
f"of attention heads ({config.num_attention_heads})"
)
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.sqrt_att_head_size = math.sqrt(self.attention_head_size)
self.query = keras.layers.Dense(
units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query"
)
self.key = keras.layers.Dense(
units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key"
)
self.value = keras.layers.Dense(
units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value"
)
self.dropout = keras.layers.Dropout(rate=config.attention_probs_dropout_prob)
self.is_decoder = config.is_decoder
self.config = config
def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor:
# [batch_size, seq_length, all_head_size] -> [batch_size, num_attention_heads, seq_length, attention_head_size]
tensor = tf.reshape(tensor=tensor, shape=(batch_size, -1, self.num_attention_heads, self.attention_head_size))
return tf.transpose(tensor, perm=[0, 2, 1, 3])
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "query", None) is not None:
with tf.name_scope(self.query.name):
self.query.build([None, None, self.config.hidden_size])
if getattr(self, "key", None) is not None:
with tf.name_scope(self.key.name):
self.key.build([None, None, self.config.hidden_size])
if getattr(self, "value", None) is not None:
with tf.name_scope(self.value.name):
self.value.build([None, None, self.config.hidden_size])
class TFRobertaPreLayerNormAttention(keras.layers.Layer):
def __init__(self, config: RobertaPreLayerNormConfig, **kwargs):
super().__init__(**kwargs)
self.self_attention = TFRobertaPreLayerNormSelfAttention(config, name="self")
self.dense_output = TFRobertaPreLayerNormSelfOutput(config, name="output")
self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.config = config
def prune_heads(self, heads):
raise NotImplementedError
def call(
self,
input_tensor: tf.Tensor,
attention_mask: tf.Tensor,
head_mask: tf.Tensor,
encoder_hidden_states: tf.Tensor,
encoder_attention_mask: tf.Tensor,
past_key_value: Tuple[tf.Tensor],
output_attentions: bool,
training: bool = False,
) -> Tuple[tf.Tensor]:
hidden_states_pre_layer_norm = self.LayerNorm(inputs=input_tensor)
self_outputs = self.self_attention(
hidden_states=hidden_states_pre_layer_norm,
attention_mask=attention_mask,
head_mask=head_mask,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
past_key_value=past_key_value,
output_attentions=output_attentions,
training=training,
)
attention_output = self.dense_output(
hidden_states=self_outputs[0], input_tensor=input_tensor, training=training
)
outputs = (attention_output,) + self_outputs[1:]
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "self_attention", None) is not None:
with tf.name_scope(self.self_attention.name):
self.self_attention.build(None)
if getattr(self, "dense_output", None) is not None:
with tf.name_scope(self.dense_output.name):
self.dense_output.build(None)
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
class TFRobertaPreLayerNormIntermediate(keras.layers.Layer):
def __init__(self, config: RobertaPreLayerNormConfig, **kwargs):
super().__init__(**kwargs)
self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dense = keras.layers.Dense(
units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = get_tf_activation(config.hidden_act)
else:
self.intermediate_act_fn = config.hidden_act
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
hidden_states = self.LayerNorm(inputs=hidden_states)
hidden_states = self.dense(inputs=hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
class TFRobertaPreLayerNormOutput(keras.layers.Layer):
def __init__(self, config: RobertaPreLayerNormConfig, **kwargs):
super().__init__(**kwargs)
self.dense = keras.layers.Dense(
units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
hidden_states = self.dropout(inputs=hidden_states, training=training)
hidden_states = hidden_states + input_tensor
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.intermediate_size])
class TFRobertaPreLayerNormLayer(keras.layers.Layer):
def __init__(self, config: RobertaPreLayerNormConfig, **kwargs):
super().__init__(**kwargs)
self.attention = TFRobertaPreLayerNormAttention(config, name="attention")
self.is_decoder = config.is_decoder
self.add_cross_attention = config.add_cross_attention
if self.add_cross_attention:
if not self.is_decoder:
raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
self.crossattention = TFRobertaPreLayerNormAttention(config, name="crossattention")
self.intermediate = TFRobertaPreLayerNormIntermediate(config, name="intermediate")
self.bert_output = TFRobertaPreLayerNormOutput(config, name="output")
def call(
self,
hidden_states: tf.Tensor,
attention_mask: tf.Tensor,
head_mask: tf.Tensor,
encoder_hidden_states: tf.Tensor | None,
encoder_attention_mask: tf.Tensor | None,
past_key_value: Tuple[tf.Tensor] | None,
output_attentions: bool,
training: bool = False,
) -> Tuple[tf.Tensor]:
self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
self_attention_outputs = self.attention(
input_tensor=hidden_states,
attention_mask=attention_mask,
head_mask=head_mask,
encoder_hidden_states=None,
encoder_attention_mask=None,
past_key_value=self_attn_past_key_value,
output_attentions=output_attentions,
training=training,
)
attention_output = self_attention_outputs[0]
if self.is_decoder:
outputs = self_attention_outputs[1:-1]
present_key_value = self_attention_outputs[-1]
else:
outputs = self_attention_outputs[1:]
cross_attn_present_key_value = None
if self.is_decoder and encoder_hidden_states is not None:
if not hasattr(self, "crossattention"):
raise ValueError(
f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers"
" by setting `config.add_cross_attention=True`"
)
cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
cross_attention_outputs = self.crossattention(
input_tensor=attention_output,
attention_mask=attention_mask,
head_mask=head_mask,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
past_key_value=cross_attn_past_key_value,
output_attentions=output_attentions,
training=training,
)
attention_output = cross_attention_outputs[0]
outputs = outputs + cross_attention_outputs[1:-1]
cross_attn_present_key_value = cross_attention_outputs[-1]
present_key_value = present_key_value + cross_attn_present_key_value
intermediate_output = self.intermediate(hidden_states=attention_output)
layer_output = self.bert_output(
hidden_states=intermediate_output, input_tensor=attention_output, training=training
)
outputs = (layer_output,) + outputs
if self.is_decoder:
outputs = outputs + (present_key_value,)
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "attention", None) is not None:
with tf.name_scope(self.attention.name):
self.attention.build(None)
if getattr(self, "intermediate", None) is not None:
with tf.name_scope(self.intermediate.name):
self.intermediate.build(None)
if getattr(self, "bert_output", None) is not None:
with tf.name_scope(self.bert_output.name):
self.bert_output.build(None)
if getattr(self, "crossattention", None) is not None:
with tf.name_scope(self.crossattention.name):
self.crossattention.build(None)
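# The slicing of past_key_value in TFRobertaPreLayerNormLayer.call follows a fixed
# per-layer layout: the first two entries are the self-attention cache, the last two
# the cross-attention cache. A toy sketch of that convention (placeholder strings
# stand in for the real key/value tensors):
past_key_value = ("self_k", "self_v", "cross_k", "cross_v")
self_attn_past_key_value = past_key_value[:2]    # ("self_k", "self_v")
cross_attn_past_key_value = past_key_value[-2:]  # ("cross_k", "cross_v")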
class TFRobertaPreLayerNormEncoder(keras.layers.Layer):
def __init__(self, config: RobertaPreLayerNormConfig, **kwargs):
super().__init__(**kwargs)
self.config = config
self.layer = [TFRobertaPreLayerNormLayer(config, name=f"layer_._{i}") for i in range(config.num_hidden_layers)]
def call(
self,
hidden_states: tf.Tensor,
attention_mask: tf.Tensor,
head_mask: tf.Tensor,
encoder_hidden_states: tf.Tensor | None,
encoder_attention_mask: tf.Tensor | None,
past_key_values: Tuple[Tuple[tf.Tensor]] | None,
use_cache: Optional[bool],
output_attentions: bool,
output_hidden_states: bool,
return_dict: bool,
training: bool = False,
) -> Union[TFBaseModelOutputWithPastAndCrossAttentions, Tuple[tf.Tensor]]:
all_hidden_states = () if output_hidden_states else None
all_attentions = () if output_attentions else None
all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
next_decoder_cache = () if use_cache else None
for i, layer_module in enumerate(self.layer):
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
past_key_value = past_key_values[i] if past_key_values is not None else None
layer_outputs = layer_module(
hidden_states=hidden_states,
attention_mask=attention_mask,
head_mask=head_mask[i],
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
past_key_value=past_key_value,
output_attentions=output_attentions,
training=training,
)
hidden_states = layer_outputs[0]
if use_cache:
next_decoder_cache += (layer_outputs[-1],)
if output_attentions:
all_attentions = all_attentions + (layer_outputs[1],)
if self.config.add_cross_attention and encoder_hidden_states is not None:
all_cross_attentions = all_cross_attentions + (layer_outputs[2],)
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
return tuple(
v for v in [hidden_states, all_hidden_states, all_attentions, all_cross_attentions] if v is not None
)
return TFBaseModelOutputWithPastAndCrossAttentions(
last_hidden_state=hidden_states,
past_key_values=next_decoder_cache,
hidden_states=all_hidden_states,
attentions=all_attentions,
cross_attentions=all_cross_attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "layer", None) is not None:
for layer in self.layer:
with tf.name_scope(layer.name):
layer.build(None)
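# The encoder loop above records each layer's *input* hidden states plus one final
# entry after the loop, so all_hidden_states ends up with num_hidden_layers + 1
# entries (embedding output followed by every layer's output). A toy, model-free
# sketch of that accumulation pattern:
def toy_encoder(hidden_states, layers, output_hidden_states=True):
    all_hidden_states = () if output_hidden_states else None
    for layer in layers:
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)
        hidden_states = layer(hidden_states)
    if output_hidden_states:
        all_hidden_states = all_hidden_states + (hidden_states,)  # final entry
    return hidden_states, all_hidden_states

out, states = toy_encoder(1.0, [lambda x: x + 1, lambda x: x + 1])
print(out, states)  # 3.0 (1.0, 2.0, 3.0)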
@keras_serializable
class TFRobertaPreLayerNormMainLayer(keras.layers.Layer):
config_class = RobertaPreLayerNormConfig
def __init__(self, config, add_pooling_layer=True, **kwargs):
super().__init__(**kwargs)
self.config = config
self.is_decoder = config.is_decoder
self.num_hidden_layers = config.num_hidden_layers
self.initializer_range = config.initializer_range
self.output_attentions = config.output_attentions
self.output_hidden_states = config.output_hidden_states
self.return_dict = config.use_return_dict
self.encoder = TFRobertaPreLayerNormEncoder(config, name="encoder")
self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.pooler = TFRobertaPreLayerNormPooler(config, name="pooler") if add_pooling_layer else None
self.embeddings = TFRobertaPreLayerNormEmbeddings(config, name="embeddings")
def get_input_embeddings(self) -> keras.layers.Layer:
return self.embeddings
def set_input_embeddings(self, value: tf.Variable):
self.embeddings.weight = value
self.embeddings.vocab_size = shape_list(value)[0]
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
raise NotImplementedError
@unpack_inputs
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
encoder_hidden_states: np.ndarray | tf.Tensor | None = None,
encoder_attention_mask: np.ndarray | tf.Tensor | None = None,
past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: bool = False,
) -> Union[TFBaseModelOutputWithPoolingAndCrossAttentions, Tuple[tf.Tensor]]:
    ...  # the call body is elided in this listing
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
if getattr(self, "pooler", None) is not None:
with tf.name_scope(self.pooler.name):
self.pooler.build(None)
if getattr(self, "embeddings", None) is not None:
with tf.name_scope(self.embeddings.name):
self.embeddings.build(None)
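# Why the main layer owns a final LayerNorm: pre-LN blocks leave the residual stream
# unnormalized, so (in the call body elided above) the encoder output is normalized
# once more before it reaches the pooler. A standalone toy sketch of that step:
import tensorflow as tf
from tensorflow import keras

final_ln = keras.layers.LayerNormalization(epsilon=1e-5)
encoder_last_hidden_state = tf.random.normal((2, 8, 16))  # toy encoder output
sequence_output = final_ln(encoder_last_hidden_state)     # normalized before pooling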
class TFRobertaPreLayerNormPreTrainedModel(TFPreTrainedModel):
"""
An abstract class that handles weights initialization and the downloading and loading of pretrained models.
"""
config_class = RobertaPreLayerNormConfig
base_model_prefix = "roberta_prelayernorm"
ROBERTA_PRELAYERNORM_START_DOCSTRING = r"""
This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the library implements, such as downloading or saving a model, resizing the input embeddings, pruning heads, etc.
This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matters related to general usage and behavior.
<Tip>
TensorFlow models and layers in `transformers` accept two input formats:
- having all inputs as keyword arguments (like PyTorch models), or
- having all inputs as a list, tuple or dict in the first positional argument.
The second format is supported because Keras methods prefer it when passing inputs to models and layers. Thanks to this support, when using methods like `model.fit()` you can simply pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second format outside of Keras methods (such as when creating your own layers, or when building a model with the Keras `Functional` API), there are three ways to gather all the input tensors in the first positional argument:
- a single tensor with `input_ids` only and nothing else: `model(input_ids)`
- a list of varying length with one or several input tensors, in the order given in the docstring: `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
- a dictionary with one or several input tensors associated with the input names given in the docstring: `model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
Note that when creating models and layers with [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) you don't need to worry about any of this, as you can pass inputs just like you would to any other Python function!
</Tip>
Parameters:
config ([`RobertaPreLayerNormConfig`]): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
ROBERTA_PRELAYERNORM_INPUTS_DOCSTRING = r"""
"""
@add_start_docstrings(
"The bare RoBERTa-PreLayerNorm Model transformer outputting raw hidden-states without any specific head on top.",
ROBERTA_PRELAYERNORM_START_DOCSTRING,
)
class TFRobertaPreLayerNormModel(TFRobertaPreLayerNormPreTrainedModel):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.roberta_prelayernorm = TFRobertaPreLayerNormMainLayer(config, name="roberta_prelayernorm")
@unpack_inputs
@add_start_docstrings_to_model_forward(ROBERTA_PRELAYERNORM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFBaseModelOutputWithPoolingAndCrossAttentions,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
encoder_hidden_states: np.ndarray | tf.Tensor | None = None,
encoder_attention_mask: np.ndarray | tf.Tensor | None = None,
past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: Optional[bool] = False,
) -> Union[Tuple, TFBaseModelOutputWithPoolingAndCrossAttentions]:
r"""
encoder_hidden_states (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
the model is configured as a decoder.
encoder_attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
past_key_values (`Tuple[Tuple[tf.Tensor]]` of length `config.n_layers`):
Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
`decoder_input_ids` of shape `(batch_size, sequence_length)`.
use_cache (`bool`, *optional*, defaults to `True`):
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
`past_key_values`). Set to `False` during training, `True` during generation.
"""
outputs = self.roberta_prelayernorm(
input_ids=input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
past_key_values=past_key_values,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roberta_prelayernorm", None) is not None:
with tf.name_scope(self.roberta_prelayernorm.name):
self.roberta_prelayernorm.build(None)
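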
class TFRobertaPreLayerNormLMHead(keras.layers.Layer):
"""用于预层归一化的Roberta LM头部,用于掩码语言建模。"""
def __init__(self, config, input_embeddings, **kwargs):
super().__init__(**kwargs)
self.config = config
self.hidden_size = config.hidden_size
self.dense = keras.layers.Dense(
config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
self.layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm")
self.act = get_tf_activation("gelu")
self.decoder = input_embeddings
def build(self, input_shape=None):
self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias")
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
if getattr(self, "layer_norm", None) is not None:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build([None, None, self.config.hidden_size])
def get_output_embeddings(self):
return self.decoder
def set_output_embeddings(self, value):
self.decoder.weight = value
self.decoder.vocab_size = shape_list(value)[0]
def get_bias(self):
return {"bias": self.bias}
def set_bias(self, value):
self.bias = value["bias"]
self.config.vocab_size = shape_list(value["bias"])[0]
def call(self, hidden_states):
hidden_states = self.dense(hidden_states)
hidden_states = self.act(hidden_states)
hidden_states = self.layer_norm(hidden_states)
seq_length = shape_list(tensor=hidden_states)[1]
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.hidden_size])
hidden_states = tf.matmul(a=hidden_states, b=self.decoder.weight, transpose_b=True)
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.config.vocab_size])
hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias)
return hidden_states
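# The reshape/matmul/reshape in call above is the usual tied-output projection:
# scores = hidden @ E^T + b, where E is the input embedding matrix reused as
# self.decoder.weight. A toy sketch with hypothetical sizes:
import tensorflow as tf

hidden = tf.random.normal((2, 5, 16))           # (batch, seq_len, hidden_size)
embedding_matrix = tf.random.normal((100, 16))  # (vocab_size, hidden_size), shared with the input embeddings
bias = tf.zeros((100,))

flat = tf.reshape(hidden, [-1, 16])                           # merge batch and seq dims
scores = tf.matmul(flat, embedding_matrix, transpose_b=True)  # project onto the vocabulary
scores = tf.reshape(scores, [-1, 5, 100])
scores = tf.nn.bias_add(scores, bias)                         # (batch, seq_len, vocab_size)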
@add_start_docstrings(
"""带有`语言建模`头部的RoBERTa-PreLayerNorm模型。""", ROBERTA_PRELAYERNORM_START_DOCSTRING
)
class TFRobertaPreLayerNormForMaskedLM(TFRobertaPreLayerNormPreTrainedModel, TFMaskedLanguageModelingLoss):
_keys_to_ignore_on_load_unexpected = [r"pooler", r"lm_head.decoder.weight"]
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.roberta_prelayernorm = TFRobertaPreLayerNormMainLayer(
config, add_pooling_layer=False, name="roberta_prelayernorm"
)
self.lm_head = TFRobertaPreLayerNormLMHead(config, self.roberta_prelayernorm.embeddings, name="lm_head")
def get_lm_head(self):
return self.lm_head
def get_prefix_bias_name(self):
warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning)
return self.name + "/" + self.lm_head.name
@unpack_inputs
@add_start_docstrings_to_model_forward(ROBERTA_PRELAYERNORM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFMaskedLMOutput,
config_class=_CONFIG_FOR_DOC,
mask="<mask>",
expected_output="' Paris'",
expected_loss=0.69,
)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: np.ndarray | tf.Tensor | None = None,
training: Optional[bool] = False,
) -> Union[TFMaskedLMOutput, Tuple[tf.Tensor]]:
r"""
labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are ignored (masked); the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
"""
outputs = self.roberta_prelayernorm(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
sequence_output = outputs[0]
prediction_scores = self.lm_head(sequence_output)
loss = None if labels is None else self.hf_compute_loss(labels, prediction_scores)
if not return_dict:
output = (prediction_scores,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return TFMaskedLMOutput(
loss=loss,
logits=prediction_scores,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roberta_prelayernorm", None) is not None:
with tf.name_scope(self.roberta_prelayernorm.name):
self.roberta_prelayernorm.build(None)
if getattr(self, "lm_head", None) is not None:
with tf.name_scope(self.lm_head.name):
self.lm_head.build(None)
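# A minimal fill-mask sketch for this head. The checkpoint name is an assumption
# (andreasmadsen/efficient_mlm_m0.40 is the reference RoBERTa-PreLayerNorm
# checkpoint; any compatible one works); the expected answer matches the
# "' Paris'" sample declared in the decorator above.
import tensorflow as tf
from transformers import AutoTokenizer, TFRobertaPreLayerNormForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("andreasmadsen/efficient_mlm_m0.40")
model = TFRobertaPreLayerNormForMaskedLM.from_pretrained("andreasmadsen/efficient_mlm_m0.40")

inputs = tokenizer("The capital of France is <mask>.", return_tensors="tf")
logits = model(**inputs).logits

mask_index = int(tf.where(inputs["input_ids"][0] == tokenizer.mask_token_id)[0, 0])
predicted_id = int(tf.argmax(logits[0, mask_index]))
print(tokenizer.decode([predicted_id]))  # ' Paris'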
class TFRobertaPreLayerNormForCausalLM(TFRobertaPreLayerNormPreTrainedModel, TFCausalLanguageModelingLoss):
_keys_to_ignore_on_load_unexpected = [r"pooler", r"lm_head.decoder.weight"]
def __init__(self, config: RobertaPreLayerNormConfig, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
if not config.is_decoder:
logger.warning(
"If you want to use `TFRobertaPreLayerNormLMHeadModel` as a standalone, add `is_decoder=True.`"
)
self.roberta_prelayernorm = TFRobertaPreLayerNormMainLayer(
config, add_pooling_layer=False, name="roberta_prelayernorm"
)
self.lm_head = TFRobertaPreLayerNormLMHead(
config, input_embeddings=self.roberta_prelayernorm.embeddings, name="lm_head"
)
def get_lm_head(self):
return self.lm_head
def get_prefix_bias_name(self):
warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning)
return self.name + "/" + self.lm_head.name
def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, **model_kwargs):
input_shape = input_ids.shape
if attention_mask is None:
attention_mask = tf.ones(input_shape)
if past_key_values is not None:
input_ids = input_ids[:, -1:]
return {"input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": past_key_values}
@unpack_inputs
@add_start_docstrings_to_model_forward(ROBERTA_PRELAYERNORM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFCausalLMOutputWithCrossAttentions,
config_class=_CONFIG_FOR_DOC,
)
def call(
    self,
    input_ids: TFModelInputType | None = None,
    attention_mask: np.ndarray | tf.Tensor | None = None,
    token_type_ids: np.ndarray | tf.Tensor | None = None,
    position_ids: np.ndarray | tf.Tensor | None = None,
    head_mask: np.ndarray | tf.Tensor | None = None,
    inputs_embeds: np.ndarray | tf.Tensor | None = None,
    encoder_hidden_states: np.ndarray | tf.Tensor | None = None,
    encoder_attention_mask: np.ndarray | tf.Tensor | None = None,
    past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None,
    use_cache: Optional[bool] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
    labels: np.ndarray | tf.Tensor | None = None,
    training: Optional[bool] = False,
) -> Union[TFCausalLMOutputWithCrossAttentions, Tuple[tf.Tensor]]:
    ...  # the decorated call body is elided in this listing
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roberta_prelayernorm", None) is not None:
with tf.name_scope(self.roberta_prelayernorm.name):
self.roberta_prelayernorm.build(None)
if getattr(self, "lm_head", None) is not None:
with tf.name_scope(self.lm_head.name):
self.lm_head.build(None)
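# A hedged generation sketch for the causal-LM head (same checkpoint assumption as
# above; is_decoder=True is required, as the constructor warning notes). During
# generation, prepare_inputs_for_generation above feeds only the newest token once
# past_key_values is populated.
from transformers import AutoTokenizer, TFRobertaPreLayerNormForCausalLM

tokenizer = AutoTokenizer.from_pretrained("andreasmadsen/efficient_mlm_m0.40")
model = TFRobertaPreLayerNormForCausalLM.from_pretrained(
    "andreasmadsen/efficient_mlm_m0.40", is_decoder=True
)

inputs = tokenizer("Hello world", return_tensors="tf")
generated = model.generate(inputs["input_ids"], max_new_tokens=5)
print(tokenizer.decode(generated[0]))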
class TFRobertaPreLayerNormClassificationHead(keras.layers.Layer):
"""用于句子级分类任务的头部。"""
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.dense = keras.layers.Dense(
config.hidden_size,
kernel_initializer=get_initializer(config.initializer_range),
activation="tanh",
name="dense",
)
classifier_dropout = (
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
self.dropout = keras.layers.Dropout(classifier_dropout)
self.out_proj = keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="out_proj"
)
self.config = config
def call(self, features, training=False):
x = features[:, 0, :]
x = self.dropout(x, training=training)
x = self.dense(x)
x = self.dropout(x, training=training)
x = self.out_proj(x)
return x
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
if getattr(self, "out_proj", None) is not None:
with tf.name_scope(self.out_proj.name):
self.out_proj.build([None, None, self.config.hidden_size])
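# Note that the head's call classifies from features[:, 0, :], i.e. the hidden state
# of the first token (<s>, RoBERTa's equivalent of [CLS]). A one-line toy slice:
import tensorflow as tf

features = tf.random.normal((4, 12, 16))  # (batch, seq_len, hidden_size)
cls_state = features[:, 0, :]             # (batch, hidden_size): first-token state per example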
@add_start_docstrings(
"""
RoBERTa-PreLayerNorm Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output), e.g. for GLUE tasks.
""",
ROBERTA_PRELAYERNORM_START_DOCSTRING,
)
class TFRobertaPreLayerNormForSequenceClassification(
TFRobertaPreLayerNormPreTrainedModel, TFSequenceClassificationLoss
):
_keys_to_ignore_on_load_unexpected = [r"pooler", r"lm_head"]
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
self.roberta_prelayernorm = TFRobertaPreLayerNormMainLayer(
config, add_pooling_layer=False, name="roberta_prelayernorm"
)
self.classifier = TFRobertaPreLayerNormClassificationHead(config, name="classifier")
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: np.ndarray | tf.Tensor | None = None,
training: Optional[bool] = False,
) -> Union[TFSequenceClassifierOutput, Tuple[tf.Tensor]]:
r"""
labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
outputs = self.roberta_prelayernorm(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
sequence_output = outputs[0]
logits = self.classifier(sequence_output, training=training)
loss = None if labels is None else self.hf_compute_loss(labels, logits)
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return TFSequenceClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roberta_prelayernorm", None) is not None:
with tf.name_scope(self.roberta_prelayernorm.name):
self.roberta_prelayernorm.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build(None)
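# A minimal usage sketch (same checkpoint assumption as above; num_labels=2 is a
# hypothetical label count, so the freshly initialized classifier head will warn
# about untrained weights).
from transformers import AutoTokenizer, TFRobertaPreLayerNormForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("andreasmadsen/efficient_mlm_m0.40")
model = TFRobertaPreLayerNormForSequenceClassification.from_pretrained(
    "andreasmadsen/efficient_mlm_m0.40", num_labels=2
)

inputs = tokenizer("A great movie!", return_tensors="tf")
logits = model(**inputs).logits  # shape (1, 2)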
@add_start_docstrings(
"""
RobertaPreLayerNorm Model with a multiple choice classification head on top (a linear layer on top of the pooled
output and a softmax) e.g. for RocStories/SWAG tasks.
""",
ROBERTA_PRELAYERNORM_START_DOCSTRING,
)
class TFRobertaPreLayerNormForMultipleChoice(TFRobertaPreLayerNormPreTrainedModel, TFMultipleChoiceLoss):
_keys_to_ignore_on_load_unexpected = [r"lm_head"]
_keys_to_ignore_on_load_missing = [r"dropout"]
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.roberta_prelayernorm = TFRobertaPreLayerNormMainLayer(config, name="roberta_prelayernorm")
self.dropout = keras.layers.Dropout(config.hidden_dropout_prob)
self.classifier = keras.layers.Dense(
1, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(
ROBERTA_PRELAYERNORM_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")
)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFMultipleChoiceModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: np.ndarray | tf.Tensor | None = None,
training: Optional[bool] = False,
) -> Union[TFMultipleChoiceModelOutput, Tuple[tf.Tensor]]:
r"""
labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices-1]`
where `num_choices` is the size of the second dimension of the input tensors. (See `input_ids` above)
"""
if input_ids is not None:
num_choices = shape_list(input_ids)[1]
seq_length = shape_list(input_ids)[2]
else:
num_choices = shape_list(inputs_embeds)[1]
seq_length = shape_list(inputs_embeds)[2]
flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) if input_ids is not None else None
flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None
flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None
flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None
outputs = self.roberta_prelayernorm(
flat_input_ids,
flat_attention_mask,
flat_token_type_ids,
flat_position_ids,
head_mask,
inputs_embeds,
output_attentions,
output_hidden_states,
return_dict=return_dict,
training=training,
)
pooled_output = outputs[1]
pooled_output = self.dropout(pooled_output, training=training)
logits = self.classifier(pooled_output)
reshaped_logits = tf.reshape(logits, (-1, num_choices))
loss = None if labels is None else self.hf_compute_loss(labels, reshaped_logits)
if not return_dict:
output = (reshaped_logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return TFMultipleChoiceModelOutput(
loss=loss,
logits=reshaped_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roberta_prelayernorm", None) is not None:
with tf.name_scope(self.roberta_prelayernorm.name):
self.roberta_prelayernorm.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_size])
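# The flattening in call above expects inputs of shape (batch_size, num_choices,
# seq_len). A hedged sketch of preparing such a batch (same checkpoint assumption;
# the two choices are hypothetical):
import tensorflow as tf
from transformers import AutoTokenizer, TFRobertaPreLayerNormForMultipleChoice

tokenizer = AutoTokenizer.from_pretrained("andreasmadsen/efficient_mlm_m0.40")
model = TFRobertaPreLayerNormForMultipleChoice.from_pretrained("andreasmadsen/efficient_mlm_m0.40")

prompt = "The sky is"
choices = ["blue.", "a sandwich."]
enc = tokenizer([prompt, prompt], choices, return_tensors="tf", padding=True)

inputs = {k: tf.expand_dims(v, 0) for k, v in enc.items()}  # (1, 2, seq_len) each
logits = model(**inputs).logits  # shape (1, 2): one score per choice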
"""
RoBERTa-PreLayerNorm Model with a token classification head on top (a linear layer on top of the hidden-states
output) e.g. for Named-Entity-Recognition (NER) tasks.
"""
@add_start_docstrings(
ROBERTA_PRELAYERNORM_START_DOCSTRING,
)
class TFRobertaPreLayerNormForTokenClassification(TFRobertaPreLayerNormPreTrainedModel, TFTokenClassificationLoss):
_keys_to_ignore_on_load_unexpected = [r"pooler", r"lm_head"]
_keys_to_ignore_on_load_missing = [r"dropout"]
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
self.roberta_prelayernorm = TFRobertaPreLayerNormMainLayer(
config, add_pooling_layer=False, name="roberta_prelayernorm"
)
classifier_dropout = (
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
self.dropout = keras.layers.Dropout(classifier_dropout)
self.classifier = keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(ROBERTA_PRELAYERNORM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFTokenClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: np.ndarray | tf.Tensor | None = None,
training: Optional[bool] = False,
) -> Union[TFTokenClassifierOutput, Tuple[tf.Tensor]]:
r"""
labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
outputs = self.roberta_prelayernorm(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
sequence_output = outputs[0]
sequence_output = self.dropout(sequence_output, training=training)
logits = self.classifier(sequence_output)
loss = None if labels is None else self.hf_compute_loss(labels, logits)
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return TFTokenClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roberta_prelayernorm", None) is not None:
with tf.name_scope(self.roberta_prelayernorm.name):
self.roberta_prelayernorm.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_size])
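# A minimal token-classification sketch (same checkpoint assumption; num_labels=5 is
# hypothetical, so the classifier head starts untrained).
from transformers import AutoTokenizer, TFRobertaPreLayerNormForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("andreasmadsen/efficient_mlm_m0.40")
model = TFRobertaPreLayerNormForTokenClassification.from_pretrained(
    "andreasmadsen/efficient_mlm_m0.40", num_labels=5
)

inputs = tokenizer("HuggingFace is based in New York City", return_tensors="tf")
logits = model(**inputs).logits  # shape (1, seq_len, 5): one label distribution per token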
"""
RoBERTa-PreLayerNorm Model with a span classification head on top for extractive question-answering tasks like
SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
"""
@add_start_docstrings(
"""
RoBERTa-PreLayerNorm Model with a span classification head on top for extractive question-answering tasks like
SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
ROBERTA_PRELAYERNORM_START_DOCSTRING,
)
class TFRobertaPreLayerNormForQuestionAnswering(TFRobertaPreLayerNormPreTrainedModel, TFQuestionAnsweringLoss):
_keys_to_ignore_on_load_unexpected = [r"pooler", r"lm_head"]
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
self.roberta_prelayernorm = TFRobertaPreLayerNormMainLayer(
config, add_pooling_layer=False, name="roberta_prelayernorm"
)
self.qa_outputs = keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(ROBERTA_PRELAYERNORM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFQuestionAnsweringModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
start_positions: np.ndarray | tf.Tensor | None = None,
end_positions: np.ndarray | tf.Tensor | None = None,
training: Optional[bool] = False,
) -> Union[TFQuestionAnsweringModelOutput, Tuple[tf.Tensor]]:
r"""
start_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
are not taken into account for computing the loss.
end_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
are not taken into account for computing the loss.
"""
outputs = self.roberta_prelayernorm(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
sequence_output = outputs[0]
logits = self.qa_outputs(sequence_output)
start_logits, end_logits = tf.split(logits, 2, axis=-1)
start_logits = tf.squeeze(start_logits, axis=-1)
end_logits = tf.squeeze(end_logits, axis=-1)
loss = None
if start_positions is not None and end_positions is not None:
labels = {"start_position": start_positions}
labels["end_position"] = end_positions
loss = self.hf_compute_loss(labels, (start_logits, end_logits))
if not return_dict:
output = (start_logits, end_logits) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return TFQuestionAnsweringModelOutput(
loss=loss,
start_logits=start_logits,
end_logits=end_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roberta_prelayernorm", None) is not None:
with tf.name_scope(self.roberta_prelayernorm.name):
self.roberta_prelayernorm.build(None)
if getattr(self, "qa_outputs", None) is not None:
with tf.name_scope(self.qa_outputs.name):
self.qa_outputs.build([None, None, self.config.hidden_size])
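# Finally, a hedged span-extraction sketch for the QA head (same checkpoint
# assumption; an untrained qa_outputs layer will of course pick arbitrary spans).
import tensorflow as tf
from transformers import AutoTokenizer, TFRobertaPreLayerNormForQuestionAnswering

tokenizer = AutoTokenizer.from_pretrained("andreasmadsen/efficient_mlm_m0.40")
model = TFRobertaPreLayerNormForQuestionAnswering.from_pretrained("andreasmadsen/efficient_mlm_m0.40")

question = "Where is the company based?"
context = "The company is based in New York City."
inputs = tokenizer(question, context, return_tensors="tf")
outputs = model(**inputs)

start = int(tf.argmax(outputs.start_logits, axis=-1)[0])
end = int(tf.argmax(outputs.end_logits, axis=-1)[0])
answer = tokenizer.decode(inputs["input_ids"][0, start : end + 1])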