Transformers Source Code Analysis (Part 23)
.\models\bros\processing_bros.py
"""
Processor class for Bros.
"""
from typing import List, Optional, Union
from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
from ...utils import TensorType
class BrosProcessor(ProcessorMixin):
r"""
Constructs a Bros processor which wraps a BERT tokenizer.
[`BrosProcessor`] offers all the functionalities of [`BertTokenizerFast`]. See the docstring of
[`~BrosProcessor.__call__`] and [`~BrosProcessor.decode`] for more information.
Args:
tokenizer (`BertTokenizerFast`, *optional*):
            An instance of [`BertTokenizerFast`]. The tokenizer is a required input.
"""
attributes = ["tokenizer"]
tokenizer_class = ("BertTokenizer", "BertTokenizerFast")
def __init__(self, tokenizer=None, **kwargs):
if tokenizer is None:
raise ValueError("You need to specify a `tokenizer`.")
super().__init__(tokenizer)
def __call__(
self,
text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = None,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
return_tensors: Optional[Union[str, TensorType]] = None,
**kwargs,
    ) -> BatchEncoding:
        """
        This method uses [`BertTokenizerFast.__call__`] to prepare text for the model. Please refer to the docstring
        of that method for more information.

        Args:
            text (`TextInput`, `PreTokenizedInput`, `List[TextInput]`, `List[PreTokenizedInput]`, *optional*):
                The input text to process; can be a single sequence or a batch.
            add_special_tokens (`bool`, *optional*, defaults to `True`):
                Whether to add special tokens such as `[CLS]` and `[SEP]`.
            padding (`bool`, `str` or [`PaddingStrategy`], *optional*, defaults to `False`):
                Strategy for padding sequences to the same length.
            truncation (`bool`, `str` or [`TruncationStrategy`], *optional*):
                Strategy for truncating sequences to a maximum length.
            max_length (`int`, *optional*):
                Maximum length of the returned sequences after truncation and padding.
            stride (`int`, *optional*, defaults to 0):
                Stride used when splitting text into overflowing chunks during truncation.
            pad_to_multiple_of (`int`, *optional*):
                Pad all sequences to a multiple of this value.
            return_token_type_ids (`bool`, *optional*):
                Whether to return token type IDs.
            return_attention_mask (`bool`, *optional*):
                Whether to return attention masks.
            return_overflowing_tokens (`bool`, *optional*, defaults to `False`):
                Whether to return overflowing tokens that were truncated.
            return_special_tokens_mask (`bool`, *optional*, defaults to `False`):
                Whether to return a mask indicating special tokens.
            return_offsets_mapping (`bool`, *optional*, defaults to `False`):
                Whether to return offsets mapping tokenized input to the original text.
            return_length (`bool`, *optional*, defaults to `False`):
                Whether to return the length of the output sequences.
            verbose (`bool`, *optional*, defaults to `True`):
                Whether to print informative messages during processing.
            return_tensors (`str` or [`TensorType`], *optional*):
                Type of tensors to return (e.g. `'pt'` for PyTorch tensors).
            **kwargs:
                Additional keyword arguments passed to the tokenizer.

        Returns:
            [`BatchEncoding`]: The tokenized inputs together with the requested masks/tensors.
        """
encoding = self.tokenizer(
text=text,
add_special_tokens=add_special_tokens,
padding=padding,
truncation=truncation,
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_length=return_length,
verbose=verbose,
return_tensors=return_tensors,
**kwargs,
)
return encoding
def batch_decode(self, *args, **kwargs):
"""
This method forwards all its arguments to BertTokenizerFast's `~PreTrainedTokenizer.batch_decode`. Please
refer to the docstring of this method for more information.
"""
return self.tokenizer.batch_decode(*args, **kwargs)
def decode(self, *args, **kwargs):
"""
This method forwards all its arguments to BertTokenizerFast's `~PreTrainedTokenizer.decode`. Please refer to
the docstring of this method for more information.
"""
return self.tokenizer.decode(*args, **kwargs)
@property
def model_input_names(self):
tokenizer_input_names = self.tokenizer.model_input_names
return list(dict.fromkeys(tokenizer_input_names))
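# Illustration (not part of the library source): a minimal usage sketch of BrosProcessor, assuming a
# BERT-style tokenizer can be downloaded; "bert-base-uncased" is only an illustrative checkpoint name.
from transformers import BertTokenizerFast, BrosProcessor

tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
processor = BrosProcessor(tokenizer=tokenizer)

# The processor simply forwards to the wrapped tokenizer, so the output is a regular BatchEncoding.
encoding = processor("Hello world", padding="max_length", max_length=8, return_tensors="pt")
print(encoding["input_ids"].shape)   # torch.Size([1, 8])
print(processor.model_input_names)   # e.g. ['input_ids', 'token_type_ids', 'attention_mask']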
.\models\bros\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_tokenizers_available, is_torch_available
_import_structure = {
"configuration_bros": ["BROS_PRETRAINED_CONFIG_ARCHIVE_MAP", "BrosConfig"],
}
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["processing_bros"] = ["BrosProcessor"]
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_bros"] = [
"BROS_PRETRAINED_MODEL_ARCHIVE_LIST",
"BrosPreTrainedModel",
"BrosModel",
"BrosForTokenClassification",
"BrosSpadeEEForTokenClassification",
"BrosSpadeELForTokenClassification",
]
if TYPE_CHECKING:
from .configuration_bros import BROS_PRETRAINED_CONFIG_ARCHIVE_MAP, BrosConfig
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .processing_bros import BrosProcessor
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_bros import (
BROS_PRETRAINED_MODEL_ARCHIVE_LIST,
BrosForTokenClassification,
BrosModel,
BrosPreTrainedModel,
BrosSpadeEEForTokenClassification,
BrosSpadeELForTokenClassification,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\byt5\convert_byt5_original_tf_checkpoint_to_pytorch.py
"""Convert T5 checkpoint."""
import argparse
from transformers import T5Config, T5ForConditionalGeneration, load_tf_weights_in_t5
from transformers.utils import logging
logging.set_verbosity_info()
def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path):
config = T5Config.from_json_file(config_file)
print(f"Building PyTorch model from configuration: {config}")
model = T5ForConditionalGeneration(config)
load_tf_weights_in_t5(model, config, tf_checkpoint_path)
print(f"Save PyTorch model to {pytorch_dump_path}")
model.save_pretrained(pytorch_dump_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path."
)
parser.add_argument(
"--config_file",
default=None,
type=str,
required=True,
help=(
"The config json file corresponding to the pre-trained T5 model. \nThis specifies the model architecture."
),
)
parser.add_argument(
"--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
)
args = parser.parse_args()
convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.config_file, args.pytorch_dump_path)
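# Illustration (not part of the library source): an equivalent command-line invocation of the script
# above. The paths below are placeholders, not real files.
#
#   python convert_byt5_original_tf_checkpoint_to_pytorch.py \
#       --tf_checkpoint_path /path/to/byt5/model.ckpt \
#       --config_file /path/to/byt5/config.json \
#       --pytorch_dump_path /path/to/pytorch_dump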
.\models\byt5\tokenization_byt5.py
import warnings
from typing import List, Optional, Tuple
from ...tokenization_utils import AddedToken, PreTrainedTokenizer
class ByT5Tokenizer(PreTrainedTokenizer):
model_input_names = ["input_ids", "attention_mask"]
def __init__(
self,
eos_token="</s>",
unk_token="<unk>",
pad_token="<pad>",
extra_ids=125,
additional_special_tokens=None,
**kwargs,
) -> None:
if extra_ids > 0 and additional_special_tokens is None:
additional_special_tokens = [f"<extra_id_{i}>" for i in range(extra_ids)]
elif extra_ids > 0 and additional_special_tokens is not None and len(additional_special_tokens) > 0:
extra_tokens = len(set(filter(lambda x: bool("extra_id" in str(x)), additional_special_tokens)))
if extra_tokens != extra_ids:
raise ValueError(
f"Both extra_ids ({extra_ids}) and additional_special_tokens ({additional_special_tokens}) are"
" provided to ByT5Tokenizer. In this case the additional_special_tokens must include the"
" extra_ids tokens"
)
pad_token = AddedToken(pad_token, lstrip=True, rstrip=True) if isinstance(pad_token, str) else pad_token
eos_token = AddedToken(eos_token, lstrip=True, rstrip=True) if isinstance(eos_token, str) else eos_token
unk_token = AddedToken(unk_token, lstrip=True, rstrip=True) if isinstance(unk_token, str) else unk_token
self._added_tokens_decoder = {0: pad_token, 1: eos_token, 2: unk_token}
self.offset = len(self._added_tokens_decoder)
self._utf_vocab_size = 2**8
super().__init__(
eos_token=eos_token,
unk_token=unk_token,
pad_token=pad_token,
extra_ids=0,
additional_special_tokens=additional_special_tokens,
**kwargs,
)
@property
def vocab_size(self):
return self._utf_vocab_size
def get_vocab(self):
vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size + self.offset)}
vocab.update(self.added_tokens_encoder)
return vocab
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` method.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
return super().get_special_tokens_mask(
token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
)
if token_ids_1 is None:
return ([0] * len(token_ids_0)) + [1]
else:
return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
def _add_eos_if_not_present(self, token_ids: List[int]) -> List[int]:
"""Do not add eos again if user already added it."""
if len(token_ids) > 0 and token_ids[-1] == self.eos_token_id:
warnings.warn(
f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated"
" eos tokens being added."
)
return token_ids
else:
return token_ids + [self.eos_token_id]
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. ByT5 does not
make use of token type ids, therefore a list of zeros is returned.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of zeros.
"""
eos = [self.eos_token_id]
if token_ids_1 is None:
return len(token_ids_0 + eos) * [0]
else:
return len(token_ids_0 + eos + token_ids_1 + eos) * [0]
    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
        and adding special tokens. A sequence has the following format:
        - single sequence: `X </s>`
        - pair of sequences: `A </s> B </s>`
        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
        Returns:
            `List[int]`: List of input IDs with the appropriate special tokens.
        """
token_ids_0 = self._add_eos_if_not_present(token_ids_0)
if token_ids_1 is None:
return token_ids_0
else:
token_ids_1 = self._add_eos_if_not_present(token_ids_1)
return token_ids_0 + token_ids_1
def _tokenize(self, text: str) -> List[str]:
"""Take as input a string and return a list of strings (tokens) for words/sub-words"""
tokens = [chr(i) for i in text.encode("utf-8")]
return tokens
def _convert_token_to_id(self, token):
"""Converts a token (str) into an ID using the vocabulary."""
if len(token) != 1:
token_id = None
else:
token_id = ord(token) + self.offset
return token_id
def _convert_id_to_token(self, index):
"""Converts an index (integer) into a token (str) using the vocabulary."""
token = chr(index - self.offset)
return token
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (strings) into a single string."""
bstring = b""
for token in tokens:
if token in self.added_tokens_decoder:
tok_string = self.added_tokens_decoder[token].encode("utf-8")
elif token in self.added_tokens_encoder:
tok_string = token.encode("utf-8")
else:
tok_string = bytes([ord(token)])
bstring += tok_string
string = bstring.decode("utf-8", errors="ignore")
return string
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
return ()
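# Illustration (not part of the library source): the byte-level scheme above maps every UTF-8 byte to an
# id shifted by `offset` (3 special tokens: pad=0, eos=1, unk=2), so no vocabulary file is needed.
# A minimal round-trip sketch, independent of the tokenizer class:
text = "Héllo"
byte_tokens = [chr(b) for b in text.encode("utf-8")]      # what _tokenize produces
ids = [ord(tok) + 3 for tok in byte_tokens]               # _convert_token_to_id with offset=3
recovered = bytes(i - 3 for i in ids).decode("utf-8")     # inverse of _convert_id_to_token + join
assert recovered == text
print(byte_tokens)  # ['H', 'Ã', '©', 'l', 'l', 'o'] -- 'é' becomes two bytes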
.\models\byt5\__init__.py
from typing import TYPE_CHECKING
from ...utils import _LazyModule
_import_structure = {"tokenization_byt5": ["ByT5Tokenizer"]}
if TYPE_CHECKING:
from .tokenization_byt5 import ByT5Tokenizer
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\camembert\configuration_camembert.py
""" CamemBERT configuration"""
from collections import OrderedDict
from typing import Mapping
from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfig
from ...utils import logging
logger = logging.get_logger(__name__)
CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"almanach/camembert-base": "https://huggingface.co/almanach/camembert-base/resolve/main/config.json",
"umberto-commoncrawl-cased-v1": (
"https://huggingface.co/Musixmatch/umberto-commoncrawl-cased-v1/resolve/main/config.json"
),
"umberto-wikipedia-uncased-v1": (
"https://huggingface.co/Musixmatch/umberto-wikipedia-uncased-v1/resolve/main/config.json"
),
}
class CamembertConfig(PretrainedConfig):
"""
This is the configuration class to store the configuration of a [`CamembertModel`] or a [`TFCamembertModel`]. It is
used to instantiate a Camembert model according to the specified arguments, defining the model architecture.
Instantiating a configuration with the defaults will yield a similar configuration to that of the Camembert
[almanach/camembert-base](https://huggingface.co/almanach/camembert-base) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Example:
```
>>> from transformers import CamembertConfig, CamembertModel
>>> # Initializing a Camembert almanach/camembert-base style configuration
>>> configuration = CamembertConfig()
>>> # Initializing a model (with random weights) from the almanach/camembert-base style configuration
>>> model = CamembertModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
model_type = "camembert"
def __init__(
self,
vocab_size=30522,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=2,
initializer_range=0.02,
layer_norm_eps=1e-12,
pad_token_id=1,
bos_token_id=0,
eos_token_id=2,
position_embedding_type="absolute",
use_cache=True,
classifier_dropout=None,
**kwargs,
):
super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.hidden_act = hidden_act
self.intermediate_size = intermediate_size
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.type_vocab_size = type_vocab_size
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.position_embedding_type = position_embedding_type
self.use_cache = use_cache
self.classifier_dropout = classifier_dropout
class CamembertOnnxConfig(OnnxConfig):
@property
def inputs(self) -> Mapping[str, Mapping[int, str]]:
if self.task == "multiple-choice":
dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"}
else:
dynamic_axis = {0: "batch", 1: "sequence"}
return OrderedDict(
[
("input_ids", dynamic_axis),
("attention_mask", dynamic_axis),
]
)
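# Illustration (not part of the library source, assuming the ONNX export utilities import cleanly):
# the dynamic axes CamembertOnnxConfig declares for the default export task; for the "multiple-choice"
# task a `choice` axis is inserted at position 1 instead.
onnx_config = CamembertOnnxConfig(CamembertConfig())
print(onnx_config.inputs)
# OrderedDict([('input_ids', {0: 'batch', 1: 'sequence'}),
#              ('attention_mask', {0: 'batch', 1: 'sequence'})])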
.\models\camembert\modeling_camembert.py
"""PyTorch CamemBERT 模型。"""
import math
from typing import List, Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN, gelu
from ...modeling_outputs import (
BaseModelOutputWithPastAndCrossAttentions,
BaseModelOutputWithPoolingAndCrossAttentions,
CausalLMOutputWithCrossAttentions,
MaskedLMOutput,
MultipleChoiceModelOutput,
QuestionAnsweringModelOutput,
SequenceClassifierOutput,
TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import (
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_camembert import CamembertConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "almanach/camembert-base"
_CONFIG_FOR_DOC = "CamembertConfig"
CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"almanach/camembert-base",
"Musixmatch/umberto-commoncrawl-cased-v1",
"Musixmatch/umberto-wikipedia-uncased-v1",
]
CAMEMBERT_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`CamembertConfig`]): Model configuration class with all the parameters of the
model. Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
class CamembertEmbeddings(nn.Module):
"""
Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
"""
def __init__(self, config):
super().__init__()
self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
self.register_buffer(
"token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
)
self.padding_idx = config.pad_token_id
self.position_embeddings = nn.Embedding(
config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
)
def forward(
self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0
):
        if position_ids is None:
            if input_ids is not None:
                # Create the position ids from the input token ids. Any padded tokens remain padded.
                position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length)
            else:
                position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)
if input_ids is not None:
input_shape = input_ids.size()
else:
input_shape = inputs_embeds.size()[:-1]
seq_length = input_shape[1]
if token_type_ids is None:
if hasattr(self, "token_type_ids"):
buffered_token_type_ids = self.token_type_ids[:, :seq_length]
buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
token_type_ids = buffered_token_type_ids_expanded
else:
token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
if inputs_embeds is None:
inputs_embeds = self.word_embeddings(input_ids)
token_type_embeddings = self.token_type_embeddings(token_type_ids)
embeddings = inputs_embeds + token_type_embeddings
if self.position_embedding_type == "absolute":
position_embeddings = self.position_embeddings(position_ids)
embeddings += position_embeddings
embeddings = self.LayerNorm(embeddings)
embeddings = self.dropout(embeddings)
return embeddings
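# Illustration (not part of the class above): `create_position_ids_from_input_ids` (defined at module
# level in this file, as in modeling_roberta) builds padding-aware position ids; this is the "tiny tweak"
# mentioned in the CamembertEmbeddings docstring. Roughly:
def _create_position_ids_sketch(input_ids, padding_idx, past_key_values_length=0):
    # Non-padding symbols get consecutive positions starting at padding_idx + 1; padding stays at padding_idx.
    mask = input_ids.ne(padding_idx).int()
    incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
    return incremental_indices.long() + padding_idx

# e.g. with padding_idx=1: input_ids [[5, 6, 1, 1]] -> position ids [[2, 3, 1, 1]]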
class CamembertSelfAttention(nn.Module):
def __init__(self, config, position_embedding_type=None):
super().__init__()
if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
raise ValueError(
f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
f"heads ({config.num_attention_heads})"
)
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.query = nn.Linear(config.hidden_size, self.all_head_size)
self.key = nn.Linear(config.hidden_size, self.all_head_size)
self.value = nn.Linear(config.hidden_size, self.all_head_size)
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
self.position_embedding_type = position_embedding_type or getattr(
config, "position_embedding_type", "absolute"
)
if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
self.max_position_embeddings = config.max_position_embeddings
self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)
self.is_decoder = config.is_decoder
def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
x = x.view(new_x_shape)
return x.permute(0, 2, 1, 3)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        # The full attention computation is omitted from this excerpt; see the sketch after CamembertAttention below.
        ...
class CamembertSelfOutput(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
class CamembertAttention(nn.Module):
def __init__(self, config, position_embedding_type=None):
super().__init__()
self.self = CamembertSelfAttention(config, position_embedding_type=position_embedding_type)
self.output = CamembertSelfOutput(config)
self.pruned_heads = set()
def prune_heads(self, heads):
if len(heads) == 0:
return
heads, index = find_pruneable_heads_and_indices(
heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
)
self.self.query = prune_linear_layer(self.self.query, index)
self.self.key = prune_linear_layer(self.self.key, index)
self.self.value = prune_linear_layer(self.self.value, index)
self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
self.pruned_heads = self.pruned_heads.union(heads)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor]:
self_outputs = self.self(
hidden_states,
attention_mask,
head_mask,
encoder_hidden_states,
encoder_attention_mask,
past_key_value,
output_attentions,
)
attention_output = self.output(self_outputs[0], hidden_states)
outputs = (attention_output,) + self_outputs[1:]
return outputs
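# Illustration (not part of the library source): the body of CamembertSelfAttention.forward (elided in
# this excerpt) computes standard multi-head scaled dot-product attention. A simplified sketch ignoring
# caching, cross-attention, head masks and relative position embeddings:
def _self_attention_sketch(self_attn, hidden_states, attention_mask=None):
    query = self_attn.transpose_for_scores(self_attn.query(hidden_states))  # (batch, heads, seq, head_size)
    key = self_attn.transpose_for_scores(self_attn.key(hidden_states))
    value = self_attn.transpose_for_scores(self_attn.value(hidden_states))
    scores = torch.matmul(query, key.transpose(-1, -2)) / math.sqrt(self_attn.attention_head_size)
    if attention_mask is not None:  # additive mask: large negative values on padded positions
        scores = scores + attention_mask
    probs = self_attn.dropout(nn.functional.softmax(scores, dim=-1))
    context = torch.matmul(probs, value).permute(0, 2, 1, 3).contiguous()
    return context.view(context.size()[:-2] + (self_attn.all_head_size,))  # back to (batch, seq, hidden)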
class CamembertIntermediate(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states
class CamembertOutput(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
class CamembertLayer(nn.Module):
def __init__(self, config):
super().__init__()
self.chunk_size_feed_forward = config.chunk_size_feed_forward
self.seq_len_dim = 1
self.attention = CamembertAttention(config)
self.is_decoder = config.is_decoder
self.add_cross_attention = config.add_cross_attention
if self.add_cross_attention:
if not self.is_decoder:
raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
self.crossattention = CamembertAttention(config, position_embedding_type="absolute")
self.intermediate = CamembertIntermediate(config)
self.output = CamembertOutput(config)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor]:
self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
self_attention_outputs = self.attention(
hidden_states,
attention_mask,
head_mask,
output_attentions=output_attentions,
past_key_value=self_attn_past_key_value,
)
attention_output = self_attention_outputs[0]
if self.is_decoder:
outputs = self_attention_outputs[1:-1]
present_key_value = self_attention_outputs[-1]
else:
outputs = self_attention_outputs[1:]
cross_attn_present_key_value = None
if self.is_decoder and encoder_hidden_states is not None:
if not hasattr(self, "crossattention"):
raise ValueError(
f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers"
" by setting `config.add_cross_attention=True`"
)
cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
cross_attention_outputs = self.crossattention(
attention_output,
attention_mask,
head_mask,
encoder_hidden_states,
encoder_attention_mask,
cross_attn_past_key_value,
output_attentions,
)
attention_output = cross_attention_outputs[0]
outputs = outputs + cross_attention_outputs[1:-1]
cross_attn_present_key_value = cross_attention_outputs[-1]
present_key_value = present_key_value + cross_attn_present_key_value
layer_output = apply_chunking_to_forward(
self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
)
outputs = (layer_output,) + outputs
if self.is_decoder:
outputs = outputs + (present_key_value,)
return outputs
def feed_forward_chunk(self, attention_output):
intermediate_output = self.intermediate(attention_output)
layer_output = self.output(intermediate_output, attention_output)
return layer_output
class CamembertEncoder(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
self.layer = nn.ModuleList([CamembertLayer(config) for _ in range(config.num_hidden_layers)])
self.gradient_checkpointing = False
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = False,
output_hidden_states: Optional[bool] = False,
return_dict: Optional[bool] = True,
) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
all_hidden_states = () if output_hidden_states else None
all_self_attentions = () if output_attentions else None
all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
if self.gradient_checkpointing and self.training:
if use_cache:
logger.warning_once(
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
)
use_cache = False
next_decoder_cache = () if use_cache else None
for i, layer_module in enumerate(self.layer):
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
layer_head_mask = head_mask[i] if head_mask is not None else None
past_key_value = past_key_values[i] if past_key_values is not None else None
if self.gradient_checkpointing and self.training:
layer_outputs = self._gradient_checkpointing_func(
layer_module.__call__,
hidden_states,
attention_mask,
layer_head_mask,
encoder_hidden_states,
encoder_attention_mask,
past_key_value,
output_attentions,
)
else:
layer_outputs = layer_module(
hidden_states,
attention_mask,
layer_head_mask,
encoder_hidden_states,
encoder_attention_mask,
past_key_value,
output_attentions,
)
hidden_states = layer_outputs[0]
if use_cache:
next_decoder_cache += (layer_outputs[-1],)
if output_attentions:
all_self_attentions = all_self_attentions + (layer_outputs[1],)
if self.config.add_cross_attention:
all_cross_attentions = all_cross_attentions + (layer_outputs[2],)
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
return tuple(
v
for v in [
hidden_states,
next_decoder_cache,
all_hidden_states,
all_self_attentions,
all_cross_attentions,
]
if v is not None
)
return BaseModelOutputWithPastAndCrossAttentions(
last_hidden_state=hidden_states,
past_key_values=next_decoder_cache,
hidden_states=all_hidden_states,
attentions=all_self_attentions,
cross_attentions=all_cross_attentions,
)
class CamembertPooler(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.activation = nn.Tanh()
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
first_token_tensor = hidden_states[:, 0]
pooled_output = self.dense(first_token_tensor)
pooled_output = self.activation(pooled_output)
return pooled_output
class CamembertPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = CamembertConfig
base_model_prefix = "roberta"
supports_gradient_checkpointing = True
def _init_weights(self, module):
"""Initialize the weights"""
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
CAMEMBERT_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary.
            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.
            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
            [What are attention masks?](../glossary#attention-mask)
        token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in
            `[0, 1]`:
            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            Indices of positions of each input sequence token in the position embeddings. Selected in the range
            `[0, config.max_position_embeddings - 1]`.
            [What are position IDs?](../glossary#position-ids)
        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
            than the model's internal embedding lookup matrix.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
class CamembertClassificationHead(nn.Module):
"""用于句子级分类任务的头部模块。"""
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
classifier_dropout = (
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
self.dropout = nn.Dropout(classifier_dropout)
self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
def forward(self, features, **kwargs):
x = features[:, 0, :]
x = self.dropout(x)
x = self.dense(x)
x = torch.tanh(x)
x = self.dropout(x)
x = self.out_proj(x)
return x
class CamembertLMHead(nn.Module):
"""用于掩码语言建模的 Camembert 头部模块。"""
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.decoder = nn.Linear(config.hidden_size, config.vocab_size)
self.bias = nn.Parameter(torch.zeros(config.vocab_size))
self.decoder.bias = self.bias
def forward(self, features, **kwargs):
x = self.dense(features)
x = gelu(x)
x = self.layer_norm(x)
x = self.decoder(x)
return x
def _tie_weights(self):
if self.decoder.bias.device.type == "meta":
self.decoder.bias = self.bias
else:
self.bias = self.decoder.bias
@add_start_docstrings(
"The bare CamemBERT Model transformer outputting raw hidden-states without any specific head on top.",
CAMEMBERT_START_DOCSTRING,
)
class CamembertModel(CamembertPreTrainedModel):
"""
模型可以作为编码器(仅自注意力)或解码器使用,此时在自注意力层之间添加了一层交叉注意力层,遵循 *Attention is
all you need*_ 中描述的架构,作者是 Ashish Vaswani、Noam Shazeer、Niki Parmar、Jakob Uszkoreit、Llion
Jones、Aidan N. Gomez、Lukasz Kaiser 和 Illia Polosukhin。
要作为解码器使用,模型需要使用配置设置中的 `is_decoder` 参数初始化为 `True`。要用于 Seq2Seq 模型,
模型需要同时使用 `is_decoder` 参数和
```
"""
add_cross_attention 设置为 True;预期在前向传播中作为输入传入 encoder_hidden_states。
.. _*Attention is all you need*: https://arxiv.org/abs/1706.03762
"""
_no_split_modules = []
    # Copied from transformers.models.bert.modeling_bert.BertModel.__init__ with Bert->Camembert
def __init__(self, config, add_pooling_layer=True):
super().__init__(config)
self.config = config
        # Initialize the embeddings and the encoder
        self.embeddings = CamembertEmbeddings(config)
        self.encoder = CamembertEncoder(config)
        # Add a pooling layer only if requested
        self.pooler = CamembertPooler(config) if add_pooling_layer else None
        # Initialize weights and apply final processing
        self.post_init()
def get_input_embeddings(self):
return self.embeddings.word_embeddings
def set_input_embeddings(self, value):
self.embeddings.word_embeddings = value
def _prune_heads(self, heads_to_prune):
"""
对模型的注意力头进行修剪。heads_to_prune: {layer_num: 要在该层中修剪的头列表} 参见基类 PreTrainedModel
"""
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
@add_start_docstrings_to_model_forward(CAMEMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPoolingAndCrossAttentions,
config_class=_CONFIG_FOR_DOC,
)
    # Copied from transformers.models.bert.modeling_bert.BertModel.forward
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]:
        # The forward body is omitted from this excerpt.
        ...
# The decorator below adds docstrings to the class definition, describing it as a CamemBERT model with a language
# modeling head on top, appended to the shared CAMEMBERT_START_DOCSTRING.
@add_start_docstrings(
"""CamemBERT Model with a `language modeling` head on top.""",
CAMEMBERT_START_DOCSTRING,
)
# Copied from transformers.models.roberta.modeling_roberta.RobertaForMaskedLM with Roberta->Camembert, ROBERTA->CAMEMBERT
class CamembertForMaskedLM(CamembertPreTrainedModel):
    # The decoder weight and bias of the LM head are tied to the input embeddings.
    _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]
    def __init__(self, config):
        super().__init__(config)
        # Warn if the config is set up as a decoder: CamembertForMaskedLM expects bi-directional self-attention.
        if config.is_decoder:
            logger.warning(
                "If you want to use `CamembertForMaskedLM` make sure `config.is_decoder=False` for "
                "bi-directional self-attention."
            )
        # Instantiate the base CamembertModel without the pooling layer.
        self.roberta = CamembertModel(config, add_pooling_layer=False)
        # Language modeling head.
        self.lm_head = CamembertLMHead(config)
        # Initialize weights and apply final processing.
        self.post_init()
    # Return the output embeddings of the language modeling head.
    def get_output_embeddings(self):
        return self.lm_head.decoder
    # Replace the output embeddings of the language modeling head with new embeddings.
    def set_output_embeddings(self, new_embeddings):
        self.lm_head.decoder = new_embeddings
    # Forward pass; the decorators below attach the shared input docstring and a code sample.
@add_start_docstrings_to_model_forward(CAMEMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=MaskedLMOutput,
config_class=_CONFIG_FOR_DOC,
mask="<mask>",
expected_output="' Paris'",
expected_loss=0.1,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], MaskedLMOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
kwargs (`Dict[str, any]`, optional, defaults to *{}*):
Used to hide legacy arguments that have been deprecated.
"""
# Determine whether to use a return dictionary based on the provided argument or the default configuration
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Pass the input data through the Roberta model to obtain outputs
outputs = self.roberta(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# Retrieve the sequence output from the Roberta model's outputs
sequence_output = outputs[0]
# Generate prediction scores using the language modeling head
prediction_scores = self.lm_head(sequence_output)
# Initialize the masked language modeling loss variable
masked_lm_loss = None
# Calculate the masked language modeling loss if labels are provided
if labels is not None:
# Move labels to the device where prediction_scores tensor resides for model parallelism
labels = labels.to(prediction_scores.device)
# Define the loss function as Cross Entropy Loss
loss_fct = CrossEntropyLoss()
# Compute the masked LM loss based on prediction scores and labels
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
# If return_dict is False, prepare the output tuple with prediction scores and additional outputs
if not return_dict:
output = (prediction_scores,) + outputs[2:]
return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
# If return_dict is True, construct a MaskedLMOutput object with specific attributes
return MaskedLMOutput(
loss=masked_lm_loss,
logits=prediction_scores,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
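# Illustration (not part of the library source): a hedged fill-mask sketch with CamembertForMaskedLM,
# assuming the "almanach/camembert-base" checkpoint and its tokenizer can be downloaded.
import torch
from transformers import AutoTokenizer, CamembertForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("almanach/camembert-base")
model = CamembertForMaskedLM.from_pretrained("almanach/camembert-base")

inputs = tokenizer("La capitale de la France est <mask>.", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits  # (batch, seq_len, vocab_size)

mask_index = (inputs.input_ids == tokenizer.mask_token_id).nonzero(as_tuple=True)[1]
predicted_id = logits[0, mask_index].argmax(dim=-1)
print(tokenizer.decode(predicted_id))  # expected to be something like "Paris"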
@add_start_docstrings(
"""
CamemBERT Model transformer with a sequence classification/regression head on top (a linear layer on top of the
pooled output) e.g. for GLUE tasks.
""",
CAMEMBERT_START_DOCSTRING,
)
# Copied from transformers.models.roberta.modeling_roberta.RobertaForSequenceClassification with Roberta->Camembert, ROBERTA->CAMEMBERT
class CamembertForSequenceClassification(CamembertPreTrainedModel):
def __init__(self, config):
super().__init__(config)
        self.num_labels = config.num_labels  # Number of labels from the configuration
        self.config = config  # Keep a reference to the configuration
        self.roberta = CamembertModel(config, add_pooling_layer=False)  # Base Camembert model without pooling layer
        self.classifier = CamembertClassificationHead(config)  # Sentence-level classification head
        # Initialize weights and apply final processing
        self.post_init()
@add_start_docstrings_to_model_forward(CAMEMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint="cardiffnlp/twitter-roberta-base-emotion",
output_type=SequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
expected_output="'optimism'",
expected_loss=0.08,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
        # Use the provided return_dict if given, otherwise fall back to the model configuration
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        # Run the RoBERTa backbone
        outputs = self.roberta(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
        # Sequence output from the backbone
        sequence_output = outputs[0]
        # Compute logits with the classification head
        logits = self.classifier(sequence_output)
loss = None
if labels is not None:
            # Move labels to the same device as the logits to enable model parallelism
            labels = labels.to(logits.device)
            # Infer the problem type from num_labels and the label dtype
if self.config.problem_type is None:
if self.num_labels == 1:
self.config.problem_type = "regression"
elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
self.config.problem_type = "single_label_classification"
else:
self.config.problem_type = "multi_label_classification"
            # Pick the loss function according to the problem type
if self.config.problem_type == "regression":
loss_fct = MSELoss()
if self.num_labels == 1:
loss = loss_fct(logits.squeeze(), labels.squeeze())
else:
loss = loss_fct(logits, labels)
elif self.config.problem_type == "single_label_classification":
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(logits, labels)
        # If a plain tuple is requested, return the logits (and loss) plus the extra model outputs
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
        # Otherwise wrap everything in a SequenceClassifierOutput
return SequenceClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
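# Illustration (not part of the library source): how the loss above is dispatched. With num_labels == 1
# the head is treated as a regressor (MSE); with integer labels and num_labels > 1 it is single-label
# classification (cross entropy); with float multi-hot labels it is multi-label (BCE with logits).
# A toy sketch with hypothetical shapes:
batch_size, num_labels = 4, 3
logits = torch.randn(batch_size, num_labels)
single_label = torch.randint(0, num_labels, (batch_size,))            # -> CrossEntropyLoss
multi_label = torch.randint(0, 2, (batch_size, num_labels)).float()   # -> BCEWithLogitsLoss
print(CrossEntropyLoss()(logits.view(-1, num_labels), single_label.view(-1)))
print(BCEWithLogitsLoss()(logits, multi_label))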
# The decorator below documents this class as a CamemBERT-based multiple choice classifier, e.g. for RocStories/SWAG tasks.
@add_start_docstrings(
"""
CamemBERT Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
softmax) e.g. for RocStories/SWAG tasks.
""",
CAMEMBERT_START_DOCSTRING,
)
# Copied from transformers.models.roberta.modeling_roberta.RobertaForMultipleChoice with Roberta->Camembert, ROBERTA->CAMEMBERT
class CamembertForMultipleChoice(CamembertPreTrainedModel):
def __init__(self, config):
        # Initialize the parent class
        super().__init__(config)
        # Base Camembert model (with pooling layer)
        self.roberta = CamembertModel(config)
        # Dropout layer using hidden_dropout_prob from the configuration
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # Linear layer for multiple choice scoring: hidden_size -> 1
        self.classifier = nn.Linear(config.hidden_size, 1)
        # Initialize weights and apply final processing
        self.post_init()
@add_start_docstrings_to_model_forward(
CAMEMBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")
)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=MultipleChoiceModelOutput,
config_class=_CONFIG_FOR_DOC,
)
    # Forward pass; returns either a MultipleChoiceModelOutput or a plain tuple
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
        # Input shapes and meanings are documented by the decorators above
) -> Union[Tuple[torch.Tensor], MultipleChoiceModelOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
`input_ids` above)
"""
        # Use the provided return_dict if given, otherwise fall back to the model configuration
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        # The number of choices is the second dimension of the inputs
        num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
        # Flatten (batch_size, num_choices, ...) inputs to (batch_size * num_choices, ...)
        flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
        flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
        flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
        flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
        # Flatten inputs_embeds to (batch_size * num_choices, sequence_length, hidden_size)
        flat_inputs_embeds = (
            inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
            if inputs_embeds is not None
            else None
        )
        # Run the RoBERTa backbone on the flattened inputs
outputs = self.roberta(
flat_input_ids,
position_ids=flat_position_ids,
token_type_ids=flat_token_type_ids,
attention_mask=flat_attention_mask,
head_mask=head_mask,
inputs_embeds=flat_inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
        # Pooled output of the first token
        pooled_output = outputs[1]
        # Apply dropout to the pooled output
        pooled_output = self.dropout(pooled_output)
        # Score each flattened choice with the classifier
        logits = self.classifier(pooled_output)
        # Reshape the logits back to (batch_size, num_choices)
        reshaped_logits = logits.view(-1, num_choices)
        loss = None
        if labels is not None:
            # Move labels to the same device as the logits to enable model parallelism
            labels = labels.to(reshaped_logits.device)
            # Cross entropy over the choices
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(reshaped_logits, labels)
        # If a plain tuple is requested, return the reshaped logits plus the extra model outputs
        if not return_dict:
            output = (reshaped_logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output
        # Otherwise wrap everything in a MultipleChoiceModelOutput
return MultipleChoiceModelOutput(
loss=loss,
logits=reshaped_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
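# Illustration (not part of the library source): the flatten/reshape bookkeeping above, with toy shapes.
# Inputs of shape (batch_size, num_choices, seq_len) are flattened so the backbone sees ordinary
# (batch_size * num_choices, seq_len) sequences, and the per-sequence scores are folded back afterwards.
batch_size, num_choices, seq_len = 2, 4, 8
input_ids = torch.zeros(batch_size, num_choices, seq_len, dtype=torch.long)
flat = input_ids.view(-1, input_ids.size(-1))   # (8, 8): one row per (example, choice) pair
scores = torch.randn(flat.size(0), 1)           # classifier output: one score per flattened row
reshaped = scores.view(-1, num_choices)         # (2, 4): scores per choice, ready for CrossEntropyLoss
print(flat.shape, reshaped.shape)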
"""
CamemBERT Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g.
for Named-Entity-Recognition (NER) tasks.
"""
# 从transformers.models.roberta.modeling_roberta.RobertaForTokenClassification复制,将Roberta替换为Camembert,ROBERTA替换为CAMEMBERT
@add_start_docstrings(
"""
CamemBERT模型,顶部带有一个标记分类头(在隐藏状态输出的顶部增加了一个线性层),例如用于命名实体识别(NER)任务。
""",
CAMEMBERT_START_DOCSTRING,
)
class CamembertForTokenClassification(CamembertPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
        # Base Camembert model without the pooling layer
        self.roberta = CamembertModel(config, add_pooling_layer=False)
        # Classifier dropout rate; falls back to hidden_dropout_prob if not set
        classifier_dropout = (
            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(classifier_dropout)
        # Linear classifier mapping hidden states to the number of labels
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        # Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(CAMEMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint="Jean-Baptiste/roberta-large-ner-english",
output_type=TokenClassifierOutput,
config_class=_CONFIG_FOR_DOC,
expected_output="['O', 'ORG', 'ORG', 'O', 'O', 'O', 'O', 'O', 'LOC', 'O', 'LOC', 'LOC']",
expected_loss=0.01,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        """
        # Use the provided return_dict if given, otherwise fall back to the model configuration
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        # Run the RoBERTa backbone
        outputs = self.roberta(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
        # Per-token hidden states from the backbone
        sequence_output = outputs[0]
        # Apply dropout to the sequence output
        sequence_output = self.dropout(sequence_output)
        # Compute per-token logits with the classifier
        logits = self.classifier(sequence_output)
        loss = None
        # If labels are provided, compute the cross entropy loss
        if labels is not None:
            # Move labels to the same device as the logits to enable model parallelism
            labels = labels.to(logits.device)
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
        # If a plain tuple is requested, return the logits (and loss) plus the extra model outputs
        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output
        # Otherwise wrap loss, logits, hidden states and attentions in a TokenClassifierOutput
return TokenClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
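# Illustration (not part of the library source): shapes involved in the token classification head above.
# Each token gets its own logit vector, and label -100 is ignored by CrossEntropyLoss by default, which
# is how special or sub-word positions are usually masked out of the loss.
batch_size, seq_len, num_labels = 2, 6, 5
logits = torch.randn(batch_size, seq_len, num_labels)
labels = torch.randint(0, num_labels, (batch_size, seq_len))
labels[:, 0] = -100  # e.g. special tokens excluded from the loss
loss = CrossEntropyLoss()(logits.view(-1, num_labels), labels.view(-1))
print(loss)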
@add_start_docstrings(
"""
    CamemBERT Model with a span classification head on top for extractive question-answering tasks like SQuAD (a
    linear layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
CAMEMBERT_START_DOCSTRING,
)
# Copied from transformers.models.roberta.modeling_roberta.RobertaForQuestionAnswering with Roberta->Camembert, ROBERTA->CAMEMBERT
class CamembertForQuestionAnswering(CamembertPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
        # Base Camembert model without the pooling layer
        self.roberta = CamembertModel(config, add_pooling_layer=False)
        # Linear layer producing the span start/end logits
        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
        # Initialize weights and apply final processing
        self.post_init()
@add_start_docstrings_to_model_forward(CAMEMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint="deepset/roberta-base-squad2",
output_type=QuestionAnsweringModelOutput,
config_class=_CONFIG_FOR_DOC,
expected_output="' puppet'",
expected_loss=0.86,
)
    # Forward pass; accepts the standard inputs plus optional start/end positions
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
start_positions: Optional[torch.LongTensor] = None,
end_positions: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], QuestionAnsweringModelOutput]:
r"""
start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Run the inputs through the RoBERTa encoder
outputs = self.roberta(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# Take the sequence output from the model outputs
sequence_output = outputs[0]
# Project the sequence output to question-answering logits
logits = self.qa_outputs(sequence_output)
# Split the logits into start-position and end-position logits
start_logits, end_logits = logits.split(1, dim=-1)
start_logits = start_logits.squeeze(-1).contiguous()  # drop the trailing dimension of size 1 and make contiguous
end_logits = end_logits.squeeze(-1).contiguous()  # drop the trailing dimension of size 1 and make contiguous
total_loss = None
if start_positions is not None and end_positions is not None:
# If start_positions or end_positions carry an extra dimension (e.g. on multi-GPU), squeeze it away
if len(start_positions.size()) > 1:
start_positions = start_positions.squeeze(-1)
if len(end_positions.size()) > 1:
end_positions = end_positions.squeeze(-1)
# Positions outside the sequence are clamped to the ignored index
ignored_index = start_logits.size(1)
start_positions = start_positions.clamp(0, ignored_index)
end_positions = end_positions.clamp(0, ignored_index)
# Cross-entropy loss that ignores the clamped (out-of-sequence) positions
loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
# Compute the start- and end-position losses
start_loss = loss_fct(start_logits, start_positions)
end_loss = loss_fct(end_logits, end_positions)
# Average the two losses
total_loss = (start_loss + end_loss) / 2
# Without return_dict, return a plain tuple
if not return_dict:
output = (start_logits, end_logits) + outputs[2:]
return ((total_loss,) + output) if total_loss is not None else output
# Otherwise return a QuestionAnsweringModelOutput with loss, start/end logits, hidden states and attentions
return QuestionAnsweringModelOutput(
loss=total_loss,
start_logits=start_logits,
end_logits=end_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
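The QA head only produces per-token start and end scores; decoding an answer span is left to the caller. A small sketch of the greedy decoding step, assuming a SQuAD-style fine-tuned checkpoint (the checkpoint name below is illustrative, and an untrained head would give arbitrary spans):
# Sketch: turning start/end logits into an answer string (illustrative checkpoint).
import torch
from transformers import AutoTokenizer, CamembertForQuestionAnswering

tokenizer = AutoTokenizer.from_pretrained("almanach/camembert-base")
model = CamembertForQuestionAnswering.from_pretrained("almanach/camembert-base")

question, context = "Où est Paris ?", "Paris est la capitale de la France."
inputs = tokenizer(question, context, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

start = int(torch.argmax(outputs.start_logits, dim=-1))  # most likely start token
end = int(torch.argmax(outputs.end_logits, dim=-1))      # most likely end token
answer_ids = inputs["input_ids"][0, start : end + 1]
print(tokenizer.decode(answer_ids))  # meaningful only with a QA fine-tuned checkpoint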
# Add a docstring marking this as a CamemBERT model with a language-modeling head for causal LM (CLM) fine-tuning
@add_start_docstrings(
"""CamemBERT Model with a `language modeling` head on top for CLM fine-tuning.""", CAMEMBERT_START_DOCSTRING
)
# Copied from transformers.models.roberta.modeling_roberta.RobertaForCausalLM, with Roberta->Camembert
# and FacebookAI/roberta-base->almanach/camembert-base
class CamembertForCausalLM(CamembertPreTrainedModel):
# Keys whose weights are tied to the input embeddings (the lm_head.decoder weight and bias)
_tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]
def __init__(self, config):
super().__init__(config)
# Warn if the config is not a decoder; standalone use of CamembertLMHeadModel expects `is_decoder=True`
if not config.is_decoder:
logger.warning("If you want to use `CamembertLMHeadModel` as a standalone, add `is_decoder=True.`")
# Instantiate the Camembert encoder without the pooling layer
self.roberta = CamembertModel(config, add_pooling_layer=False)
# Language-modeling head
self.lm_head = CamembertLMHead(config)
# Initialize weights and apply final processing
self.post_init()
# Return the output embedding layer, i.e. the decoder of the LM head
def get_output_embeddings(self):
return self.lm_head.decoder
# Replace the output embedding layer with new embeddings
def set_output_embeddings(self, new_embeddings):
self.lm_head.decoder = new_embeddings
# Forward pass; the docstring decorators below document the inputs and the return type
@add_start_docstrings_to_model_forward(CAMEMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
past_key_values: Tuple[Tuple[torch.FloatTensor]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
# Prepare the inputs used during generation
def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, **model_kwargs):
# Shape of the current input ids
input_shape = input_ids.shape
# If no attention mask is given, attend to every position
if attention_mask is None:
attention_mask = input_ids.new_ones(input_shape)
# If cached key/values are provided, keep only the tokens that have not been processed yet
if past_key_values is not None:
# Length of the cached sequence
past_length = past_key_values[0][0].shape[2]
# If the input is longer than the cache, drop the prefix that is already cached
if input_ids.shape[1] > past_length:
remove_prefix_length = past_length
else:
# Otherwise keep only the final token (default behaviour)
remove_prefix_length = input_ids.shape[1] - 1
# Trim the input ids accordingly
input_ids = input_ids[:, remove_prefix_length:]
# Return the prepared inputs as a dictionary
return {"input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": past_key_values}
# Reorder the cached key/values according to the selected beam indices
def _reorder_cache(self, past_key_values, beam_idx):
# Accumulate the reordered cache layer by layer
reordered_past = ()
# Iterate over the cached key/values of each layer
for layer_past in past_key_values:
# Reorder every cached tensor along the batch dimension using the beam indices
reordered_past += (
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
)
# Return the reordered cache
return reordered_past
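The reordering can be checked in isolation with a toy one-layer cache; the shapes below are hypothetical:
# Toy illustration of _reorder_cache: reorder a one-layer cache along the beam dimension.
import torch

key = torch.randn(3, 2, 4, 8)       # (num_beams, num_heads, seq_len, head_dim)
value = torch.randn(3, 2, 4, 8)
past_key_values = ((key, value),)   # a single layer
beam_idx = torch.tensor([2, 2, 0])  # beams 0 and 1 become copies of beam 2, beam 2 becomes beam 0

reordered = tuple(
    tuple(state.index_select(0, beam_idx) for state in layer) for layer in past_key_values
)
assert torch.equal(reordered[0][0][0], key[2])  # the first beam now holds beam 2's cached keys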
# Create position ids from the input ids, used for the model's position embeddings
def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
"""
Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
are ignored. This is modified from fairseq's `utils.make_positions`.
Args:
input_ids: torch.Tensor with the input token ids
padding_idx: index of the padding token, used to detect padded positions
past_key_values_length: length of any cached past key/values, added to the incremental indices
Returns:
torch.Tensor: long tensor with the position id of every token
"""
# Mask that is 1 for real tokens and 0 for padding tokens
mask = input_ids.ne(padding_idx).int()
# Cumulative count of real tokens, shifted by the past length, with padding positions zeroed out
incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
# Convert to long and offset by padding_idx to obtain the final position ids
return incremental_indices.long() + padding_idx
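A concrete trace with padding_idx = 1 (the value CamemBERT uses) and past_key_values_length = 0 makes the offsetting easier to follow:
# Worked example of create_position_ids_from_input_ids with padding_idx = 1.
import torch

input_ids = torch.tensor([[5, 6, 7, 1, 1]])  # two trailing padding tokens
padding_idx = 1
mask = input_ids.ne(padding_idx).int()                        # [[1, 1, 1, 0, 0]]
incremental = torch.cumsum(mask, dim=1).type_as(mask) * mask  # [[1, 2, 3, 0, 0]]
position_ids = incremental.long() + padding_idx               # [[2, 3, 4, 1, 1]]
print(position_ids)  # real tokens get positions 2, 3, 4; padding positions stay at padding_idx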
.\models\camembert\modeling_tf_camembert.py
import math
import warnings
from typing import Optional, Tuple, Union
import numpy as np
import tensorflow as tf
from ...activations_tf import get_tf_activation
from ...modeling_tf_outputs import (
TFBaseModelOutputWithPastAndCrossAttentions,
TFBaseModelOutputWithPoolingAndCrossAttentions,
TFCausalLMOutputWithCrossAttentions,
TFMaskedLMOutput,
TFMultipleChoiceModelOutput,
TFQuestionAnsweringModelOutput,
TFSequenceClassifierOutput,
TFTokenClassifierOutput,
)
from ...modeling_tf_utils import (
TFCausalLanguageModelingLoss,
TFMaskedLanguageModelingLoss,
TFModelInputType,
TFMultipleChoiceLoss,
TFPreTrainedModel,
TFQuestionAnsweringLoss,
TFSequenceClassificationLoss,
TFTokenClassificationLoss,
get_initializer,
keras,
keras_serializable,
unpack_inputs,
)
from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax
from ...utils import (
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
)
from .configuration_camembert import CamembertConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "almanach/camembert-base"
_CONFIG_FOR_DOC = "CamembertConfig"
TF_CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
]
CAMEMBERT_START_DOCSTRING = r"""
This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
behavior.
<Tip>
TensorFlow models and layers in `transformers` accept two formats as input:
- having all inputs as keyword arguments (like PyTorch models), or
- having all inputs as a list, tuple or dict in the first positional argument.
"""
The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
positional argument:
- a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
- a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
`model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
- a dictionary with one or several input Tensors associated to the input names given in the docstring:
`model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
Note that when creating models and layers with
[subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
about any of this, as you can just pass inputs like you would to any other Python function!
"""
"""
Parameters:
config ([`CamembertConfig`]): Model configuration class with all the parameters of the
model. Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
"""
CAMEMBERT_INPUTS_DOCSTRING = r"""
"""
# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaEmbeddings
class TFCamembertEmbeddings(keras.layers.Layer):
"""
Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
"""
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.padding_idx = 1  # index used for padding positions
self.config = config  # keep the configuration object
self.hidden_size = config.hidden_size  # hidden size
self.max_position_embeddings = config.max_position_embeddings  # maximum number of position embeddings
self.initializer_range = config.initializer_range  # weight initializer range
self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")  # layer-normalization layer
self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)  # dropout layer
def build(self, input_shape=None):
with tf.name_scope("word_embeddings"):
# Word embedding matrix
self.weight = self.add_weight(
name="weight",
shape=[self.config.vocab_size, self.hidden_size],
initializer=get_initializer(self.initializer_range),
)
with tf.name_scope("token_type_embeddings"):
# Token-type embedding matrix
self.token_type_embeddings = self.add_weight(
name="embeddings",
shape=[self.config.type_vocab_size, self.hidden_size],
initializer=get_initializer(self.initializer_range),
)
with tf.name_scope("position_embeddings"):
# Position embedding matrix
self.position_embeddings = self.add_weight(
name="embeddings",
shape=[self.max_position_embeddings, self.hidden_size],
initializer=get_initializer(self.initializer_range),
)
if self.built:
return
self.built = True
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
# Build the layer-normalization layer
self.LayerNorm.build([None, None, self.config.hidden_size])
def create_position_ids_from_input_ids(self, input_ids, past_key_values_length=0):
"""
Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding
symbols are ignored. This is modified from fairseq's `utils.make_positions`.
Args:
input_ids: tf.Tensor with the input token ids
Returns: tf.Tensor with the position ids
"""
mask = tf.cast(tf.math.not_equal(input_ids, self.padding_idx), dtype=input_ids.dtype)
incremental_indices = (tf.math.cumsum(mask, axis=1) + past_key_values_length) * mask
return incremental_indices + self.padding_idx
def call(
self,
input_ids=None,
position_ids=None,
token_type_ids=None,
inputs_embeds=None,
past_key_values_length=0,
training=False,
):
"""
Applies embedding based on inputs tensor.
Returns:
final_embeddings (`tf.Tensor`): output embedding tensor.
"""
# Either input_ids or inputs_embeds must be provided
assert not (input_ids is None and inputs_embeds is None)
if input_ids is not None:
# Check that the ids are within the vocabulary bounds
check_embeddings_within_bounds(input_ids, self.config.vocab_size)
# Gather the word embeddings for the given ids
inputs_embeds = tf.gather(params=self.weight, indices=input_ids)
# Shape of the embedded input, without the hidden dimension
input_shape = shape_list(inputs_embeds)[:-1]
# Default the token-type ids to zeros if they were not provided
if token_type_ids is None:
token_type_ids = tf.fill(dims=input_shape, value=0)
# If no position ids were provided
if position_ids is None:
if input_ids is not None:
# Create position ids from the token ids; padded tokens keep the padding position
position_ids = self.create_position_ids_from_input_ids(
input_ids=input_ids, past_key_values_length=past_key_values_length
)
else:
# Fall back to default position ids, from padding_idx + 1 to input_shape[-1] + padding_idx + 1
position_ids = tf.expand_dims(
tf.range(start=self.padding_idx + 1, limit=input_shape[-1] + self.padding_idx + 1), axis=0
)
# Gather the position embeddings
position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids)
# Gather the token-type embeddings
token_type_embeds = tf.gather(params=self.token_type_embeddings, indices=token_type_ids)
# Sum word, position and token-type embeddings
final_embeddings = inputs_embeds + position_embeds + token_type_embeds
# Apply layer normalization
final_embeddings = self.LayerNorm(inputs=final_embeddings)
# Apply dropout
final_embeddings = self.dropout(inputs=final_embeddings, training=training)
# Return the final embeddings
return final_embeddings
# Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler with Bert->Camembert
class TFCamembertPooler(keras.layers.Layer):
def __init__(self, config: CamembertConfig, **kwargs):
super().__init__(**kwargs)
# Dense layer used to pool the hidden state of the first token
self.dense = keras.layers.Dense(
units=config.hidden_size,  # output size equals the hidden size from the config
kernel_initializer=get_initializer(config.initializer_range),  # weights initialized with the configured range
activation="tanh",  # tanh activation
name="dense",  # layer name
)
self.config = config  # keep the configuration
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
# The model is "pooled" by simply taking the hidden state of the first token
first_token_tensor = hidden_states[:, 0]  # hidden state of the first token of every sample
pooled_output = self.dense(inputs=first_token_tensor)  # project it through the dense layer
return pooled_output  # return the pooled output
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention with Bert->Camembert
class TFCamembertSelfAttention(keras.layers.Layer):
def __init__(self, config: CamembertConfig, **kwargs):
super().__init__(**kwargs)
# Check that the hidden size is divisible by the number of attention heads
if config.hidden_size % config.num_attention_heads != 0:
raise ValueError(
f"The hidden size ({config.hidden_size}) is not a multiple of the number "
f"of attention heads ({config.num_attention_heads})"
)
# Number of attention heads and size of each head
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.sqrt_att_head_size = math.sqrt(self.attention_head_size)
# Dense layers for the query, key and value projections, initialized with the configured range
self.query = keras.layers.Dense(
units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query"
)
self.key = keras.layers.Dense(
units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key"
)
self.value = keras.layers.Dense(
units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value"
)
self.dropout = keras.layers.Dropout(rate=config.attention_probs_dropout_prob)  # dropout on the attention probabilities
self.is_decoder = config.is_decoder  # whether this layer is used inside a decoder
self.config = config  # keep the configuration
def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor:
# Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size]
tensor = tf.reshape(tensor=tensor, shape=(batch_size, -1, self.num_attention_heads, self.attention_head_size))
# Transpose to [batch_size, num_attention_heads, seq_length, attention_head_size]
return tf.transpose(tensor, perm=[0, 2, 1, 3])
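The reshape/transpose pair above splits the projection into per-head slices; it can be verified with hypothetical sizes:
# Shape check for transpose_for_scores with hypothetical sizes.
import tensorflow as tf

batch_size, seq_len, num_heads, head_size = 2, 5, 12, 64
x = tf.random.normal((batch_size, seq_len, num_heads * head_size))  # [2, 5, 768]
x = tf.reshape(x, (batch_size, -1, num_heads, head_size))           # [2, 5, 12, 64]
x = tf.transpose(x, perm=[0, 2, 1, 3])                              # [2, 12, 5, 64]
print(x.shape)  # each head now sees its own (seq_len, head_size) slice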
# Attention forward pass: computes self-attention (and, for decoders, cross-attention) over the inputs
def call(
self,
hidden_states: tf.Tensor,
attention_mask: tf.Tensor,
head_mask: tf.Tensor,
encoder_hidden_states: tf.Tensor,
encoder_attention_mask: tf.Tensor,
past_key_value: Tuple[tf.Tensor],
output_attentions: bool,
training: bool = False,
):
# Called when the layer is built; sets up the query/key/value sub-layers
def build(self, input_shape=None):
# 如果已经构建过一次,直接返回
if self.built:
return
# 标记该层已构建
self.built = True
# 如果存在查询张量,构建查询张量的结构
if getattr(self, "query", None) is not None:
with tf.name_scope(self.query.name):
self.query.build([None, None, self.config.hidden_size])
# 如果存在键张量,构建键张量的结构
if getattr(self, "key", None) is not None:
with tf.name_scope(self.key.name):
self.key.build([None, None, self.config.hidden_size])
# 如果存在值张量,构建值张量的结构
if getattr(self, "value", None) is not None:
with tf.name_scope(self.value.name):
self.value.build([None, None, self.config.hidden_size])
# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput with Bert->Camembert
class TFCamembertSelfOutput(keras.layers.Layer):
def __init__(self, config: CamembertConfig, **kwargs):
super().__init__(**kwargs)
# 初始化一个全连接层,用于转换隐藏状态的维度
self.dense = keras.layers.Dense(
units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
# 初始化 LayerNormalization 层,用于归一化隐藏状态
self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
# 初始化 Dropout 层,用于在训练时随机置零输入张量的一部分
self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
# 将隐藏状态通过全连接层 dense 进行线性转换
hidden_states = self.dense(inputs=hidden_states)
# 在训练时应用 Dropout,随机置零一部分输入张量
hidden_states = self.dropout(inputs=hidden_states, training=training)
# 对转换后的隐藏状态应用 LayerNormalization,加上输入张量 input_tensor
hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
# 构建 dense 层,设置其输入维度为 config.hidden_size
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
# 构建 LayerNorm 层,设置其输入维度为 config.hidden_size
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
# Copied from transformers.models.bert.modeling_tf_bert.TFBertAttention with Bert->Camembert
class TFCamembertAttention(keras.layers.Layer):
def __init__(self, config: CamembertConfig, **kwargs):
super().__init__(**kwargs)
# 初始化自注意力层 TFCamembertSelfAttention
self.self_attention = TFCamembertSelfAttention(config, name="self")
# 初始化输出层 TFCamembertSelfOutput
self.dense_output = TFCamembertSelfOutput(config, name="output")
def prune_heads(self, heads):
raise NotImplementedError
def call(
self,
input_tensor: tf.Tensor,
attention_mask: tf.Tensor,
head_mask: tf.Tensor,
encoder_hidden_states: tf.Tensor,
encoder_attention_mask: tf.Tensor,
past_key_value: Tuple[tf.Tensor],
output_attentions: bool,
training: bool = False,
) -> Tuple[tf.Tensor]:
# 调用自注意力层进行注意力计算,返回自注意力层的输出
self_outputs = self.self_attention(
hidden_states=input_tensor,
attention_mask=attention_mask,
head_mask=head_mask,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
past_key_value=past_key_value,
output_attentions=output_attentions,
training=training,
)
# 将自注意力层的输出作为输入,通过输出层进行转换
attention_output = self.dense_output(
hidden_states=self_outputs[0], input_tensor=input_tensor, training=training
)
# 如果需要输出注意力值,则将其添加到输出元组中
outputs = (attention_output,) + self_outputs[1:]
return outputs
# 定义神经网络层的构建方法,用于在给定输入形状时构建层
def build(self, input_shape=None):
# 如果已经构建过,则直接返回,避免重复构建
if self.built:
return
# 设置标志位,表示该层已经构建完成
self.built = True
# 检查是否存在自注意力层,并构建其名称作用域下的层
if getattr(self, "self_attention", None) is not None:
with tf.name_scope(self.self_attention.name):
self.self_attention.build(None)
# 检查是否存在密集输出层,并构建其名称作用域下的层
if getattr(self, "dense_output", None) is not None:
with tf.name_scope(self.dense_output.name):
self.dense_output.build(None)
# Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->Camembert
class TFCamembertIntermediate(keras.layers.Layer):
def __init__(self, config: CamembertConfig, **kwargs):
super().__init__(**kwargs)
# 创建一个全连接层,用于中间状态转换,输出单元数由配置文件决定
self.dense = keras.layers.Dense(
units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
# 根据配置文件中指定的激活函数类型,获取对应的 TensorFlow 激活函数
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = get_tf_activation(config.hidden_act)
else:
self.intermediate_act_fn = config.hidden_act
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
# 将输入的隐藏状态通过全连接层处理
hidden_states = self.dense(inputs=hidden_states)
# 使用配置中指定的中间激活函数处理转换后的隐藏状态
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
# 构建层次结构,若已存在 dense 层则使用其名字的命名空间,构建时指定输入形状
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
# Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput with Bert->Camembert
class TFCamembertOutput(keras.layers.Layer):
def __init__(self, config: CamembertConfig, **kwargs):
super().__init__(**kwargs)
# 创建一个全连接层,用于输出层,输出单元数由配置文件决定
self.dense = keras.layers.Dense(
units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
# 创建一个 LayerNormalization 层,用于规范化层次,epsilon 值由配置文件决定
self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
# 创建一个 Dropout 层,用于在训练时进行随机失活,失活率由配置文件决定
self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
# 将输入的隐藏状态通过全连接层处理
hidden_states = self.dense(inputs=hidden_states)
# 若在训练状态下,对输出的隐藏状态进行随机失活处理
hidden_states = self.dropout(inputs=hidden_states, training=training)
# 将失活后的隐藏状态与输入张量进行加和,并通过 LayerNormalization 处理
hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
# 构建层次结构,若已存在 dense 层则使用其名字的命名空间,构建时指定输入形状
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.intermediate_size])
# 构建层次结构,若已存在 LayerNorm 层则使用其名字的命名空间,构建时指定输入形状
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
# Copied from transformers.models.bert.modeling_tf_bert.TFBertLayer with Bert->Camembert
class TFCamembertLayer(keras.layers.Layer):
# 使用指定的配置初始化 Camembert 模型
def __init__(self, config: CamembertConfig, **kwargs):
# 调用父类的初始化方法
super().__init__(**kwargs)
# 创建注意力层对象,使用给定的配置,并命名为"attention"
self.attention = TFCamembertAttention(config, name="attention")
# 设置是否为解码器的标志
self.is_decoder = config.is_decoder
# 设置是否添加交叉注意力的标志
self.add_cross_attention = config.add_cross_attention
# 如果要添加交叉注意力,需检查当前模型是否为解码器模型
if self.add_cross_attention:
if not self.is_decoder:
# 如果不是解码器模型且添加了交叉注意力,则引发错误
raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
# 创建交叉注意力层对象,使用给定的配置,并命名为"crossattention"
self.crossattention = TFCamembertAttention(config, name="crossattention")
# 创建中间层对象,使用给定的配置,并命名为"intermediate"
self.intermediate = TFCamembertIntermediate(config, name="intermediate")
# 创建输出层对象,使用给定的配置,并命名为"output"
self.bert_output = TFCamembertOutput(config, name="output")
# 定义模型的调用方法
def call(
self,
hidden_states: tf.Tensor, # 输入的隐藏状态张量
attention_mask: tf.Tensor, # 注意力掩码张量
head_mask: tf.Tensor, # 头部掩码张量
encoder_hidden_states: tf.Tensor | None, # 编码器的隐藏状态张量或空值
encoder_attention_mask: tf.Tensor | None, # 编码器的注意力掩码张量或空值
past_key_value: Tuple[tf.Tensor] | None, # 过去的键-值张量元组或空值
output_attentions: bool, # 是否输出注意力权重
training: bool = False, # 是否处于训练模式,默认为False
# Full definition of the call method; it returns a tuple of tf.Tensor
def call(
self,
hidden_states: tf.Tensor,
attention_mask: Optional[tf.Tensor] = None,
head_mask: Optional[tf.Tensor] = None,
encoder_hidden_states: Optional[tf.Tensor] = None,
encoder_attention_mask: Optional[tf.Tensor] = None,
past_key_value: Optional[Tuple[tf.Tensor]] = None,
output_attentions: Optional[bool] = False,
training: Optional[bool] = False,
) -> Tuple[tf.Tensor]:
# 如果 past_key_value 不为 None,则提取出 self-attention 的过去键/值缓存
self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
# 调用 self.attention 方法进行自注意力计算
self_attention_outputs = self.attention(
input_tensor=hidden_states,
attention_mask=attention_mask,
head_mask=head_mask,
encoder_hidden_states=None,
encoder_attention_mask=None,
past_key_value=self_attn_past_key_value,
output_attentions=output_attentions,
training=training,
)
# 获取 self-attention 的输出
attention_output = self_attention_outputs[0]
# 如果模型是解码器模型
if self.is_decoder:
# 输出中除了 self_attention_outputs 中的第一个元素之外的所有元素
outputs = self_attention_outputs[1:-1]
# 提取 self_attention_outputs 中的最后一个元素作为 present_key_value
present_key_value = self_attention_outputs[-1]
else:
# 输出中包含 self_attention_outputs 中除第一个元素外的所有元素(如果输出注意力权重的话)
outputs = self_attention_outputs[1:]
# 初始化 cross_attn_present_key_value 为 None
cross_attn_present_key_value = None
# 如果模型是解码器并且存在编码器的隐藏状态
if self.is_decoder and encoder_hidden_states is not None:
# 如果模型没有交叉注意力层,则引发 ValueError 异常
if not hasattr(self, "crossattention"):
raise ValueError(
f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers"
" by setting `config.add_cross_attention=True`"
)
# 如果 past_key_value 不为 None,则提取出交叉注意力的过去键/值缓存
cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
# 调用 self.crossattention 方法进行交叉注意力计算
cross_attention_outputs = self.crossattention(
input_tensor=attention_output,
attention_mask=attention_mask,
head_mask=head_mask,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
past_key_value=cross_attn_past_key_value,
output_attentions=output_attentions,
training=training,
)
# 获取交叉注意力的输出
attention_output = cross_attention_outputs[0]
# 将交叉注意力的输出中除了第一个和最后一个元素之外的所有元素添加到 outputs 中
outputs = outputs + cross_attention_outputs[1:-1]
# 将交叉注意力的输出中的最后一个元素添加到 present_key_value 中
cross_attn_present_key_value = cross_attention_outputs[-1]
present_key_value = present_key_value + cross_attn_present_key_value
# 对注意力输出进行中间层处理
intermediate_output = self.intermediate(hidden_states=attention_output)
# 对中间层输出进行最终的 Bert 输出层处理
layer_output = self.bert_output(
hidden_states=intermediate_output, input_tensor=attention_output, training=training
)
# 将最终的层输出添加到 outputs 中
outputs = (layer_output,) + outputs
# 如果模型是解码器,将注意力键/值作为最后的输出添加到 outputs 中
if self.is_decoder:
outputs = outputs + (present_key_value,)
# 返回最终的输出元组
return outputs
# 构建方法,用于构建模型的层次结构。如果已经构建过,则直接返回。
def build(self, input_shape=None):
# 如果已经构建过,直接返回,不再重复构建
if self.built:
return
# 将标志位设置为已构建
self.built = True
# 如果存在 self.attention 属性,则构建 self.attention 层次结构
if getattr(self, "attention", None) is not None:
# 使用 tf.name_scope 为 self.attention 层创建命名空间
with tf.name_scope(self.attention.name):
# 调用 self.attention 的 build 方法来构建该层
self.attention.build(None)
# 如果存在 self.intermediate 属性,则构建 self.intermediate 层次结构
if getattr(self, "intermediate", None) is not None:
# 使用 tf.name_scope 为 self.intermediate 层创建命名空间
with tf.name_scope(self.intermediate.name):
# 调用 self.intermediate 的 build 方法来构建该层
self.intermediate.build(None)
# 如果存在 self.bert_output 属性,则构建 self.bert_output 层次结构
if getattr(self, "bert_output", None) is not None:
# 使用 tf.name_scope 为 self.bert_output 层创建命名空间
with tf.name_scope(self.bert_output.name):
# 调用 self.bert_output 的 build 方法来构建该层
self.bert_output.build(None)
# 如果存在 self.crossattention 属性,则构建 self.crossattention 层次结构
if getattr(self, "crossattention", None) is not None:
# 使用 tf.name_scope 为 self.crossattention 层创建命名空间
with tf.name_scope(self.crossattention.name):
# 调用 self.crossattention 的 build 方法来构建该层
self.crossattention.build(None)
# 从 transformers.models.bert.modeling_tf_bert.TFBertEncoder 复制代码,将其中的 Bert 替换为 Camembert
class TFCamembertEncoder(keras.layers.Layer):
# 初始化函数,接收 CamembertConfig 对象作为参数
def __init__(self, config: CamembertConfig, **kwargs):
super().__init__(**kwargs)
self.config = config
# 创建 CamembertLayer 的列表,根据层数进行命名
self.layer = [TFCamembertLayer(config, name=f"layer_._{i}") for i in range(config.num_hidden_layers)]
# 前向传播函数,接收多个参数和返回类型的注解
def call(
self,
hidden_states: tf.Tensor,
attention_mask: tf.Tensor,
head_mask: tf.Tensor,
encoder_hidden_states: tf.Tensor | None,
encoder_attention_mask: tf.Tensor | None,
past_key_values: Tuple[Tuple[tf.Tensor]] | None,
use_cache: Optional[bool],
output_attentions: bool,
output_hidden_states: bool,
return_dict: bool,
training: bool = False,
) -> Union[TFBaseModelOutputWithPastAndCrossAttentions, Tuple[tf.Tensor]]:
# 初始化空元组或 None,用于存储中间结果
all_hidden_states = () if output_hidden_states else None
all_attentions = () if output_attentions else None
all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
# 若 use_cache 为 True,则初始化空元组用于存储下一层的缓存
next_decoder_cache = () if use_cache else None
# 遍历每一层的 CamembertLayer
for i, layer_module in enumerate(self.layer):
# 如果需要输出隐藏状态,则将当前隐藏状态添加到 all_hidden_states
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
# 获取当前层的过去键值对,如果 past_key_values 不为 None
past_key_value = past_key_values[i] if past_key_values is not None else None
# 调用当前层的前向传播函数,计算当前层的输出
layer_outputs = layer_module(
hidden_states=hidden_states,
attention_mask=attention_mask,
head_mask=head_mask[i],
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
past_key_value=past_key_value,
output_attentions=output_attentions,
training=training,
)
# 更新 hidden_states 为当前层的输出的第一个元素
hidden_states = layer_outputs[0]
# 如果 use_cache 为 True,则更新下一层的缓存
if use_cache:
next_decoder_cache += (layer_outputs[-1],)
# 如果 output_attentions 为 True,则将当前层的注意力加入 all_attentions
if output_attentions:
all_attentions = all_attentions + (layer_outputs[1],)
# 如果配置中包含交叉注意力,并且 encoder_hidden_states 不为 None,则将交叉注意力加入 all_cross_attentions
if self.config.add_cross_attention and encoder_hidden_states is not None:
all_cross_attentions = all_cross_attentions + (layer_outputs[2],)
# 添加最后一层的隐藏状态到 all_hidden_states
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
# 如果 return_dict 为 False,则返回非空的元组
if not return_dict:
return tuple(
v for v in [hidden_states, all_hidden_states, all_attentions, all_cross_attentions] if v is not None
)
# 返回 TFBaseModelOutputWithPastAndCrossAttentions 对象,包含各类输出结果
return TFBaseModelOutputWithPastAndCrossAttentions(
last_hidden_state=hidden_states,
past_key_values=next_decoder_cache,
hidden_states=all_hidden_states,
attentions=all_attentions,
cross_attentions=all_cross_attentions,
)
# 定义一个方法 `build`,用于构建神经网络模型的层次结构
def build(self, input_shape=None):
# 如果模型已经构建过,直接返回,避免重复构建
if self.built:
return
# 将模型标记为已构建状态
self.built = True
# 检查是否存在 `layer` 属性,并逐层构建每个子层
if getattr(self, "layer", None) is not None:
# 遍历每个子层
for layer in self.layer:
# 在 TensorFlow 中为每个层次设置命名空间,以层次的名字作为命名空间
with tf.name_scope(layer.name):
# 构建每个子层,此处传入 `None` 作为输入形状参数
layer.build(None)
@keras_serializable
# 从 transformers.models.roberta.modeling_tf_roberta.TFRobertaMainLayer 复制并修改为 Camembert
class TFCamembertMainLayer(keras.layers.Layer):
config_class = CamembertConfig
def __init__(self, config, add_pooling_layer=True, **kwargs):
super().__init__(**kwargs)
self.config = config # 设置配置对象
self.is_decoder = config.is_decoder # 是否为解码器
self.num_hidden_layers = config.num_hidden_layers # 隐藏层的数量
self.initializer_range = config.initializer_range # 初始化范围
self.output_attentions = config.output_attentions # 是否输出注意力权重
self.output_hidden_states = config.output_hidden_states # 是否输出隐藏状态
self.return_dict = config.use_return_dict # 是否返回字典格式的输出
self.encoder = TFCamembertEncoder(config, name="encoder") # Camembert 编码器
self.pooler = TFCamembertPooler(config, name="pooler") if add_pooling_layer else None # 可选的池化层
# embeddings 必须是最后声明的,以保持权重的顺序
self.embeddings = TFCamembertEmbeddings(config, name="embeddings") # Camembert embeddings
# 从 transformers.models.bert.modeling_tf_bert.TFBertMainLayer.get_input_embeddings 复制
def get_input_embeddings(self) -> keras.layers.Layer:
return self.embeddings # 获取输入 embeddings
# 从 transformers.models.bert.modeling_tf_bert.TFBertMainLayer.set_input_embeddings 复制
def set_input_embeddings(self, value: tf.Variable):
self.embeddings.weight = value # 设置 embeddings 的权重
self.embeddings.vocab_size = shape_list(value)[0] # 设置 embeddings 的词汇表大小
# 从 transformers.models.bert.modeling_tf_bert.TFBertMainLayer._prune_heads 复制
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
raise NotImplementedError # 未实现的方法,用于剪枝模型的注意力头部
@unpack_inputs
# 从 transformers.models.bert.modeling_tf_bert.TFBertMainLayer.call 复制
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
encoder_hidden_states: np.ndarray | tf.Tensor | None = None,
encoder_attention_mask: np.ndarray | tf.Tensor | None = None,
past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: bool = False,
def build(self, input_shape=None):
# If the layer has already been built, do nothing
if self.built:
return
# Mark the layer as built
self.built = True
# Build the encoder, if present
if getattr(self, "encoder", None) is not None:
# Use the encoder's name as the TensorFlow name scope
with tf.name_scope(self.encoder.name):
# Build the encoder with input_shape set to None
self.encoder.build(None)
# Build the pooler, if present
if getattr(self, "pooler", None) is not None:
# Use the pooler's name as the TensorFlow name scope
with tf.name_scope(self.pooler.name):
# Build the pooler with input_shape set to None
self.pooler.build(None)
# Build the embeddings, if present
if getattr(self, "embeddings", None) is not None:
# Use the embeddings' name as the TensorFlow name scope
with tf.name_scope(self.embeddings.name):
# Build the embeddings with input_shape set to None
self.embeddings.build(None)
# TFCamembertPreTrainedModel, derived from TFPreTrainedModel
class TFCamembertPreTrainedModel(TFPreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
# Configuration class for this model
config_class = CamembertConfig
# Prefix of the base model
base_model_prefix = "roberta"
# Add the bare-model docstring via the add_start_docstrings decorator, using CAMEMBERT_START_DOCSTRING
@add_start_docstrings(
"The bare CamemBERT Model transformer outputting raw hidden-states without any specific head on top.",
CAMEMBERT_START_DOCSTRING,
)
# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaModel with Roberta->Camembert, ROBERTA->CAMEMBERT
class TFCamembertModel(TFCamembertPreTrainedModel):
def __init__(self, config, *inputs, **kwargs):
# Call the parent constructor with the config and any extra arguments
super().__init__(config, *inputs, **kwargs)
# The main Camembert layer, named "roberta"
self.roberta = TFCamembertMainLayer(config, name="roberta")
# In the original source the call method is decorated with @unpack_inputs,
# @add_start_docstrings_to_model_forward(CAMEMBERT_INPUTS_DOCSTRING) and @add_code_sample_docstrings
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
encoder_hidden_states: np.ndarray | tf.Tensor | None = None,
encoder_attention_mask: np.ndarray | tf.Tensor | None = None,
past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: Optional[bool] = False,
# end of the parameter list
) -> Union[Tuple, TFBaseModelOutputWithPoolingAndCrossAttentions]:
r"""
encoder_hidden_states (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
the model is configured as a decoder.
encoder_attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
past_key_values (`Tuple[Tuple[tf.Tensor]]` of length `config.n_layers`)
contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
`decoder_input_ids` of shape `(batch_size, sequence_length)`.
use_cache (`bool`, *optional*, defaults to `True`):
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
`past_key_values`). Set to `False` during training, `True` during generation
"""
outputs = self.roberta(
input_ids=input_ids,  # input token ids
attention_mask=attention_mask,  # attention mask marking the real (non-padding) tokens
token_type_ids=token_type_ids,  # token-type ids distinguishing sentence A from sentence B
position_ids=position_ids,  # position ids of the tokens
head_mask=head_mask,  # mask selecting which attention heads to disable
inputs_embeds=inputs_embeds,  # precomputed input embeddings
encoder_hidden_states=encoder_hidden_states,  # hidden states of the encoder (for cross-attention)
encoder_attention_mask=encoder_attention_mask,  # attention mask of the encoder
past_key_values=past_key_values,  # precomputed key/value states used to speed up decoding
use_cache=use_cache,  # whether to return key/value states for faster decoding
output_attentions=output_attentions,  # whether to return the attention weights
output_hidden_states=output_hidden_states,  # whether to return the hidden states
return_dict=return_dict,  # whether to return a dict or a tuple
training=training,  # whether the model is in training mode
)
return outputs
def build(self, input_shape=None):
if self.built:
return  # nothing to do if the model was already built
self.built = True  # mark the model as built
if getattr(self, "roberta", None) is not None:
with tf.name_scope(self.roberta.name):  # use the layer's name as the TF name scope
self.roberta.build(None)  # build the main Camembert/RoBERTa layer
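A minimal TF usage sketch, not part of the library source; the checkpoint name is illustrative and, if only PyTorch weights are published for it, loading may additionally need `from_pt=True`:
# Usage sketch for TFCamembertModel (illustrative checkpoint).
from transformers import AutoTokenizer, TFCamembertModel

tokenizer = AutoTokenizer.from_pretrained("almanach/camembert-base")
model = TFCamembertModel.from_pretrained("almanach/camembert-base")

inputs = tokenizer("J'aime le camembert", return_tensors="tf")
outputs = model(**inputs)
print(outputs.last_hidden_state.shape)  # (batch_size, seq_len, hidden_size)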
# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaLMHead with Roberta->Camembert
class TFCamembertLMHead(keras.layers.Layer):
"""Camembert head for masked language modeling."""
def __init__(self, config, input_embeddings, **kwargs):
super().__init__(**kwargs)
self.config = config
self.hidden_size = config.hidden_size
self.dense = keras.layers.Dense(
config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
self.layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm")
self.act = get_tf_activation("gelu")
# The output weights are the same as the input embeddings, but there is an output-only bias for each token.
self.decoder = input_embeddings
def build(self, input_shape=None):
self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias")
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
if getattr(self, "layer_norm", None) is not None:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build([None, None, self.config.hidden_size])
def get_output_embeddings(self):
return self.decoder
def set_output_embeddings(self, value):
self.decoder.weight = value
self.decoder.vocab_size = shape_list(value)[0]
def get_bias(self):
return {"bias": self.bias}
def set_bias(self, value):
self.bias = value["bias"]
self.config.vocab_size = shape_list(value["bias"])[0]
def call(self, hidden_states):
hidden_states = self.dense(hidden_states)
hidden_states = self.act(hidden_states)
hidden_states = self.layer_norm(hidden_states)
# Project back to the vocabulary size, with a per-token bias
seq_length = shape_list(tensor=hidden_states)[1]
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.hidden_size])
hidden_states = tf.matmul(a=hidden_states, b=self.decoder.weight, transpose_b=True)
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.config.vocab_size])
hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias)
return hidden_states
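The head projects the transformed hidden states back to vocabulary size by multiplying with the (transposed) shared embedding matrix and adding a bias; the shapes can be checked in isolation with hypothetical sizes:
# Shape check of the tied-weight projection used by the LM head (hypothetical sizes, no real weights).
import tensorflow as tf

vocab_size, hidden_size, seq_len = 32005, 768, 6
embedding_weight = tf.random.normal((vocab_size, hidden_size))  # stands in for the shared input embedding matrix
bias = tf.zeros((vocab_size,))

hidden_states = tf.random.normal((1, seq_len, hidden_size))
flat = tf.reshape(hidden_states, (-1, hidden_size))
scores = tf.matmul(flat, embedding_weight, transpose_b=True)  # (seq_len, vocab_size)
scores = tf.reshape(scores, (-1, seq_len, vocab_size)) + bias
print(scores.shape)  # (1, 6, 32005): one score per position and vocabulary entry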
@add_start_docstrings(
"""CamemBERT Model with a `language modeling` head on top.""",
CAMEMBERT_START_DOCSTRING,
)
# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaForMaskedLM with Roberta->Camembert, ROBERTA->CAMEMBERT
class TFCamembertForMaskedLM(TFCamembertPreTrainedModel, TFMaskedLanguageModelingLoss):
# names with a '.' represent the authorized unexpected/missing layers when a TF model is loaded from a PT model
_keys_to_ignore_on_load_unexpected = [r"pooler", r"lm_head.decoder.weight"]
def __init__(self, config, *inputs, **kwargs):
# Call the parent constructor with the config and any extra arguments
super().__init__(config, *inputs, **kwargs)
# Main Camembert layer without the pooling layer, named "roberta"
self.roberta = TFCamembertMainLayer(config, add_pooling_layer=False, name="roberta")
# Language-modeling head tied to the embeddings of the main layer, named "lm_head"
self.lm_head = TFCamembertLMHead(config, self.roberta.embeddings, name="lm_head")
# Return the language-modeling head
def get_lm_head(self):
return self.lm_head
# Deprecated helper returning the prefixed bias name; emits a FutureWarning
def get_prefix_bias_name(self):
warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning)
# Return the model name and head name joined by "/"
return self.name + "/" + self.lm_head.name
# The forward pass below unpacks the inputs and adds the forward and code-sample docstrings
@unpack_inputs
@add_start_docstrings_to_model_forward(CAMEMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFMaskedLMOutput,
config_class=_CONFIG_FOR_DOC,
mask="<mask>",
expected_output="' Paris'",
expected_loss=0.1,
)
# 模型的前向传递方法,接受多个输入参数,并返回预测输出
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: np.ndarray | tf.Tensor | None = None,
training: Optional[bool] = False,
) -> Union[TFMaskedLMOutput, Tuple[tf.Tensor]]:
r"""
labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
"""
# 调用 RoBERTa 模型进行前向传播,获取模型的输出结果
outputs = self.roberta(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
# 从 RoBERTa 模型的输出中提取序列输出
sequence_output = outputs[0]
# 将序列输出送入语言模型头部,得到预测分数(logits)
prediction_scores = self.lm_head(sequence_output)
# 如果提供了标签,则计算损失;否则损失设为 None
loss = None if labels is None else self.hf_compute_loss(labels, prediction_scores)
# 如果不要求返回字典形式的输出,则按照元组形式返回结果
if not return_dict:
output = (prediction_scores,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
# 返回 TFMaskedLMOutput 类型的对象,包括损失、预测分数、隐藏状态和注意力权重
return TFMaskedLMOutput(
loss=loss,
logits=prediction_scores,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
# 如果已经构建过模型,则直接返回
if self.built:
return
# 标记模型已经构建
self.built = True
# 如果定义了 RoBERTa 模型,则构建 RoBERTa
if getattr(self, "roberta", None) is not None:
with tf.name_scope(self.roberta.name):
self.roberta.build(None)
# 如果定义了语言模型头部,则构建语言模型头部
if getattr(self, "lm_head", None) is not None:
with tf.name_scope(self.lm_head.name):
self.lm_head.build(None)
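A minimal fill-mask sketch, not part of the library source; the checkpoint name is illustrative and may require `from_pt=True` if only PyTorch weights are available for it:
# Fill-mask sketch for TFCamembertForMaskedLM (illustrative checkpoint).
import tensorflow as tf
from transformers import AutoTokenizer, TFCamembertForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("almanach/camembert-base")
model = TFCamembertForMaskedLM.from_pretrained("almanach/camembert-base")

inputs = tokenizer("La capitale de la France est <mask>.", return_tensors="tf")
logits = model(**inputs).logits
mask_position = tf.where(inputs["input_ids"][0] == tokenizer.mask_token_id)[0, 0]
predicted_id = tf.argmax(logits[0, mask_position])
print(tokenizer.decode([int(predicted_id)]))  # expected to be close to " Paris" for the pretrained checkpoint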
# 从transformers.models.roberta.modeling_tf_roberta.TFRobertaClassificationHead复制而来,定义了一个用于句子级别分类任务的头部。
class TFCamembertClassificationHead(keras.layers.Layer):
"""Head for sentence-level classification tasks."""
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
# 创建一个全连接层,输出维度为config.hidden_size,激活函数为tanh
self.dense = keras.layers.Dense(
config.hidden_size,
kernel_initializer=get_initializer(config.initializer_range),
activation="tanh",
name="dense",
)
# 根据config中的设置,选择分类器的dropout率,如果未指定则使用hidden_dropout_prob
classifier_dropout = (
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
# 定义一个Dropout层,应用于全连接层的输出
self.dropout = keras.layers.Dropout(classifier_dropout)
# 创建一个全连接层,输出维度为config.num_labels,用于输出分类任务的结果
self.out_proj = keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="out_proj"
)
self.config = config
def call(self, features, training=False):
# 取出features的第一个token的向量表示,通常代表<CLS> token
x = features[:, 0, :] # take <s> token (equiv. to [CLS])
# 应用dropout层到x上,用于训练时进行随机失活
x = self.dropout(x, training=training)
# 将x传入全连接层dense中进行线性变换并激活
x = self.dense(x)
# 再次应用dropout层到x上,用于训练时进行随机失活
x = self.dropout(x, training=training)
# 将x传入全连接层out_proj中,生成最终的分类结果
x = self.out_proj(x)
return x
def build(self, input_shape=None):
if self.built:
return
self.built = True
# 如果已经建立过网络,则直接返回,否则开始构建
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
# 构建全连接层dense,输入维度为config.hidden_size
self.dense.build([None, None, self.config.hidden_size])
if getattr(self, "out_proj", None) is not None:
with tf.name_scope(self.out_proj.name):
# 构建全连接层out_proj,输入维度为config.hidden_size
self.out_proj.build([None, None, self.config.hidden_size])
@add_start_docstrings(
"""
CamemBERT Model transformer with a sequence classification/regression head on top (a linear layer on top of the
pooled output) e.g. for GLUE tasks.
""",
CAMEMBERT_START_DOCSTRING,
)
# 从transformers.models.roberta.modeling_tf_roberta.TFRobertaForSequenceClassification中复制,仅将Roberta替换为Camembert,ROBERTA替换为CAMEMBERT
class TFCamembertForSequenceClassification(TFCamembertPreTrainedModel, TFSequenceClassificationLoss):
# names with a '.' represents the authorized unexpected/missing layers when a TF model is loaded from a PT model
# _keys_to_ignore_on_load_unexpected列出了在从PT模型加载TF模型时,可以忽略的意外/丢失的层的名称模式
_keys_to_ignore_on_load_unexpected = [r"pooler", r"lm_head"]
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
# 设置分类任务的标签数量
self.num_labels = config.num_labels
# 创建Camembert主体层,用于处理输入序列,不包含池化层
self.roberta = TFCamembertMainLayer(config, add_pooling_layer=False, name="roberta")
# 创建Camembert分类头部,用于生成分类任务的输出
self.classifier = TFCamembertClassificationHead(config, name="classifier")
@unpack_inputs
@add_start_docstrings_to_model_forward(CAMEMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
# 使用装饰器为方法添加文档字符串,指定模型和输出类型,以及配置类和预期输出和损失
@add_code_sample_docstrings(
checkpoint="cardiffnlp/twitter-roberta-base-emotion",
output_type=TFSequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
expected_output="'optimism'",
expected_loss=0.08,
)
# 定义模型的调用方法,接受多个输入参数和可选的标签,返回分类器输出或者元组包含 logits
def call(
self,
input_ids: TFModelInputType | None = None, # 输入文本的 token IDs
attention_mask: np.ndarray | tf.Tensor | None = None, # 表示输入文本中实际词汇的掩码
token_type_ids: np.ndarray | tf.Tensor | None = None, # 区分不同句子的标识符
position_ids: np.ndarray | tf.Tensor | None = None, # 表示输入中 token 的位置
head_mask: np.ndarray | tf.Tensor | None = None, # 多头注意力机制的掩码
inputs_embeds: np.ndarray | tf.Tensor | None = None, # 输入 token 的嵌入表示
output_attentions: Optional[bool] = None, # 是否返回注意力权重
output_hidden_states: Optional[bool] = None, # 是否返回隐藏状态
return_dict: Optional[bool] = None, # 是否返回 TFSequenceClassifierOutput 对象
labels: np.ndarray | tf.Tensor | None = None, # 计算序列分类/回归损失的标签
training: Optional[bool] = False, # 是否处于训练模式
) -> Union[TFSequenceClassifierOutput, Tuple[tf.Tensor]]:
"""
labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
# 使用 RoBERTa 模型处理输入数据,返回模型输出
outputs = self.roberta(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
# 获取序列输出
sequence_output = outputs[0]
# 使用分类器模型处理序列输出,得到 logits
logits = self.classifier(sequence_output, training=training)
# 如果标签为空,则损失也为空;否则计算标签和 logits 之间的损失
loss = None if labels is None else self.hf_compute_loss(labels, logits)
# 如果不返回字典,则按顺序返回 logits 和其他输出(如隐藏状态)
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
# 返回 TFSequenceClassifierOutput 对象,包括损失、logits、隐藏状态和注意力权重
return TFSequenceClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
# 构建模型,初始化 RoBERTa 和分类器层
def build(self, input_shape=None):
# 如果已经构建过,则直接返回
if self.built:
return
# 标记模型已经构建
self.built = True
# 如果 RoBERTa 模型存在,则构建 RoBERTa 层
if getattr(self, "roberta", None) is not None:
with tf.name_scope(self.roberta.name):
self.roberta.build(None)
# 如果分类器存在,则构建分类器层
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build(None)
# 使用装饰器将以下字符串添加到模型文档字符串的开头,描述了 CamemBERT 模型及其在命名实体识别 (NER) 任务中的用途
@add_start_docstrings(
"""
CamemBERT Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g.
for Named-Entity-Recognition (NER) tasks.
""",
CAMEMBERT_START_DOCSTRING,
)
# 从 transformers.models.roberta.modeling_tf_roberta.TFRobertaForTokenClassification 复制,将 Roberta 替换为 Camembert,ROBERTA 替换为 CAMEMBERT
class TFCamembertForTokenClassification(TFCamembertPreTrainedModel, TFTokenClassificationLoss):
# 在从 PyTorch 模型加载到 TensorFlow 模型时,这些键表示不希望或缺少的层
_keys_to_ignore_on_load_unexpected = [r"pooler", r"lm_head"]
_keys_to_ignore_on_load_missing = [r"dropout"]
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
# 初始化 Camembert 主层,排除添加池化层,命名为 "roberta"
self.roberta = TFCamembertMainLayer(config, add_pooling_layer=False, name="roberta")
# 设置分类器的 dropout 比例为 config.classifier_dropout,若未指定则使用 config.hidden_dropout_prob
classifier_dropout = (
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
self.dropout = keras.layers.Dropout(classifier_dropout)
# 定义分类器层,输出维度为 config.num_labels,使用给定范围内的初始化器进行初始化,命名为 "classifier"
self.classifier = keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
# 保存配置信息
self.config = config
# 使用装饰器解包输入参数,并添加模型前向传播的文档字符串,描述输入格式
@unpack_inputs
@add_start_docstrings_to_model_forward(CAMEMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint="ydshieh/roberta-large-ner-english",
output_type=TFTokenClassifierOutput,
config_class=_CONFIG_FOR_DOC,
expected_output="['O', 'ORG', 'ORG', 'O', 'O', 'O', 'O', 'O', 'LOC', 'O', 'LOC', 'LOC']",
expected_loss=0.01,
)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: np.ndarray | tf.Tensor | None = None,
training: Optional[bool] = False,
) -> Union[TFTokenClassifierOutput, Tuple[tf.Tensor]]:
r"""
labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
# The return type hint above allows either a TFTokenClassifierOutput or a tuple of tf.Tensor
outputs = self.roberta(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
# 获取 RoBERTa 模型的输出序列
sequence_output = outputs[0]
# 应用 dropout 操作,用于防止过拟合
sequence_output = self.dropout(sequence_output, training=training)
# 对输出序列进行分类器分类
logits = self.classifier(sequence_output)
# 如果提供了标签,计算损失函数
loss = None if labels is None else self.hf_compute_loss(labels, logits)
# 如果 return_dict 为 False,则返回不同的输出格式
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
# 如果 return_dict 为 True,则构建 TFTokenClassifierOutput 对象并返回
return TFTokenClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
# 如果模型已经构建,直接返回
if self.built:
return
# 设置模型已构建标志
self.built = True
# 如果存在 RoBERTa 模型,则构建 RoBERTa 模型
if getattr(self, "roberta", None) is not None:
with tf.name_scope(self.roberta.name):
self.roberta.build(None)
# 如果存在分类器模型,则构建分类器模型
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_size])
@add_start_docstrings(
"""
CamemBERT Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
softmax) e.g. for RocStories/SWAG tasks.
""",
CAMEMBERT_START_DOCSTRING,
)
# 从transformers.models.roberta.modeling_tf_roberta.TFRobertaForMultipleChoice复制过来,将Roberta改为Camembert,ROBERTA改为CAMEMBERT
class TFCamembertForMultipleChoice(TFCamembertPreTrainedModel, TFMultipleChoiceLoss):
# 当从PyTorch模型加载到TensorFlow模型时,以下带'.'的名称表示授权的意外/缺失层
_keys_to_ignore_on_load_unexpected = [r"lm_head"]
# 当从PyTorch模型加载到TensorFlow模型时,以下名称表示授权的缺失层
_keys_to_ignore_on_load_missing = [r"dropout"]
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
# 使用TFCamembertMainLayer初始化Camembert主层,并命名为"roberta"
self.roberta = TFCamembertMainLayer(config, name="roberta")
# 使用config.hidden_dropout_prob初始化Dropout层
self.dropout = keras.layers.Dropout(config.hidden_dropout_prob)
# 使用config.initializer_range初始化Dense层,用于分类
self.classifier = keras.layers.Dense(
1, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(
CAMEMBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")
)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFMultipleChoiceModelOutput,
config_class=_CONFIG_FOR_DOC,
)
# 模型的前向传播函数,接受多个输入参数并返回相应输出
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: np.ndarray | tf.Tensor | None = None,
training: Optional[bool] = False,
# `training` indicates whether the layer should run in training mode, defaults to False
) -> Union[TFMultipleChoiceModelOutput, Tuple[tf.Tensor]]:
r"""
labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices]`
where `num_choices` is the size of the second dimension of the input tensors. (See `input_ids` above)
"""
# Derive num_choices and seq_length from input_ids when available, otherwise from inputs_embeds
if input_ids is not None:
num_choices = shape_list(input_ids)[1]  # number of choices
seq_length = shape_list(input_ids)[2]  # sequence length
else:
num_choices = shape_list(inputs_embeds)[1]  # number of choices (from the embeddings)
seq_length = shape_list(inputs_embeds)[2]  # sequence length (from the embeddings)
# Flatten the choice dimension of every provided input tensor
flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) if input_ids is not None else None
flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None
flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None
flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None
# Run the flattened inputs through the shared encoder
outputs = self.roberta(
flat_input_ids,
flat_attention_mask,
flat_token_type_ids,
flat_position_ids,
head_mask,
inputs_embeds,
output_attentions,
output_hidden_states,
return_dict=return_dict,
training=training,
)
# Pooled output (usually the second element of the encoder outputs)
pooled_output = outputs[1]
pooled_output = self.dropout(pooled_output, training=training)  # apply dropout to the pooled output
logits = self.classifier(pooled_output)  # one score per flattened (example, choice) row
# Reshape the logits back to (batch_size, num_choices)
reshaped_logits = tf.reshape(logits, (-1, num_choices))
# Compute the loss when labels are provided
loss = None if labels is None else self.hf_compute_loss(labels, reshaped_logits)
# Return a plain tuple when return_dict is False
if not return_dict:
output = (reshaped_logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
# Otherwise return a TFMultipleChoiceModelOutput
return TFMultipleChoiceModelOutput(
loss=loss,
logits=reshaped_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
# Build self.roberta if it exists
if getattr(self, "roberta", None) is not None:
with tf.name_scope(self.roberta.name):
self.roberta.build(None)
# Build self.classifier if it exists
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_size])
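To make the reshaping logic above concrete, here is a minimal sketch with toy shapes and random stand-in tensors (not tied to any checkpoint or real classifier): a `(batch_size, num_choices, seq_length)` batch is flattened before the shared encoder, and the per-row scores are folded back into per-example choice logits.
```
import tensorflow as tf

batch_size, num_choices, seq_length = 2, 4, 8
# Hypothetical toy inputs; in practice these come from a tokenizer applied to each (context, choice) pair.
input_ids = tf.random.uniform((batch_size, num_choices, seq_length), maxval=100, dtype=tf.int32)

# The model flattens the choice dimension before calling the shared encoder ...
flat_input_ids = tf.reshape(input_ids, (-1, seq_length))            # (batch_size * num_choices, seq_length)

# ... produces one score per flattened row (random stand-in for classifier(pooled_output)) ...
per_row_logits = tf.random.normal((batch_size * num_choices, 1))

# ... and reshapes back so each example has `num_choices` competing scores.
reshaped_logits = tf.reshape(per_row_logits, (-1, num_choices))      # (batch_size, num_choices)
print(reshaped_logits.shape)
```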
@add_start_docstrings(
"""
CamemBERT Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
CAMEMBERT_START_DOCSTRING,
)
# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaForQuestionAnswering with Roberta->Camembert, ROBERTA->CAMEMBERT
class TFCamembertForQuestionAnswering(TFCamembertPreTrainedModel, TFQuestionAnsweringLoss):
# names with a '.' represents the authorized unexpected/missing layers when a TF model is loaded from a PT model
# 'pooler' and 'lm_head' are allowed to be unexpected/missing when loading a TF model from a PyTorch model
_keys_to_ignore_on_load_unexpected = [r"pooler", r"lm_head"]
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
# Camembert main layer without the pooling layer, named "roberta"
self.roberta = TFCamembertMainLayer(config, add_pooling_layer=False, name="roberta")
# QA output head: a dense layer with config.num_labels outputs, initialized with config.initializer_range
self.qa_outputs = keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(CAMEMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint="ydshieh/roberta-base-squad2",
output_type=TFQuestionAnsweringModelOutput,
config_class=_CONFIG_FOR_DOC,
expected_output="' puppet'",
expected_loss=0.86,
)
# Forward pass of the question-answering model
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
start_positions: np.ndarray | tf.Tensor | None = None,
end_positions: np.ndarray | tf.Tensor | None = None,
training: Optional[bool] = False,
) -> Union[TFQuestionAnsweringModelOutput, Tuple[tf.Tensor]]:
r"""
start_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
end_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
"""
# Run the RoBERTa main layer
outputs = self.roberta(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
# Sequence output of the encoder
sequence_output = outputs[0]
# Project the sequence output into start and end logits
logits = self.qa_outputs(sequence_output)
start_logits, end_logits = tf.split(logits, 2, axis=-1)
start_logits = tf.squeeze(start_logits, axis=-1)
end_logits = tf.squeeze(end_logits, axis=-1)
# Compute the loss when both start and end positions are provided
loss = None
if start_positions is not None and end_positions is not None:
labels = {"start_position": start_positions}
labels["end_position"] = end_positions
loss = self.hf_compute_loss(labels, (start_logits, end_logits))
# Return a plain tuple when return_dict is False
if not return_dict:
output = (start_logits, end_logits) + outputs[2:]
return ((loss,) + output) if loss is not None else output
# Otherwise return a TFQuestionAnsweringModelOutput with loss, logits, hidden states and attentions
return TFQuestionAnsweringModelOutput(
loss=loss,
start_logits=start_logits,
end_logits=end_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
# Build the RoBERTa main layer if it exists
if getattr(self, "roberta", None) is not None:
with tf.name_scope(self.roberta.name):
self.roberta.build(None)
# Build the QA output head if it exists
if getattr(self, "qa_outputs", None) is not None:
with tf.name_scope(self.qa_outputs.name):
self.qa_outputs.build([None, None, self.config.hidden_size])
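As a follow-up, a minimal decoding sketch for the start/end logits produced above; the tensors here are random stand-ins for real model outputs, and the greedy start/end rule is only one of several possible decoding strategies.
```
import tensorflow as tf

# Stand-ins for the model outputs; real values come from TFCamembertForQuestionAnswering.
seq_length = 10
start_logits = tf.random.normal((1, seq_length))
end_logits = tf.random.normal((1, seq_length))

start_index = int(tf.argmax(start_logits, axis=-1)[0])
end_index = int(tf.argmax(end_logits, axis=-1)[0])

# A minimal decoding rule: keep the span only if the start precedes the end.
if start_index <= end_index:
    answer_token_span = (start_index, end_index)
else:
    answer_token_span = None
print(answer_token_span)
```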
@add_start_docstrings(
"""CamemBERT Model with a `language modeling` head on top for CLM fine-tuning.""", CAMEMBERT_START_DOCSTRING
)
# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaForCausalLM with Roberta->Camembert, ROBERTA->CAMEMBERT
class TFCamembertForCausalLM(TFCamembertPreTrainedModel, TFCausalLanguageModelingLoss):
# Names with a '.' represent the authorized unexpected/missing layers when loading a TF model from a PyTorch model
_keys_to_ignore_on_load_unexpected = [r"pooler", r"lm_head.decoder.weight"]
def __init__(self, config: CamembertConfig, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
if not config.is_decoder:
logger.warning("If you want to use `TFCamembertLMHeadModel` as a standalone, add `is_decoder=True.`")
# Camembert main layer without the pooling layer, named "roberta"
self.roberta = TFCamembertMainLayer(config, add_pooling_layer=False, name="roberta")
# LM head tied to self.roberta.embeddings, named "lm_head"
self.lm_head = TFCamembertLMHead(config, input_embeddings=self.roberta.embeddings, name="lm_head")
def get_lm_head(self):
return self.lm_head
def get_prefix_bias_name(self):
warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning)
# Return the model name joined with the LM head name
return self.name + "/" + self.lm_head.name
# Copied from transformers.models.bert.modeling_tf_bert.TFBertLMHeadModel.prepare_inputs_for_generation
def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, **model_kwargs):
input_shape = input_ids.shape
# Create an all-ones attention mask when none is provided
if attention_mask is None:
attention_mask = tf.ones(input_shape)
# When a cache of past key values exists, only the last input ID needs to be fed
if past_key_values is not None:
input_ids = input_ids[:, -1:]
# Return the inputs expected by `call` during generation
return {"input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": past_key_values}
@unpack_inputs
@add_start_docstrings_to_model_forward(CAMEMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFCausalLMOutputWithCrossAttentions,
config_class=_CONFIG_FOR_DOC,
)
# Forward pass used for both inference and training of the causal LM
def call(
self,
input_ids: TFModelInputType | None = None,  # token IDs
attention_mask: np.ndarray | tf.Tensor | None = None,  # attention mask
token_type_ids: np.ndarray | tf.Tensor | None = None,  # token type IDs
position_ids: np.ndarray | tf.Tensor | None = None,  # position IDs
head_mask: np.ndarray | tf.Tensor | None = None,  # head mask
inputs_embeds: np.ndarray | tf.Tensor | None = None,  # pre-computed input embeddings
encoder_hidden_states: np.ndarray | tf.Tensor | None = None,  # encoder hidden states for cross-attention
encoder_attention_mask: np.ndarray | tf.Tensor | None = None,  # attention mask over the encoder inputs
past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None,  # cached key/value states
use_cache: Optional[bool] = None,  # whether to return a cache for fast decoding
output_attentions: Optional[bool] = None,  # whether to return attention weights
output_hidden_states: Optional[bool] = None,  # whether to return all hidden states
return_dict: Optional[bool] = None,  # whether to return a ModelOutput instead of a tuple
labels: np.ndarray | tf.Tensor | None = None,  # labels for the language modeling loss
training: Optional[bool] = False,  # whether the model is in training mode
# Build the model's submodules and their connections
def build(self, input_shape=None):
if self.built:
return  # already built, nothing to do
self.built = True  # mark the model as built
if getattr(self, "roberta", None) is not None:
# Build the "roberta" main layer under its own name scope
with tf.name_scope(self.roberta.name):
self.roberta.build(None)
if getattr(self, "lm_head", None) is not None:
# Build the "lm_head" under its own name scope
with tf.name_scope(self.lm_head.name):
self.lm_head.build(None)
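A small sketch of what `prepare_inputs_for_generation` above does across decoding steps, using toy tensors and a placeholder object standing in for the cache of key/value states.
```
import tensorflow as tf

input_ids = tf.constant([[5, 6, 7, 8]])
attention_mask = tf.ones(input_ids.shape)

# First step of generation: no cache yet, the full prompt is fed to the model.
step0 = {"input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": None}

# Later steps: with a cache, only the most recent token needs to be embedded again.
past = object()  # stand-in for the real cache of key/value tensors
step1 = {"input_ids": input_ids[:, -1:], "attention_mask": attention_mask, "past_key_values": past}
print(step0["input_ids"].shape, step1["input_ids"].shape)  # (1, 4) (1, 1)
```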
.\models\camembert\tokenization_camembert.py
""" Tokenization classes for Camembert model."""
import os
from shutil import copyfile
from typing import Any, Dict, List, Optional, Tuple
import sentencepiece as spm
from ...tokenization_utils import AddedToken, PreTrainedTokenizer
from ...utils import logging
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"almanach/camembert-base": "https://huggingface.co/almanach/camembert-base/resolve/main/sentencepiece.bpe.model",
}
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"almanach/camembert-base": 512,
}
SPIECE_UNDERLINE = "▁"
class CamembertTokenizer(PreTrainedTokenizer):
"""
Adapted from [`RobertaTokenizer`] and [`XLNetTokenizer`]. Construct a CamemBERT tokenizer. Based on
[SentencePiece](https://github.com/google/sentencepiece).
This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
this superclass for more information regarding those methods.
Attributes:
sp_model (`SentencePieceProcessor`):
The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["input_ids", "attention_mask"]
def __init__(
self,
vocab_file,
bos_token="<s>",
eos_token="</s>",
sep_token="</s>",
cls_token="<s>",
unk_token="<unk>",
pad_token="<pad>",
mask_token="<mask>",
additional_special_tokens=["<s>NOTUSED", "</s>NOTUSED", "<unk>NOTUSED"],
sp_model_kwargs: Optional[Dict[str, Any]] = None,
**kwargs,
) -> None:
mask_token = (
AddedToken(mask_token, lstrip=True, rstrip=False, normalized=False, special=True)
if isinstance(mask_token, str)
else mask_token
)
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(str(vocab_file))
self.vocab_file = vocab_file
self._added_tokens_decoder = {
0: AddedToken("<s>NOTUSED", special=True),
1: AddedToken(pad_token, special=True) if isinstance(pad_token, str) else pad_token,
2: AddedToken("</s>NOTUSED", special=True),
3: AddedToken(unk_token, special=True) if isinstance(unk_token, str) else unk_token,
4: AddedToken("<unk>NOTUSED", special=True),
}
self.fairseq_offset = 4
if "added_tokens_decoder" in kwargs:
kwargs["added_tokens_decoder"].update(self._added_tokens_decoder)
super().__init__(
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
sep_token=sep_token,
cls_token=cls_token,
pad_token=pad_token,
mask_token=mask_token,
additional_special_tokens=additional_special_tokens,
sp_model_kwargs=self.sp_model_kwargs,
**kwargs,
)
@property
def vocab_size(self):
return len(self.sp_model)
def get_vocab(self):
vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size + self.fairseq_offset)}
vocab.update(self.added_tokens_encoder)
return vocab
def _tokenize(self, text: str) -> List[str]:
return self.sp_model.encode(text, out_type=str)
def _convert_token_to_id(self, token):
"""将 token (str) 转换为对应的 id,使用词汇表."""
if self.sp_model.PieceToId(token) == 0:
return self.unk_token_id
return self.fairseq_offset + self.sp_model.PieceToId(token)
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
return self.sp_model.IdToPiece(index - self.fairseq_offset)
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (string) in a single string."""
current_sub_tokens = []
out_string = ""
prev_is_special = False
for token in tokens:
if token in self.all_special_tokens:
if not prev_is_special:
out_string += " "
out_string += self.sp_model.decode(current_sub_tokens) + token
prev_is_special = True
current_sub_tokens = []
else:
current_sub_tokens.append(token)
prev_is_special = False
out_string += self.sp_model.decode(current_sub_tokens)
return out_string.strip()
def __getstate__(self):
state = self.__dict__.copy()
state["sp_model"] = None
return state
def __setstate__(self, d):
self.__dict__ = d
if not hasattr(self, "sp_model_kwargs"):
self.sp_model_kwargs = {}
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(self.vocab_file)
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
out_vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
copyfile(self.vocab_file, out_vocab_file)
elif not os.path.isfile(self.vocab_file):
with open(out_vocab_file, "wb") as fi:
content_spiece_model = self.sp_model.serialized_model_proto()
fi.write(content_spiece_model)
return (out_vocab_file,)
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
adding special tokens. A CamemBERT sequence has the following format:
- single sequence: `<s> X </s>`
- pair of sequences: `<s> A </s></s> B </s>`
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of input IDs with the appropriate special tokens.
if token_ids_1 is None:
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
cls = [self.cls_token_id]
sep = [self.sep_token_id]
return cls + token_ids_0 + sep + sep + token_ids_1 + sep
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
从没有添加特殊标记的令牌列表中检索序列 ID。在使用分词器的 `prepare_for_model` 方法添加特殊标记时调用此方法。
Args:
token_ids_0 (`List[int]`):
ID 列表。
token_ids_1 (`List[int]`, *optional*):
第二个可选的序列对 ID 列表。
already_has_special_tokens (`bool`, *optional*, 默认为 `False`):
标记列表是否已经格式化为模型的特殊标记。
Returns:
`List[int]`: 一个整数列表,范围在 [0, 1]:1 表示特殊标记,0 表示序列标记。
"""
if already_has_special_tokens:
return super().get_special_tokens_mask(
token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
)
if token_ids_1 is None:
return [1] + ([0] * len(token_ids_0)) + [1]
return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
从序列对创建令牌类型 ID。这个方法用于创建用于区分不同序列的令牌类型 ID。
Args:
token_ids_0 (`List[int]`):
ID 列表。
token_ids_1 (`List[int]`, *optional*):
第二个可选的序列对 ID 列表。
Returns:
`List[int]`: 令牌类型 ID 列表。
"""
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. CamemBERT, like
RoBERTa, does not make use of token type ids, therefore a list of zeros is returned.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of zeros.
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
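To illustrate the fairseq-style id offset and the special-token layout used by the slow tokenizer above, here is a self-contained sketch; the numeric ids are toy values chosen for the example, not the ids of a real SentencePiece model.
```
# Toy stand-ins; a real CamembertTokenizer derives these from its SentencePiece model.
fairseq_offset = 4
cls_token_id, sep_token_id, unk_token_id = 5, 6, 3

def convert_sp_piece_id(sp_piece_id):
    # SentencePiece id 0 is <unk>; everything else is shifted to leave room
    # for the fairseq-style special tokens that occupy ids 0-4.
    if sp_piece_id == 0:
        return unk_token_id
    return sp_piece_id + fairseq_offset

def build_inputs_with_special_tokens(token_ids_0, token_ids_1=None):
    # Single sequence: <s> X </s>; pair: <s> A </s></s> B </s>
    if token_ids_1 is None:
        return [cls_token_id] + token_ids_0 + [sep_token_id]
    return [cls_token_id] + token_ids_0 + [sep_token_id, sep_token_id] + token_ids_1 + [sep_token_id]

print(build_inputs_with_special_tokens([convert_sp_piece_id(i) for i in (10, 11)]))
print(build_inputs_with_special_tokens([14], [15]))
```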
.\models\camembert\tokenization_camembert_fast.py
""" Fast tokenization classes for Camembert model."""
import os
from shutil import copyfile
from typing import List, Optional, Tuple
from ...tokenization_utils import AddedToken
from ...tokenization_utils_fast import PreTrainedTokenizerFast
from ...utils import is_sentencepiece_available, logging
if is_sentencepiece_available():
from .tokenization_camembert import CamembertTokenizer
else:
CamembertTokenizer = None
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model", "tokenizer_file": "tokenizer.json"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"almanach/camembert-base": "https://huggingface.co/almanach/camembert-base/resolve/main/sentencepiece.bpe.model",
},
"tokenizer_file": {
"almanach/camembert-base": "https://huggingface.co/almanach/camembert-base/resolve/main/tokenizer.json",
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"almanach/camembert-base": 512,
}
SPIECE_UNDERLINE = "▁"
class CamembertTokenizerFast(PreTrainedTokenizerFast):
"""
Construct a "fast" CamemBERT tokenizer (backed by HuggingFace's *tokenizers* library). Adapted from
[`RobertaTokenizer`] and [`XLNetTokenizer`]. Based on
[BPE](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=BPE#models).
This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
refer to this superclass for more information regarding those methods.
"""
Args:
vocab_file (`str`):
[SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
contains the vocabulary necessary to instantiate a tokenizer.
SentencePiece文件的路径,用于实例化分词器的词汇表文件。
bos_token (`str`, *optional*, defaults to `"<s>"`):
The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
序列的起始标记,用于预训练。可用作序列分类器的标记。
<Tip>
When building a sequence using special tokens, this is not the token that is used for the beginning of
sequence. The token used is the `cls_token`.
在使用特殊标记构建序列时,并非使用此标记作为序列的起始标记。实际使用的是 `cls_token`。
</Tip>
eos_token (`str`, *optional*, defaults to `"</s>"`):
The end of sequence token.
序列的结束标记。
<Tip>
When building a sequence using special tokens, this is not the token that is used for the end of sequence.
The token used is the `sep_token`.
在使用特殊标记构建序列时,并非使用此标记作为序列的结束标记。实际使用的是 `sep_token`。
</Tip>
sep_token (`str`, *optional*, defaults to `"</s>"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens.
分隔符标记,在构建来自多个序列的序列时使用,例如用于序列分类的两个序列,或用于问答中的文本和问题序列。也用作使用特殊标记构建的序列的最后一个标记。
cls_token (`str`, *optional*, defaults to `"<s>"`):
The classifier token which is used when doing sequence classification (classification of the whole sequence
instead of per-token classification). It is the first token of the sequence when built with special tokens.
在进行序列分类(整个序列而不是每个标记的分类)时使用的分类器标记。使用特殊标记构建序列时,它是序列的第一个标记。
unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
未知标记。不在词汇表中的标记无法转换为ID,因此将被设置为此标记。
pad_token (`str`, *optional*, defaults to `"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
用于填充的标记,例如在批处理不同长度的序列时使用。
mask_token (`str`, *optional*, defaults to `"<mask>"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
用于掩码值的标记。在进行掩码语言建模训练时使用的标记。这是模型将尝试预测的标记。
additional_special_tokens (`List[str]`, *optional*, defaults to `["<s>NOTUSED", "</s>NOTUSED"]`):
Additional special tokens used by the tokenizer.
分词器使用的额外特殊标记列表。
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["input_ids", "attention_mask"]
slow_tokenizer_class = CamembertTokenizer
def __init__(
self,
vocab_file=None,
tokenizer_file=None,
bos_token="<s>",
eos_token="</s>",
sep_token="</s>",
cls_token="<s>",
unk_token="<unk>",
pad_token="<pad>",
mask_token="<mask>",
additional_special_tokens=["<s>NOTUSED", "</s>NOTUSED", "<unk>NOTUSED"],
**kwargs,
):
# Mask token behavior is modified to strip left spaces and is marked as special
mask_token = AddedToken(mask_token, lstrip=True, special=True) if isinstance(mask_token, str) else mask_token
# Call the parent constructor to initialize the base class attributes
super().__init__(
vocab_file,
tokenizer_file=tokenizer_file,
bos_token=bos_token,
eos_token=eos_token,
sep_token=sep_token,
cls_token=cls_token,
unk_token=unk_token,
pad_token=pad_token,
mask_token=mask_token,
additional_special_tokens=additional_special_tokens,
**kwargs,
)
# Store the path to the vocabulary file on the instance
self.vocab_file = vocab_file
@property
def can_save_slow_tokenizer(self) -> bool:
# The slow tokenizer can only be saved when the SentencePiece vocabulary file is available
return os.path.isfile(self.vocab_file) if self.vocab_file else False
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. An CamemBERT sequence has the following format:
- single sequence: `<s> X </s>`
- pair of sequences: `<s> A </s></s> B </s>`
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of input IDs with the appropriate special tokens.
"""
if token_ids_1 is None:
# Single sequence: wrap it with the opening and closing special tokens
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
cls = [self.cls_token_id]
sep = [self.sep_token_id]
# Pair of sequences: add the special tokens around and between both sequences
return cls + token_ids_0 + sep + sep + token_ids_1 + sep
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. CamemBERT, like
RoBERTa, does not make use of token type ids, therefore a list of zeros is returned.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of zeros.
"""
# Separator token ID for separating sequences
sep = [self.sep_token_id]
# CLS token ID for start of sequence classification
cls = [self.cls_token_id]
# If only one sequence is provided
if token_ids_1 is None:
# Return a list of zeros with the length of cls + token_ids_0 + sep
return len(cls + token_ids_0 + sep) * [0]
# If two sequences are provided
# Return a list of zeros with the length of cls + token_ids_0 + 2 * sep + token_ids_1 + sep
return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
# Check if saving slow tokenizer vocabulary is possible
if not self.can_save_slow_tokenizer:
raise ValueError(
"Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
"tokenizer."
)
# Check if save_directory exists and is a directory; log error if not
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
# Define the output vocabulary file path
out_vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
# Copy the current vocabulary file to the specified directory if different
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
copyfile(self.vocab_file, out_vocab_file)
# Return the path to the saved vocabulary file
return (out_vocab_file,)
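A hedged usage sketch of the fast tokenizer with the `almanach/camembert-base` checkpoint referenced above, assuming the model files can be downloaded from the Hub; the exact ids depend on the checkpoint's vocabulary.
```
from transformers import CamembertTokenizerFast

tokenizer = CamembertTokenizerFast.from_pretrained("almanach/camembert-base")
# Single sequence -> `<s> X </s>`; a pair -> `<s> A </s></s> B </s>`
encoding = tokenizer("J'aime le camembert", "C'est délicieux")
print(encoding["input_ids"])
print(tokenizer.convert_ids_to_tokens(encoding["input_ids"]))
```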
.\models\camembert\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_sentencepiece_available,
is_tf_available,
is_tokenizers_available,
is_torch_available,
)
_import_structure = {
"configuration_camembert": ["CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "CamembertConfig", "CamembertOnnxConfig"],
}
try:
if not is_sentencepiece_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_camembert"] = ["CamembertTokenizer"]
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_camembert_fast"] = ["CamembertTokenizerFast"]
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_camembert"] = [
"CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_LIST",
"CamembertForCausalLM",
"CamembertForMaskedLM",
"CamembertForMultipleChoice",
"CamembertForQuestionAnswering",
"CamembertForSequenceClassification",
"CamembertForTokenClassification",
"CamembertModel",
"CamembertPreTrainedModel",
]
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_tf_camembert"] = [
"TF_CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_LIST",
"TFCamembertForCausalLM",
"TFCamembertForMaskedLM",
"TFCamembertForMultipleChoice",
"TFCamembertForQuestionAnswering",
"TFCamembertForSequenceClassification",
"TFCamembertForTokenClassification",
"TFCamembertModel",
"TFCamembertPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_camembert import CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, CamembertConfig, CamembertOnnxConfig
try:
if not is_sentencepiece_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_camembert import CamembertTokenizer
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_camembert_fast import CamembertTokenizerFast
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_camembert import (
CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
CamembertForCausalLM,
CamembertForMaskedLM,
CamembertForMultipleChoice,
CamembertForQuestionAnswering,
CamembertForSequenceClassification,
CamembertForTokenClassification,
CamembertModel,
CamembertPreTrainedModel,
)
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_tf_camembert import (
TF_CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
TFCamembertForCausalLM,
TFCamembertForMaskedLM,
TFCamembertForMultipleChoice,
TFCamembertForQuestionAnswering,
TFCamembertForSequenceClassification,
TFCamembertForTokenClassification,
TFCamembertModel,
TFCamembertPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
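The `_LazyModule` replacement above defers the heavy framework imports until an attribute is first requested. A stripped-down illustration of the idea follows; the demo class is not the real `_LazyModule`, just a sketch of the pattern.
```
import importlib
import types

class _DemoLazyModule(types.ModuleType):
    """A stripped-down illustration of the lazy-import pattern (not the real _LazyModule)."""

    def __init__(self, name, import_structure):
        super().__init__(name)
        self._name_to_module = {
            attr: module for module, attrs in import_structure.items() for attr in attrs
        }

    def __getattr__(self, name):
        # Only called when normal lookup fails, i.e. on first access to a lazy attribute.
        if name not in self._name_to_module:
            raise AttributeError(f"module {self.__name__} has no attribute {name}")
        module = importlib.import_module(self._name_to_module[name])
        return getattr(module, name)

# Toy structure: nothing is imported until an attribute is first accessed.
lazy = _DemoLazyModule("demo", {"json": ["dumps"], "math": ["sqrt"]})
print(lazy.sqrt(2.0))
```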
.\models\canine\configuration_canine.py
""" CANINE 模型配置"""
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
CANINE_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"google/canine-s": "https://huggingface.co/google/canine-s/resolve/main/config.json",
}
class CanineConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`CanineModel`]. It is used to instantiate a CANINE model according to the specified arguments, defining the model architecture.
Instantiating a configuration with the defaults will yield a configuration similar to that of the CANINE [google/canine-s](https://huggingface.co/google/canine-s) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information.
Example:
```
>>> from transformers import CanineConfig, CanineModel
>>> # Initializing a CANINE google/canine-s style configuration
>>> configuration = CanineConfig()
>>> # Initializing a model (with random weights) from the google/canine-s style configuration
>>> model = CanineModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
model_type = "canine"
def __init__(
self,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=16384,
type_vocab_size=16,
initializer_range=0.02,
layer_norm_eps=1e-12,
pad_token_id=0,
bos_token_id=0xE000,
eos_token_id=0xE001,
downsampling_rate=4,
upsampling_kernel_size=4,
num_hash_functions=8,
num_hash_buckets=16384,
local_transformer_stride=128,
**kwargs,
):
super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
self.max_position_embeddings = max_position_embeddings
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.hidden_act = hidden_act
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.initializer_range = initializer_range
self.type_vocab_size = type_vocab_size
self.layer_norm_eps = layer_norm_eps
self.downsampling_rate = downsampling_rate
self.upsampling_kernel_size = upsampling_kernel_size
self.num_hash_functions = num_hash_functions
self.num_hash_buckets = num_hash_buckets
self.local_transformer_stride = local_transformer_stride
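Because CANINE is tokenization-free, its inputs are plain Unicode code points. A short sketch of instantiating the default configuration and producing code-point ids; the `ord`-based "tokenizer" here is only for illustration (the library provides `CanineTokenizer` for real use).
```
from transformers import CanineConfig

config = CanineConfig()  # defaults mirror the google/canine-s architecture
text = "bonjour"
input_ids = [ord(char) for char in text]  # characters are fed as raw code points
print(input_ids)
print(config.downsampling_rate, config.num_hash_functions, config.num_hash_buckets)
```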
.\models\canine\convert_canine_original_tf_checkpoint_to_pytorch.py
"""Convert CANINE checkpoint."""
import argparse
from transformers import CanineConfig, CanineModel, CanineTokenizer, load_tf_weights_in_canine
from transformers.utils import logging
logging.set_verbosity_info()
def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, pytorch_dump_path):
config = CanineConfig()
model = CanineModel(config)
model.eval()
print(f"Building PyTorch model from configuration: {config}")
load_tf_weights_in_canine(model, config, tf_checkpoint_path)
print(f"Save PyTorch model to {pytorch_dump_path}")
model.save_pretrained(pytorch_dump_path)
tokenizer = CanineTokenizer()
print(f"Save tokenizer files to {pytorch_dump_path}")
tokenizer.save_pretrained(pytorch_dump_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--tf_checkpoint_path",
default=None,
type=str,
required=True,
help="Path to the TensorFlow checkpoint. Should end with model.ckpt",
)
parser.add_argument(
"--pytorch_dump_path",
default=None,
type=str,
required=True,
help="Path to a folder where the PyTorch model will be placed.",
)
args = parser.parse_args()
convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.pytorch_dump_path)
.\models\canine\modeling_canine.py
@dataclass
class CanineModelOutputWithPooling(ModelOutput):
"""
Output type of [`CanineModel`]. Based on [`~modeling_outputs.BaseModelOutputWithPooling`], but with slightly
different `hidden_states` and `attentions`, as these also include the hidden states and attentions of the shallow
Transformer encoders.
"""
Args:
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
模型最后一层的隐藏状态序列,是深度Transformer编码器的输出。
pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
序列中第一个标记(分类标记)在深度Transformer编码器最后一层的隐藏状态,经过线性层和Tanh激活函数进一步处理。
线性层的权重在预训练期间从下一个句子预测(分类)目标中训练得到。
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
元组类型,包含`torch.FloatTensor`类型的张量,每个编码器的输入和每个编码器每一层的输出。
第一个张量的形状为 `(batch_size, sequence_length, hidden_size)`,第二个张量的形状为
`(batch_size, sequence_length // config.downsampling_rate, hidden_size)`。
浅层编码器的隐藏状态长度为 `sequence_length`,深层编码器的隐藏状态长度为 `sequence_length // config.downsampling_rate`。
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
元组类型,包含`torch.FloatTensor`类型的张量,每个编码器的注意力权重。
第一个张量的形状为 `(batch_size, num_heads, sequence_length, sequence_length)`,
第二个张量的形状为 `(batch_size, num_heads, sequence_length // config.downsampling_rate, sequence_length // config.downsampling_rate)`。
在注意力softmax之后的注意力权重,用于计算自注意力头中的加权平均值。
"""
# 初始化函数参数的默认值
last_hidden_state: torch.FloatTensor = None
pooler_output: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
def load_tf_weights_in_canine(model, config, tf_checkpoint_path):
"""Load tf checkpoints in a pytorch model."""
try:
import re  # regular expressions for matching variable names
import numpy as np  # numerical arrays
import tensorflow as tf  # needed to read the TensorFlow checkpoint
except ImportError:
logger.error(
"Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
"https://www.tensorflow.org/install/ for installation instructions."
)
raise
tf_path = os.path.abspath(tf_checkpoint_path)  # absolute path to the TF checkpoint
logger.info(f"Converting TensorFlow checkpoint from {tf_path}")
# Load weights from TF model
init_vars = tf.train.list_variables(tf_path)  # (name, shape) of every variable in the checkpoint
names = []
arrays = []
for name, shape in init_vars:
logger.info(f"Loading TF weight {name} with shape {shape}")
array = tf.train.load_variable(tf_path, name)  # load the weight values for this variable
names.append(name)
arrays.append(array)
return model
class CanineEmbeddings(nn.Module):
"""Construct the character, position and token_type embeddings."""
def __init__(self, config):
super().__init__()
self.config = config
# character embeddings
shard_embedding_size = config.hidden_size // config.num_hash_functions
for i in range(config.num_hash_functions):
name = f"HashBucketCodepointEmbedder_{i}"
setattr(self, name, nn.Embedding(config.num_hash_buckets, shard_embedding_size))
# One nn.Embedding table per hash function, each covering a shard of the hidden size
self.char_position_embeddings = nn.Embedding(config.num_hash_buckets, config.hidden_size)
# Character position embeddings of dimension config.hidden_size
self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
# Token type embeddings of dimension config.hidden_size
# self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
# any TensorFlow checkpoint file
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
# Layer normalization over config.hidden_size with epsilon config.layer_norm_eps
self.dropout = nn.Dropout(config.hidden_dropout_prob)
# Dropout with probability config.hidden_dropout_prob
# position_ids (1, len position emb) is contiguous in memory and exported when serialized
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
# Non-persistent buffer holding position ids of length config.max_position_embeddings
self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
# Position embedding type, "absolute" unless the config overrides it
def _hash_bucket_tensors(self, input_ids, num_hashes: int, num_buckets: int):
"""
Converts ids to hash bucket ids via multiple hashing.
Args:
input_ids: The codepoints or other IDs to be hashed.
num_hashes: The number of hash functions to use.
num_buckets: The number of hash buckets (i.e. embeddings in each table).
Returns:
A list of tensors, each of which is the hash bucket IDs from one hash function.
"""
# Raise if `num_hashes` exceeds the number of predefined primes
if num_hashes > len(_PRIMES):
raise ValueError(f"`num_hashes` must be <= {len(_PRIMES)}")
# Use the first `num_hashes` primes as hash function parameters
primes = _PRIMES[:num_hashes]
result_tensors = []
# Compute one hash per prime
for prime in primes:
# Map the input IDs to hash bucket IDs for this hash function
hashed = ((input_ids + 1) * prime) % num_buckets
result_tensors.append(hashed)
return result_tensors
def _embed_hash_buckets(self, input_ids, embedding_size: int, num_hashes: int, num_buckets: int):
"""Converts IDs (e.g. codepoints) into embeddings via multiple hashing."""
# `embedding_size` must be divisible by `num_hashes`
if embedding_size % num_hashes != 0:
raise ValueError(f"Expected `embedding_size` ({embedding_size}) % `num_hashes` ({num_hashes}) == 0")
# Convert the input IDs into one tensor of hash bucket IDs per hash function
hash_bucket_tensors = self._hash_bucket_tensors(input_ids, num_hashes=num_hashes, num_buckets=num_buckets)
embedding_shards = []
# Embed each tensor of hash bucket IDs with its own embedding table
for i, hash_bucket_ids in enumerate(hash_bucket_tensors):
name = f"HashBucketCodepointEmbedder_{i}"
# Look up the shard embeddings for this hash function
shard_embeddings = getattr(self, name)(hash_bucket_ids)
embedding_shards.append(shard_embeddings)
# Concatenate the shards into a single embedding tensor
return torch.cat(embedding_shards, dim=-1)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
) -> torch.FloatTensor:
if input_ids is not None:
input_shape = input_ids.size()
else:
input_shape = inputs_embeds.size()[:-1]
# Sequence length of the inputs
seq_length = input_shape[1]
# Fall back to the registered position ids when none are provided
if position_ids is None:
position_ids = self.position_ids[:, :seq_length]
# Default the token type ids to zeros when none are provided
if token_type_ids is None:
token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
# Build the input embeddings via multi-hash bucket embedding when they are not given
if inputs_embeds is None:
inputs_embeds = self._embed_hash_buckets(
input_ids, self.config.hidden_size, self.config.num_hash_functions, self.config.num_hash_buckets
)
# Token type embeddings
token_type_embeddings = self.token_type_embeddings(token_type_ids)
# Sum the input embeddings and the token type embeddings
embeddings = inputs_embeds + token_type_embeddings
# Add absolute character position embeddings when configured
if self.position_embedding_type == "absolute":
position_embeddings = self.char_position_embeddings(position_ids)
embeddings += position_embeddings
# Layer normalization
embeddings = self.LayerNorm(embeddings)
# Dropout
embeddings = self.dropout(embeddings)
return embeddings
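A toy numeric illustration of the multi-hash bucketing performed by `_hash_bucket_tensors` above; the prime list here is a stand-in for the module's actual `_PRIMES`, and the input is a handful of raw code points.
```
import torch

# Each code point is mapped to one bucket per hash function; each hash function then
# owns an embedding table of size hidden_size // num_hash_functions.
_PRIMES = [31, 43, 59, 61]          # stand-in primes, not the real module-level list
num_buckets = 16384
input_ids = torch.tensor([[ord(c) for c in "été"]])

hash_bucket_ids = [((input_ids + 1) * prime) % num_buckets for prime in _PRIMES]
for prime, buckets in zip(_PRIMES, hash_bucket_ids):
    print(prime, buckets.tolist())
```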
class CharactersToMolecules(nn.Module):
"""Convert character sequence to initial molecule sequence (i.e. downsample) using strided convolutions."""
def __init__(self, config):
super().__init__()
# Define 1D convolutional layer for downsampling
self.conv = nn.Conv1d(
in_channels=config.hidden_size,
out_channels=config.hidden_size,
kernel_size=config.downsampling_rate,
stride=config.downsampling_rate,
)
# Activation function based on the configuration
self.activation = ACT2FN[config.hidden_act]
# Layer normalization to normalize outputs across the hidden_size dimension
# `self.LayerNorm` is kept as is to maintain compatibility with TensorFlow checkpoints
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
def forward(self, char_encoding: torch.Tensor) -> torch.Tensor:
# Extract the [CLS] token encoding: [batch, 1, hidden_size]
cls_encoding = char_encoding[:, 0:1, :]
# Transpose `char_encoding` to [batch, hidden_size, char_seq]
char_encoding = torch.transpose(char_encoding, 1, 2)
# Apply convolution for downsampling, then transpose back
downsampled = self.conv(char_encoding)
downsampled = torch.transpose(downsampled, 1, 2)
# Apply activation function to the downsampled sequence
downsampled = self.activation(downsampled)
# Remove the last molecule to reserve space for [CLS], maintaining alignment on TPUs
downsampled_truncated = downsampled[:, 0:-1, :]
# Concatenate [CLS] encoding with downsampled sequence
result = torch.cat([cls_encoding, downsampled_truncated], dim=1)
# Apply LayerNorm to the concatenated sequence
result = self.LayerNorm(result)
return result
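A shape-only sketch of the strided downsampling convolution above, using toy dimensions: with kernel size equal to the stride `r`, a character sequence of length L becomes roughly L // r "molecules", and the first slot is then re-occupied by the [CLS] encoding.
```
import torch
from torch import nn

batch, char_seq, hidden, rate = 2, 64, 8, 4
conv = nn.Conv1d(in_channels=hidden, out_channels=hidden, kernel_size=rate, stride=rate)

chars = torch.randn(batch, char_seq, hidden)
# Conv1d expects (batch, channels, length), so transpose in and out.
molecules = conv(chars.transpose(1, 2)).transpose(1, 2)
print(molecules.shape)   # torch.Size([2, 16, 8]) -> char_seq // rate molecules
```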
class ConvProjection(nn.Module):
"""
Project representations from hidden_size*2 back to hidden_size across a window of w = config.upsampling_kernel_size
characters.
"""
def __init__(self, config):
super().__init__()
self.config = config
# Define 1D convolutional layer for upsampling
self.conv = nn.Conv1d(
in_channels=config.hidden_size * 2,
out_channels=config.hidden_size,
kernel_size=config.upsampling_kernel_size,
stride=1,
)
# Activation function based on the configuration
self.activation = ACT2FN[config.hidden_act]
# Layer normalization to normalize outputs across the hidden_size dimension
# `self.LayerNorm` is kept as is to maintain compatibility with TensorFlow checkpoints
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
# Dropout layer for regularization
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(
self,
inputs: torch.Tensor,
final_seq_char_positions: Optional[torch.Tensor] = None,
# inputs has shape [batch, mol_seq, molecule_hidden_size+char_hidden_final]
# we transpose it to be [batch, molecule_hidden_size+char_hidden_final, mol_seq]
inputs = torch.transpose(inputs, 1, 2)
# PyTorch < 1.9 does not support padding="same" (which is used in the original implementation),
# so we pad the tensor manually before passing it to the conv layer
# based on https://github.com/google-research/big_transfer/blob/49afe42338b62af9fbe18f0258197a33ee578a6b/bit_tf2/models.py#L36-L38
# Calculate total padding needed to achieve 'same' padding
pad_total = self.config.upsampling_kernel_size - 1
pad_beg = pad_total // 2 # Calculate padding to be added at the beginning
pad_end = pad_total - pad_beg # Calculate padding to be added at the end
# Create a 1-dimensional constant padding layer for convolution
pad = nn.ConstantPad1d((pad_beg, pad_end), 0)
# Apply padding to inputs tensor before passing it through convolutional layer
padded_inputs = pad(inputs)
# Perform convolution operation on the padded inputs
# `result`: shape (batch_size, char_seq_len, hidden_size)
result = self.conv(padded_inputs)
# Transpose result tensor to revert to original shape [batch, mol_seq, hidden_size]
result = torch.transpose(result, 1, 2)
# Apply activation function (e.g., ReLU) to the convolved result
result = self.activation(result)
# Apply layer normalization to stabilize training
result = self.LayerNorm(result)
# Apply dropout for regularization
result = self.dropout(result)
# Store the processed character sequence as the final output
final_char_seq = result
if final_seq_char_positions is not None:
# Limit transformer query seq and attention mask to these character
# positions to greatly reduce the compute cost. Typically, this is just
# done for the MLM training task.
# TODO add support for MLM
raise NotImplementedError("CanineForMaskedLM is currently not supported")
else:
# If no specific character positions are provided, use the entire processed sequence
query_seq = final_char_seq
# Return the final processed query sequence
return query_seq
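The manual 'same' padding above can be checked with a small stand-alone computation (toy channel counts and lengths): for kernel size k and stride 1, padding a total of k - 1 positions split between the two ends keeps the sequence length unchanged.
```
import torch
from torch import nn

kernel_size = 4
pad_total = kernel_size - 1
pad_beg, pad_end = pad_total // 2, pad_total - pad_total // 2

x = torch.randn(1, 6, 10)                          # (batch, channels, seq_len)
padded = nn.ConstantPad1d((pad_beg, pad_end), 0)(x)
out = nn.Conv1d(6, 3, kernel_size=kernel_size, stride=1)(padded)
print(padded.shape, out.shape)                     # seq_len stays 10 after the convolution
```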
# CanineSelfOutput processes the output of the self-attention mechanism
class CanineSelfOutput(nn.Module):
def __init__(self, config):
super().__init__()
# Dense layer keeping the hidden size unchanged
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
# Layer normalization to stabilize the hidden states
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
# Dropout to reduce overfitting
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(
self, hidden_states: Tuple[torch.FloatTensor], input_tensor: torch.FloatTensor
) -> Tuple[torch.FloatTensor, torch.FloatTensor]:
# Dense projection of the attention output
hidden_states = self.dense(hidden_states)
# Dropout
hidden_states = self.dropout(hidden_states)
# Residual connection followed by layer normalization
hidden_states = self.LayerNorm(hidden_states + input_tensor)
# Return the normalized hidden states
return hidden_states
def __init__(
self,
config,
local=False,
always_attend_to_first_position: bool = False,
first_position_attends_to_all: bool = False,
attend_from_chunk_width: int = 128,
attend_from_chunk_stride: int = 128,
attend_to_chunk_width: int = 128,
attend_to_chunk_stride: int = 128,
):
super().__init__()
# Self-attention and its output projection
self.self = CanineSelfAttention(config)
self.output = CanineSelfOutput(config)
# Set of pruned attention heads, initially empty
self.pruned_heads = set()
# Whether local (chunked) attention is enabled
self.local = local
# Validate chunk widths against strides so that no sequence position is skipped
if attend_from_chunk_width < attend_from_chunk_stride:
raise ValueError(
"`attend_from_chunk_width` < `attend_from_chunk_stride` would cause sequence positions to get skipped."
)
if attend_to_chunk_width < attend_to_chunk_stride:
raise ValueError(
"`attend_to_chunk_width` < `attend_to_chunk_stride` would cause sequence positions to get skipped."
)
# Additional local-attention parameters
self.always_attend_to_first_position = always_attend_to_first_position
self.first_position_attends_to_all = first_position_attends_to_all
self.attend_from_chunk_width = attend_from_chunk_width
self.attend_from_chunk_stride = attend_from_chunk_stride
self.attend_to_chunk_width = attend_to_chunk_width
self.attend_to_chunk_stride = attend_to_chunk_stride
# Prune attention heads from the self-attention module
def prune_heads(self, heads):
# Nothing to do when the list of heads is empty
if len(heads) == 0:
return
# Find the prunable heads and their indices
heads, index = find_pruneable_heads_and_indices(
heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
)
# Prune the linear layers
self.self.query = prune_linear_layer(self.self.query, index)
self.self.key = prune_linear_layer(self.self.key, index)
self.self.value = prune_linear_layer(self.self.value, index)
self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
# Update the hyperparameters and record the pruned heads
self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
self.pruned_heads = self.pruned_heads.union(heads)
# Forward pass
def forward(
self,
hidden_states: Tuple[torch.FloatTensor],
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = False,
# CanineIntermediate: the first half of the feed-forward block
class CanineIntermediate(nn.Module):
# Constructor taking the model config
def __init__(self, config):
super().__init__()
# Dense layer from config.hidden_size to config.intermediate_size
self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
# Select the activation function from the config, by name or as a callable
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
# Forward pass: project the hidden states and apply the activation
def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor:
# Dense projection
hidden_states = self.dense(hidden_states)
# Activation
hidden_states = self.intermediate_act_fn(hidden_states)
# Return the transformed hidden states
return hidden_states
# CanineOutput: the second half of the feed-forward block with residual connection
class CanineOutput(nn.Module):
# Constructor taking the model config
def __init__(self, config):
super().__init__()
# Dense layer from config.intermediate_size back to config.hidden_size
self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
# Layer normalization over config.hidden_size
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
# Dropout with probability config.hidden_dropout_prob
self.dropout = nn.Dropout(config.hidden_dropout_prob)
# Forward pass: project, apply dropout, add the residual and normalize
def forward(self, hidden_states: Tuple[torch.FloatTensor], input_tensor: torch.FloatTensor) -> torch.FloatTensor:
# Dense projection
hidden_states = self.dense(hidden_states)
# Dropout
hidden_states = self.dropout(hidden_states)
# Residual connection followed by layer normalization
hidden_states = self.LayerNorm(hidden_states + input_tensor)
# Return the processed hidden states
return hidden_states
# CanineLayer: one transformer block (attention + feed-forward)
class CanineLayer(nn.Module):
# Constructor taking the config and the local-attention parameters
def __init__(
self,
config,
local,
always_attend_to_first_position,
first_position_attends_to_all,
attend_from_chunk_width,
attend_from_chunk_stride,
attend_to_chunk_width,
attend_to_chunk_stride,
):
super().__init__()
# Chunk size used for the feed-forward computation
self.chunk_size_feed_forward = config.chunk_size_feed_forward
# The sequence length dimension is 1
self.seq_len_dim = 1
# Attention sub-layer with the given parameters
self.attention = CanineAttention(
config,
local,
always_attend_to_first_position,
first_position_attends_to_all,
attend_from_chunk_width,
attend_from_chunk_stride,
attend_to_chunk_width,
attend_to_chunk_stride,
)
# Intermediate (feed-forward) sub-layer
self.intermediate = CanineIntermediate(config)
# Output sub-layer
self.output = CanineOutput(config)
# Forward pass: takes hidden_states, attention_mask, head_mask and output_attentions,
# and returns the layer output plus (optionally) the attention weights
def forward(
self,
hidden_states: Tuple[torch.FloatTensor],
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.FloatTensor, Optional[torch.FloatTensor]]:
# Run self-attention over the hidden states
self_attention_outputs = self.attention(
hidden_states,
attention_mask,
head_mask,
output_attentions=output_attentions,
)
# Output of the attention sub-layer
attention_output = self_attention_outputs[0]
# Keep the attention weights when they are requested
outputs = self_attention_outputs[1:] # add self attentions if we output attention weights
# Apply the feed-forward block in chunks along the sequence dimension
layer_output = apply_chunking_to_forward(
self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
)
# Prepend the layer output to the other outputs
outputs = (layer_output,) + outputs
# Return the outputs
return outputs
# Feed-forward chunk applied to a slice of the attention output
def feed_forward_chunk(self, attention_output):
# Intermediate projection and activation
intermediate_output = self.intermediate(attention_output)
# Output projection with residual connection to the attention output
layer_output = self.output(intermediate_output, attention_output)
# Return the layer output for this chunk
return layer_output
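A minimal re-implementation sketch of the chunking idea behind `apply_chunking_to_forward` as it is used in `feed_forward_chunk` above, simplified to a single input tensor (the real helper is more general): the sequence dimension is split into chunks, the feed-forward is run per chunk, and the results are concatenated to trade scheduling for lower peak memory.
```
import torch

def chunked_feed_forward(forward_fn, chunk_size, seq_len_dim, tensor):
    # chunk_size == 0 disables chunking, mirroring the library's convention.
    if chunk_size == 0:
        return forward_fn(tensor)
    chunks = tensor.split(chunk_size, dim=seq_len_dim)
    return torch.cat([forward_fn(chunk) for chunk in chunks], dim=seq_len_dim)

ff = torch.nn.Linear(8, 8)
hidden = torch.randn(2, 12, 8)
out = chunked_feed_forward(ff, chunk_size=4, seq_len_dim=1, tensor=hidden)
print(out.shape)  # torch.Size([2, 12, 8])
```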
class CanineEncoder(nn.Module):
# CanineEncoder stacks the CANINE transformer layers
def __init__(
self,
config,
local=False,
always_attend_to_first_position=False,
first_position_attends_to_all=False,
attend_from_chunk_width=128,
attend_from_chunk_stride=128,
attend_to_chunk_width=128,
attend_to_chunk_stride=128,
):
super().__init__()
self.config = config
# Stack of CanineLayer modules, one per hidden layer in the config
self.layer = nn.ModuleList(
[
CanineLayer(
config,
local,
always_attend_to_first_position,
first_position_attends_to_all,
attend_from_chunk_width,
attend_from_chunk_stride,
attend_to_chunk_width,
attend_to_chunk_stride,
)
for _ in range(config.num_hidden_layers)
]
)
self.gradient_checkpointing = False  # gradient checkpointing flag, disabled by default
def forward(
self,
hidden_states: Tuple[torch.FloatTensor],
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = False,
output_hidden_states: Optional[bool] = False,
return_dict: Optional[bool] = True,
) -> Union[Tuple, BaseModelOutput]:
# Containers for all hidden states and self-attention scores (when requested)
all_hidden_states = () if output_hidden_states else None
all_self_attentions = () if output_attentions else None
# Iterate over the layers and run the forward pass
for i, layer_module in enumerate(self.layer):
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
# Head mask for the current layer
layer_head_mask = head_mask[i] if head_mask is not None else None
if self.gradient_checkpointing and self.training:
# With gradient checkpointing enabled during training, recompute the layer inside a checkpoint
layer_outputs = self._gradient_checkpointing_func(
layer_module.__call__,
hidden_states,
attention_mask,
layer_head_mask,
output_attentions,
)
else:
# Otherwise call the layer directly
layer_outputs = layer_module(hidden_states, attention_mask, layer_head_mask, output_attentions)
# The first element of the layer outputs is the new hidden states
hidden_states = layer_outputs[0]
if output_attentions:
# Collect the self-attention scores of this layer
all_self_attentions = all_self_attentions + (layer_outputs[1],)
if output_hidden_states:
# Append the final hidden states
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
# Return a tuple of the non-empty results when a dict is not requested
return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
# Otherwise return a BaseModelOutput with the last hidden state, all hidden states and attentions
return BaseModelOutput(
last_hidden_state=hidden_states,
hidden_states=all_hidden_states,
attentions=all_self_attentions,
)
class CaninePooler(nn.Module):
# CaninePooler pools the sequence into a single vector
def __init__(self, config):
super().__init__()
# Dense layer mapping hidden_size to hidden_size
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
# Tanh activation
self.activation = nn.Tanh()
# Forward pass: takes `hidden_states` and returns a pooled `torch.FloatTensor`
def forward(self, hidden_states: Tuple[torch.FloatTensor]) -> torch.FloatTensor:
# "Pool" the model output by taking the hidden state of the first token
first_token_tensor = hidden_states[:, 0]
# Linear projection of the first token's hidden state
pooled_output = self.dense(first_token_tensor)
# Tanh activation
pooled_output = self.activation(pooled_output)
# Return the pooled output
return pooled_output
class CaninePredictionHeadTransform(nn.Module):
def __init__(self, config):
super().__init__()
# Dense layer keeping the hidden size unchanged
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
# Select the activation function from the config, by name or as a callable
if isinstance(config.hidden_act, str):
self.transform_act_fn = ACT2FN[config.hidden_act]
else:
self.transform_act_fn = config.hidden_act
# Layer normalization over config.hidden_size with epsilon config.layer_norm_eps
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
def forward(self, hidden_states: Tuple[torch.FloatTensor]) -> torch.FloatTensor:
# Dense projection
hidden_states = self.dense(hidden_states)
# Activation
hidden_states = self.transform_act_fn(hidden_states)
# Layer normalization
hidden_states = self.LayerNorm(hidden_states)
# Return the transformed hidden states
return hidden_states
class CanineLMPredictionHead(nn.Module):
def __init__(self, config):
super().__init__()
# Transform layer applied before decoding
self.transform = CaninePredictionHeadTransform(config)
# Decoder projecting hidden_size to vocab_size, without its own bias
self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
# Separate bias parameter of size vocab_size used as the decoder bias
self.bias = nn.Parameter(torch.zeros(config.vocab_size))
# Tie the decoder bias to the parameter above
self.decoder.bias = self.bias
def forward(self, hidden_states: Tuple[torch.FloatTensor]) -> torch.FloatTensor:
# Transform the hidden states
hidden_states = self.transform(hidden_states)
# Decode into vocabulary logits
hidden_states = self.decoder(hidden_states)
# Return the prediction scores
return hidden_states
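The `self.decoder.bias = self.bias` assignment ties the separately registered bias parameter to the bias-free `nn.Linear`, so the decoder applies it during its forward pass (and it can be resized together with the output embeddings). A small self-contained check of that mechanism, using arbitrary toy sizes:

```python
import torch
from torch import nn

vocab_size, hidden_size = 11, 8
decoder = nn.Linear(hidden_size, vocab_size, bias=False)
bias = nn.Parameter(torch.zeros(vocab_size))
decoder.bias = bias  # the Linear now adds this bias in its forward pass

x = torch.randn(2, hidden_size)
out = decoder(x)
print(out.shape)             # torch.Size([2, 11])
print(decoder.bias is bias)  # True: the two attributes share one Parameter
```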
class CanineOnlyMLMHead(nn.Module):
def __init__(self, config):
super().__init__()
# MLM head built on top of CanineLMPredictionHead
self.predictions = CanineLMPredictionHead(config)
def forward(
self,
sequence_output: Tuple[torch.Tensor],
) -> Tuple[torch.Tensor]:
# Run the sequence output through the prediction head
prediction_scores = self.predictions(sequence_output)
# Return the prediction scores
return prediction_scores
class CaninePreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
# Configuration class associated with this model
config_class = CanineConfig
# Function used to load TensorFlow weights
load_tf_weights = load_tf_weights_in_canine
# Name prefix of the base model
base_model_prefix = "canine"
# Gradient checkpointing is supported
supports_gradient_checkpointing = True
def _init_weights(self, module):
"""Initialize the weights"""
# If module is an nn.Linear or nn.Conv1d
if isinstance(module, (nn.Linear, nn.Conv1d)):
# Initialize the weights from a normal distribution with mean 0.0 and std self.config.initializer_range
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
# If the module has a bias, zero it
if module.bias is not None:
module.bias.data.zero_()
# If module is an nn.Embedding
elif isinstance(module, nn.Embedding):
# Initialize the weights from a normal distribution with mean 0.0 and std self.config.initializer_range
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
# If padding_idx is set, zero the corresponding embedding row
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
# If module is an nn.LayerNorm
elif isinstance(module, nn.LayerNorm):
# Zero the LayerNorm bias
module.bias.data.zero_()
# Fill the LayerNorm weight with 1.0
module.weight.data.fill_(1.0)
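`_init_weights` is applied module-by-module (via `nn.Module.apply`) when the model weights are initialized. A standalone sketch of the same pattern, using an arbitrary std of 0.02 as a stand-in for `config.initializer_range`:

```python
import torch
from torch import nn

def init_weights(module, initializer_range=0.02):
    if isinstance(module, (nn.Linear, nn.Conv1d)):
        module.weight.data.normal_(mean=0.0, std=initializer_range)
        if module.bias is not None:
            module.bias.data.zero_()
    elif isinstance(module, nn.Embedding):
        module.weight.data.normal_(mean=0.0, std=initializer_range)
        if module.padding_idx is not None:
            module.weight.data[module.padding_idx].zero_()
    elif isinstance(module, nn.LayerNorm):
        module.bias.data.zero_()
        module.weight.data.fill_(1.0)

model = nn.Sequential(nn.Embedding(100, 16, padding_idx=0), nn.Linear(16, 16), nn.LayerNorm(16))
model.apply(init_weights)  # recursively visits every sub-module
print(model[0].weight[0].abs().sum().item())  # 0.0 -> the padding row is zeroed
```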
CANINE_START_DOCSTRING = r"""
This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html) subclass. Use
it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage and
behavior.
Parameters:
config ([`CanineConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
CANINE_INPUTS_DOCSTRING = r"""
This string is intended to provide documentation about the expected inputs for the CANINE model. However, this section
currently lacks specific content and requires further completion to describe the inputs comprehensively.
"""
Args:
input_ids (`torch.LongTensor` of shape `({0})`):
# 输入序列中的标记索引,在词汇表中的位置
# 可以使用 AutoTokenizer 获取这些索引。参见 PreTrainedTokenizer.encode 和 PreTrainedTokenizer.__call__ 进行详细说明。
# 什么是输入 ID?请参见 ../glossary#input-ids
attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
# 遮罩,用于在填充标记索引上避免执行注意力操作
# 遮罩的值选择在 [0, 1] 范围内:
# - 1 表示 **未被遮罩** 的标记
# - 0 表示 **被遮罩** 的标记
# 什么是注意力遮罩?请参见 ../glossary#attention-mask
token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
# 段标记索引,用于指示输入的第一部分和第二部分
# 索引在 [0, 1] 范围内选择:
# - 0 对应 *句子 A* 的标记
# - 1 对应 *句子 B* 的标记
# 什么是标记类型 ID?请参见 ../glossary#token-type-ids
position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
# 每个输入序列标记在位置嵌入中的位置索引
# 选择范围在 [0, config.max_position_embeddings - 1] 内
# 什么是位置 ID?请参见 ../glossary#position-ids
head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
# 用于置空自注意力模块的选定头部的遮罩
# 遮罩的值选择在 [0, 1] 范围内:
# - 1 表示 **未被遮罩** 的头部
# - 0 表示 **被遮罩** 的头部
inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
# 可选项,可以直接传递嵌入表示而不是传递 input_ids
# 如果您想要更多控制如何将 input_ids 索引转换为相关联的向量,而不是使用模型内部的嵌入查找矩阵,则这很有用。
output_attentions (`bool`, *optional*):
# 是否返回所有注意力层的注意力张量。更多细节请参见返回张量中的 `attentions`。
output_hidden_states (`bool`, *optional*):
# 是否返回所有层的隐藏状态。更多细节请参见返回张量中的 `hidden_states`。
return_dict (`bool`, *optional*):
# 是否返回 `~utils.ModelOutput` 而不是普通元组。
"""
@add_start_docstrings(
"The bare CANINE Model transformer outputting raw hidden-states without any specific head on top.",
CANINE_START_DOCSTRING,
)
class CanineModel(CaninePreTrainedModel):
def __init__(self, config, add_pooling_layer=True):
super().__init__(config)
self.config = config
shallow_config = copy.deepcopy(config)
shallow_config.num_hidden_layers = 1
self.char_embeddings = CanineEmbeddings(config)
self.initial_char_encoder = CanineEncoder(
shallow_config,
local=True,
always_attend_to_first_position=False,
first_position_attends_to_all=False,
attend_from_chunk_width=config.local_transformer_stride,
attend_from_chunk_stride=config.local_transformer_stride,
attend_to_chunk_width=config.local_transformer_stride,
attend_to_chunk_stride=config.local_transformer_stride,
)
self.chars_to_molecules = CharactersToMolecules(config)
self.encoder = CanineEncoder(config)
self.projection = ConvProjection(config)
self.final_char_encoder = CanineEncoder(shallow_config)
self.pooler = CaninePooler(config) if add_pooling_layer else None
self.post_init()
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
def _create_3d_attention_mask_from_input_mask(self, from_tensor, to_mask):
"""
Create 3D attention mask from a 2D tensor mask.
Args:
from_tensor: 2D or 3D Tensor of shape [batch_size, from_seq_length, ...].
to_mask: int32 Tensor of shape [batch_size, to_seq_length].
Returns:
float Tensor of shape [batch_size, from_seq_length, to_seq_length].
"""
batch_size, from_seq_length = from_tensor.shape[0], from_tensor.shape[1]
to_seq_length = to_mask.shape[1]
to_mask = torch.reshape(to_mask, (batch_size, 1, to_seq_length)).float()
broadcast_ones = torch.ones(size=(batch_size, from_seq_length, 1), dtype=torch.float32, device=to_mask.device)
mask = broadcast_ones * to_mask
return mask
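The broadcast in `_create_3d_attention_mask_from_input_mask` simply copies the 2D `to_mask` along a new `from_seq_length` axis, producing one row of key-mask values per query position. A toy reproduction with illustrative shapes:

```python
import torch

batch_size, from_seq_length, to_seq_length = 2, 3, 4
to_mask = torch.tensor([[1, 1, 0, 0],
                        [1, 1, 1, 0]])                      # (batch, to_seq_length)

to_mask_3d = to_mask.reshape(batch_size, 1, to_seq_length).float()
broadcast_ones = torch.ones(batch_size, from_seq_length, 1)
mask = broadcast_ones * to_mask_3d                          # (batch, from_seq_length, to_seq_length)

print(mask.shape)   # torch.Size([2, 3, 4])
print(mask[0])      # every query row of example 0 sees the same [1, 1, 0, 0] key mask
```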
def _downsample_attention_mask(self, char_attention_mask: torch.Tensor, downsampling_rate: int):
"""Downsample 2D character attention mask to 2D molecule attention mask using MaxPool1d layer."""
batch_size, char_seq_len = char_attention_mask.shape
poolable_char_mask = torch.reshape(char_attention_mask, (batch_size, 1, char_seq_len))
pooled_molecule_mask = torch.nn.MaxPool1d(kernel_size=downsampling_rate, stride=downsampling_rate)(
poolable_char_mask.float()
)
molecule_attention_mask = torch.squeeze(pooled_molecule_mask, dim=-1)
return molecule_attention_mask
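Because the mask only contains 0s and 1s, max-pooling over windows of `downsampling_rate` characters marks a molecule position as valid whenever at least one character in its window is valid. A toy example with a rate of 4 (the default `downsampling_rate`):

```python
import torch

downsampling_rate = 4
char_attention_mask = torch.tensor([[1, 1, 1, 1, 1, 1, 0, 0],
                                    [1, 1, 0, 0, 0, 0, 0, 0]])   # (batch, char_seq_len)

poolable = char_attention_mask.reshape(2, 1, 8).float()
pooled = torch.nn.MaxPool1d(kernel_size=downsampling_rate, stride=downsampling_rate)(poolable)
print(pooled.squeeze(1))
# tensor([[1., 1.],
#         [1., 0.]])  -> one mask value per molecule (group of 4 characters)
```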
def _repeat_molecules(self, molecules: torch.Tensor, char_seq_length: torch.Tensor) -> torch.Tensor:
"""Repeats molecules to make them the same length as the char sequence."""
rate = self.config.downsampling_rate
molecules_without_extra_cls = molecules[:, 1:, :]
repeated = torch.repeat_interleave(molecules_without_extra_cls, repeats=rate, dim=-2)
last_molecule = molecules[:, -1:, :]
remainder_length = torch.fmod(torch.tensor(char_seq_length), torch.tensor(rate)).item()
remainder_repeated = torch.repeat_interleave(
last_molecule,
repeats=remainder_length + rate,
dim=-2,
)
return torch.cat([repeated, remainder_repeated], dim=-2)
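Conceptually, `_repeat_molecules` upsamples the molecule sequence back to character resolution: every molecule except the extra CLS molecule is repeated `downsampling_rate` times, and the last molecule gets `remainder + rate` extra repeats so the total length matches the character length. A shape-only sketch with arbitrary toy sizes:

```python
import torch

rate = 4                                   # downsampling_rate
char_seq_length = 12
molecules = torch.randn(1, 3, 8)           # (batch, mol_seq_len, hidden), position 0 is the CLS molecule

molecules_without_extra_cls = molecules[:, 1:, :]                                      # (1, 2, 8)
repeated = torch.repeat_interleave(molecules_without_extra_cls, repeats=rate, dim=-2)  # (1, 8, 8)

last_molecule = molecules[:, -1:, :]
remainder_length = char_seq_length % rate                                              # 12 % 4 = 0
remainder_repeated = torch.repeat_interleave(last_molecule, repeats=remainder_length + rate, dim=-2)

out = torch.cat([repeated, remainder_repeated], dim=-2)
print(out.shape)   # torch.Size([1, 12, 8]) -> back to character resolution
```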
@add_start_docstrings_to_model_forward(CANINE_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=CanineModelOutputWithPooling,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, CanineModelOutputWithPooling]:
@add_start_docstrings(
"""
CANINE Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
output) e.g. for GLUE tasks.
""",
CANINE_START_DOCSTRING,
)
class CanineForSequenceClassification(CaninePreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.canine = CanineModel(config)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.post_init()
@add_start_docstrings_to_model_forward(CANINE_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=SequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
"""
Forward pass: runs the model for sequence classification.
Args:
input_ids (Optional[torch.LongTensor]): Input token IDs.
attention_mask (Optional[torch.FloatTensor]): Attention mask indicating which positions are padding.
token_type_ids (Optional[torch.LongTensor]): Token type IDs, e.g. segment IDs as used by BERT.
position_ids (Optional[torch.LongTensor]): Position IDs giving the absolute position of each token.
head_mask (Optional[torch.FloatTensor]): Mask for the attention heads.
inputs_embeds (Optional[torch.FloatTensor]): Pre-computed input embeddings passed directly.
labels (Optional[torch.LongTensor]): Labels for the model.
output_attentions (Optional[bool]): Whether to return attention weights.
output_hidden_states (Optional[bool]): Whether to return all hidden states.
return_dict (Optional[bool]): Whether to return a dict-style output.
Returns:
SequenceClassifierOutput: The sequence classifier output, including predictions and additional metadata.
"""
pass
) -> Union[Tuple, SequenceClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.canine(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
pooled_output = outputs[1]
pooled_output = self.dropout(pooled_output)
logits = self.classifier(pooled_output)
loss = None
if labels is not None:
if self.config.problem_type is None:
if self.num_labels == 1:
self.config.problem_type = "regression"
elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
self.config.problem_type = "single_label_classification"
else:
self.config.problem_type = "multi_label_classification"
if self.config.problem_type == "regression":
loss_fct = MSELoss()
if self.num_labels == 1:
loss = loss_fct(logits.squeeze(), labels.squeeze())
else:
loss = loss_fct(logits, labels)
elif self.config.problem_type == "single_label_classification":
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(logits, labels)
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return SequenceClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
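The `problem_type` branch above selects between three losses depending on `num_labels` and the label dtype. A small standalone demonstration of the three cases with toy logits and labels (unrelated to CANINE weights):

```python
import torch
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

num_labels = 3
logits = torch.randn(4, num_labels)                 # (batch, num_labels)

# single_label_classification: integer class indices -> cross-entropy
labels_cls = torch.tensor([0, 2, 1, 2])
loss_cls = CrossEntropyLoss()(logits.view(-1, num_labels), labels_cls.view(-1))

# regression (num_labels == 1): float targets -> mean-squared error
logits_reg = torch.randn(4, 1)
labels_reg = torch.randn(4)
loss_reg = MSELoss()(logits_reg.squeeze(), labels_reg.squeeze())

# multi_label_classification: independent 0/1 targets per label -> BCE with logits
labels_multi = torch.tensor([[1., 0., 1.], [0., 0., 1.], [1., 1., 0.], [0., 1., 0.]])
loss_multi = BCEWithLogitsLoss()(logits, labels_multi)

print(loss_cls.item(), loss_reg.item(), loss_multi.item())
```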
@add_start_docstrings(
"""
CANINE Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
softmax) e.g. for RocStories/SWAG tasks.
""",
CANINE_START_DOCSTRING,
)
class CanineForMultipleChoice(CaninePreTrainedModel):
"""
CANINE模型,顶部带有多选分类头部(在汇总输出之上的线性层和softmax),例如用于RocStories/SWAG任务。
继承自CaninePreTrainedModel类。
"""
def __init__(self, config):
"""
初始化方法,设置模型结构。
Args:
config (CanineConfig): 模型配置对象,包含模型的各种参数设置。
"""
super().__init__(config)
self.canine = CanineModel(config)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, 1)
self.post_init()
@add_start_docstrings_to_model_forward(CANINE_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=MultipleChoiceModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
"""
前向传播方法,定义模型的数据流。
Args:
input_ids (Optional[torch.LongTensor]): 输入的token IDs张量。
attention_mask (Optional[torch.FloatTensor]): 注意力掩码张量,用于指定哪些位置是填充的。
token_type_ids (Optional[torch.LongTensor]): 分段类型IDs张量,用于区分不同句子的位置。
position_ids (Optional[torch.LongTensor]): 位置IDs张量,用于指定输入token的绝对位置。
head_mask (Optional[torch.FloatTensor]): 多头注意力机制的掩码张量,用于指定哪些头部是无效的。
inputs_embeds (Optional[torch.FloatTensor]): 嵌入向量的输入张量。
labels (Optional[torch.LongTensor]): 标签张量,用于多选分类任务的真实标签。
output_attentions (Optional[bool]): 是否输出注意力权重。
output_hidden_states (Optional[bool]): 是否输出隐藏状态。
return_dict (Optional[bool]): 是否返回字典格式的输出。
Returns:
MultipleChoiceModelOutput: 包含模型输出的对象,包括分类预测和其他可选的输出(如注意力权重、隐藏状态)。
"""
pass
) -> Union[Tuple, MultipleChoiceModelOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
`input_ids` above)
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
inputs_embeds = (
inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
if inputs_embeds is not None
else None
)
outputs = self.canine(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
pooled_output = outputs[1]
pooled_output = self.dropout(pooled_output)
logits = self.classifier(pooled_output)
reshaped_logits = logits.view(-1, num_choices)
loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
loss = loss_fct(reshaped_logits, labels)
if not return_dict:
output = (reshaped_logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return MultipleChoiceModelOutput(
loss=loss,
logits=reshaped_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
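The key trick for multiple choice is the flatten/unflatten: inputs arrive as `(batch, num_choices, seq_len)`, are flattened to `(batch * num_choices, seq_len)` so the encoder sees ordinary sequences, and the per-choice scores (one logit each) are reshaped back to `(batch, num_choices)` before a cross-entropy over choices. A shape-only sketch in which the random `pooled_output` stands in for the encoder:

```python
import torch
from torch.nn import CrossEntropyLoss

batch_size, num_choices, seq_len, hidden_size = 2, 4, 6, 8
input_ids = torch.randint(0, 100, (batch_size, num_choices, seq_len))

flat_input_ids = input_ids.view(-1, input_ids.size(-1))        # (8, 6): one row per (example, choice)

# stand-in for the pooled encoder output and the single-logit classifier
pooled_output = torch.randn(batch_size * num_choices, hidden_size)
classifier = torch.nn.Linear(hidden_size, 1)
logits = classifier(pooled_output)                              # (8, 1)

reshaped_logits = logits.view(-1, num_choices)                  # (2, 4): one score per choice
labels = torch.tensor([1, 3])                                   # index of the correct choice
loss = CrossEntropyLoss()(reshaped_logits, labels)
print(flat_input_ids.shape, reshaped_logits.shape, loss.item())
```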
@add_start_docstrings(
"""
CANINE Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
Named-Entity-Recognition (NER) tasks.
""",
CANINE_START_DOCSTRING,
)
class CanineForTokenClassification(CaninePreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.canine = CanineModel(config)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.post_init()
@add_start_docstrings_to_model_forward(CANINE_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=TokenClassifierOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, TokenClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
Returns:
Depending on `return_dict`:
- If `return_dict=True`, returns a `TokenClassifierOutput` containing `loss`, `logits`, `hidden_states`, and `attentions`.
- If `return_dict=False`, returns a tuple with `logits` followed by additional outputs.
Example:
```
>>> from transformers import AutoTokenizer, CanineForTokenClassification
>>> import torch
>>> tokenizer = AutoTokenizer.from_pretrained("google/canine-s")
>>> model = CanineForTokenClassification.from_pretrained("google/canine-s")
>>> inputs = tokenizer(
... "HuggingFace is a company based in Paris and New York", add_special_tokens=False, return_tensors="pt"
... )
>>> with torch.no_grad():
... logits = model(**inputs).logits
>>> predicted_token_class_ids = logits.argmax(-1)
>>> # Note that tokens are classified rather then input words which means that
>>> # there might be more predicted token classes than words.
>>> # Multiple token classes might account for the same word
>>> predicted_tokens_classes = [model.config.id2label[t.item()] for t in predicted_token_class_ids[0]]
>>> predicted_tokens_classes # doctest: +SKIP
```
```
>>> labels = predicted_token_class_ids
>>> loss = model(**inputs, labels=labels).loss
>>> round(loss.item(), 2) # doctest: +SKIP
```"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.canine(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
sequence_output = self.dropout(sequence_output)
logits = self.classifier(sequence_output)
loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return TokenClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
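For token classification the logits are flattened to `(batch * seq_len, num_labels)` and compared against flattened labels; positions that should not contribute to the loss (e.g. padding) are conventionally labelled `-100`, which `CrossEntropyLoss` ignores by default. A toy sketch:

```python
import torch
from torch.nn import CrossEntropyLoss

batch_size, seq_len, num_labels = 2, 5, 3
logits = torch.randn(batch_size, seq_len, num_labels)
labels = torch.tensor([[0, 1, 2, -100, -100],      # -100 marks positions to ignore (default ignore_index)
                       [2, 2, 0, 1, -100]])

loss = CrossEntropyLoss()(logits.view(-1, num_labels), labels.view(-1))
print(loss.item())
```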
@add_start_docstrings(
"""
CANINE Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
CANINE_START_DOCSTRING,
)
class CanineForQuestionAnswering(CaninePreTrainedModel):
"""
CANINE模型,顶部带有用于提取式问答任务(如SQuAD)的跨度分类头部(在隐藏状态输出之上的线性层,用于计算`span start logits`和`span end logits`)。
继承自CaninePreTrainedModel。
"""
def __init__(self, config):
"""
初始化方法,设置模型参数和各层。
Args:
config (CanineConfig): 模型的配置对象,包含模型的各种参数。
"""
super().__init__(config)
self.num_labels = config.num_labels
self.canine = CanineModel(config)
self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
self.post_init()
@add_start_docstrings_to_model_forward(CANINE_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint="Splend1dchan/canine-c-squad",
output_type=QuestionAnsweringModelOutput,
config_class=_CONFIG_FOR_DOC,
expected_output="'nice puppet'",
expected_loss=8.81,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
start_positions: Optional[torch.LongTensor] = None,
end_positions: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
"""
前向传播方法,执行模型的前向计算。
Args:
input_ids (Optional[torch.LongTensor]): 输入token的ids。
attention_mask (Optional[torch.FloatTensor]): 注意力掩码,指示哪些tokens需要注意,哪些不需要。
token_type_ids (Optional[torch.LongTensor]): token类型ids,如segment ids。
position_ids (Optional[torch.LongTensor]): token位置ids。
head_mask (Optional[torch.FloatTensor]): 头部掩码,用于指定哪些层的注意力是有效的。
inputs_embeds (Optional[torch.FloatTensor]): 嵌入的输入。
start_positions (Optional[torch.LongTensor]): 答案起始位置的ids。
end_positions (Optional[torch.LongTensor]): 答案结束位置的ids。
output_attentions (Optional[bool]): 是否返回注意力权重。
output_hidden_states (Optional[bool]): 是否返回隐藏状态。
return_dict (Optional[bool]): 是否返回字典格式的输出。
Returns:
QuestionAnsweringModelOutput: 包含模型预测结果的输出对象。
"""
pass
) -> Union[Tuple, QuestionAnsweringModelOutput]:
r"""
start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.canine(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
logits = self.qa_outputs(sequence_output)
start_logits, end_logits = logits.split(1, dim=-1)
start_logits = start_logits.squeeze(-1)
end_logits = end_logits.squeeze(-1)
total_loss = None
if start_positions is not None and end_positions is not None:
if len(start_positions.size()) > 1:
start_positions = start_positions.squeeze(-1)
if len(end_positions.size()) > 1:
end_positions = end_positions.squeeze(-1)
ignored_index = start_logits.size(1)
start_positions.clamp_(0, ignored_index)
end_positions.clamp_(0, ignored_index)
loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
start_loss = loss_fct(start_logits, start_positions)
end_loss = loss_fct(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2
if not return_dict:
output = (start_logits, end_logits) + outputs[2:]
return ((total_loss,) + output) if total_loss is not None else output
return QuestionAnsweringModelOutput(
loss=total_loss,
start_logits=start_logits,
end_logits=end_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
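For question answering, a single `qa_outputs` projection yields two logits per position, split into start and end scores; gold positions are clamped to the sequence length and that clamp value doubles as `ignore_index`, so answers that fall outside the (possibly truncated) sequence do not contribute to the loss. A standalone sketch of that computation with toy tensors:

```python
import torch
from torch.nn import CrossEntropyLoss

batch_size, seq_len, hidden_size = 2, 8, 16
sequence_output = torch.randn(batch_size, seq_len, hidden_size)
qa_outputs = torch.nn.Linear(hidden_size, 2)

logits = qa_outputs(sequence_output)                 # (batch, seq_len, 2)
start_logits, end_logits = logits.split(1, dim=-1)
start_logits = start_logits.squeeze(-1)              # (batch, seq_len)
end_logits = end_logits.squeeze(-1)

start_positions = torch.tensor([3, 100])             # 100 lies outside the sequence
end_positions = torch.tensor([5, 120])

ignored_index = start_logits.size(1)                 # = seq_len
start_positions = start_positions.clamp(0, ignored_index)
end_positions = end_positions.clamp(0, ignored_index)

loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
start_loss = loss_fct(start_logits, start_positions)
end_loss = loss_fct(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2
print(total_loss.item())
```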