Transformers Source Code Walkthrough (Part 81)
.\models\mvp\tokenization_mvp.py
import json
import os
from functools import lru_cache
from typing import List, Optional, Tuple
import regex as re
from ...tokenization_utils import AddedToken, PreTrainedTokenizer
from ...utils import logging
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"RUCAIBox/mvp": "https://huggingface.co/RUCAIBox/mvp/resolve/main/vocab.json",
},
"added_tokens.json": {
"RUCAIBox/mvp": "https://huggingface.co/RUCAIBox/mvp/resolve/main/added_tokens.json",
},
"merges_file": {
"RUCAIBox/mvp": "https://huggingface.co/RUCAIBox/mvp/resolve/main/merges.txt",
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"RUCAIBox/mvp": 1024,
}
@lru_cache()
def bytes_to_unicode():
"""
Returns a list of utf-8 bytes and a corresponding mapping to unicode strings. We specifically avoid mapping to
whitespace/control characters that the bpe code barfs on.
The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab
if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for
decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup
tables between utf-8 bytes and unicode strings.
"""
bs = (
list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
)
cs = bs[:]
n = 0
for b in range(2**8):
if b not in bs:
bs.append(b)
cs.append(2**8 + n)
n += 1
cs = [chr(n) for n in cs]
return dict(zip(bs, cs))
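# --- Illustrative sketch (not part of the original file) ---
# What bytes_to_unicode() buys us: every possible byte value maps to a printable unicode
# character and the mapping is reversible, so byte-level BPE can round-trip arbitrary UTF-8
# text without ever needing an <unk> token. (Hypothetical demo values.)
demo_byte_encoder = bytes_to_unicode()
demo_byte_decoder = {v: k for k, v in demo_byte_encoder.items()}
assert len(demo_byte_encoder) == 256  # all byte values are covered
demo_bytes = "Héllo!".encode("utf-8")
demo_mapped = "".join(demo_byte_encoder[b] for b in demo_bytes)  # bytes -> printable characters
assert bytes(demo_byte_decoder[c] for c in demo_mapped).decode("utf-8") == "Héllo!"  # lossless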
def get_pairs(word):
"""
Return set of symbol pairs in a word.
Word is represented as tuple of symbols (symbols being variable-length strings).
"""
pairs = set()
prev_char = word[0]
for char in word[1:]:
pairs.add((prev_char, char))
prev_char = char
return pairs
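# --- Illustrative sketch (not part of the original file) ---
# get_pairs() yields the merge candidates the BPE loop ranks on each iteration; for the
# word "hello" represented as a tuple of single characters:
assert get_pairs(("h", "e", "l", "l", "o")) == {("h", "e"), ("e", "l"), ("l", "l"), ("l", "o")}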
class MvpTokenizer(PreTrainedTokenizer):
"""
Constructs a MVP tokenizer, which is smilar to the RoBERTa tokenizer, using byte-level Byte-Pair-Encoding.
"""
>>> from transformers import MvpTokenizer
>>> tokenizer = MvpTokenizer.from_pretrained("RUCAIBox/mvp")
>>> tokenizer("Hello world")["input_ids"]
[0, 31414, 232, 2]
>>> tokenizer(" Hello world")["input_ids"]
[0, 20920, 232, 2]
Args:
vocab_file (`str`):
Path to the vocabulary file.
merges_file (`str`):
Path to the merges file.
errors (`str`, *optional*, defaults to `"replace"`):
Paradigm to follow when decoding bytes to UTF-8. See
[bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
bos_token (`str`, *optional*, defaults to `"<s>"`):
The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
<Tip>
When building a sequence using special tokens, this is not the token that is used for the beginning of
sequence. The token used is the `cls_token`.
</Tip>
eos_token (`str`, *optional*, defaults to `"</s>"`):
The end of sequence token.
<Tip>
When building a sequence using special tokens, this is not the token that is used for the end of sequence.
The token used is the `sep_token`.
</Tip>
sep_token (`str`, *optional*, defaults to `"</s>"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens.
cls_token (`str`, *optional*, defaults to `"<s>"`):
The classifier token which is used when doing sequence classification (classification of the whole sequence
instead of per-token classification). It is the first token of the sequence when built with special tokens.
unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
pad_token (`str`, *optional*, defaults to `"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
mask_token (`str`, *optional*, defaults to `"<mask>"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
add_prefix_space (`bool`, *optional*, defaults to `False`):
Whether or not to add an initial space to the input. This allows the leading word to be treated like any
other word. (The MVP tokenizer detects the beginning of words by the preceding space.)
"""
# Constants and mappings describing the pretrained model's vocabulary files and maximum input sizes
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["input_ids", "attention_mask"]
def __init__(
self,
vocab_file,
merges_file,
errors="replace",
bos_token="<s>",
eos_token="</s>",
sep_token="</s>",
cls_token="<s>",
unk_token="<unk>",
pad_token="<pad>",
mask_token="<mask>",
add_prefix_space=False,
**kwargs,
):
# Constructor: build a new tokenizer instance and initialize its attributes
bos_token = AddedToken(bos_token, special=True) if isinstance(bos_token, str) else bos_token
eos_token = AddedToken(eos_token, special=True) if isinstance(eos_token, str) else eos_token
sep_token = AddedToken(sep_token, special=True) if isinstance(sep_token, str) else sep_token
cls_token = AddedToken(cls_token, special=True) if isinstance(cls_token, str) else cls_token
unk_token = AddedToken(unk_token, special=True) if isinstance(unk_token, str) else unk_token
pad_token = AddedToken(pad_token, special=True) if isinstance(pad_token, str) else pad_token
# If mask_token is a string, wrap it in a special AddedToken with lstrip=True so the mask token absorbs the space before it
mask_token = AddedToken(mask_token, lstrip=True, special=True) if isinstance(mask_token, str) else mask_token
# Load the vocabulary JSON (UTF-8) into self.encoder
with open(vocab_file, encoding="utf-8") as vocab_handle:
self.encoder = json.load(vocab_handle)
# Build self.decoder as the inverse of self.encoder, mapping ids back to tokens
self.decoder = {v: k for k, v in self.encoder.items()}
# How to handle errors when decoding bytes back to text (default: replace)
self.errors = errors
# Byte-to-unicode mapping used by byte-level BPE
self.byte_encoder = bytes_to_unicode()
# Inverse mapping from unicode characters back to raw bytes
self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
# Read the BPE merge rules from merges_file (UTF-8), skipping the version header and trailing newline
with open(merges_file, encoding="utf-8") as merges_handle:
bpe_merges = merges_handle.read().split("\n")[1:-1]
# Turn each merge rule line into a tuple of two symbols
bpe_merges = [tuple(merge.split()) for merge in bpe_merges]
# self.bpe_ranks maps each merge rule to its priority (lower rank = applied earlier)
self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
# Cache for previously computed BPE segmentations
self.cache = {}
# Whether to prepend a space to the input text
self.add_prefix_space = add_prefix_space
# Regex used to pre-split the text into chunks before BPE:
# contractions, runs of letters, runs of digits, other non-space characters, and whitespace
self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
# Call the parent constructor with the same arguments
super().__init__(
errors=errors,
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
sep_token=sep_token,
cls_token=cls_token,
pad_token=pad_token,
mask_token=mask_token,
add_prefix_space=add_prefix_space,
**kwargs,
)
@property
def vocab_size(self):
# Size of the base vocabulary, i.e. the number of entries in self.encoder
return len(self.encoder)
def get_vocab(self):
# Return the full vocabulary, including added special tokens
vocab = self.encoder.copy()
vocab.update(self.added_tokens_encoder)
return vocab
def _tokenize(self, text):
"""Tokenize a string."""
# Collect the BPE tokens produced for the input text
bpe_tokens = []
# Split the text with the pre-tokenization regex and process each chunk
for token in re.findall(self.pat, text):
# Encode each chunk to bytes and map every byte to its unicode placeholder, so BPE never sees raw control characters or spaces
token = "".join(
self.byte_encoder[b] for b in token.encode("utf-8")
) # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
# Run BPE on the chunk and split the result into sub-tokens
bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
# Return all BPE tokens
return bpe_tokens
def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
# Look the token up in the vocabulary, falling back to the unknown token's id
return self.encoder.get(token, self.encoder.get(self.unk_token))
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
# Map an id back to its token string
return self.decoder.get(index)
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (string) in a single string."""
# Concatenate the tokens into a single string of unicode placeholders
text = "".join(tokens)
# Map the placeholders back to bytes and decode as UTF-8, honoring the configured error policy
text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
# Return the decoded string
return text
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
# Make sure the target directory exists; otherwise log an error and bail out
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
# Path of the vocabulary file to write
vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
# Path of the merges file to write
merge_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
)
# Dump the encoder as JSON into the vocabulary file
with open(vocab_file, "w", encoding="utf-8") as f:
f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
index = 0
# Write the BPE merges, ordered by rank, into the merges file
with open(merge_file, "w", encoding="utf-8") as writer:
writer.write("#version: 0.2\n")
for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
# Warn if the merge indices are not consecutive, which would indicate a corrupted tokenizer
if index != token_index:
logger.warning(
f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
" Please check that the tokenizer is not corrupted!"
)
index = token_index
writer.write(" ".join(bpe_tokens) + "\n")
index += 1
# Return both file paths
return vocab_file, merge_file
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. A MVP sequence has the following format:
- single sequence: `<s> X </s>`
- pair of sequences: `<s> A </s></s> B </s>`
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
# Single sequence: add the classifier token at the start and the separator token at the end
if token_ids_1 is None:
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
# Sequence pair: <s> A </s></s> B </s>
cls = [self.cls_token_id]
sep = [self.sep_token_id]
return cls + token_ids_0 + sep + sep + token_ids_1 + sep
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` method.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
# If special tokens are already present, delegate to the base class implementation
if already_has_special_tokens:
return super().get_special_tokens_mask(
token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
)
# Without a second sequence: 1 for <s>, zeros for the sequence, 1 for </s>
if token_ids_1 is None:
return [1] + ([0] * len(token_ids_0)) + [1]
# With a pair: 1 for <s>, zeros for A, 1, 1 for the two </s>, zeros for B, 1 for the final </s>
return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. MVP does not
make use of token type ids, therefore a list of zeros is returned.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of zeros.
"""
# Separator and classifier token ids
sep = [self.sep_token_id]
cls = [self.cls_token_id]
# Without a second sequence, return zeros over <s> A </s>
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
# With a pair, return zeros over <s> A </s></s> B </s>
return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space)
# If the input is pretokenized or add_prefix_space is set, and the text does not already start with a space, prepend one
if (is_split_into_words or add_prefix_space) and (len(text) > 0 and not text[0].isspace()):
text = " " + text
return (text, kwargs)
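# --- Illustrative sketch (not part of the original file) ---
# Layout produced by build_inputs_with_special_tokens above, using the documented ids
# 0 (<s>/cls) and 2 (</s>/sep); the other ids are placeholders:
cls_id, sep_id = 0, 2
ids_a, ids_b = [31414, 232], [713, 16]
single = [cls_id] + ids_a + [sep_id]                           # <s> A </s>
pair = [cls_id] + ids_a + [sep_id, sep_id] + ids_b + [sep_id]  # <s> A </s></s> B </s>
assert single == [0, 31414, 232, 2]
assert pair == [0, 31414, 232, 2, 2, 713, 16, 2]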
.\models\mvp\tokenization_mvp_fast.py
import json
from typing import List, Optional, Tuple
from tokenizers import pre_tokenizers, processors
from ...tokenization_utils_base import AddedToken, BatchEncoding
from ...tokenization_utils_fast import PreTrainedTokenizerFast
from ...utils import logging
from .tokenization_mvp import MvpTokenizer
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"RUCAIBox/mvp": "https://huggingface.co/RUCAIBox/mvp/resolve/main/vocab.json",
},
"added_tokens.json": {
"RUCAIBox/mvp": "https://huggingface.co/RUCAIBox/mvp/resolve/main/added_tokens.json",
},
"merges_file": {
"RUCAIBox/mvp": "https://huggingface.co/RUCAIBox/mvp/resolve/main/merges.txt",
},
"tokenizer_file": {
"RUCAIBox/mvp": "https://huggingface.co/RUCAIBox/mvp/resolve/main/tokenizer.json",
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"RUCAIBox/mvp": 1024,
}
class MvpTokenizerFast(PreTrainedTokenizerFast):
r"""
Construct a "fast" MVP tokenizer (backed by HuggingFace's *tokenizers* library), derived from the GPT-2 tokenizer,
using byte-level Byte-Pair-Encoding.
This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
be encoded differently whether it is at the beginning of the sentence (without space) or not:
```
>>> from transformers import MvpTokenizerFast
>>> tokenizer = MvpTokenizerFast.from_pretrained("RUCAIBox/mvp")
>>> tokenizer("Hello world")["input_ids"]
[0, 31414, 232, 2]
>>> tokenizer(" Hello world")["input_ids"]
[0, 20920, 232, 2]
```
You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer or when you
call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance.
<Tip>
When used with `is_split_into_words=True`, this tokenizer needs to be instantiated with `add_prefix_space=True`.
</Tip>
This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
refer to this superclass for more information regarding those methods.
"""
# MvpTokenizerFast is the MVP tokenizer backed by HuggingFace's tokenizers library, using byte-level BPE
# It inherits from PreTrainedTokenizerFast, which provides most of the main methods and from_pretrained support
# The example above shows how spaces are treated and how a word is encoded differently depending on its position
# In some situations (pretokenized input) extra arguments must be passed when the tokenizer is instantiated
# Class-level attributes: vocabulary file names and the map of pretrained vocabulary files
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
# Maximum input sizes, taken from the pretrained positional embedding sizes
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
# Names of the inputs the model expects
model_input_names = ["input_ids", "attention_mask"]
# The corresponding slow tokenizer class
slow_tokenizer_class = MvpTokenizer
def __init__(
self,
vocab_file=None,
merges_file=None,
tokenizer_file=None,
errors="replace",
bos_token="<s>",
eos_token="</s>",
sep_token="</s>",
cls_token="<s>",
unk_token="<unk>",
pad_token="<pad>",
mask_token="<mask>",
add_prefix_space=False,
trim_offsets=True,
**kwargs,
):
# Constructor: set up the various initialization parameters
@property
def mask_token(self) -> str:
"""
`str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while not
having been set.
MVP tokenizer has a special mask token to be usable in the fill-mask pipeline. The mask token will greedily
comprise the space before the *<mask>*.
"""
# Getter for mask_token: return the current mask token as a string, or None (with an error log) if it is not set
if self._mask_token is None:
if self.verbose:
logger.error("Using mask_token, but it is not set yet.")
return None
return str(self._mask_token)
@mask_token.setter
def mask_token(self, value):
"""
Overriding the default behavior of the mask token to have it eat the space before it.
This is needed to preserve backward compatibility with all the previously used models based on Mvp.
"""
# Setter for mask_token: make the mask token consume the space before it
# If value is a string, wrap it in an AddedToken with lstrip=True and rstrip=False
value = AddedToken(value, lstrip=True, rstrip=False) if isinstance(value, str) else value
self._mask_token = value
def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding:
# Batch encoding entry point, returns a BatchEncoding
is_split_into_words = kwargs.get("is_split_into_words", False)
if is_split_into_words and not self.add_prefix_space:
# Pretokenized input requires add_prefix_space=True; otherwise raise
raise ValueError(
f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True "
"to use it with pretokenized inputs."
)
return super()._batch_encode_plus(*args, **kwargs)
def _encode_plus(self, *args, **kwargs) -> BatchEncoding:
# Single-example encoding entry point, returns a BatchEncoding
is_split_into_words = kwargs.get("is_split_into_words", False)
if is_split_into_words and not self.add_prefix_space:
# Pretokenized input requires add_prefix_space=True; otherwise raise
raise ValueError(
f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True "
"to use it with pretokenized inputs."
)
return super()._encode_plus(*args, **kwargs)
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
# Save the vocabulary files into the given directory
files = self._tokenizer.model.save(save_directory, name=filename_prefix)
return tuple(files)
# Build model inputs by adding the special tokens around the token ids
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
# Start with <s>, then the first sequence, then </s>
output = [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
# With a single sequence, return it directly
if token_ids_1 is None:
return output
# With a pair, append </s>, the second sequence, and a final </s>
return output + [self.eos_token_id] + token_ids_1 + [self.eos_token_id]
# Create the token type ids for one or two sequences
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. MVP does not
make use of token type ids, therefore a list of zeros is returned.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of zeros.
"""
# Separator token id
sep = [self.sep_token_id]
# Classifier (start) token id
cls = [self.cls_token_id]
# Single sequence: zeros over <s> A </s>
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
# Pair: zeros over <s> A </s></s> B </s>
return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
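# --- Illustrative usage sketch (not part of the original file) ---
# Assuming network access to the RUCAIBox/mvp checkpoint. Note that pretokenized input
# requires add_prefix_space=True, otherwise _encode_plus above raises a ValueError.
from transformers import MvpTokenizerFast

fast_tokenizer = MvpTokenizerFast.from_pretrained("RUCAIBox/mvp", add_prefix_space=True)
encoding = fast_tokenizer(["Hello", "world"], is_split_into_words=True)
print(encoding["input_ids"])  # starts with 0 (<s>) and ends with 2 (</s>)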
.\models\mvp\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_tokenizers_available, is_torch_available
_import_structure = {
"configuration_mvp": ["MVP_PRETRAINED_CONFIG_ARCHIVE_MAP", "MvpConfig", "MvpOnnxConfig"],
"tokenization_mvp": ["MvpTokenizer"],
}
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_mvp_fast"] = ["MvpTokenizerFast"]
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_mvp"] = [
"MVP_PRETRAINED_MODEL_ARCHIVE_LIST",
"MvpForCausalLM",
"MvpForConditionalGeneration",
"MvpForQuestionAnswering",
"MvpForSequenceClassification",
"MvpModel",
"MvpPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_mvp import MVP_PRETRAINED_CONFIG_ARCHIVE_MAP, MvpConfig, MvpOnnxConfig
from .tokenization_mvp import MvpTokenizer
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_mvp_fast import MvpTokenizerFast
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_mvp import (
MVP_PRETRAINED_MODEL_ARCHIVE_LIST,
MvpForCausalLM,
MvpForConditionalGeneration,
MvpForQuestionAnswering,
MvpForSequenceClassification,
MvpModel,
MvpPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
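# --- Illustrative sketch (not part of the original file) ---
# Because of the _LazyModule registration above, users simply import the public names from
# `transformers`; the heavy submodules are only loaded on first attribute access.
from transformers import MvpConfig, MvpTokenizer

config = MvpConfig()  # defaults follow the RUCAIBox/mvp architecture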
.\models\nat\configuration_nat.py
""" Neighborhood Attention Transformer model configuration"""
from ...configuration_utils import PretrainedConfig
from ...utils import logging
from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices
logger = logging.get_logger(__name__)
NAT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"shi-labs/nat-mini-in1k-224": "https://huggingface.co/shi-labs/nat-mini-in1k-224/resolve/main/config.json",
}
class NatConfig(BackboneConfigMixin, PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`NatModel`]. It is used to instantiate a Nat model
according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the Nat
[shi-labs/nat-mini-in1k-224](https://huggingface.co/shi-labs/nat-mini-in1k-224) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Example:
```
>>> from transformers import NatConfig, NatModel
>>> # Initializing a Nat shi-labs/nat-mini-in1k-224 style configuration
>>> configuration = NatConfig()
>>> # Initializing a model (with random weights) from the shi-labs/nat-mini-in1k-224 style configuration
>>> model = NatModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
model_type = "nat"
attribute_map = {
"num_attention_heads": "num_heads",
"num_hidden_layers": "num_layers",
}
def __init__(
self,
patch_size=4,
num_channels=3,
embed_dim=64,
depths=[3, 4, 6, 5],
num_heads=[2, 4, 8, 16],
kernel_size=7,
mlp_ratio=3.0,
qkv_bias=True,
hidden_dropout_prob=0.0,
attention_probs_dropout_prob=0.0,
drop_path_rate=0.1,
hidden_act="gelu",
initializer_range=0.02,
layer_norm_eps=1e-5,
layer_scale_init_value=0.0,
out_features=None,
out_indices=None,
**kwargs,
):
super().__init__(**kwargs)
self.patch_size = patch_size
self.num_channels = num_channels
self.embed_dim = embed_dim
self.depths = depths
self.num_layers = len(depths)
self.num_heads = num_heads
self.kernel_size = kernel_size
self.mlp_ratio = mlp_ratio
self.qkv_bias = qkv_bias
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.drop_path_rate = drop_path_rate
self.hidden_act = hidden_act
self.layer_norm_eps = layer_norm_eps
self.initializer_range = initializer_range
self.hidden_size = int(embed_dim * 2 ** (len(depths) - 1))
self.layer_scale_init_value = layer_scale_init_value
self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(depths) + 1)]
self._out_features, self._out_indices = get_aligned_output_features_output_indices(
out_features=out_features, out_indices=out_indices, stage_names=self.stage_names
)
.\models\nat\modeling_nat.py
""" PyTorch Neighborhood Attention Transformer model."""
import math
from dataclasses import dataclass
from typing import Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN
from ...modeling_outputs import BackboneOutput
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import (
ModelOutput,
OptionalDependencyNotAvailable,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
is_natten_available,
logging,
replace_return_docstrings,
requires_backends,
)
from ...utils.backbone_utils import BackboneMixin
from .configuration_nat import NatConfig
if is_natten_available():
from natten.functional import natten2dav, natten2dqkrpb
else:
def natten2dqkrpb(*args, **kwargs):
raise OptionalDependencyNotAvailable()
def natten2dav(*args, **kwargs):
raise OptionalDependencyNotAvailable()
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "NatConfig"
_CHECKPOINT_FOR_DOC = "shi-labs/nat-mini-in1k-224"
_EXPECTED_OUTPUT_SHAPE = [1, 7, 7, 512]
_IMAGE_CLASS_CHECKPOINT = "shi-labs/nat-mini-in1k-224"
_IMAGE_CLASS_EXPECTED_OUTPUT = "tiger cat"
NAT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"shi-labs/nat-mini-in1k-224",
]
@dataclass
class NatEncoderOutput(ModelOutput):
"""
Nat encoder's outputs, with potential hidden states and attentions.
"""
Args:
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
模型最后一层的隐藏状态序列。
Sequence of hidden-states at the output of the last layer of the model.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
模型在每一层输出的隐藏状态的元组。
Tuple of `torch.FloatTensor` representing hidden-states of the model at the output of each layer.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
模型在每一层输出的注意力权重的元组,用于计算自注意力头中的加权平均值。
Tuple of `torch.FloatTensor` representing attention weights after attention softmax.
reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
模型在每一层输出的隐藏状态的元组,包含了空间维度重塑后的输出。
Tuple of `torch.FloatTensor` representing hidden-states of the model at the output of each layer, reshaped to include spatial dimensions.
"""
# Output fields, all defaulting to None
last_hidden_state: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
reshaped_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
# NatModelOutput: the NAT model's outputs, including a pooled summary of the last hidden state
@dataclass
class NatModelOutput(ModelOutput):
"""
Nat model's outputs that also contains a pooling of the last hidden states.
Args:
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*, returned when `add_pooling_layer=True` is passed):
Average pooling of the last layer hidden-state.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
shape `(batch_size, hidden_size, height, width)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
include the spatial dimensions.
"""
# Output fields of the NAT model
last_hidden_state: torch.FloatTensor = None
pooler_output: Optional[torch.FloatTensor] = None
hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
reshaped_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
# NatImageClassifierOutput: outputs of NAT for image classification
@dataclass
class NatImageClassifierOutput(ModelOutput):
"""
Nat outputs for image classification.
"""
"""
Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
分类(如果 `config.num_labels==1` 则为回归)损失。
分类损失或回归损失的张量。
logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
分类(如果 `config.num_labels==1` 则为回归)得分(SoftMax 之前)。
模型输出的分类或回归得分(经过 SoftMax 之前的)。
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
`torch.FloatTensor` 元组(当 `output_hidden_states=True` 传入或 `config.output_hidden_states=True` 时返回),
包含形状为 `(batch_size, sequence_length, hidden_size)` 的张量。
每个层的模型隐藏状态以及初始嵌入输出。
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
`torch.FloatTensor` 元组(当 `output_attentions=True` 传入或 `config.output_attentions=True` 时返回),
包含形状为 `(batch_size, num_heads, sequence_length, sequence_length)` 的张量。
经过注意力 softmax 后的注意力权重,用于计算自注意力头部的加权平均值。
reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
`torch.FloatTensor` 元组(当 `output_hidden_states=True` 传入或 `config.output_hidden_states=True` 时返回),
包含形状为 `(batch_size, hidden_size, height, width)` 的张量。
每个层的模型隐藏状态以及初始嵌入输出重塑为包括空间维度。
"""
# Optional loss, returned when `labels` is provided
loss: Optional[torch.FloatTensor] = None
# Classification or regression scores of shape `(batch_size, config.num_labels)`
logits: torch.FloatTensor = None
# Optional tuple of hidden states, returned when `output_hidden_states=True`
hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
# Optional tuple of attention weights, returned when `output_attentions=True`
attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
# Optional tuple of reshaped hidden states, returned when `output_hidden_states=True`
reshaped_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
class NatEmbeddings(nn.Module):
"""
Construct the patch and position embeddings.
"""
def __init__(self, config):
super().__init__()
self.patch_embeddings = NatPatchEmbeddings(config)  # patch embedding module
self.norm = nn.LayerNorm(config.embed_dim)  # LayerNorm over the embedding dimension
self.dropout = nn.Dropout(config.hidden_dropout_prob)  # dropout on the embeddings
def forward(self, pixel_values: Optional[torch.FloatTensor]) -> Tuple[torch.Tensor]:
embeddings = self.patch_embeddings(pixel_values)  # compute the patch embeddings
embeddings = self.norm(embeddings)  # apply LayerNorm
embeddings = self.dropout(embeddings)  # apply dropout
return embeddings
class NatPatchEmbeddings(nn.Module):
"""
This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
`hidden_states` (patch embeddings) of shape `(batch_size, height, width, hidden_size)` to be consumed by a
Transformer.
"""
def __init__(self, config):
super().__init__()
patch_size = config.patch_size
num_channels, hidden_size = config.num_channels, config.embed_dim
self.num_channels = num_channels
if patch_size == 4:
pass  # patch size 4 needs no extra handling
else:
# TODO: Support arbitrary patch sizes.
raise ValueError("Dinat only supports patch size of 4 at the moment.")  # only patch size 4 is supported for now
self.projection = nn.Sequential(
nn.Conv2d(self.num_channels, hidden_size // 2, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)),
nn.Conv2d(hidden_size // 2, hidden_size, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)),
)  # two stride-2 convolutions that project the pixels to the embedding dimension
def forward(self, pixel_values: Optional[torch.FloatTensor]) -> torch.Tensor:
_, num_channels, height, width = pixel_values.shape
if num_channels != self.num_channels:
raise ValueError(
"Make sure that the channel dimension of the pixel values match with the one set in the configuration."
)  # the channel dimension must match the configuration
embeddings = self.projection(pixel_values)  # project the pixels
embeddings = embeddings.permute(0, 2, 3, 1)  # reorder to (batch, height, width, hidden_size)
return embeddings
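# --- Illustrative sketch (not part of the original file) ---
# The two stride-2 convolutions above give an overall 4x spatial reduction (the "patch size
# of 4"); for a 224x224 RGB image with embed_dim=64 the shapes evolve as
# (1, 3, 224, 224) -> (1, 32, 112, 112) -> (1, 64, 56, 56) -> permute -> (1, 56, 56, 64):
import torch
from torch import nn

demo_projection = nn.Sequential(
    nn.Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)),
    nn.Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)),
)
demo_embeddings = demo_projection(torch.randn(1, 3, 224, 224)).permute(0, 2, 3, 1)
assert demo_embeddings.shape == (1, 56, 56, 64)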
class NatDownsampler(nn.Module):
"""
Convolutional Downsampling Layer.
Args:
dim (`int`):
Number of input channels.
norm_layer (`nn.Module`, *optional*, defaults to `nn.LayerNorm`):
Normalization layer class.
"""
def __init__(self, dim: int, norm_layer: nn.Module = nn.LayerNorm) -> None:
super().__init__()
self.dim = dim
self.reduction = nn.Conv2d(dim, 2 * dim, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
self.norm = norm_layer(2 * dim)  # normalization layer over the doubled channel dimension
def forward(self, input_feature: torch.Tensor) -> torch.Tensor:
input_feature = self.reduction(input_feature.permute(0, 3, 1, 2)).permute(0, 2, 3, 1)  # strided convolution for 2x downsampling
input_feature = self.norm(input_feature)  # apply the normalization layer
return input_feature
# Drop paths (stochastic depth) per sample, applied in the main path of residual blocks
def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
"""
Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
argument.
"""
# If the drop probability is zero or we are not training, return the input unchanged
if drop_prob == 0.0 or not training:
return input
# Probability of keeping the path
keep_prob = 1 - drop_prob
# Build a random tensor broadcastable over all non-batch dimensions
shape = (input.shape[0],) + (1,) * (input.ndim - 1)
random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
random_tensor.floor_()  # binarize
# Rescale the kept samples and zero out the dropped ones
output = input.div(keep_prob) * random_tensor
return output
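# --- Illustrative sketch (not part of the original file) ---
# Stochastic depth zeroes a whole sample's residual branch with probability drop_prob and
# rescales the survivors by 1 / keep_prob, so the branch keeps the same expected value:
import torch

demo_input = torch.ones(8, 4)
demo_output = drop_path(demo_input, drop_prob=0.5, training=True)
# every row is either all zeros (dropped) or all 2.0 (kept and rescaled by 1 / 0.5)
assert set(demo_output.unique().tolist()) <= {0.0, 2.0}
assert torch.equal(drop_path(demo_input, drop_prob=0.5, training=False), demo_input)  # identity at eval time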
# Copied from transformers.models.beit.modeling_beit.BeitDropPath with Beit->Nat
class NatDropPath(nn.Module):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
def __init__(self, drop_prob: Optional[float] = None) -> None:
super().__init__()
self.drop_prob = drop_prob
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
# Delegate to the drop_path function above with this module's drop probability and training flag
return drop_path(hidden_states, self.drop_prob, self.training)
def extra_repr(self) -> str:
# Extra repr string showing the drop probability
return "p={}".format(self.drop_prob)
class NeighborhoodAttention(nn.Module):
def __init__(self, config, dim, num_heads, kernel_size):
super().__init__()
# The hidden size must be divisible by the number of attention heads
if dim % num_heads != 0:
raise ValueError(
f"The hidden size ({dim}) is not a multiple of the number of attention heads ({num_heads})"
)
self.num_attention_heads = num_heads
self.attention_head_size = int(dim / num_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.kernel_size = kernel_size
# rpb is the learnable relative positional bias, the same concept as in Swin
self.rpb = nn.Parameter(torch.zeros(num_heads, (2 * self.kernel_size - 1), (2 * self.kernel_size - 1)))
# Linear projections for queries, keys and values
self.query = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)
self.key = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)
self.value = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)
# Dropout on the attention probabilities, as configured
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
def transpose_for_scores(self, x):
# Reshape the tensor so attention scores can be computed per head
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
x = x.view(new_x_shape)
return x.permute(0, 3, 1, 2, 4)
# Forward pass: takes the hidden states and whether to return the attention weights
def forward(
self,
hidden_states: torch.Tensor,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor]:
# Project the hidden states to queries and reshape per head
query_layer = self.transpose_for_scores(self.query(hidden_states))
# Project the hidden states to keys and reshape per head
key_layer = self.transpose_for_scores(self.key(hidden_states))
# Project the hidden states to values and reshape per head
value_layer = self.transpose_for_scores(self.value(hidden_states))
# Apply the scaling factor to the queries before computing attention scores; since scalars
# commute in matrix multiplication the result is the same, but this is usually more efficient
query_layer = query_layer / math.sqrt(self.attention_head_size)
# Compute the "query" x "key" attention scores with the relative positional bias added
attention_scores = natten2dqkrpb(query_layer, key_layer, self.rpb, self.kernel_size, 1)
# Normalize the attention scores to probabilities
attention_probs = nn.functional.softmax(attention_scores, dim=-1)
# Randomly drop entire attention probabilities, as in the original Transformer paper
attention_probs = self.dropout(attention_probs)
# Weight the values by the attention probabilities to obtain the context
context_layer = natten2dav(attention_probs, value_layer, self.kernel_size, 1)
# Reorder the context back to (batch, height, width, heads, head_dim)
context_layer = context_layer.permute(0, 2, 3, 1, 4).contiguous()
# Merge the head dimensions back into a single hidden dimension
new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
context_layer = context_layer.view(new_context_layer_shape)
# Optionally return the attention probabilities along with the context
outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
# Return the outputs tuple
return outputs
# Output projection module of the neighborhood attention block
class NeighborhoodAttentionOutput(nn.Module):
def __init__(self, config, dim):
super().__init__()
# Linear projection that keeps the dimension at dim
self.dense = nn.Linear(dim, dim)
# Dropout with the configured attention dropout probability
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
# Apply the linear projection
hidden_states = self.dense(hidden_states)
# Apply dropout
hidden_states = self.dropout(hidden_states)
return hidden_states
# Full neighborhood attention module: self-attention plus output projection
class NeighborhoodAttentionModule(nn.Module):
def __init__(self, config, dim, num_heads, kernel_size):
super().__init__()
# The neighborhood self-attention sub-module
self.self = NeighborhoodAttention(config, dim, num_heads, kernel_size)
# The output projection sub-module
self.output = NeighborhoodAttentionOutput(config, dim)
# Set of attention heads that have been pruned
self.pruned_heads = set()
def prune_heads(self, heads):
if len(heads) == 0:
return
# Find the prunable heads and the indices of their parameters
heads, index = find_pruneable_heads_and_indices(
heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
)
# Prune the linear layers
self.self.query = prune_linear_layer(self.self.query, index)
self.self.key = prune_linear_layer(self.self.key, index)
self.self.value = prune_linear_layer(self.self.value, index)
self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
# Update the hyper-parameters and remember the pruned heads
self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
self.pruned_heads = self.pruned_heads.union(heads)
def forward(
self,
hidden_states: torch.Tensor,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor]:
# Run neighborhood self-attention on the hidden states
self_outputs = self.self(hidden_states, output_attentions)
# Feed the self-attention output (together with the original hidden states) through the output projection
attention_output = self.output(self_outputs[0], hidden_states)
# Build the output tuple, appending the attention weights if requested
outputs = (attention_output,) + self_outputs[1:]
return outputs
# MLP intermediate layer of a NAT block
class NatIntermediate(nn.Module):
def __init__(self, config, dim):
super().__init__()
# Linear layer expanding dim to mlp_ratio * dim
self.dense = nn.Linear(dim, int(config.mlp_ratio * dim))
# Resolve the hidden activation, either from the ACT2FN mapping or as a callable
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
# Apply the expanding linear layer
hidden_states = self.dense(hidden_states)
# Apply the non-linearity
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states
# MLP output layer of a NAT block
class NatOutput(nn.Module):
def __init__(self, config, dim):
super().__init__()
# Linear layer projecting mlp_ratio * dim back to dim
self.dense = nn.Linear(int(config.mlp_ratio * dim), dim)
# Dropout with the configured hidden dropout probability
self.dropout = nn.Dropout(config.hidden_dropout_prob)
# Forward pass: project back to dim and apply dropout
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
# Apply the projection
hidden_states = self.dense(hidden_states)
# Apply dropout
hidden_states = self.dropout(hidden_states)
# Return the processed tensor
return hidden_states
class NatLayer(nn.Module):
def __init__(self, config, dim, num_heads, drop_path_rate=0.0):
super().__init__()
# Chunk size for the feed-forward pass
self.chunk_size_feed_forward = config.chunk_size_feed_forward
# Neighborhood attention kernel size
self.kernel_size = config.kernel_size
# LayerNorm applied before the attention module
self.layernorm_before = nn.LayerNorm(dim, eps=config.layer_norm_eps)
# Neighborhood attention module
self.attention = NeighborhoodAttentionModule(config, dim, num_heads, kernel_size=self.kernel_size)
# Stochastic depth, or identity when the drop path rate is zero
self.drop_path = NatDropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
# LayerNorm applied after the attention module
self.layernorm_after = nn.LayerNorm(dim, eps=config.layer_norm_eps)
# MLP intermediate layer
self.intermediate = NatIntermediate(config, dim)
# MLP output layer
self.output = NatOutput(config, dim)
# Optional layer-scale parameters, enabled when layer_scale_init_value > 0
self.layer_scale_parameters = (
nn.Parameter(config.layer_scale_init_value * torch.ones((2, dim)), requires_grad=True)
if config.layer_scale_init_value > 0
else None
)
def maybe_pad(self, hidden_states, height, width):
# The window size equals the kernel size
window_size = self.kernel_size
pad_values = (0, 0, 0, 0, 0, 0)
# Pad if the input height or width is smaller than the window size
if height < window_size or width < window_size:
pad_l = pad_t = 0
pad_r = max(0, window_size - width)
pad_b = max(0, window_size - height)
pad_values = (0, 0, pad_l, pad_r, pad_t, pad_b)
hidden_states = nn.functional.pad(hidden_states, pad_values)
return hidden_states, pad_values
def forward(
self,
hidden_states: torch.Tensor,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor, torch.Tensor]:
# Unpack batch size, height, width and channels
batch_size, height, width, channels = hidden_states.size()
# Keep the input as the residual shortcut
shortcut = hidden_states
# LayerNorm before attention
hidden_states = self.layernorm_before(hidden_states)
# Pad if the input is smaller than the kernel size
hidden_states, pad_values = self.maybe_pad(hidden_states, height, width)
# Height and width after padding
_, height_pad, width_pad, _ = hidden_states.shape
# Run neighborhood attention
attention_outputs = self.attention(hidden_states, output_attentions=output_attentions)
# Attention output
attention_output = attention_outputs[0]
# Check whether padding was applied
was_padded = pad_values[3] > 0 or pad_values[5] > 0
if was_padded:
# Crop the attention output back to the original spatial size
attention_output = attention_output[:, :height, :width, :].contiguous()
# Apply the first layer-scale parameter, if enabled
if self.layer_scale_parameters is not None:
attention_output = self.layer_scale_parameters[0] * attention_output
# Residual connection around the attention branch, with drop path
hidden_states = shortcut + self.drop_path(attention_output)
# LayerNorm after attention
layer_output = self.layernorm_after(hidden_states)
# MLP: intermediate expansion followed by the output projection
layer_output = self.output(self.intermediate(layer_output))
# Apply the second layer-scale parameter, if enabled
if self.layer_scale_parameters is not None:
layer_output = self.layer_scale_parameters[1] * layer_output
# Residual connection around the MLP branch, with drop path
layer_output = hidden_states + self.drop_path(layer_output)
# Optionally include the attention weights in the returned tuple
layer_outputs = (layer_output, attention_outputs[1]) if output_attentions else (layer_output,)
return layer_outputs
class NatStage(nn.Module):
# One stage of the NAT encoder: a stack of NatLayer blocks plus an optional downsampler
def __init__(self, config, dim, depth, num_heads, drop_path_rate, downsample):
# Call the parent constructor
super().__init__()
# Store the configuration and the input dimension
self.config = config
self.dim = dim
# Build the stack of NatLayer blocks for this stage
self.layers = nn.ModuleList(
[
NatLayer(
config=config,
dim=dim,
num_heads=num_heads,
drop_path_rate=drop_path_rate[i],
)
for i in range(depth)  # one layer per unit of depth
]
)
# Create the downsampling layer if one was requested
if downsample is not None:
self.downsample = downsample(dim=dim, norm_layer=nn.LayerNorm)
else:
self.downsample = None  # no downsampling for this stage
# Indicator flag (pointing state)
self.pointing = False
# Forward pass through all layers of this stage, followed by optional downsampling
def forward(
self,
hidden_states: torch.Tensor,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor]:
_, height, width, _ = hidden_states.size()  # spatial size of the input
# Pass the hidden states through every layer of the stage in order
for i, layer_module in enumerate(self.layers):
layer_outputs = layer_module(hidden_states, output_attentions)
hidden_states = layer_outputs[0]  # updated hidden states
hidden_states_before_downsampling = hidden_states  # keep the pre-downsampling states
# Downsample if this stage has a downsampler
if self.downsample is not None:
hidden_states = self.downsample(hidden_states_before_downsampling)
stage_outputs = (hidden_states, hidden_states_before_downsampling)  # stage output tuple
# Append the attention weights if requested
if output_attentions:
stage_outputs += layer_outputs[1:]
return stage_outputs  # return the stage outputs
class NatEncoder(nn.Module):
# NAT encoder: a sequence of NatStage levels
def __init__(self, config):
super().__init__()
# Constructor takes the model configuration
# Number of stages, i.e. the length of the depths list
self.num_levels = len(config.depths)
# Keep the configuration around
self.config = config
# Linearly spaced drop-path rates over all layers of the network
dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths))]
# Build one NatStage per level
self.levels = nn.ModuleList(
[
NatStage(
config=config,
# embedding dimension of this stage (doubles at every stage)
dim=int(config.embed_dim * 2**i_layer),
# number of layers in this stage
depth=config.depths[i_layer],
# number of attention heads in this stage
num_heads=config.num_heads[i_layer],
# slice of the drop-path schedule belonging to this stage
drop_path_rate=dpr[sum(config.depths[:i_layer]) : sum(config.depths[: i_layer + 1])],
# downsample after every stage except the last one
downsample=NatDownsampler if (i_layer < self.num_levels - 1) else None,
)
# iterate over all levels
for i_layer in range(self.num_levels)
]
)
# Forward pass over all stages
def forward(
self,
hidden_states: torch.Tensor,
output_attentions: Optional[bool] = False,
output_hidden_states: Optional[bool] = False,
output_hidden_states_before_downsampling: Optional[bool] = False,
return_dict: Optional[bool] = True,
) -> Union[Tuple, NatEncoderOutput]:
all_hidden_states = () if output_hidden_states else None
all_reshaped_hidden_states = () if output_hidden_states else None
all_self_attentions = () if output_attentions else None
if output_hidden_states:
# Rearrange the hidden states from (b, h, w, c) to (b, c, h, w)
reshaped_hidden_state = hidden_states.permute(0, 3, 1, 2)
# Record the current hidden states
all_hidden_states += (hidden_states,)
# Record the reshaped hidden states
all_reshaped_hidden_states += (reshaped_hidden_state,)
for i, layer_module in enumerate(self.levels):
# Run the current stage on the hidden states
layer_outputs = layer_module(hidden_states, output_attentions)
# Hidden states after the stage (including downsampling)
hidden_states = layer_outputs[0]
# Hidden states of the stage before downsampling
hidden_states_before_downsampling = layer_outputs[1]
if output_hidden_states and output_hidden_states_before_downsampling:
# Rearrange the pre-downsampling hidden states to (b, c, h, w)
reshaped_hidden_state = hidden_states_before_downsampling.permute(0, 3, 1, 2)
# Record the pre-downsampling hidden states
all_hidden_states += (hidden_states_before_downsampling,)
# Record their reshaped version
all_reshaped_hidden_states += (reshaped_hidden_state,)
elif output_hidden_states and not output_hidden_states_before_downsampling:
# Rearrange the current hidden states to (b, c, h, w)
reshaped_hidden_state = hidden_states.permute(0, 3, 1, 2)
# Record the current hidden states
all_hidden_states += (hidden_states,)
# Record their reshaped version
all_reshaped_hidden_states += (reshaped_hidden_state,)
if output_attentions:
# Record the attention weights of this stage
all_self_attentions += layer_outputs[2:]
if not return_dict:
# Without return_dict, return the non-None parts as a plain tuple
return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
# Otherwise, wrap everything in a NatEncoderOutput
return NatEncoderOutput(
last_hidden_state=hidden_states,
hidden_states=all_hidden_states,
attentions=all_self_attentions,
reshaped_hidden_states=all_reshaped_hidden_states,
)
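# --- Illustrative sketch (not part of the original file) ---
# The `dpr` schedule built in NatEncoder.__init__ ramps the drop-path rate linearly over
# all layers of the network and slices it per stage, so deeper layers drop more often:
import torch

demo_depths, demo_drop_path_rate = [3, 4, 6, 5], 0.1
demo_dpr = [x.item() for x in torch.linspace(0, demo_drop_path_rate, sum(demo_depths))]
demo_per_stage = [demo_dpr[sum(demo_depths[:i]) : sum(demo_depths[: i + 1])] for i in range(len(demo_depths))]
assert [len(rates) for rates in demo_per_stage] == demo_depths
assert demo_dpr[0] == 0.0 and abs(demo_dpr[-1] - demo_drop_path_rate) < 1e-6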
class NatPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
# The configuration class for this model
config_class = NatConfig
# Prefix used for the base model's weights
base_model_prefix = "nat"
# Name of the main input
main_input_name = "pixel_values"
def _init_weights(self, module):
"""Initialize the weights"""
# Linear and convolutional layers are initialized from a normal distribution
if isinstance(module, (nn.Linear, nn.Conv2d)):
# Slightly different from the TF version, which uses a truncated normal distribution
# cf https://github.com/pytorch/pytorch/pull/5617
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
# Biases, if present, are initialized to zero
if module.bias is not None:
module.bias.data.zero_()
# LayerNorm layers get zero bias and unit weight
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
NAT_START_DOCSTRING = r"""
This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
Parameters:
config ([`NatConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
# Docstring for the inputs accepted by the NAT models
NAT_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`ViTImageProcessor.__call__`]
for details.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
# Decorate NatModel with add_start_docstrings to attach the model description and parameter docs
@add_start_docstrings(
"The bare Nat Model transformer outputting raw hidden-states without any specific head on top.",
NAT_START_DOCSTRING,
)
class NatModel(NatPreTrainedModel):
def __init__(self, config, add_pooling_layer=True):
super().__init__(config)
# Make sure the required natten backend is available
requires_backends(self, ["natten"])
# Store the configuration
self.config = config
# Number of stages in the model
self.num_levels = len(config.depths)
# Dimension of the final feature map
self.num_features = int(config.embed_dim * 2 ** (self.num_levels - 1))
# Patch embedding module
self.embeddings = NatEmbeddings(config)
# NAT encoder
self.encoder = NatEncoder(config)
# Final layer normalization
self.layernorm = nn.LayerNorm(self.num_features, eps=config.layer_norm_eps)
# Optional adaptive average pooling layer
self.pooler = nn.AdaptiveAvgPool1d(1) if add_pooling_layer else None
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
# Return the patch embeddings
return self.embeddings.patch_embeddings
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
# Iterate over the layers and the heads to prune in each of them
for layer, heads in heads_to_prune.items():
# Prune the attention heads of the given layer in the encoder
self.encoder.layer[layer].attention.prune_heads(heads)
@add_start_docstrings_to_model_forward(NAT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=NatModelOutput,
config_class=_CONFIG_FOR_DOC,
modality="vision",
expected_output=_EXPECTED_OUTPUT_SHAPE,
)
def forward(
self,
pixel_values: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, NatModelOutput]:
# Fall back to the configuration's output_attentions setting when not specified
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
# Fall back to the configuration's output_hidden_states setting when not specified
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
# Fall back to the configuration's use_return_dict setting when not specified
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# pixel_values is required
if pixel_values is None:
raise ValueError("You have to specify pixel_values")
# Compute the patch embeddings
embedding_output = self.embeddings(pixel_values)
# Run the encoder on the embeddings
encoder_outputs = self.encoder(
embedding_output,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# Last hidden state of the encoder
sequence_output = encoder_outputs[0]
# Apply the final layer normalization
sequence_output = self.layernorm(sequence_output)
# Pooled output defaults to None
pooled_output = None
# If a pooler is present, average-pool the sequence output
if self.pooler is not None:
pooled_output = self.pooler(sequence_output.flatten(1, 2).transpose(1, 2))
pooled_output = torch.flatten(pooled_output, 1)
# Plain tuple output when return_dict is False
if not return_dict:
# Return the sequence output, the pooled output and any extra encoder outputs
output = (sequence_output, pooled_output) + encoder_outputs[1:]
return output
# Otherwise return a NatModelOutput
return NatModelOutput(
last_hidden_state=sequence_output,
pooler_output=pooled_output,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
reshaped_hidden_states=encoder_outputs.reshaped_hidden_states,
)
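# --- Illustrative usage sketch (not part of the original file) ---
# Assuming the `natten` backend and the shi-labs/nat-mini-in1k-224 weights are available,
# the bare model maps a 224x224 input to a (1, 7, 7, 512) feature map (matching
# _EXPECTED_OUTPUT_SHAPE above) plus a (1, 512) pooled vector:
import torch
from transformers import NatModel

demo_model = NatModel.from_pretrained("shi-labs/nat-mini-in1k-224")
with torch.no_grad():
    demo_outputs = demo_model(pixel_values=torch.randn(1, 3, 224, 224))
print(demo_outputs.last_hidden_state.shape)  # torch.Size([1, 7, 7, 512])
print(demo_outputs.pooler_output.shape)      # torch.Size([1, 512])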
# Decorator adding the docstring: NAT transformer with an image classification head (a linear layer on top of the pooled features)
@add_start_docstrings(
"""
Nat Model transformer with an image classification head on top (a linear layer on top of the final hidden state of
the [CLS] token) e.g. for ImageNet.
""",
NAT_START_DOCSTRING,
)
class NatForImageClassification(NatPreTrainedModel):
def __init__(self, config):
super().__init__(config)
# Make sure the required natten backend is available
requires_backends(self, ["natten"])
# Number of classification labels
self.num_labels = config.num_labels
# Backbone NAT model
self.nat = NatModel(config)
# Classification head
self.classifier = (
nn.Linear(self.nat.num_features, config.num_labels) if config.num_labels > 0 else nn.Identity()
)
# Initialize weights and apply final processing
self.post_init()
# Decorators documenting the forward method's inputs and adding a code sample
@add_start_docstrings_to_model_forward(NAT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=NatImageClassifierOutput,
config_class=_CONFIG_FOR_DOC,
expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
)
def forward(
self,
pixel_values: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, NatImageClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
# Fall back to the configuration's use_return_dict setting when not specified
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Run the NAT backbone, forwarding the output flags
outputs = self.nat(
pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# Pooled output used for classification
pooled_output = outputs[1]
# Compute the classification logits
logits = self.classifier(pooled_output)
# Loss defaults to None
loss = None
# 如果提供了标签 labels,则计算损失
if labels is not None:
# 如果问题类型未指定,则根据标签类型确定问题类型
if self.config.problem_type is None:
if self.num_labels == 1:
self.config.problem_type = "regression"
elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
self.config.problem_type = "single_label_classification"
else:
self.config.problem_type = "multi_label_classification"
# 根据问题类型选择对应的损失函数
if self.config.problem_type == "regression":
loss_fct = MSELoss()
if self.num_labels == 1:
# 对于单标签回归任务,计算均方误差损失
loss = loss_fct(logits.squeeze(), labels.squeeze())
else:
# 对于多标签回归任务,计算均方误差损失
loss = loss_fct(logits, labels)
elif self.config.problem_type == "single_label_classification":
# 对于单标签分类任务,计算交叉熵损失
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
# 对于多标签分类任务,计算二元交叉熵损失
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(logits, labels)
# 如果不需要返回字典形式的输出,则按照元组的形式返回结果
if not return_dict:
output = (logits,) + outputs[2:] # outputs[2:] 包含额外的隐藏状态信息
return ((loss,) + output) if loss is not None else output
# 返回自定义的输出类 NatImageClassifierOutput,包含损失、logits、隐藏状态和注意力权重等信息
return NatImageClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
reshaped_hidden_states=outputs.reshaped_hidden_states,
)
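As a quick, hedged illustration of how this classification head is used end to end (not part of the library source; it assumes the `shi-labs/nat-mini-in1k-224` checkpoint shown in the backbone example below and that the optional `natten` backend is installed):
```
# Hypothetical inference sketch for NatForImageClassification (requires the `natten` backend).
import requests
import torch
from PIL import Image
from transformers import AutoImageProcessor, NatForImageClassification

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

processor = AutoImageProcessor.from_pretrained("shi-labs/nat-mini-in1k-224")
model = NatForImageClassification.from_pretrained("shi-labs/nat-mini-in1k-224")

with torch.no_grad():
    outputs = model(**processor(image, return_tensors="pt"))
# The predicted ImageNet class is the argmax over the logits produced by self.classifier.
predicted_label = outputs.logits.argmax(-1).item()
print(model.config.id2label[predicted_label])
```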
# Decorator: document the class as the NAT backbone, usable with frameworks such as DETR and MaskFormer.
# NAT_START_DOCSTRING is the shared, predefined part of the docstring.
@add_start_docstrings(
"NAT backbone, to be used with frameworks like DETR and MaskFormer.",
NAT_START_DOCSTRING,
)
class NatBackbone(NatPreTrainedModel, BackboneMixin):
def __init__(self, config):
# Initialize the PreTrainedModel part with the NAT config
super().__init__(config)
# Initialize the backbone bookkeeping (stage names, out_features, channels)
super()._init_backbone(config)
# Make sure the required "natten" backend is available
requires_backends(self, ["natten"])
# Patch embedding layer
self.embeddings = NatEmbeddings(config)
# Encoder stack
self.encoder = NatEncoder(config)
# Number of channels of each feature stage
self.num_features = [config.embed_dim] + [int(config.embed_dim * 2**i) for i in range(len(config.depths))]
# One LayerNorm per requested output stage
hidden_states_norms = {}
for stage, num_channels in zip(self.out_features, self.channels):
hidden_states_norms[stage] = nn.LayerNorm(num_channels)
self.hidden_states_norms = nn.ModuleDict(hidden_states_norms)
# Initialize weights and apply final processing
self.post_init()
# Return the patch embedding layer
def get_input_embeddings(self):
return self.embeddings.patch_embeddings
# Forward pass, with decorators adding the input docstring and the return-type docstring
@add_start_docstrings_to_model_forward(NAT_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=BackboneOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
pixel_values: torch.Tensor,
output_hidden_states: Optional[bool] = None,
output_attentions: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
"""
根据输入的参数返回BackboneOutput对象。
Parameters:
return_dict (bool, optional): 控制是否返回字典形式的输出,默认从self.config.use_return_dict获取。
output_hidden_states (bool, optional): 控制是否输出隐藏状态,默认从self.config.output_hidden_states获取。
output_attentions (bool, optional): 控制是否输出注意力,默认从self.config.output_attentions获取。
Returns:
BackboneOutput: 包含特征图、隐藏状态和注意力的输出对象。
Examples:
```
>>> from transformers import AutoImageProcessor, AutoBackbone
>>> import torch
>>> from PIL import Image
>>> import requests
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> processor = AutoImageProcessor.from_pretrained("shi-labs/nat-mini-in1k-224")
>>> model = AutoBackbone.from_pretrained(
... "shi-labs/nat-mini-in1k-224", out_features=["stage1", "stage2", "stage3", "stage4"]
... )
>>> inputs = processor(image, return_tensors="pt")
>>> outputs = model(**inputs)
>>> feature_maps = outputs.feature_maps
>>> list(feature_maps[-1].shape)
[1, 512, 7, 7]
```
"""
# Fall back to self.config.use_return_dict when return_dict is not given
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Fall back to self.config.output_hidden_states when output_hidden_states is not given
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
# Fall back to self.config.output_attentions when output_attentions is not given
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
# Embed the input pixel values
embedding_output = self.embeddings(pixel_values)
# Run the encoder on the embedded patches
outputs = self.encoder(
embedding_output,
output_attentions=output_attentions,
output_hidden_states=True,
output_hidden_states_before_downsampling=True,
return_dict=True,
)
# Hidden states reshaped back to (batch, channels, height, width)
hidden_states = outputs.reshaped_hidden_states
# Feature maps start as an empty tuple
feature_maps = ()
# For every stage requested in out_features, normalize its hidden state and collect it as a feature map
for stage, hidden_state in zip(self.stage_names, hidden_states):
if stage in self.out_features:
# TODO can we simplify this?
batch_size, num_channels, height, width = hidden_state.shape
hidden_state = hidden_state.permute(0, 2, 3, 1).contiguous()
hidden_state = hidden_state.view(batch_size, height * width, num_channels)
hidden_state = self.hidden_states_norms[stage](hidden_state)
hidden_state = hidden_state.view(batch_size, height, width, num_channels)
hidden_state = hidden_state.permute(0, 3, 1, 2).contiguous()
feature_maps += (hidden_state,)
# Tuple output when return_dict is disabled
if not return_dict:
output = (feature_maps,)
if output_hidden_states:
output += (outputs.hidden_states,)
return output
# Otherwise return a BackboneOutput with the feature maps, hidden states and attentions
return BackboneOutput(
feature_maps=feature_maps,
hidden_states=outputs.hidden_states if output_hidden_states else None,
attentions=outputs.attentions,
)
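The loop above applies each stage's LayerNorm in channels-last layout before converting back to channels-first feature maps. A standalone sketch of that reshape round-trip with made-up sizes (illustrative only):
```
# Illustrative reshape round-trip used for the per-stage LayerNorm (made-up sizes).
import torch
from torch import nn

batch_size, num_channels, height, width = 2, 64, 7, 7
hidden_state = torch.randn(batch_size, num_channels, height, width)
norm = nn.LayerNorm(num_channels)

# (B, C, H, W) -> (B, H*W, C): LayerNorm normalizes over the channel dimension.
x = hidden_state.permute(0, 2, 3, 1).contiguous().view(batch_size, height * width, num_channels)
x = norm(x)
# Back to (B, C, H, W) for downstream frameworks that expect channels-first maps.
x = x.view(batch_size, height, width, num_channels).permute(0, 3, 1, 2).contiguous()
assert x.shape == hidden_state.shape
```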
.\models\nat\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
_import_structure = {"configuration_nat": ["NAT_PRETRAINED_CONFIG_ARCHIVE_MAP", "NatConfig"]}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_nat"] = [
"NAT_PRETRAINED_MODEL_ARCHIVE_LIST",
"NatForImageClassification",
"NatModel",
"NatPreTrainedModel",
"NatBackbone",
]
if TYPE_CHECKING:
from .configuration_nat import NAT_PRETRAINED_CONFIG_ARCHIVE_MAP, NatConfig
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_nat import (
NAT_PRETRAINED_MODEL_ARCHIVE_LIST,
NatBackbone,
NatForImageClassification,
NatModel,
NatPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\nezha\configuration_nezha.py
from ... import PretrainedConfig
NEZHA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"sijunhe/nezha-cn-base": "https://huggingface.co/sijunhe/nezha-cn-base/resolve/main/config.json",
}
class NezhaConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of an [`NezhaModel`]. It is used to instantiate an Nezha
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the Nezha
[sijunhe/nezha-cn-base](https://huggingface.co/sijunhe/nezha-cn-base) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Configuration class for the NEZHA model: it defines the model's parameters and hyperparameters.
Args:
vocab_size (`int`, optional, defaults to 21128):
Vocabulary size of the NEZHA model. Defines the different tokens that can be represented by the
`input_ids` passed to [`NezhaModel`].
hidden_size (`int`, optional, defaults to 768):
Dimensionality of the encoder layers and the pooler layer.
num_hidden_layers (`int`, optional, defaults to 12):
Number of hidden layers in the Transformer encoder.
num_attention_heads (`int`, optional, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
intermediate_size (`int`, optional, defaults to 3072):
Dimensionality of the "intermediate" (i.e. feed-forward) layer in the Transformer encoder.
hidden_act (`str` or `function`, optional, defaults to "gelu"):
The non-linear activation function in the encoder and pooler.
hidden_dropout_prob (`float`, optional, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob (`float`, optional, defaults to 0.1):
The dropout ratio for the attention probabilities.
max_position_embeddings (`int`, optional, defaults to 512):
The maximum sequence length that this model might ever be used with. Typically set to a large value
(e.g. 512, 1024 or 2048).
type_vocab_size (`int`, optional, defaults to 2):
The vocabulary size of the `token_type_ids` passed to [`NezhaModel`].
initializer_range (`float`, optional, defaults to 0.02):
The standard deviation of the truncated normal initializer used to initialize all weight matrices.
layer_norm_eps (`float`, optional, defaults to 1e-12):
The epsilon used by the layer normalization layers.
classifier_dropout (`float`, optional, defaults to 0.1):
The dropout ratio for the attached classifiers.
is_decoder (`bool`, *optional*, defaults to `False`):
Whether the model is used as a decoder. If `False`, the model is used as an encoder.
Example:
```
>>> from transformers import NezhaConfig, NezhaModel
>>> # Initializing an Nezha configuration
>>> configuration = NezhaConfig()
>>> # Initializing a model (with random weights) from the Nezha-base style configuration model
>>> model = NezhaModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
# Archive map of pretrained NEZHA configuration files
pretrained_config_archive_map = NEZHA_PRETRAINED_CONFIG_ARCHIVE_MAP
# Model type identifier
model_type = "nezha"
# Constructor, creates a new configuration instance
def __init__(
self,
vocab_size=21128,  # vocabulary size, defaults to 21128
hidden_size=768,  # hidden size, defaults to 768
num_hidden_layers=12,  # number of hidden layers, defaults to 12
num_attention_heads=12,  # number of attention heads, defaults to 12
intermediate_size=3072,  # intermediate (feed-forward) size, defaults to 3072
hidden_act="gelu",  # hidden activation function, defaults to GELU
hidden_dropout_prob=0.1,  # hidden dropout probability, defaults to 0.1
attention_probs_dropout_prob=0.1,  # attention dropout probability, defaults to 0.1
max_position_embeddings=512,  # maximum number of positions, defaults to 512
max_relative_position=64,  # maximum relative position, defaults to 64
type_vocab_size=2,  # token type vocabulary size, defaults to 2
initializer_range=0.02,  # initializer range, defaults to 0.02
layer_norm_eps=1e-12,  # layer norm epsilon, defaults to 1e-12
classifier_dropout=0.1,  # classifier dropout probability, defaults to 0.1
pad_token_id=0,  # padding token id, defaults to 0
bos_token_id=2,  # beginning-of-sequence token id, defaults to 2
eos_token_id=3,  # end-of-sequence token id, defaults to 3
use_cache=True,  # whether to use the cache, defaults to True
**kwargs,  # any additional keyword arguments
):
# Call the parent constructor with the special token ids and the remaining keyword arguments
super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
# Store all configuration values on the instance
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.hidden_act = hidden_act
self.intermediate_size = intermediate_size
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.max_relative_position = max_relative_position
self.type_vocab_size = type_vocab_size
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.classifier_dropout = classifier_dropout
self.use_cache = use_cache
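A short sketch (not from the source) of overriding the defaults above, including the Nezha-specific `max_relative_position`:
```
# Hypothetical example: a smaller Nezha configuration with non-default hyperparameters.
from transformers import NezhaConfig, NezhaModel

config = NezhaConfig(hidden_size=256, num_hidden_layers=4, num_attention_heads=4, intermediate_size=1024)
model = NezhaModel(config)  # randomly initialized weights
print(config.max_relative_position)  # 64, the Nezha-specific relative-position bound
```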
.\models\nezha\modeling_nezha.py
"""PyTorch Nezha model."""
import math
import os
import warnings
from dataclasses import dataclass
from typing import List, Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN
from ...modeling_outputs import (
BaseModelOutputWithPastAndCrossAttentions,
BaseModelOutputWithPoolingAndCrossAttentions,
MaskedLMOutput,
MultipleChoiceModelOutput,
NextSentencePredictorOutput,
QuestionAnsweringModelOutput,
SequenceClassifierOutput,
TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import (
apply_chunking_to_forward,
find_pruneable_heads_and_indices,
prune_linear_layer,
)
from ...utils import (
ModelOutput,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_nezha import NezhaConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "sijunhe/nezha-cn-base"
_CONFIG_FOR_DOC = "NezhaConfig"
NEZHA_PRETRAINED_MODEL_ARCHIVE_LIST = [
"sijunhe/nezha-cn-base",
"sijunhe/nezha-cn-large",
"sijunhe/nezha-base-wwm",
"sijunhe/nezha-large-wwm",
]
def load_tf_weights_in_nezha(model, config, tf_checkpoint_path):
"""Load tf checkpoints in a pytorch model."""
try:
import re
import numpy as np
import tensorflow as tf
except ImportError:
logger.error(
"Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
"https://www.tensorflow.org/install/ for installation instructions."
)
raise
tf_path = os.path.abspath(tf_checkpoint_path)
logger.info(f"Converting TensorFlow checkpoint from {tf_path}")
init_vars = tf.train.list_variables(tf_path)
names = []
arrays = []
for name, shape in init_vars:
logger.info(f"Loading TF weight {name} with shape {shape}")
array = tf.train.load_variable(tf_path, name)
names.append(name)
arrays.append(array)
for name, array in zip(names, arrays):
name = name.split("/")
if any(
n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"]
for n in name
):
logger.info(f"Skipping {'/'.join(name)}")
continue
pointer = model
for m_name in name:
if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
scope_names = re.split(r"_(\d+)", m_name)
else:
scope_names = [m_name]
if scope_names[0] == "kernel" or scope_names[0] == "gamma":
pointer = getattr(pointer, "weight")
elif scope_names[0] == "output_bias" or scope_names[0] == "beta":
pointer = getattr(pointer, "bias")
elif scope_names[0] == "output_weights":
pointer = getattr(pointer, "weight")
elif scope_names[0] == "squad":
pointer = getattr(pointer, "classifier")
else:
try:
pointer = getattr(pointer, scope_names[0])
except AttributeError:
logger.info(f"Skipping {'/'.join(name)}")
continue
if len(scope_names) >= 2:
num = int(scope_names[1])
pointer = pointer[num]
if m_name[-11:] == "_embeddings":
pointer = getattr(pointer, "weight")
elif m_name == "kernel":
array = np.transpose(array)
try:
if pointer.shape != array.shape:
raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched")
except AssertionError as e:
e.args += (pointer.shape, array.shape)
raise
logger.info(f"Initialize PyTorch weight {name}")
pointer.data = torch.from_numpy(array)
return model
"""Implement the Functional Relative Position Encoding"""
def __init__(self, length, depth, max_relative_position=127):
super().__init__()
vocab_size = max_relative_position * 2 + 1
range_vec = torch.arange(length)
range_mat = range_vec.repeat(length).view(length, length)
distance_mat = range_mat - torch.t(range_mat)
distance_mat_clipped = torch.clamp(distance_mat, -max_relative_position, max_relative_position)
final_mat = distance_mat_clipped + max_relative_position
embeddings_table = torch.zeros(vocab_size, depth)
position = torch.arange(0, vocab_size, dtype=torch.int64).float().unsqueeze(1)
div_term = torch.exp(torch.arange(0, depth, 2).float() * (-math.log(10000.0) / depth))
embeddings_table[:, 0::2] = torch.sin(position * div_term)
embeddings_table[:, 1::2] = torch.cos(position * div_term)
flat_relative_positions_matrix = final_mat.view(-1)
one_hot_relative_positions_matrix = torch.nn.functional.one_hot(
flat_relative_positions_matrix, num_classes=vocab_size
).float()
positions_encoding = torch.matmul(one_hot_relative_positions_matrix, embeddings_table)
my_shape = list(final_mat.size())
my_shape.append(depth)
positions_encoding = positions_encoding.view(my_shape)
self.register_buffer("positions_encoding", positions_encoding, persistent=False)
def forward(self, length):
return self.positions_encoding[:length, :length, :]
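A toy-sized sanity check (illustrative only) of the table built above: for a requested length `L`, `forward` returns an `(L, L, depth)` slice of sinusoidal embeddings indexed by clipped pairwise distances, so each entry depends only on `j - i`.
```
# Toy-sized check of the relative-position encoding shape (illustrative only).
encoding = NezhaRelativePositionsEncoding(length=16, depth=8, max_relative_position=4)
table = encoding(10)   # slice for a 10-token sequence
print(table.shape)     # torch.Size([10, 10, 8])
# All diagonal entries share the distance-0 embedding, i.e. embeddings_table[max_relative_position].
```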
class NezhaEmbeddings(nn.Module):
"""Construct the embeddings from word and token_type embeddings."""
def __init__(self, config):
super().__init__()
self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.register_buffer(
"token_type_ids", torch.zeros((1, config.max_position_embeddings), dtype=torch.long), persistent=False
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
) -> torch.Tensor:
if input_ids is not None:
input_shape = input_ids.size()
else:
input_shape = inputs_embeds.size()[:-1]
seq_length = input_shape[1]
if inputs_embeds is None:
inputs_embeds = self.word_embeddings(input_ids)
if token_type_ids is None:
if hasattr(self, "token_type_ids"):
buffered_token_type_ids = self.token_type_ids[:, :seq_length]
buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
token_type_ids = buffered_token_type_ids_expanded
else:
token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=inputs_embeds.device)
token_type_embeddings = self.token_type_embeddings(token_type_ids)
embeddings = inputs_embeds + token_type_embeddings
embeddings = self.LayerNorm(embeddings)
embeddings = self.dropout(embeddings)
return embeddings
class NezhaSelfAttention(nn.Module):
def __init__(self, config):
super().__init__()
if config.hidden_size % config.num_attention_heads != 0:
raise ValueError(
f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
f"heads ({config.num_attention_heads})"
)
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.query = nn.Linear(config.hidden_size, self.all_head_size)
self.key = nn.Linear(config.hidden_size, self.all_head_size)
self.value = nn.Linear(config.hidden_size, self.all_head_size)
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
self.relative_positions_encoding = NezhaRelativePositionsEncoding(
length=config.max_position_embeddings,
depth=self.attention_head_size,
max_relative_position=config.max_relative_position,
)
self.is_decoder = config.is_decoder
def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
x = x.view(new_x_shape)
return x.permute(0, 2, 1, 3)
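`transpose_for_scores` splits the hidden dimension into `(num_heads, head_size)` and moves the head axis forward; a minimal shape sketch with toy sizes (not from the source):
```
# Toy illustration of the head-splitting reshape performed by transpose_for_scores.
import torch

batch, seq_len, hidden = 2, 5, 12
num_heads, head_size = 3, 4          # hidden == num_heads * head_size
x = torch.randn(batch, seq_len, hidden)

x = x.view(batch, seq_len, num_heads, head_size).permute(0, 2, 1, 3)
print(x.shape)  # torch.Size([2, 3, 5, 4]) -> (batch, num_heads, seq_len, head_size)
```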
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
output_attentions: Optional[bool] = False,
):
class NezhaSelfOutput(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
class NezhaAttention(nn.Module):
def __init__(self, config):
super().__init__()
self.self = NezhaSelfAttention(config)
self.output = NezhaSelfOutput(config)
self.pruned_heads = set()
def prune_heads(self, heads):
if len(heads) == 0:
return
heads, index = find_pruneable_heads_and_indices(
heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
)
self.self.query = prune_linear_layer(self.self.query, index)
self.self.key = prune_linear_layer(self.self.key, index)
self.self.value = prune_linear_layer(self.self.value, index)
self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
self.pruned_heads = self.pruned_heads.union(heads)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor]:
self_outputs = self.self(
hidden_states,
attention_mask,
head_mask,
encoder_hidden_states,
encoder_attention_mask,
past_key_value,
output_attentions,
)
attention_output = self.output(self_outputs[0], hidden_states)
outputs = (attention_output,) + self_outputs[1:]
return outputs
class NezhaIntermediate(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states
class NezhaOutput(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
class NezhaLayer(nn.Module):
def __init__(self, config):
super().__init__()
self.chunk_size_feed_forward = config.chunk_size_feed_forward
self.seq_len_dim = 1
self.attention = NezhaAttention(config)
self.is_decoder = config.is_decoder
self.add_cross_attention = config.add_cross_attention
if self.add_cross_attention:
if not self.is_decoder:
raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
self.crossattention = NezhaAttention(config)
self.intermediate = NezhaIntermediate(config)
self.output = NezhaOutput(config)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor]:
self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
self_attention_outputs = self.attention(
hidden_states,
attention_mask,
head_mask,
output_attentions=output_attentions,
past_key_value=self_attn_past_key_value,
)
attention_output = self_attention_outputs[0]
if self.is_decoder:
outputs = self_attention_outputs[1:-1]
present_key_value = self_attention_outputs[-1]
else:
outputs = self_attention_outputs[1:]
cross_attn_present_key_value = None
if self.is_decoder and encoder_hidden_states is not None:
if not hasattr(self, "crossattention"):
raise ValueError(
f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers"
" by setting `config.add_cross_attention=True`"
)
cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
cross_attention_outputs = self.crossattention(
attention_output,
attention_mask,
head_mask,
encoder_hidden_states,
encoder_attention_mask,
cross_attn_past_key_value,
output_attentions,
)
attention_output = cross_attention_outputs[0]
outputs = outputs + cross_attention_outputs[1:-1]
cross_attn_present_key_value = cross_attention_outputs[-1]
present_key_value = present_key_value + cross_attn_present_key_value
layer_output = apply_chunking_to_forward(
self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
)
outputs = (layer_output,) + outputs
if self.is_decoder:
outputs = outputs + (present_key_value,)
return outputs
def feed_forward_chunk(self, attention_output):
intermediate_output = self.intermediate(attention_output)
layer_output = self.output(intermediate_output, attention_output)
return layer_output
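`apply_chunking_to_forward` (imported from `...pytorch_utils`) runs `feed_forward_chunk` on slices of the sequence dimension and concatenates the results, trading a little compute for memory. A hedged sketch of that equivalence with a toy one-argument forward function (illustrative only):
```
# Sketch: chunked feed-forward over the sequence dimension (toy module, illustrative only).
import torch
from torch import nn
from transformers.pytorch_utils import apply_chunking_to_forward

ff = nn.Linear(8, 8)
hidden_states = torch.randn(2, 10, 8)  # (batch, seq_len, hidden)

def ff_chunk(chunk):
    # One-argument wrapper, mirroring NezhaLayer.feed_forward_chunk
    return ff(chunk)

# chunk_size=5 along dim=1 runs the wrapper twice on (2, 5, 8) slices and concatenates the results.
chunked = apply_chunking_to_forward(ff_chunk, 5, 1, hidden_states)
assert torch.allclose(chunked, ff(hidden_states), atol=1e-6)
```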
class NezhaEncoder(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
self.layer = nn.ModuleList([NezhaLayer(config) for _ in range(config.num_hidden_layers)])
self.gradient_checkpointing = False
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = False,
output_hidden_states: Optional[bool] = False,
return_dict: Optional[bool] = True,
) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
all_hidden_states = () if output_hidden_states else None
all_self_attentions = () if output_attentions else None
all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
if self.gradient_checkpointing and self.training:
if use_cache:
logger.warning_once(
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
)
use_cache = False
next_decoder_cache = () if use_cache else None
for i, layer_module in enumerate(self.layer):
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
layer_head_mask = head_mask[i] if head_mask is not None else None
past_key_value = past_key_values[i] if past_key_values is not None else None
if self.gradient_checkpointing and self.training:
layer_outputs = self._gradient_checkpointing_func(
layer_module.__call__,
hidden_states,
attention_mask,
layer_head_mask,
encoder_hidden_states,
encoder_attention_mask,
past_key_value,
output_attentions,
)
else:
layer_outputs = layer_module(
hidden_states,
attention_mask,
layer_head_mask,
encoder_hidden_states,
encoder_attention_mask,
past_key_value,
output_attentions,
)
hidden_states = layer_outputs[0]
if use_cache:
next_decoder_cache += (layer_outputs[-1],)
if output_attentions:
all_self_attentions = all_self_attentions + (layer_outputs[1],)
if self.config.add_cross_attention:
all_cross_attentions = all_cross_attentions + (layer_outputs[2],)
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
return tuple(
v
for v in [
hidden_states,
next_decoder_cache,
all_hidden_states,
all_self_attentions,
all_cross_attentions,
]
if v is not None
)
return BaseModelOutputWithPastAndCrossAttentions(
last_hidden_state=hidden_states,
past_key_values=next_decoder_cache,
hidden_states=all_hidden_states,
attentions=all_self_attentions,
cross_attentions=all_cross_attentions,
)
class NezhaPooler(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.activation = nn.Tanh()
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
first_token_tensor = hidden_states[:, 0]
pooled_output = self.dense(first_token_tensor)
pooled_output = self.activation(pooled_output)
return pooled_output
class NezhaPredictionHeadTransform(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
if isinstance(config.hidden_act, str):
self.transform_act_fn = ACT2FN[config.hidden_act]
else:
self.transform_act_fn = config.hidden_act
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.transform_act_fn(hidden_states)
hidden_states = self.LayerNorm(hidden_states)
return hidden_states
class NezhaLMPredictionHead(nn.Module):
def __init__(self, config):
super().__init__()
self.transform = NezhaPredictionHeadTransform(config)
self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
self.bias = nn.Parameter(torch.zeros(config.vocab_size))
self.decoder.bias = self.bias
def forward(self, hidden_states):
hidden_states = self.transform(hidden_states)
hidden_states = self.decoder(hidden_states)
return hidden_states
class NezhaOnlyMLMHead(nn.Module):
def __init__(self, config):
super().__init__()
self.predictions = NezhaLMPredictionHead(config)
def forward(self, sequence_output: torch.Tensor) -> torch.Tensor:
prediction_scores = self.predictions(sequence_output)
return prediction_scores
class NezhaOnlyNSPHead(nn.Module):
def __init__(self, config):
super().__init__()
self.seq_relationship = nn.Linear(config.hidden_size, 2)
def forward(self, pooled_output):
seq_relationship_score = self.seq_relationship(pooled_output)
return seq_relationship_score
class NezhaPreTrainingHeads(nn.Module):
def __init__(self, config):
super().__init__()
self.predictions = NezhaLMPredictionHead(config)
self.seq_relationship = nn.Linear(config.hidden_size, 2)
def forward(self, sequence_output, pooled_output):
prediction_scores = self.predictions(sequence_output)
seq_relationship_score = self.seq_relationship(pooled_output)
return prediction_scores, seq_relationship_score
class NezhaPreTrainedModel(PreTrainedModel):
"""
一个处理权重初始化、下载和加载预训练模型的抽象类。
"""
config_class = NezhaConfig
load_tf_weights = load_tf_weights_in_nezha
base_model_prefix = "nezha"
supports_gradient_checkpointing = True
def _init_weights(self, module):
"""初始化权重"""
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
@dataclass
class NezhaForPreTrainingOutput(ModelOutput):
"""
Output type of [`NezhaForPreTraining`].
"""
loss: Optional[torch.FloatTensor] = None
prediction_logits: torch.FloatTensor = None
seq_relationship_logits: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
"""
This constant defines a docstring for Nezha models, providing an overview and usage guidelines.
It inherits from `PreTrainedModel`, indicating that it leverages methods defined in the superclass
for tasks like downloading, saving, resizing embeddings, and pruning heads.
Additionally, it specifies that the model is a subclass of PyTorch's `torch.nn.Module`, implying
that it can be used as a regular PyTorch module. Users are directed to consult the PyTorch
documentation for general usage and behavior details.
Parameters:
config (`NezhaConfig`): A configuration object holding all model parameters. When initializing
with a config file, only configuration settings are loaded, not model weights. Refer to
`PreTrainedModel.from_pretrained` to load both configuration and weights.
"""
Args:
input_ids (`torch.LongTensor` of shape `({0})`):
attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
output_attentions (`bool`, *optional*):
output_hidden_states (`bool`, *optional*):
return_dict (`bool`, *optional*):
@add_start_docstrings(
"The bare Nezha Model transformer outputting raw hidden-states without any specific head on top.",
NEZHA_START_DOCSTRING,
)
class NezhaModel(NezhaPreTrainedModel):
"""
# 初始化函数
def __init__(self, config, add_pooling_layer=True):
# 调用父类初始化函数
super().__init__(config)
# 保存配置
self.config = config
# 初始化嵌入层
self.embeddings = NezhaEmbeddings(config)
# 初始化编码器
self.encoder = NezhaEncoder(config)
# 如果需要添加池化层,则初始化池化层
self.pooler = NezhaPooler(config) if add_pooling_layer else None
# 初始化权重并应用最终处理
self.post_init()
# 获取输入嵌入层
def get_input_embeddings(self):
return self.embeddings.word_embeddings
# 设置输入嵌入层
def set_input_embeddings(self, value):
self.embeddings.word_embeddings = value
# 剪枝模型中的注意力头
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
# 前向传播函数
@add_start_docstrings_to_model_forward(NEZHA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPoolingAndCrossAttentions,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
# The multi-line string below documents the two pretraining heads
# (masked language modeling and next sentence prediction / classification).
@add_start_docstrings(
"""
Nezha Model with two heads on top as done during the pretraining: a `masked language modeling` head and a `next
sentence prediction (classification)` head.
""",
NEZHA_START_DOCSTRING,
)
class NezhaForPreTraining(NezhaPreTrainedModel):
_tied_weights_keys = ["cls.predictions.decoder"]
def __init__(self, config):
super().__init__(config)
self.nezha = NezhaModel(config)
self.cls = NezhaPreTrainingHeads(config)
self.post_init()
def get_output_embeddings(self):
return self.cls.predictions.decoder
def set_output_embeddings(self, new_embeddings):
self.cls.predictions.decoder = new_embeddings
@add_start_docstrings_to_model_forward(NEZHA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=NezhaForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
next_sentence_label: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
@add_start_docstrings("""Nezha Model with a `language modeling` head on top.""", NEZHA_START_DOCSTRING)
class NezhaForMaskedLM(NezhaPreTrainedModel):
_tied_weights_keys = ["cls.predictions.decoder"]
def __init__(self, config):
super().__init__(config)
if config.is_decoder:
logger.warning(
"If you want to use `NezhaForMaskedLM` make sure `config.is_decoder=False` for "
"bi-directional self-attention."
)
self.nezha = NezhaModel(config, add_pooling_layer=False)
self.cls = NezhaOnlyMLMHead(config)
self.post_init()
def get_output_embeddings(self):
return self.cls.predictions.decoder
def set_output_embeddings(self, new_embeddings):
self.cls.predictions.decoder = new_embeddings
@add_start_docstrings_to_model_forward(NEZHA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=MaskedLMOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], MaskedLMOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.nezha(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
prediction_scores = self.cls(sequence_output)
masked_lm_loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
if not return_dict:
output = (prediction_scores,) + outputs[2:]
return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
return MaskedLMOutput(
loss=masked_lm_loss,
logits=prediction_scores,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_kwargs):
input_shape = input_ids.shape
effective_batch_size = input_shape[0]
if self.config.pad_token_id is None:
raise ValueError("The PAD token should be defined for generation")
attention_mask = torch.cat([attention_mask, attention_mask.new_zeros((attention_mask.shape[0], 1))], dim=-1)
dummy_token = torch.full(
(effective_batch_size, 1), self.config.pad_token_id, dtype=torch.long, device=input_ids.device
)
input_ids = torch.cat([input_ids, dummy_token], dim=1)
return {"input_ids": input_ids, "attention_mask": attention_mask}
@add_start_docstrings(
"""Nezha Model with a `next sentence prediction (classification)` head on top.""",
NEZHA_START_DOCSTRING,
)
class NezhaForNextSentencePrediction(NezhaPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.nezha = NezhaModel(config)
self.cls = NezhaOnlyNSPHead(config)
self.post_init()
@add_start_docstrings_to_model_forward(NEZHA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=NextSentencePredictorOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
**kwargs,
) -> Union[Tuple[torch.Tensor], NextSentencePredictorOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair
(see `input_ids` docstring). Indices should be in `[0, 1]`:
- 0 indicates sequence B is a continuation of sequence A,
- 1 indicates sequence B is a random sequence.
Returns:
A [`NextSentencePredictorOutput`] if `return_dict=True`, otherwise a plain tuple containing the same elements.
Example:
```
>>> from transformers import AutoTokenizer, NezhaForNextSentencePrediction
>>> import torch
>>> tokenizer = AutoTokenizer.from_pretrained("sijunhe/nezha-cn-base")
>>> model = NezhaForNextSentencePrediction.from_pretrained("sijunhe/nezha-cn-base")
>>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
>>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
>>> encoding = tokenizer(prompt, next_sentence, return_tensors="pt")
>>> outputs = model(**encoding, labels=torch.LongTensor([1]))
>>> logits = outputs.logits
>>> assert logits[0, 0] < logits[0, 1] # next sentence was random
```
"""
if "next_sentence_label" in kwargs:
warnings.warn(
"The `next_sentence_label` argument is deprecated and will be removed in a future version, use"
" `labels` instead.",
FutureWarning,
)
labels = kwargs.pop("next_sentence_label")
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.nezha(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
pooled_output = outputs[1]
seq_relationship_scores = self.cls(pooled_output)
next_sentence_loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
next_sentence_loss = loss_fct(seq_relationship_scores.view(-1, 2), labels.view(-1))
if not return_dict:
output = (seq_relationship_scores,) + outputs[2:]
return ((next_sentence_loss,) + output) if next_sentence_loss is not None else output
return NextSentencePredictorOutput(
loss=next_sentence_loss,
logits=seq_relationship_scores,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
@add_start_docstrings(
"""
Nezha Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
output) e.g. for GLUE tasks.
""",
NEZHA_START_DOCSTRING,
)
class NezhaForSequenceClassification(NezhaPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.config = config
self.nezha = NezhaModel(config)
classifier_dropout = (
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
self.dropout = nn.Dropout(classifier_dropout)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.post_init()
@add_start_docstrings_to_model_forward(NEZHA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=SequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.nezha(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
pooled_output = outputs[1]
pooled_output = self.dropout(pooled_output)
logits = self.classifier(pooled_output)
loss = None
if labels is not None:
if self.config.problem_type is None:
if self.num_labels == 1:
self.config.problem_type = "regression"
elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
self.config.problem_type = "single_label_classification"
else:
self.config.problem_type = "multi_label_classification"
if self.config.problem_type == "regression":
loss_fct = MSELoss()
if self.num_labels == 1:
loss = loss_fct(logits.squeeze(), labels.squeeze())
else:
loss = loss_fct(logits, labels)
elif self.config.problem_type == "single_label_classification":
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(logits, labels)
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return SequenceClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
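The `problem_type` branch above picks MSE, cross-entropy, or BCE-with-logits from `num_labels` and the label dtype. A compact, hypothetical restatement of that decision rule (the function name `infer_problem_type` is ours, not the library's):
```
# Standalone restatement of the problem_type selection used above (illustrative only).
import torch

def infer_problem_type(num_labels: int, labels: torch.Tensor) -> str:
    if num_labels == 1:
        return "regression"                   # MSELoss on squeezed logits
    if labels.dtype in (torch.long, torch.int):
        return "single_label_classification"  # CrossEntropyLoss
    return "multi_label_classification"       # BCEWithLogitsLoss on float targets

print(infer_problem_type(1, torch.tensor([0.3])))              # regression
print(infer_problem_type(3, torch.tensor([2])))                # single_label_classification
print(infer_problem_type(3, torch.tensor([[1.0, 0.0, 1.0]])))  # multi_label_classification
```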
@add_start_docstrings(
"""
Nezha Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
softmax) e.g. for RocStories/SWAG tasks.
""",
NEZHA_START_DOCSTRING,
)
class NezhaForMultipleChoice(NezhaPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.nezha = NezhaModel(config)
classifier_dropout = (
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
self.dropout = nn.Dropout(classifier_dropout)
self.classifier = nn.Linear(config.hidden_size, 1)
self.post_init()
@add_start_docstrings_to_model_forward(NEZHA_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=MultipleChoiceModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], MultipleChoiceModelOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
`input_ids` above)
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
inputs_embeds = (
inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
if inputs_embeds is not None
else None
)
outputs = self.nezha(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
pooled_output = outputs[1]
print(pooled_output.shape)
pooled_output = self.dropout(pooled_output)
logits = self.classifier(pooled_output)
print(logits.shape)
print(num_choices)
reshaped_logits = logits.view(-1, num_choices)
loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
loss = loss_fct(reshaped_logits, labels)
if not return_dict:
output = (reshaped_logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return MultipleChoiceModelOutput(
loss=loss,
logits=reshaped_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
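Multiple-choice inputs arrive as `(batch_size, num_choices, seq_len)`, are flattened to one row per choice before the encoder, and the per-choice scores are folded back into `(batch_size, num_choices)`. A toy shape walk-through (illustrative only; the random tensors stand in for real model outputs):
```
# Toy shape walk-through of the flatten / reshape used for multiple choice (illustrative only).
import torch

batch_size, num_choices, seq_len, hidden = 2, 4, 6, 8
input_ids = torch.randint(0, 100, (batch_size, num_choices, seq_len))

flat_ids = input_ids.view(-1, input_ids.size(-1))      # (8, 6): one row per (example, choice)
pooled = torch.randn(flat_ids.size(0), hidden)          # stand-in for the pooled encoder output
logits = torch.nn.Linear(hidden, 1)(pooled)              # one score per (example, choice)
reshaped_logits = logits.view(-1, num_choices)           # (2, 4): scores grouped per example
print(reshaped_logits.shape)  # torch.Size([2, 4])
```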
@add_start_docstrings(
"""
Nezha Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
Named-Entity-Recognition (NER) tasks.
""",
NEZHA_START_DOCSTRING,
)
class NezhaForTokenClassification(NezhaPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.nezha = NezhaModel(config, add_pooling_layer=False)
classifier_dropout = (
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
self.dropout = nn.Dropout(classifier_dropout)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.post_init()
@add_start_docstrings_to_model_forward(NEZHA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TokenClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.nezha(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
sequence_output = self.dropout(sequence_output)
logits = self.classifier(sequence_output)
loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return TokenClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
"""
Nezha Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
"""
@add_start_docstrings(NEZHA_START_DOCSTRING)
class NezhaForQuestionAnswering(NezhaPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.nezha = NezhaModel(config, add_pooling_layer=False)
self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
self.post_init()
@add_start_docstrings_to_model_forward(NEZHA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=QuestionAnsweringModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
start_positions: Optional[torch.Tensor] = None,
end_positions: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], QuestionAnsweringModelOutput]:
r"""
start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.nezha(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
logits = self.qa_outputs(sequence_output)
start_logits, end_logits = logits.split(1, dim=-1)
start_logits = start_logits.squeeze(-1).contiguous()
end_logits = end_logits.squeeze(-1).contiguous()
total_loss = None
if start_positions is not None and end_positions is not None:
if len(start_positions.size()) > 1:
start_positions = start_positions.squeeze(-1)
if len(end_positions.size()) > 1:
end_positions = end_positions.squeeze(-1)
ignored_index = start_logits.size(1)
start_positions = start_positions.clamp(0, ignored_index)
end_positions = end_positions.clamp(0, ignored_index)
loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
start_loss = loss_fct(start_logits, start_positions)
end_loss = loss_fct(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2
if not return_dict:
output = (start_logits, end_logits) + outputs[2:]
return ((total_loss,) + output) if total_loss is not None else output
return QuestionAnsweringModelOutput(
loss=total_loss,
start_logits=start_logits,
end_logits=end_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
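The QA head emits two scores per token; `split(1, dim=-1)` separates them into start and end logits, from which a greedy answer span can be read with an argmax. A hedged post-processing sketch (illustrative only):
```
# Sketch: splitting QA logits and reading off a greedy answer span (illustrative only).
import torch

seq_len = 10
logits = torch.randn(1, seq_len, 2)     # stand-in for the qa_outputs projection of one example

start_logits, end_logits = logits.split(1, dim=-1)
start_logits = start_logits.squeeze(-1)  # (1, seq_len)
end_logits = end_logits.squeeze(-1)

start_index = int(start_logits.argmax(dim=-1))
end_index = int(end_logits.argmax(dim=-1))
print(start_index, end_index)  # greedy span; real pipelines also enforce start <= end
```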
.\models\nezha\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_tokenizers_available, is_torch_available
_import_structure = {
"configuration_nezha": ["NEZHA_PRETRAINED_CONFIG_ARCHIVE_MAP", "NezhaConfig"],
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_nezha"] = [
"NEZHA_PRETRAINED_MODEL_ARCHIVE_LIST",
"NezhaForNextSentencePrediction",
"NezhaForMaskedLM",
"NezhaForPreTraining",
"NezhaForMultipleChoice",
"NezhaForQuestionAnswering",
"NezhaForSequenceClassification",
"NezhaForTokenClassification",
"NezhaModel",
"NezhaPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_nezha import NEZHA_PRETRAINED_CONFIG_ARCHIVE_MAP, NezhaConfig
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_nezha import (
NEZHA_PRETRAINED_MODEL_ARCHIVE_LIST,
NezhaForMaskedLM,
NezhaForMultipleChoice,
NezhaForNextSentencePrediction,
NezhaForPreTraining,
NezhaForQuestionAnswering,
NezhaForSequenceClassification,
NezhaForTokenClassification,
NezhaModel,
NezhaPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\nllb\tokenization_nllb.py
import os
from shutil import copyfile
from typing import Any, Dict, List, Optional, Tuple
import sentencepiece as spm
from ...tokenization_utils import AddedToken, BatchEncoding, PreTrainedTokenizer
from ...utils import logging
logger = logging.get_logger(__name__)
SPIECE_UNDERLINE = "▁"
VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"facebook/nllb-200-distilled-600M": (
"https://huggingface.co/facebook/nllb-200-distilled-600M/blob/main/sentencepiece.bpe.model"
),
}
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"facebook/nllb-200-distilled-600M": 1024,
}
FAIRSEQ_LANGUAGE_CODES = ['ace_Arab', 'ace_Latn', 'acm_Arab', 'acq_Arab', 'aeb_Arab', 'afr_Latn', 'ajp_Arab', 'aka_Latn', 'amh_Ethi', 'apc_Arab', 'arb_Arab', 'ars_Arab', 'ary_Arab', 'arz_Arab', 'asm_Beng', 'ast_Latn', 'awa_Deva', 'ayr_Latn', 'azb_Arab', 'azj_Latn', 'bak_Cyrl', 'bam_Latn', 'ban_Latn', 'bel_Cyrl', 'bem_Latn', 'ben_Beng', 'bho_Deva', 'bjn_Arab', 'bjn_Latn', 'bod_Tibt', 'bos_Latn', 'bug_Latn', 'bul_Cyrl', 'cat_Latn', 'ceb_Latn', 'ces_Latn', 'cjk_Latn', 'ckb_Arab', 'crh_Latn', 'cym_Latn', 'dan_Latn', 'deu_Latn', 'dik_Latn', 'dyu_Latn', 'dzo_Tibt', 'ell_Grek', 'eng_Latn', 'epo_Latn', 'est_Latn', 'eus_Latn', 'ewe_Latn', 'fao_Latn', 'pes_Arab', 'fij_Latn', 'fin_Latn', 'fon_Latn', 'fra_Latn', 'fur_Latn', 'fuv_Latn', 'gla_Latn', 'gle_Latn', 'glg_Latn', 'grn_Latn', 'guj_Gujr', 'hat_Latn', 'hau_Latn', 'heb_Hebr', 'hin_Deva', 'hne_Deva', 'hrv_Latn', 'hun_Latn', 'hye_Armn', 'ibo_Latn', 'ilo_Latn', 'ind_Latn', 'isl_Latn', 'ita_Latn', 'jav_Latn', 'jpn_Jpan', 'kab_Latn', 'kac_Latn', 'kam_Latn', 'kan_Knda', 'kas_Arab', 'kas_Deva', 'kat_Geor', 'knc_Arab', 'knc_Latn', 'kaz_Cyrl', 'kbp_Latn', 'kea_Latn', 'khm_Khmr', 'kik_Latn', 'kin_Latn', 'kir_Cyrl', 'kmb_Latn', 'kon_Latn', 'kor_Hang', 'kmr_Latn', 'lao_Laoo', 'lvs_Latn', 'lij_Latn', 'lim_Latn', 'lin_Latn', 'lit_Latn', 'lmo_Latn', 'ltg_Latn', 'ltz_Latn', 'lua_Latn', 'lug_Latn', 'luo_Latn', 'lus_Latn', 'mag_Deva', 'mai_Deva', 'mal_Mlym', 'mar_Deva', 'min_Latn', 'mkd_Cyrl', 'plt_Latn', 'mlt_Latn', 'mni_Beng', 'khk_Cyrl', 'mos_Latn', 'mri_Latn', 'zsm_Latn', 'mya_Mymr', 'nld_Latn', 'nno_Latn', 'nob_Latn', 'npi_Deva', 'nso_Latn', 'nus_Latn', 'nya_Latn', 'oci_Latn', 'gaz_Latn', 'ory_Orya', 'pag_Latn', 'pan_Guru', 'pap_Latn', 'pol_Latn', 'por_Latn', 'prs_Arab', 'pbt_Arab', 'quy_Latn', 'ron_Latn', 'run_Latn', 'rus_Cyrl', 'sag_Latn', 'san_Deva', 'sat_Beng', 'scn_Latn', 'shn_Mymr', 'sin_Sinh', 'slk_Latn', 'slv_Latn', 'smo_Latn', 'sna_Latn', 'snd_Arab', 'som_Latn', 'sot_Latn', 'spa_Latn', 'als_Latn', 'srd_Latn', 'srp_Cyrl', 'ssw_Latn', 'sun_Latn', 'swe_Latn', 'swh_Latn', 'szl_Latn', 'tam_Taml', 'tat_Cyrl', 'tel_Telu', 'tgk_Cyrl', 'tgl_Latn', 'tha_Thai', 'tir_Ethi', 'taq_Latn', 'taq_Tfng', 'tpi_Latn', 'tsn_Latn', 'tso_Latn', 'tuk_Latn', 'tum_Latn', 'tur_Latn', 'twi_Latn', 'tzm_Tfng', 'uig_Arab', 'ukr_Cyrl', 'umb_Latn', 'urd_Arab', 'uzn_Latn', 'vec_Latn', 'vie_Latn', 'war_Latn', 'wol_Latn', 'xho_Latn', 'ydd_Hebr', 'yor_Latn', 'yue_Hant', 'zho_Hans', 'zho_Hant', 'zul_Latn']
class NllbTokenizer(PreTrainedTokenizer):
"""
Construct an NLLB tokenizer.
Adapted from RobertaTokenizer and XLNetTokenizer. Based on SentencePiece (https://github.com/google/sentencepiece).
The tokenization method is '<tokens> <eos> <language code>' for source language documents, and
'<language code> <tokens> <eos>' for target language documents.
Examples:
```
>>> from transformers import NllbTokenizer
>>> tokenizer = NllbTokenizer.from_pretrained(
...     "facebook/nllb-200-distilled-600M", src_lang="eng_Latn", tgt_lang="fra_Latn"
... )
```
"""
vocab_file (`str`):
Path to the vocabulary file.
bos_token (`str`, *optional*, defaults to `"<s>"`):
The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.
eos_token (`str`, *optional*, defaults to `"</s>"`):
The end of sequence token.
sep_token (`str`, *optional*, defaults to `"</s>"`):
The separator token, which is used when building a sequence from multiple sequences.
cls_token (`str`, *optional*, defaults to `"<s>"`):
The classifier token which is used when doing sequence classification.
unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token.
pad_token (`str`, *optional*, defaults to `"<pad>"`):
The token used for padding.
mask_token (`str`, *optional*, defaults to `"<mask>"`):
The token used for masking values.
tokenizer_file (`str`, *optional*):
The path to a tokenizer file to use instead of the vocab file.
src_lang (`str`, *optional*):
The language to use as source language for translation.
tgt_lang (`str`, *optional*):
The language to use as target language for translation.
sp_model_kwargs (`Dict[str, Any]`, *optional*):
Additional keyword arguments passed to the `SentencePieceProcessor` initialization.
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
model_input_names = ["input_ids", "attention_mask"]
prefix_tokens: List[int] = []
suffix_tokens: List[int] = []
def __init__(
self,
vocab_file,
bos_token="<s>",
eos_token="</s>",
sep_token="</s>",
cls_token="<s>",
unk_token="<unk>",
pad_token="<pad>",
mask_token="<mask>",
tokenizer_file=None,
src_lang=None,
tgt_lang=None,
sp_model_kwargs: Optional[Dict[str, Any]] = None,
additional_special_tokens=None,
legacy_behaviour=False,
**kwargs,
):
def __getstate__(self):
state = self.__dict__.copy()
state["sp_model"] = None
state["sp_model_proto"] = self.sp_model.serialized_model_proto()
return state
def __setstate__(self, d):
self.__dict__ = d
if not hasattr(self, "sp_model_kwargs"):
self.sp_model_kwargs = {}
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.LoadFromSerializedProto(self.sp_model_proto)
@property
def vocab_size(self):
return len(self.sp_model) + self.fairseq_offset
@property
def src_lang(self) -> str:
return self._src_lang
@property
def lang_code_to_id(self):
logger.warning_once(
"the `lang_code_to_id` attribute is deprecated. The logic is natively handled in `tokenizer.added_tokens_decoder`;"
" this attribute will be removed in `transformers` v4.38"
)
return self._lang_code_to_id
@property
def fairseq_tokens_to_ids(self):
logger.warning_once(
"the `fairseq_tokens_to_ids` attribute is deprecated. The logic is natively handled in `tokenizer.added_tokens_decoder`;"
" this attribute will be removed in `transformers` v4.38"
)
return self._fairseq_tokens_to_ids
@property
def id_to_lang_code(self):
logger.warning_once(
"the `id_to_lang_code` attribute is deprecated. The logic is natively handled in `tokenizer.added_tokens_decoder`;"
" this attribute will be removed in `transformers` v4.38"
)
return self._id_to_lang_code
@property
def fairseq_ids_to_tokens(self):
logger.warning_once(
"the `fairseq_ids_to_tokens` attribute is deprecated. The logic is natively handled in `tokenizer.added_tokens_decoder`;"
" this attribute will be removed in `transformers` v4.38"
)
return self._fairseq_ids_to_tokens
@src_lang.setter
def src_lang(self, new_src_lang: str) -> None:
self._src_lang = new_src_lang
self.set_src_lang_special_tokens(self._src_lang)
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` method.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
return super().get_special_tokens_mask(
token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
)
prefix_ones = [1] * len(self.prefix_tokens)
suffix_ones = [1] * len(self.suffix_tokens)
if token_ids_1 is None:
return prefix_ones + ([0] * len(token_ids_0)) + suffix_ones
return prefix_ones + ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones
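The mask simply marks every prefix and suffix token with 1 and every regular token with 0. A minimal, self-contained re-implementation of that logic (a hypothetical helper, not part of the library) makes the shapes explicit:
```
from typing import List, Optional

def special_tokens_mask(
    token_ids_0: List[int],
    token_ids_1: Optional[List[int]] = None,
    prefix_len: int = 1,  # e.g. [src_lang_code] in the default (non-legacy) mode
    suffix_len: int = 1,  # e.g. [eos]
) -> List[int]:
    # 1 for special (prefix/suffix) positions, 0 for sequence tokens
    prefix_ones = [1] * prefix_len
    suffix_ones = [1] * suffix_len
    if token_ids_1 is None:
        return prefix_ones + [0] * len(token_ids_0) + suffix_ones
    return prefix_ones + [0] * len(token_ids_0) + [0] * len(token_ids_1) + suffix_ones

print(special_tokens_mask([100, 200, 300]))         # [1, 0, 0, 0, 1]
print(special_tokens_mask([100, 200], [300, 400]))  # [1, 0, 0, 0, 0, 1]
```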
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. An NLLB sequence has the following format, where `X` represents the sequence:
- `input_ids` (for encoder) `X [eos, src_lang_code]`
- `decoder_input_ids`: (for decoder) `X [eos, tgt_lang_code]`
BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a
separator.
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
if token_ids_1 is None:
return self.prefix_tokens + token_ids_0 + self.suffix_tokens
return self.prefix_tokens + token_ids_0 + token_ids_1 + self.suffix_tokens
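A quick sanity check of the resulting layout. This is a sketch that assumes the `facebook/nllb-200-distilled-600M` checkpoint can be downloaded; in the default (non-legacy) mode the language code is the prefix and `</s>` the suffix:
```
from transformers import NllbTokenizer

tokenizer = NllbTokenizer.from_pretrained(
    "facebook/nllb-200-distilled-600M", src_lang="eng_Latn"
)
ids = tokenizer("Hello world")["input_ids"]
print(tokenizer.convert_ids_to_tokens(ids))
# Expected form: ['eng_Latn', '▁Hello', '▁world', '</s>']
```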
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. nllb does not
make use of token type ids, therefore a list of zeros is returned.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of zeros.
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
def _build_translation_inputs(
self, raw_inputs, return_tensors: str, src_lang: Optional[str], tgt_lang: Optional[str], **extra_kwargs
):
"""Used by translation pipeline, to prepare inputs for the generate function"""
if src_lang is None or tgt_lang is None:
raise ValueError("Translation requires a `src_lang` and a `tgt_lang` for this model")
self.src_lang = src_lang
inputs = self(raw_inputs, add_special_tokens=True, return_tensors=return_tensors, **extra_kwargs)
tgt_lang_id = self.convert_tokens_to_ids(tgt_lang)
inputs["forced_bos_token_id"] = tgt_lang_id
return inputs
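`_build_translation_inputs` is what the `translation` pipeline calls under the hood: it encodes the source text with `src_lang` set and stores the target language id as `forced_bos_token_id`, so that `generate` starts decoding with the right language code. A hedged end-to-end sketch, assuming the `facebook/nllb-200-distilled-600M` checkpoint is available:
```
from transformers import pipeline

translator = pipeline(
    "translation",
    model="facebook/nllb-200-distilled-600M",
    src_lang="eng_Latn",
    tgt_lang="fra_Latn",
)
print(translator("UN Chief says there is no military solution in Syria"))
```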
def get_vocab(self):
vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
vocab.update(self.added_tokens_encoder)
return vocab
def _tokenize(self, text: str) -> List[str]:
return self.sp_model.encode(text, out_type=str)
def _convert_token_to_id(self, token):
"""Converts a token (str) into an ID using the vocabulary."""
spm_id = self.sp_model.PieceToId(token)
return spm_id + self.fairseq_offset if spm_id else self.unk_token_id
def _convert_id_to_token(self, index):
"""Converts an index (integer) into a token (str) using the vocabulary."""
return self.sp_model.IdToPiece(index - self.fairseq_offset)
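Note the `if spm_id` guard above: `sp_model.PieceToId` returns 0 for out-of-vocabulary pieces, so 0 is mapped to the unknown-token id instead of being shifted by `fairseq_offset`. A toy illustration of the arithmetic with purely hypothetical values (the real offset and unk id come from the tokenizer's fairseq alignment):
```
# Hypothetical values, chosen only to illustrate the offset bookkeeping.
fairseq_offset = 1
unk_token_id = 3

def token_to_id(spm_id: int) -> int:
    # PieceToId returns 0 for unknown pieces, hence the falsy check
    return spm_id + fairseq_offset if spm_id else unk_token_id

def id_to_spm_index(index: int) -> int:
    return index - fairseq_offset

print(token_to_id(5), token_to_id(0), id_to_spm_index(6))  # 6 3 5
```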
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (strings for sub-words) into a single string."""
out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
return out_string
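The detokenization step only needs to undo the SentencePiece whitespace marker; no model files are required to see it in action:
```
SPIECE_UNDERLINE = "▁"
tokens = ["▁Hello", "▁world", "!"]
print("".join(tokens).replace(SPIECE_UNDERLINE, " ").strip())  # -> "Hello world!"
```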
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
out_vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
copyfile(self.vocab_file, out_vocab_file)
elif not os.path.isfile(self.vocab_file):
with open(out_vocab_file, "wb") as fi:
content_spiece_model = self.sp_model.serialized_model_proto()
fi.write(content_spiece_model)
return (out_vocab_file,)
def prepare_seq2seq_batch(
self,
src_texts: List[str],
src_lang: str = "eng_Latn",
tgt_texts: Optional[List[str]] = None,
tgt_lang: str = "fra_Latn",
**kwargs,
) -> BatchEncoding:
self.src_lang = src_lang
self.tgt_lang = tgt_lang
return super().prepare_seq2seq_batch(src_texts, tgt_texts, **kwargs)
def _switch_to_input_mode(self):
return self.set_src_lang_special_tokens(self.src_lang)
def _switch_to_target_mode(self):
return self.set_tgt_lang_special_tokens(self.tgt_lang)
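These two hooks let the regular `__call__` API prepare encoder and decoder inputs with the right language codes: when `text_target` is passed, the tokenizer temporarily switches to target mode. A usage sketch, again assuming the `facebook/nllb-200-distilled-600M` checkpoint:
```
from transformers import NllbTokenizer

tokenizer = NllbTokenizer.from_pretrained(
    "facebook/nllb-200-distilled-600M", src_lang="eng_Latn", tgt_lang="fra_Latn"
)
batch = tokenizer(
    ["UN Chief says there is no military solution in Syria"],
    text_target=["Le chef de l'ONU dit qu'il n'y a pas de solution militaire en Syrie"],
    padding=True,
    return_tensors="pt",
)
print(batch["input_ids"].shape, batch["labels"].shape)
```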
def set_src_lang_special_tokens(self, src_lang) -> None:
"""Reset the special tokens to the source lang setting.
- In legacy mode: No prefix and suffix=[eos, src_lang_code].
- In default mode: Prefix=[src_lang_code], suffix = [eos]
"""
self.cur_lang_code = self.convert_tokens_to_ids(src_lang)
if self.legacy_behaviour:
self.prefix_tokens = []
self.suffix_tokens = [self.eos_token_id, self.cur_lang_code]
else:
self.prefix_tokens = [self.cur_lang_code]
self.suffix_tokens = [self.eos_token_id]
def set_tgt_lang_special_tokens(self, lang: str) -> None:
"""Reset the special tokens to the target lang setting.
- In legacy mode: No prefix and suffix=[eos, tgt_lang_code].
- In default mode: Prefix=[tgt_lang_code], suffix = [eos]
"""
self.cur_lang_code = self.convert_tokens_to_ids(lang)
if self.legacy_behaviour:
self.prefix_tokens = []
self.suffix_tokens = [self.eos_token_id, self.cur_lang_code]
else:
self.prefix_tokens = [self.cur_lang_code]
self.suffix_tokens = [self.eos_token_id]
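The two modes only differ in where the language code goes. With placeholder ids the layouts look like this (pure illustration; the ids are not real vocabulary entries):
```
eos_id, lang_code_id, x = 2, 999999, [10, 11, 12]

legacy = [] + x + [eos_id, lang_code_id]    # legacy mode:  X </s> lang_code
default = [lang_code_id] + x + [eos_id]     # default mode: lang_code X </s>
print(legacy)
print(default)
```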
.\models\nllb\tokenization_nllb_fast.py
import os
from shutil import copyfile
from typing import List, Optional, Tuple
from tokenizers import processors
from ...tokenization_utils import AddedToken, BatchEncoding
from ...tokenization_utils_fast import PreTrainedTokenizerFast
from ...utils import is_sentencepiece_available, logging
if is_sentencepiece_available():
from .tokenization_nllb import NllbTokenizer
else:
NllbTokenizer = None
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model", "tokenizer_file": "tokenizer.json"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"facebook/nllb-200-distilled-600M": (
"https://huggingface.co/facebook/nllb-200-distilled-600M/resolve/main/sentencepiece.bpe.model"
),
},
"tokenizer_file": {
"facebook/nllb-200-distilled-600M": (
"https://huggingface.co/facebook/nllb-200-distilled-600M/resolve/main/tokenizer.json"
),
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"facebook/nllb-large-en-ro": 1024,
"facebook/nllb-200-distilled-600M": 1024,
}
FAIRSEQ_LANGUAGE_CODES = ['ace_Arab', 'ace_Latn', 'acm_Arab', 'acq_Arab', 'aeb_Arab', 'afr_Latn', 'ajp_Arab', 'aka_Latn', 'amh_Ethi', 'apc_Arab', 'arb_Arab', 'ars_Arab', 'ary_Arab', 'arz_Arab', 'asm_Beng', 'ast_Latn', 'awa_Deva', 'ayr_Latn', 'azb_Arab', 'azj_Latn', 'bak_Cyrl', 'bam_Latn', 'ban_Latn', 'bel_Cyrl', 'bem_Latn', 'ben_Beng', 'bho_Deva', 'bjn_Arab', 'bjn_Latn', 'bod_Tibt', 'bos_Latn', 'bug_Latn', 'bul_Cyrl', 'cat_Latn', 'ceb_Latn', 'ces_Latn', 'cjk_Latn', 'ckb_Arab', 'crh_Latn', 'cym_Latn', 'dan_Latn', 'deu_Latn', 'dik_Latn', 'dyu_Latn', 'dzo_Tibt', 'ell_Grek', 'eng_Latn', 'epo_Latn', 'est_Latn', 'eus_Latn', 'ewe_Latn', 'fao_Latn', 'pes_Arab', 'fij_Latn', 'fin_Latn', 'fon_Latn', 'fra_Latn', 'fur_Latn', 'fuv_Latn', 'gla_Latn', 'gle_Latn', 'glg_Latn', 'grn_Latn', 'guj_Gujr', 'hat_Latn', 'hau_Latn', 'heb_Hebr', 'hin_Deva', 'hne_Deva', 'hrv_Latn', 'hun_Latn', 'hye_Armn', 'ibo_Latn', 'ilo_Latn', 'ind_Latn', 'isl_Latn', 'ita_Latn', 'jav_Latn', 'jpn_Jpan', 'kab_Latn', 'kac_Latn', 'kam_Latn', 'kan_Knda', 'kas_Arab', 'kas_Deva', 'kat_Geor', 'knc_Arab', 'knc_Latn', 'kaz_Cyrl', 'kbp_Latn', 'kea_Latn', 'khm_Khmr', 'kik_Latn', 'kin_Latn', 'kir_Cyrl', 'kmb_Latn', 'kon_Latn', 'kor_Hang', 'kmr_Latn', 'lao_Laoo', 'lvs_Latn', 'lij_Latn', 'lim_Latn', 'lin_Latn', 'lit_Latn', 'lmo_Latn', 'ltg_Latn', 'ltz_Latn', 'lua_Latn', 'lug_Latn', 'luo_Latn', 'lus_Latn', 'mag_Deva', 'mai_Deva', 'mal_Mlym', 'mar_Deva', 'min_Latn', 'mkd_Cyrl', 'plt_Latn', 'mlt_Latn', 'mni_Beng', 'khk_Cyrl', 'mos_Latn', 'mri_Latn', 'zsm_Latn', 'mya_Mymr', 'nld_Latn', 'nno_Latn', 'nob_Latn', 'npi_Deva', 'nso_Latn', 'nus_Latn', 'nya_Latn', 'oci_Latn', 'gaz_Latn', 'ory_Orya', 'pag_Latn', 'pan_Guru', 'pap_Latn', 'pol_Latn', 'por_Latn', 'prs_Arab', 'pbt_Arab', 'quy_Latn', 'ron_Latn', 'run_Latn', 'rus_Cyrl', 'sag_Latn', 'san_Deva', 'sat_Beng', 'scn_Latn', 'shn_Mymr', 'sin_Sinh', 'slk_Latn', 'slv_Latn', 'smo_Latn', 'sna_Latn', 'snd_Arab', 'som_Latn', 'sot_Latn', 'spa_Latn', 'als_Latn', 'srd_Latn', 'srp_Cyrl', 'ssw_Latn', 'sun_Latn', 'swe_Latn', 'swh_Latn', 'szl_Latn', 'tam_Taml', 'tat_Cyrl', 'tel_Telu', 'tgk_Cyrl', 'tgl_Latn', 'tha_Thai', 'tir_Ethi', 'taq_Latn', 'taq_Tfng', 'tpi_Latn', 'tsn_Latn', 'tso_Latn', 'tuk_Latn', 'tum_Latn', 'tur_Latn', 'twi_Latn', 'tzm_Tfng', 'uig_Arab', 'ukr_Cyrl', 'umb_Latn', 'urd_Arab', 'uzn_Latn', 'vec_Latn', 'vie_Latn', 'war_Latn', 'wol_Latn', 'xho_Latn', 'ydd_Hebr', 'yor_Latn', 'yue_Hant', 'zho_Hans', 'zho_Hant', 'zul_Latn']
class NllbTokenizerFast(PreTrainedTokenizerFast):
"""
Construct a "fast" NLLB tokenizer (backed by HuggingFace's *tokenizers* library). Based on
[BPE](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=BPE#models).
This tokenizer inherits from `PreTrainedTokenizerFast`, which contains most of the main methods. Users should refer to
this superclass for more information regarding those methods.
"""
vocab_files_names = VOCAB_FILES_NAMES
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
model_input_names = ["input_ids", "attention_mask"]
slow_tokenizer_class = NllbTokenizer
prefix_tokens: List[int] = []
suffix_tokens: List[int] = []
def __init__(
self,
vocab_file=None,
tokenizer_file=None,
bos_token="<s>",
eos_token="</s>",
sep_token="</s>",
cls_token="<s>",
unk_token="<unk>",
pad_token="<pad>",
mask_token="<mask>",
src_lang=None,
tgt_lang=None,
additional_special_tokens=None,
legacy_behaviour=False,
**kwargs,
):
if additional_special_tokens is None:
additional_special_tokens = FAIRSEQ_LANGUAGE_CODES
self.vocab_file = vocab_file
mask_token = (
AddedToken(mask_token, normalized=True, lstrip=True, special=True)
if isinstance(mask_token, str)
else mask_token
)
self.legacy_behaviour = legacy_behaviour
super().__init__(
vocab_file=vocab_file,
tokenizer_file=tokenizer_file,
bos_token=bos_token,
eos_token=eos_token,
sep_token=sep_token,
cls_token=cls_token,
unk_token=unk_token,
pad_token=pad_token,
src_lang=src_lang,
tgt_lang=tgt_lang,
mask_token=mask_token,
additional_special_tokens=additional_special_tokens,
legacy_behaviour=legacy_behaviour,
**kwargs,
)
self._lang_code_to_id = {
lang_code: self.convert_tokens_to_ids(str(lang_code)) for lang_code in additional_special_tokens
}
self._src_lang = src_lang if src_lang is not None else "eng_Latn"
self.cur_lang_code = self.convert_tokens_to_ids(self._src_lang)
self.tgt_lang = tgt_lang
self.set_src_lang_special_tokens(self._src_lang)
@property
def lang_code_to_id(self):
logger.warning_once(
"the `lang_code_to_id` attribute is deprecated. The logic is natively handled in `tokenizer.added_tokens_decoder`;"
" this attribute will be removed in `transformers` v4.38"
)
return self._lang_code_to_id
@property
def can_save_slow_tokenizer(self) -> bool:
return os.path.isfile(self.vocab_file) if self.vocab_file else False
@property
def src_lang(self) -> str:
return self._src_lang
@src_lang.setter
def src_lang(self, new_src_lang: str) -> None:
self._src_lang = new_src_lang
self.set_src_lang_special_tokens(self._src_lang)
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. The special tokens depend on calling set_lang.
An NLLB sequence has the following format, where `X` represents the sequence:
- `input_ids` (for encoder) `X [eos, src_lang_code]`
- `decoder_input_ids`: (for decoder) `X [eos, tgt_lang_code]`
BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a
separator.
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
if token_ids_1 is None:
return self.prefix_tokens + token_ids_0 + self.suffix_tokens
return self.prefix_tokens + token_ids_0 + token_ids_1 + self.suffix_tokens
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. nllb does not
make use of token type ids, therefore a list of zeros is returned.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of zeros.
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
def _build_translation_inputs(
self, raw_inputs, return_tensors: str, src_lang: Optional[str], tgt_lang: Optional[str], **extra_kwargs
):
"""Used by translation pipeline, to prepare inputs for the generate function"""
if src_lang is None or tgt_lang is None:
raise ValueError("Translation requires a `src_lang` and a `tgt_lang` for this model")
self.src_lang = src_lang
inputs = self(raw_inputs, add_special_tokens=True, return_tensors=return_tensors, **extra_kwargs)
tgt_lang_id = self.convert_tokens_to_ids(tgt_lang)
inputs["forced_bos_token_id"] = tgt_lang_id
return inputs
def prepare_seq2seq_batch(
self,
src_texts: List[str],
src_lang: str = "eng_Latn",
tgt_texts: Optional[List[str]] = None,
tgt_lang: str = "fra_Latn",
**kwargs,
):
"""
Prepare a batch of inputs for sequence-to-sequence tasks, including source and target texts and languages.
Args:
src_texts (`List[str]`):
List of source texts.
src_lang (`str`):
Source language identifier.
tgt_texts (`List[str]`, *optional*):
List of target texts.
tgt_lang (`str`):
Target language identifier.
**kwargs:
Additional keyword arguments for further customization.
Returns:
Dictionary containing prepared inputs for the model.
"""
self.src_lang = src_lang
self.tgt_lang = tgt_lang
return super().prepare_seq2seq_batch(src_texts, tgt_texts, **kwargs)
def set_src_lang_special_tokens(self, src_lang) -> None:
"""Reset the special tokens to the source lang setting.
- In legacy mode: No prefix and suffix=[eos, src_lang_code].
- In default mode: Prefix=[src_lang_code], suffix=[eos].
"""
self.cur_lang_code = self.convert_tokens_to_ids(src_lang)
if self.legacy_behaviour:
self.prefix_tokens = []
self.suffix_tokens = [self.eos_token_id, self.cur_lang_code]
else:
self.prefix_tokens = [self.cur_lang_code]
self.suffix_tokens = [self.eos_token_id]
prefix_tokens_str = self.convert_ids_to_tokens(self.prefix_tokens)
suffix_tokens_str = self.convert_ids_to_tokens(self.suffix_tokens)
self._tokenizer.post_processor = processors.TemplateProcessing(
single=prefix_tokens_str + ["$A"] + suffix_tokens_str,
pair=prefix_tokens_str + ["$A", "$B"] + suffix_tokens_str,
special_tokens=list(zip(prefix_tokens_str + suffix_tokens_str, self.prefix_tokens + self.suffix_tokens)),
)
def set_tgt_lang_special_tokens(self, lang: str) -> None:
"""Reset the special tokens to the target lang setting.
- In legacy mode: No prefix and suffix=[eos, tgt_lang_code].
- In default mode: Prefix=[tgt_lang_code], suffix=[eos].
"""
self.cur_lang_code = self.convert_tokens_to_ids(lang)
if self.legacy_behaviour:
self.prefix_tokens = []
self.suffix_tokens = [self.eos_token_id, self.cur_lang_code]
else:
self.prefix_tokens = [self.cur_lang_code]
self.suffix_tokens = [self.eos_token_id]
prefix_tokens_str = self.convert_ids_to_tokens(self.prefix_tokens)
suffix_tokens_str = self.convert_ids_to_tokens(self.suffix_tokens)
self._tokenizer.post_processor = processors.TemplateProcessing(
single=prefix_tokens_str + ["$A"] + suffix_tokens_str,
pair=prefix_tokens_str + ["$A", "$B"] + suffix_tokens_str,
special_tokens=list(zip(prefix_tokens_str + suffix_tokens_str, self.prefix_tokens + self.suffix_tokens)),
)
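Unlike the slow tokenizer, the fast one must rebuild its `tokenizers` post-processor every time the language changes, because the backend applies special tokens through a fixed template. A self-contained sketch of the same idea using the `tokenizers` API directly (token strings and ids are placeholders):
```
from tokenizers import processors

post_processor = processors.TemplateProcessing(
    single=["fra_Latn", "$A", "</s>"],
    pair=["fra_Latn", "$A", "$B", "</s>"],
    special_tokens=[("fra_Latn", 999999), ("</s>", 2)],
)
# Attached via `tokenizer._tokenizer.post_processor = post_processor`, every encoded
# sequence is then wrapped as: <lang_code> X </s>
```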
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
if not self.can_save_slow_tokenizer:
raise ValueError(
"Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
"tokenizer."
)
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory.")
return
out_vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
copyfile(self.vocab_file, out_vocab_file)
return (out_vocab_file,)
.\models\nllb\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_sentencepiece_available,
is_tokenizers_available,
is_torch_available,
)
_import_structure = {}
try:
if not is_sentencepiece_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_nllb"] = ["NllbTokenizer"]
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_nllb_fast"] = ["NllbTokenizerFast"]
if TYPE_CHECKING:
try:
if not is_sentencepiece_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_nllb import NllbTokenizer
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_nllb_fast import NllbTokenizerFast
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\nllb_moe\configuration_nllb_moe.py
"""
NLLB-MoE model configuration
"""
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
NLLB_MOE_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"facebook/nllb-moe-54B": "https://huggingface.co/facebook/nllb-moe-54b/resolve/main/config.json",
}
class NllbMoeConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`NllbMoeModel`]. It is used to instantiate an
NLLB-MoE model according to the specified arguments, defining the model architecture. Instantiating a configuration
with the defaults will yield a similar configuration to that of the NLLB-MoE
[facebook/nllb-moe-54b](https://huggingface.co/facebook/nllb-moe-54b) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Example:
```
>>> from transformers import NllbMoeModel, NllbMoeConfig
>>> # Initializing a NllbMoe facebook/nllb-moe-54b style configuration
>>> configuration = NllbMoeConfig()
>>> # Initializing a model from the facebook/nllb-moe-54b style configuration
>>> model = NllbMoeModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
model_type = "nllb-moe"
keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}
def __init__(
self,
vocab_size=128112,
max_position_embeddings=1024,
encoder_layers=12,
encoder_ffn_dim=4096,
encoder_attention_heads=16,
decoder_layers=12,
decoder_ffn_dim=4096,
decoder_attention_heads=16,
encoder_layerdrop=0.05,
decoder_layerdrop=0.05,
use_cache=True,
is_encoder_decoder=True,
activation_function="relu",
d_model=1024,
dropout=0.1,
attention_dropout=0.1,
activation_dropout=0.0,
init_std=0.02,
decoder_start_token_id=2,
scale_embedding=True,
router_bias=False,
router_dtype="float32",
router_ignore_padding_tokens=False,
num_experts=128,
expert_capacity=64,
encoder_sparse_step=4,
decoder_sparse_step=4,
router_z_loss_coef=0.001,
router_aux_loss_coef=0.001,
second_expert_policy="all",
normalize_router_prob_before_dropping=False,
batch_prioritized_routing=False,
moe_eval_capacity_token_fraction=1.0,
moe_token_dropout=0.2,
pad_token_id=1,
bos_token_id=0,
eos_token_id=2,
output_router_logits=False,
**kwargs,
):
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.d_model = d_model
self.encoder_ffn_dim = encoder_ffn_dim
self.encoder_layers = encoder_layers
self.encoder_attention_heads = encoder_attention_heads
self.decoder_ffn_dim = decoder_ffn_dim
self.decoder_layers = decoder_layers
self.decoder_attention_heads = decoder_attention_heads
self.dropout = dropout
self.attention_dropout = attention_dropout
self.activation_dropout = activation_dropout
self.activation_function = activation_function
self.init_std = init_std
self.encoder_layerdrop = encoder_layerdrop
self.decoder_layerdrop = decoder_layerdrop
self.use_cache = use_cache
self.num_hidden_layers = encoder_layers
self.scale_embedding = scale_embedding
self.router_z_loss_coef = router_z_loss_coef
self.router_aux_loss_coef = router_aux_loss_coef
self.decoder_sparse_step = decoder_sparse_step
self.encoder_sparse_step = encoder_sparse_step
self.num_experts = num_experts
self.expert_capacity = expert_capacity
self.router_bias = router_bias
if router_dtype not in ["float32", "float16", "bfloat16"]:
raise ValueError(f"`router_dtype` must be one of 'float32', 'float16' or 'bfloat16', got {router_dtype}")
self.router_dtype = router_dtype
self.router_ignore_padding_tokens = router_ignore_padding_tokens
self.batch_prioritized_routing = batch_prioritized_routing
self.second_expert_policy = second_expert_policy
self.normalize_router_prob_before_dropping = normalize_router_prob_before_dropping
self.moe_eval_capacity_token_fraction = moe_eval_capacity_token_fraction
self.moe_token_dropout = moe_token_dropout
self.output_router_logits = output_router_logits
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
is_encoder_decoder=is_encoder_decoder,
decoder_start_token_id=decoder_start_token_id,
**kwargs,
)
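`encoder_sparse_step`/`decoder_sparse_step` control how often a dense feed-forward block is replaced by a sparse MoE block. Assuming, as in comparable MoE implementations, that layer `i` is sparse when `(i + 1) % sparse_step == 0`, a 12-layer encoder with step 4 gets three MoE layers:
```
from transformers import NllbMoeConfig

# Small expert count so the config is cheap to instantiate for inspection.
config = NllbMoeConfig(encoder_layers=12, encoder_sparse_step=4, num_experts=8)
sparse_layers = [
    i for i in range(config.encoder_layers)
    if (i + 1) % config.encoder_sparse_step == 0
]
print(sparse_layers)  # [3, 7, 11]
```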
.\models\nllb_moe\convert_nllb_moe_sharded_original_checkpoint_to_pytorch.py
import argparse
import json
import os
import torch
from torch import nn
from transformers import NllbMoeConfig, NllbMoeModel
from transformers.modeling_utils import dtype_byte_size
from transformers.utils import WEIGHTS_INDEX_NAME, WEIGHTS_NAME
def remove_ignore_keys_(state_dict):
ignore_keys = [
"encoder.version",
"decoder.version",
"model.encoder.version",
"model.decoder.version",
"decoder.output_projection.weight",
"_float_tensor",
"encoder.embed_positions._float_tensor",
"decoder.embed_positions._float_tensor",
]
for k in ignore_keys:
state_dict.pop(k, None)
def make_linear_from_emb(emb):
vocab_size, emb_size = emb.weight.shape
lin_layer = nn.Linear(vocab_size, emb_size, bias=False)
lin_layer.weight.data = emb.weight.data
return lin_layer
def rename_fairseq_keys(state_dict, expert_idx=None):
new_dict = {}
for old_key in state_dict.keys():
key = old_key
if "moe_layer.experts." in key:
if expert_idx is not None:
key = key.replace("moe_layer.experts.0", f"ffn.experts.expert_{expert_idx}")
else:
key = key.replace("moe_layer.experts.", "ffn.experts.expert_")
if "gate" in key:
key = key.replace(".moe_layer.gate.wg", ".ffn.router.classifier")
if "fc2" in key and "experts" not in key:
key = key.replace(".fc2.", ".ffn.fc2.")
if "fc1" in key and "experts" not in key:
key = key.replace(".fc1.", ".ffn.fc1.")
if ".encoder_attn." in key:
key = key.replace(".encoder_attn.", ".cross_attention.")
if "encoder_attn_layer_norm" in key:
key = key.replace("encoder_attn_layer_norm", "cross_attention_layer_norm")
if "final_layer_norm" in key:
key = key.replace("final_layer_norm", "ff_layer_norm")
new_dict[key] = state_dict[old_key]
return new_dict
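A quick check of the renaming on a toy fairseq-style state dict (this assumes the `rename_fairseq_keys` function above is in scope; the values are dummies since only the keys matter here):
```
toy_state = {
    "encoder.layers.0.moe_layer.experts.0.fc1.weight": None,
    "encoder.layers.0.moe_layer.gate.wg.weight": None,
    "decoder.layers.3.encoder_attn.k_proj.weight": None,
}
for key in rename_fairseq_keys(toy_state, expert_idx=7):
    print(key)
# encoder.layers.0.ffn.experts.expert_7.fc1.weight
# encoder.layers.0.ffn.router.classifier.weight
# decoder.layers.3.cross_attention.k_proj.weight
```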
def shard_on_the_fly(switch_checkpoint_path, dump_path, num_experts, dtype, weights_name: str = WEIGHTS_NAME):
sharded_state_dicts = []
total_size = 0
os.makedirs(dump_path, exist_ok=True)
for expert in range(num_experts):
expert_path = switch_checkpoint_path + f"-rank-{expert}.pt"
if os.path.isfile(expert_path):
expert_state = torch.load(expert_path)["model"]
remove_ignore_keys_(expert_state)
expert_state = rename_fairseq_keys(expert_state, expert)
save_path = os.path.join(
dump_path, weights_name.replace(".bin", f"-{len(sharded_state_dicts)+1:05d}-of-???.bin")
)
torch.save(expert_state, save_path)
sharded_state_dicts.append(expert_state.keys())
total_size += sum([value.numel() for key, value in expert_state.items()]) * dtype_byte_size(
expert_state[list(expert_state)[0]].dtype
)
save_path = os.path.join(dump_path, weights_name.replace(".bin", f"-{len(sharded_state_dicts)+1:05d}-of-???.bin"))
shared_weights = torch.load(switch_checkpoint_path + "-shared.pt")["model"]
remove_ignore_keys_(shared_weights)
shared_weights = rename_fairseq_keys(shared_weights, None)
shared_weights["shared.weight"] = shared_weights["decoder.embed_tokens.weight"]
sharded_state_dicts.append(shared_weights.keys())
if len(sharded_state_dicts) == 1:
save_path = os.path.join(dump_path, weights_name)
torch.save(shared_weights, save_path)
return {weights_name: sharded_state_dicts[0]}, None
else:
torch.save(shared_weights, save_path)
weight_map = {}
for idx, shard in enumerate(sharded_state_dicts):
shard_file = weights_name.replace(".bin", f"-{idx+1:05d}-of-{len(sharded_state_dicts):05d}.bin")
temp_filename = os.path.join(dump_path, weights_name.replace(".bin", f"-{idx+1:05d}-of-???.bin"))
os.rename(temp_filename, os.path.join(dump_path, shard_file))
for key in shard:
weight_map[key] = shard_file
metadata = {"total_size": total_size}
index = {"metadata": metadata, "weight_map": weight_map}
with open(os.path.join(dump_path, WEIGHTS_INDEX_NAME), "w", encoding="utf-8") as f:
content = json.dumps(index, indent=2, sort_keys=True) + "\n"
f.write(content)
return metadata, index
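The index written to `WEIGHTS_INDEX_NAME` is the standard sharded-checkpoint index: a per-parameter mapping to the shard file that stores it, plus the total byte size. It has roughly this shape (illustrative values only):
```
# Hypothetical excerpt of the index JSON, shown as a Python dict.
index_example = {
    "metadata": {"total_size": 123456789},
    "weight_map": {
        "encoder.layers.3.ffn.experts.expert_0.fc1.weight": "pytorch_model-00001-of-00129.bin",
        "shared.weight": "pytorch_model-00129-of-00129.bin",
    },
}
```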
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--nllb_moe_checkpoint_path",
default="/home/arthur_huggingface_co/fairseq/weights/checkpoints/model_moe_54b/checkpoint_2_300000",
type=str,
required=False,
help="Path to a directory containing a folder per layer. Follows the original Google format.",
)
parser.add_argument("--dtype", default="float32", type=str, required=False, help="dtype of the saved model")
parser.add_argument(
"--pytorch_dump_folder_path",
default="/home/arthur_huggingface_co/fairseq/weights/checkpoints/hf-converted-moe-54b",
type=str,
required=False,
help="Path to the output pytorch model.",
)
args = parser.parse_args()
metadata, index = shard_on_the_fly(
args.nllb_moe_checkpoint_path,
args.pytorch_dump_folder_path,
128,
args.dtype,
)
config = NllbMoeConfig.from_pretrained(
"facebook/nllb-200-3.3B", encoder_sparse_step=4, decoder_sparse_step=4, num_experts=128
)
config.save_pretrained(args.pytorch_dump_folder_path)
model = NllbMoeModel.from_pretrained(args.pytorch_dump_folder_path)
print("Done")
model.save_pretrained(args.pytorch_dump_folder_path)