Transformers 源码解析（五十）

`.\models\fnet\tokenization_fnet.py`

# coding=utf-8
# 上面的行声明了文件编码格式为 UTF-8，确保可以正确处理中文和其他特殊字符
# Copyright 2021 Google Research, Google AI, Google Brain and the HuggingFace Inc. team.
# 版权声明，指出了代码的版权归属及授权许可信息
#
# Licensed under the Apache License, Version 2.0 (the "License");
# 根据 Apache License, Version 2.0 授权许可，允许在符合许可的前提下使用本文件
# you may not use this file except in compliance with the License.
# 除非符合许可，否则禁止使用此文件
# You may obtain a copy of the License at
# 获取许可协议的副本，详见以下网址
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# 除非适用法律要求或书面同意，否则依据 "AS IS" 原则发布软件
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# 没有任何明示或暗示的保证或条件
# See the License for the specific language governing permissions and
# limitations under the License.
# 详细内容请参阅许可协议，包括授权的特定语言和限制
""" Tokenization classes for FNet model."""
# 此行开始了对 FNet 模型的 tokenization 类的定义，是本文件的主题注释

import os
# 导入操作系统相关的模块
import unicodedata
# 导入处理 Unicode 数据的模块
from shutil import copyfile
# 导入复制文件的函数 copyfile
from typing import Any, Dict, List, Optional, Tuple
# 导入类型提示相关的功能，包括 Any, Dict, List, Optional, Tuple

import sentencepiece as spm
# 导入 SentencePiece 库，用于分词

from ...tokenization_utils import AddedToken, PreTrainedTokenizer
# 从 tokenization_utils 模块中导入 AddedToken 和 PreTrainedTokenizer 类
from ...utils import logging
# 从 utils 模块中导入 logging 模块

# Module-level logger.
logger = logging.get_logger(__name__)
# File name under which the vocabulary (the SentencePiece model) is stored.
VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"}

# Download URL of the vocabulary file for each pretrained checkpoint.
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "google/fnet-base": "https://huggingface.co/google/fnet-base/resolve/main/spiece.model",
        "google/fnet-large": "https://huggingface.co/google/fnet-large/resolve/main/spiece.model",
    },
}

# Positional-embedding size per pretrained checkpoint (512 for both); the tokenizer class
# exposes this table as `max_model_input_sizes`.
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "google/fnet-base": 512,
    "google/fnet-large": 512,
}

# The "▁" (U+2581) marker character; `_tokenize` checks for it at the start of a piece
# and strips it before re-encoding.
SPIECE_UNDERLINE = "▁"

class FNetTokenizer(PreTrainedTokenizer):
    """
    Construct an FNet tokenizer. Adapted from [`AlbertTokenizer`]. Based on
    [SentencePiece](https://github.com/google/sentencepiece). This tokenizer inherits from [`PreTrainedTokenizer`]
    which contains most of the main methods. Users should refer to this superclass for more information regarding those
    methods.

    Args:
        vocab_file (`str`):
            Path to the SentencePiece model file that is loaded into `sp_model`.
        do_lower_case (`bool`, *optional*, defaults to `False`):
            Whether to lowercase the input text when tokenizing.
        remove_space (`bool`, *optional*, defaults to `True`):
            Whether to strip the text and collapse runs of whitespace into single spaces when tokenizing.
        keep_accents (`bool`, *optional*, defaults to `True`):
            Whether to keep accents when tokenizing. When `False`, the text is NFKD-normalized and combining
            characters are dropped.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token, used for tokens that are not in the vocabulary.
        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
            The separator token, placed after each sequence when building model inputs.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding sequences of different lengths.
        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
            The classifier token, placed at the start of the model input.
        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
            The token used for masking values in masked language modeling.
        sp_model_kwargs (`dict`, *optional*):
            Extra keyword arguments passed to `SentencePieceProcessor.__init__()`. Defaults to an empty dict.

    Attributes:
        sp_model (`SentencePieceProcessor`):
            The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
    """

    # File names, download locations and maximum input sizes come from the module-level tables.
    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    model_input_names = ["input_ids", "token_type_ids"]

    def __init__(
        self,
        vocab_file,
        do_lower_case=False,
        remove_space=True,
        keep_accents=True,
        unk_token="<unk>",
        sep_token="[SEP]",
        pad_token="<pad>",
        cls_token="[CLS]",
        mask_token="[MASK]",
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
        **kwargs,
    ) -> None:
        """Load the SentencePiece model from `vocab_file` and initialize the base tokenizer."""
        # Plain strings are promoted to special `AddedToken`s; the mask token is additionally
        # created with `lstrip=True`. Values that are already `AddedToken`s are kept untouched.
        mask_token = AddedToken(mask_token, lstrip=True, special=True) if isinstance(mask_token, str) else mask_token
        cls_token = AddedToken(cls_token, special=True) if isinstance(cls_token, str) else cls_token
        sep_token = AddedToken(sep_token, special=True) if isinstance(sep_token, str) else sep_token
        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs

        self.do_lower_case = do_lower_case
        self.remove_space = remove_space
        self.keep_accents = keep_accents
        self.vocab_file = vocab_file

        # The processor is created before the base-class initializer runs.
        # NOTE(review): presumably the base initializer already queries the vocabulary - confirm.
        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(vocab_file)

        super().__init__(
            do_lower_case=do_lower_case,
            remove_space=remove_space,
            keep_accents=keep_accents,
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            sp_model_kwargs=self.sp_model_kwargs,
            **kwargs,
        )

    @property
    def vocab_size(self):
        """Number of pieces in the loaded SentencePiece model."""
        return len(self.sp_model)

    def get_vocab(self):
        """Return a token -> id mapping covering the SentencePiece pieces and the added tokens."""
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def __getstate__(self):
        """Return a copy of the instance state with `sp_model` set to `None` for serialization."""
        state = self.__dict__.copy()
        state["sp_model"] = None
        return state

    def __setstate__(self, d):
        """Restore the instance state and reload `sp_model` from `self.vocab_file`."""
        self.__dict__ = d

        # Backward compatibility: states saved without `sp_model_kwargs`.
        if not hasattr(self, "sp_model_kwargs"):
            self.sp_model_kwargs = {}

        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(self.vocab_file)

    def preprocess_text(self, inputs):
        """Normalize raw text according to `remove_space`, `keep_accents` and `do_lower_case`."""
        if self.remove_space:
            # Strip both ends and collapse every whitespace run into a single space.
            outputs = " ".join(inputs.strip().split())
        else:
            outputs = inputs
        # Replace the `` and '' quote styles with a plain double quote.
        outputs = outputs.replace("``", '"').replace("''", '"')

        if not self.keep_accents:
            # Decompose characters, then drop the combining marks.
            outputs = unicodedata.normalize("NFKD", outputs)
            outputs = "".join([c for c in outputs if not unicodedata.combining(c)])
        if self.do_lower_case:
            outputs = outputs.lower()

        return outputs

    def _tokenize(self, text: str) -> List[str]:
        """Tokenize a string into SentencePiece pieces, splitting a trailing comma off digit-ending pieces."""
        text = self.preprocess_text(text)
        pieces = self.sp_model.encode(text, out_type=str)
        new_pieces = []
        for piece in pieces:
            if len(piece) > 1 and piece[-1] == "," and piece[-2].isdigit():
                # The piece ends with "<digit>,": re-encode it without the comma (and without
                # the "▁" marker) and emit the comma as a separate piece.
                cur_pieces = self.sp_model.EncodeAsPieces(piece[:-1].replace(SPIECE_UNDERLINE, ""))
                if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE:
                    # Re-encoding produced a leading marker the original piece did not have; drop it.
                    if len(cur_pieces[0]) == 1:
                        cur_pieces = cur_pieces[1:]
                    else:
                        cur_pieces[0] = cur_pieces[0][1:]
                cur_pieces.append(piece[-1])
                new_pieces.extend(cur_pieces)
            else:
                new_pieces.append(piece)

        return new_pieces

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        return self.sp_model.PieceToId(token)

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        return self.sp_model.IdToPiece(index)

    # Adapted from transformers.models.albert.tokenization_albert.AlbertTokenizer.convert_tokens_to_string
    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) in a single string."""
        current_sub_tokens = []
        out_string = ""
        prev_is_special = False
        for token in tokens:
            # Special tokens are appended verbatim instead of being decoded by the SentencePiece model.
            if token in self.all_special_tokens:
                if not prev_is_special:
                    out_string += " "
                out_string += self.sp_model.decode(current_sub_tokens) + token
                prev_is_special = True
                current_sub_tokens = []
            else:
                current_sub_tokens.append(token)
                prev_is_special = False
        out_string += self.sp_model.decode(current_sub_tokens)
        return out_string.strip()

    def _decode(
        self,
        token_ids: List[int],
        skip_special_tokens: bool = False,
        clean_up_tokenization_spaces: Optional[bool] = None,
        spaces_between_special_tokens: bool = False,
        **kwargs,
    ) -> str:
        """Decode `token_ids` with the base class, then remove the space after `<unk>` if requested."""
        text = super()._decode(
            token_ids=token_ids,
            skip_special_tokens=skip_special_tokens,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            spaces_between_special_tokens=spaces_between_special_tokens,
            **kwargs,
        )
        # Mimic the behavior of the Rust tokenizer: no space after <unk>.
        if not spaces_between_special_tokens:
            text = text.replace("<unk> ", "<unk>")
        return text

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences by concatenating and adding special tokens. An
        FNet sequence has the following format:

        - single sequence: `[CLS] X [SEP]`
        - pair of sequences: `[CLS] A [SEP] B [SEP]`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of input IDs with the appropriate special tokens.
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]
        if token_ids_1 is None:
            return cls + token_ids_0 + sep
        return cls + token_ids_0 + sep + token_ids_1 + sep

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Build the special-tokens mask for a sequence or a pair of sequences.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether the token list is already formatted with special tokens; in that case the computation is
                delegated to the base class.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        # Layout matches `build_inputs_with_special_tokens`: [CLS] A [SEP] (B [SEP]).
        if token_ids_1 is not None:
            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
        return [1] + ([0] * len(token_ids_0)) + [1]

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. An FNet sequence
        pair mask has the following format:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]

        # [CLS] A [SEP] is typed 0; B [SEP], when present, is typed 1.
        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]
        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Save the SentencePiece model file into `save_directory`.

        Args:
            save_directory (`str`):
                Existing directory in which the vocabulary file is written.
            filename_prefix (`str`, *optional*):
                Prefix joined with `-` in front of the vocabulary file name.

        Returns:
            `Tuple[str]`: One-element tuple with the path of the saved file. If `save_directory` is not a directory,
            an error is logged and `None` is returned instead.
        """
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return

        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
            # The source file still exists at a different location: copy it.
            copyfile(self.vocab_file, out_vocab_file)
        elif not os.path.isfile(self.vocab_file):
            # The source file is gone: write out the serialized model held by `sp_model`.
            with open(out_vocab_file, "wb") as fi:
                content_spiece_model = self.sp_model.serialized_model_proto()
                fi.write(content_spiece_model)

        return (out_vocab_file,)

`.\models\fnet\tokenization_fnet_fast.py`

# coding=utf-8
# Copyright 2021 Google AI, Google Brain and the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Tokenization classes for FNet model."""


import os
from shutil import copyfile
from typing import List, Optional, Tuple

from ...tokenization_utils import AddedToken
from ...tokenization_utils_fast import PreTrainedTokenizerFast
from ...utils import is_sentencepiece_available, logging

# Check if sentencepiece library is available
if is_sentencepiece_available():
    # Import the specific tokenizer for FNet from local module
    from .tokenization_fnet import FNetTokenizer
else:
    # Set FNetTokenizer to None if sentencepiece is not available
    FNetTokenizer = None

# Module-level logger.
logger = logging.get_logger(__name__)

# File names of the SentencePiece model and of the serialized fast tokenizer.
VOCAB_FILES_NAMES = {"vocab_file": "spiece.model", "tokenizer_file": "tokenizer.json"}

# Download URLs of both files for each pretrained checkpoint.
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "google/fnet-base": "https://huggingface.co/google/fnet-base/resolve/main/spiece.model",
        "google/fnet-large": "https://huggingface.co/google/fnet-large/resolve/main/spiece.model",
    },
    "tokenizer_file": {
        "google/fnet-base": "https://huggingface.co/google/fnet-base/resolve/main/tokenizer.json",
        "google/fnet-large": "https://huggingface.co/google/fnet-large/resolve/main/tokenizer.json",
    },
}

# Positional-embedding size per pretrained checkpoint; the tokenizer class exposes this
# table as `max_model_input_sizes`.
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "google/fnet-base": 512,
    "google/fnet-large": 512,
}

# The "▁" (U+2581) marker character used by SentencePiece pieces.
# NOTE(review): not referenced by the visible code of this module - confirm it is still needed.
SPIECE_UNDERLINE = "▁"


class FNetTokenizerFast(PreTrainedTokenizerFast):
    """
    Construct a "fast" FNetTokenizer (backed by HuggingFace's *tokenizers* library). Adapted from
    [`AlbertTokenizerFast`]. Based on
    [Unigram](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models). This
    tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods

    Args:
        vocab_file (`str`, *optional*):
            Path to the SentencePiece model file. It is required by `save_vocabulary` and determines
            `can_save_slow_tokenizer`.
        tokenizer_file (`str`, *optional*):
            Forwarded to [`PreTrainedTokenizerFast`].
        do_lower_case (`bool`, *optional*, defaults to `False`):
            Stored on the instance and forwarded to the base class.
        remove_space (`bool`, *optional*, defaults to `True`):
            Stored on the instance and forwarded to the base class.
        keep_accents (`bool`, *optional*, defaults to `True`):
            Stored on the instance and forwarded to the base class.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token.
        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
            The separator token, placed after each sequence when building model inputs.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The padding token.
        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
            The classifier token, placed at the start of the model input.
        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
            The mask token.
    """

    # File names, download locations and maximum input sizes come from the module-level tables.
    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    model_input_names = ["input_ids", "token_type_ids"]
    # Slow counterpart; `None` when sentencepiece is not installed (see the guarded import above).
    slow_tokenizer_class = FNetTokenizer

    def __init__(
        self,
        vocab_file=None,
        tokenizer_file=None,
        do_lower_case=False,
        remove_space=True,
        keep_accents=True,
        unk_token="<unk>",
        sep_token="[SEP]",
        pad_token="<pad>",
        cls_token="[CLS]",
        mask_token="[MASK]",
        **kwargs,
    ):
        """Wrap string special tokens in `AddedToken`s and initialize the fast base tokenizer."""
        # Plain strings are promoted to `AddedToken`s. The mask token is created with `lstrip=True`,
        # while the CLS and SEP tokens use `lstrip=False`; all three use `rstrip=False`.
        mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
        cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token
        sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token

        super().__init__(
            vocab_file,
            tokenizer_file=tokenizer_file,
            do_lower_case=do_lower_case,
            remove_space=remove_space,
            keep_accents=keep_accents,
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            **kwargs,
        )

        self.do_lower_case = do_lower_case
        self.remove_space = remove_space
        self.keep_accents = keep_accents
        self.vocab_file = vocab_file

    @property
    def can_save_slow_tokenizer(self) -> bool:
        """`True` only when `vocab_file` is set and points to an existing file."""
        return os.path.isfile(self.vocab_file) if self.vocab_file else False

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences by concatenating and adding special tokens. An
        FNet sequence has the following format:

        - single sequence: `[CLS] X [SEP]`
        - pair of sequences: `[CLS] A [SEP] B [SEP]`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of input IDs with the appropriate special tokens.
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]
        if token_ids_1 is None:
            return cls + token_ids_0 + sep
        return cls + token_ids_0 + sep + token_ids_1 + sep

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An FNet
        sequence pair mask has the following format:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        If `token_ids_1` is `None`, only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]

        # [CLS] A [SEP] is typed 0; B [SEP], when present, is typed 1.
        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]
        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Copy the SentencePiece model file into `save_directory`.

        Args:
            save_directory (`str`):
                Existing directory in which the vocabulary file is written.
            filename_prefix (`str`, *optional*):
                Prefix joined with `-` in front of the vocabulary file name.

        Returns:
            `Tuple[str]`: One-element tuple with the path of the saved file. If `save_directory` is not a directory,
            an error is logged and `None` is returned instead.

        Raises:
            ValueError: If the tokenizer was created without a `vocab_file`, so there is nothing to copy.
        """
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return

        # `vocab_file` defaults to None in `__init__`; fail with a clear message instead of the
        # TypeError that `os.path.abspath(None)` would raise below.
        if self.vocab_file is None:
            raise ValueError(
                "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
                "tokenizer."
            )

        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        # Nothing to do when the file already lives at the target path.
        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
            copyfile(self.vocab_file, out_vocab_file)

        return (out_vocab_file,)

`.\models\fnet\__init__.py`

# 版权声明和许可证信息
#
# 版权所有 2021 年 HuggingFace 团队。保留所有权利。
# 
# 根据 Apache 许可证 2.0 版本（“许可证”）许可；
# 除非符合许可证，否则您不能使用此文件。
# 您可以在以下网址获取许可证的副本：
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# 除非适用法律要求或书面同意，本软件分发时基于“原样”提供，
# 没有任何形式的明示或暗示保证或条件。
# 有关特定语言的详细信息，请参阅许可证。
from typing import TYPE_CHECKING

# 从 utils 中导入相关的函数和类
from ...utils import (
    OptionalDependencyNotAvailable,
    _LazyModule,
    is_sentencepiece_available,
    is_tokenizers_available,
    is_torch_available,
)

# Maps each submodule name to the list of public names it provides.
_import_structure = {"configuration_fnet": ["FNET_PRETRAINED_CONFIG_ARCHIVE_MAP", "FNetConfig"]}

# Register the slow tokenizer only when sentencepiece is available. The exception raised in the
# `try` is caught immediately, so a missing dependency simply skips the `else` branch.
try:
    if not is_sentencepiece_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # sentencepiece is available: expose FNetTokenizer.
    _import_structure["tokenization_fnet"] = ["FNetTokenizer"]

# Same pattern for the fast tokenizer, which depends on the tokenizers library.
try:
    if not is_tokenizers_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # tokenizers is available: expose FNetTokenizerFast.
    _import_structure["tokenization_fnet_fast"] = ["FNetTokenizerFast"]

# Same pattern for the model classes, which depend on torch.
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # torch is available: expose the FNet model classes.
    _import_structure["modeling_fnet"] = [
        "FNET_PRETRAINED_MODEL_ARCHIVE_LIST",
        "FNetForMaskedLM",
        "FNetForMultipleChoice",
        "FNetForNextSentencePrediction",
        "FNetForPreTraining",
        "FNetForQuestionAnswering",
        "FNetForSequenceClassification",
        "FNetForTokenClassification",
        "FNetLayer",
        "FNetModel",
        "FNetPreTrainedModel",
    ]

# Under static type checking, import every name eagerly (guarded by the same dependency checks
# used to build `_import_structure`); at runtime, install a `_LazyModule` instead.
if TYPE_CHECKING:
    from .configuration_fnet import FNET_PRETRAINED_CONFIG_ARCHIVE_MAP, FNetConfig

    try:
        if not is_sentencepiece_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .tokenization_fnet import FNetTokenizer

    try:
        if not is_tokenizers_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .tokenization_fnet_fast import FNetTokenizerFast

    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # Same names as the "modeling_fnet" entry of `_import_structure`.
        from .modeling_fnet import (
            FNET_PRETRAINED_MODEL_ARCHIVE_LIST,
            FNetForMaskedLM,
            FNetForMultipleChoice,
            FNetForNextSentencePrediction,
            FNetForPreTraining,
            FNetForQuestionAnswering,
            FNetForSequenceClassification,
            FNetForTokenClassification,
            FNetLayer,
            FNetModel,
            FNetPreTrainedModel,
        )
else:
    import sys

    # Replace this module's entry in `sys.modules` with a `_LazyModule` built from
    # `_import_structure`.
    # NOTE(review): presumably this defers importing the submodules until a name is accessed - confirm.
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)

`.\models\focalnet\configuration_focalnet.py`

# coding=utf-8
# 以上为代码文件的编码声明和版权信息

from ...configuration_utils import PretrainedConfig
from ...utils import logging
from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices
# 导入必要的模块和类

# Module-level logger.
logger = logging.get_logger(__name__)

# Download URL of the configuration file for each pretrained FocalNet checkpoint.
FOCALNET_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "microsoft/focalnet-tiny": "https://huggingface.co/microsoft/focalnet-tiny/resolve/main/config.json",
}

class FocalNetConfig(BackboneConfigMixin, PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`FocalNetModel`]. It is used to instantiate a
    FocalNet model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the FocalNet
    [microsoft/focalnet-tiny](https://huggingface.co/microsoft/focalnet-tiny) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        image_size (`int`, *optional*, defaults to 224):
            Size of the input image.
        patch_size (`int`, *optional*, defaults to 4):
            Size of each patch.
        num_channels (`int`, *optional*, defaults to 3):
            Number of input channels.
        embed_dim (`int`, *optional*, defaults to 96):
            Embedding dimension.
        use_conv_embed (`bool`, *optional*, defaults to `False`):
            Whether to use a convolutional embedding.
        hidden_sizes (`List[int]`, *optional*, defaults to `[192, 384, 768, 768]`):
            Hidden size of each stage.
        depths (`List[int]`, *optional*, defaults to `[2, 2, 6, 2]`):
            Depth of each stage. Its length also determines how many `stage{i}` entries `stage_names` contains.
        focal_levels (`List[int]`, *optional*, defaults to `[2, 2, 2, 2]`):
            Number of focal levels of each stage.
        focal_windows (`List[int]`, *optional*, defaults to `[3, 3, 3, 3]`):
            Focal window size of each stage.
        hidden_act (`str`, *optional*, defaults to `"gelu"`):
            Name of the activation function.
        mlp_ratio (`float`, *optional*, defaults to 4.0):
            MLP expansion ratio.
        hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
            Dropout probability of the hidden layers.
        drop_path_rate (`float`, *optional*, defaults to 0.1):
            Drop-path rate.
        use_layerscale (`bool`, *optional*, defaults to `False`):
            Whether to use layer scale.
        layerscale_value (`float`, *optional*, defaults to 0.0001):
            Value used for layer scale.
        use_post_layernorm (`bool`, *optional*, defaults to `False`):
            Whether to use post layer normalization.
        use_post_layernorm_in_modulation (`bool`, *optional*, defaults to `False`):
            Whether to use post layer normalization in the modulation.
        normalize_modulator (`bool`, *optional*, defaults to `False`):
            Whether to normalize the modulator.
        initializer_range (`float`, *optional*, defaults to 0.02):
            Range used for weight initialization.
        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
            Epsilon used by the layer normalization layers.
        encoder_stride (`int`, *optional*, defaults to 32):
            Stride of the encoder.
        out_features (`List[str]`, *optional*):
            Requested output features; aligned with `out_indices` against `stage_names`.
        out_indices (`List[int]`, *optional*):
            Requested output indices; aligned with `out_features` against `stage_names`.

    Example:

    ```
    >>> from transformers import FocalNetConfig, FocalNetModel

    >>> # Initializing a FocalNet microsoft/focalnet-tiny style configuration
    >>> configuration = FocalNetConfig()

    >>> # Initializing a model (with random weights) from the microsoft/focalnet-tiny style configuration
    >>> model = FocalNetModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    """

    model_type = "focalnet"

    def __init__(
        self,
        image_size=224,
        patch_size=4,
        num_channels=3,
        embed_dim=96,
        use_conv_embed=False,
        hidden_sizes=[192, 384, 768, 768],
        depths=[2, 2, 6, 2],
        focal_levels=[2, 2, 2, 2],
        focal_windows=[3, 3, 3, 3],
        hidden_act="gelu",
        mlp_ratio=4.0,
        hidden_dropout_prob=0.0,
        drop_path_rate=0.1,
        use_layerscale=False,
        layerscale_value=1e-4,
        use_post_layernorm=False,
        use_post_layernorm_in_modulation=False,
        normalize_modulator=False,
        initializer_range=0.02,
        layer_norm_eps=1e-5,
        encoder_stride=32,
        out_features=None,
        out_indices=None,
        **kwargs,
    ):
        """Store the FocalNet hyper-parameters and derive the stage names and aligned outputs."""
        # Any extra keyword arguments are handled by the parent configuration classes.
        super().__init__(**kwargs)

        # NOTE: the list defaults above are shared between instances that rely on them;
        # treat these attributes as read-only or pass fresh lists.
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.embed_dim = embed_dim
        self.use_conv_embed = use_conv_embed
        self.hidden_sizes = hidden_sizes
        self.depths = depths
        self.focal_levels = focal_levels
        self.focal_windows = focal_windows
        self.hidden_act = hidden_act
        self.mlp_ratio = mlp_ratio
        self.hidden_dropout_prob = hidden_dropout_prob
        self.drop_path_rate = drop_path_rate
        self.use_layerscale = use_layerscale
        self.layerscale_value = layerscale_value
        self.use_post_layernorm = use_post_layernorm
        self.use_post_layernorm_in_modulation = use_post_layernorm_in_modulation
        self.normalize_modulator = normalize_modulator
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
        self.encoder_stride = encoder_stride
        # "stem" followed by one "stage{i}" name per entry of `depths` (1-based).
        self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(self.depths) + 1)]
        # `out_features` / `out_indices` are not stored raw: they are aligned against
        # `stage_names` and kept in the underscore-prefixed attributes.
        self._out_features, self._out_indices = get_aligned_output_features_output_indices(
            out_features=out_features, out_indices=out_indices, stage_names=self.stage_names
        )

`.\models\focalnet\convert_focalnet_to_hf_format.py`

# 导入必要的库和模块
import argparse  # 导入命令行参数解析模块
import json  # 导入处理 JSON 格式数据的模块

import requests  # 导入发送 HTTP 请求的模块
import torch  # 导入 PyTorch 深度学习库
from huggingface_hub import hf_hub_download  # 导入从 Hugging Face Hub 下载资源的函数
from PIL import Image  # 导入 Python 图像处理库 PIL
from torchvision import transforms  # 导入 PyTorch 中用于图像处理的模块

from transformers import BitImageProcessor, FocalNetConfig, FocalNetForImageClassification  # 导入 Transformers 库中相关模型和配置
from transformers.image_utils import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, PILImageResampling  # 导入图像处理的一些常量和函数


def get_focalnet_config(model_name):
    """
    Build the `FocalNetConfig` that matches a FocalNet checkpoint name.

    Every hyper-parameter is inferred from substrings of `model_name`: the size keyword (`tiny`, `small`, `base`,
    `large`, `xlarge`, `huge`), the `lrf` marker and the `fl3` / `fl4` markers.

    Args:
        model_name (`str`):
            Checkpoint name, e.g. `"focalnet-tiny"` or `"focalnet-base-lrf"`.

    Returns:
        `FocalNetConfig`: configuration carrying the inferred architecture settings and the `id2label` /
        `label2id` mappings read from the `huggingface/label-files` dataset repository.

    Raises:
        ValueError: If the focal levels/windows or the embedding dimension cannot be inferred from `model_name`.
    """
    # "xlarge" contains "large", so this single test covers the large, xlarge and huge checkpoints.
    is_large_family = "large" in model_name or "huge" in model_name
    is_small_family = "tiny" in model_name or "small" in model_name or "base" in model_name

    depths = [2, 2, 6, 2] if "tiny" in model_name else [2, 2, 18, 2]
    use_conv_embed = is_large_family
    use_post_layernorm = is_large_family
    use_layerscale = is_large_family

    focal_levels = None
    focal_windows = None
    if is_large_family:
        if "fl3" in model_name:
            focal_levels = [3, 3, 3, 3]
            focal_windows = [5, 5, 5, 5]
        elif "fl4" in model_name:
            focal_levels = [4, 4, 4, 4]
            focal_windows = [3, 3, 3, 3]

    if is_small_family:
        focal_windows = [3, 3, 3, 3]
        focal_levels = [3, 3, 3, 3] if "lrf" in model_name else [2, 2, 2, 2]

    if focal_levels is None or focal_windows is None:
        raise ValueError(f"Cannot infer the focal levels/windows from model name: {model_name}")

    # "xlarge" has to be tested before "large": the substring test for "large" also succeeds on "xlarge",
    # which previously made the xlarge branch unreachable.
    if "tiny" in model_name or "small" in model_name:
        embed_dim = 96
    elif "base" in model_name:
        embed_dim = 128
    elif "xlarge" in model_name:
        embed_dim = 256
    elif "large" in model_name:
        embed_dim = 192
    elif "huge" in model_name:
        embed_dim = 352
    else:
        raise ValueError(f"Cannot infer the embedding dimension from model name: {model_name}")

    # Label mapping: the large/xlarge/huge checkpoints use the 22k label file, the others the 1k label file.
    repo_id = "huggingface/label-files"
    filename = "imagenet-22k-id2label.json" if is_large_family else "imagenet-1k-id2label.json"

    # Use a context manager so the downloaded label file is closed again.
    with open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r", encoding="utf-8") as label_file:
        id2label = json.load(label_file)
    id2label = {int(k): v for k, v in id2label.items()}  # JSON object keys are strings; the config uses int ids
    label2id = {v: k for k, v in id2label.items()}

    config = FocalNetConfig(
        embed_dim=embed_dim,
        depths=depths,
        focal_levels=focal_levels,
        focal_windows=focal_windows,
        use_conv_embed=use_conv_embed,
        id2label=id2label,
        label2id=label2id,
        use_post_layernorm=use_post_layernorm,
        use_layerscale=use_layerscale,
    )

    return config
    # 检查名字中是否包含 "patch_embed.proj"，如果是则替换成 "embeddings.patch_embeddings.projection"
    if "patch_embed.proj" in name:
        name = name.replace("patch_embed.proj", "embeddings.patch_embeddings.projection")
    
    # 检查名字中是否包含 "patch_embed.norm"，如果是则替换成 "embeddings.norm"
    if "patch_embed.norm" in name:
        name = name.replace("patch_embed.norm", "embeddings.norm")
    
    # 如果名字中包含 "layers"，则在名字前加上 "encoder."
    if "layers" in name:
        name = "encoder." + name
    
    # 检查名字中是否包含 "encoder.layers"，如果是则替换成 "encoder.stages"
    if "encoder.layers" in name:
        name = name.replace("encoder.layers", "encoder.stages")
    
    # 检查名字中是否包含 "downsample.proj"，如果是则替换成 "downsample.projection"
    if "downsample.proj" in name:
        name = name.replace("downsample.proj", "downsample.projection")
    
    # 如果名字中包含 "blocks"，则替换成 "layers"
    if "blocks" in name:
        name = name.replace("blocks", "layers")
    
    # 如果名字中包含 "modulation.f.weight" 或 "modulation.f.bias"，则替换成 "modulation.projection_in"
    if "modulation.f.weight" in name or "modulation.f.bias" in name:
        name = name.replace("modulation.f", "modulation.projection_in")
    
    # 如果名字中包含 "modulation.h.weight" 或 "modulation.h.bias"，则替换成 "modulation.projection_context"
    if "modulation.h.weight" in name or "modulation.h.bias" in name:
        name = name.replace("modulation.h", "modulation.projection_context")
    
    # 如果名字中包含 "modulation.proj.weight" 或 "modulation.proj.bias"，则替换成 "modulation.projection_out"
    if "modulation.proj.weight" in name or "modulation.proj.bias" in name:
        name = name.replace("modulation.proj", "modulation.projection_out")
    
    # 如果名字是 "norm.weight"，则替换成 "layernorm.weight"
    if name == "norm.weight":
        name = "layernorm.weight"
    
    # 如果名字是 "norm.bias"，则替换成 "layernorm.bias"
    if name == "norm.bias":
        name = "layernorm.bias"
    
    # 如果名字中包含 "head"，则替换成 "classifier"，否则加上 "focalnet."
    if "head" in name:
        name = name.replace("head", "classifier")
    else:
        name = "focalnet." + name
    
    # 返回修改后的名字
    return name
    # NOTE(review): the `def` line of this routine and the code that creates `state_dict` are not visible in this
    # excerpt. Judging by the call in the `__main__` section it is `convert_focalnet_checkpoint`, taking
    # `model_name`, `pytorch_dump_folder_path` and `push_to_hub` - confirm against the upstream script.

    # Rename every key of the original state dict; iterate over a copy because the dict is mutated in the loop
    for key in state_dict.copy().keys():
        # Remove the entry stored under the original name
        val = state_dict.pop(key)
        # Store it again under the renamed key
        state_dict[rename_key(key)] = val

    # Build the configuration from the model name
    config = get_focalnet_config(model_name)
    # Instantiate the image classification model from that configuration
    model = FocalNetForImageClassification(config)
    # Switch to evaluation mode
    model.eval()

    # Load the renamed weights into the model
    model.load_state_dict(state_dict)

    # Test image used to verify the conversion
    url = "http://images.cocodataset.org/val2017/000000039769.jpg"

    # Image processor: resize, center crop, then normalize with the ImageNet default mean/std
    processor = BitImageProcessor(
        do_resize=True,
        size={"shortest_edge": 256},
        resample=PILImageResampling.BILINEAR,
        do_center_crop=True,
        crop_size=224,
        do_normalize=True,
        image_mean=IMAGENET_DEFAULT_MEAN,
        image_std=IMAGENET_DEFAULT_STD,
    )
    # Fetch the image over HTTP and open it with PIL
    image = Image.open(requests.get(url, stream=True).raw)
    # Preprocess the image into PyTorch tensors
    inputs = processor(images=image, return_tensors="pt")

    # Reference torchvision pipeline: resize to 256, center crop to 224, convert to tensor, normalize
    image_transforms = transforms.Compose(
        [
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ]
    )

    # Reference pixel values with a leading batch dimension added
    original_pixel_values = image_transforms(image).unsqueeze(0)

    # The processor output must match the reference pipeline within 1e-4
    assert torch.allclose(inputs.pixel_values, original_pixel_values, atol=1e-4)

    # Run the model on the preprocessed image
    outputs = model(**inputs)

    # Index of the highest logit and its label
    predicted_class_idx = outputs.logits.argmax(-1).item()
    print("Predicted class:", model.config.id2label[predicted_class_idx])

    # Print the first three logits
    print("First values of logits:", outputs.logits[0, :3])

    # Expected first three logits per checkpoint.
    # NOTE(review): there is no `else` branch, so for any other `model_name` `expected_slice` is unbound when the
    # assert below runs.
    if model_name == "focalnet-tiny":
        expected_slice = torch.tensor([0.2166, -0.4368, 0.2191])
    elif model_name == "focalnet-tiny-lrf":
        expected_slice = torch.tensor([1.1669, 0.0125, -0.1695])
    elif model_name == "focalnet-small":
        expected_slice = torch.tensor([0.4917, -0.0430, 0.1341])
    elif model_name == "focalnet-small-lrf":
        expected_slice = torch.tensor([-0.2588, -0.5342, -0.2331])
    elif model_name == "focalnet-base":
        expected_slice = torch.tensor([-0.1655, -0.4090, -0.1730])
    elif model_name == "focalnet-base-lrf":
        expected_slice = torch.tensor([0.5306, -0.0483, -0.3928])
    # The converted model must reproduce the expected logits within 1e-4
    assert torch.allclose(outputs.logits[0, :3], expected_slice, atol=1e-4)
    print("Looks ok!")

    # Save the model and the processor when an output folder is given
    if pytorch_dump_folder_path is not None:
        print(f"Saving model and processor of {model_name} to {pytorch_dump_folder_path}")
        model.save_pretrained(pytorch_dump_folder_path)
        processor.save_pretrained(pytorch_dump_folder_path)

    # Push the model and the processor to the hub when requested
    if push_to_hub:
        print(f"Pushing model and processor of {model_name} to the hub...")
        model.push_to_hub(f"{model_name}")
        processor.push_to_hub(f"{model_name}")
if __name__ == "__main__":
    # Command-line entry point: parse the options and run the conversion.
    cli = argparse.ArgumentParser()

    # Checkpoint to convert (all options have defaults, none is mandatory)
    cli.add_argument(
        "--model_name",
        type=str,
        default="focalnet-tiny",
        help="Name of the FocalNet model you'd like to convert.",
    )
    # Output directory; nothing is saved when it is left at None
    cli.add_argument(
        "--pytorch_dump_folder_path",
        type=str,
        default=None,
        help="Path to the output PyTorch model directory.",
    )
    # Flag: upload the converted model and processor
    cli.add_argument(
        "--push_to_hub",
        action="store_true",
        help="Whether to push the model and processor to the hub.",
    )

    options = cli.parse_args()

    convert_focalnet_checkpoint(options.model_name, options.pytorch_dump_folder_path, options.push_to_hub)


`.\models\focalnet\modeling_focalnet.py`

# coding=utf-8
# Copyright 2023 Microsoft Research and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch FocalNet model."""

# Import necessary modules for the FocalNet model
import collections.abc
import math
from dataclasses import dataclass
from typing import Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

# Import utilities and functions from Hugging Face libraries
from ...activations import ACT2FN
from ...modeling_outputs import BackboneOutput
from ...modeling_utils import PreTrainedModel
from ...utils import (
    ModelOutput,
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
from ...utils.backbone_utils import BackboneMixin
from .configuration_focalnet import FocalNetConfig

# Module-level logger named after this module
logger = logging.get_logger(__name__)

# Name of the configuration class (presumably consumed by the docstring helpers imported above; their use is
# outside this excerpt)
_CONFIG_FOR_DOC = "FocalNetConfig"

# Reference checkpoint and the output shape expected from it (presumably for a generated code sample - confirm)
_CHECKPOINT_FOR_DOC = "microsoft/focalnet-tiny"
_EXPECTED_OUTPUT_SHAPE = [1, 49, 768]

# Reference checkpoint and expected label for image classification (usage not visible in this excerpt)
_IMAGE_CLASS_CHECKPOINT = "microsoft/focalnet-tiny"
_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat"

# Names of known pretrained FocalNet checkpoints
FOCALNET_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "microsoft/focalnet-tiny",
    # See all FocalNet models at https://huggingface.co/models?filter=focalnet
]

@dataclass
class FocalNetEncoderOutput(ModelOutput):
    """
    FocalNet encoder's outputs, with potential hidden states.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` of shape `(batch_size, hidden_size, height, width)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
            include the spatial dimensions.
    """

    # Every field defaults to None, i.e. "not set"
    last_hidden_state: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    reshaped_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
# Dataclass describing the output of the FocalNet model; derives from `ModelOutput`.
@dataclass
class FocalNetModelOutput(ModelOutput):
    """
    FocalNet model's outputs that also contains a pooling of the last hidden states.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*, returned when `add_pooling_layer=True` is passed):
            Average pooling of the last layer hidden-state.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, hidden_size, height, width)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
            include the spatial dimensions.
    """

    # Hidden states at the output of the last layer
    last_hidden_state: torch.FloatTensor = None
    # Average pooling of the last hidden state; only returned when `add_pooling_layer=True`
    pooler_output: Optional[torch.FloatTensor] = None
    # Per-layer hidden states; only returned when hidden states are requested
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    # Per-layer hidden states reshaped to include the spatial dimensions; only returned when hidden states are requested
    reshaped_hidden_states: Optional[Tuple[torch.FloatTensor]] = None


@dataclass
class FocalNetMaskedImageModelingOutput(ModelOutput):
    """
    FocalNet masked image model outputs.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `bool_masked_pos` is provided):
            Masked image modeling (MIM) loss.
        reconstruction (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Reconstructed pixel values.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` of shape `(batch_size, hidden_size, height, width)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
            include the spatial dimensions.
    """

    # Masked image modeling loss; stays None unless it is set
    loss: Optional[torch.FloatTensor] = None
    # Reconstructed pixel values
    reconstruction: torch.FloatTensor = None
    # Per-layer hidden states, including the initial embedding output
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    # Per-layer hidden states reshaped to include the spatial dimensions
    reshaped_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
@dataclass
class FocalNetImageClassifierOutput(ModelOutput):
    """
    FocalNet outputs for image classification.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Classification (or regression if config.num_labels==1) loss.
        logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, hidden_size, height, width)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
            include the spatial dimensions.
    """

    loss: Optional[torch.FloatTensor] = None  # classification (or regression) loss
    logits: torch.FloatTensor = None  # classification (or regression) scores, before SoftMax
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None  # per-layer hidden states incl. the embedding output
    reshaped_hidden_states: Optional[Tuple[torch.FloatTensor]] = None  # same, reshaped to include spatial dims

class FocalNetEmbeddings(nn.Module):
    """
    Construct the patch embeddings and layernorm. Optionally, also the mask token.
    """

    def __init__(self, config, use_mask_token=False):
        super().__init__()

        # Stem patch embedding built from the configuration values
        self.patch_embeddings = FocalNetPatchEmbeddings(
            config=config,
            image_size=config.image_size,
            patch_size=config.patch_size,
            num_channels=config.num_channels,
            embed_dim=config.embed_dim,
            use_conv_embed=config.use_conv_embed,
            is_stem=True,
        )
        self.patch_grid = self.patch_embeddings.grid_size
        # Learnable mask token, only created on request; it is required when `bool_masked_pos` is passed to `forward`
        self.mask_token = nn.Parameter(torch.zeros(1, 1, config.embed_dim)) if use_mask_token else None

        self.norm = nn.LayerNorm(config.embed_dim, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(
        self, pixel_values: Optional[torch.FloatTensor], bool_masked_pos: Optional[torch.BoolTensor] = None
    ) -> Tuple[torch.Tensor]:
        """
        Embed the pixel values, normalize them and optionally replace masked positions by the mask token.

        Args:
            pixel_values (`torch.FloatTensor`):
                Input forwarded to the patch embedding module.
            bool_masked_pos (`torch.BoolTensor`, *optional*):
                Mask over the embedded sequence; positions set to `True` are replaced by the mask token.
                Assumed to have shape `(batch_size, sequence_length)` - it is unsqueezed on the last dimension
                and broadcast against the embeddings.

        Returns:
            Tuple of the embeddings after dropout and the output dimensions reported by the patch embedding module.
        """
        embeddings, output_dimensions = self.patch_embeddings(pixel_values)
        embeddings = self.norm(embeddings)
        batch_size, seq_len, _ = embeddings.size()

        if bool_masked_pos is not None:
            mask_tokens = self.mask_token.expand(batch_size, seq_len, -1)
            # Cast the boolean mask to the dtype of the mask tokens so it can be used as a 0/1 blend weight
            mask = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens)
            # Keep unmasked embeddings, substitute the mask token at masked positions
            embeddings = embeddings * (1.0 - mask) + mask_tokens * mask

        embeddings = self.dropout(embeddings)
        return embeddings, output_dimensions
class FocalNetPatchEmbeddings(nn.Module):
    """
    Turn a `(batch_size, num_channels, height, width)` input into a sequence of embeddings with a strided
    convolution, optionally followed by a LayerNorm.
    """

    def __init__(
        self,
        config,
        image_size,
        patch_size,
        num_channels,
        embed_dim,
        add_norm=False,
        use_conv_embed=False,
        is_stem=False,
    ):
        super().__init__()
        # Scalars are expanded to (value, value); iterables are kept as given
        if not isinstance(image_size, collections.abc.Iterable):
            image_size = (image_size, image_size)
        if not isinstance(patch_size, collections.abc.Iterable):
            patch_size = (patch_size, patch_size)

        rows = image_size[0] // patch_size[0]
        cols = image_size[1] // patch_size[1]

        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.num_patches = cols * rows
        self.grid_size = (rows, cols)

        # Convolution geometry: patch-sized kernel and stride by default; with `use_conv_embed` a 7x7/stride-4
        # convolution for the stem and a 3x3/stride-2 convolution otherwise
        if not use_conv_embed:
            conv_options = {"kernel_size": patch_size, "stride": patch_size}
        elif is_stem:
            conv_options = {"kernel_size": 7, "stride": 4, "padding": 2}
        else:
            conv_options = {"kernel_size": 3, "stride": 2, "padding": 1}
        self.projection = nn.Conv2d(num_channels, embed_dim, **conv_options)

        # Optional normalization of the embeddings
        self.norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) if add_norm else None

    def maybe_pad(self, pixel_values, height, width):
        """Zero-pad on the right/bottom so that width and height become multiples of the patch size."""
        patch_height = self.patch_size[0]
        patch_width = self.patch_size[1]

        leftover_width = width % patch_width
        if leftover_width != 0:
            # (left, right) padding of the last dimension
            pixel_values = nn.functional.pad(pixel_values, (0, patch_width - leftover_width))

        leftover_height = height % patch_height
        if leftover_height != 0:
            # (left, right, top, bottom) padding of the last two dimensions
            pixel_values = nn.functional.pad(pixel_values, (0, 0, 0, patch_height - leftover_height))

        return pixel_values

    def forward(self, pixel_values: Optional[torch.FloatTensor]) -> Tuple[torch.Tensor, Tuple[int]]:
        """
        Embed `pixel_values`.

        Returns:
            The embeddings of shape `(batch_size, out_height * out_width, embed_dim)` and the tuple
            `(out_height, out_width)` of the projected feature map.

        Raises:
            ValueError: If the channel dimension of `pixel_values` differs from `num_channels`.
        """
        _, input_channels, input_height, input_width = pixel_values.shape
        if input_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
            )

        padded = self.maybe_pad(pixel_values, input_height, input_width)
        feature_map = self.projection(padded)
        _, _, out_height, out_width = feature_map.shape
        output_dimensions = (out_height, out_width)

        # (batch, dim, h, w) -> (batch, dim, h * w) -> (batch, h * w, dim)
        embeddings = feature_map.flatten(2)
        embeddings = embeddings.transpose(1, 2)

        if self.norm is None:
            return embeddings, output_dimensions
        return self.norm(embeddings), output_dimensions


# Adapted from transformers.models.beit.modeling_beit.drop_path
def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
    """
    Apply stochastic depth: zero the whole input of randomly chosen samples and rescale the surviving ones.

    One random value is drawn per sample (first dimension) and broadcast over all remaining dimensions. Surviving
    samples are divided by the keep probability `1 - drop_prob`.

    Args:
        input (`torch.Tensor`): Tensor whose first dimension indexes the samples.
        drop_prob (`float`, *optional*, defaults to 0.0): Probability of dropping a sample.
        training (`bool`, *optional*, defaults to `False`): Dropping is only performed when this is `True`.

    Returns:
        `torch.Tensor`: `input` itself when `drop_prob == 0.0` or `training` is `False`, otherwise a new tensor.
    """
    if not training or drop_prob == 0.0:
        return input

    survival_rate = 1 - drop_prob
    # One entry per sample, singleton everywhere else, so the mask broadcasts over tensors of any rank
    mask_shape = (input.shape[0],) + (1,) * (input.ndim - 1)
    uniform = torch.rand(mask_shape, dtype=input.dtype, device=input.device)
    # floor(survival_rate + U[0, 1)) is 1 with probability `survival_rate` and 0 otherwise
    keep_mask = (survival_rate + uniform).floor_()
    return input.div(survival_rate) * keep_mask


# Adapted from transformers.models.beit.modeling_beit.BeitDropPath for FocalNet
class FocalNetDropPath(nn.Module):
    """Module wrapper around `drop_path`: per-sample stochastic depth for the main path of residual blocks."""

    def __init__(self, drop_prob: Optional[float] = None) -> None:
        super().__init__()
        self.drop_prob = drop_prob

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # Dropping only happens while the module is in training mode
        return drop_path(hidden_states, drop_prob=self.drop_prob, training=self.training)

    def extra_repr(self) -> str:
        # Shown inside the module's repr
        return f"p={self.drop_prob}"


class FocalNetModulation(nn.Module):
    """
    Focal modulation: a query is multiplied element-wise by a modulator that is aggregated from depth-wise
    convolutions of growing kernel size plus a globally pooled context, each weighted by a learned gate.
    """

    def __init__(self, config, index, dim, focal_factor=2, bias=True, projection_dropout=0.0):
        super().__init__()

        self.dim = dim
        self.focal_window = config.focal_windows[index]
        self.focal_level = config.focal_levels[index]
        self.focal_factor = focal_factor
        self.use_post_layernorm_in_modulation = config.use_post_layernorm_in_modulation
        self.normalize_modulator = config.normalize_modulator

        # One gate per focal level plus one for the global context
        num_gates = self.focal_level + 1
        # A single linear layer produces query, context and gates at once
        self.projection_in = nn.Linear(dim, 2 * dim + num_gates, bias=bias)
        self.projection_context = nn.Conv2d(dim, dim, kernel_size=1, stride=1, bias=bias)

        self.activation = nn.GELU()
        self.projection_out = nn.Linear(dim, dim)
        self.projection_dropout = nn.Dropout(projection_dropout)

        # Kernel size grows by `focal_factor` per level, starting at `focal_window`
        self.kernel_sizes = [self.focal_factor * level + self.focal_window for level in range(self.focal_level)]
        # Depth-wise (groups=dim) convolution followed by GELU for every level; padding keeps the spatial size
        self.focal_layers = nn.ModuleList(
            [
                nn.Sequential(
                    nn.Conv2d(dim, dim, kernel_size=size, stride=1, groups=dim, padding=size // 2, bias=False),
                    nn.GELU(),
                )
                for size in self.kernel_sizes
            ]
        )

        if self.use_post_layernorm_in_modulation:
            self.layernorm = nn.LayerNorm(dim, eps=config.layer_norm_eps)

    def forward(self, hidden_state):
        """
        Args:
            hidden_state:
                Input features with shape of (batch_size, height, width, num_channels)

        Returns:
            Modulated features with the same layout as the input. The gates and the modulator of the call are
            kept on the module as `self.gates` and `self.modulator`.
        """
        num_channels = hidden_state.shape[-1]

        # Pre linear projection, then channels-first: (batch, 2 * channels + gates, height, width)
        projected = self.projection_in(hidden_state).permute(0, 3, 1, 2).contiguous()
        split_sizes = (num_channels, num_channels, self.focal_level + 1)
        query, context, self.gates = torch.split(projected, split_sizes, 1)

        # Context aggregation: every level refines the previous context and adds its gated contribution
        aggregated = 0
        for level, focal_layer in enumerate(self.focal_layers):
            context = focal_layer(context)
            aggregated = aggregated + context * self.gates[:, level : level + 1]

        # Global context: spatial mean of the last context, gated by the last gate
        pooled = context.mean(2, keepdim=True).mean(3, keepdim=True)
        aggregated = aggregated + self.activation(pooled) * self.gates[:, self.focal_level :]

        # Optionally average over the number of aggregated terms
        if self.normalize_modulator:
            aggregated = aggregated / (self.focal_level + 1)

        # Focal modulation, then back to channels-last
        self.modulator = self.projection_context(aggregated)
        modulated = (query * self.modulator).permute(0, 2, 3, 1).contiguous()
        if self.use_post_layernorm_in_modulation:
            modulated = self.layernorm(modulated)

        # Post linear projection and dropout
        return self.projection_dropout(self.projection_out(modulated))
# One FocalNet block: focal modulation followed by an MLP, each wrapped in a residual connection
class FocalNetLayer(nn.Module):
    r"""Focal Modulation Network layer (block).

    Args:
        config (`FocalNetConfig`):
            Model config.
        index (`int`):
            Layer index.
        dim (`int`):
            Number of input channels.
        input_resolution (`Tuple[int]`):
            Input resolution.
        drop_path (`float`, *optional*, defaults to 0.0):
            Stochastic depth rate.
    """

    def __init__(self, config, index, dim, input_resolution, drop_path=0.0):
        super().__init__()

        self.config = config

        # Layer-specific settings
        self.dim = dim
        self.input_resolution = input_resolution

        # Settings shared through the config
        self.drop = config.hidden_dropout_prob
        self.use_post_layernorm = config.use_post_layernorm

        # Normalization used around the modulation
        self.norm1 = nn.LayerNorm(dim, eps=config.layer_norm_eps)
        self.modulation = FocalNetModulation(
            config=config,
            index=index,
            dim=dim,
            projection_dropout=self.drop,
        )

        # Stochastic depth is only instantiated for a positive rate
        if drop_path > 0.0:
            self.drop_path = FocalNetDropPath(drop_path)
        else:
            self.drop_path = nn.Identity()

        # Normalization used around the MLP
        self.norm2 = nn.LayerNorm(dim, eps=config.layer_norm_eps)
        self.mlp = FocalNetMlp(
            config=config,
            in_features=dim,
            hidden_features=int(dim * config.mlp_ratio),
            drop=self.drop,
        )

        # Layerscale factors: learnable per-channel vectors when enabled, the constant 1.0 otherwise
        if config.use_layerscale:
            self.gamma_1 = nn.Parameter(config.layerscale_value * torch.ones((dim)), requires_grad=True)
            self.gamma_2 = nn.Parameter(config.layerscale_value * torch.ones((dim)), requires_grad=True)
        else:
            self.gamma_1 = 1.0
            self.gamma_2 = 1.0

    def forward(self, hidden_state, input_dimensions):
        """
        Args:
            hidden_state: Tensor of shape `(batch_size, height * width, num_channels)`.
            input_dimensions: The `(height, width)` used to restore the spatial layout for the modulation.

        Returns:
            Tensor with the same shape as `hidden_state`.
        """
        height, width = input_dimensions
        batch_size, _, num_channels = hidden_state.shape
        residual = hidden_state

        # Focal modulation; `norm1` is applied before it (pre-norm) or after it (post-norm)
        if not self.use_post_layernorm:
            hidden_state = self.norm1(hidden_state)
        hidden_state = hidden_state.view(batch_size, height, width, num_channels)
        hidden_state = self.modulation(hidden_state).view(batch_size, height * width, num_channels)
        if self.use_post_layernorm:
            hidden_state = self.norm1(hidden_state)
        hidden_state = residual + self.drop_path(self.gamma_1 * hidden_state)

        # MLP; `norm2` follows the same pre-/post-norm choice
        if self.use_post_layernorm:
            mlp_output = self.norm2(self.mlp(hidden_state))
        else:
            mlp_output = self.mlp(self.norm2(hidden_state))

        return hidden_state + self.drop_path(self.gamma_2 * mlp_output)
# 定义 FocalNetStage 类，继承自 nn.Module，用于 FocalNet 的每个阶段处理
class FocalNetStage(nn.Module):
    """One FocalNet stage: a stack of `FocalNetLayer` modules plus an optional downsampling module.

    Args:
        config: Configuration object. This class reads `depths`, `embed_dim`,
            `drop_path_rate` and `use_conv_embed` from it and forwards it to the sub-modules.
        index: Position of this stage within `config.depths`.
        input_resolution: Passed to every layer as `input_resolution` and to the
            downsampling module as `image_size`.
    """

    def __init__(self, config, index, input_resolution):
        super().__init__()

        self.config = config
        self.num_stages = len(config.depths)

        # The channel width doubles from one stage to the next.
        stage_widths = [config.embed_dim * (2**i) for i in range(self.num_stages)]
        has_next_stage = index < self.num_stages - 1
        dim = stage_widths[index]
        out_dim = stage_widths[index + 1] if has_next_stage else None

        # One drop-path rate per layer of the whole network, linearly spaced from 0 to
        # `config.drop_path_rate`; this stage uses the slice that belongs to its own layers.
        all_rates = torch.linspace(0, config.drop_path_rate, sum(config.depths)).tolist()
        first = sum(config.depths[:index])
        last = sum(config.depths[: index + 1])
        stage_rates = all_rates[first:last]

        blocks = []
        for layer_idx in range(config.depths[index]):
            blocks.append(
                FocalNetLayer(
                    config=config,
                    index=index,
                    dim=dim,
                    input_resolution=input_resolution,
                    drop_path=stage_rates[layer_idx],
                )
            )
        self.layers = nn.ModuleList(blocks)

        # Every stage except the last one ends with a patch-embedding module (patch size 2)
        # that maps `dim` channels to the next stage's width.
        if has_next_stage:
            self.downsample = FocalNetPatchEmbeddings(
                config=config,
                image_size=input_resolution,
                patch_size=2,
                num_channels=dim,
                embed_dim=out_dim,
                add_norm=True,
                use_conv_embed=config.use_conv_embed,
                is_stem=False,
            )
        else:
            self.downsample = None

        self.pointing = False

    def forward(self, hidden_states: torch.Tensor, input_dimensions: Tuple[int, int]) -> Tuple[torch.Tensor]:
        """Run all layers, then the downsampling module when there is one.

        Args:
            hidden_states: Tensor handed to each layer in turn together with `input_dimensions`.
            input_dimensions: ``(height, width)`` of the token grid.

        Returns:
            A 3-tuple ``(hidden_states, hidden_states_before_downsampling, output_dimensions)``.
            Without a downsampling module the first two entries are the same tensor and
            `output_dimensions` is ``(height, width, height, width)``; otherwise the first and
            third entries are whatever the downsampling module returns.
        """
        height, width = input_dimensions
        for layer_module in self.layers:
            hidden_states = layer_module(hidden_states, input_dimensions)

        hidden_states_before_downsampling = hidden_states
        if self.downsample is None:
            output_dimensions = (height, width, height, width)
        else:
            # The downsampling module is fed a channels-first (batch, channels, height, width) map.
            batch_size = hidden_states_before_downsampling.shape[0]
            feature_map = hidden_states.transpose(1, 2).reshape(batch_size, -1, height, width)
            hidden_states, output_dimensions = self.downsample(feature_map)

        return (hidden_states, hidden_states_before_downsampling, output_dimensions)
    # 初始化方法，用于创建 FocalNet 对象实例
    def __init__(self, config, grid_size):
        """Build one `FocalNetStage` per entry of `config.depths`.

        Args:
            config: Configuration object; `depths` sets the number of stages and the object is
                forwarded to every stage.
            grid_size: Indexable pair; stage ``i`` receives
                ``(grid_size[0] // 2**i, grid_size[1] // 2**i)`` as its input resolution.
        """
        super().__init__()
        self.num_stages = len(config.depths)
        self.config = config

        stage_modules = []
        for stage_index in range(self.num_stages):
            shrink = 2**stage_index
            stage_modules.append(
                FocalNetStage(
                    config=config,
                    index=stage_index,
                    input_resolution=(grid_size[0] // shrink, grid_size[1] // shrink),
                )
            )
        self.stages = nn.ModuleList(stage_modules)

        # Gradient checkpointing starts disabled; `forward` consults this flag.
        self.gradient_checkpointing = False

    # 前向传播方法，接收隐藏状态张量、输入维度、可选输出隐藏状态标志、可选输出前采样隐藏状态标志和返回字典标志
    def forward(
        self,
        hidden_states: torch.Tensor,
        input_dimensions: Tuple[int, int],
        output_hidden_states: Optional[bool] = False,
        output_hidden_states_before_downsampling: Optional[bool] = False,
        return_dict: Optional[bool] = True,
    ) -> Union[Tuple, FocalNetEncoderOutput]:
        """Run every stage in order, optionally recording per-stage hidden states.

        Args:
            hidden_states: Rank-3 tensor; its shape is unpacked as
                ``(batch_size, sequence_length, hidden_size)`` when states are recorded.
            input_dimensions: ``(height, width)`` of the token grid fed to the first stage.
            output_hidden_states: When truthy, record the input plus one state per stage, both
                as-is and rearranged to ``(batch, channels, height, width)``.
            output_hidden_states_before_downsampling: When truthy (and states are recorded),
                record each stage's pre-downsampling output instead of its final output.
            return_dict: When falsy, return a plain tuple instead of `FocalNetEncoderOutput`.

        Returns:
            `FocalNetEncoderOutput`, or a tuple of the non-``None`` values among
            ``(last hidden state, recorded hidden states)``. The tuple form does not include
            the reshaped states.
        """

        def to_feature_map(states, spatial_dims):
            # rearrange b (h w) c -> b c h w
            batch_size, _, hidden_size = states.shape
            return states.view(batch_size, *spatial_dims, hidden_size).permute(0, 3, 1, 2)

        recorded_states = [] if output_hidden_states else None
        recorded_maps = [] if output_hidden_states else None

        if output_hidden_states:
            recorded_states.append(hidden_states)
            recorded_maps.append(to_feature_map(hidden_states, input_dimensions))

        for stage_module in self.stages:
            if self.gradient_checkpointing and self.training:
                stage_outputs = self._gradient_checkpointing_func(
                    stage_module.__call__,
                    hidden_states,
                    input_dimensions,
                )
            else:
                stage_outputs = stage_module(hidden_states, input_dimensions)

            hidden_states = stage_outputs[0]
            hidden_states_before_downsampling = stage_outputs[1]
            output_dimensions = stage_outputs[2]

            # The last two entries of `output_dimensions` become the grid size for the next stage.
            input_dimensions = (output_dimensions[-2], output_dimensions[-1])

            if not output_hidden_states:
                continue

            if output_hidden_states_before_downsampling:
                # The pre-downsampling state is reshaped with the first two entries instead.
                state = hidden_states_before_downsampling
                spatial_dims = (output_dimensions[0], output_dimensions[1])
            else:
                state = hidden_states
                spatial_dims = input_dimensions
            recorded_states.append(state)
            recorded_maps.append(to_feature_map(state, spatial_dims))

        all_hidden_states = tuple(recorded_states) if output_hidden_states else None
        all_reshaped_hidden_states = tuple(recorded_maps) if output_hidden_states else None

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states] if v is not None)

        return FocalNetEncoderOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            reshaped_hidden_states=all_reshaped_hidden_states,
        )
# 从transformers.models.swin.modeling_swin.SwinPreTrainedModel复制过来，并将Swin->FocalNet，swin->focalnet
class FocalNetPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    # Class-level settings read by the `PreTrainedModel` base class.
    config_class = FocalNetConfig
    base_model_prefix = "focalnet"
    main_input_name = "pixel_values"
    supports_gradient_checkpointing = True

    def _init_weights(self, module):
        """Initialize the weights of a single sub-module in place.

        `nn.Linear` and `nn.Conv2d` weights are drawn from a normal distribution with mean 0 and
        standard deviation `config.initializer_range`, and their bias (when present) is zeroed.
        `nn.LayerNorm` gets a zero bias and unit weight. Any other module is left untouched.
        """
        if isinstance(module, (nn.Linear, nn.Conv2d)):
            # Slightly different from the TF version which uses truncated_normal for initialization
            # cf https://github.com/pytorch/pytorch/pull/5617
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
            return

        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)


# Docstring fragment with the general usage note and the `config` parameter description; it is
# passed to `add_start_docstrings` on the model classes in this file.
FOCALNET_START_DOCSTRING = r"""
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
    it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`FocalNetConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

# Docstring fragment describing the forward inputs (`pixel_values`, `output_hidden_states`,
# `return_dict`); it is passed to `add_start_docstrings_to_model_forward` on the `forward` methods.
FOCALNET_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
            [`AutoImageProcessor.__call__`] for details.

        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""

# 使用@add_start_docstrings注解，将FocalNetModel的文档字符串合并生成
@add_start_docstrings(
    "The bare FocalNet Model outputting raw hidden-states without any specific head on top.",
    FOCALNET_START_DOCSTRING,
)
class FocalNetModel(FocalNetPreTrainedModel):
    # Embeddings -> encoder -> final LayerNorm, with an optional average-pooling head.

    def __init__(self, config, add_pooling_layer=True, use_mask_token=False):
        # config: model configuration; `depths`, `embed_dim` and `layer_norm_eps` are read here.
        # add_pooling_layer: when true, an `nn.AdaptiveAvgPool1d(1)` pooler is created.
        # use_mask_token: forwarded to `FocalNetEmbeddings`.
        super().__init__(config)
        self.config = config
        self.num_stages = len(config.depths)
        # Width of the last stage: the embedding width doubled once per additional stage.
        self.num_features = int(config.embed_dim * 2 ** (self.num_stages - 1))

        self.embeddings = FocalNetEmbeddings(config, use_mask_token=use_mask_token)
        self.encoder = FocalNetEncoder(config, self.embeddings.patch_grid)

        self.layernorm = nn.LayerNorm(self.num_features, eps=config.layer_norm_eps)
        if add_pooling_layer:
            self.pooler = nn.AdaptiveAvgPool1d(1)
        else:
            self.pooler = None

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        # The input embedding module is the patch-embedding sub-module of `self.embeddings`.
        return self.embeddings.patch_embeddings

    @add_start_docstrings_to_model_forward(FOCALNET_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=FocalNetModelOutput,
        config_class=_CONFIG_FOR_DOC,
        modality="vision",
        expected_output=_EXPECTED_OUTPUT_SHAPE,
    )
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        bool_masked_pos: Optional[torch.BoolTensor] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, FocalNetModelOutput]:
        r"""
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        """
        # Arguments left as None fall back to the values stored on the config.
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states
        if return_dict is None:
            return_dict = self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        embedding_output, input_dimensions = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos)

        encoder_outputs = self.encoder(
            embedding_output,
            input_dimensions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # The first encoder output is the last hidden state; normalize it.
        sequence_output = self.layernorm(encoder_outputs[0])

        # Pool over the sequence axis (moved last by the transpose), then flatten to 2-D.
        if self.pooler is None:
            pooled_output = None
        else:
            pooled_output = torch.flatten(self.pooler(sequence_output.transpose(1, 2)), 1)

        if return_dict:
            return FocalNetModelOutput(
                last_hidden_state=sequence_output,
                pooler_output=pooled_output,
                hidden_states=encoder_outputs.hidden_states,
                reshaped_hidden_states=encoder_outputs.reshaped_hidden_states,
            )

        # Tuple form: (sequence_output, pooled_output) followed by the remaining encoder outputs.
        return (sequence_output, pooled_output) + encoder_outputs[1:]
@add_start_docstrings(
    """
    FocalNet Model with a decoder on top for masked image modeling.

    This follows the same implementation as in [SimMIM](https://arxiv.org/abs/2111.09886).

    <Tip>

    Note that we provide a script to pre-train this model on custom data in our [examples
    directory](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-pretraining).

    </Tip>
    """,
    FOCALNET_START_DOCSTRING,
)
class FocalNetForMaskedImageModeling(FocalNetPreTrainedModel):
    """
    FocalNet Model for masked image modeling, extending FocalNetPreTrainedModel.

    Inherits from FocalNetPreTrainedModel and implements a model architecture with a decoder.
    """

    def __init__(self, config):
        """Create the FocalNet encoder and the pixel-reconstruction decoder.

        Args:
            config: FocalNet configuration instance; `depths`, `embed_dim`, `encoder_stride`
                and `num_channels` are read here.
        """
        super().__init__(config)

        # Backbone without a pooling head and with the mask token enabled.
        self.focalnet = FocalNetModel(config, add_pooling_layer=False, use_mask_token=True)

        # Width of the last stage: the embedding width doubled once per additional stage.
        self.num_stages = len(config.depths)
        num_features = int(config.embed_dim * 2 ** (self.num_stages - 1))

        # Decoder: a 1x1 convolution producing `encoder_stride**2 * num_channels` channels,
        # followed by a pixel shuffle with upscale factor `encoder_stride`.
        to_pixels = nn.Conv2d(
            in_channels=num_features, out_channels=config.encoder_stride**2 * config.num_channels, kernel_size=1
        )
        upscale = nn.PixelShuffle(config.encoder_stride)
        self.decoder = nn.Sequential(to_pixels, upscale)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(FOCALNET_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=FocalNetMaskedImageModelingOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        bool_masked_pos: Optional[torch.BoolTensor] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,



@add_start_docstrings(
    """
    FocalNet Model with an image classification head on top (a linear layer on top of the pooled output) e.g. for
    ImageNet.
    """,
    FOCALNET_START_DOCSTRING,
)
class FocalNetForImageClassification(FocalNetPreTrainedModel):
    """
    FocalNet Model for image classification tasks, extending FocalNetPreTrainedModel.

    Inherits from FocalNetPreTrainedModel and implements a model architecture with an image classification head.
    """

    # Copied from transformers.models.swin.modeling_swin.SwinForImageClassification.__init__ with Swin->FocalNet, swin->focalnet
    def __init__(self, config):
        """Create the FocalNet backbone and the classification head.

        Args:
            config: FocalNet configuration instance; `num_labels` selects the head.
        """
        super().__init__(config)

        self.num_labels = config.num_labels
        self.focalnet = FocalNetModel(config)

        # A linear head when there is at least one label, otherwise a pass-through.
        if config.num_labels > 0:
            self.classifier = nn.Linear(self.focalnet.num_features, config.num_labels)
        else:
            self.classifier = nn.Identity()

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(FOCALNET_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_IMAGE_CLASS_CHECKPOINT,
        output_type=FocalNetImageClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
        expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
    )
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, FocalNetImageClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        if return_dict is None:
            return_dict = self.config.use_return_dict

        outputs = self.focalnet(
            pixel_values,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Index 1 of the backbone output is the pooled output.
        logits = self.classifier(outputs[1])

        loss = None
        if labels is not None:
            problem_type = self.config.problem_type
            if problem_type is None:
                # Infer the problem type once from the label count and label dtype, then
                # store it on the config so later calls reuse it.
                if self.num_labels == 1:
                    problem_type = "regression"
                elif self.num_labels > 1 and labels.dtype in (torch.long, torch.int):
                    problem_type = "single_label_classification"
                else:
                    problem_type = "multi_label_classification"
                self.config.problem_type = problem_type

            if problem_type == "regression":
                criterion = MSELoss()
                if self.num_labels == 1:
                    loss = criterion(logits.squeeze(), labels.squeeze())
                else:
                    loss = criterion(logits, labels)
            elif problem_type == "single_label_classification":
                criterion = CrossEntropyLoss()
                loss = criterion(logits.view(-1, self.num_labels), labels.view(-1))
            elif problem_type == "multi_label_classification":
                criterion = BCEWithLogitsLoss()
                loss = criterion(logits, labels)

        if return_dict:
            return FocalNetImageClassifierOutput(
                loss=loss,
                logits=logits,
                hidden_states=outputs.hidden_states,
                reshaped_hidden_states=outputs.reshaped_hidden_states,
            )

        # Tuple form: logits plus the backbone outputs from index 2 on, with the loss
        # prepended when one was computed.
        output = (logits,) + outputs[2:]
        return output if loss is None else (loss,) + output
@add_start_docstrings(
    """
    FocalNet backbone, to be used with frameworks like X-Decoder.
    """,
    FOCALNET_START_DOCSTRING,
)
class FocalNetBackbone(FocalNetPreTrainedModel, BackboneMixin):
    def __init__(self, config: FocalNetConfig):
        # Run both the pretrained-model and the backbone-mixin initialization.
        super().__init__(config)
        super()._init_backbone(config)

        # Feature widths: the embedding width followed by `config.hidden_sizes`.
        self.num_features = [config.embed_dim] + config.hidden_sizes
        self.focalnet = FocalNetModel(config)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(FOCALNET_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BackboneOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        pixel_values: torch.Tensor,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> BackboneOutput:
        """
        Returns:

        Examples:

        ```
        >>> from transformers import AutoImageProcessor, AutoBackbone
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> processor = AutoImageProcessor.from_pretrained("microsoft/focalnet-tiny-lrf")
        >>> model = AutoBackbone.from_pretrained("microsoft/focalnet-tiny-lrf")

        >>> inputs = processor(image, return_tensors="pt")
        >>> outputs = model(**inputs)
        ```"""
        # Arguments left as None fall back to the values stored on the config.
        if return_dict is None:
            return_dict = self.config.use_return_dict
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states

        # The inner model is always asked for hidden states and a dict-style output, because
        # the feature maps are selected from its reshaped hidden states.
        outputs = self.focalnet(pixel_values, output_hidden_states=True, return_dict=True)
        reshaped_states = outputs.reshaped_hidden_states

        # Keep the reshaped state of every stage whose name is listed in `self.out_features`.
        feature_maps = tuple(
            reshaped_states[idx] for idx, stage in enumerate(self.stage_names) if stage in self.out_features
        )

        if return_dict:
            return BackboneOutput(
                feature_maps=feature_maps,
                hidden_states=outputs.hidden_states if output_hidden_states else None,
                attentions=None,
            )

        if output_hidden_states:
            return (feature_maps, outputs.hidden_states)
        return (feature_maps,)

`.\models\focalnet\__init__.py`

# 导入必要的模块和函数
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available

# Map each submodule name to the public names it exports.
_import_structure = {"configuration_focalnet": ["FOCALNET_PRETRAINED_CONFIG_ARCHIVE_MAP", "FocalNetConfig"]}

# Register the modeling submodule only when `is_torch_available()` is true; otherwise the
# OptionalDependencyNotAvailable raised below is swallowed and the entry is left out.
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # torch is available: list the names exported by `modeling_focalnet`.
    _import_structure["modeling_focalnet"] = [
        "FOCALNET_PRETRAINED_MODEL_ARCHIVE_LIST",
        "FocalNetForImageClassification",
        "FocalNetForMaskedImageModeling",
        "FocalNetBackbone",
        "FocalNetModel",
        "FocalNetPreTrainedModel",
    ]

# When `TYPE_CHECKING` is true, import the names eagerly.
if TYPE_CHECKING:
    # Configuration names are always imported.
    from .configuration_focalnet import FOCALNET_PRETRAINED_CONFIG_ARCHIVE_MAP, FocalNetConfig

    # Same torch-availability guard as above, this time around the real imports.
    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # Modeling names are imported only when torch is available.
        from .modeling_focalnet import (
            FOCALNET_PRETRAINED_MODEL_ARCHIVE_LIST,
            FocalNetBackbone,
            FocalNetForImageClassification,
            FocalNetForMaskedImageModeling,
            FocalNetModel,
            FocalNetPreTrainedModel,
        )

# Otherwise (normal runtime import)
else:
    import sys

    # Replace this module's entry in `sys.modules` with a `_LazyModule` built from
    # `_import_structure`.
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)

`.\models\fsmt\configuration_fsmt.py`

"""
FSMT configuration
"""

from ...configuration_utils import PretrainedConfig  # 导入预训练配置类
from ...utils import logging  # 导入日志模块

logger = logging.get_logger(__name__)  # module-level logger named after this module

FSMT_PRETRAINED_CONFIG_ARCHIVE_MAP = {}  # archive map for pretrained FSMT configs; empty here

class DecoderConfig(PretrainedConfig):
    r"""
    Configuration class for FSMT's decoder specific things. note: this is a private helper class
    """

    model_type = "fsmt_decoder"

    def __init__(self, vocab_size=0, bos_token_id=0):
        """Run the base-class initializer without arguments, then store both values.

        Args:
            vocab_size: Stored as `self.vocab_size`.
            bos_token_id: Stored as `self.bos_token_id`.
        """
        super().__init__()
        self.vocab_size, self.bos_token_id = vocab_size, bos_token_id

class FSMTConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`FSMTModel`]. It is used to instantiate a FSMT
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the FSMT
    [facebook/wmt19-en-ru](https://huggingface.co/facebook/wmt19-en-ru) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Examples:

    ```
    >>> from transformers import FSMTConfig, FSMTModel

    >>> # Initializing a FSMT facebook/wmt19-en-ru style configuration
    >>> config = FSMTConfig()

    >>> # Initializing a model (with random weights) from the configuration
    >>> model = FSMTModel(config)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    """

    model_type = "fsmt"
    # Generic attribute names mapped onto the FSMT-specific attribute names.
    attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}

    # update the defaults from config file
    def __init__(
        self,
        langs=["en", "de"],
        src_vocab_size=42024,
        tgt_vocab_size=42024,
        activation_function="relu",
        d_model=1024,
        max_length=200,
        max_position_embeddings=1024,
        encoder_ffn_dim=4096,
        encoder_layers=12,
        encoder_attention_heads=16,
        encoder_layerdrop=0.0,
        decoder_ffn_dim=4096,
        decoder_layers=12,
        decoder_attention_heads=16,
        decoder_layerdrop=0.0,
        attention_dropout=0.0,
        dropout=0.1,
        activation_dropout=0.0,
        init_std=0.02,
        decoder_start_token_id=2,
        is_encoder_decoder=True,
        scale_embedding=True,
        tie_word_embeddings=False,
        num_beams=5,
        length_penalty=1.0,
        early_stopping=False,
        use_cache=True,
        pad_token_id=1,
        bos_token_id=0,
        eos_token_id=2,
        forced_eos_token_id=2,
        **common_kwargs,
    ):
        """Store the FSMT-specific settings and forward the generic ones to `PretrainedConfig`.

        Args:
            langs: Language codes, stored as `self.langs`. A list argument (including the
                default) is copied so that instances never share one list object.
            src_vocab_size, tgt_vocab_size: Vocabulary sizes, stored as attributes.
                `tgt_vocab_size` is also used as the `vocab_size` of the nested `DecoderConfig`.
            activation_function, d_model, max_position_embeddings, init_std, scale_embedding,
            use_cache: Stored as attributes of the same name.
            encoder_ffn_dim, encoder_layers, encoder_attention_heads, encoder_layerdrop: Stored
                as attributes; `encoder_layers` is additionally stored as `num_hidden_layers`.
            decoder_ffn_dim, decoder_layers, decoder_attention_heads, decoder_layerdrop: Stored
                as attributes.
            attention_dropout, dropout, activation_dropout: Stored as attributes.
            max_length, num_beams, length_penalty, early_stopping, pad_token_id, bos_token_id,
            eos_token_id, decoder_start_token_id, forced_eos_token_id, is_encoder_decoder,
            tie_word_embeddings: Forwarded to `PretrainedConfig.__init__`.
            **common_kwargs: Extra keyword arguments forwarded to `PretrainedConfig.__init__`;
                a `"decoder"` entry is discarded because `self.decoder` is rebuilt here.
        """
        # Python evaluates the default `langs` list once, at function-definition time. Storing
        # it directly would make every default-constructed config share that one list, so a
        # mutation through one instance would leak into all others. Copy lists; leave any
        # other value exactly as it was passed.
        self.langs = list(langs) if isinstance(langs, list) else langs
        self.src_vocab_size = src_vocab_size
        self.tgt_vocab_size = tgt_vocab_size
        self.d_model = d_model  # encoder_embed_dim and decoder_embed_dim

        self.encoder_ffn_dim = encoder_ffn_dim
        self.encoder_layers = self.num_hidden_layers = encoder_layers
        self.encoder_attention_heads = encoder_attention_heads
        self.encoder_layerdrop = encoder_layerdrop
        self.decoder_layerdrop = decoder_layerdrop
        self.decoder_ffn_dim = decoder_ffn_dim
        self.decoder_layers = decoder_layers
        self.decoder_attention_heads = decoder_attention_heads
        self.max_position_embeddings = max_position_embeddings
        self.init_std = init_std
        self.activation_function = activation_function

        # NOTE(review): the nested decoder config receives `eos_token_id` as its `bos_token_id`.
        # This is kept exactly as in the original code; confirm it is intentional.
        self.decoder = DecoderConfig(vocab_size=tgt_vocab_size, bos_token_id=eos_token_id)
        # A "decoder" entry in the extra kwargs must not reach the base class, where it would
        # be handled as a generic keyword argument next to the object built above.
        common_kwargs.pop("decoder", None)

        self.scale_embedding = scale_embedding

        self.attention_dropout = attention_dropout
        self.activation_dropout = activation_dropout
        self.dropout = dropout

        self.use_cache = use_cache
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            decoder_start_token_id=decoder_start_token_id,
            is_encoder_decoder=is_encoder_decoder,
            tie_word_embeddings=tie_word_embeddings,
            forced_eos_token_id=forced_eos_token_id,
            max_length=max_length,
            num_beams=num_beams,
            length_penalty=length_penalty,
            early_stopping=early_stopping,
            **common_kwargs,
        )

`.\models\fsmt\convert_fsmt_original_pytorch_checkpoint_to_pytorch.py`

# coding=utf-8
# Copyright 2018 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Note: if you intend to run this script make sure you look under scripts/fsmt/
# to locate the appropriate script to do the work correctly. There is a set of scripts to:
# - download and prepare data and run the conversion script
# - perform eval to get the best hparam into the config
# - generate model_cards - useful if you have multiple models from the same paper

# 导入必要的库和模块
import argparse  # 用于解析命令行参数
import json  # 用于处理JSON格式数据
import os  # 用于操作系统相关的功能
import re  # 用于正则表达式操作
from collections import OrderedDict  # 导入OrderedDict，用于有序字典
from os.path import basename, dirname  # 导入basename和dirname函数，用于处理文件路径

import fairseq  # 导入fairseq库
import torch  # 导入PyTorch库
from fairseq import hub_utils  # 导入fairseq的hub_utils模块
from fairseq.data.dictionary import Dictionary  # 导入fairseq的Dictionary类

from transformers import FSMTConfig, FSMTForConditionalGeneration  # 导入transformers库中的FSMTConfig和FSMTForConditionalGeneration类
from transformers.models.fsmt.tokenization_fsmt import VOCAB_FILES_NAMES  # 导入transformers库中FSMT的tokenization_fsmt模块中的VOCAB_FILES_NAMES变量
from transformers.tokenization_utils_base import TOKENIZER_CONFIG_FILE  # 导入transformers库中的TOKENIZER_CONFIG_FILE变量
from transformers.utils import WEIGHTS_NAME, logging  # 导入transformers库中的WEIGHTS_NAME和logging模块

logging.set_verbosity_warning()  # only emit warnings and above from the transformers logger

json_indent = 2  # indent width used for every JSON file written by this script

# Best default generation hyper-parameters per model, chosen from a search over
# `num_beams`, `length_penalty` and `early_stopping` values on wmt19 test data.
# Looked up by output-folder name when the model config is written.
best_score_hparams = {
    # fairseq models:
    "wmt19-ru-en": {"length_penalty": 1.1},
    "wmt19-en-ru": {"length_penalty": 1.15},
    "wmt19-en-de": {"length_penalty": 1.0},
    "wmt19-de-en": {"length_penalty": 1.1},
    # allenai models:
    "wmt16-en-de-dist-12-1": {"length_penalty": 0.6},
    "wmt16-en-de-dist-6-1": {"length_penalty": 0.6},
    "wmt16-en-de-12-1": {"length_penalty": 0.8},
    "wmt19-de-en-6-6-base": {"length_penalty": 0.6},
    "wmt19-de-en-6-6-big": {"length_penalty": 0.6},
}

# Map each model name to the organization that published it.
org_names = {}
for m in ["wmt19-ru-en", "wmt19-en-ru", "wmt19-en-de", "wmt19-de-en"]:
    org_names[m] = "facebook"
for m in [
    "wmt16-en-de-dist-12-1",
    "wmt16-en-de-dist-6-1",
    "wmt16-en-de-12-1",
    "wmt19-de-en-6-6-base",
    "wmt19-de-en-6-6-big",
]:
    org_names[m] = "allenai"


def rewrite_dict_keys(d):
    """Rewrite BPE vocabulary keys from the ``@@`` continuation style to the ``</w>`` style.

    Keys ending in ``@@`` (a word piece that continues) lose that marker; every
    other key (a piece that ends a word) gets ``</w>`` appended. The four special
    tokens ``<s>``, ``<pad>``, ``</s>`` and ``<unk>`` are then restored to their
    original spelling and value.

    Args:
        d: mapping of token string to value (values are copied unchanged).

    Returns:
        A new dict with rewritten keys; ``d`` itself is not modified.

    Raises:
        KeyError: if one of the four special tokens is missing.
    """
    renamed = {}
    for token, value in d.items():
        if token.endswith("@@"):
            # (1) drop the word-breaking symbol
            new_token = re.sub(r"@@$", "", token)
        else:
            # (2) mark tokens that end a word
            new_token = re.sub(r"$", "</w>", token)
        renamed[new_token] = value

    # Special tokens must not carry the word-ending suffix.
    for special in ("<s>", "<pad>", "</s>", "<unk>"):
        del renamed[f"{special}</w>"]
        renamed[special] = d[special]  # restore the original entry

    return renamed
def convert_fsmt_checkpoint_to_pytorch(fsmt_checkpoint_path, pytorch_dump_folder_path):
    """Convert a fairseq FSMT checkpoint into a transformers-style model folder.

    The following files are written into ``pytorch_dump_folder_path``:
    ``vocab-src.json``, ``vocab-tgt.json``, the BPE merges file, ``config.json``,
    the tokenizer config and the model weights.

    Args:
        fsmt_checkpoint_path: path to the fairseq checkpoint file. Its directory must
            also contain ``dict.<lang>.txt`` for both languages and a BPE merges file
            named ``bpecodes`` (or ``code`` for older fairseq dumps).
        pytorch_dump_folder_path: output directory; created if it does not exist.

    Raises:
        AssertionError: if the checkpoint path does not exist, or the checkpoint was
            not trained with ``bpe=fastbpe`` and ``tokenizer=moses``.
        FileNotFoundError: if no BPE merges file is found next to the checkpoint.
    """
    # prep
    assert os.path.exists(fsmt_checkpoint_path)
    os.makedirs(pytorch_dump_folder_path, exist_ok=True)
    print(f"Writing results to {pytorch_dump_folder_path}")

    # handle various types of models

    checkpoint_file = basename(fsmt_checkpoint_path)
    fsmt_folder_path = dirname(fsmt_checkpoint_path)

    cls = fairseq.model_parallel.models.transformer.ModelParallelTransformerModel
    models = cls.hub_models()
    kwargs = {"bpe": "fastbpe", "tokenizer": "moses"}
    data_name_or_path = "."
    # note: the model dump is old and fairseq has since upgraded its models; loading goes
    # through a number of rewrites and splits of the saved weights, so torch.load() cannot
    # be used directly on the model file.
    # see: upgrade_state_dict(state_dict) in fairseq_model.py
    print(f"using checkpoint {checkpoint_file}")
    chkpt = hub_utils.from_pretrained(
        fsmt_folder_path, checkpoint_file, data_name_or_path, archive_map=models, **kwargs
    )

    # training arguments stored with the checkpoint
    args = vars(chkpt["args"]["model"])

    src_lang = args["source_lang"]
    tgt_lang = args["target_lang"]

    # only used for the upload instructions printed at the end
    data_root = dirname(pytorch_dump_folder_path)
    model_dir = basename(pytorch_dump_folder_path)

    # dicts
    src_dict_file = os.path.join(fsmt_folder_path, f"dict.{src_lang}.txt")
    tgt_dict_file = os.path.join(fsmt_folder_path, f"dict.{tgt_lang}.txt")

    # source vocabulary
    src_dict = Dictionary.load(src_dict_file)
    src_vocab = rewrite_dict_keys(src_dict.indices)
    src_vocab_size = len(src_vocab)
    src_vocab_file = os.path.join(pytorch_dump_folder_path, "vocab-src.json")
    print(f"Generating {src_vocab_file} of {src_vocab_size} of {src_lang} records")
    with open(src_vocab_file, "w", encoding="utf-8") as f:
        f.write(json.dumps(src_vocab, ensure_ascii=False, indent=json_indent))

    # detect whether this is a do_lower_case situation: enabled only when
    # str.islower() holds for every key of the source vocabulary
    do_lower_case = all(k.islower() for k in src_vocab)

    # target vocabulary
    tgt_dict = Dictionary.load(tgt_dict_file)
    tgt_vocab = rewrite_dict_keys(tgt_dict.indices)
    tgt_vocab_size = len(tgt_vocab)
    tgt_vocab_file = os.path.join(pytorch_dump_folder_path, "vocab-tgt.json")
    print(f"Generating {tgt_vocab_file} of {tgt_vocab_size} of {tgt_lang} records")
    with open(tgt_vocab_file, "w", encoding="utf-8") as f:
        f.write(json.dumps(tgt_vocab, ensure_ascii=False, indent=json_indent))

    # merges_file (bpecodes)
    merges_file = os.path.join(pytorch_dump_folder_path, VOCAB_FILES_NAMES["merges_file"])
    for fn in ["bpecodes", "code"]:  # older fairseq called the merges file "code"
        fsmt_merges_file = os.path.join(fsmt_folder_path, fn)
        if os.path.exists(fsmt_merges_file):
            break
    else:
        # fail with an explicit message instead of a confusing open() error on the last candidate
        raise FileNotFoundError(f"no BPE merges file ('bpecodes' or 'code') found in {fsmt_folder_path}")
    with open(fsmt_merges_file, encoding="utf-8") as fin:
        merges = fin.read()
    # remove the trailing frequency number from every line; `flags` is passed by keyword
    # because positional `count`/`flags` are deprecated in recent Python versions
    merges = re.sub(r" \d+$", "", merges, flags=re.M)
    print(f"Generating {merges_file}")
    with open(merges_file, "w", encoding="utf-8") as fout:
        fout.write(merges)

    # model config
    fsmt_model_config_file = os.path.join(pytorch_dump_folder_path, "config.json")

    # validate bpe/tokenizer config, as currently it's hardcoded to moses+fastbpe -
    # may have to modify the tokenizer if a different type is used by a future model
    assert args["bpe"] == "fastbpe", f"need to extend tokenizer to support bpe={args['bpe']}"
    assert args["tokenizer"] == "moses", f"need to extend tokenizer to support tokenizer={args['tokenizer']}"

    model_conf = {
        "architectures": ["FSMTForConditionalGeneration"],
        "model_type": "fsmt",
        "activation_dropout": args["activation_dropout"],
        "activation_function": "relu",
        "attention_dropout": args["attention_dropout"],
        "d_model": args["decoder_embed_dim"],
        "dropout": args["dropout"],
        "init_std": 0.02,
        "max_position_embeddings": args["max_source_positions"],
        "num_hidden_layers": args["encoder_layers"],
        "src_vocab_size": src_vocab_size,
        "tgt_vocab_size": tgt_vocab_size,
        "langs": [src_lang, tgt_lang],
        "encoder_attention_heads": args["encoder_attention_heads"],
        "encoder_ffn_dim": args["encoder_ffn_embed_dim"],
        "encoder_layerdrop": args["encoder_layerdrop"],
        "encoder_layers": args["encoder_layers"],
        "decoder_attention_heads": args["decoder_attention_heads"],
        "decoder_ffn_dim": args["decoder_ffn_embed_dim"],
        "decoder_layerdrop": args["decoder_layerdrop"],
        "decoder_layers": args["decoder_layers"],
        "bos_token_id": 0,
        "pad_token_id": 1,
        "eos_token_id": 2,
        "is_encoder_decoder": True,
        "scale_embedding": not args["no_scale_embedding"],
        "tie_word_embeddings": args["share_all_embeddings"],
    }

    # good hparam defaults to start with
    model_conf["num_beams"] = 5
    model_conf["early_stopping"] = False
    # use the tuned length penalty when one is recorded for this output folder name
    if model_dir in best_score_hparams and "length_penalty" in best_score_hparams[model_dir]:
        model_conf["length_penalty"] = best_score_hparams[model_dir]["length_penalty"]
    else:
        model_conf["length_penalty"] = 1.0

    print(f"Generating {fsmt_model_config_file}")
    with open(fsmt_model_config_file, "w", encoding="utf-8") as f:
        f.write(json.dumps(model_conf, ensure_ascii=False, indent=json_indent))

    # tokenizer config
    fsmt_tokenizer_config_file = os.path.join(pytorch_dump_folder_path, TOKENIZER_CONFIG_FILE)

    tokenizer_conf = {
        "langs": [src_lang, tgt_lang],
        "model_max_length": 1024,
        "do_lower_case": do_lower_case,
    }

    print(f"Generating {fsmt_tokenizer_config_file}")
    with open(fsmt_tokenizer_config_file, "w", encoding="utf-8") as f:
        f.write(json.dumps(tokenizer_conf, ensure_ascii=False, indent=json_indent))

    # model
    model = chkpt["models"][0]
    model_state_dict = model.state_dict()

    # rename keys to start with 'model.'
    model_state_dict = OrderedDict(("model." + k, v) for k, v in model_state_dict.items())

    # remove unneeded keys
    ignore_keys = [
        "model.model",
        "model.encoder.version",
        "model.decoder.version",
        "model.encoder_embed_tokens.weight",
        "model.decoder_embed_tokens.weight",
        "model.encoder.embed_positions._float_tensor",
        "model.decoder.embed_positions._float_tensor",
    ]
    for k in ignore_keys:
        model_state_dict.pop(k, None)

    # build a fresh model from the config written above and check that the weights load
    config = FSMTConfig.from_pretrained(pytorch_dump_folder_path)
    model_new = FSMTForConditionalGeneration(config)

    # check that it loads ok
    model_new.load_state_dict(model_state_dict, strict=False)

    # save
    pytorch_weights_dump_path = os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME)
    print(f"Generating {pytorch_weights_dump_path}")
    torch.save(model_state_dict, pytorch_weights_dump_path)

    print("Conversion is done!")
    print("\nLast step is to upload the files to s3")
    print(f"cd {data_root}")
    print(f"transformers-cli upload {model_dir}")
if __name__ == "__main__":
    # Command-line entry point: both paths are mandatory options.
    cli = argparse.ArgumentParser()

    # Required parameters
    cli.add_argument(
        "--fsmt_checkpoint_path",
        default=None,
        type=str,
        required=True,
        help=(
            "Path to the official PyTorch checkpoint file which is expected to reside in the dump dir with dicts,"
            " bpecodes, etc."
        ),
    )
    cli.add_argument(
        "--pytorch_dump_folder_path",
        default=None,
        type=str,
        required=True,
        help="Path to the output PyTorch model.",
    )

    options = cli.parse_args()

    # Run the conversion with the two parsed paths.
    convert_fsmt_checkpoint_to_pytorch(
        fsmt_checkpoint_path=options.fsmt_checkpoint_path,
        pytorch_dump_folder_path=options.pytorch_dump_folder_path,
    )

`.\models\fsmt\modeling_fsmt.py`

# coding=utf-8
# 设置文件编码为 UTF-8

# Copyright 2020 The Facebook AI Research Team Authors and The HuggingFace Inc. team.
# 版权声明，版权归Facebook AI Research Team Authors和HuggingFace Inc.团队所有

# Licensed under the Apache License, Version 2.0 (the "License");
# 使用Apache License Version 2.0许可协议，详见 http://www.apache.org/licenses/LICENSE-2.0

# you may not use this file except in compliance with the License.
# 除非遵守许可证的条款，否则不得使用此文件

# You may obtain a copy of the License at
# 您可以在上述链接获取许可证的副本

# http://www.apache.org/licenses/LICENSE-2.0
# 许可证详细信息

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# 除非适用法律要求或书面同意，否则按"原样"分发，无论是明示的还是暗示的

# See the License for the specific language governing permissions and
# limitations under the License.
# 详细了解许可协议的权限和限制

# Original implementation: https://github.com/pytorch/fairseq/tree/master/examples/wmt19
# 原始实现来源链接

# Authors:
# - @alexeib Alexei Baevski
# - @edunov Sergey Edunov
# - @michaelauli Michael Auli
# - @myleott Myle Ott
# - @nng555 Nathan Ng
# - David Grangier
# - Kyra Yee
# 各位作者姓名

# Paper: Facebook FAIR's WMT19 News Translation Task Submission https://arxiv.org/abs/1907.06616
# 相关论文

"""
PyTorch Fairseq model, ported from https://github.com/pytorch/fairseq/tree/master/examples/wmt19
PyTorch Fairseq模型，从https://github.com/pytorch/fairseq/tree/master/examples/wmt19 迁移而来
"""

import math
# 导入数学库

from typing import Any, Dict, List, Optional, Tuple, Union
# 导入类型提示相关库

import torch
# 导入PyTorch库

from torch import Tensor, nn
# 导入PyTorch的Tensor和神经网络模块

from torch.nn import CrossEntropyLoss, LayerNorm
# 导入交叉熵损失和层归一化模块

from ...activations import ACT2FN
# 导入激活函数映射模块

from ...integrations.deepspeed import is_deepspeed_zero3_enabled
# 导入DeepSpeed集成模块

from ...modeling_outputs import (
    BaseModelOutput,
    BaseModelOutputWithPastAndCrossAttentions,
    Seq2SeqLMOutput,
    Seq2SeqModelOutput,
)
# 导入模型输出相关类

from ...modeling_utils import PreTrainedModel
# 导入预训练模型工具类

from ...utils import (
    add_code_sample_docstrings,
    add_end_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
# 导入工具函数和日志模块

from .configuration_fsmt import FSMTConfig
# 导入FSMT配置类

logger = logging.get_logger(__name__)
# 获取日志记录器

_CHECKPOINT_FOR_DOC = "facebook/wmt19-ru-en"
# 模型检查点路径

_CONFIG_FOR_DOC = "FSMTConfig"
# FSMT配置类名称

# See all FSMT models at https://huggingface.co/models?filter=fsmt
# 查看所有FSMT模型的链接

# Porting notes:
# this one is modeled after BartModel*
# 本模型基于BartModel*进行了建模

# Currently only translation (fairseq also has weights for LM)
# 目前仅支持翻译（fairseq还具有语言模型的权重）

# fairseq provides weights for ru-en, en-ru and de-en, en-de pairs. All have been ported.
# fairseq提供了ru-en、en-ru和de-en、en-de等语言对的权重，所有这些都已经迁移

# - ru-en, en-ru use asymmetric vocab
# - de-en, en-de use a merged single vocab (but the code works as if they are separate)
# ru-en、en-ru使用非对称词汇，而de-en、en-de使用合并的单词表（但代码处理时像是分开处理）

# Differences with Bart:
# - not using bos token
# - 2 separate vocabs (src and target)
# - embed weights aren't tied
# - uses a model Ensemble (but that part isn't ported/implemented yet) - so we
#   aren't getting as good of a BLEU score
# - uses a projection layer at the end of the decoder
# - doesn't use final_logits_bias
# - beam search: stops as soon as num_beams == len(hypos) (whereas transformers
#   is not satisfied there and will continue searching until the next cycles
#   aren't promising something better), comparing BLEU scores - the transformers
#   algorithm is slightly superior, therefore using the latter. But if you want
#   to match fairseq outputs, you need to pass ``early_stopping=True`` to ``generate()``.
# 与Bart模型的区别：
# - 不使用bos标记
# - 有两个独立的词汇表（源和目标）
# - 嵌入权重不是绑定的
# - 使用模型集成（但该部分尚未迁移/实现），因此我们的BLEU分数不如预期
# - 在解码器末端使用投影层
# - 不使用final_logits_bias
# - 波束搜索：一旦num_beams == len(hypos)就停止（而transformers会继续搜索），比较BLEU分数- transformers算法稍优，因此使用后者。但如果您想要匹配fairseq的输出，需要向``generate()``传入``early_stopping=True``。
#
# SinusoidalPositionalEmbedding is slightly different from Bart's - generates
# different embeddings. This implementation is copied verbatim from fairseq with
# some small changes to make it work here.
#
# Other changes:
#  - doesn't support use_cache as Bart's version does
#
#
# FSMTConfig changes with BartConfig
#
#    Differences with BART:
#    - src/tgt vocabs aren't shared
#    - token embeddings aren't shared
#    - needs a language pair
#    - scale_embedding are True
#
#    some unused args were removed too
#
#
# TODO:
# - port model ensemble (fs uses 4 model checkpoints)
# - solve beam search discrepancies
# docstyle-ignore

"""
PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval.py facebook/wmt19-$PAIR $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --num_beams $NUM_BEAMS
# 运行一个命令行脚本，评估模型翻译效果并生成评估结果文件，包括源文件、目标文件、BLEU 分数等参数设置

# (fairseq BLEU: 43.1 http://matrix.statmt.org/matrix/output/1909?run_id=6862)
# 指示 fairseq 模型的 BLEU 分数及其详细信息的链接

"""


FSMT_START_DOCSTRING = r"""

    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`FSMTConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.

"""
# FSMT_START_DOCSTRING: shared documentation text covering inheritance and the `config` parameter.


FSMT_GENERATION_EXAMPLE = r"""
    Translation example::

    ```
    >>> from transformers import AutoTokenizer, FSMTForConditionalGeneration

    >>> mname = "facebook/wmt19-ru-en"
    >>> model = FSMTForConditionalGeneration.from_pretrained(mname)
    >>> tokenizer = AutoTokenizer.from_pretrained(mname)

    >>> src_text = "Машинное обучение - это здорово, не так ли?"
    >>> input_ids = tokenizer(src_text, return_tensors="pt").input_ids
    >>> outputs = model.generate(input_ids, num_beams=5, num_return_sequences=3)
    >>> tokenizer.decode(outputs[0], skip_special_tokens=True)
    "Machine learning is great, isn't it?"
    ```

"""
# FSMT_GENERATION_EXAMPLE: documentation snippet showing a ru->en translation with `generate()`.


# NOTE(review): the inputs docstring is empty in this excerpt - confirm whether its content was elided.
FSMT_INPUTS_DOCSTRING = r"""
"""


def invert_mask(attention_mask):
    """Return a boolean mask that is ``True`` exactly where ``attention_mask`` equals 0.

    For a 0/1 (or boolean) mask this flips it: 1 -> False, 0 -> True.

    Args:
        attention_mask: 2-dimensional tensor.

    Raises:
        AssertionError: if ``attention_mask`` is not 2-dimensional.
    """
    assert attention_mask.dim() == 2
    inverted = attention_mask == 0
    return inverted


def triu_onnx(x, diagonal=0):
    """Zero out everything below the given diagonal of ``x`` using basic tensor ops.

    Element ``(i, j)`` is kept when ``j >= i + diagonal`` and set to 0 otherwise.
    The mask is built as ``(n, n)`` with ``n = x.shape[0]``.

    Args:
        x: input tensor (the mask is sized from its first dimension only).
        diagonal: offset of the first kept diagonal; 0 is the main diagonal.

    Returns:
        A new tensor; ``x`` is not modified.
    """
    size = x.shape[0]
    col_idx = torch.arange(size, device=x.device)
    # column vector of row indices, shifted by the diagonal offset when one is given
    row_idx = col_idx.unsqueeze(-1)
    if diagonal:
        row_idx = row_idx + diagonal
    keep = col_idx.expand(size, size) >= row_idx
    return x.masked_fill(keep == 0, 0)


def _prepare_fsmt_decoder_inputs(
    config,
    input_ids,
    decoder_input_ids=None,
    decoder_padding_mask=None,
    causal_mask_dtype=torch.float32,
):
    """
    Prepare masks that ignore padding tokens in the decoder and a causal mask for the decoder if none are provided.
    This mimics the default behavior in fairseq. To override it pass in masks. Note: this is not called during
    generation

    Args:
        config: object exposing `pad_token_id`.
        input_ids: encoder input ids; only used to derive `decoder_input_ids` when those are not given.
        decoder_input_ids: optional 2-D decoder input ids.
        decoder_padding_mask: optional mask; when given it is passed through `invert_mask`, otherwise a mask is
            built from `decoder_input_ids` with `make_padding_mask`.
        causal_mask_dtype: dtype of the causal mask.

    Returns:
        Tuple `(decoder_input_ids, decoder_padding_mask, causal_mask)`.
    """
    pad_id = config.pad_token_id
    if decoder_input_ids is None:
        decoder_input_ids = shift_tokens_right(input_ids, pad_id)
    _, tgt_len = decoder_input_ids.size()

    if decoder_padding_mask is not None:
        decoder_padding_mask = invert_mask(decoder_padding_mask)
    else:
        decoder_padding_mask = make_padding_mask(decoder_input_ids, pad_id)

    # Square (tgt_len, tgt_len) matrix run through fill_with_neg_inf, then only the part strictly
    # above the main diagonal is kept; the result is moved to the decoder inputs' device.
    square = fill_with_neg_inf(torch.zeros(tgt_len, tgt_len, dtype=causal_mask_dtype))
    causal_mask = triu_onnx(square, 1).to(device=decoder_input_ids.device)

    return decoder_input_ids, decoder_padding_mask, causal_mask
class PretrainedFSMTModel(PreTrainedModel):
    """Common base for FSMT models: sets the config class, weight init and dummy inputs."""

    config_class = FSMTConfig
    base_model_prefix = "model"

    def _init_weights(self, module):
        """Initialise one sub-module according to ``config.init_std``.

        Linear and embedding weights are drawn from a normal distribution with that
        standard deviation; linear biases and the embedding row at ``padding_idx`` are
        zeroed. Sinusoidal positional embeddings are left untouched.
        """
        init_std = self.config.init_std

        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=init_std)
            if module.bias is not None:
                module.bias.data.zero_()
            return

        # Must be tested before nn.Embedding so these tables are never re-initialised.
        if isinstance(module, SinusoidalPositionalEmbedding):
            return

        if isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=init_std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()

    @property
    def dummy_inputs(self):
        """Small fixed batch of two sequences (the second one padded) on the model's device."""
        pad_token = self.config.pad_token_id
        input_ids = torch.tensor([[0, 6, 10, 4, 2], [0, 8, 12, 2, pad_token]], device=self.device)
        return {
            "attention_mask": input_ids.ne(pad_token),
            "input_ids": input_ids,
        }


def _make_linear_from_emb(emb):
    # 从嵌入层创建线性层
    vocab_size, emb_size = emb.weight.shape
    lin_layer = nn.Linear(vocab_size, emb_size, bias=False)
    # 将线性层的权重设为嵌入层的权重
    lin_layer.weight.data = emb.weight.data
    return lin_layer


# Helper Functions, mostly for making masks
def _check_shapes(shape_1, shape2):
    # 检查两个形状是否匹配，如果不匹配则引发错误
    if shape_1 != shape2:
        raise AssertionError(f"shape mismatch: {shape_1} != {shape2}")


def shift_tokens_right(input_ids, pad_token_id):
    """Shift input ids one token to the right, and wrap the last non pad token (usually <eos>).

    Any ``-100`` entries (as used in labels) are treated as padding. The caller's tensor
    is left untouched; a new tensor is returned.

    Args:
        input_ids: 2-D integer tensor of token ids, one sequence per row.
        pad_token_id: id of the padding token.

    Returns:
        Tensor of the same shape where position 0 holds each row's last non-pad token and
        positions 1.. hold the row shifted right by one.

    Note:
        Rows made up only of padding are not supported: the index of the last non-pad
        token would be -1.
    """
    # Replace possible -100 values by `pad_token_id`, out of place so that the
    # argument (often the labels tensor) is not modified behind the caller's back.
    input_ids = input_ids.masked_fill(input_ids == -100, pad_token_id)

    prev_output_tokens = input_ids.clone()
    # index of the last non-pad token in every row, shaped (batch, 1) for gather
    index_of_eos = (input_ids.ne(pad_token_id).sum(dim=1) - 1).unsqueeze(-1)
    # squeeze only the gathered column so a batch of one keeps its batch dimension
    prev_output_tokens[:, 0] = input_ids.gather(1, index_of_eos).squeeze(-1)
    prev_output_tokens[:, 1:] = input_ids[:, :-1]
    return prev_output_tokens


def make_padding_mask(input_ids, padding_idx=1):
    """Return a boolean mask that is True at pad positions, or ``None`` when nothing is padded.

    Args:
        input_ids: tensor of token ids.
        padding_idx: id of the padding token.
    """
    is_pad = input_ids.eq(padding_idx)
    return is_pad if is_pad.any() else None


# Helper Modules


class EncoderLayer(nn.Module):
    """One encoder block: self-attention then a feed-forward network, each with a residual and LayerNorm."""

    def __init__(self, config: FSMTConfig):
        super().__init__()
        self.embed_dim = config.d_model
        # self-attention and its normalisation
        self.self_attn = Attention(
            embed_dim=self.embed_dim,
            num_heads=config.encoder_attention_heads,
            dropout=config.attention_dropout,
        )
        self.self_attn_layer_norm = LayerNorm(self.embed_dim)
        # dropout rates and activation used in forward()
        self.dropout = config.dropout
        self.activation_fn = ACT2FN[config.activation_function]
        self.activation_dropout = config.activation_dropout
        # feed-forward network and its normalisation
        self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
        self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
        self.final_layer_norm = LayerNorm(self.embed_dim)

    def forward(self, x, encoder_padding_mask, layer_head_mask, output_attentions=False):
        """
        Args:
            x (`torch.Tensor`): input to the layer of shape *(seq_len, batch, embed_dim)*
            encoder_padding_mask (`torch.ByteTensor`): binary ByteTensor of shape
                *(batch, src_len)* where padding elements are indicated by `1`.
                for t_tgt, t_src is excluded (or masked out), =0 means it is
                included in attention
            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
                *(config.encoder_attention_heads,)*.

        Returns:
            Tuple of the encoded output of shape *(seq_len, batch, embed_dim)* and the attention weights
            returned by the self-attention module.
        """
        # Self-attention sub-block: attend, dropout, add the input back, normalise.
        attn_out, attn_weights = self.self_attn(
            query=x,
            key=x,
            key_padding_mask=encoder_padding_mask,
            layer_head_mask=layer_head_mask,
            output_attentions=output_attentions,
        )
        attn_out = nn.functional.dropout(attn_out, p=self.dropout, training=self.training)
        x = self.self_attn_layer_norm(x + attn_out)

        # Feed-forward sub-block: fc1 -> activation -> dropout -> fc2 -> dropout, residual, normalise.
        ffn_out = self.activation_fn(self.fc1(x))
        ffn_out = nn.functional.dropout(ffn_out, p=self.activation_dropout, training=self.training)
        ffn_out = self.fc2(ffn_out)
        ffn_out = nn.functional.dropout(ffn_out, p=self.dropout, training=self.training)
        x = self.final_layer_norm(x + ffn_out)

        return x, attn_weights
class FSMTEncoder(nn.Module):
    """
    Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a [`EncoderLayer`].

    Args:
        config: FSMTConfig
    """

    def __init__(self, config: FSMTConfig, embed_tokens):
        super().__init__()
        # dropout rates
        self.dropout = config.dropout
        self.layerdrop = config.encoder_layerdrop

        # token embedding and values derived from it
        self.padding_idx = embed_tokens.padding_idx
        self.embed_tokens = embed_tokens
        embed_dim = embed_tokens.embedding_dim
        if config.scale_embedding:
            self.embed_scale = math.sqrt(embed_dim)
        else:
            self.embed_scale = 1.0

        # positional table sized max_position_embeddings + padding_idx + 1
        num_positions = config.max_position_embeddings + self.padding_idx + 1
        self.embed_positions = SinusoidalPositionalEmbedding(num_positions, embed_dim, self.padding_idx)

        # stack of encoder layers
        self.layers = nn.ModuleList([EncoderLayer(config) for _ in range(config.encoder_layers)])

    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        inputs_embeds: torch.Tensor = None,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ):
        """
        Forward pass of the encoder.

        Args:
            input_ids: input token ids
            attention_mask: attention mask, optional
            inputs_embeds: pre-computed input embeddings, optional
            head_mask: attention head mask, optional
            output_attentions: whether to output attention weights
            output_hidden_states: whether to output hidden states
            return_dict: whether to return a dict-style output

        Returns:
            `None` as written here.
        """
        # NOTE(review): the forward logic is elided in this excerpt; the stub is kept as found.
        return None


class DecoderLayer(nn.Module):
    """One decoder block: self-attention, encoder-decoder attention and a feed-forward network.

    Each of the three sub-blocks is followed by dropout, a residual connection and LayerNorm.
    """

    def __init__(self, config: FSMTConfig):
        super().__init__()
        self.embed_dim = config.d_model

        # self-attention over the decoder inputs
        self.self_attn = Attention(
            embed_dim=self.embed_dim,
            num_heads=config.decoder_attention_heads,
            dropout=config.attention_dropout,
        )

        self.dropout = config.dropout
        self.activation_fn = ACT2FN[config.activation_function]
        self.activation_dropout = config.activation_dropout

        self.self_attn_layer_norm = LayerNorm(self.embed_dim)

        # attention over the encoder output
        self.encoder_attn = Attention(
            self.embed_dim,
            config.decoder_attention_heads,
            dropout=config.attention_dropout,
            encoder_decoder_attention=True,
        )

        self.encoder_attn_layer_norm = LayerNorm(self.embed_dim)

        # feed-forward network
        self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
        self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)
        self.final_layer_norm = LayerNorm(self.embed_dim)

    def forward(
        self,
        x,
        encoder_hidden_states,
        encoder_attn_mask=None,
        layer_state=None,
        causal_mask=None,
        layer_head_mask=None,
        cross_attn_layer_head_mask=None,
        decoder_padding_mask=None,
        output_attentions=False,
    ):
        """
        Run the decoder layer.

        Args:
            x: input tensor
            encoder_hidden_states: hidden states of the encoder, used as keys for cross attention
            encoder_attn_mask: key padding mask for the cross attention, optional
            layer_state: per-layer state dict shared with both attention modules; a fresh dict is
                created when `None`
            causal_mask: attention mask for the self attention, optional
            layer_head_mask: head mask for the self attention, optional
            cross_attn_layer_head_mask: head mask for the cross attention, optional
            decoder_padding_mask: key padding mask for the self attention, optional
            output_attentions: forwarded to both attention modules

        Returns:
            Tuple `(x, self_attn_weights, layer_state, cross_attn_weights)`.
        """
        residual = x

        if layer_state is None:
            layer_state = {}

        # Self attention
        x, self_attn_weights = self.self_attn(
            query=x,
            key=x,
            layer_state=layer_state,  # the attention module receives (and may update) the layer state
            key_padding_mask=decoder_padding_mask,
            attn_mask=causal_mask,
            layer_head_mask=layer_head_mask,
            output_attentions=output_attentions,
        )
        x = nn.functional.dropout(x, p=self.dropout, training=self.training)
        x = residual + x
        x = self.self_attn_layer_norm(x)

        # Cross attention
        residual = x
        # both attention modules share `layer_state`, so their cache keys must differ
        assert self.encoder_attn.cache_key != self.self_attn.cache_key
        x, cross_attn_weights = self.encoder_attn(
            query=x,
            key=encoder_hidden_states,
            key_padding_mask=encoder_attn_mask,
            layer_state=layer_state,  # same state dict as above
            layer_head_mask=cross_attn_layer_head_mask,
            output_attentions=output_attentions,
        )
        x = nn.functional.dropout(x, p=self.dropout, training=self.training)
        x = residual + x
        x = self.encoder_attn_layer_norm(x)

        # Fully connected
        residual = x
        x = self.activation_fn(self.fc1(x))
        x = nn.functional.dropout(x, p=self.activation_dropout, training=self.training)
        x = self.fc2(x)
        x = nn.functional.dropout(x, p=self.dropout, training=self.training)
        x = residual + x
        x = self.final_layer_norm(x)
        return (
            x,
            self_attn_weights,
            layer_state,  # returned so the caller can reuse it as a cache while decoding
            cross_attn_weights,
        )
class FSMTDecoder(nn.Module):
    """
    Transformer decoder built from *config.decoder_layers* stacked [`DecoderLayer`] modules.

    Args:
        config: FSMTConfig
        embed_tokens (nn.Embedding): output embedding
    """

    def __init__(self, config: FSMTConfig, embed_tokens: nn.Embedding):
        super().__init__()
        pad_id = embed_tokens.padding_idx

        # Plain hyper-parameters copied from the config / embedding.
        self.dropout = config.dropout
        self.layerdrop = config.decoder_layerdrop
        self.padding_idx = pad_id
        if config.scale_embedding:
            self.embed_scale = math.sqrt(config.d_model)
        else:
            self.embed_scale = 1.0

        # Sub-modules, registered in the same order as before.
        self.embed_tokens = embed_tokens
        self.embed_positions = SinusoidalPositionalEmbedding(
            config.max_position_embeddings + pad_id + 1, embed_tokens.embedding_dim, pad_id
        )
        stacked_layers = [DecoderLayer(config) for _ in range(config.decoder_layers)]
        self.layers = nn.ModuleList(stacked_layers)  # type: List[DecoderLayer]

        # Read the embedding matrix shape. Under DeepSpeed ZeRO-3 the shape is read inside a
        # GatheredParameters context (modifier_rank=None, i.e. read-only access).
        shared_weight = self.embed_tokens.weight
        if is_deepspeed_zero3_enabled():
            import deepspeed

            with deepspeed.zero.GatheredParameters(shared_weight, modifier_rank=None):
                weight_shape = shared_weight.shape
        else:
            weight_shape = shared_weight.shape

        # Bias-free projection whose weight is the very same Parameter as the token embedding.
        self.output_projection = nn.Linear(weight_shape[1], weight_shape[0], bias=False)
        self.output_projection.weight = shared_weight

    def forward(
        self,
        input_ids: torch.Tensor,
        encoder_hidden_states: torch.Tensor,
        encoder_padding_mask: torch.Tensor,
        decoder_padding_mask: torch.Tensor,
        decoder_causal_mask: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        use_cache: bool = False,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ):
        """
        Forward pass of the decoder (the implementation is omitted in this excerpt, so the stub returns `None`).

        Args:
            input_ids (torch.Tensor): Decoder input token ids.
            encoder_hidden_states (torch.Tensor): Encoder output attended to by the decoder.
            encoder_padding_mask (torch.Tensor): Padding mask for the encoder sequence.
            decoder_padding_mask (torch.Tensor): Padding mask for the decoder sequence.
            decoder_causal_mask (torch.Tensor): Causal (autoregressive) mask for decoder self-attention.
            head_mask (Optional[torch.Tensor]): Mask applied to self-attention heads.
            inputs_embeds (Optional[torch.Tensor]): Pre-computed input embeddings.
            cross_attn_head_mask (Optional[torch.Tensor]): Mask applied to cross-attention heads.
            past_key_values (Optional[List[torch.FloatTensor]]): Cached key/value states.
            use_cache (bool): Whether cached key/values are used.
            output_attentions (bool): Whether attention weights are returned.
            output_hidden_states (bool): Whether per-layer hidden states are returned.
            return_dict (bool): Whether a structured output is returned instead of a tuple.
        """
        # Body intentionally left out of this excerpt.
        return None


def _reorder_buffer(attn_cache, new_order):
    """
    Re-index every cached tensor along dimension 0 with `new_order`, in place.

    Args:
        attn_cache (Dict[str, Optional[torch.Tensor]]): Attention cache; `None` entries are left untouched.
        new_order (torch.Tensor): Index tensor passed to `index_select`.

    Returns:
        The same `attn_cache` dictionary object, with its non-`None` values replaced.
    """
    for name in attn_cache:
        cached = attn_cache[name]
        if cached is None:
            continue
        # Only existing keys are reassigned, so the dict size does not change while iterating.
        attn_cache[name] = cached.index_select(0, new_order)
    return attn_cache


class Attention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    # NOTE(review): in this excerpt the constructor appears twice — first as a documented stub, then
    # (after a stray `):`) as the actual attribute-initialising statements. The code is left as found.
    def __init__(
        self,
        embed_dim,
        num_heads,
        dropout=0.0,
        bias=True,
        encoder_decoder_attention=False,  # otherwise self_attention
    ):
        """
        Initializes the Attention module.

        Args:
            embed_dim (int): Dimensionality of input embeddings; must be divisible by `num_heads`.
            num_heads (int): Number of attention heads.
            dropout (float): Dropout probability.
            bias (bool): Whether to use bias in linear layers.
            encoder_decoder_attention (bool): Whether it's encoder-decoder attention or self-attention.
        """
        super().__init__()
        # Initialization details omitted
        pass
    ):
        super().__init__()  # run nn.Module's initializer
        self.embed_dim = embed_dim  # total embedding width
        self.num_heads = num_heads  # number of attention heads
        self.dropout = dropout  # dropout probability
        self.head_dim = embed_dim // num_heads  # width of a single head
        assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"  # reject widths that do not split evenly across heads
        self.scaling = self.head_dim**-0.5  # 1 / sqrt(head_dim)

        self.encoder_decoder_attention = encoder_decoder_attention  # True for cross-attention, False for self-attention
        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)  # linear projection, presumably for keys (usage is in forward, not shown here)
        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)  # linear projection, presumably for values (usage is in forward, not shown here)
        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)  # linear projection, presumably for queries (usage is in forward, not shown here)
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)  # final output projection
        self.cache_key = "encoder_decoder" if self.encoder_decoder_attention else "self"  # label that distinguishes cross-attention from self-attention (DecoderLayer asserts the two differ)

    def _shape(self, tensor, seq_len, bsz):
        return tensor.contiguous().view(seq_len, bsz * self.num_heads, self.head_dim).transpose(0, 1)  # view as (seq_len, bsz * num_heads, head_dim), then swap the first two dims -> (bsz * num_heads, seq_len, head_dim)

    def forward(
        self,
        query,
        key: Optional[Tensor],
        key_padding_mask: Optional[Tensor] = None,
        layer_state: Optional[Dict[str, Optional[Tensor]]] = None,
        attn_mask: Optional[Tensor] = None,
        layer_head_mask: Optional[Tensor] = None,
        output_attentions=False,
    # NOTE(review): the remainder of `forward` (closing parenthesis and body) is missing from this excerpt.
    def _use_saved_state(self, k, v, saved_state, key_padding_mask, static_kv, bsz):
        # Merge cached keys/values/padding mask from `saved_state` with the current ones.
        # With `static_kv` the cached tensors are used as-is; otherwise the current tensors are
        # appended to the cached ones along dim 1. Returns `(k, v, new_key_padding_mask)`.
        # saved states are stored with shape (bsz, num_heads, seq_len, head_dim)
        if "prev_key" in saved_state:
            _prev_key = saved_state["prev_key"]
            assert _prev_key is not None
            prev_key = _prev_key.view(bsz * self.num_heads, -1, self.head_dim)  # flatten batch and head dims of the cached key
            if static_kv:
                k = prev_key  # static keys: reuse the cache unchanged
            else:
                assert k is not None
                k = torch.cat([prev_key, k], dim=1)  # append the current keys after the cached ones
        if "prev_value" in saved_state:
            _prev_value = saved_state["prev_value"]
            assert _prev_value is not None
            prev_value = _prev_value.view(bsz * self.num_heads, -1, self.head_dim)  # flatten batch and head dims of the cached value
            if static_kv:
                v = prev_value  # static values: reuse the cache unchanged
            else:
                assert v is not None
                v = torch.cat([prev_value, v], dim=1)  # append the current values after the cached ones
        assert k is not None and v is not None
        prev_key_padding_mask: Optional[Tensor] = saved_state.get("prev_key_padding_mask", None)  # cached padding mask, if any
        if prev_key_padding_mask is not None:
            if static_kv:
                new_key_padding_mask = prev_key_padding_mask  # static case: the cached mask is the full mask
            else:
                new_key_padding_mask = torch.cat([prev_key_padding_mask, key_padding_mask], dim=1)  # extend the cached mask with the current one
        else:
            new_key_padding_mask = key_padding_mask  # nothing cached: use the current mask
        return k, v, new_key_padding_mask
def fill_with_neg_inf(t):
    """
    FP16-compatible fill: return a tensor shaped and typed like `t`, filled with the most negative
    finite value of `t`'s dtype (`torch.finfo(t.dtype).min`, not an actual `-inf`).

    The fill is performed on `t.float()`, so a float32 input is overwritten in place.
    """
    lowest = torch.finfo(t.dtype).min
    as_float = t.float()
    as_float.fill_(lowest)
    return as_float.type_as(t)


def _get_shape(t):
    """Return `t.shape`, or `None` when `t` has no `shape` attribute."""
    try:
        return t.shape
    except AttributeError:
        return None


# Bare FSMT encoder-decoder model; subclass of PretrainedFSMTModel.
@add_start_docstrings(
    "The bare FSMT Model outputting raw hidden-states without any specific head on top.",
    FSMT_START_DOCSTRING,
)
class FSMTModel(PretrainedFSMTModel):
    # Names of the parameters that are tied to each other.
    _tied_weights_keys = ["decoder.embed_tokens.weight", "decoder.output_projection.weight"]

    # Build separate source/target embeddings and the encoder/decoder stacks.
    def __init__(self, config: FSMTConfig):
        super().__init__(config)

        # Padding token id, shared by both embedding tables.
        padding_idx = config.pad_token_id
        # Source-vocabulary embedding for the encoder.
        encoder_embed_tokens = nn.Embedding(config.src_vocab_size, config.d_model, padding_idx)
        # Target-vocabulary embedding for the decoder.
        decoder_embed_tokens = nn.Embedding(config.tgt_vocab_size, config.d_model, padding_idx)

        # Instantiate the encoder and the decoder.
        self.encoder = FSMTEncoder(config, encoder_embed_tokens)
        self.decoder = FSMTDecoder(config, decoder_embed_tokens)

        # Initialize weights and apply final processing.
        self.post_init()

    # Return the encoder module.
    def get_encoder(self):
        return self.encoder

    # Return the decoder module.
    def get_decoder(self):
        return self.decoder

    # When `config.tie_word_embeddings` is set, tie (or clone) the decoder embedding and the decoder
    # output projection to the input (encoder) embedding.
    def _tie_weights(self):
        if self.config.tie_word_embeddings:
            self._tie_or_clone_weights(self.decoder.embed_tokens, self.get_input_embeddings())
            self._tie_or_clone_weights(self.decoder.output_projection, self.get_input_embeddings())

    # Forward pass; the decorators attach the shared inputs docstring and a code-sample docstring.
    @add_start_docstrings_to_model_forward(FSMT_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=Seq2SeqModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: torch.LongTensor,
        attention_mask: Optional[torch.Tensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.BoolTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        decoder_head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[Tuple[torch.FloatTensor]] = None,
        past_key_values: Optional[Tuple[torch.FloatTensor]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
        return_dict: Optional[bool] = None,
    ):
        # NOTE(review): the method body (the actual forward computation) is omitted in this excerpt.

    # Input embeddings are the encoder's token embedding.
    def get_input_embeddings(self):
        return self.encoder.embed_tokens

    # Replace the encoder's token embedding.
    def set_input_embeddings(self, value):
        self.encoder.embed_tokens = value

    # Output embeddings are the decoder's token embedding.
    def get_output_embeddings(self):
        return self.decoder.embed_tokens

    # Replace the decoder's token embedding.
    def set_output_embeddings(self, value):
        self.decoder.embed_tokens = value


@add_start_docstrings(
    "The FSMT Model with a language modeling head. Can be used for summarization.", FSMT_START_DOCSTRING
)
class FSMTForConditionalGeneration(PretrainedFSMTModel):
    base_model_prefix = "model"
    _tied_weights_keys = ["decoder.embed_tokens.weight", "decoder.output_projection.weight"]

    def __init__(self, config: FSMTConfig):
        super().__init__(config)
        # The bare encoder-decoder model; this wrapper adds the loss and generation helpers.
        self.model = FSMTModel(config)

        # Initialize weights and apply final processing.
        self.post_init()

    @add_start_docstrings_to_model_forward(FSMT_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
    @add_end_docstrings(FSMT_GENERATION_EXAMPLE)
    def forward(
        self,
        input_ids: torch.LongTensor,
        attention_mask: Optional[torch.Tensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.BoolTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        decoder_head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[Tuple[torch.FloatTensor]] = None,
        past_key_values: Optional[Tuple[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        decoder_inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], Seq2SeqLMOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Returns:
            Depending on `return_dict`, either a tuple containing `masked_lm_loss` and model outputs or a `Seq2SeqLMOutput`.

        """
        # Fall back to the configured default when the caller did not choose an output format.
        if return_dict is None:
            return_dict = self.config.use_return_dict

        # Caching is switched off whenever labels are supplied.
        if labels is not None:
            use_cache = False

        # Delegate to the wrapped model; the first output holds the logits.
        outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            head_mask=head_mask,
            decoder_head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            encoder_outputs=encoder_outputs,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            decoder_inputs_embeds=decoder_inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        lm_logits = outputs[0]

        # Cross-entropy between the flattened logits (target-vocabulary wide) and the flattened labels.
        if labels is None:
            masked_lm_loss = None
        else:
            criterion = CrossEntropyLoss()
            flat_logits = lm_logits.view(-1, self.config.tgt_vocab_size)
            masked_lm_loss = criterion(flat_logits, labels.view(-1))

        if return_dict:
            return Seq2SeqLMOutput(
                loss=masked_lm_loss,
                logits=lm_logits,
                past_key_values=outputs.past_key_values,
                decoder_hidden_states=outputs.decoder_hidden_states,
                decoder_attentions=outputs.decoder_attentions,
                cross_attentions=outputs.cross_attentions,
                encoder_last_hidden_state=outputs.encoder_last_hidden_state,
                encoder_hidden_states=outputs.encoder_hidden_states,
                encoder_attentions=outputs.encoder_attentions,
            )

        # Tuple output: logits followed by the remaining model outputs, with the loss in front if present.
        output = (lm_logits,) + outputs[1:]
        if masked_lm_loss is None:
            return output
        return (masked_lm_loss,) + output

    def prepare_inputs_for_generation(
        self,
        decoder_input_ids,
        past_key_values=None,
        attention_mask=None,
        head_mask=None,
        decoder_head_mask=None,
        cross_attn_head_mask=None,
        use_cache=None,
        encoder_outputs=None,
        **kwargs,
    ):
        # Assemble the keyword arguments for one generation step. Extra `kwargs` are ignored.
        model_inputs = {
            "input_ids": None,  # encoder_outputs is defined. input_ids not needed
            "encoder_outputs": encoder_outputs,
            "past_key_values": past_key_values,
            "decoder_input_ids": decoder_input_ids,
            "attention_mask": attention_mask,
            "head_mask": head_mask,
            "decoder_head_mask": decoder_head_mask,
            "cross_attn_head_mask": cross_attn_head_mask,
            "use_cache": use_cache,  # change this to avoid caching (presumably for debugging)
        }
        return model_inputs

    def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
        # Decoder inputs are the labels passed through `shift_tokens_right` with the pad token id.
        pad_id = self.config.pad_token_id
        return shift_tokens_right(labels, pad_id)

    @staticmethod
    def _reorder_cache(past_key_values, beam_idx):
        # For every layer, re-index each attention cache with `beam_idx`.
        return [
            {attn_key: _reorder_buffer(attn_cache, beam_idx) for attn_key, attn_cache in layer_past.items()}
            for layer_past in past_key_values
        ]

    def get_encoder(self):
        # Encoder of the wrapped model.
        base = self.model
        return base.encoder

    def get_decoder(self):
        # Decoder of the wrapped model.
        base = self.model
        return base.decoder

    def get_output_embeddings(self):
        # Output embeddings are the decoder's token embedding.
        decoder = self.model.decoder
        return decoder.embed_tokens

    def set_output_embeddings(self, value):
        # Replace the decoder's token embedding.
        decoder = self.model.decoder
        decoder.embed_tokens = value
class SinusoidalPositionalEmbedding(nn.Embedding):
    """
    This module produces sinusoidal positional embeddings of any length.

    We don't want to save the weight of this embedding since it's not trained (deterministic) and it can be huge.

    Padding symbols are ignored.

    These embeddings get automatically extended in forward if more positions is needed.
    """

    def __init__(self, num_positions, embedding_dim, padding_idx):
        # `nn.Embedding.__init__` is deliberately not called here: `make_weight` calls it on first use
        # so that the precomputed table can be handed over through `_weight`.
        self.make_weight(num_positions, embedding_dim, padding_idx)

    def make_weight(self, num_positions, embedding_dim, padding_idx):
        """
        Build (or rebuild) the sinusoidal table and install it as the frozen `weight`.

        On the first call the parent `nn.Embedding` is initialised with the table; on later calls the
        new table is moved to the current weight's dtype/device and replaces it.
        """
        weight = self.get_embedding(num_positions, embedding_dim, padding_idx)
        if not hasattr(self, "weight"):
            # First call: no weight registered yet, so run the parent constructor with our table.
            super().__init__(num_positions, embedding_dim, padding_idx, _weight=weight)
        else:
            # Growing an existing table: keep the dtype and device of the current weight.
            weight = weight.to(dtype=self.weight.dtype, device=self.weight.device)
            self.weight = nn.Parameter(weight)
        # The table is deterministic and must never be trained.
        self.weight.detach_()
        self.weight.requires_grad = False

    @staticmethod
    def get_embedding(num_embeddings, embedding_dim, padding_idx):
        """
        Build sinusoidal embeddings.

        This matches the implementation in tensor2tensor, but differs slightly from the description in Section 3.5 of
        "Attention Is All You Need".

        Args:
            num_embeddings (int): Number of positions (rows) in the table.
            embedding_dim (int): Width of each embedding; expected to be at least 2.
            padding_idx (Optional[int]): Row to zero out, if not `None`.

        Returns:
            A `(num_embeddings, embedding_dim)` float tensor: sines in the first half of the columns,
            cosines in the second half, plus one zero column when `embedding_dim` is odd.
        """
        half_dim = embedding_dim // 2
        # Log-spaced frequency step. The divisor is clamped to 1 (as tensor2tensor does) so that
        # embedding_dim of 2 or 3 (half_dim == 1) no longer raises ZeroDivisionError.
        emb = math.log(10000) / max(half_dim - 1, 1)
        emb = torch.exp(torch.arange(half_dim, dtype=torch.int64).float() * -emb)
        # Outer product of positions and frequencies -> (num_embeddings, half_dim).
        emb = torch.arange(num_embeddings, dtype=torch.int64).float().unsqueeze(1) * emb.unsqueeze(0)
        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1).view(num_embeddings, -1)
        if embedding_dim % 2 == 1:
            # Odd width: pad with a single zero column.
            emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1)
        if padding_idx is not None:
            # The padding position gets an all-zero embedding.
            emb[padding_idx, :] = 0
        return emb

    @staticmethod
    def make_positions(tensor, padding_idx: int):
        """
        Replace non-padding symbols with their position numbers.

        Position numbers begin at padding_idx+1. Padding symbols are ignored.
        """
        # 1 for real tokens, 0 for padding; the running sum along dim 1 numbers the real tokens,
        # and multiplying by the mask maps padding back to 0 before the offset is added.
        mask = tensor.ne(padding_idx).int()
        return (torch.cumsum(mask, dim=1).type_as(mask) * mask).long() + padding_idx

    def forward(
        self,
        input,
        incremental_state: Optional[Any] = None,
        timestep: Optional[Tensor] = None,
    ):
        """
        Input is expected to be of size [bsz x seqlen].

        `incremental_state` and `timestep` are accepted but not used by this implementation.
        """
        _, seq_len = input.shape[:2]
        # Highest position index that can occur is padding_idx + seq_len, so this many rows are needed.
        max_pos = self.padding_idx + 1 + seq_len
        if max_pos > self.weight.size(0):
            # Table too small for this sequence: rebuild it at the required size.
            self.make_weight(max_pos, self.embedding_dim, self.padding_idx)
        positions = self.make_positions(input, self.padding_idx)
        return super().forward(positions)

`.\models\fsmt\tokenization_fsmt.py`

# coding=utf-8
# Copyright 2019 The Open AI Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for FSMT."""


import json
import os
import re
import unicodedata
from typing import Dict, List, Optional, Tuple

from ...tokenization_utils import PreTrainedTokenizer
from ...utils import logging


logger = logging.get_logger(__name__)

# File names used for each vocabulary-related file of the tokenizer.
VOCAB_FILES_NAMES = {
    "src_vocab_file": "vocab-src.json",  # source-language vocabulary
    "tgt_vocab_file": "vocab-tgt.json",  # target-language vocabulary
    "merges_file": "merges.txt",          # BPE merges
}

# Download URLs of those files, per pretrained checkpoint.
PRETRAINED_VOCAB_FILES_MAP = {
    "src_vocab_file": {
        "stas/tiny-wmt19-en-de": "https://huggingface.co/stas/tiny-wmt19-en-de/resolve/main/vocab-src.json"
    },  # source-language vocabulary per checkpoint
    "tgt_vocab_file": {
        "stas/tiny-wmt19-en-de": "https://huggingface.co/stas/tiny-wmt19-en-de/resolve/main/vocab-tgt.json"
    },  # target-language vocabulary per checkpoint
    "merges_file": {
        "stas/tiny-wmt19-en-de": "https://huggingface.co/stas/tiny-wmt19-en-de/resolve/main/merges.txt"
    },  # merges file per checkpoint
}

# Positional-embedding size per pretrained checkpoint.
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"stas/tiny-wmt19-en-de": 1024}

# Default tokenizer init arguments per pretrained checkpoint.
PRETRAINED_INIT_CONFIGURATION = {
    "stas/tiny-wmt19-en-de": {
        "langs": ["en", "de"],                 # language pair
        "model_max_length": 1024,              # maximum model input length
        "special_tokens_map_file": None,       # no special-tokens map file
        "full_tokenizer_file": None,           # no full tokenizer file
    }
}


def get_pairs(word):
    """
    Return set of symbol pairs in a word. word is represented as tuple of symbols (symbols being variable-length
    strings)

    Each pair is `(word[i], word[i + 1])`. A word with fewer than two symbols — including an empty
    word, which previously raised `IndexError` — yields an empty set.
    """
    # Zipping the word with itself shifted by one pairs every symbol with its successor.
    return set(zip(word, word[1:]))


def replace_unicode_punct(text):
    """
    Port of https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/replace-unicode-punctuation.perl

    Rewrites a fixed set of Unicode punctuation marks and full-width digits to ASCII. The two
    full-stop rules are regular expressions that also collapse any following whitespace into a
    single space; every other rule is a literal replacement.
    """
    # (source, replacement, is_regex) — applied strictly in this order.
    rules = (
        ("，", ",", False),
        (r"。\s*", ". ", True),
        ("、", ",", False),
        ("”", '"', False),
        ("“", '"', False),
        ("∶", ":", False),
        ("：", ":", False),
        ("？", "?", False),
        ("《", '"', False),
        ("》", '"', False),
        ("）", ")", False),
        ("！", "!", False),
        ("（", "(", False),
        ("；", ";", False),
        ("１", "1", False),
        ("」", '"', False),
        ("「", '"', False),
        ("０", "0", False),
        ("３", "3", False),
        ("２", "2", False),
        ("５", "5", False),
        ("６", "6", False),
        ("９", "9", False),
        ("７", "7", False),
        ("８", "8", False),
        ("４", "4", False),
        (r"．\s*", ". ", True),
        ("～", "~", False),
        ("’", "'", False),
        ("…", "...", False),
        ("━", "-", False),
        ("〈", "<", False),
        ("〉", ">", False),
        ("【", "[", False),
        ("】", "]", False),
        ("％", "%", False),
    )
    for source, replacement, is_regex in rules:
        if is_regex:
            text = re.sub(source, replacement, text)
        else:
            text = text.replace(source, replacement)
    return text
def remove_non_printing_char(text):
    """
    Port of https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/remove-non-printing-char.perl

    Drops every character whose Unicode general category starts with "C" and keeps the rest in order.
    """
    return "".join(ch for ch in text if not unicodedata.category(ch).startswith("C"))


# Porting notes:
# this one is modeled after XLMTokenizer
#
# added:
# - src_vocab_file,
# - tgt_vocab_file,
# - langs,


class FSMTTokenizer(PreTrainedTokenizer):
    """
    Construct an FAIRSEQ Transformer tokenizer. Based on Byte-Pair Encoding. The tokenization process is the following:

    - Moses preprocessing and tokenization.
    - Normalizing all inputs text.
    - The arguments `special_tokens` and the function `set_special_tokens`, can be used to add additional symbols (like
      "__classify__") to a vocabulary.
    - The argument `langs` defines a pair of languages.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        langs (`List[str]`, *optional*):
            A list of two languages to translate from and to, for instance `["en", "ru"]`.
        src_vocab_file (`str`, *optional*):
            File containing the vocabulary for the source language.
        tgt_vocab_file (`str`, *optional*):
            File containing the vocabulary for the target language.
        merges_file (`str`, *optional*):
            File containing the merges.
        do_lower_case (`bool`, *optional*, defaults to `False`):
            Whether or not to lowercase the input when tokenizing.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        bos_token (`str`, *optional*, defaults to `"<s>"`):
            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the beginning of
            sequence. The token used is the `cls_token`.

            </Tip>

        sep_token (`str`, *optional*, defaults to `"</s>"`):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
            sequence classification or for a text and a question for question answering. It is also used as the last
            token of a sequence built with special tokens.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.

    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    # Init configuration of the pretrained checkpoints (module-level constant)
    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
    # Maximum input sizes, taken from the positional-embedding sizes constant
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    # Names of the model inputs
    model_input_names = ["input_ids", "attention_mask"]

    # FSMTTokenizer constructor
    def __init__(
        self,
        langs=None,
        src_vocab_file=None,
        tgt_vocab_file=None,
        merges_file=None,
        do_lower_case=False,
        unk_token="<unk>",
        bos_token="<s>",
        sep_token="</s>",
        pad_token="<pad>",
        **kwargs,
    ):
        try:
            import sacremoses
        except ImportError:
            # 如果导入失败，抛出 ImportError 异常
            raise ImportError(
                "You need to install sacremoses to use XLMTokenizer. "
                "See https://pypi.org/project/sacremoses/ for installation."
            )

        # 导入 sacremoses 成功后，将其保存到实例属性中
        self.sm = sacremoses

        # 设置实例属性，保存传入的参数
        self.src_vocab_file = src_vocab_file
        self.tgt_vocab_file = tgt_vocab_file
        self.merges_file = merges_file
        self.do_lower_case = do_lower_case

        # 实例属性，缓存 sacremoses 的 MosesPunctNormalizer 实例
        self.cache_moses_punct_normalizer = {}
        # 实例属性，缓存 sacremoses 的 MosesTokenizer 实例
        self.cache_moses_tokenizer = {}
        # 实例属性，缓存 sacremoses 的 MosesDetokenizer 实例
        self.cache_moses_detokenizer = {}

        # 如果指定了语言列表，并且长度为 2
        if langs and len(langs) == 2:
            # 将第一个语言和第二个语言分别保存到实例属性中
            self.src_lang, self.tgt_lang = langs
        else:
            # 如果语言列表不符合要求，抛出 ValueError 异常
            raise ValueError(
                f"arg `langs` needs to be a list of 2 langs, e.g. ['en', 'ru'], but got {langs}. "
                "Usually that means that tokenizer can't find a mapping for the given model path "
                "in PRETRAINED_VOCAB_FILES_MAP, and other maps of this tokenizer."
            )

        # 使用 utf-8 编码打开源语料库词汇文件，并加载为 JSON 格式，保存到实例属性中
        with open(src_vocab_file, encoding="utf-8") as src_vocab_handle:
            self.encoder = json.load(src_vocab_handle)
        # 使用 utf-8 编码打开目标语料库词汇文件，并加载为 JSON 格式，创建反向映射字典，保存到实例属性中
        with open(tgt_vocab_file, encoding="utf-8") as tgt_vocab_handle:
            tgt_vocab = json.load(tgt_vocab_handle)
            self.decoder = {v: k for k, v in tgt_vocab.items()}
        # 使用 utf-8 编码打开 BPE 合并文件，读取内容并处理成元组列表，保存到实例属性中
        with open(merges_file, encoding="utf-8") as merges_handle:
            merges = merges_handle.read().split("\n")[:-1]
        merges = [tuple(merge.split()[:2]) for merge in merges]
        self.bpe_ranks = dict(zip(merges, range(len(merges))))
        # 实例属性，缓存
        self.cache = {}

        # 调用父类的构造函数，传入相同的参数和关键字参数
        super().__init__(
            langs=langs,
            src_vocab_file=src_vocab_file,
            tgt_vocab_file=tgt_vocab_file,
            merges_file=merges_file,
            do_lower_case=do_lower_case,
            unk_token=unk_token,
            bos_token=bos_token,
            sep_token=sep_token,
            pad_token=pad_token,
            **kwargs,
        )

    # hack override，重写父类方法，获取词汇表
    def get_vocab(self) -> Dict[str, int]:
        return self.get_src_vocab()

    # hack override，重写父类属性，返回源语言词汇表大小
    @property
    def vocab_size(self) -> int:
        return self.src_vocab_size
    # 使用 MosesPunctNormalizer 对象规范化文本中的标点符号，根据语言缓存对象以提高效率
    def moses_punct_norm(self, text, lang):
        if lang not in self.cache_moses_punct_normalizer:
            punct_normalizer = self.sm.MosesPunctNormalizer(lang=lang)
            self.cache_moses_punct_normalizer[lang] = punct_normalizer
        return self.cache_moses_punct_normalizer[lang].normalize(text)

    # 使用 MosesTokenizer 对象对文本进行标记化，根据语言缓存对象以提高效率
    def moses_tokenize(self, text, lang):
        if lang not in self.cache_moses_tokenizer:
            moses_tokenizer = self.sm.MosesTokenizer(lang=lang)
            self.cache_moses_tokenizer[lang] = moses_tokenizer
        return self.cache_moses_tokenizer[lang].tokenize(
            text, aggressive_dash_splits=True, return_str=False, escape=True
        )

    # 使用 MosesDetokenizer 对象对标记化的 tokens 进行反标记化，根据语言缓存对象以提高效率
    def moses_detokenize(self, tokens, lang):
        if lang not in self.cache_moses_detokenizer:
            moses_detokenizer = self.sm.MosesDetokenizer(lang=lang)
            self.cache_moses_detokenizer[lang] = moses_detokenizer
        return self.cache_moses_detokenizer[lang].detokenize(tokens)

    # 使用一系列预处理步骤处理文本，包括替换Unicode标点、标准化标点符号和移除非打印字符
    def moses_pipeline(self, text, lang):
        """
        Pre-process `text` in three steps: `replace_unicode_punct`, then `moses_punct_norm` for `lang`, then
        `remove_non_printing_char`. Returns the processed string.
        """
        punct_replaced = replace_unicode_punct(text)
        punct_normalized = self.moses_punct_norm(punct_replaced, lang)
        return remove_non_printing_char(punct_normalized)

    # 返回源语言词汇表的大小，即编码器的长度
    @property
    def src_vocab_size(self):
        return len(self.encoder)

    # 返回目标语言词汇表的大小，即解码器的长度
    @property
    def tgt_vocab_size(self):
        return len(self.decoder)

    # 返回源语言的词汇表，包括编码器和附加的特殊标记
    def get_src_vocab(self):
        return dict(self.encoder, **self.added_tokens_encoder)

    # 返回目标语言的词汇表，包括解码器和附加的特殊标记
    def get_tgt_vocab(self):
        return dict(self.decoder, **self.added_tokens_decoder)

    # 使用 BPE（字节对编码）算法对单词进行分段处理，根据缓存提高处理速度
    def bpe(self, token):
        """
        Segment a single (non-empty) token with the learned BPE merges.

        Returns the segmented token as a string of space-separated symbols; the last symbol carries the `</w>`
        end-of-word marker. Results are memoized in `self.cache`, keyed by the input token.
        """
        # Start from individual characters, with the end-of-word marker attached to the last one.
        word = tuple(token[:-1]) + (token[-1] + "</w>",)
        if token in self.cache:
            return self.cache[token]
        # `get_pairs` is defined elsewhere in this file; presumably it returns the adjacent symbol pairs of `word`.
        pairs = get_pairs(word)

        if not pairs:
            # Nothing to merge; note that this result is returned without being cached.
            return token + "</w>"

        while True:
            # Pick the pair with the lowest merge rank (ranks follow the order of the merges file);
            # pairs that are not in the merge table rank as infinity.
            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
            if bigram not in self.bpe_ranks:
                # Even the best candidate is not a known merge: segmentation is final.
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    # Jump to the next occurrence of `first` at or after position i.
                    j = word.index(first, i)
                except ValueError:
                    # No further occurrence: copy the remainder unchanged.
                    new_word.extend(word[i:])
                    break
                else:
                    # Copy everything before the occurrence unchanged.
                    new_word.extend(word[i:j])
                    i = j

                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
                    # `first` is directly followed by `second`: merge them into one symbol.
                    new_word.append(first + second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_word = tuple(new_word)
            word = new_word
            if len(word) == 1:
                # The whole token collapsed into a single symbol.
                break
            else:
                pairs = get_pairs(word)
        # Serialize the symbols as one space-separated string and memoize it.
        word = " ".join(word)
        # Special case carried over from the original implementation: collapse this exact newline segmentation.
        if word == "\n  </w>":
            word = "\n</w>"
        self.cache[token] = word
        return word
    def _tokenize(self, text, lang="en", bypass_tokenizer=False):
        """
        Tokenize a string given language code using Moses.

        Details of tokenization:

            - [sacremoses](https://github.com/alvations/sacremoses): port of Moses
            - Install with `pip install sacremoses`

        Args:
            - lang: ISO language code (default = 'en') (string). Languages should belong of the model supported
              languages. However, we don't enforce it.
            - bypass_tokenizer: Allow users to preprocess and tokenize the sentences externally (default = False)
              (bool). If True, we only apply BPE.

        Returns:
            List of tokens.
        """
        # 忽略当前没有显式传递的 `lang` 参数，tokenization_utils.py 中总是结果 lang=en
        # if lang != self.src_lang:
        #     raise ValueError(f"Expected lang={self.src_lang}, but got {lang}")
        # 将 lang 参数设置为 self.src_lang
        lang = self.src_lang

        # 如果 do_lower_case 为 True，则将文本转换为小写
        if self.do_lower_case:
            text = text.lower()

        # 如果 bypass_tokenizer 为 True，则将文本按空格分割成列表
        if bypass_tokenizer:
            text = text.split()
        else:
            # 使用 Moses 处理管道处理文本
            text = self.moses_pipeline(text, lang=lang)
            # 使用 Moses 分词函数对文本进行分词
            text = self.moses_tokenize(text, lang=lang)

        # 初始化空列表 split_tokens 用于存放最终的分词结果
        split_tokens = []
        # 遍历每个 token
        for token in text:
            # 如果 token 存在
            if token:
                # 将 BPE 分词后的结果以空格分隔，加入到 split_tokens 列表中
                split_tokens.extend(list(self.bpe(token).split(" ")))

        # 返回最终的分词结果列表
        return split_tokens

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        # 使用词汇表将 token 转换为对应的 id，如果找不到则返回 unk_token 对应的 id
        return self.encoder.get(token, self.encoder.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        # 使用词汇表将 index 转换为对应的 token，如果找不到则返回 unk_token
        return self.decoder.get(index, self.unk_token)

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) in a single string."""

        # 去除 tokens 中的 BPE 标记
        tokens = [t.replace(" ", "").replace("</w>", " ") for t in tokens]
        # 将 tokens 列表合并成一个字符串
        tokens = "".join(tokens).split()
        # 使用 Moses 的 detokenize 方法将 tokens 转换为单个字符串
        text = self.moses_detokenize(tokens, self.tgt_lang)
        # 返回最终的字符串文本
        return text

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
        ):
        """
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks
        by concatenating and adding special tokens.

        A RoBERTa sequence has the following format:
        single sequence: [CLS] X [SEP]
        pair of sequences: [CLS] A [SEP] B [SEP]

        Args:
            token_ids_0 (:obj:`List[int]`):
                List of IDs corresponding to the first sequence.
            token_ids_1 (:obj:`List[int]`, `optional`):
                List of IDs corresponding to the second sequence.

        Returns:
            :obj:`List[int]`: List of IDs with the appropriate special tokens.
        """
        # 初始化输入 tokens 列表，并加入第一个特殊 token [CLS]
        input_ids = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]

        # 如果有第二个序列的 token IDs，加入第二个特殊 token [SEP] 和第二个序列的 token IDs
        if token_ids_1 is not None:
            input_ids += token_ids_1 + [self.sep_token_id]

        # 返回包含特殊 token 的输入 token IDs 列表
        return input_ids
    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
        adding special tokens. A FAIRSEQ Transformer sequence has the following format:

        - single sequence: `<s> X </s>`
        - pair of sequences: `<s> A </s> B </s>`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of input IDs with the appropriate special tokens.
        """
        sep = [self.sep_token_id]

        # no bos used in fairseq
        # If token_ids_1 is not provided, return token_ids_0 concatenated with sep tokens
        if token_ids_1 is None:
            return token_ids_0 + sep
        # Otherwise, concatenate token_ids_0, sep, token_ids_1, and sep
        return token_ids_0 + sep + token_ids_1 + sep

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """

        # If already_has_special_tokens is True, delegate to the superclass's method
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )
        
        # no bos used in fairseq
        # If token_ids_1 is not None, create a mask with 0s for token_ids_0, 1 for sep, 0s for token_ids_1, and 1 for sep
        if token_ids_1 is not None:
            return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
        
        # Otherwise, create a mask with 0s for token_ids_0 and 1 for sep
        return ([0] * len(token_ids_0)) + [1]

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create token type IDs tensor from a list of token ids. In a sequence pair, A and B would have different types (0 and 1).

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of token type IDs.
        """
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in
    # 定义 __getstate__ 方法，用于返回对象的状态字典
    def __getstate__(self):
        # 复制对象的 __dict__ 属性，获取当前对象的状态
        state = self.__dict__.copy()
        # 将对象的 "sm" 属性设为 None，可能是为了清除敏感信息或重置状态
        state["sm"] = None
        # 返回对象的状态字典
        return state

    # 定义 __setstate__ 方法，用于设置对象的状态
    def __setstate__(self, d):
        # 将传入的状态字典 d 直接赋给对象的 __dict__ 属性，以恢复对象的状态
        self.__dict__ = d

        # 尝试导入 sacremoses 库，如果导入失败则抛出 ImportError
        try:
            import sacremoses
        except ImportError:
            raise ImportError(
                "You need to install sacremoses to use XLMTokenizer. "
                "See https://pypi.org/project/sacremoses/ for installation."
            )

        # 将导入的 sacremoses 库赋给对象的 "sm" 属性，可能用于后续的操作
        self.sm = sacremoses

`.\models\fsmt\__init__.py`

# Flag that is only true while a static type checker analyses the module
from typing import TYPE_CHECKING

# Optional-dependency exception, lazy-module helper and torch availability check
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available

# Import structure: submodule name -> public names it provides.
# The configuration and tokenization entries are always registered.
_import_structure = {
    "configuration_fsmt": ["FSMT_PRETRAINED_CONFIG_ARCHIVE_MAP", "FSMTConfig"],
    "tokenization_fsmt": ["FSMTTokenizer"],
}

# The modeling entry is only registered when torch is available; otherwise it is silently left out
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # torch is available: register the modeling classes
    _import_structure["modeling_fsmt"] = ["FSMTForConditionalGeneration", "FSMTModel", "PretrainedFSMTModel"]

# Under static type checking, import the names directly so they resolve
if TYPE_CHECKING:
    # Configuration and tokenizer are always imported
    from .configuration_fsmt import FSMT_PRETRAINED_CONFIG_ARCHIVE_MAP, FSMTConfig
    from .tokenization_fsmt import FSMTTokenizer

    # Modeling classes are imported only when torch is available (same guard as above)
    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .modeling_fsmt import FSMTForConditionalGeneration, FSMTModel, PretrainedFSMTModel

# At runtime
else:
    import sys

    # Replace this module's entry in sys.modules with a _LazyModule built from _import_structure
    # (presumably it defers the submodule imports until a name is first accessed)
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)

`.\models\funnel\configuration_funnel.py`

# coding=utf-8
# Copyright 2020, Hugging Face
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Funnel Transformer model configuration"""

# 导入预训练配置类和日志工具
from ...configuration_utils import PretrainedConfig
from ...utils import logging

# 获取全局日志记录器
logger = logging.get_logger(__name__)

# 预训练模型配置文件映射表，映射模型名称到其配置文件的 URL
# Every checkpoint's config lives at the same relative location on the hub, so the map is derived from the names.
FUNNEL_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    checkpoint: f"https://huggingface.co/{checkpoint}/resolve/main/config.json"
    for checkpoint in (
        "funnel-transformer/small",
        "funnel-transformer/small-base",
        "funnel-transformer/medium",
        "funnel-transformer/medium-base",
        "funnel-transformer/intermediate",
        "funnel-transformer/intermediate-base",
        "funnel-transformer/large",
        "funnel-transformer/large-base",
        "funnel-transformer/xlarge",
        "funnel-transformer/xlarge-base",
    )
}

# 定义 FunnelConfig 类，继承自 PretrainedConfig 类
class FunnelConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`FunnelModel`] or a [`TFBertModel`]. It is used to
    instantiate a Funnel Transformer model according to the specified arguments, defining the model architecture.
    Instantiating a configuration with the defaults will yield a similar configuration to that of the Funnel
    Transformer [funnel-transformer/small](https://huggingface.co/funnel-transformer/small) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Notes on the arguments whose handling is visible in this class:

    - `block_sizes` defaults to `[4, 4, 4]` when omitted or `None`; a fresh list is created for every instance.
    - `block_repeats` defaults to one repeat per block and must have the same length as `block_sizes`.
    - `pooling_type` must be `"mean"` or `"max"`.
    - `attention_type` must be `"relative_shift"` or `"factorized"`.
    - `num_hidden_layers` and `num_blocks` are read-only and derived from `block_sizes`.

    Raises:
        AssertionError: if `block_sizes` and `block_repeats` differ in length, or if `pooling_type` /
            `attention_type` is not one of the supported values.
    """

    model_type = "funnel"
    # Alternative attribute names mapped onto the names used by this configuration
    attribute_map = {
        "hidden_size": "d_model",
        "num_attention_heads": "n_head",
    }

    def __init__(
        self,
        vocab_size=30522,  # vocabulary size
        block_sizes=None,  # sizes of the blocks; None means [4, 4, 4]
        block_repeats=None,  # repeats per block; None means 1 for every block
        num_decoder_layers=2,  # number of decoder layers
        d_model=768,  # model dimension
        n_head=12,  # number of attention heads
        d_head=64,  # dimension of each attention head
        d_inner=3072,  # inner hidden dimension
        hidden_act="gelu_new",  # hidden activation function
        hidden_dropout=0.1,  # hidden dropout ratio
        attention_dropout=0.1,  # attention dropout ratio
        activation_dropout=0.0,  # activation dropout ratio
        initializer_range=0.1,  # initializer range
        initializer_std=None,  # initializer standard deviation
        layer_norm_eps=1e-9,  # layer-norm epsilon
        pooling_type="mean",  # "mean" or "max"
        attention_type="relative_shift",  # "relative_shift" or "factorized"
        separate_cls=True,  # whether the CLS token is handled separately
        truncate_seq=True,  # whether sequences are truncated
        pool_q_only=True,  # whether only the query is pooled
        **kwargs,  # forwarded to PretrainedConfig
    ):
        # Build the default per call: a list default in the signature would be one object shared by every
        # instance, so mutating `config.block_sizes` on one config would silently change all later ones.
        if block_sizes is None:
            block_sizes = [4, 4, 4]

        self.vocab_size = vocab_size
        self.block_sizes = block_sizes
        # One repeat per block unless the caller provides explicit values.
        self.block_repeats = [1] * len(block_sizes) if block_repeats is None else block_repeats
        assert len(block_sizes) == len(
            self.block_repeats
        ), "`block_sizes` and `block_repeats` should have the same length."

        self.num_decoder_layers = num_decoder_layers
        self.d_model = d_model
        self.n_head = n_head
        self.d_head = d_head
        self.d_inner = d_inner
        self.hidden_act = hidden_act
        self.hidden_dropout = hidden_dropout
        self.attention_dropout = attention_dropout
        self.activation_dropout = activation_dropout
        self.initializer_range = initializer_range
        self.initializer_std = initializer_std
        self.layer_norm_eps = layer_norm_eps

        # Only two pooling types are supported.
        assert pooling_type in [
            "mean",
            "max",
        ], f"Got {pooling_type} for `pooling_type` but only 'mean' and 'max' are supported."
        self.pooling_type = pooling_type

        # Only two attention types are supported.
        assert attention_type in [
            "relative_shift",
            "factorized",
        ], f"Got {attention_type} for `attention_type` but only 'relative_shift' and 'factorized' are supported."
        self.attention_type = attention_type
        self.separate_cls = separate_cls
        self.truncate_seq = truncate_seq
        self.pool_q_only = pool_q_only

        # Remaining keyword arguments are handled by the base configuration class.
        super().__init__(**kwargs)

    @property
    def num_hidden_layers(self):
        """Total number of hidden layers: the sum of all block sizes."""
        return sum(self.block_sizes)

    @num_hidden_layers.setter
    def num_hidden_layers(self, value):
        # Derived value: it can only be changed through `block_sizes`.
        raise NotImplementedError(
            "This model does not support the setting of `num_hidden_layers`. Please set `block_sizes`."
        )

    @property
    def num_blocks(self):
        """Number of blocks: the length of `block_sizes`."""
        return len(self.block_sizes)

    @num_blocks.setter
    def num_blocks(self, value):
        # Derived value: it can only be changed through `block_sizes`.
        raise NotImplementedError("This model does not support the setting of `num_blocks`. Please set `block_sizes`.")

`.\models\funnel\convert_funnel_original_tf_checkpoint_to_pytorch.py`

# coding=utf-8
# Copyright 2020 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convert Funnel checkpoint."""


import argparse  # 导入解析命令行参数的模块

import torch  # 导入PyTorch模块

from transformers import FunnelBaseModel, FunnelConfig, FunnelModel, load_tf_weights_in_funnel  # 导入Transformers库中相关类和函数
from transformers.utils import logging  # 导入Transformers库中的日志模块


logging.set_verbosity_info()  # 设置日志输出级别为info


def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path, base_model):
    """
    Build a Funnel PyTorch model from a JSON config, load TensorFlow checkpoint weights into it and save its
    state dict.

    Args:
        tf_checkpoint_path: path of the TensorFlow checkpoint to read the weights from.
        config_file: JSON file the `FunnelConfig` is loaded from.
        pytorch_dump_path: destination passed to `torch.save` for the model's state dict.
        base_model: if truthy a `FunnelBaseModel` is built, otherwise a `FunnelModel`.
    """
    # Initialise PyTorch model
    funnel_config = FunnelConfig.from_json_file(config_file)
    print(f"Building PyTorch model from configuration: {funnel_config}")
    model_class = FunnelBaseModel if base_model else FunnelModel
    pt_model = model_class(funnel_config)

    # Load weights from tf checkpoint
    load_tf_weights_in_funnel(pt_model, funnel_config, tf_checkpoint_path)

    # Save pytorch-model
    print(f"Save PyTorch model to {pytorch_dump_path}")
    torch.save(pt_model.state_dict(), pytorch_dump_path)


if __name__ == "__main__":
    # Command-line entry point: three required paths plus an optional flag.
    arg_parser = argparse.ArgumentParser()

    # Required parameters
    arg_parser.add_argument(
        "--tf_checkpoint_path",
        default=None,
        type=str,
        required=True,
        help="Path to the TensorFlow checkpoint path.",
    )
    arg_parser.add_argument(
        "--config_file",
        default=None,
        type=str,
        required=True,
        help="The config json file corresponding to the pre-trained model. \nThis specifies the model architecture.",
    )
    arg_parser.add_argument(
        "--pytorch_dump_path",
        default=None,
        type=str,
        required=True,
        help="Path to the output PyTorch model.",
    )
    # Optional switch: convert only the base model
    arg_parser.add_argument(
        "--base_model",
        action="store_true",
        help="Whether you want just the base model (no decoder) or not.",
    )

    cli_args = arg_parser.parse_args()
    convert_tf_checkpoint_to_pytorch(
        tf_checkpoint_path=cli_args.tf_checkpoint_path,
        config_file=cli_args.config_file,
        pytorch_dump_path=cli_args.pytorch_dump_path,
        base_model=cli_args.base_model,
    )

`.\models\funnel\modeling_funnel.py`

# coding=utf-8
# Copyright 2020-present Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch Funnel Transformer model."""

import os
from dataclasses import dataclass
from typing import List, Optional, Tuple, Union

import numpy as np
import torch
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...modeling_outputs import (
    BaseModelOutput,
    MaskedLMOutput,
    MultipleChoiceModelOutput,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutput,
    TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...utils import (
    ModelOutput,
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
from .configuration_funnel import FunnelConfig

# 获取全局日志记录器
logger = logging.get_logger(__name__)

# 用于文档的配置和检查点名称
_CONFIG_FOR_DOC = "FunnelConfig"
_CHECKPOINT_FOR_DOC = "funnel-transformer/small"

# 预训练模型的存档列表
FUNNEL_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "funnel-transformer/small",  # B4-4-4H768
    "funnel-transformer/small-base",  # B4-4-4H768, no decoder
    "funnel-transformer/medium",  # B6-3x2-3x2H768
    "funnel-transformer/medium-base",  # B6-3x2-3x2H768, no decoder
    "funnel-transformer/intermediate",  # B6-6-6H768
    "funnel-transformer/intermediate-base",  # B6-6-6H768, no decoder
    "funnel-transformer/large",  # B8-8-8H1024
    "funnel-transformer/large-base",  # B8-8-8H1024, no decoder
    "funnel-transformer/xlarge-base",  # B10-10-10H1024, no decoder
    "funnel-transformer/xlarge",  # B10-10-10H1024
]

# 无穷大常量
INF = 1e6

# Load the weights of a TensorFlow checkpoint into a PyTorch Funnel model.
def load_tf_weights_in_funnel(model, config, tf_checkpoint_path):
    """Load tf checkpoints in a pytorch model.

    Each variable of the checkpoint is matched to a tensor of `model` by walking the model's attributes along the
    `/`-separated parts of the variable name (the first part is ignored); the data of the matched tensor is then
    replaced by the checkpoint value.

    Args:
        model: PyTorch model to fill. Its attributes are resolved with `getattr`, `blocks[...]` and `layers[...]`.
        config: Configuration object. `num_hidden_layers` and `block_sizes` are used to turn a flat TF layer index
            into either a (block, layer) position or an index into `layers`.
        tf_checkpoint_path: Path of the TensorFlow checkpoint to read.

    Returns:
        `model`, the same object that was passed in.

    Raises:
        ImportError: Re-raised, after an error has been logged, when the imports needed for the conversion fail.
    """
    try:
        import re
        import numpy as np
        import tensorflow as tf
    except ImportError:
        logger.error(
            "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
            "https://www.tensorflow.org/install/ for installation instructions."
        )
        raise

    tf_path = os.path.abspath(tf_checkpoint_path)
    logger.info(f"Converting TensorFlow checkpoint from {tf_path}")

    # Read every variable of the checkpoint into memory.
    init_vars = tf.train.list_variables(tf_path)
    names = []
    arrays = []
    for name, shape in init_vars:
        logger.info(f"Loading TF weight {name} with shape {shape}")
        array = tf.train.load_variable(tf_path, name)
        names.append(name)
        arrays.append(array)

    # TF name component -> attribute name on the PyTorch side.
    _layer_map = {
        "k": "k_head",
        "q": "q_head",
        "v": "v_head",
        "o": "post_proj",
        "layer_1": "linear_1",
        "layer_2": "linear_2",
        "rel_attn": "attention",
        "ff": "ffn",
        "kernel": "weight",
        "gamma": "weight",
        "beta": "bias",
        "lookup_table": "weight",
        "word_embedding": "word_embeddings",
        "input": "embeddings",
    }

    for name, array in zip(names, arrays):
        name = name.split("/")
        # Optimizer slots and the step counter have no counterpart in the PyTorch model.
        if any(
            n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"]
            for n in name
        ):
            logger.info(f"Skipping {'/'.join(name)}")
            continue
        # Variables under the "generator" scope are not loaded.
        if name[0] == "generator":
            continue
        pointer = model
        skipped = False
        # Walk down the model, one name component at a time (the leading scope name is dropped).
        for m_name in name[1:]:
            # Inside a FunnelPositionwiseFFN, "layer_1"/"layer_2" are handled by `_layer_map` below instead.
            if not isinstance(pointer, FunnelPositionwiseFFN) and re.fullmatch(r"layer_\d+", m_name):
                layer_index = int(re.search(r"layer_(\d+)", m_name).groups()[0])
                if layer_index < config.num_hidden_layers:
                    # Convert the flat index into (block index, index within the block).
                    block_idx = 0
                    while layer_index >= config.block_sizes[block_idx]:
                        layer_index -= config.block_sizes[block_idx]
                        block_idx += 1
                    pointer = pointer.blocks[block_idx][layer_index]
                else:
                    # Indices past the hidden layers address `pointer.layers`.
                    layer_index -= config.num_hidden_layers
                    pointer = pointer.layers[layer_index]
            elif m_name == "r" and isinstance(pointer, FunnelRelMultiheadAttention):
                # `r_kernel` is the final tensor: stop walking.
                pointer = pointer.r_kernel
                break
            elif m_name in _layer_map:
                pointer = getattr(pointer, _layer_map[m_name])
            else:
                try:
                    pointer = getattr(pointer, m_name)
                except AttributeError:
                    # Report through the module logger, like every other message of this function.
                    logger.info(f"Skipping {'/'.join(name)} {array.shape}")
                    skipped = True
                    break
        if not skipped:
            # Only a difference in rank triggers a reshape to the target shape.
            if len(pointer.shape) != len(array.shape):
                array = array.reshape(pointer.shape)
            # Arrays whose last name component is "kernel" are transposed before being stored.
            if m_name == "kernel":
                array = np.transpose(array)
            pointer.data = torch.from_numpy(array)

    return model
class FunnelEmbeddings(nn.Module):
    """Embedding front-end: token lookup followed by layer normalization and dropout."""

    def __init__(self, config: FunnelConfig) -> None:
        super().__init__()
        vocab_size, embed_dim = config.vocab_size, config.hidden_size
        # Lookup table; `config.pad_token_id` is passed as the padding index.
        self.word_embeddings = nn.Embedding(vocab_size, embed_dim, padding_idx=config.pad_token_id)
        # Normalization over `config.d_model` features.
        self.layer_norm = nn.LayerNorm(config.d_model, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout)

    def forward(
        self, input_ids: Optional[torch.Tensor] = None, inputs_embeds: Optional[torch.Tensor] = None
    ) -> torch.Tensor:
        """Return the normalized embeddings, with dropout applied.

        `inputs_embeds` takes precedence when given; otherwise `input_ids` is looked up in the embedding table.
        """
        embedded = self.word_embeddings(input_ids) if inputs_embeds is None else inputs_embeds
        return self.dropout(self.layer_norm(embedded))


class FunnelAttentionStructure(nn.Module):
    """
    Contains helpers for `FunnelRelMultiheadAttention`.

    The helpers build the attention inputs (position embeddings, token type matrix, attention mask and cls mask) and
    pool them, together with the hidden states, by a factor of two along the sequence axis.
    """

    # Token type id that identifies the <cls> token in `token_type_ids_to_mat`.
    cls_token_type_id: int = 2

    def __init__(self, config: FunnelConfig) -> None:
        super().__init__()
        self.config = config
        # Two dropout modules with the same rate; presumably applied to the sine and cosine parts of the positional
        # encoding (the code using them is not part of this excerpt).
        self.sin_dropout = nn.Dropout(config.hidden_dropout)
        self.cos_dropout = nn.Dropout(config.hidden_dropout)
        # Track where we are at in terms of pooling from the original input, e.g., by how much the sequence length was
        # divided.
        self.pooling_mult = None

    def init_attention_inputs(
        self,
        inputs_embeds: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor]:
        """Returns the attention inputs associated to the inputs of the model."""
        # No pooling has happened yet.
        self.pooling_mult = 1
        self.seq_len = seq_len = inputs_embeds.size(1)
        position_embeds = self.get_position_embeds(seq_len, inputs_embeds.dtype, inputs_embeds.device)
        token_type_mat = self.token_type_ids_to_mat(token_type_ids) if token_type_ids is not None else None
        # seq_len x seq_len mask that is zero on the first row and first column and one elsewhere; only built when
        # the <cls> token is kept separate.
        cls_mask = (
            nn.functional.pad(inputs_embeds.new_ones([seq_len - 1, seq_len - 1]), (1, 0, 1, 0))
            if self.config.separate_cls
            else None
        )
        return (position_embeds, token_type_mat, attention_mask, cls_mask)

    def token_type_ids_to_mat(self, token_type_ids: torch.Tensor) -> torch.Tensor:
        """Convert `token_type_ids` to `token_type_mat`."""
        # Entry [b, i, j] is True when tokens i and j of sample b have the same token type id.
        token_type_mat = token_type_ids[:, :, None] == token_type_ids[:, None]
        # Treat <cls> as in the same segment as both A & B
        cls_ids = token_type_ids == self.cls_token_type_id
        cls_mat = cls_ids[:, :, None] | cls_ids[:, None]
        return cls_mat | token_type_mat

    def get_position_embeds(
        self, seq_len: int, dtype: torch.dtype, device: torch.device
    ) -> torch.Tensor:
        """Build the position embeddings passed to the attention layers.

        NOTE(review): the body is elided in this excerpt, so as written the method returns `None`. The real
        implementation has to be restored before the output of `init_attention_inputs` is usable.
        """
        pass

    def stride_pool_pos(self, pos_id: torch.Tensor, block_index: int):
        """
        Pool `pos_id` while keeping the cls token separate (if `config.separate_cls=True`).
        """
        if self.config.separate_cls:
            # Under separate <cls>, we treat the <cls> as the first token in
            # the previous block of the 1st real block. Since the 1st real
            # block always has position 1, the position of the previous block
            # will be at `1 - 2 ** block_index`.
            cls_pos = pos_id.new_tensor([-(2**block_index) + 1])
            # Drop the first position, and also the last one when the sequence is truncated.
            pooled_pos_id = pos_id[1:-1] if self.config.truncate_seq else pos_id[1:]
            return torch.cat([cls_pos, pooled_pos_id[::2]], 0)
        else:
            return pos_id[::2]

    def relative_pos(self, pos: torch.Tensor, stride: int, pooled_pos=None, shift: int = 1) -> torch.Tensor:
        """
        Build the relative positional vector between `pos` and `pooled_pos`.
        """
        if pooled_pos is None:
            pooled_pos = pos

        ref_point = pooled_pos[0] - pos[0]
        num_remove = shift * len(pooled_pos)
        max_dist = ref_point + num_remove * stride
        min_dist = pooled_pos[0] - pos[-1]

        # Descending range from `max_dist` down to `min_dist` (inclusive), in steps of `stride`.
        return torch.arange(max_dist, min_dist - 1, -stride, dtype=torch.long, device=pos.device)

    def stride_pool(
        self,
        tensor: Union[torch.Tensor, Tuple[torch.Tensor], List[torch.Tensor]],
        axis: Union[int, Tuple[int], List[int]],
    ) -> torch.Tensor:
        """
        Perform pooling by stride slicing the tensor along the given axis.

        `tensor` may be `None` (returned unchanged) or a tuple/list of tensors (each one is pooled and the container
        type is kept). `axis` may be a tuple/list of axes, which are pooled one after the other.
        """
        if tensor is None:
            return None

        # Do the stride pool recursively if axis is a list or a tuple of ints.
        if isinstance(axis, (list, tuple)):
            for ax in axis:
                tensor = self.stride_pool(tensor, ax)
            return tensor

        # Do the stride pool recursively if tensor is a list or tuple of tensors.
        if isinstance(tensor, (tuple, list)):
            return type(tensor)(self.stride_pool(x, axis) for x in tensor)

        # Deal with negative axis
        axis %= tensor.ndim

        axis_slice = (
            slice(None, -1, 2) if self.config.separate_cls and self.config.truncate_seq else slice(None, None, 2)
        )
        # Index with tuples rather than lists: multidimensional indexing with a non-tuple sequence is deprecated.
        enc_slice = (slice(None),) * axis + (axis_slice,)

        # With a separate <cls>, duplicate the first entry so that it survives the stride-2 slicing.
        if self.config.separate_cls:
            cls_slice = (slice(None),) * axis + (slice(None, 1),)
            tensor = torch.cat([tensor[cls_slice], tensor], axis=axis)

        return tensor[enc_slice]

    def pool_tensor(
        self, tensor: Union[torch.Tensor, Tuple[torch.Tensor], List[torch.Tensor]], mode: str = "mean", stride: int = 2
    ) -> torch.Tensor:
        """Apply 1D pooling to a tensor of size [B x T (x H)].

        Args:
            tensor: Tensor to pool along its second axis, a tuple/list of such tensors, or `None`.
            mode: One of `"mean"`, `"max"` or `"min"`.
            stride: Size and step of the pooling window.

        Returns:
            The pooled tensor, a container of pooled tensors of the same type as the input, or `None`.

        Raises:
            NotImplementedError: If `mode` is not supported.
        """
        if tensor is None:
            return None

        # Do the pool recursively if tensor is a list or tuple of tensors. Each element `x` is pooled on its own;
        # recursing on the container itself would never terminate.
        if isinstance(tensor, (tuple, list)):
            return type(tensor)(self.pool_tensor(x, mode=mode, stride=stride) for x in tensor)

        # With a separate <cls>, duplicate the first position (dropping the last one when truncating).
        if self.config.separate_cls:
            suffix = tensor[:, :-1] if self.config.truncate_seq else tensor
            tensor = torch.cat([tensor[:, :1], suffix], dim=1)

        # The 2D pooling functions need a 4-dimensional input: add the missing singleton axes.
        ndim = tensor.ndim
        if ndim == 2:
            tensor = tensor[:, None, :, None]
        elif ndim == 3:
            tensor = tensor[:, None, :, :]

        # Pool along the sequence axis only.
        stride = (stride, 1)

        if mode == "mean":
            tensor = nn.functional.avg_pool2d(tensor, stride, stride=stride, ceil_mode=True)
        elif mode == "max":
            tensor = nn.functional.max_pool2d(tensor, stride, stride=stride, ceil_mode=True)
        elif mode == "min":
            # min(x) == -max(-x)
            tensor = -nn.functional.max_pool2d(-tensor, stride, stride=stride, ceil_mode=True)
        else:
            raise NotImplementedError("The supported modes are 'mean', 'max' and 'min'.")

        # Remove the axes that were added above.
        if ndim == 2:
            return tensor[:, 0, :, 0]
        elif ndim == 3:
            return tensor[:, 0]
        return tensor

    def pre_attention_pooling(
        self, output, attention_inputs: Tuple[torch.Tensor]
    ) -> Tuple[torch.Tensor, Tuple[torch.Tensor]]:
        """Pool `output` and the proper parts of `attention_inputs` before the attention layer."""
        position_embeds, token_type_mat, attention_mask, cls_mask = attention_inputs
        if self.config.pool_q_only:
            # Only the query side is pooled here; the remaining parts are pooled in `post_attention_pooling`.
            if self.config.attention_type == "factorized":
                position_embeds = self.stride_pool(position_embeds[:2], 0) + position_embeds[2:]
            token_type_mat = self.stride_pool(token_type_mat, 1)
            cls_mask = self.stride_pool(cls_mask, 0)
            output = self.pool_tensor(output, mode=self.config.pooling_type)
        else:
            # Everything is pooled at once.
            self.pooling_mult *= 2
            if self.config.attention_type == "factorized":
                position_embeds = self.stride_pool(position_embeds, 0)
            token_type_mat = self.stride_pool(token_type_mat, [1, 2])
            cls_mask = self.stride_pool(cls_mask, [1, 2])
            attention_mask = self.pool_tensor(attention_mask, mode="min")
            output = self.pool_tensor(output, mode=self.config.pooling_type)
        attention_inputs = (position_embeds, token_type_mat, attention_mask, cls_mask)
        return output, attention_inputs

    def post_attention_pooling(self, attention_inputs: Tuple[torch.Tensor]) -> Tuple[torch.Tensor]:
        """Pool the proper parts of `attention_inputs` after the attention layer."""
        position_embeds, token_type_mat, attention_mask, cls_mask = attention_inputs
        # Nothing is left to pool unless only the query was pooled before the attention layer.
        if self.config.pool_q_only:
            self.pooling_mult *= 2
            if self.config.attention_type == "factorized":
                # Keep the first two embeddings and pool the remaining ones.
                position_embeds = position_embeds[:2] + self.stride_pool(position_embeds[2:], 0)
            token_type_mat = self.stride_pool(token_type_mat, 2)
            cls_mask = self.stride_pool(cls_mask, 1)
            attention_mask = self.pool_tensor(attention_mask, mode="min")
        attention_inputs = (position_embeds, token_type_mat, attention_mask, cls_mask)
        return attention_inputs
def _relative_shift_gather(positional_attn: torch.Tensor, context_len: int, shift: int) -> torch.Tensor:
    batch_size, n_head, seq_len, max_rel_len = positional_attn.shape
    # 定义函数参数和返回值的类型注解

    # 将 positional_attn 重新形状为 [batch_size, n_head, max_rel_len, seq_len]
    positional_attn = torch.reshape(positional_attn, [batch_size, n_head, max_rel_len, seq_len])
    # 从第 shift 列开始，截取后续的数据
    positional_attn = positional_attn[:, :, shift:, :]
    # 将 positional_attn 重新形状为 [batch_size, n_head, seq_len, max_rel_len - shift]
    positional_attn = torch.reshape(positional_attn, [batch_size, n_head, seq_len, max_rel_len - shift])
    # 仅保留最后一维度中的前 context_len 个元素
    positional_attn = positional_attn[..., :context_len]
    # 返回处理后的 positional_attn
    return positional_attn


class FunnelRelMultiheadAttention(nn.Module):
    """Multi-head attention whose scores are the sum of content, relative-position and token-type terms."""

    def __init__(self, config: FunnelConfig, block_index: int) -> None:
        super().__init__()
        self.config = config
        # Selects the entry of the position embeddings to use in the non-factorized attention.
        self.block_index = block_index
        d_model, n_head, d_head = config.d_model, config.n_head, config.d_head

        self.hidden_dropout = nn.Dropout(config.hidden_dropout)
        self.attention_dropout = nn.Dropout(config.attention_dropout)

        # Query / key / value projections (the query projection has no bias).
        self.q_head = nn.Linear(d_model, n_head * d_head, bias=False)
        self.k_head = nn.Linear(d_model, n_head * d_head)
        self.v_head = nn.Linear(d_model, n_head * d_head)

        # Biases added to the query in the content (r_w), positional (r_r) and token-type (r_s) terms, the kernel
        # projecting position encodings, and the two token-type embeddings.
        self.r_w_bias = nn.Parameter(torch.zeros([n_head, d_head]))
        self.r_r_bias = nn.Parameter(torch.zeros([n_head, d_head]))
        self.r_kernel = nn.Parameter(torch.zeros([d_model, n_head, d_head]))
        self.r_s_bias = nn.Parameter(torch.zeros([n_head, d_head]))
        self.seg_embed = nn.Parameter(torch.zeros([2, n_head, d_head]))

        # Output projection and post-residual normalization.
        self.post_proj = nn.Linear(n_head * d_head, d_model)
        self.layer_norm = nn.LayerNorm(d_model, eps=config.layer_norm_eps)
        self.scale = 1.0 / (d_head**0.5)

    def relative_positional_attention(self, position_embeds, q_head, context_len, cls_mask=None):
        """Relative attention score for the positional encodings"""
        # q_head has shape batch_size x seq_len x n_head x d_head
        if self.config.attention_type == "factorized":
            # Notations from the paper, appendix A.2.2, final formula (https://arxiv.org/abs/2006.03236)
            # phi and pi have shape seq_len x d_model, psi and omega have shape context_len x d_model
            phi, pi, psi, omega = position_embeds
            # Shape n_head x d_head
            u = self.r_r_bias * self.scale
            # Shape d_model x n_head x d_head
            w_r = self.r_kernel

            # Shape batch_size x seq_len x n_head x d_model
            q_r_attention = torch.einsum("binh,dnh->bind", q_head + u, w_r)
            q_r_attention_1 = q_r_attention * phi[:, None]
            q_r_attention_2 = q_r_attention * pi[:, None]

            # Shape batch_size x n_head x seq_len x context_len
            positional_attn = torch.einsum("bind,jd->bnij", q_r_attention_1, psi) + torch.einsum(
                "bind,jd->bnij", q_r_attention_2, omega
            )
        else:
            # A query shorter or longer than the context means it was pooled: use the second encoding in that case.
            shift = 2 if q_head.shape[1] != context_len else 1
            # Notations from the paper, appendix A.2.1, final formula (https://arxiv.org/abs/2006.03236)
            # Grab the proper positional encoding, shape max_rel_len x d_model
            r = position_embeds[self.block_index][shift - 1]
            # Shape n_head x d_head
            v = self.r_r_bias * self.scale
            # Shape d_model x n_head x d_head
            w_r = self.r_kernel

            # Shape max_rel_len x n_head x d_head
            r_head = torch.einsum("td,dnh->tnh", r, w_r)
            # Shape batch_size x n_head x seq_len x max_rel_len
            positional_attn = torch.einsum("binh,tnh->bnit", q_head + v, r_head)
            # Shape batch_size x n_head x seq_len x context_len
            positional_attn = _relative_shift_gather(positional_attn, context_len, shift)

        if cls_mask is not None:
            positional_attn *= cls_mask
        return positional_attn

    def relative_token_type_attention(self, token_type_mat, q_head, cls_mask=None):
        """Relative attention score for the token_type_ids"""
        # Without token types this term contributes nothing to the score.
        if token_type_mat is None:
            return 0
        batch_size, seq_len, context_len = token_type_mat.shape
        # q_head has shape batch_size x seq_len x n_head x d_head
        # Shape n_head x d_head
        r_s_bias = self.r_s_bias * self.scale

        # Shape batch_size x n_head x seq_len x 2
        token_type_bias = torch.einsum("bind,snd->bnis", q_head + r_s_bias, self.seg_embed)
        # Shape batch_size x n_head x seq_len x context_len
        token_type_mat = token_type_mat[:, None].expand([batch_size, q_head.shape[2], seq_len, context_len])
        # Shapes batch_size x n_head x seq_len x 1
        diff_token_type, same_token_type = torch.split(token_type_bias, 1, dim=-1)
        # Pick the "same" score where `token_type_mat` is True and the "diff" score elsewhere.
        # Shape batch_size x n_head x seq_len x context_len
        token_type_attn = torch.where(
            token_type_mat,
            same_token_type.expand(token_type_mat.shape),
            diff_token_type.expand(token_type_mat.shape),
        )

        if cls_mask is not None:
            token_type_attn *= cls_mask
        return token_type_attn

    def forward(
        self,
        query: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
        attention_inputs: Tuple[torch.Tensor],
        output_attentions: bool = False,
    ) -> Tuple[torch.Tensor, ...]:
        """Attend from `query` to `key`/`value` and return the normalized residual output.

        Args:
            query: Tensor of shape batch_size x seq_len x d_model.
            key: Tensor of shape batch_size x context_len x d_model.
            value: Tensor of shape batch_size x context_len x d_model.
            attention_inputs: Tuple `(position_embeds, token_type_mat, attention_mask, cls_mask)`.
            output_attentions: Whether to also return the attention probabilities.

        Returns:
            `(output,)`, or `(output, attn_prob)` when `output_attentions` is set.
        """
        position_embeds, token_type_mat, attention_mask, cls_mask = attention_inputs

        batch_size, seq_len, _ = query.shape
        context_len = key.shape[1]
        n_head, d_head = self.config.n_head, self.config.d_head

        # Shape batch_size x seq_len x n_head x d_head
        q_head = self.q_head(query).view(batch_size, seq_len, n_head, d_head)
        # Shapes batch_size x context_len x n_head x d_head
        k_head = self.k_head(key).view(batch_size, context_len, n_head, d_head)
        v_head = self.v_head(value).view(batch_size, context_len, n_head, d_head)

        q_head = q_head * self.scale
        # Shape n_head x d_head
        r_w_bias = self.r_w_bias * self.scale
        # Shapes batch_size x n_head x seq_len x context_len
        content_score = torch.einsum("bind,bjnd->bnij", q_head + r_w_bias, k_head)
        positional_attn = self.relative_positional_attention(position_embeds, q_head, context_len, cls_mask)
        token_type_attn = self.relative_token_type_attention(token_type_mat, q_head, cls_mask)

        # merge attention scores
        attn_score = content_score + positional_attn + token_type_attn

        # precision safe in case of mixed precision training
        dtype = attn_score.dtype
        attn_score = attn_score.float()
        # perform masking: positions whose mask is 0 get a large negative score
        if attention_mask is not None:
            attn_score = attn_score - INF * (1 - attention_mask[:, None, None].float())
        # attention probability
        attn_prob = torch.softmax(attn_score, dim=-1, dtype=dtype)
        attn_prob = self.attention_dropout(attn_prob)

        # attention output, shape batch_size x seq_len x n_head x d_head
        attn_vec = torch.einsum("bnij,bjnd->bind", attn_prob, v_head)

        # Shape batch_size x seq_len x d_model
        attn_out = self.post_proj(attn_vec.reshape(batch_size, seq_len, n_head * d_head))
        attn_out = self.hidden_dropout(attn_out)

        # Residual connection and layer normalization
        output = self.layer_norm(query + attn_out)
        return (output, attn_prob) if output_attentions else (output,)
# Encoder of the Funnel model: a stack of blocks, with pooling between consecutive blocks.
class FunnelEncoder(nn.Module):
    """Run the hidden states through `config.block_sizes` blocks of `FunnelLayer`s.

    From the second block on, the hidden states are pooled before the first layer of the block, as long as the
    sequence is still long enough to be pooled.
    """

    def __init__(self, config: FunnelConfig) -> None:
        super().__init__()
        self.config = config
        self.attention_structure = FunnelAttentionStructure(config)
        # One inner ModuleList per block, holding `block_size` layers built with that block's index.
        self.blocks = nn.ModuleList(
            [
                nn.ModuleList([FunnelLayer(config, block_index) for _ in range(block_size)])
                for block_index, block_size in enumerate(config.block_sizes)
            ]
        )

    def forward(
        self,
        inputs_embeds: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ) -> Union[Tuple, BaseModelOutput]:
        """Encode `inputs_embeds`.

        Args:
            inputs_embeds: Embedded inputs; the sequence length is read from axis 1.
            attention_mask: Optional mask, converted to the dtype of `inputs_embeds` when given.
            token_type_ids: Optional token type ids.
            output_attentions: Whether to collect the attentions returned by every layer call.
            output_hidden_states: Whether to collect the hidden states (inputs included) after every layer call.
            return_dict: Whether to return a `BaseModelOutput` instead of a plain tuple.

        Returns:
            A `BaseModelOutput`, or the tuple of its non-`None` fields when `return_dict` is False.
        """
        # The pooling is done on a tensor of the same dtype as the hidden states, so the mask is converted. The mask
        # is optional: everything downstream accepts `None`, so only convert it when it was provided.
        if attention_mask is not None:
            attention_mask = attention_mask.type_as(inputs_embeds)
        attention_inputs = self.attention_structure.init_attention_inputs(
            inputs_embeds,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        hidden = inputs_embeds

        all_hidden_states = (inputs_embeds,) if output_hidden_states else None
        all_attentions = () if output_attentions else None

        for block_index, block in enumerate(self.blocks):
            # Pool only from the second block on, and only while more than the minimal number of positions is left.
            pooling_flag = hidden.size(1) > (2 if self.config.separate_cls else 1)
            pooling_flag = pooling_flag and block_index > 0
            if pooling_flag:
                pooled_hidden, attention_inputs = self.attention_structure.pre_attention_pooling(
                    hidden, attention_inputs
                )
            for layer_index, layer in enumerate(block):
                # Each layer is applied `block_repeats[block_index]` times in a row.
                for repeat_index in range(self.config.block_repeats[block_index]):
                    # The pooled states are only consumed by the very first layer call of the block.
                    do_pooling = (repeat_index == 0) and (layer_index == 0) and pooling_flag
                    if do_pooling:
                        query = pooled_hidden
                        key = value = hidden if self.config.pool_q_only else pooled_hidden
                    else:
                        query = key = value = hidden
                    layer_output = layer(query, key, value, attention_inputs, output_attentions=output_attentions)
                    hidden = layer_output[0]
                    if do_pooling:
                        attention_inputs = self.attention_structure.post_attention_pooling(attention_inputs)

                    if output_attentions:
                        all_attentions = all_attentions + layer_output[1:]
                    if output_hidden_states:
                        all_hidden_states = all_hidden_states + (hidden,)

        if not return_dict:
            return tuple(v for v in [hidden, all_hidden_states, all_attentions] if v is not None)
        return BaseModelOutput(last_hidden_state=hidden, hidden_states=all_hidden_states, attentions=all_attentions)
# Stretch a pooled sequence back to `target_len` by repeating every token `stride` times.
def upsample(
    x: torch.Tensor, stride: int, target_len: int, separate_cls: bool = True, truncate_seq: bool = False
) -> torch.Tensor:
    """
    Upsample tensor `x` to match `target_len` by repeating the tokens `stride` time on the sequence length dimension.
    """
    # Nothing to repeat.
    if stride == 1:
        return x

    # Plain case: repeat every position and cut to the requested length.
    if not separate_cls:
        return torch.repeat_interleave(x, repeats=stride, dim=1)[:, :target_len]

    # Separate <cls>: the first position is kept as is and only the rest is repeated.
    cls_token, rest = x[:, :1], x[:, 1:]
    repeated = torch.repeat_interleave(rest, repeats=stride, dim=1)
    if truncate_seq:
        # Append `stride - 1` zero positions at the end of the sequence axis.
        repeated = nn.functional.pad(repeated, (0, 0, 0, stride - 1, 0, 0))
    return torch.cat([cls_token, repeated[:, : target_len - 1]], dim=1)


class FunnelDecoder(nn.Module):
    """Upsample the final encoder states, add the first-block states and refine the sum with a stack of layers."""

    def __init__(self, config: FunnelConfig) -> None:
        super().__init__()
        self.config = config
        self.attention_structure = FunnelAttentionStructure(config)
        # `config.num_decoder_layers` layers, all built with block index 0.
        decoder_layers = [FunnelLayer(config, 0) for _ in range(config.num_decoder_layers)]
        self.layers = nn.ModuleList(decoder_layers)

    def forward(
        self,
        final_hidden: torch.Tensor,
        first_block_hidden: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ) -> Union[Tuple, BaseModelOutput]:
        """Decode back to the sequence length of `first_block_hidden`.

        Returns a `BaseModelOutput`, or the tuple of its non-`None` fields when `return_dict` is False.
        """
        cfg = self.config
        # The upsampling factor doubles with every block after the first one.
        upsampling_stride = 2 ** (len(cfg.block_sizes) - 1)
        upsampled_hidden = upsample(
            final_hidden,
            stride=upsampling_stride,
            target_len=first_block_hidden.shape[1],
            separate_cls=cfg.separate_cls,
            truncate_seq=cfg.truncate_seq,
        )

        hidden = upsampled_hidden + first_block_hidden
        all_attentions = () if output_attentions else None
        all_hidden_states = (hidden,) if output_hidden_states else None

        attention_inputs = self.attention_structure.init_attention_inputs(
            hidden,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )

        for decoder_layer in self.layers:
            # Self-attention: the same states serve as query, key and value.
            step = decoder_layer(hidden, hidden, hidden, attention_inputs, output_attentions=output_attentions)
            hidden = step[0]

            if output_attentions:
                all_attentions += step[1:]
            if output_hidden_states:
                all_hidden_states += (hidden,)

        if return_dict:
            return BaseModelOutput(
                last_hidden_state=hidden, hidden_states=all_hidden_states, attentions=all_attentions
            )
        return tuple(v for v in (hidden, all_hidden_states, all_attentions) if v is not None)


class FunnelDiscriminatorPredictions(nn.Module):
    """Prediction module for the discriminator, made up of two dense layers."""

    def __init__(self, config: FunnelConfig) -> None:
        super().__init__()
        # Kept on the instance because `forward` looks up the activation by name.
        self.config = config
        # First dense layer: keeps the feature width at `config.d_model`.
        self.dense = nn.Linear(config.d_model, config.d_model)
        # Second dense layer: reduces the feature width to a single value.
        self.dense_prediction = nn.Linear(config.d_model, 1)

    def forward(self, discriminator_hidden_states: torch.Tensor) -> torch.Tensor:
        """
        Apply dense -> activation -> dense and drop the trailing size-1 dimension.

        Args:
            discriminator_hidden_states (torch.Tensor): Input features whose last dimension is `config.d_model`.

        Returns:
            torch.Tensor: The logits, with the last (size-1) dimension squeezed away.
        """
        activation = ACT2FN[self.config.hidden_act]
        projected = activation(self.dense(discriminator_hidden_states))
        return self.dense_prediction(projected).squeeze(-1)
class FunnelPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    # Configuration class associated with this model family.
    config_class = FunnelConfig
    # Function used to load TensorFlow checkpoints.
    load_tf_weights = load_tf_weights_in_funnel
    # Prefix under which the base model is stored.
    base_model_prefix = "funnel"

    def _init_weights(self, module):
        """
        Initialize the parameters of `module` according to its class name.

        - Classes whose name contains "Linear": normal init for `weight` (std taken from
          `config.initializer_std`, or `sqrt(1 / (fan_in + fan_out))` when that is `None`), zero `bias`.
        - "FunnelRelMultiheadAttention": uniform init bounded above by `config.initializer_range` for the
          relative-attention parameters.
        - "FunnelEmbeddings": normal init for the word embeddings (std defaults to 1.0), then the padding row
          is zeroed when the embedding defines a `padding_idx`.
        """
        classname = module.__class__.__name__
        if "Linear" in classname:
            if getattr(module, "weight", None) is not None:
                if self.config.initializer_std is None:
                    fan_out, fan_in = module.weight.shape
                    std = np.sqrt(1.0 / float(fan_in + fan_out))
                else:
                    std = self.config.initializer_std
                nn.init.normal_(module.weight, std=std)
            if getattr(module, "bias", None) is not None:
                nn.init.constant_(module.bias, 0.0)
        elif classname == "FunnelRelMultiheadAttention":
            nn.init.uniform_(module.r_w_bias, b=self.config.initializer_range)
            nn.init.uniform_(module.r_r_bias, b=self.config.initializer_range)
            nn.init.uniform_(module.r_kernel, b=self.config.initializer_range)
            nn.init.uniform_(module.r_s_bias, b=self.config.initializer_range)
            nn.init.uniform_(module.seg_embed, b=self.config.initializer_range)
        elif classname == "FunnelEmbeddings":
            std = 1.0 if self.config.initializer_std is None else self.config.initializer_std
            nn.init.normal_(module.word_embeddings.weight, std=std)
            # Index with the same `padding_idx` that is checked: it lives on the nn.Embedding, and the
            # previous `module.padding_idx` lookup targeted the wrapper module instead.
            padding_idx = module.word_embeddings.padding_idx
            if padding_idx is not None:
                module.word_embeddings.weight.data[padding_idx].zero_()


class FunnelClassificationHead(nn.Module):
    """Classification head: linear -> tanh -> dropout -> linear."""

    def __init__(self, config: FunnelConfig, n_labels: int) -> None:
        super().__init__()
        # Hidden projection that keeps the feature width at `config.d_model`.
        self.linear_hidden = nn.Linear(config.d_model, config.d_model)
        # Dropout applied after the activation, with probability `config.hidden_dropout`.
        self.dropout = nn.Dropout(config.hidden_dropout)
        # Output projection to `n_labels` scores.
        self.linear_out = nn.Linear(config.d_model, n_labels)

    def forward(self, hidden: torch.Tensor) -> torch.Tensor:
        """
        Map input features to `n_labels` scores.

        Args:
            hidden (torch.Tensor): Input features whose last dimension is `config.d_model`.

        Returns:
            torch.Tensor: Output of the final linear layer (last dimension `n_labels`).
        """
        activated = torch.tanh(self.linear_hidden(hidden))
        return self.linear_out(self.dropout(activated))


@dataclass
class FunnelForPreTrainingOutput(ModelOutput):
    """
    Output type of [`FunnelForPreTraining`].

    Args:
        loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
            Total loss of the ELECTRA-style objective.
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            Prediction scores of the head (scores for each token before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    # These are the four fields `FunnelForPreTraining.forward` passes by keyword when building its output.
    loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
# Shared introductory docstring text; it is passed to `add_start_docstrings` on the model classes in this file.
FUNNEL_START_DOCSTRING = r"""

    The Funnel Transformer model was proposed in [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient
    Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.

    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`FunnelConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""


# Shared description of the `forward` inputs. `{0}` is filled in with the input shape through
# `FUNNEL_INPUTS_DOCSTRING.format(...)` by the `add_start_docstrings_to_model_forward` decorators below.
FUNNEL_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""


# Base class of the Funnel Transformer model: it outputs raw hidden states, with no upsampling head (also
# called decoder) and no task-specific head on top. It holds the model's basic structure and methods.
@add_start_docstrings(
    """
    The base Funnel Transformer Model transformer outputting raw hidden-states without upsampling head (also called
    decoder) or any task-specific head on top.
    """,
    FUNNEL_START_DOCSTRING,
)
class FunnelBaseModel(FunnelPreTrainedModel):
    # Base implementation of the Funnel Transformer: it owns the embedding layer and the encoder, exposes
    # accessors for the input embeddings, and its forward pass returns the encoder output as-is.

    def __init__(self, config: FunnelConfig) -> None:
        """
        Build the embedding layer and the encoder from `config`.

        Args:
            config (FunnelConfig): Object holding the model configuration.
        """
        super().__init__(config)

        self.embeddings = FunnelEmbeddings(config)
        self.encoder = FunnelEncoder(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Embedding:
        """
        Return the input embedding layer.

        Returns:
            nn.Embedding: The module stored as `self.embeddings.word_embeddings`.
        """
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, new_embeddings: nn.Embedding) -> None:
        """
        Replace the input embedding layer.

        Args:
            new_embeddings (nn.Embedding): Module to store as `self.embeddings.word_embeddings`.
        """
        self.embeddings.word_embeddings = new_embeddings

    @add_start_docstrings_to_model_forward(FUNNEL_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint="funnel-transformer/small-base",
        output_type=BaseModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutput]:
        # NOTE: `position_ids` and `head_mask` are accepted but not read anywhere in this method.

        # Fall back to the configuration defaults for any flag left as None.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Exactly one of `input_ids` / `inputs_embeds` must be given.
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
            input_shape = input_ids.size()
        elif inputs_embeds is not None:
            # Drop the trailing (embedding) dimension.
            input_shape = inputs_embeds.size()[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        device = input_ids.device if input_ids is not None else inputs_embeds.device

        # Defaults: attend to every position, and put every token in segment 0.
        if attention_mask is None:
            attention_mask = torch.ones(input_shape, device=device)
        if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

        # TODO: deal with head_mask
        if inputs_embeds is None:
            inputs_embeds = self.embeddings(input_ids)

        encoder_outputs = self.encoder(
            inputs_embeds,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        return encoder_outputs
# Bare Funnel Transformer (embeddings + encoder + decoder) that outputs raw hidden states with no specific head.
@add_start_docstrings(
    "The bare Funnel Transformer Model transformer outputting raw hidden-states without any specific head on top.",
    FUNNEL_START_DOCSTRING,
)
class FunnelModel(FunnelPreTrainedModel):
    def __init__(self, config: FunnelConfig) -> None:
        """
        Build the embedding layer, the encoder and the decoder from `config`.

        Args:
            config (FunnelConfig): Object holding the model configuration.
        """
        super().__init__(config)
        self.config = config
        self.embeddings = FunnelEmbeddings(config)
        self.encoder = FunnelEncoder(config)
        self.decoder = FunnelDecoder(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Embedding:
        """Return the module stored as `self.embeddings.word_embeddings`."""
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, new_embeddings: nn.Embedding) -> None:
        """Store `new_embeddings` as `self.embeddings.word_embeddings`."""
        self.embeddings.word_embeddings = new_embeddings

    @add_start_docstrings_to_model_forward(FUNNEL_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=BaseModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutput]:
        # Fall back to the configuration defaults for any flag left as None.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Exactly one of `input_ids` / `inputs_embeds` must be given.
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
            input_shape = input_ids.size()
        elif inputs_embeds is not None:
            # Drop the trailing (embedding) dimension; the leading dimensions are kept.
            input_shape = inputs_embeds.size()[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        device = input_ids.device if input_ids is not None else inputs_embeds.device

        # Defaults: attend to every position, and put every token in segment 0.
        if attention_mask is None:
            attention_mask = torch.ones(input_shape, device=device)
        if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

        # TODO: deal with head_mask
        if inputs_embeds is None:
            inputs_embeds = self.embeddings(input_ids)

        # Hidden states are always requested from the encoder, whatever the caller asked for, because the
        # decoder call below reads one of them as `first_block_hidden`.
        encoder_outputs = self.encoder(
            inputs_embeds,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            output_attentions=output_attentions,
            output_hidden_states=True,
            return_dict=return_dict,
        )

        decoder_outputs = self.decoder(
            final_hidden=encoder_outputs[0],
            # Entry of the encoder hidden-state tuple at index `block_sizes[0]`.
            first_block_hidden=encoder_outputs[1][self.config.block_sizes[0]],
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        if not return_dict:
            # `idx` tracks the position of the next optional entry in the decoder tuple, which only holds
            # the entries that were requested.
            idx = 0
            outputs = (decoder_outputs[0],)
            if output_hidden_states:
                idx += 1
                # Encoder hidden states followed by decoder hidden states.
                outputs = outputs + (encoder_outputs[1] + decoder_outputs[idx],)
            if output_attentions:
                idx += 1
                # Encoder attentions followed by decoder attentions.
                outputs = outputs + (encoder_outputs[2] + decoder_outputs[idx],)
            return outputs

        return BaseModelOutput(
            last_hidden_state=decoder_outputs[0],
            hidden_states=(encoder_outputs.hidden_states + decoder_outputs.hidden_states)
            if output_hidden_states
            else None,
            attentions=(encoder_outputs.attentions + decoder_outputs.attentions) if output_attentions else None,
        )
@add_start_docstrings(
    """
    Funnel Transformer model with a binary classification head on top as used during pretraining for identifying
    generated tokens.
    """,
    FUNNEL_START_DOCSTRING,
)
class FunnelForPreTraining(FunnelPreTrainedModel):
    def __init__(self, config: FunnelConfig) -> None:
        super().__init__(config)

        # Backbone model.
        self.funnel = FunnelModel(config)
        # Head producing one logit per token.
        self.discriminator_predictions = FunnelDiscriminatorPredictions(config)
        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(FUNNEL_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @replace_return_docstrings(output_type=FunnelForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, FunnelForPreTrainingOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the ELECTRA-style loss. Input should be a sequence of tokens (see `input_ids`
            docstring) Indices should be in `[0, 1]`:

            - 0 indicates the token is an original token,
            - 1 indicates the token was replaced.

        Returns:

        Examples:

        ```
        >>> from transformers import AutoTokenizer, FunnelForPreTraining
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("funnel-transformer/small")
        >>> model = FunnelForPreTraining.from_pretrained("funnel-transformer/small")

        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> logits = model(**inputs).logits
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Run the backbone; element 0 of its output is the sequence output.
        discriminator_hidden_states = self.funnel(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        discriminator_sequence_output = discriminator_hidden_states[0]

        logits = self.discriminator_predictions(discriminator_sequence_output)

        loss = None
        if labels is not None:
            loss_fct = nn.BCEWithLogitsLoss()
            if attention_mask is not None:
                # Only positions whose attention mask equals 1 contribute to the loss.
                active_loss = attention_mask.view(-1, discriminator_sequence_output.shape[1]) == 1
                active_logits = logits.view(-1, discriminator_sequence_output.shape[1])[active_loss]
                active_labels = labels[active_loss]
                loss = loss_fct(active_logits, active_labels.float())
            else:
                # No mask given: every position contributes.
                loss = loss_fct(logits.view(-1, discriminator_sequence_output.shape[1]), labels.float())

        if not return_dict:
            output = (logits,) + discriminator_hidden_states[1:]
            return ((loss,) + output) if loss is not None else output

        return FunnelForPreTrainingOutput(
            loss=loss,
            logits=logits,
            hidden_states=discriminator_hidden_states.hidden_states,
            attentions=discriminator_hidden_states.attentions,
        )
# Funnel Transformer with a linear language-modeling head applied to the backbone's first output.
@add_start_docstrings("""Funnel Transformer Model with a `language modeling` head on top.""", FUNNEL_START_DOCSTRING)
class FunnelForMaskedLM(FunnelPreTrainedModel):
    # Parameter names listed as tied weights for this model.
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config: FunnelConfig) -> None:
        super().__init__(config)

        # Backbone model.
        self.funnel = FunnelModel(config)
        # Language-modeling head: projects `config.d_model` features to `config.vocab_size` scores.
        self.lm_head = nn.Linear(config.d_model, config.vocab_size)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self) -> nn.Linear:
        """Return the language-modeling head (`self.lm_head`)."""
        return self.lm_head

    def set_output_embeddings(self, new_embeddings: nn.Embedding) -> None:
        """Store `new_embeddings` as the language-modeling head (`self.lm_head`)."""
        self.lm_head = new_embeddings

    @add_start_docstrings_to_model_forward(FUNNEL_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=MaskedLMOutput,
        config_class=_CONFIG_FOR_DOC,
        mask="<mask>",
    )
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, MaskedLMOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.funnel(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Element 0 of the backbone output is the last hidden state.
        last_hidden_state = outputs[0]
        prediction_logits = self.lm_head(last_hidden_state)

        masked_lm_loss = None
        if labels is not None:
            # CrossEntropyLoss's default `ignore_index` is -100, so labels set to -100 are skipped.
            loss_fct = CrossEntropyLoss()
            masked_lm_loss = loss_fct(prediction_logits.view(-1, self.config.vocab_size), labels.view(-1))

        if not return_dict:
            output = (prediction_logits,) + outputs[1:]
            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

        return MaskedLMOutput(
            loss=masked_lm_loss,
            logits=prediction_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
@add_start_docstrings(
    """
    Funnel Transformer Model with a sequence classification/regression head on top (two linear layer on top of the
    first timestep of the last hidden state) e.g. for GLUE tasks.
    """,
    FUNNEL_START_DOCSTRING,
)
class FunnelForSequenceClassification(FunnelPreTrainedModel):
    def __init__(self, config: FunnelConfig) -> None:
        super().__init__(config)
        self.num_labels = config.num_labels
        self.config = config

        # Backbone model (the base variant).
        self.funnel = FunnelBaseModel(config)
        # Classification head with `config.num_labels` outputs.
        self.classifier = FunnelClassificationHead(config, config.num_labels)
        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(FUNNEL_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint="funnel-transformer/small-base",
        output_type=SequenceClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SequenceClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.funnel(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Element 0 of the backbone output is the last hidden state; index 0 along its second
        # dimension is used as the pooled representation.
        last_hidden_state = outputs[0]
        pooled_output = last_hidden_state[:, 0]
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            # When the problem type is unset, infer it from `num_labels` and the label dtype.
            # The inferred value is written back to `self.config`, so it persists across calls.
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    # Single regression target: squeeze both sides before comparing.
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
@add_start_docstrings(
    """
    Funnel Transformer Model with a multiple choice classification head on top (two linear layer on top of the first
    timestep of the last hidden state, and a softmax) e.g. for RocStories/SWAG tasks.
    """,
    FUNNEL_START_DOCSTRING,
)
class FunnelForMultipleChoice(FunnelPreTrainedModel):
    def __init__(self, config: FunnelConfig) -> None:
        super().__init__(config)

        # Backbone whose first output feeds the classification head.
        self.funnel = FunnelBaseModel(config)
        # Head built with a second argument of 1 -- presumably the number of output labels, i.e. a single
        # score per flattened choice (consistent with the `view(-1, num_choices)` in `forward`).
        self.classifier = FunnelClassificationHead(config, 1)
        # Initialize weights and apply final processing.
        self.post_init()

    @add_start_docstrings_to_model_forward(FUNNEL_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint="funnel-transformer/small-base",
        output_type=MultipleChoiceModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, MultipleChoiceModelOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        """
        # NOTE(review): the other arguments are presumably documented by FUNNEL_INPUTS_DOCSTRING, which the
        # decorator above attaches to this method -- confirm against the decorator's implementation.

        # Fall back to the configuration default when the caller did not choose an output format.
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        # The number of choices is the size of dimension 1 of whichever input was provided.
        num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]

        # Fold the choice dimension into the leading dimension so every choice is run through the backbone
        # as its own sequence. Inputs that were not provided stay None.
        input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
        attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
        token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
        inputs_embeds = (
            inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
            if inputs_embeds is not None
            else None
        )

        # Run the flattened inputs through the Funnel backbone.
        outputs = self.funnel(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Use position 0 along dimension 1 of the backbone's first output as the pooled representation.
        last_hidden_state = outputs[0]
        pooled_output = last_hidden_state[:, 0]
        # Score every flattened choice.
        logits = self.classifier(pooled_output)
        # Regroup the scores so that each row holds the `num_choices` scores of one example.
        reshaped_logits = logits.view(-1, num_choices)

        # The loss is only computed when labels are supplied.
        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(reshaped_logits, labels)

        # Tuple output: logits followed by the remaining backbone outputs, with the loss first when computed.
        if not return_dict:
            output = (reshaped_logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        # Structured output.
        return MultipleChoiceModelOutput(
            loss=loss,
            logits=reshaped_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
"""
Funnel Transformer Model with a span classification head on top for extractive question-answering tasks like SQuAD
"""
@add_start_docstrings(
    """
    Funnel Transformer Model with a token classification head on top (a linear layer on top of the hidden-states
    output) e.g. for Named-Entity-Recognition (NER) tasks.
    """,
    FUNNEL_START_DOCSTRING,
)
class FunnelForTokenClassification(FunnelPreTrainedModel):
    def __init__(self, config: FunnelConfig) -> None:
        super().__init__(config)
        self.num_labels = config.num_labels

        # Sub-modules, created in this order: Funnel backbone, dropout, per-token linear classifier.
        self.funnel = FunnelModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(FUNNEL_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TokenClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, TokenClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        """
        # An explicit caller choice wins; otherwise use the configuration default.
        if return_dict is None:
            return_dict = self.config.use_return_dict

        # Run the backbone with every argument forwarded unchanged.
        encoder_outputs = self.funnel(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Dropout on the backbone's first output, then the linear projection to per-label scores.
        token_logits = self.classifier(self.dropout(encoder_outputs[0]))

        # Cross-entropy over all positions at once: logits are flattened to rows of `num_labels` scores
        # and labels to a matching 1-D tensor. Skipped entirely when no labels are given.
        loss = None
        if labels is not None:
            flat_logits = token_logits.view(-1, self.num_labels)
            flat_labels = labels.view(-1)
            loss = CrossEntropyLoss()(flat_logits, flat_labels)

        if return_dict:
            return TokenClassifierOutput(
                loss=loss,
                logits=token_logits,
                hidden_states=encoder_outputs.hidden_states,
                attentions=encoder_outputs.attentions,
            )

        # Tuple form: logits followed by the backbone's remaining outputs, with the loss first when computed.
        tuple_output = (token_logits,) + encoder_outputs[1:]
        return tuple_output if loss is None else (loss,) + tuple_output
    # (FunnelForTokenClassification ends here; the question-answering class below is defined at module level.)


@add_start_docstrings(
    """
    Funnel Transformer Model with a span classification head on top for extractive question-answering tasks like SQuAD
    (a linear layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
    """,
    FUNNEL_START_DOCSTRING,
)
class FunnelForQuestionAnswering(FunnelPreTrainedModel):
    def __init__(self, config: FunnelConfig) -> None:
        super().__init__(config)
        self.num_labels = config.num_labels

        # Funnel backbone plus a linear layer that maps each hidden state to `config.num_labels` scores.
        self.funnel = FunnelModel(config)
        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing.
        self.post_init()

    @add_start_docstrings_to_model_forward(FUNNEL_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=QuestionAnsweringModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        start_positions: Optional[torch.Tensor] = None,
        end_positions: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, QuestionAnsweringModelOutput]:
        r"""
        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        """
        # Fall back to the configuration default when the caller did not choose an output format.
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Run the Funnel backbone.
        outputs = self.funnel(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # The backbone's first output is the last hidden state.
        last_hidden_state = outputs[0]

        # Project to per-position scores and split the last dimension into size-1 chunks. Unpacking into
        # exactly two names means the projection must produce two scores per position (start and end).
        logits = self.qa_outputs(last_hidden_state)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        total_loss = None
        if start_positions is not None and end_positions is not None:
            # Drop a trailing extra dimension when present (the original comment attributes it to
            # multi-GPU runs).
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # Clamp positions into [0, sequence_length]. Anything beyond the sequence lands exactly on
            # `ignored_index`, which the loss below is told to ignore.
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            # Average of the start-position and end-position cross-entropy losses.
            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        # Tuple output: both logit tensors followed by the remaining backbone outputs, with the loss first
        # when it was computed.
        if not return_dict:
            output = (start_logits, end_logits) + outputs[1:]
            return ((total_loss,) + output) if total_loss is not None else output

        # Structured output.
        return QuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

Transformers-源码解析-五十-

Transformers 源码解析（五十）

.\models\fnet\tokenization_fnet.py

.\models\fnet\tokenization_fnet_fast.py

.\models\fnet\__init__.py

.\models\focalnet\configuration_focalnet.py

.\models\focalnet\convert_focalnet_to_hf_format.py

.\models\focalnet\modeling_focalnet.py

.\models\focalnet\__init__.py

.\models\fsmt\configuration_fsmt.py

.\models\fsmt\convert_fsmt_original_pytorch_checkpoint_to_pytorch.py

.\models\fsmt\modeling_fsmt.py

.\models\fsmt\tokenization_fsmt.py

.\models\fsmt\__init__.py

.\models\funnel\configuration_funnel.py

.\models\funnel\convert_funnel_original_tf_checkpoint_to_pytorch.py

.\models\funnel\modeling_funnel.py

`.\models\fnet\tokenization_fnet.py`

`.\models\fnet\tokenization_fnet_fast.py`

`.\models\fnet\__init__.py`

`.\models\focalnet\configuration_focalnet.py`

`.\models\focalnet\convert_focalnet_to_hf_format.py`

`.\models\focalnet\modeling_focalnet.py`

`.\models\focalnet\__init__.py`

`.\models\fsmt\configuration_fsmt.py`

`.\models\fsmt\convert_fsmt_original_pytorch_checkpoint_to_pytorch.py`

`.\models\fsmt\modeling_fsmt.py`

`.\models\fsmt\tokenization_fsmt.py`

`.\models\fsmt\__init__.py`

`.\models\funnel\configuration_funnel.py`

`.\models\funnel\convert_funnel_original_tf_checkpoint_to_pytorch.py`

`.\models\funnel\modeling_funnel.py`