Transformers 源码解析（六十三）

`.\models\layoutlmv3\processing_layoutlmv3.py`

"""
Processor class for LayoutLMv3.
"""

# 导入警告模块
import warnings
# 引入类型提示模块中的相关类型
from typing import List, Optional, Union

# 导入处理工具的混合处理器
from ...processing_utils import ProcessorMixin
# 导入基础的令牌化工具相关模块
from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
# 导入张量类型
from ...utils import TensorType

# 定义 LayoutLMv3Processor 类，继承自 ProcessorMixin
class LayoutLMv3Processor(ProcessorMixin):
    r"""
    Constructs a LayoutLMv3 processor which combines a LayoutLMv3 image processor and a LayoutLMv3 tokenizer into a
    single processor.

    [`LayoutLMv3Processor`] offers all the functionalities you need to prepare data for the model.

    It first uses [`LayoutLMv3ImageProcessor`] to resize and normalize document images, and optionally applies OCR to
    get words and normalized bounding boxes. These are then provided to [`LayoutLMv3Tokenizer`] or
    [`LayoutLMv3TokenizerFast`], which turns the words and bounding boxes into token-level `input_ids`,
    `attention_mask`, `token_type_ids`, `bbox`. Optionally, one can provide integer `word_labels`, which are turned
    into token-level `labels` for token classification tasks (such as FUNSD, CORD).

    Args:
        image_processor (`LayoutLMv3ImageProcessor`, *optional*):
            An instance of [`LayoutLMv3ImageProcessor`]. The image processor is a required input.
        tokenizer (`LayoutLMv3Tokenizer` or `LayoutLMv3TokenizerFast`, *optional*):
            An instance of [`LayoutLMv3Tokenizer`] or [`LayoutLMv3TokenizerFast`]. The tokenizer is a required input.
    """

    # Component names, listed in the same order in which the components are handed to the parent initializer.
    attributes = ["image_processor", "tokenizer"]
    # Class name of the image processor component.
    image_processor_class = "LayoutLMv3ImageProcessor"
    # Class names accepted for the tokenizer component.
    tokenizer_class = ("LayoutLMv3Tokenizer", "LayoutLMv3TokenizerFast")

    def __init__(self, image_processor=None, tokenizer=None, **kwargs):
        """
        Store the image processor and tokenizer on the processor.

        Args:
            image_processor: The image processor component. May be omitted only when the deprecated
                `feature_extractor` keyword is supplied instead.
            tokenizer: The tokenizer component (required).
            **kwargs: Only `feature_extractor` is consulted (deprecated alias of `image_processor`); nothing from
                `kwargs` is forwarded to the parent initializer.

        Raises:
            ValueError: If no image processor (or legacy feature extractor) or no tokenizer is given.
        """
        legacy_key = "feature_extractor"
        feature_extractor = None
        if legacy_key in kwargs:
            # The legacy keyword still works, but the caller is told to migrate.
            warnings.warn(
                "The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
                " instead.",
                FutureWarning,
            )
            feature_extractor = kwargs.pop(legacy_key)

        # An explicit `image_processor` wins over the legacy alias.
        if image_processor is None:
            image_processor = feature_extractor

        if image_processor is None:
            raise ValueError("You need to specify an `image_processor`.")
        if tokenizer is None:
            raise ValueError("You need to specify a `tokenizer`.")

        super().__init__(image_processor, tokenizer)

    def __call__(
        self,
        images,
        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
        text_pair: Optional[Union[PreTokenizedInput, List[PreTokenizedInput]]] = None,
        boxes: Union[List[List[int]], List[List[List[int]]]] = None,
        word_labels: Optional[Union[List[int], List[List[int]]]] = None,
        add_special_tokens: bool = True,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = None,
        max_length: Optional[int] = None,
        stride: int = 0,
        pad_to_multiple_of: Optional[int] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        return_tensors: Optional[Union[str, TensorType]] = None,
        **kwargs,
    ):
        # NOTE(review): the implementation is elided (`...`) in this excerpt, so as written a call returns None.
        ...

    def get_overflowing_images(self, images, overflow_to_sample_mapping):
        """
        Repeat images so that there is one image per entry of `overflow_to_sample_mapping`.

        Args:
            images: Indexable collection of images, one per original sample.
            overflow_to_sample_mapping: For each output entry, the index into `images` of the sample it came from.

        Returns:
            A list holding `images[idx]` for every `idx` in `overflow_to_sample_mapping`, in order.

        Raises:
            ValueError: If the resulting list and the mapping differ in length.
        """
        expanded = [images[sample_idx] for sample_idx in overflow_to_sample_mapping]

        expanded_count = len(expanded)
        mapping_count = len(overflow_to_sample_mapping)
        if expanded_count != mapping_count:
            raise ValueError(
                "Expected length of images to be the same as the length of `overflow_to_sample_mapping`, but got"
                f" {expanded_count} and {mapping_count}"
            )

        return expanded

    def batch_decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please
        refer to the docstring of this method for more information.
        """
        tokenizer = self.tokenizer
        return tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer
        to the docstring of this method for more information.
        """
        tokenizer = self.tokenizer
        return tokenizer.decode(*args, **kwargs)

    @property
    def model_input_names(self):
        """Names of the model inputs reported by this processor (a fresh list on every access)."""
        return ["input_ids", "bbox", "attention_mask", "pixel_values"]

    @property
    def feature_extractor_class(self):
        """Deprecated alias of `image_processor_class`; emits a `FutureWarning` on every access."""
        deprecation_message = (
            "`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead."
        )
        warnings.warn(deprecation_message, FutureWarning)
        return self.image_processor_class

    @property
    def feature_extractor(self):
        """Deprecated alias of `image_processor`; emits a `FutureWarning` on every access."""
        deprecation_message = (
            "`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead."
        )
        warnings.warn(deprecation_message, FutureWarning)
        return self.image_processor

`.\models\layoutlmv3\tokenization_layoutlmv3.py`

# coding=utf-8
# 设置文件编码为UTF-8，确保支持各种语言字符集
# Copyright The HuggingFace Inc. team. All rights reserved.
# 版权声明，保留所有权利
#
# Licensed under the Apache License, Version 2.0 (the "License");
# 根据Apache 2.0许可证授权，允许使用此代码
# you may not use this file except in compliance with the License.
# 除非遵守许可证，否则禁止使用此文件
# You may obtain a copy of the License at
# 可通过下方链接获取许可证副本
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# 除非适用法律另有要求或另有书面约定，依据本许可证分发的软件（接下一行）
# distributed under the License is distributed on an "AS IS" BASIS,
# 软件按原样提供，不附带任何担保
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# 不提供任何明示或默示的担保或条件
# See the License for the specific language governing permissions and
# 详细了解许可证的具体条款和条件，请参阅许可证
# limitations under the License.
# 许可证下的限制
"""Tokenization class for LayoutLMv3. Same as LayoutLMv2, but RoBERTa-like BPE tokenization instead of WordPiece."""
# 为LayoutLMv3设计的分词类，与LayoutLMv2相同，但使用类似RoBERTa的BPE分词而不是WordPiece

import json
import os
from functools import lru_cache
from typing import Dict, List, Optional, Tuple, Union

import regex as re

from ...tokenization_utils import AddedToken, PreTrainedTokenizer
from ...tokenization_utils_base import (
    BatchEncoding,
    EncodedInput,
    PreTokenizedInput,
    TextInput,
    TextInputPair,
    TruncationStrategy,
)
# 导入所需的模块和类

from ...utils import PaddingStrategy, TensorType, add_end_docstrings, logging
# 导入工具类和函数

logger = logging.get_logger(__name__)
# 获取用于当前文件名的日志记录器

# File names used for the vocabulary and the BPE merges.
VOCAB_FILES_NAMES = dict(vocab_file="vocab.json", merges_file="merges.txt")

# For each vocabulary file kind, the download URL per pretrained checkpoint.
PRETRAINED_VOCAB_FILES_MAP = {
    file_kind: {
        checkpoint: f"https://huggingface.co/{checkpoint}/raw/main/{file_name}"
        for checkpoint in ("microsoft/layoutlmv3-base", "microsoft/layoutlmv3-large")
    }
    for file_kind, file_name in VOCAB_FILES_NAMES.items()
}

# Per-checkpoint size; exposed on the tokenizer class as `max_model_input_sizes`.
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    checkpoint: 512 for checkpoint in PRETRAINED_VOCAB_FILES_MAP["vocab_file"]
}

"""


"""


@lru_cache()
# Copied from transformers.models.roberta.tokenization_roberta.bytes_to_unicode
def bytes_to_unicode():
    """
    Build the byte -> unicode-character lookup table used by the byte-level BPE.

    Every one of the 256 byte values is mapped to a distinct single-character string. Whitespace and control
    characters, which the BPE code cannot handle, are never used as targets. Working on such strings lets the
    reversible BPE cover arbitrary UTF-8 input without needing a huge set of unicode characters in the vocabulary
    to avoid UNKs.

    Returns:
        `dict`: Mapping from each integer in `range(256)` to a one-character string. The result is memoised by
        `lru_cache`, so every call returns the same dict object.
    """
    # Byte values in these three ranges are mapped to the character with the same code point.
    direct = [
        *range(ord("!"), ord("~") + 1),
        *range(ord("¡"), ord("¬") + 1),
        *range(ord("®"), ord("ÿ") + 1),
    ]
    # All remaining byte values, in ascending order, are assigned consecutive code points starting at 256.
    shifted = [byte for byte in range(2**8) if byte not in direct]

    byte_values = direct + shifted
    code_points = direct + [2**8 + offset for offset in range(len(shifted))]
    return dict(zip(byte_values, map(chr, code_points)))
# Copied from transformers.models.roberta.tokenization_roberta.get_pairs (adapted to tolerate empty input)
def get_pairs(word):
    """
    Return set of symbol pairs in a word.

    Word is represented as tuple of symbols (symbols being variable-length strings).

    Args:
        word: Sliceable sequence of symbols (for example a tuple of strings, or a plain string).

    Returns:
        `set`: Every `(left, right)` tuple of adjacent symbols in `word`. The set is empty when `word` holds fewer
        than two symbols, including when it is empty.
    """
    # zip() stops at the shorter argument, so an empty or one-symbol word simply produces no pairs
    # rather than failing on an out-of-range index.
    return set(zip(word, word[1:]))


class LayoutLMv3Tokenizer(PreTrainedTokenizer):
    r"""
    Construct a LayoutLMv3 tokenizer. Based on [`RoBERTatokenizer`] (Byte Pair Encoding or BPE).
    [`LayoutLMv3Tokenizer`] can be used to turn words, word-level bounding boxes and optional word labels to
    token-level `input_ids`, `attention_mask`, `token_type_ids`, `bbox`, and optional `labels` (for token
    classification).

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    [`LayoutLMv3Tokenizer`] runs end-to-end tokenization: punctuation splitting and wordpiece. It also turns the
    word-level bounding boxes into token-level bounding boxes.

    """

    # 定义类属性，存储词汇文件名
    vocab_files_names = VOCAB_FILES_NAMES
    # 定义类属性，存储预训练词汇文件映射
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    # 定义类属性，存储模型最大输入尺寸
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    # 定义类属性，存储模型输入名称列表
    model_input_names = ["input_ids", "attention_mask", "bbox"]

    def __init__(
        self,
        vocab_file,
        merges_file,
        errors="replace",
        bos_token="<s>",
        eos_token="</s>",
        sep_token="</s>",
        cls_token="<s>",
        unk_token="<unk>",
        pad_token="<pad>",
        mask_token="<mask>",
        add_prefix_space=True,
        cls_token_box=[0, 0, 0, 0],
        sep_token_box=[0, 0, 0, 0],
        pad_token_box=[0, 0, 0, 0],
        pad_token_label=-100,
        only_label_first_subword=True,
        **kwargs,
    ):
        """
        Load the vocabulary and BPE merges, then initialise the parent tokenizer.

        Args:
            vocab_file: Path to a JSON file; its content becomes `self.encoder` (token -> id).
            merges_file: Path to a text file of BPE merges. The first line and the last piece after splitting on
                newlines are skipped; every remaining line is split on whitespace into one merge tuple, ranked by
                its position in the file.
            errors: Error handling mode used when decoding bytes as UTF-8 in `convert_tokens_to_string`.
            bos_token, eos_token, sep_token, cls_token, unk_token, pad_token: Special tokens. Plain strings are
                wrapped in `AddedToken` with `lstrip=False, rstrip=False`.
            mask_token: Mask token. A plain string is wrapped in `AddedToken` with `lstrip=True, rstrip=False`.
            add_prefix_space: Default used by `prepare_for_tokenization` when deciding whether to prepend a space.
            cls_token_box, sep_token_box, pad_token_box: Boxes stored on the instance and forwarded to the parent
                initializer.
            pad_token_label: Label value stored on the instance and forwarded to the parent initializer.
            only_label_first_subword: Flag stored on the instance and forwarded to the parent initializer.
            **kwargs: Forwarded unchanged to the parent initializer.
        """
        # NOTE(review): the three `*_token_box` defaults are list literals, so every instance created without an
        # explicit box stores the same list object; treat those lists as read-only.
        bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
        eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
        sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token
        cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token
        unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
        pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token

        # Mask token behave like a normal word, i.e. include the space before it
        mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token

        # token -> id table read from the JSON vocabulary, and its inverse.
        with open(vocab_file, encoding="utf-8") as vocab_handle:
            self.encoder = json.load(vocab_handle)
        self.decoder = {v: k for k, v in self.encoder.items()}

        self.errors = errors  # how to handle errors in decoding

        # byte -> unicode character table, and its inverse.
        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}

        # Merge rules: skip the first line and the trailing piece, then rank each rule by its position.
        with open(merges_file, encoding="utf-8") as merges_handle:
            bpe_merges = merges_handle.read().split("\n")[1:-1]
        bpe_merges = [tuple(merge.split()) for merge in bpe_merges]
        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))

        # Memo of already computed `bpe()` results.
        self.cache = {}
        self.add_prefix_space = add_prefix_space

        # Pre-tokenization pattern. NOTE: no `re.IGNORECASE` flag is passed, so the contraction alternatives
        # ('s, 't, 're, ...) only match in lower case.
        self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")

        # additional properties
        self.cls_token_box = cls_token_box
        self.sep_token_box = sep_token_box
        self.pad_token_box = pad_token_box
        self.pad_token_label = pad_token_label
        self.only_label_first_subword = only_label_first_subword

        # The parent initializer runs last, once the vocabulary tables and extra attributes above are set.
        super().__init__(
            errors=errors,
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            sep_token=sep_token,
            cls_token=cls_token,
            pad_token=pad_token,
            mask_token=mask_token,
            add_prefix_space=add_prefix_space,
            cls_token_box=cls_token_box,
            sep_token_box=sep_token_box,
            pad_token_box=pad_token_box,
            pad_token_label=pad_token_label,
            only_label_first_subword=only_label_first_subword,
            **kwargs,
        )

    @property
    # 从 transformers.models.roberta.tokenization_roberta.RobertaTokenizer.vocab_size 处复制而来
    def vocab_size(self):
        """Number of entries in `self.encoder`."""
        base_vocab = self.encoder
        return len(base_vocab)

    # 从 transformers.models.roberta.tokenization_roberta.RobertaTokenizer.get_vocab 处复制而来
    # 复制自 transformers.models.roberta.tokenization_roberta.RobertaTokenizer.get_vocab 方法
    def get_vocab(self):
        """
        Return a new dict combining `self.encoder` with `self.added_tokens_encoder`.

        Entries from `self.added_tokens_encoder` take precedence on key collisions. Neither source mapping is
        modified.
        """
        merged = dict(self.encoder)
        merged.update(self.added_tokens_encoder)
        return merged

    # 复制自 transformers.models.roberta.tokenization_roberta.RobertaTokenizer.bpe 方法
    def bpe(self, token):
        """
        Apply the ranked BPE merges to `token` and return the resulting symbols joined by single spaces.

        Results are memoised in `self.cache`. A token for which `get_pairs` yields no pairs is returned unchanged
        and is not cached.
        """
        try:
            return self.cache[token]
        except KeyError:
            pass

        symbols = tuple(token)
        candidate_pairs = get_pairs(symbols)
        if not candidate_pairs:
            return token

        ranks = self.bpe_ranks
        while True:
            # Pick the adjacent pair with the lowest rank; pairs without a rank compare as infinity.
            best = min(candidate_pairs, key=lambda pair: ranks.get(pair, float("inf")))
            if best not in ranks:
                # None of the remaining pairs is a known merge.
                break

            left, right = best
            merged = []
            pos = 0
            total = len(symbols)
            while pos < total:
                try:
                    hit = symbols.index(left, pos)
                except ValueError:
                    # No further occurrence of `left`: keep the tail untouched.
                    merged.extend(symbols[pos:])
                    break

                # Keep everything before the occurrence, then decide whether it starts the pair.
                merged.extend(symbols[pos:hit])
                pos = hit
                if pos < total - 1 and symbols[pos + 1] == right:
                    merged.append(left + right)
                    pos += 2
                else:
                    merged.append(symbols[pos])
                    pos += 1

            symbols = tuple(merged)
            if len(symbols) == 1:
                # Everything collapsed into a single symbol.
                break
            candidate_pairs = get_pairs(symbols)

        result = " ".join(symbols)
        self.cache[token] = result
        return result

    # 复制自 transformers.models.roberta.tokenization_roberta.RobertaTokenizer._tokenize 方法
    def _tokenize(self, text):
        """对字符串进行分词处理。"""
        bpe_tokens = []
        # 使用正则表达式找到所有匹配的 token
        for token in re.findall(self.pat, text):
            # 将 token 中的每个字节编码成 Unicode 字符串，避免 BPE 中的控制标记（例如空格）
            token = "".join(
                self.byte_encoder[b] for b in token.encode("utf-8")
            )  # 将所有字节映射为 unicode 字符串，避免 BPE 的控制标记（在我们的情况下是空格）
            # 使用 BPE 算法处理 token，并将结果拆分为多个子 token
            bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
        return bpe_tokens

    # 复制自 transformers.models.roberta.tokenization_roberta.RobertaTokenizer._convert_token_to_id 方法
    def _convert_token_to_id(self, token):
        """使用词汇表将 token（字符串）转换为对应的 id。"""
        return self.encoder.get(token, self.encoder.get(self.unk_token))

    # 复制自 transformers.models.roberta.tokenization_roberta.RobertaTokenizer._convert_id_to_token 方法
    def _convert_id_to_token(self, index):
        """使用词汇表将索引（整数）转换为对应的 token（字符串）。"""
        return self.decoder.get(index)

    # 复制自 transformers.models.roberta.tokenization_roberta.RobertaTokenizer.convert_tokens_to_string 方法
    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) in a single string."""
        joined = "".join(tokens)
        # Map every character back to its byte value, then decode the bytes as UTF-8 using `self.errors`.
        raw_bytes = bytearray(self.byte_decoder[char] for char in joined)
        return raw_bytes.decode("utf-8", errors=self.errors)
    # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.save_vocabulary
    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Write the vocabulary JSON file and the BPE merges file into `save_directory`.

        Args:
            save_directory: Existing directory that receives both files.
            filename_prefix: Optional prefix; when truthy, file names become `<prefix>-<name>`.

        Returns:
            A `(vocab_file, merge_file)` tuple with the written paths. When `save_directory` is not a directory an
            error is logged and `None` is returned.
        """
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return

        name_prefix = filename_prefix + "-" if filename_prefix else ""
        vocab_file = os.path.join(save_directory, name_prefix + VOCAB_FILES_NAMES["vocab_file"])
        merge_file = os.path.join(save_directory, name_prefix + VOCAB_FILES_NAMES["merges_file"])

        with open(vocab_file, "w", encoding="utf-8") as vocab_handle:
            vocab_handle.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")

        # Rank the next merge is expected to have if the ranks are consecutive.
        expected_rank = 0
        with open(merge_file, "w", encoding="utf-8") as merges_handle:
            merges_handle.write("#version: 0.2\n")
            for symbols, rank in sorted(self.bpe_ranks.items(), key=lambda item: item[1]):
                if rank != expected_rank:
                    # A gap in the ranks is only reported; the merge is still written.
                    logger.warning(
                        f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
                        " Please check that the tokenizer is not corrupted!"
                    )
                merges_handle.write(" ".join(symbols) + "\n")
                expected_rank = rank + 1

        return vocab_file, merge_file
    
    # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.build_inputs_with_special_tokens
    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences by concatenating them and adding special
        tokens. The produced layout is:

        - single sequence: `<s> X </s>`
        - pair of sequences: `<s> A </s></s> B </s>`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
        start = [self.cls_token_id]
        end = [self.sep_token_id]

        first_segment = start + token_ids_0 + end
        if token_ids_1 is None:
            return first_segment

        # A pair gets two separators between the segments and one more at the end.
        return first_segment + end + token_ids_1 + end
    
    # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.get_special_tokens_mask
    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        # Inputs that already contain special tokens are handled by the parent implementation.
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        # Single sequence: one special token on each side.
        if token_ids_1 is None:
            return [1] + ([0] * len(token_ids_0)) + [1]

        # Pair: special tokens at both ends plus two between the segments.
        return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]

    # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.create_token_type_ids_from_sequences
    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. RoBERTa does not
        make use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of zeros.
        """
        separator = [self.sep_token_id]
        start = [self.cls_token_id]

        # Lay the sequence out exactly as it will look with special tokens, then emit one zero per position.
        layout = start + token_ids_0 + separator
        if token_ids_1 is not None:
            layout = layout + separator + token_ids_1 + separator
        return len(layout) * [0]

    def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
        """
        Optionally prepend a single space to `text` before tokenization.

        A space is added only when `is_split_into_words` or the effective `add_prefix_space` is truthy, `text` is
        non-empty and does not already start with whitespace, and `text` does not start with any key of
        `self.added_tokens_encoder`.

        Args:
            text: The text about to be tokenized.
            is_split_into_words: Whether the input is already split into words.
            **kwargs: May contain `add_prefix_space`, which overrides `self.add_prefix_space` for this call and is
                removed from `kwargs`.

        Returns:
            A `(text, kwargs)` tuple with the possibly prefixed text and the remaining keyword arguments.
        """
        add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space)

        if not (is_split_into_words or add_prefix_space):
            return (text, kwargs)
        if len(text) == 0 or text[0].isspace():
            return (text, kwargs)
        # If the text starts with a token that should not be split, no space is added before the text in any case.
        # It's necessary to match the fast tokenization
        if text.startswith(tuple(self.added_tokens_encoder)):
            return (text, kwargs)

        return (" " + text, kwargs)

    @add_end_docstrings(LAYOUTLMV3_ENCODE_KWARGS_DOCSTRING, LAYOUTLMV3_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
    # Copied from transformers.models.layoutlmv2.tokenization_layoutlmv2.LayoutLMv2Tokenizer.__call__
    # Call interface of the tokenizer: accepts several forms of text input plus boxes, word labels and the
    # encoding options listed in the signature.
    # NOTE(review): this region is structurally damaged in this excerpt and is not valid Python as written:
    #   * `__call__` has a signature but no body of its own;
    #   * `batch_encode_plus` (with its decorator) appears nested inside `__call__`, and the statements that
    #     follow its signature are indented at the same level as its `def` instead of inside it;
    #   * a dangling `) -> BatchEncoding:` further down is followed by statements that use `padding_strategy` /
    #     `truncation_strategy`, which looks like the body of a separate method (presumably `_batch_encode_plus`,
    #     the name called just above it) whose signature is missing.
    # The code is left untouched; restore it from the upstream file before relying on it.
    def __call__(
        self,
        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]],
        text_pair: Optional[Union[PreTokenizedInput, List[PreTokenizedInput]]] = None,
        boxes: Union[List[List[int]], List[List[List[int]]]] = None,
        word_labels: Optional[Union[List[int], List[List[int]]]] = None,
        add_special_tokens: bool = True,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = None,
        max_length: Optional[int] = None,
        stride: int = 0,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs,
    ):
        # `add_end_docstrings` appends the two LAYOUTLMV3_ENCODE_* docstring constants to the decorated
        # function's docstring.
        @add_end_docstrings(LAYOUTLMV3_ENCODE_KWARGS_DOCSTRING, LAYOUTLMV3_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
        # The body of this method is copied from layoutlmv2.tokenization_layoutlmv2.LayoutLMv2Tokenizer.batch_encode_plus
        def batch_encode_plus(
            self,
            batch_text_or_text_pairs: Union[
                List[TextInput],
                List[TextInputPair],
                List[PreTokenizedInput],
            ],
            is_pair: bool = None,
            boxes: Optional[List[List[List[int]]]] = None,
            word_labels: Optional[Union[List[int], List[List[int]]]] = None,
            add_special_tokens: bool = True,
            padding: Union[bool, str, PaddingStrategy] = False,
            truncation: Union[bool, str, TruncationStrategy] = None,
            max_length: Optional[int] = None,
            stride: int = 0,
            pad_to_multiple_of: Optional[int] = None,
            return_tensors: Optional[Union[str, TensorType]] = None,
            return_token_type_ids: Optional[bool] = None,
            return_attention_mask: Optional[bool] = None,
            return_overflowing_tokens: bool = False,
            return_special_tokens_mask: bool = False,
            return_offsets_mapping: bool = False,
            return_length: bool = False,
            verbose: bool = True,
            **kwargs,
        ):
        # Backward compatibility for the 'truncation_strategy', 'pad_to_max_length' style arguments: resolve the
        # user-facing padding / truncation options into strategies, the effective max length and remaining kwargs.
        padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
            padding=padding,
            truncation=truncation,
            max_length=max_length,
            pad_to_multiple_of=pad_to_multiple_of,
            verbose=verbose,
            **kwargs,
        )

        # Delegate to the internal `_batch_encode_plus`, forwarding every option.
        return self._batch_encode_plus(
            batch_text_or_text_pairs=batch_text_or_text_pairs,
            is_pair=is_pair,
            boxes=boxes,
            word_labels=word_labels,
            add_special_tokens=add_special_tokens,
            padding_strategy=padding_strategy,
            truncation_strategy=truncation_strategy,
            max_length=max_length,
            stride=stride,
            pad_to_multiple_of=pad_to_multiple_of,
            return_tensors=return_tensors,
            return_token_type_ids=return_token_type_ids,
            return_attention_mask=return_attention_mask,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_offsets_mapping=return_offsets_mapping,
            return_length=return_length,
            verbose=verbose,
            **kwargs,
        )
    ) -> BatchEncoding:
        # Offset mappings are rejected here with NotImplementedError; the message points to fast tokenizers.
        if return_offsets_mapping:
            raise NotImplementedError(
                "return_offset_mapping is not available when using Python tokenizers. "
                "To use this feature, change your tokenizer to one deriving from "
                "transformers.PreTrainedTokenizerFast."
            )

        # Prepare the whole batch through the private `_batch_prepare_for_model` helper.
        # Note that `return_offsets_mapping` and `**kwargs` are not forwarded in this call.
        batch_outputs = self._batch_prepare_for_model(
            batch_text_or_text_pairs=batch_text_or_text_pairs,
            is_pair=is_pair,
            boxes=boxes,
            word_labels=word_labels,
            add_special_tokens=add_special_tokens,
            padding_strategy=padding_strategy,
            truncation_strategy=truncation_strategy,
            max_length=max_length,
            stride=stride,
            pad_to_multiple_of=pad_to_multiple_of,
            return_attention_mask=return_attention_mask,
            return_token_type_ids=return_token_type_ids,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_length=return_length,
            return_tensors=return_tensors,
            verbose=verbose,
        )

        # Wrap the prepared batch in a `BatchEncoding` before returning it.
        return BatchEncoding(batch_outputs)

    @add_end_docstrings(LAYOUTLMV3_ENCODE_KWARGS_DOCSTRING, LAYOUTLMV3_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
    # 从 transformers.models.layoutlmv2.tokenization_layoutlmv2.LayoutLMv2Tokenizer._batch_prepare_for_model 复制而来
    def _batch_prepare_for_model(
        self,
        batch_text_or_text_pairs,
        is_pair: bool = None,
        boxes: Optional[List[List[int]]] = None,
        word_labels: Optional[List[List[int]]] = None,
        add_special_tokens: bool = True,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        stride: int = 0,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[str] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_length: bool = False,
        verbose: bool = True,
    ) -> BatchEncoding:
        """
        Encode each example of a batch with `prepare_for_model`, then pad and wrap the batch as a whole.

        Every example is encoded without padding, without an attention mask and without tensor conversion; those
        three steps are applied once to the collected batch through `self.pad` and `BatchEncoding`.

        Args:
            batch_text_or_text_pairs: one entry per example. When `is_pair` is truthy each entry is indexed with
                `[0]` and `[1]` to obtain the two sequences; otherwise the entry itself is the single sequence.
            boxes: iterated in lockstep with `batch_text_or_text_pairs`, one item per example.
            word_labels: optional; when given, `word_labels[idx]` is forwarded for example `idx`.

        Returns:
            A `BatchEncoding` built from the padded batch with `tensor_type=return_tensors`.
        """
        collected = {}

        for idx, (sample, sample_boxes) in enumerate(zip(batch_text_or_text_pairs, boxes)):
            # A pair example carries both sequences in one entry; a single example is the sequence itself.
            if is_pair:
                first_sequence, second_sequence = sample[0], sample[1]
            else:
                first_sequence, second_sequence = sample, None

            sample_labels = None if word_labels is None else word_labels[idx]

            encoded = self.prepare_for_model(
                first_sequence,
                second_sequence,
                sample_boxes,
                word_labels=sample_labels,
                add_special_tokens=add_special_tokens,
                padding=PaddingStrategy.DO_NOT_PAD.value,  # padding is applied to the whole batch below
                truncation=truncation_strategy.value,
                max_length=max_length,
                stride=stride,
                pad_to_multiple_of=None,  # padding is applied to the whole batch below
                return_attention_mask=False,  # the attention mask is requested from `self.pad` below
                return_token_type_ids=return_token_type_ids,
                return_overflowing_tokens=return_overflowing_tokens,
                return_special_tokens_mask=return_special_tokens_mask,
                return_length=return_length,
                return_tensors=None,  # tensor conversion happens once, on the whole batch
                prepend_batch_axis=False,
                verbose=verbose,
            )

            # Accumulate one list per output key, holding that key's value for every example.
            for key, value in encoded.items():
                collected.setdefault(key, []).append(value)

        padded = self.pad(
            collected,
            padding=padding_strategy.value,
            max_length=max_length,
            pad_to_multiple_of=pad_to_multiple_of,
            return_attention_mask=return_attention_mask,
        )

        return BatchEncoding(padded, tensor_type=return_tensors)

    @add_end_docstrings(LAYOUTLMV3_ENCODE_KWARGS_DOCSTRING)
    # 从 transformers.models.layoutlmv2.tokenization_layoutlmv2.LayoutLMv2Tokenizer.encode 复制而来
    def encode(
        self,
        text: Union[TextInput, PreTokenizedInput],
        text_pair: Optional[PreTokenizedInput] = None,
        boxes: Optional[List[List[int]]] = None,
        word_labels: Optional[List[int]] = None,
        add_special_tokens: bool = True,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = None,
        max_length: Optional[int] = None,
        stride: int = 0,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs,
    ) -> List[int]:
        """
        Encode the input and return only its `"input_ids"`.

        Every argument, including any extra `**kwargs`, is handed to `encode_plus` unchanged; the `"input_ids"`
        entry of that call's result is returned.
        """
        forwarded = {
            "text": text,
            "text_pair": text_pair,
            "boxes": boxes,
            "word_labels": word_labels,
            "add_special_tokens": add_special_tokens,
            "padding": padding,
            "truncation": truncation,
            "max_length": max_length,
            "stride": stride,
            "pad_to_multiple_of": pad_to_multiple_of,
            "return_tensors": return_tensors,
            "return_token_type_ids": return_token_type_ids,
            "return_attention_mask": return_attention_mask,
            "return_overflowing_tokens": return_overflowing_tokens,
            "return_special_tokens_mask": return_special_tokens_mask,
            "return_offsets_mapping": return_offsets_mapping,
            "return_length": return_length,
            "verbose": verbose,
        }
        # `kwargs` can never contain one of the names above: those are bound to this method's own parameters.
        return self.encode_plus(**forwarded, **kwargs)["input_ids"]

    @add_end_docstrings(LAYOUTLMV3_ENCODE_KWARGS_DOCSTRING, LAYOUTLMV3_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
    # 从 transformers.models.layoutlmv2.tokenization_layoutlmv2.LayoutLMv2Tokenizer.encode_plus 复制而来
    def encode_plus(
        self,
        text: Union[TextInput, PreTokenizedInput],
        text_pair: Optional[PreTokenizedInput] = None,
        boxes: Optional[List[List[int]]] = None,
        word_labels: Optional[List[int]] = None,
        add_special_tokens: bool = True,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = None,
        max_length: Optional[int] = None,
        stride: int = 0,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs,
    ) -> BatchEncoding:
        """
        Tokenize and prepare for the model a sequence or a pair of sequences. .. warning:: This method is deprecated,
        `__call__` should be used instead.

        The `padding` / `truncation` / `max_length` arguments (plus any legacy options carried in `**kwargs`) are
        first resolved by `_get_padding_truncation_strategies`; the resolved values and the remaining arguments are
        then passed to `_encode_plus`, whose result is returned.

        Args:
            text (`str`, `List[str]`, `List[List[str]]`):
                The first sequence to be encoded. This can be a string, a list of strings or a list of list of strings.
            text_pair (`List[str]` or `List[int]`, *optional*):
                Optional second sequence to be encoded. This can be a list of strings (words of a single example) or a
                list of list of strings (words of a batch of examples).
        """
        # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
        resolved = self._get_padding_truncation_strategies(
            padding=padding,
            truncation=truncation,
            max_length=max_length,
            pad_to_multiple_of=pad_to_multiple_of,
            verbose=verbose,
            **kwargs,
        )
        # The helper also hands back `max_length` and the keyword arguments to keep using from here on.
        padding_strategy, truncation_strategy, max_length, kwargs = resolved

        return self._encode_plus(
            text=text,
            text_pair=text_pair,
            boxes=boxes,
            word_labels=word_labels,
            add_special_tokens=add_special_tokens,
            padding_strategy=padding_strategy,
            truncation_strategy=truncation_strategy,
            max_length=max_length,
            stride=stride,
            pad_to_multiple_of=pad_to_multiple_of,
            return_tensors=return_tensors,
            return_token_type_ids=return_token_type_ids,
            return_attention_mask=return_attention_mask,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_offsets_mapping=return_offsets_mapping,
            return_length=return_length,
            verbose=verbose,
            **kwargs,
        )

    # Copied from transformers.models.layoutlmv2.tokenization_layoutlmv2.LayoutLMv2Tokenizer._encode_plus
    def _encode_plus(
        self,
        text: Union[TextInput, PreTokenizedInput],
        text_pair: Optional[PreTokenizedInput] = None,
        boxes: Optional[List[List[int]]] = None,
        word_labels: Optional[List[int]] = None,
        add_special_tokens: bool = True,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        stride: int = 0,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs,
    ) -> BatchEncoding:
        """
        Encode a single example by delegating to `prepare_for_model`.

        Offset mappings are rejected up front; everything else is passed on with `prepend_batch_axis=True`. Extra
        `**kwargs` are accepted but not forwarded.

        Raises:
            NotImplementedError: if `return_offsets_mapping` is truthy.
        """
        if return_offsets_mapping:
            offsets_unavailable = (
                "return_offset_mapping is not available when using Python tokenizers. "
                "To use this feature, change your tokenizer to one deriving from "
                "transformers.PreTrainedTokenizerFast. "
                "More information on available tokenizers at "
                "https://github.com/huggingface/transformers/pull/2674"
            )
            raise NotImplementedError(offsets_unavailable)

        # The strategies are passed on by their `.value`, not as enum members.
        padding_value = padding_strategy.value
        truncation_value = truncation_strategy.value

        return self.prepare_for_model(
            text=text,
            text_pair=text_pair,
            boxes=boxes,
            word_labels=word_labels,
            add_special_tokens=add_special_tokens,
            padding=padding_value,
            truncation=truncation_value,
            max_length=max_length,
            stride=stride,
            pad_to_multiple_of=pad_to_multiple_of,
            return_tensors=return_tensors,
            prepend_batch_axis=True,
            return_attention_mask=return_attention_mask,
            return_token_type_ids=return_token_type_ids,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_length=return_length,
            verbose=verbose,
        )

    @add_end_docstrings(LAYOUTLMV3_ENCODE_KWARGS_DOCSTRING, LAYOUTLMV3_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
    def prepare_for_model(
        self,
        text: Union[TextInput, PreTokenizedInput],  # primary text input, raw or pre-tokenized
        text_pair: Optional[PreTokenizedInput] = None,  # optional second text input
        boxes: Optional[List[List[int]]] = None,  # bounding-box coordinates for the text
        word_labels: Optional[List[int]] = None,  # word-level labels
        add_special_tokens: bool = True,  # whether to add special tokens
        padding: Union[bool, str, PaddingStrategy] = False,  # padding strategy
        truncation: Union[bool, str, TruncationStrategy] = None,  # truncation strategy
        max_length: Optional[int] = None,  # maximum length
        stride: int = 0,  # stride of the sliding window
        pad_to_multiple_of: Optional[int] = None,  # pad the length up to a multiple of this value
        return_tensors: Optional[Union[str, TensorType]] = None,  # tensor type to return
        return_token_type_ids: Optional[bool] = None,  # whether to return token type ids
        return_attention_mask: Optional[bool] = None,  # whether to return the attention mask
        return_overflowing_tokens: bool = False,  # whether to return overflowing tokens
        return_special_tokens_mask: bool = False,  # whether to return the special-tokens mask
        return_offsets_mapping: bool = False,  # whether to return the offsets mapping
        return_length: bool = False,  # whether to return the length
        verbose: bool = True,  # whether to emit verbose output
        prepend_batch_axis: bool = False,  # whether to add a leading batch axis to the result
        **kwargs,  # any other keyword arguments
    # NOTE(review): the closing parenthesis, return annotation and body of this method are not present in this
    # excerpt - restore them from the upstream file before relying on this definition.
    # 从 transformers.models.layoutlmv2.tokenization_layoutlmv2.LayoutLMv2Tokenizer.truncate_sequences 复制而来
    def truncate_sequences(
        self,
        ids: List[int],  # token ids of the first sequence
        token_boxes: List[List[int]],  # bounding box of each token in `ids`
        pair_ids: Optional[List[int]] = None,  # optional token ids of the second sequence
        pair_token_boxes: Optional[List[List[int]]] = None,  # optional bounding box of each token in `pair_ids`
        labels: Optional[List[int]] = None,  # optional labels
        num_tokens_to_remove: int = 0,  # number of tokens to remove
        truncation_strategy: Union[str, TruncationStrategy] = "longest_first",  # truncation strategy
        stride: int = 0,  # stride of the sliding window
    # NOTE(review): the closing parenthesis, return annotation and body of this method are not present in this
    # excerpt - restore them from the upstream file before relying on this definition.
    # 从 transformers.models.layoutlmv2.tokenization_layoutlmv2.LayoutLMv2Tokenizer._pad 复制而来
    # 定义一个私有方法 `_pad`，用于填充输入数据，确保它们达到指定的最大长度
    # 方法参数说明：
    # - encoded_inputs: 可以是字典形式的编码输入或者批量编码对象，用于输入数据的编码
    # - max_length: 可选参数，指定填充后的最大长度
    # - padding_strategy: 填充策略，默认为不填充（PaddingStrategy.DO_NOT_PAD）
    # - pad_to_multiple_of: 可选参数，指定填充后长度的倍数
    # - return_attention_mask: 可选参数，是否返回注意力掩码，默认为 None

`.\models\layoutlmv3\tokenization_layoutlmv3_fast.py`

# 导入必要的模块和类
import json  # 导入用于处理 JSON 数据的模块
from typing import Dict, List, Optional, Tuple, Union  # 引入类型提示，用于函数参数和返回值类型检查

from tokenizers import pre_tokenizers, processors  # 导入tokenizers库中的预处理器和处理器

# 导入基础的 tokenization_utils_base 模块中定义的类和函数
from ...tokenization_utils_base import (
    BatchEncoding,  # 批量编码结果的数据结构
    EncodedInput,  # 编码后的输入数据
    PaddingStrategy,  # 填充策略枚举类型
    PreTokenizedInput,  # 预分词化的输入数据结构
    TensorType,  # 张量类型标识
    TextInput,  # 文本输入数据类型
    TextInputPair,  # 文本对输入数据类型
    TruncationStrategy,  # 截断策略枚举类型
)

# 导入快速 tokenization_utils_fast 模块中定义的类
from ...tokenization_utils_fast import PreTrainedTokenizerFast  # 快速预训练分词器基类

# 导入工具函数和日志记录
from ...utils import add_end_docstrings, logging  # 导入添加文档结束字符串的装饰器和日志记录功能

# 导入 LayoutLMv3Tokenizer 类
from .tokenization_layoutlmv3 import (
    LAYOUTLMV3_ENCODE_KWARGS_DOCSTRING,  # 编码关键字参数文档字符串
    LAYOUTLMV3_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING,  # 编码加强版关键字参数文档字符串
    LayoutLMv3Tokenizer,  # LayoutLMv3Tokenizer 类
)

# Module-level logger, named after this module.
logger = logging.get_logger(__name__)

# File name used for each tokenizer file kind (exposed on the tokenizer class as `vocab_files_names`).
VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"}

# URL of the vocabulary file and of the merges file for each listed pretrained checkpoint
# (exposed on the tokenizer class as `pretrained_vocab_files_map`).
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "microsoft/layoutlmv3-base": "https://huggingface.co/microsoft/layoutlmv3-base/raw/main/vocab.json",
        "microsoft/layoutlmv3-large": "https://huggingface.co/microsoft/layoutlmv3-large/raw/main/vocab.json",
    },
    "merges_file": {
        "microsoft/layoutlmv3-base": "https://huggingface.co/microsoft/layoutlmv3-base/raw/main/merges.txt",
        "microsoft/layoutlmv3-large": "https://huggingface.co/microsoft/layoutlmv3-large/raw/main/merges.txt",
    },
}

# Per-checkpoint size (exposed on the tokenizer class as `max_model_input_sizes`).
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "microsoft/layoutlmv3-base": 512,
    "microsoft/layoutlmv3-large": 512,
}


class LayoutLMv3TokenizerFast(PreTrainedTokenizerFast):
    r"""
    Construct a "fast" LayoutLMv3 tokenizer (backed by HuggingFace's *tokenizers* library). Based on BPE.

    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.
    """

    # 词汇文件名称映射
    vocab_files_names = VOCAB_FILES_NAMES

    # 预训练模型的词汇文件映射
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP

    # 预训练模型的最大输入尺寸映射
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES

    # 模型的输入名称列表
    model_input_names = ["input_ids", "attention_mask"]

    # 慢速分词器的类，用于提供后备
    slow_tokenizer_class = LayoutLMv3Tokenizer
    # 定义一个初始化方法，用于初始化对象的各种属性和参数
    def __init__(
        self,
        vocab_file=None,
        merges_file=None,
        tokenizer_file=None,
        errors="replace",
        bos_token="<s>",
        eos_token="</s>",
        sep_token="</s>",
        cls_token="<s>",
        unk_token="<unk>",
        pad_token="<pad>",
        mask_token="<mask>",
        add_prefix_space=True,
        trim_offsets=True,
        cls_token_box=[0, 0, 0, 0],
        sep_token_box=[0, 0, 0, 0],
        pad_token_box=[0, 0, 0, 0],
        pad_token_label=-100,
        only_label_first_subword=True,
        **kwargs,
    ):
        """
        Initialize the fast tokenizer and align its backend components with the requested options.

        All arguments are first forwarded to the parent constructor. Afterwards:

        - the backend pre-tokenizer is rebuilt when its serialized `add_prefix_space` differs from the argument;
        - the backend post-processor is rebuilt when its serialized `add_prefix_space` or `trim_offsets` differ
          from the arguments;
        - `add_prefix_space`, the three token boxes, `pad_token_label` and `only_label_first_subword` are stored
          on the instance.

        Args:
            vocab_file, merges_file: passed positionally to the parent constructor.
            tokenizer_file, errors, bos_token, eos_token, sep_token, cls_token, unk_token, pad_token, mask_token:
                passed by keyword to the parent constructor.
            add_prefix_space: value the backend pre-tokenizer and post-processor are aligned to.
            trim_offsets: value the backend post-processor is aligned to.
            cls_token_box, sep_token_box, pad_token_box: stored as-is on the instance.
            pad_token_label: stored as-is on the instance.
            only_label_first_subword: stored as-is on the instance.
            **kwargs: forwarded to the parent constructor.

        NOTE: the `*_token_box` defaults are list literals, i.e. one shared object per parameter for every call
        that relies on the default, and they are stored on the instance without copying - treat them as read-only.
        """
        super().__init__(
            vocab_file,
            merges_file,
            tokenizer_file=tokenizer_file,
            errors=errors,
            bos_token=bos_token,
            eos_token=eos_token,
            sep_token=sep_token,
            cls_token=cls_token,
            unk_token=unk_token,
            pad_token=pad_token,
            mask_token=mask_token,
            add_prefix_space=add_prefix_space,
            trim_offsets=trim_offsets,
            cls_token_box=cls_token_box,
            sep_token_box=sep_token_box,
            pad_token_box=pad_token_box,
            pad_token_label=pad_token_label,
            only_label_first_subword=only_label_first_subword,
            **kwargs,
        )

        # Read the pre-tokenizer's serialized state; a missing `add_prefix_space` key counts as "already matching".
        pre_tok_state = json.loads(self.backend_tokenizer.pre_tokenizer.__getstate__())
        if pre_tok_state.get("add_prefix_space", add_prefix_space) != add_prefix_space:
            # Re-instantiate the same pre-tokenizer class from its state, with the flag overridden.
            pre_tok_class = getattr(pre_tokenizers, pre_tok_state.pop("type"))
            pre_tok_state["add_prefix_space"] = add_prefix_space
            self.backend_tokenizer.pre_tokenizer = pre_tok_class(**pre_tok_state)

        self.add_prefix_space = add_prefix_space

        tokenizer_component = "post_processor"
        tokenizer_component_instance = getattr(self.backend_tokenizer, tokenizer_component, None)

        # Only touch the post-processor when the backend actually has one.
        if tokenizer_component_instance:
            state = json.loads(tokenizer_component_instance.__getstate__())

            # `sep` / `cls` come back from JSON as lists; they are passed to the constructor as tuples.
            if "sep" in state:
                state["sep"] = tuple(state["sep"])
            if "cls" in state:
                state["cls"] = tuple(state["cls"])

            changes_to_apply = False

            if state.get("add_prefix_space", add_prefix_space) != add_prefix_space:
                state["add_prefix_space"] = add_prefix_space
                changes_to_apply = True

            if state.get("trim_offsets", trim_offsets) != trim_offsets:
                state["trim_offsets"] = trim_offsets
                changes_to_apply = True

            # Rebuild the post-processor only if one of the two options had to change.
            if changes_to_apply:
                component_class = getattr(processors, state.pop("type"))
                new_value = component_class(**state)
                setattr(self.backend_tokenizer, tokenizer_component, new_value)

        # Additional properties kept on the instance.
        self.cls_token_box = cls_token_box
        self.sep_token_box = sep_token_box
        self.pad_token_box = pad_token_box
        self.pad_token_label = pad_token_label
        self.only_label_first_subword = only_label_first_subword
    # 通过 add_end_docstrings 装饰器添加文档字符串，参考 LAYOUTLMV3_ENCODE_KWARGS_DOCSTRING 和 LAYOUTLMV3_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING
    @add_end_docstrings(LAYOUTLMV3_ENCODE_KWARGS_DOCSTRING, LAYOUTLMV3_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
    # 从 transformers.models.layoutlmv2.tokenization_layoutlmv2_fast.LayoutLMv2TokenizerFast.__call__ 复制而来
    # 定义一个特殊方法 __call__，使实例对象可以像函数一样被调用
    def __call__(
        self,
        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]],
        text_pair: Optional[Union[PreTokenizedInput, List[PreTokenizedInput]]] = None,
        boxes: Union[List[List[int]], List[List[List[int]]]] = None,
        word_labels: Optional[Union[List[int], List[List[int]]]] = None,
        add_special_tokens: bool = True,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = None,
        max_length: Optional[int] = None,
        stride: int = 0,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs,
    ):
        # NOTE(review): no body for `__call__` is present in this excerpt; what follows is the (separately
        # decorated) `batch_encode_plus` definition. Restore the body from the upstream file before use.
        # 使用 layoutlmv3 的特定参数文档来扩展函数的文档字符串
        @add_end_docstrings(LAYOUTLMV3_ENCODE_KWARGS_DOCSTRING, LAYOUTLMV3_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
        # 方法批量编码多个文本或文本对，并返回处理后的结果
        # 这是从 transformers.models.layoutlmv2.tokenization_layoutlmv2_fast.LayoutLMv2TokenizerFast.batch_encode_plus 复制过来的
        def batch_encode_plus(
            # NOTE(review): in this excerpt the signature is indented one level deeper than the body below and is
            # terminated twice ("):" and then ") -> BatchEncoding:"), so the definition does not parse as written.
            # This looks like an extraction artifact - confirm against the upstream file.
            self,
            batch_text_or_text_pairs: Union[
                List[TextInput],
                List[TextInputPair],
                List[PreTokenizedInput],
            ],
            is_pair: bool = None,
            boxes: Optional[List[List[List[int]]]] = None,
            word_labels: Optional[Union[List[int], List[List[int]]]] = None,
            add_special_tokens: bool = True,
            padding: Union[bool, str, PaddingStrategy] = False,
            truncation: Union[bool, str, TruncationStrategy] = None,
            max_length: Optional[int] = None,
            stride: int = 0,
            pad_to_multiple_of: Optional[int] = None,
            return_tensors: Optional[Union[str, TensorType]] = None,
            return_token_type_ids: Optional[bool] = None,
            return_attention_mask: Optional[bool] = None,
            return_overflowing_tokens: bool = False,
            return_special_tokens_mask: bool = False,
            return_offsets_mapping: bool = False,
            return_length: bool = False,
            verbose: bool = True,
            **kwargs,
        ):
    ) -> BatchEncoding:
        # Backward compatibility for 'truncation_strategy', 'pad_to_max_length': resolve the padding and
        # truncation strategies, the effective `max_length`, and the keyword arguments to keep forwarding.
        padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
            padding=padding,
            truncation=truncation,
            max_length=max_length,
            pad_to_multiple_of=pad_to_multiple_of,
            verbose=verbose,
            **kwargs,
        )

        # Delegate the actual batch encoding to `_batch_encode_plus` with the resolved strategies.
        return self._batch_encode_plus(
            batch_text_or_text_pairs=batch_text_or_text_pairs,
            is_pair=is_pair,
            boxes=boxes,
            word_labels=word_labels,
            add_special_tokens=add_special_tokens,
            padding_strategy=padding_strategy,
            truncation_strategy=truncation_strategy,
            max_length=max_length,
            stride=stride,
            pad_to_multiple_of=pad_to_multiple_of,
            return_tensors=return_tensors,
            return_token_type_ids=return_token_type_ids,
            return_attention_mask=return_attention_mask,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_offsets_mapping=return_offsets_mapping,
            return_length=return_length,
            verbose=verbose,
            **kwargs,
        )

    # 从 transformers.models.layoutlmv2.tokenization_layoutlmv2_fast.LayoutLMv2TokenizerFast.tokenize 复制而来
    def tokenize(self, text: str, pair: Optional[str] = None, add_special_tokens: bool = False, **kwargs) -> List[str]:
        """
        Return the tokens the backend tokenizer produces for `text`, optionally paired with `pair`.

        The input is submitted to `self._tokenizer.encode_batch` as a one-element batch with
        `is_pretokenized=False`; the `tokens` of the first returned encoding are the result. A falsy `pair`
        (`None` or an empty string) means the text is encoded on its own.
        """
        if pair:
            single_item_batch = [(text, pair)]
        else:
            single_item_batch = [text]

        first_encoding = self._tokenizer.encode_batch(
            single_item_batch, add_special_tokens=add_special_tokens, is_pretokenized=False, **kwargs
        )[0]
        return first_encoding.tokens

    @add_end_docstrings(LAYOUTLMV3_ENCODE_KWARGS_DOCSTRING, LAYOUTLMV3_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
    # 从 transformers.models.layoutlmv2.tokenization_layoutlmv2_fast.LayoutLMv2TokenizerFast.encode_plus 复制而来
    def encode_plus(
        self,
        text: Union[TextInput, PreTokenizedInput],
        text_pair: Optional[PreTokenizedInput] = None,
        boxes: Optional[List[List[int]]] = None,
        word_labels: Optional[List[int]] = None,
        add_special_tokens: bool = True,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = None,
        max_length: Optional[int] = None,
        stride: int = 0,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs,
    ) -> BatchEncoding:
        """
        Tokenize and prepare for the model a sequence or a pair of sequences. .. warning:: This method is deprecated,
        `__call__` should be used instead.

        `_get_padding_truncation_strategies` resolves the padding / truncation options first; the resolved values
        are then passed, together with the remaining arguments, to `_encode_plus`, whose result is returned.

        Args:
            text (`str`, `List[str]`, `List[List[str]]`):
                The first sequence to be encoded. This can be a string, a list of strings or a list of list of strings.
            text_pair (`List[str]` or `List[int]`, *optional*):
                Optional second sequence to be encoded. This can be a list of strings (words of a single example) or a
                list of list of strings (words of a batch of examples).
        """
        # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
        strategies = self._get_padding_truncation_strategies(
            padding=padding,
            truncation=truncation,
            max_length=max_length,
            pad_to_multiple_of=pad_to_multiple_of,
            verbose=verbose,
            **kwargs,
        )
        # Besides the two strategies, the helper returns the `max_length` and keyword arguments to use from here on.
        padding_strategy, truncation_strategy, max_length, kwargs = strategies

        return self._encode_plus(
            text=text,
            text_pair=text_pair,
            boxes=boxes,
            word_labels=word_labels,
            add_special_tokens=add_special_tokens,
            padding_strategy=padding_strategy,
            truncation_strategy=truncation_strategy,
            max_length=max_length,
            stride=stride,
            pad_to_multiple_of=pad_to_multiple_of,
            return_tensors=return_tensors,
            return_token_type_ids=return_token_type_ids,
            return_attention_mask=return_attention_mask,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_offsets_mapping=return_offsets_mapping,
            return_length=return_length,
            verbose=verbose,
            **kwargs,
        )

    def _batch_encode_plus(
        self,
        batch_text_or_text_pairs: Union[
            List[TextInput],
            List[TextInputPair],
            List[PreTokenizedInput],
        ],
        is_pair: bool = None,
        boxes: Optional[List[List[List[int]]]] = None,
        word_labels: Optional[List[List[int]]] = None,
        add_special_tokens: bool = True,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        stride: int = 0,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[str] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
    # NOTE(review): the closing parenthesis, return annotation and body of this method are not present in this
    # excerpt - restore them from the upstream file before relying on this definition.
    # 从 transformers.models.layoutlmv2.tokenization_layoutlmv2_fast.LayoutLMv2TokenizerFast._encode_plus 复制而来的方法定义
    def _encode_plus(
        self,
        text: Union[TextInput, PreTokenizedInput],
        text_pair: Optional[PreTokenizedInput] = None,
        boxes: Optional[List[List[int]]] = None,
        word_labels: Optional[List[int]] = None,
        add_special_tokens: bool = True,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        stride: int = 0,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[str] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs,
    ) -> BatchEncoding:
        """
        Encode one example by wrapping it in a batch of size one and delegating to `_batch_encode_plus`.

        Args:
            text: The single input; sent alone when `text_pair` is `None`, otherwise as the first element of a
                `(text, text_pair)` tuple.
            text_pair: Optional second input. Any value other than `None` (including an empty list) makes the
                example a pair.
            boxes: Boxes for this example; wrapped in a one-element list before being forwarded.
            word_labels: Optional labels for this example; wrapped in a one-element list when not `None`.
            **kwargs: Forwarded unchanged to `_batch_encode_plus`, like every remaining named argument.

        Returns:
            `BatchEncoding`: The batch result. When `return_tensors` is `None` and `return_overflowing_tokens` is
            falsy, every value whose first element is a list is replaced by that first element, i.e. the leading
            batch axis is dropped.
        """
        # Decide once whether this is a pair, so that the packed input and the `is_pair` flag can never
        # disagree (an empty `text_pair` is still a pair).
        is_pair = text_pair is not None

        # Turn the single example into a batch of one:
        # 1) text only      -> [text]
        # 2) text + pair    -> [(text, text_pair)]
        batched_input = [(text, text_pair)] if is_pair else [text]
        batched_boxes = [boxes]
        batched_word_labels = [word_labels] if word_labels is not None else None
        batched_output = self._batch_encode_plus(
            batched_input,
            is_pair=is_pair,
            boxes=batched_boxes,
            word_labels=batched_word_labels,
            add_special_tokens=add_special_tokens,
            padding_strategy=padding_strategy,
            truncation_strategy=truncation_strategy,
            max_length=max_length,
            stride=stride,
            pad_to_multiple_of=pad_to_multiple_of,
            return_tensors=return_tensors,
            return_token_type_ids=return_token_type_ids,
            return_attention_mask=return_attention_mask,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_offsets_mapping=return_offsets_mapping,
            return_length=return_length,
            verbose=verbose,
            **kwargs,
        )

        # Without tensors and without overflowing tokens, strip the leading batch axis.
        # Overflowing tokens come back as extra batch entries, so the axis is kept in that case.
        if return_tensors is None and not return_overflowing_tokens:
            batched_output = BatchEncoding(
                {
                    key: value[0] if len(value) > 0 and isinstance(value[0], list) else value
                    for key, value in batched_output.items()
                },
                batched_output.encodings,
            )

        # Let the tokenizer warn (subject to `verbose`) about a sequence longer than `max_length`.
        self._eventual_warn_about_too_long_sequence(batched_output["input_ids"], max_length, verbose)

        return batched_output

    # 从 transformers.models.layoutlmv2.tokenization_layoutlmv2_fast.LayoutLMv2TokenizerFast._pad 复制而来
    def _pad(
        self,
        encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
        max_length: Optional[int] = None,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        pad_to_multiple_of: Optional[int] = None,
        return_attention_mask: Optional[bool] = None,
    # 从 transformers.models.layoutlmv2.tokenization_layoutlmv2_fast.LayoutLMv2TokenizerFast.save_vocabulary 复制而来
    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Save the backend tokenizer model through `self._tokenizer.model.save`.

        Args:
            save_directory (`str`): Forwarded as the first positional argument of `model.save`.
            filename_prefix (`str`, *optional*): Forwarded as the `name` keyword of `model.save`.

        Returns:
            `Tuple[str]`: The value returned by `model.save`, converted to a tuple.
        """
        saved_paths = self._tokenizer.model.save(save_directory, name=filename_prefix)
        return tuple(saved_paths)
    # 创建一个包含特殊标记的输入序列，用于模型输入
    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """
        Surround the given ids with the special token ids.

        - single sequence: `bos + token_ids_0 + eos`
        - pair of sequences: `bos + token_ids_0 + eos + eos + token_ids_1 + eos`

        Args:
            token_ids_0: Ids of the first sequence.
            token_ids_1: Optional ids of the second sequence.

        Returns:
            A new list of ids including the special tokens.
        """
        first_segment = [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
        if token_ids_1 is not None:
            # The second segment is opened and closed by the end-of-sequence id.
            return first_segment + [self.eos_token_id] + token_ids_1 + [self.eos_token_id]
        return first_segment

    # 根据输入的两个序列token_ids_0和token_ids_1，创建用于序列对分类任务的token类型ID列表
    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build the token type ids for a sequence or a pair of sequences; every entry is zero.

        Args:
            token_ids_0 (`List[int]`):
                Ids of the first sequence.
            token_ids_1 (`List[int]`, *optional*):
                Optional ids of the second sequence, for sequence-pair tasks.

        Returns:
            `List[int]`: A list of zeros with one entry per position of `cls + A + sep` (single sequence) or
            `cls + A + sep + sep + B + sep` (pair).
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]

        # Lay out the sequence exactly as it would appear with special tokens; only its length is used.
        layout = cls + token_ids_0 + sep
        if token_ids_1 is not None:
            layout = layout + sep + token_ids_1 + sep

        return len(layout) * [0]

`.\models\layoutlmv3\__init__.py`

# 版权声明及许可信息
# 2022年由HuggingFace团队版权所有。
# 根据Apache许可证2.0版（“许可证”）授权；
# 您只能在遵守许可证的情况下使用此文件。
# 您可以通过访问以下网址获取许可证的副本：
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# 除非适用法律要求或书面同意，否则按“原样”分发的软件
# 没有任何明示或暗示的担保或条件。
# 有关详细信息，请参阅许可证。
#

from typing import TYPE_CHECKING

from ...utils import (
    OptionalDependencyNotAvailable,
    _LazyModule,
    is_tf_available,
    is_tokenizers_available,
    is_torch_available,
    is_vision_available,
)

# Maps each submodule name to the public names it provides. The entries below are always present;
# the optional ones are added further down depending on which dependencies are available.
_import_structure = {
    "configuration_layoutlmv3": [
        "LAYOUTLMV3_PRETRAINED_CONFIG_ARCHIVE_MAP",
        "LayoutLMv3Config",
        "LayoutLMv3OnnxConfig",
    ],
    "processing_layoutlmv3": ["LayoutLMv3Processor"],
    "tokenization_layoutlmv3": ["LayoutLMv3Tokenizer"],
}

# Fast tokenizer: registered only when `is_tokenizers_available()` is true. Otherwise the
# OptionalDependencyNotAvailable raised here is caught right away and the entry is skipped.
try:
    if not is_tokenizers_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # Dependency available: expose the fast tokenizer submodule.
    _import_structure["tokenization_layoutlmv3_fast"] = ["LayoutLMv3TokenizerFast"]

# PyTorch models: registered only when `is_torch_available()` is true.
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # Dependency available: expose the PyTorch modeling submodule.
    _import_structure["modeling_layoutlmv3"] = [
        "LAYOUTLMV3_PRETRAINED_MODEL_ARCHIVE_LIST",
        "LayoutLMv3ForQuestionAnswering",
        "LayoutLMv3ForSequenceClassification",
        "LayoutLMv3ForTokenClassification",
        "LayoutLMv3Model",
        "LayoutLMv3PreTrainedModel",
    ]

# TensorFlow models: registered only when `is_tf_available()` is true.
try:
    if not is_tf_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # Dependency available: expose the TensorFlow modeling submodule.
    _import_structure["modeling_tf_layoutlmv3"] = [
        "TF_LAYOUTLMV3_PRETRAINED_MODEL_ARCHIVE_LIST",
        "TFLayoutLMv3ForQuestionAnswering",
        "TFLayoutLMv3ForSequenceClassification",
        "TFLayoutLMv3ForTokenClassification",
        "TFLayoutLMv3Model",
        "TFLayoutLMv3PreTrainedModel",
    ]

# Image utilities: registered only when `is_vision_available()` is true.
try:
    if not is_vision_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # Dependency available: expose both the feature extractor and the image processor submodules.
    _import_structure["feature_extraction_layoutlmv3"] = ["LayoutLMv3FeatureExtractor"]
    _import_structure["image_processing_layoutlmv3"] = ["LayoutLMv3ImageProcessor"]

# Under static type checking, import every name eagerly so type checkers can resolve them.
# The optional groups are guarded by the same availability checks used to build `_import_structure`.
if TYPE_CHECKING:
    from .configuration_layoutlmv3 import (
        LAYOUTLMV3_PRETRAINED_CONFIG_ARCHIVE_MAP,
        LayoutLMv3Config,
        LayoutLMv3OnnxConfig,
    )
    from .processing_layoutlmv3 import LayoutLMv3Processor
    from .tokenization_layoutlmv3 import LayoutLMv3Tokenizer

    try:
        if not is_tokenizers_available():
            raise OptionalDependencyNotAvailable()
    # Skip the fast tokenizer import when the availability check above raised.
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .tokenization_layoutlmv3_fast import LayoutLMv3TokenizerFast

    # PyTorch classes are imported only when `is_torch_available()` is true.
    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # PyTorch LayoutLMv3 models and related names.
        from .modeling_layoutlmv3 import (
            LAYOUTLMV3_PRETRAINED_MODEL_ARCHIVE_LIST,
            LayoutLMv3ForQuestionAnswering,
            LayoutLMv3ForSequenceClassification,
            LayoutLMv3ForTokenClassification,
            LayoutLMv3Model,
            LayoutLMv3PreTrainedModel,
        )

    # TensorFlow classes are imported only when `is_tf_available()` is true.
    try:
        if not is_tf_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # TensorFlow LayoutLMv3 models and related names.
        from .modeling_tf_layoutlmv3 import (
            TF_LAYOUTLMV3_PRETRAINED_MODEL_ARCHIVE_LIST,
            TFLayoutLMv3ForQuestionAnswering,
            TFLayoutLMv3ForSequenceClassification,
            TFLayoutLMv3ForTokenClassification,
            TFLayoutLMv3Model,
            TFLayoutLMv3PreTrainedModel,
        )

    # Image classes are imported only when `is_vision_available()` is true.
    try:
        if not is_vision_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # LayoutLMv3 feature extractor and image processor.
        from .feature_extraction_layoutlmv3 import LayoutLMv3FeatureExtractor
        from .image_processing_layoutlmv3 import LayoutLMv3ImageProcessor
else:
    # Not type checking (i.e. at normal runtime): nothing above is imported eagerly.
    import sys
    # `sys` is needed only to reach `sys.modules` below.

    # Replace this module's entry in `sys.modules` with a `_LazyModule` built from the module name,
    # its file path, `_import_structure` and the module spec.
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
    # NOTE(review): judging by its name, `_LazyModule` presumably imports the submodules listed in
    # `_import_structure` on first attribute access; confirm against its definition in `...utils`.

`.\models\layoutxlm\processing_layoutxlm.py`

# coding=utf-8
# 设置文件编码为 UTF-8

# Copyright 2021 The HuggingFace Inc. team.
# 版权声明：2021 年由 HuggingFace 公司团队拥有

# Licensed under the Apache License, Version 2.0 (the "License");
# 根据 Apache License, Version 2.0 进行许可

# you may not use this file except in compliance with the License.
# 除非遵循许可协议，否则不得使用此文件

# You may obtain a copy of the License at
# 您可以在以下网址获取许可协议副本

#     http://www.apache.org/licenses/LICENSE-2.0
#     http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# 除非适用法律要求或书面同意，本软件按"原样"分发，不附带任何形式的明示或暗示担保或条件

# See the License for the specific language governing permissions and
# limitations under the License.
# 请查阅许可协议以了解特定语言的许可权限和限制

"""
Processor class for LayoutXLM.
"""
# 用于 LayoutXLM 的处理器类

import warnings
# 导入警告模块
from typing import List, Optional, Union
# 导入类型提示的必要模块

from ...processing_utils import ProcessorMixin
# 从父级目录的 processing_utils 模块中导入 ProcessorMixin
from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
# 从父级目录的 tokenization_utils_base 模块中导入多个类和策略
from ...utils import TensorType
# 从父级目录的 utils 模块中导入 TensorType 类型

class LayoutXLMProcessor(ProcessorMixin):
    r"""
    Constructs a LayoutXLM processor which combines a LayoutXLM image processor and a LayoutXLM tokenizer into a single
    processor.

    [`LayoutXLMProcessor`] offers all the functionalities you need to prepare data for the model.

    It first uses [`LayoutLMv2ImageProcessor`] to resize document images to a fixed size, and optionally applies OCR to
    get words and normalized bounding boxes. These are then provided to [`LayoutXLMTokenizer`] or
    [`LayoutXLMTokenizerFast`], which turns the words and bounding boxes into token-level `input_ids`,
    `attention_mask`, `token_type_ids`, `bbox`. Optionally, one can provide integer `word_labels`, which are turned
    into token-level `labels` for token classification tasks (such as FUNSD, CORD).

    Args:
        image_processor (`LayoutLMv2ImageProcessor`, *optional*):
            An instance of [`LayoutLMv2ImageProcessor`]. The image processor is a required input.
        tokenizer (`LayoutXLMTokenizer` or `LayoutXLMTokenizerFast`, *optional*):
            An instance of [`LayoutXLMTokenizer`] or [`LayoutXLMTokenizerFast`]. The tokenizer is a required input.
    """
    # LayoutXLM 处理器类，结合 LayoutXLM 图像处理器和 LayoutXLM 分词器成为一个单独的处理器

    attributes = ["image_processor", "tokenizer"]
    # 类属性列表包括 "image_processor" 和 "tokenizer"

    image_processor_class = "LayoutLMv2ImageProcessor"
    # 图像处理器类为 "LayoutLMv2ImageProcessor"

    tokenizer_class = ("LayoutXLMTokenizer", "LayoutXLMTokenizerFast")
    # 分词器类包括 "LayoutXLMTokenizer" 和 "LayoutXLMTokenizerFast"
    # 初始化方法，用于创建一个新的实例对象
    def __init__(self, image_processor=None, tokenizer=None, **kwargs):
        """
        Build the processor from an image processor and a tokenizer.

        Args:
            image_processor: The image processor. When `None`, the value of the deprecated `feature_extractor`
                keyword argument is used instead, if it was passed.
            tokenizer: The tokenizer.
            **kwargs: Only the deprecated `feature_extractor` key is read; passing it emits a `FutureWarning`.

        Raises:
            ValueError: If no image processor was supplied through either argument, or if `tokenizer` is `None`.
        """
        # Bind the fallback up front: without this, omitting both `image_processor` and the deprecated
        # keyword would fail with UnboundLocalError instead of the intended ValueError below.
        feature_extractor = None
        if "feature_extractor" in kwargs:
            warnings.warn(
                "The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
                " instead.",
                FutureWarning,
            )
            feature_extractor = kwargs.pop("feature_extractor")

        # Prefer the explicit `image_processor`; fall back to the deprecated argument.
        image_processor = image_processor if image_processor is not None else feature_extractor
        if image_processor is None:
            raise ValueError("You need to specify an `image_processor`.")
        if tokenizer is None:
            raise ValueError("You need to specify a `tokenizer`.")

        super().__init__(image_processor, tokenizer)

    # 调用实例对象时执行的方法，用于处理输入的图片和文本数据
    def __call__(
        self,
        images,
        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
        text_pair: Optional[Union[PreTokenizedInput, List[PreTokenizedInput]]] = None,
        boxes: Union[List[List[int]], List[List[List[int]]]] = None,
        word_labels: Optional[Union[List[int], List[List[int]]]] = None,
        add_special_tokens: bool = True,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = None,
        max_length: Optional[int] = None,
        stride: int = 0,
        pad_to_multiple_of: Optional[int] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        return_tensors: Optional[Union[str, TensorType]] = None,
        **kwargs,
    ):
        # 此处省略了方法体，用于处理传入的多个参数并进行相关处理

    # 获取溢出图片的方法，根据溢出映射返回对应的溢出图片列表
    def get_overflowing_images(self, images, overflow_to_sample_mapping):
        """
        Select one image per entry of `overflow_to_sample_mapping`.

        Args:
            images: Indexable collection of images.
            overflow_to_sample_mapping: Sized iterable of indices into `images`.

        Returns:
            A list holding `images[idx]` for every `idx` of the mapping, in mapping order (an image is repeated
            when its index occurs more than once).

        Raises:
            ValueError: If the resulting list and the mapping differ in length.
        """
        images_with_overflow = [images[sample_idx] for sample_idx in overflow_to_sample_mapping]

        # Sanity check that exactly one image was collected per mapping entry.
        if len(images_with_overflow) != len(overflow_to_sample_mapping):
            raise ValueError(
                "Expected length of images to be the same as the length of `overflow_to_sample_mapping`, but got"
                f" {len(images_with_overflow)} and {len(overflow_to_sample_mapping)}"
            )

        return images_with_overflow

    # 批量解码方法，将参数传递给 PreTrainedTokenizer 的 batch_decode 方法进行批量解码
    def batch_decode(self, *args, **kwargs):
        """
        Delegate to the tokenizer's [`~PreTrainedTokenizer.batch_decode`], passing every positional and keyword
        argument through unchanged and returning its result. See that method's docstring for details.
        """
        tokenizer_batch_decode = self.tokenizer.batch_decode
        return tokenizer_batch_decode(*args, **kwargs)
    # 将所有参数传递给 PreTrainedTokenizer 的 `decode` 方法，并返回其结果
    def decode(self, *args, **kwargs):
        """
        Delegate to the tokenizer's [`~PreTrainedTokenizer.decode`], passing every positional and keyword argument
        through unchanged and returning its result. See that method's docstring for details.
        """
        tokenizer_decode = self.tokenizer.decode
        return tokenizer_decode(*args, **kwargs)

    # 返回模型输入的名称列表，包括输入的标识符、边界框、注意力掩码和图像
    @property
    def model_input_names(self):
        """Return a new list with the model input names: `input_ids`, `bbox`, `attention_mask`, `image`."""
        text_and_layout_inputs = ["input_ids", "bbox", "attention_mask"]
        return text_and_layout_inputs + ["image"]

    # 返回特征提取器的类名，并发出未来警告，建议使用 `image_processor_class` 替代
    @property
    def feature_extractor_class(self):
        """Deprecated alias: emit a `FutureWarning`, then return `self.image_processor_class`."""
        deprecation_message = (
            "`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead."
        )
        warnings.warn(deprecation_message, FutureWarning)
        return self.image_processor_class

    # 返回特征提取器，并发出未来警告，建议使用 `image_processor` 替代
    @property
    def feature_extractor(self):
        """Deprecated alias: emit a `FutureWarning`, then return `self.image_processor`."""
        deprecation_message = (
            "`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead."
        )
        warnings.warn(deprecation_message, FutureWarning)
        return self.image_processor

`.\models\layoutxlm\tokenization_layoutxlm.py`

# coding=utf-8
# Copyright 2021 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License
""" Tokenization classes for LayoutXLM model."""


import os
from shutil import copyfile
from typing import Any, Dict, List, Optional, Tuple, Union

import sentencepiece as spm  # 导入 sentencepiece 库

from ...tokenization_utils import AddedToken, PreTrainedTokenizer
from ...tokenization_utils_base import (
    BatchEncoding,
    EncodedInput,
    PreTokenizedInput,
    TextInput,
    TextInputPair,
    TruncationStrategy,
)
from ...utils import PaddingStrategy, TensorType, add_end_docstrings, logging
from ..xlm_roberta.tokenization_xlm_roberta import (  # 导入 XLM-Roberta 的 tokenization 模块相关内容
    PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES,
    PRETRAINED_VOCAB_FILES_MAP,
    SPIECE_UNDERLINE,
    VOCAB_FILES_NAMES,
)


logger = logging.get_logger(__name__)  # 获取 logger 对象


class LayoutXLMTokenizer(PreTrainedTokenizer):
    """
    Adapted from [`RobertaTokenizer`] and [`XLNetTokenizer`]. Based on
    [SentencePiece](https://github.com/google/sentencepiece).

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Attributes:
        sp_model (`SentencePieceProcessor`):
            The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
    """

    vocab_files_names = VOCAB_FILES_NAMES  # 设置词汇文件名
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP  # 设置预训练词汇文件映射
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES  # 设置最大模型输入大小
    model_input_names = ["input_ids", "attention_mask"]  # 设置模型输入名称列表

    def __init__(
        self,
        vocab_file,
        bos_token="<s>",  # beginning-of-sequence token
        eos_token="</s>",  # end-of-sequence token
        sep_token="</s>",  # separator token
        cls_token="<s>",  # classification token
        unk_token="<unk>",  # unknown-token placeholder
        pad_token="<pad>",  # padding token
        mask_token="<mask>",  # mask token
        cls_token_box=[0, 0, 0, 0],  # bounding box for the cls token
        sep_token_box=[1000, 1000, 1000, 1000],  # bounding box for the sep token
        pad_token_box=[0, 0, 0, 0],  # bounding box for the padding token
        pad_token_label=-100,  # label for the padding token
        only_label_first_subword=True,  # label only the first sub-word
        sp_model_kwargs: Optional[Dict[str, Any]] = None,  # keyword arguments for SentencePieceProcessor
        **kwargs,  # forwarded to the parent constructor
    ) -> None:
        """
        Load the SentencePiece model from `vocab_file`, set up the fixed special-token id table, store the
        layout-specific settings on the instance, then call the parent constructor with all of them.
        """
        # Mask token behave like a normal word, i.e. include the space before it
        # A plain string is wrapped in an AddedToken with `lstrip=True` and `special=True`; any other value
        # is used as given.
        mask_token = AddedToken(mask_token, lstrip=True, special=True) if isinstance(mask_token, str) else mask_token

        # Use an empty dict when no SentencePiece keyword arguments were given.
        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs

        # Create the SentencePiece processor and load the model file. This happens before the parent
        # constructor is called further down.
        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(str(vocab_file))
        self.vocab_file = vocab_file

        # Fixed ids for four special tokens; `_convert_token_to_id` consults this table before the
        # SentencePiece model.
        self.fairseq_tokens_to_ids = {"<s>": 0, "<pad>": 1, "</s>": 2, "<unk>": 3}

        # Offset added to SentencePiece ids (and subtracted again in `_convert_id_to_token`).
        self.fairseq_offset = 1

        # "<mask>" gets the id right after the shifted SentencePiece range; then build the reverse table.
        self.fairseq_tokens_to_ids["<mask>"] = len(self.sp_model) + self.fairseq_offset
        self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}

        # Layout-specific settings.
        # NOTE(review): the list defaults in the signature are stored as-is, so instances created with the
        # defaults share the same list objects.
        self.cls_token_box = cls_token_box
        self.sep_token_box = sep_token_box
        self.pad_token_box = pad_token_box
        self.pad_token_label = pad_token_label
        self.only_label_first_subword = only_label_first_subword

        # Hand everything to the parent constructor, including the extra keyword arguments.
        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            sep_token=sep_token,
            cls_token=cls_token,
            pad_token=pad_token,
            mask_token=mask_token,
            cls_token_box=cls_token_box,
            sep_token_box=sep_token_box,
            pad_token_box=pad_token_box,
            pad_token_label=pad_token_label,
            only_label_first_subword=only_label_first_subword,
            sp_model_kwargs=self.sp_model_kwargs,
            **kwargs,
        )

    def __getstate__(self):
        """
        Return the state used for pickling.

        The result is a shallow copy of `__dict__` in which `sp_model` is replaced by `None` (the processor
        object itself is not stored) and the model's serialized proto is kept under `sp_model_proto`.
        """
        return {
            **self.__dict__,
            "sp_model": None,
            "sp_model_proto": self.sp_model.serialized_model_proto(),
        }

    def __setstate__(self, d):
        """
        Restore the instance from the state dict `d` and rebuild the SentencePiece processor.

        `d` becomes the instance `__dict__`; a fresh processor is then created from `sp_model_kwargs` and
        loaded from the serialized proto stored under `sp_model_proto`.
        """
        self.__dict__ = d

        # Backward compatibility: a state without `sp_model_kwargs` gets an empty dict.
        kwargs_missing = not hasattr(self, "sp_model_kwargs")
        if kwargs_missing:
            self.sp_model_kwargs = {}

        restored_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model = restored_model
        restored_model.LoadFromSerializedProto(self.sp_model_proto)

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences by adding the special tokens, using the same
        layout that `create_token_type_ids_from_sequences` measures:

        - single sequence: `cls + token_ids_0 + sep`
        - pair of sequences: `cls + token_ids_0 + sep + sep + token_ids_1 + sep`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of input IDs with the special tokens added.
        """
        # NOTE(review): this body was missing from the excerpt and is rebuilt from the layout used by
        # `create_token_type_ids_from_sequences` below; confirm against the upstream implementation.
        cls = [self.cls_token_id]
        sep = [self.sep_token_id]

        if token_ids_1 is None:
            return cls + token_ids_0 + sep
        return cls + token_ids_0 + sep + sep + token_ids_1 + sep

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. XLM-RoBERTa does
        not make use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of zeros.

        """

        # Single-element lists so the special tokens can be concatenated with the id lists.
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]

        if token_ids_1 is None:
            # Single sequence: one zero per position of `cls + A + sep`.
            return len(cls + token_ids_0 + sep) * [0]
        else:
            # Pair: one zero per position of `cls + A + sep + sep + B + sep`.
            return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]

    @property
    def vocab_size(self):
        """Return `len(self.sp_model) + self.fairseq_offset`, plus one extra slot for the `<mask>` token."""
        mask_token_slots = 1
        return len(self.sp_model) + self.fairseq_offset + mask_token_slots

    def get_vocab(self):
        """
        Build the token-to-id mapping.

        Every id in `range(self.vocab_size)` is mapped from the token returned by `convert_ids_to_tokens`; the
        entries of `self.added_tokens_encoder` are then merged in and override existing keys.
        """
        vocab = {}
        for index in range(self.vocab_size):
            vocab[self.convert_ids_to_tokens(index)] = index
        vocab.update(self.added_tokens_encoder)
        return vocab

    def _tokenize(self, text: str) -> List[str]:
        """Return the result of `self.sp_model.encode(text, out_type=str)`, i.e. the pieces as strings."""
        pieces = self.sp_model.encode(text, out_type=str)
        return pieces

    def _convert_token_to_id(self, token):
        """
        Converts a token (str) in an id using the vocab.

        Tokens listed in `self.fairseq_tokens_to_ids` use the id stored there. Any other token is looked up in
        the SentencePiece model: a non-zero piece id is shifted by `self.fairseq_offset`, while a piece id of
        `0` yields `self.unk_token_id`.
        """
        reserved_ids = self.fairseq_tokens_to_ids
        if token not in reserved_ids:
            piece_id = self.sp_model.PieceToId(token)
            if piece_id:
                return piece_id + self.fairseq_offset
            return self.unk_token_id
        return reserved_ids[token]

    def _convert_id_to_token(self, index):
        """
        Converts an index (integer) in a token (str) using the vocab.

        Ids listed in `self.fairseq_ids_to_tokens` use the token stored there. Any other id is shifted back by
        `self.fairseq_offset` and resolved through the SentencePiece model.
        """
        reserved_tokens = self.fairseq_ids_to_tokens
        if index not in reserved_tokens:
            return self.sp_model.IdToPiece(index - self.fairseq_offset)
        return reserved_tokens[index]

    def convert_tokens_to_string(self, tokens):
        """
        Converts a sequence of tokens (strings for sub-words) in a single string.

        The tokens are concatenated, every `SPIECE_UNDERLINE` is replaced by a space, and surrounding
        whitespace is stripped.
        """
        joined = "".join(tokens)
        spaced = joined.replace(SPIECE_UNDERLINE, " ")
        return spaced.strip()

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Save the SentencePiece vocabulary file into `save_directory`.

        Args:
            save_directory (`str`): Existing directory that receives the file.
            filename_prefix (`str`, *optional*): When truthy, prepended to the file name followed by `-`.

        Returns:
            `Tuple[str]`: One-element tuple with the output path. If `save_directory` is not a directory, an
            error is logged and `None` is returned instead.
        """
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return

        prefix = filename_prefix + "-" if filename_prefix else ""
        out_vocab_file = os.path.join(save_directory, prefix + VOCAB_FILES_NAMES["vocab_file"])

        same_location = os.path.abspath(self.vocab_file) == os.path.abspath(out_vocab_file)
        source_exists = os.path.isfile(self.vocab_file)

        if source_exists and not same_location:
            # The original model file is on disk somewhere else: copy it.
            copyfile(self.vocab_file, out_vocab_file)
        elif not source_exists:
            # No file to copy: dump the in-memory model instead.
            with open(out_vocab_file, "wb") as fi:
                content_spiece_model = self.sp_model.serialized_model_proto()
                fi.write(content_spiece_model)

        return (out_vocab_file,)

    @add_end_docstrings(LAYOUTXLM_ENCODE_KWARGS_DOCSTRING)
    # Makes the tokenizer callable; accepts several text input forms plus the encoding options below.
    def __call__(
        self,
        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]],
        text_pair: Optional[Union[PreTokenizedInput, List[PreTokenizedInput]]] = None,
        boxes: Union[List[List[int]], List[List[List[int]]]] = None,
        word_labels: Optional[Union[List[int], List[List[int]]]] = None,
        add_special_tokens: bool = True,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = None,
        max_length: Optional[int] = None,
        stride: int = 0,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs,
    ):
        # NOTE(review): the implementation is elided in this excerpt; the `...` below is only a placeholder.
        # It is meant to encode a text or text pair with special tokens, padding and truncation as requested.
        ...

    # 定义一个批处理编码方法，接受多个文本或文本对输入及相关参数
    def _batch_encode_plus(
        self,
        batch_text_or_text_pairs: Union[
            List[TextInput],
            List[TextInputPair],
            List[PreTokenizedInput],
        ],
        is_pair: bool = None,
        boxes: Optional[List[List[List[int]]]] = None,
        word_labels: Optional[List[List[int]]] = None,
        add_special_tokens: bool = True,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        stride: int = 0,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs,
    ) -> BatchEncoding:
        """
        Encode a batch of texts or text pairs by delegating to `_batch_prepare_for_model`.

        Every named argument except `return_offsets_mapping` is forwarded unchanged; the extra `**kwargs` are
        accepted but not forwarded.

        Returns:
            `BatchEncoding`: The output of `_batch_prepare_for_model`, wrapped in a `BatchEncoding`.

        Raises:
            NotImplementedError: If `return_offsets_mapping` is truthy.
        """
        # Offset mappings are rejected up front with a message pointing to PreTrainedTokenizerFast.
        if return_offsets_mapping:
            raise NotImplementedError(
                "return_offset_mapping is not available when using Python tokenizers. "
                "To use this feature, change your tokenizer to one deriving from "
                "transformers.PreTrainedTokenizerFast."
            )

        batch_outputs = self._batch_prepare_for_model(
            batch_text_or_text_pairs=batch_text_or_text_pairs,
            is_pair=is_pair,
            boxes=boxes,
            word_labels=word_labels,
            add_special_tokens=add_special_tokens,
            padding_strategy=padding_strategy,
            truncation_strategy=truncation_strategy,
            max_length=max_length,
            stride=stride,
            pad_to_multiple_of=pad_to_multiple_of,
            return_attention_mask=return_attention_mask,
            return_token_type_ids=return_token_type_ids,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_length=return_length,
            return_tensors=return_tensors,
            verbose=verbose,
        )

        return BatchEncoding(batch_outputs)

    @add_end_docstrings(LAYOUTXLM_ENCODE_KWARGS_DOCSTRING)
    def _batch_prepare_for_model(
        self,
        batch_text_or_text_pairs,
        is_pair: bool = None,
        boxes: Optional[List[List[int]]] = None,
        word_labels: Optional[List[List[int]]] = None,
        add_special_tokens: bool = True,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        stride: int = 0,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[str] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_length: bool = False,
        verbose: bool = True,
    ) -> BatchEncoding:
        """
        Encode every example of a batch with `prepare_for_model`, then pad and wrap the batch as a whole.

        Each example is prepared without padding, without an attention mask and without tensor conversion; those
        steps are applied once to the collated batch through `self.pad` and `BatchEncoding`.

        Args:
            batch_text_or_text_pairs: one entry per example. When `is_pair` is truthy, each entry is indexed as
                `(first_sequence, second_sequence)`.
            boxes: iterated in lockstep with `batch_text_or_text_pairs`, one entry per example.
            word_labels: optional; when given it is indexed by the position of the example in the batch.
        """
        collated = {}

        for position, (sample, sample_boxes) in enumerate(zip(batch_text_or_text_pairs, boxes)):
            # Split the example into its first and (optional) second sequence.
            if is_pair:
                first_sequence, second_sequence = sample[0], sample[1]
            else:
                first_sequence, second_sequence = sample, None

            sample_labels = None if word_labels is None else word_labels[position]

            encoded = self.prepare_for_model(
                first_sequence,
                second_sequence,
                sample_boxes,
                word_labels=sample_labels,
                add_special_tokens=add_special_tokens,
                padding=PaddingStrategy.DO_NOT_PAD.value,  # padding happens once, on the whole batch
                truncation=truncation_strategy.value,
                max_length=max_length,
                stride=stride,
                pad_to_multiple_of=None,  # handled by the batch-level `pad` call below
                return_attention_mask=False,  # produced by the batch-level `pad` call below
                return_token_type_ids=return_token_type_ids,
                return_overflowing_tokens=return_overflowing_tokens,
                return_special_tokens_mask=return_special_tokens_mask,
                return_length=return_length,
                return_tensors=None,  # tensor conversion happens once, at the very end
                prepend_batch_axis=False,
                verbose=verbose,
            )

            # Collect each per-example field under its key, in batch order.
            for field_name, field_value in encoded.items():
                collated.setdefault(field_name, []).append(field_value)

        padded = self.pad(
            collated,
            padding=padding_strategy.value,
            max_length=max_length,
            pad_to_multiple_of=pad_to_multiple_of,
            return_attention_mask=return_attention_mask,
        )

        # Wrap the padded batch, converting to the requested tensor type if any.
        return BatchEncoding(padded, tensor_type=return_tensors)
    def _encode_plus(
        self,
        text: Union[TextInput, PreTokenizedInput],
        text_pair: Optional[PreTokenizedInput] = None,
        boxes: Optional[List[List[int]]] = None,
        word_labels: Optional[List[int]] = None,
        add_special_tokens: bool = True,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        stride: int = 0,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs,
    ) -> BatchEncoding:
        """
        Encode a single text (or text pair) together with its boxes and optional word labels.

        This is a thin wrapper that forwards its arguments to `prepare_for_model`, passing the padding and truncation
        strategies by their `.value` and requesting a leading batch axis (`prepend_batch_axis=True`). Extra `**kwargs`
        are accepted but not forwarded.

        Raises:
            NotImplementedError: if `return_offsets_mapping` is truthy.
        """
        # Offset mappings are rejected up front, before any encoding work is done.
        if return_offsets_mapping:
            raise NotImplementedError(
                "return_offset_mapping is not available when using Python tokenizers. "
                "To use this feature, change your tokenizer to one deriving from "
                "transformers.PreTrainedTokenizerFast. "
                "More information on available tokenizers at "
                "https://github.com/huggingface/transformers/pull/2674"
            )

        model_inputs = self.prepare_for_model(
            text=text,
            text_pair=text_pair,
            boxes=boxes,
            word_labels=word_labels,
            add_special_tokens=add_special_tokens,
            padding=padding_strategy.value,
            truncation=truncation_strategy.value,
            max_length=max_length,
            stride=stride,
            pad_to_multiple_of=pad_to_multiple_of,
            return_tensors=return_tensors,
            prepend_batch_axis=True,
            return_attention_mask=return_attention_mask,
            return_token_type_ids=return_token_type_ids,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_length=return_length,
            verbose=verbose,
        )
        return model_inputs

    # Decorator that attaches the `LAYOUTXLM_ENCODE_KWARGS_DOCSTRING` documentation to this method.
    @add_end_docstrings(LAYOUTXLM_ENCODE_KWARGS_DOCSTRING)
    def prepare_for_model(
        self,
        text: Union[TextInput, PreTokenizedInput],
        text_pair: Optional[PreTokenizedInput] = None,
        boxes: Optional[List[List[int]]] = None,
        word_labels: Optional[List[int]] = None,
        add_special_tokens: bool = True,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = None,
        max_length: Optional[int] = None,
        stride: int = 0,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        prepend_batch_axis: bool = False,
        **kwargs,
    ):
        """
        Prepare inputs so that they can be used by the model.

        NOTE(review): the method body is elided (`...`) in this excerpt; the descriptions below restate the signature
        and the original parameter notes and should be confirmed against the full implementation.

        Args:
            text: Input text, either raw or pre-tokenized.
            text_pair: Optional second, pre-tokenized input, for text-pair inputs.
            boxes: Optional list of bounding boxes (lists of ints) associated with the text.
            word_labels: Optional list of word-level integer labels.
            add_special_tokens: Whether to add the model's special tokens.
            padding: Controls padding; a bool, a string or a `PaddingStrategy`.
            truncation: Controls truncation; a bool, a string or a `TruncationStrategy`.
            max_length: Maximum length of the input sequence.
            stride: Stride applied when truncating.
            pad_to_multiple_of: If given, pad the input to a multiple of this value.
            return_tensors: Controls the type of tensors returned.
            return_token_type_ids: Whether to return `token_type_ids`.
            return_attention_mask: Whether to return `attention_mask`.
            return_overflowing_tokens: Whether to return overflowing tokens.
            return_special_tokens_mask: Whether to return the special-tokens mask.
            return_offsets_mapping: Whether to return the offset mapping of tokens in the original input.
            return_length: Whether to return the input length.
            verbose: Whether to emit verbose information.
            prepend_batch_axis: Whether to add a batch dimension to the returned tensors.
            **kwargs: Additional keyword arguments not listed explicitly.
        """
        ...

    def truncate_sequences(
        self,
        ids: List[int],
        token_boxes: List[List[int]],
        pair_ids: Optional[List[int]] = None,
        pair_token_boxes: Optional[List[List[int]]] = None,
        labels: Optional[List[int]] = None,
        num_tokens_to_remove: int = 0,
        truncation_strategy: Union[str, TruncationStrategy] = "longest_first",
        stride: int = 0,
    ):
        """
        Truncate a sequence (or a pair of sequences) together with the associated token boxes.

        NOTE(review): the method body is elided (`...`) in this excerpt; the return value cannot be documented from
        here and the descriptions below should be confirmed against the full implementation.

        Args:
            ids: Token ids of the input sequence.
            token_boxes: Bounding box of each token in `ids`.
            pair_ids: Optional token ids of a second sequence, for sequence pairs.
            pair_token_boxes: Optional bounding boxes for the tokens of the second sequence.
            labels: Optional list of labels.
            num_tokens_to_remove: Number of tokens to remove.
            truncation_strategy: Truncation strategy, e.g. `"longest_first"` (the default).
            stride: Stride applied when truncating.
        """
        ...

    def _pad(
        self,
        encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
        max_length: Optional[int] = None,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        pad_to_multiple_of: Optional[int] = None,
        return_attention_mask: Optional[bool] = None,
    ):
        """
        Pad encoded inputs so that they share the same length.

        NOTE(review): the method body is elided (`...`) in this excerpt; confirm the details below against the full
        implementation.

        Args:
            encoded_inputs: The encoded inputs, either a dict mapping names to `EncodedInput` values or a
                `BatchEncoding`.
            max_length: Maximum length after padding.
            padding_strategy: `PaddingStrategy` controlling how padding is applied (default: no padding).
            pad_to_multiple_of: If given, pad to a multiple of this value.
            return_attention_mask: Whether to return `attention_mask`.
        """
        ...

`.\models\layoutxlm\tokenization_layoutxlm_fast.py`

# 导入必要的库和模块
import os  # 导入操作系统相关功能
from shutil import copyfile  # 导入复制文件功能
from typing import Dict, List, Optional, Tuple, Union  # 导入类型提示相关模块

# 导入 tokenization_utils 中的部分功能和类
from ...tokenization_utils import AddedToken  
from ...tokenization_utils_base import (
    BatchEncoding,  # 批编码相关类
    EncodedInput,  # 编码输入相关类
    PreTokenizedInput,  # 预分词输入相关类
    TextInput,  # 文本输入相关类
    TextInputPair,  # 文本对输入相关类
    TruncationStrategy,  # 截断策略相关类
)
# 导入 tokenization_utils_fast 中的 PreTrainedTokenizerFast 类
from ...tokenization_utils_fast import PreTrainedTokenizerFast  
# 导入工具函数和常量定义
from ...utils import PaddingStrategy, TensorType, add_end_docstrings, is_sentencepiece_available, logging  
# 从 xlm_roberta.tokenization_xlm_roberta_fast 导入相关常量
from ..xlm_roberta.tokenization_xlm_roberta_fast import (
    PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES,  # 预训练位置嵌入尺寸
    PRETRAINED_VOCAB_FILES_MAP,  # 预训练词汇文件映射
    VOCAB_FILES_NAMES,  # 词汇文件名列表
)

# Import the slow `LayoutXLMTokenizer` only when sentencepiece is available; otherwise bind the name to None.
if is_sentencepiece_available():
    from .tokenization_layoutxlm import LayoutXLMTokenizer
else:
    LayoutXLMTokenizer = None

# Module-level logger named after this module.
logger = logging.get_logger(__name__)
        # Mask token behave like a normal word, i.e. include the space before it
        # 如果 mask token 是字符串，则将其作为 AddedToken 对象，保留其前面的空格
        mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token

        # 调用父类的初始化方法，设置各种特殊标记和文件路径
        super().__init__(
            vocab_file,
            tokenizer_file=tokenizer_file,
            bos_token=bos_token,
            eos_token=eos_token,
            sep_token=sep_token,
            cls_token=cls_token,
            unk_token=unk_token,
            pad_token=pad_token,
            mask_token=mask_token,
            cls_token_box=cls_token_box,
            sep_token_box=sep_token_box,
            pad_token_box=pad_token_box,
            pad_token_label=pad_token_label,
            only_label_first_subword=only_label_first_subword,
            **kwargs,
        )

        # 将初始化参数中的词汇文件路径存储在对象属性中
        self.vocab_file = vocab_file

        # 设置额外的属性值
        self.cls_token_box = cls_token_box
        self.sep_token_box = sep_token_box
        self.pad_token_box = pad_token_box
        self.pad_token_label = pad_token_label
        self.only_label_first_subword = only_label_first_subword

    @property
    def can_save_slow_tokenizer(self) -> bool:
        """Whether `self.vocab_file` is set and refers to an existing regular file."""
        # An unset or empty vocabulary path can never be saved from.
        if not self.vocab_file:
            return False
        return os.path.isfile(self.vocab_file)

    @add_end_docstrings(LAYOUTXLM_ENCODE_KWARGS_DOCSTRING)
    def __call__(
        self,
        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]],
        text_pair: Optional[Union[PreTokenizedInput, List[PreTokenizedInput]]] = None,
        boxes: Union[List[List[int]], List[List[List[int]]]] = None,
        word_labels: Optional[Union[List[int], List[List[int]]]] = None,
        add_special_tokens: bool = True,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = None,
        max_length: Optional[int] = None,
        stride: int = 0,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs,
    ):
        # Entry point used when the tokenizer object itself is called to encode text, boxes and word labels.
        # NOTE(review): only the signature is shown in this excerpt; the annotations accept either a single example
        # or a batch for `text`, `text_pair`, `boxes` and `word_labels`.
        pass  # placeholder only; the actual processing logic is not included in this excerpt

    def tokenize(self, text: str, pair: Optional[str] = None, add_special_tokens: bool = False, **kwargs) -> List[str]:
        """
        Return the tokens the backend tokenizer produces for `text` (optionally paired with `pair`).

        The input is wrapped in a one-element batch, encoded with `self._tokenizer.encode_batch` (with
        `is_pretokenized=False`), and the tokens of the first encoding are returned. A falsy `pair` (None or an
        empty string) is treated as "no pair".
        """
        if pair:
            single_item_batch = [(text, pair)]
        else:
            single_item_batch = [text]

        first_encoding = self._tokenizer.encode_batch(
            single_item_batch, add_special_tokens=add_special_tokens, is_pretokenized=False, **kwargs
        )[0]
        return first_encoding.tokens
    # Method `_batch_encode_plus`: encodes a batch of texts or text pairs.
    # NOTE(review): in this excerpt the signature below is not followed by a body; it is followed directly by the
    # signature of `_encode_plus` (indented one extra level), a stray `) -> BatchEncoding:` line, and then the body
    # of `_encode_plus`. The code is left untouched; restore the missing body from the full source.
    def _batch_encode_plus(
        self,
        batch_text_or_text_pairs: Union[
            List[TextInput],
            List[TextInputPair],
            List[PreTokenizedInput],
        ],
        is_pair: bool = None,  # whether the inputs are text pairs
        boxes: Optional[List[List[List[int]]]] = None,  # bounding-box coordinates, per example
        word_labels: Optional[List[List[int]]] = None,  # word labels, per example
        add_special_tokens: bool = True,  # whether to add special tokens
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,  # padding strategy
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,  # truncation strategy
        max_length: Optional[int] = None,  # maximum length
        stride: int = 0,  # stride
        pad_to_multiple_of: Optional[int] = None,  # pad to a multiple of this value
        return_tensors: Optional[str] = None,  # type of tensors to return
        return_token_type_ids: Optional[bool] = None,  # whether to return token type ids
        return_attention_mask: Optional[bool] = None,  # whether to return the attention mask
        return_overflowing_tokens: bool = False,  # whether to return overflowing tokens
        return_special_tokens_mask: bool = False,  # whether to return the special-tokens mask
        return_offsets_mapping: bool = False,  # whether to return the offset mapping
        return_length: bool = False,  # whether to return the length
        verbose: bool = True,  # whether to emit verbose output
        **kwargs,  # additional keyword arguments
    ):
    
        # Method `_encode_plus`: encodes a single text or text pair by delegating to `_batch_encode_plus`.
        def _encode_plus(
            self,
            text: Union[TextInput, PreTokenizedInput],  # text input
            text_pair: Optional[PreTokenizedInput] = None,  # optional second text input
            boxes: Optional[List[List[int]]] = None,  # bounding-box coordinates
            word_labels: Optional[List[int]] = None,  # word labels
            add_special_tokens: bool = True,  # whether to add special tokens
            padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,  # padding strategy
            truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,  # truncation strategy
            max_length: Optional[int] = None,  # maximum length
            stride: int = 0,  # stride
            pad_to_multiple_of: Optional[int] = None,  # pad to a multiple of this value
            # NOTE(review): annotated Optional[bool] here but Optional[str] on `_batch_encode_plus`, to which the
            # value is forwarded unchanged; confirm the intended type.
            return_tensors: Optional[bool] = None,  # type of tensors to return
            return_token_type_ids: Optional[bool] = None,  # whether to return token type ids
            return_attention_mask: Optional[bool] = None,  # whether to return the attention mask
            return_overflowing_tokens: bool = False,  # whether to return overflowing tokens
            return_special_tokens_mask: bool = False,  # whether to return the special-tokens mask
            return_offsets_mapping: bool = False,  # whether to return the offset mapping
            return_length: bool = False,  # whether to return the length
            verbose: bool = True,  # whether to emit verbose output
            **kwargs,  # additional keyword arguments
        ):
    ) -> BatchEncoding:
        # Turn the single example into a batched input.
        # Two options:
        # 1) text only, in which case text is a list of strings
        # 2) text + text_pair, in which case text is a string and text_pair is a list of strings
        batched_input = [(text, text_pair)] if text_pair else [text]
        batched_boxes = [boxes]  # wrap the boxes in a one-element batch
        batched_word_labels = [word_labels] if word_labels is not None else None  # wrap the labels, or keep None
        batched_output = self._batch_encode_plus(
            batched_input,
            is_pair=bool(text_pair is not None),  # a pair is signalled by text_pair being provided
            boxes=batched_boxes,
            word_labels=batched_word_labels,
            add_special_tokens=add_special_tokens,
            padding_strategy=padding_strategy,
            truncation_strategy=truncation_strategy,
            max_length=max_length,
            stride=stride,
            pad_to_multiple_of=pad_to_multiple_of,
            return_tensors=return_tensors,
            return_token_type_ids=return_token_type_ids,
            return_attention_mask=return_attention_mask,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_offsets_mapping=return_offsets_mapping,
            return_length=return_length,
            verbose=verbose,
            **kwargs,
        )

        # When no tensor type was requested, remove the leading batch axis.
        # Overflowing tokens are returned as a batch, so the batch axis is kept in that case.
        if return_tensors is None and not return_overflowing_tokens:
            batched_output = BatchEncoding(
                {
                    key: value[0] if len(value) > 0 and isinstance(value[0], list) else value
                    for key, value in batched_output.items()
                },
                batched_output.encodings,
            )

        self._eventual_warn_about_too_long_sequence(batched_output["input_ids"], max_length, verbose)

        return batched_output

    def _pad(
        self,
        encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
        max_length: Optional[int] = None,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        pad_to_multiple_of: Optional[int] = None,
        return_attention_mask: Optional[bool] = None,
    ):
        # Pad the encoded inputs.
        # NOTE(review): the body is elided (`...`) in this excerpt; confirm the notes below against the full source.
        # encoded_inputs: dict of encoded inputs, or a BatchEncoding object
        # max_length: maximum length (optional)
        # padding_strategy: padding strategy; defaults to no padding
        # pad_to_multiple_of: pad to a multiple of this value (optional)
        # return_attention_mask: whether to return the attention mask (optional)
        ...
    
    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
        adding special tokens. An XLM-RoBERTa sequence has the following format:

        - single sequence: `<s> X </s>`
        - pair of sequences: `<s> A </s></s> B </s>`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of input IDs with the appropriate special tokens added.
        """
        # One-element lists so the special tokens can be concatenated with the id lists.
        cls = [self.cls_token_id]
        sep = [self.sep_token_id]

        # Single sequence: `<s> X </s>`.
        if token_ids_1 is None:
            return cls + token_ids_0 + sep

        # Pair of sequences: `<s> A </s></s> B </s>` (two separators between the sequences).
        return cls + token_ids_0 + sep + sep + token_ids_1 + sep

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Return an all-zero token-type mask sized for the sequence (or sequence pair) once special tokens are added.

        The mask has one entry per position of `<s> A </s>` for a single sequence and of `<s> A </s></s> B </s>` for a
        pair; every entry is 0.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of zeros.
        """
        closing = [self.sep_token_id]
        opening = [self.cls_token_id]

        # Lay out the full sequence with special tokens; only its length is used.
        layout = opening + token_ids_0 + closing
        if token_ids_1 is not None:
            layout = layout + closing + token_ids_1 + closing

        return [0] * len(layout)

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Copy the tokenizer's vocabulary file into `save_directory`.

        Args:
            save_directory (str):
                Existing directory that receives the vocabulary file.
            filename_prefix (str, *optional*):
                When given, the saved file name is `<filename_prefix>-<vocab file name>`.

        Returns:
            A one-element tuple with the path of the saved vocabulary file, or `None` (after logging an error) when
            `save_directory` is not a directory.

        Raises:
            ValueError: if `self.can_save_slow_tokenizer` is falsy.
        """
        if not self.can_save_slow_tokenizer:
            raise ValueError(
                "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
                "tokenizer."
            )

        # A missing or non-directory target is reported through the logger rather than raised.
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory.")
            return

        name_prefix = filename_prefix + "-" if filename_prefix else ""
        target_path = os.path.join(save_directory, name_prefix + VOCAB_FILES_NAMES["vocab_file"])

        # Skip the copy when the vocabulary file already lives at the target location.
        already_in_place = os.path.abspath(self.vocab_file) == os.path.abspath(target_path)
        if not already_in_place:
            copyfile(self.vocab_file, target_path)

        return (target_path,)

`.\models\layoutxlm\__init__.py`

# 引入必要的类型检查模块
from typing import TYPE_CHECKING

# 从工具包中引入所需的依赖项和工具函数
from ...utils import (
    OptionalDependencyNotAvailable,
    _LazyModule,
    is_sentencepiece_available,
    is_tokenizers_available,
    is_torch_available,
    is_vision_available,
)

# Import structure handed to `_LazyModule` below: submodule name -> list of public names it provides.
_import_structure = {"processing_layoutxlm": ["LayoutXLMProcessor"]}

# Register the slow tokenizer only when sentencepiece is available. The exception is raised and caught locally,
# purely to select between the `except` (skip) and `else` (register) branches.
try:
    if not is_sentencepiece_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # sentencepiece is available: add LayoutXLMTokenizer to the import structure
    _import_structure["tokenization_layoutxlm"] = ["LayoutXLMTokenizer"]

# Register the fast tokenizer only when the tokenizers library is available (same raise-and-catch pattern).
try:
    if not is_tokenizers_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # tokenizers is available: add LayoutXLMTokenizerFast to the import structure
    _import_structure["tokenization_layoutxlm_fast"] = ["LayoutXLMTokenizerFast"]

# Under static type checking, import the real symbols eagerly.
if TYPE_CHECKING:
    # The processor is imported unconditionally
    from .processing_layoutxlm import LayoutXLMProcessor

    # Import LayoutXLMTokenizer only when sentencepiece is available
    try:
        if not is_sentencepiece_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .tokenization_layoutxlm import LayoutXLMTokenizer

    # Import LayoutXLMTokenizerFast only when the tokenizers library is available
    try:
        if not is_tokenizers_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .tokenization_layoutxlm_fast import LayoutXLMTokenizerFast

# Outside type checking, set up lazy loading of the module instead.
else:
    # sys is needed to replace this module's entry in sys.modules
    import sys

    # Replace this module in sys.modules with a _LazyModule built from _import_structure
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)

`.\models\led\configuration_led.py`

# coding=utf-8
# Copyright 2021 Iz Beltagy, Matthew E. Peters, Arman Cohan and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

""" LED model configuration"""

from typing import List, Union

# 从配置工具中导入预训练配置类
from ...configuration_utils import PretrainedConfig
# 从工具中导入日志记录功能
from ...utils import logging

# 获取logger对象，用于记录日志信息
logger = logging.get_logger(__name__)

# Pretrained config archive map: model name -> URL of its configuration file
LED_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "allenai/led-base-16384": "https://huggingface.co/allenai/led-base-16384/resolve/main/config.json",
    # See all LED models at https://huggingface.co/models?filter=led
}

# LED configuration class; inherits from the pretrained configuration base class `PretrainedConfig`
class LEDConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`LEDModel`]. It is used to instantiate an LED
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the LED
    [allenai/led-base-16384](https://huggingface.co/allenai/led-base-16384) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.


    Example:

    ```
    >>> from transformers import LEDModel, LEDConfig

    >>> # Initializing a LED allenai/led-base-16384 style configuration
    >>> configuration = LEDConfig()

    >>> # Initializing a model from the allenai/led-base-16384 style configuration
    >>> model = LEDModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""
    
    # Model type identifier for LED
    model_type = "led"
    # Attribute map: standard transformers configuration names -> the LED-specific attribute names that store them
    attribute_map = {
        "num_attention_heads": "encoder_attention_heads",
        "hidden_size": "d_model",
        "attention_probs_dropout_prob": "attention_dropout",
        "initializer_range": "init_std",
    }
    # Initializer: stores the LED hyper-parameters on the configuration object, then forwards the token ids,
    # `is_encoder_decoder` and any extra keyword arguments to `PretrainedConfig.__init__`.
    def __init__(
        self,
        vocab_size=50265,  # vocabulary size, default 50265
        max_encoder_position_embeddings=16384,  # maximum number of encoder position embeddings, default 16384
        max_decoder_position_embeddings=1024,   # maximum number of decoder position embeddings, default 1024
        encoder_layers=12,  # number of encoder layers, default 12
        encoder_ffn_dim=4096,  # dimension of the encoder feed-forward layers, default 4096
        encoder_attention_heads=16,  # number of encoder attention heads, default 16
        decoder_layers=12,  # number of decoder layers, default 12
        decoder_ffn_dim=4096,  # dimension of the decoder feed-forward layers, default 4096
        decoder_attention_heads=16,  # number of decoder attention heads, default 16
        encoder_layerdrop=0.0,  # encoder layer-drop rate, default 0.0
        decoder_layerdrop=0.0,  # decoder layer-drop rate, default 0.0
        use_cache=True,  # whether to use the cache, default True
        is_encoder_decoder=True,  # whether the model is an encoder-decoder, default True
        activation_function="gelu",  # activation function name, default "gelu"
        d_model=1024,  # model dimension, default 1024
        dropout=0.1,  # overall dropout rate, default 0.1
        attention_dropout=0.0,  # attention dropout rate, default 0.0
        activation_dropout=0.0,  # activation dropout rate, default 0.0
        init_std=0.02,  # standard deviation used for parameter initialization, default 0.02
        decoder_start_token_id=2,  # decoder start token id, default 2
        classifier_dropout=0.0,  # classifier dropout rate, default 0.0
        pad_token_id=1,  # padding token id, default 1
        bos_token_id=0,  # beginning-of-sequence token id, default 0
        eos_token_id=2,  # end-of-sequence token id, default 2
        attention_window: Union[List[int], int] = 512,  # attention window size, a single int or a list, default 512
        **kwargs,  # additional keyword arguments
    ):
        self.vocab_size = vocab_size  # vocabulary size
        self.max_encoder_position_embeddings = max_encoder_position_embeddings  # max encoder position embeddings
        self.max_decoder_position_embeddings = max_decoder_position_embeddings  # max decoder position embeddings
        self.d_model = d_model  # model dimension
        self.encoder_ffn_dim = encoder_ffn_dim  # encoder feed-forward dimension
        self.encoder_layers = encoder_layers  # number of encoder layers
        self.encoder_attention_heads = encoder_attention_heads  # number of encoder attention heads
        self.decoder_ffn_dim = decoder_ffn_dim  # decoder feed-forward dimension
        self.decoder_layers = decoder_layers  # number of decoder layers
        self.decoder_attention_heads = decoder_attention_heads  # number of decoder attention heads
        self.dropout = dropout  # overall dropout rate
        self.attention_dropout = attention_dropout  # attention dropout rate
        self.activation_dropout = activation_dropout  # activation dropout rate
        self.activation_function = activation_function  # activation function name
        self.init_std = init_std  # initialization standard deviation
        self.encoder_layerdrop = encoder_layerdrop  # encoder layer-drop rate
        self.decoder_layerdrop = decoder_layerdrop  # decoder layer-drop rate
        self.classifier_dropout = classifier_dropout  # classifier dropout rate
        self.use_cache = use_cache  # whether to use the cache
        self.num_hidden_layers = encoder_layers  # number of hidden layers mirrors the encoder layer count
        self.attention_window = attention_window  # attention window size (int or list)

        super().__init__(  # call the parent-class initializer
            pad_token_id=pad_token_id,  # padding token id
            bos_token_id=bos_token_id,  # beginning-of-sequence token id
            eos_token_id=eos_token_id,  # end-of-sequence token id
            is_encoder_decoder=is_encoder_decoder,  # encoder-decoder flag
            decoder_start_token_id=decoder_start_token_id,  # decoder start token id
            **kwargs,  # forward the remaining keyword arguments
        )

`.\models\led\modeling_led.py`

# coding=utf-8
# Copyright 2021 Iz Beltagy, Matthew E. Peters, Arman Cohan and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch LED model."""


import math
import warnings
from dataclasses import dataclass
from typing import List, Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...modeling_attn_mask_utils import _create_4d_causal_attention_mask
from ...modeling_outputs import (
    BaseModelOutputWithPastAndCrossAttentions,
    Seq2SeqLMOutput,
    Seq2SeqModelOutput,
    Seq2SeqQuestionAnsweringModelOutput,
    Seq2SeqSequenceClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...utils import (
    ModelOutput,
    add_code_sample_docstrings,
    add_end_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
from .configuration_led import LEDConfig


# Module-level logger obtained through the library's `logging` helper imported above.
logger = logging.get_logger(__name__)

# Checkpoint id and config class name kept as constants for documentation purposes
# (presumably consumed by the docstring decorators imported above -- confirm at the use sites).
_CHECKPOINT_FOR_DOC = "allenai/led-base-16384"
_CONFIG_FOR_DOC = "LEDConfig"


# Identifiers of the LED checkpoints listed by this module.
LED_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "allenai/led-base-16384",
    # See all LED models at https://huggingface.co/models?filter=led
]


def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
    """
    Shift input ids one token to the right.

    Args:
        input_ids: 2-D tensor of token ids (indexed as `[batch, position]`).
        pad_token_id: id written wherever the shifted result contains `-100`.
        decoder_start_token_id: id placed in the first column of the result.

    Returns:
        A new tensor with the same shape, dtype and device as `input_ids`; the input is not modified.

    Raises:
        ValueError: if `pad_token_id` is `None`.
    """
    # `new_zeros` keeps the dtype and device of the input.
    shifted = input_ids.new_zeros(input_ids.shape)
    shifted[:, 0] = decoder_start_token_id
    # The last input token falls off the end; everything else moves one step right.
    shifted[:, 1:] = input_ids[:, :-1].clone()

    if pad_token_id is None:
        raise ValueError("config.pad_token_id has to be defined.")

    # Labels may carry -100 entries; those must not reach the decoder as ids.
    return shifted.masked_fill_(shifted == -100, pad_token_id)


def _prepare_4d_attention_mask_inverted(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
    """
    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
    """
    # 获取输入mask的形状信息
    bsz, src_len = mask.size()
    # 如果未指定tgt_len，默认设为src_len
    tgt_len = tgt_len if tgt_len is not None else src_len

    # 将mask从[bsz, seq_len]扩展为[bsz, 1, tgt_seq_len, src_seq_len]
    expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)

    # 创建反转的mask，用于全局attention
    inverted_mask = 1.0 - expanded_mask
    # 将反转后的mask中的True值用极小的负数填充，以便后续处理
    expanded_attention_mask = inverted_mask.masked_fill(inverted_mask.bool(), torch.finfo(dtype).min)

    # 确保全局attention_mask为正数
    # 将扩展后的注意力掩码与反转掩码逐元素相乘，实现对应位置的逻辑运算
    expanded_attention_mask = expanded_attention_mask * inverted_mask

    # 返回经过处理后的扩展注意力掩码
    return expanded_attention_mask
class LEDLearnedPositionalEmbedding(nn.Embedding):
    """
    This module learns positional embeddings up to a fixed maximum size.
    """

    def __init__(self, num_embeddings: int, embedding_dim: int):
        super().__init__(num_embeddings, embedding_dim)

    def forward(self, input_ids_shape: torch.Size, past_key_values_length: int = 0):
        """
        Look up the embeddings of a contiguous run of positions.

        `input_ids_shape` is expected to be [bsz x seqlen]; only the sequence length is used.
        The positions start at `past_key_values_length`, so the result has one row per position
        and does not carry a batch dimension.
        """
        _, seq_len = input_ids_shape[:2]
        first_position = past_key_values_length
        position_ids = torch.arange(
            first_position, first_position + seq_len, dtype=torch.long, device=self.weight.device
        )
        # Delegate the actual lookup to nn.Embedding.
        return super().forward(position_ids)


# Copied from transformers.models.longformer.modeling_longformer.LongformerSelfAttention with Longformer->LEDEncoder
class LEDEncoderSelfAttention(nn.Module):
    def __init__(self, config, layer_id):
        super().__init__()
        # 检查隐藏大小是否可以整除注意力头数
        if config.hidden_size % config.num_attention_heads != 0:
            raise ValueError(
                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
                f"heads ({config.num_attention_heads})"
            )
        # 设置注意力头数、每头的维度和嵌入维度
        self.num_heads = config.num_attention_heads
        self.head_dim = int(config.hidden_size / config.num_attention_heads)
        self.embed_dim = config.hidden_size

        # 为查询、键和值设置线性映射层
        self.query = nn.Linear(config.hidden_size, self.embed_dim)
        self.key = nn.Linear(config.hidden_size, self.embed_dim)
        self.value = nn.Linear(config.hidden_size, self.embed_dim)

        # 为全局注意力的查询、键和值设置线性映射层
        self.query_global = nn.Linear(config.hidden_size, self.embed_dim)
        self.key_global = nn.Linear(config.hidden_size, self.embed_dim)
        self.value_global = nn.Linear(config.hidden_size, self.embed_dim)

        # 设置注意力概率的dropout率
        self.dropout = config.attention_probs_dropout_prob

        self.layer_id = layer_id
        # 检查并设置单向注意力窗口大小
        attention_window = config.attention_window[self.layer_id]
        assert (
            attention_window % 2 == 0
        ), f"`attention_window` for layer {self.layer_id} has to be an even value. Given {attention_window}"
        assert (
            attention_window > 0
        ), f"`attention_window` for layer {self.layer_id} has to be positive. Given {attention_window}"

        self.one_sided_attn_window_size = attention_window // 2

        self.config = config

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        layer_head_mask=None,
        is_index_masked=None,
        is_index_global_attn=None,
        is_global_attn=None,
        output_attentions=False,
    ):
        """
        Placeholder for the encoder self-attention forward pass.

        As written here the method ignores every argument and returns `None`.
        """
        # NOTE(review): according to the original annotation, the body should have been copied from
        # the Longformer implementation but was left out of this listing and still has to be filled in.
        pass
    def _pad_and_transpose_last_two_dims(hidden_states_padded, padding):
        """
        对最后两个维度进行填充和转置操作。

        Args:
            hidden_states_padded (torch.Tensor): 填充后的隐藏状态张量
            padding (tuple): 填充值，实际数值并不重要，因为它将被覆盖

        Returns:
            torch.Tensor: 填充和转置后的隐藏状态张量
        """
        hidden_states_padded = nn.functional.pad(
            hidden_states_padded, padding
        )  # 使用 padding 对 hidden_states_padded 进行填充，填充值并不重要，因为后续会被覆盖
        hidden_states_padded = hidden_states_padded.view(
            *hidden_states_padded.size()[:-2], hidden_states_padded.size(-1), hidden_states_padded.size(-2)
        )  # 转置最后两个维度
        return hidden_states_padded

    @staticmethod
    def _pad_and_diagonalize(chunked_hidden_states):
        """
        将每一行向右移动一步，将列转换为对角线。

        Args:
            chunked_hidden_states (torch.Tensor): 分块的隐藏状态张量

        Returns:
            torch.Tensor: 填充和对角化后的隐藏状态张量
        """
        total_num_heads, num_chunks, window_overlap, hidden_dim = chunked_hidden_states.size()
        chunked_hidden_states = nn.functional.pad(
            chunked_hidden_states, (0, window_overlap + 1)
        )  # 对 chunked_hidden_states 进行填充，第一个维度不填充，第二个维度填充 window_overlap + 1
        chunked_hidden_states = chunked_hidden_states.view(
            total_num_heads, num_chunks, -1
        )  # 将张量视图重塑为 total_num_heads x num_chunks x (window_overlap*hidden_dim + window_overlap + 1)
        chunked_hidden_states = chunked_hidden_states[
            :, :, :-window_overlap
        ]  # 截取最后一个维度的部分，得到 total_num_heads x num_chunks x (window_overlap*hidden_dim)
        chunked_hidden_states = chunked_hidden_states.view(
            total_num_heads, num_chunks, window_overlap, window_overlap + hidden_dim
        )  # 将张量重塑为 total_num_heads x num_chunks x window_overlap x (window_overlap + hidden_dim)
        chunked_hidden_states = chunked_hidden_states[:, :, :, :-1]  # 去除最后一个维度的最后一个元素
        return chunked_hidden_states
    def _chunk(hidden_states, window_overlap, onnx_export: bool = False):
        """convert into overlapping chunks. Chunk size = 2w, overlap size = w"""
        if not onnx_export:
            # 非 ONNX 导出模式下，创建大小为 2w 的非重叠块
            hidden_states = hidden_states.view(
                hidden_states.size(0),
                torch.div(hidden_states.size(1), (window_overlap * 2), rounding_mode="trunc"),
                window_overlap * 2,
                hidden_states.size(2),
            )
            # 使用 `as_strided` 实现重叠块，重叠大小为 window_overlap
            chunk_size = list(hidden_states.size())
            chunk_size[1] = chunk_size[1] * 2 - 1

            chunk_stride = list(hidden_states.stride())
            chunk_stride[1] = chunk_stride[1] // 2
            return hidden_states.as_strided(size=chunk_size, stride=chunk_stride)

        # 当导出到 ONNX 时，使用单独的逻辑
        # 因为 ONNX 导出不支持 `as_strided`、`unfold` 和二维张量索引，所以需要使用较慢的实现方法

        # TODO 替换以下代码为
        # > return hidden_states.unfold(dimension=1, size=window_overlap * 2, step=window_overlap).transpose(2, 3)
        # 一旦 `unfold` 得到支持
        # 当 hidden_states.size(1) == window_overlap * 2 时，也可以简单返回 hidden_states.unsqueeze(1)，但这是控制流

        chunk_size = [
            hidden_states.size(0),
            torch.div(hidden_states.size(1), window_overlap, rounding_mode="trunc") - 1,
            window_overlap * 2,
            hidden_states.size(2),
        ]

        # 创建一个与 hidden_states 形状相同的张量用于存储重叠块
        overlapping_chunks = torch.empty(chunk_size, device=hidden_states.device)
        for chunk in range(chunk_size[1]):
            # 将重叠块存储到 overlapping_chunks 中
            overlapping_chunks[:, chunk, :, :] = hidden_states[
                :, chunk * window_overlap : chunk * window_overlap + 2 * window_overlap, :
            ]
        return overlapping_chunks

    @staticmethod
    def _mask_invalid_locations(input_tensor, affected_seq_len) -> torch.Tensor:
        # 创建一个二维矩阵，用于掩盖无效位置
        beginning_mask_2d = input_tensor.new_ones(affected_seq_len, affected_seq_len + 1).tril().flip(dims=[0])
        beginning_mask = beginning_mask_2d[None, :, None, :]
        ending_mask = beginning_mask.flip(dims=(1, 3))
        
        # 控制起始部分的输入张量，使其无效位置被掩盖
        beginning_input = input_tensor[:, :affected_seq_len, :, : affected_seq_len + 1]
        beginning_mask = beginning_mask.expand(beginning_input.size())
        input_tensor[:, :affected_seq_len, :, : affected_seq_len + 1] = torch.full_like(
            beginning_input, -float("inf")
        ).where(beginning_mask.bool(), beginning_input)
        
        # 控制结束部分的输入张量，使其无效位置被掩盖
        ending_input = input_tensor[:, -affected_seq_len:, :, -(affected_seq_len + 1) :]
        ending_mask = ending_mask.expand(ending_input.size())
        input_tensor[:, -affected_seq_len:, :, -(affected_seq_len + 1) :] = torch.full_like(
            ending_input, -float("inf")
        ).where(ending_mask.bool(), ending_input)
    def _sliding_chunks_matmul_attn_probs_value(
        self, attn_probs: torch.Tensor, value: torch.Tensor, window_overlap: int
    ):
        """
        Same as _sliding_chunks_query_key_matmul but for attn_probs and value tensors. Returned tensor will be of the
        same shape as `attn_probs`
        """
        # 获取 value 的形状信息：batch_size 是批大小，seq_len 是序列长度，num_heads 是注意力头数，head_dim 是每个头的维度
        batch_size, seq_len, num_heads, head_dim = value.size()

        # 断言确保 seq_len 能被 2 * window_overlap 整除，以支持滑动窗口的操作
        assert seq_len % (window_overlap * 2) == 0
        # 断言确保 attn_probs 和 value 的前三个维度匹配，即 batch_size、seq_len 和 num_heads
        assert attn_probs.size()[:3] == value.size()[:3]
        # 断言确保 attn_probs 的第四个维度是 2 * window_overlap + 1，与滑动窗口的大小匹配
        assert attn_probs.size(3) == 2 * window_overlap + 1

        # 计算 chunks_count，即分块的数量，每个块大小为 window_overlap
        chunks_count = torch.div(seq_len, window_overlap, rounding_mode="trunc") - 1

        # 将 attn_probs 转置并重塑成新的形状，以便进行滑动窗口操作
        chunked_attn_probs = attn_probs.transpose(1, 2).reshape(
            batch_size * num_heads,
            torch.div(seq_len, window_overlap, rounding_mode="trunc"),
            window_overlap,
            2 * window_overlap + 1,
        )

        # 将 value 转置并重塑成新的形状，以便进行滑动窗口操作
        value = value.transpose(1, 2).reshape(batch_size * num_heads, seq_len, head_dim)

        # 在序列的开头和结尾各填充 window_overlap 个值，以支持滑动窗口操作
        padded_value = nn.functional.pad(value, (0, 0, window_overlap, window_overlap), value=-1)

        # 根据滑动窗口的大小和重叠，将 padded_value 进行分块处理
        chunked_value_size = (batch_size * num_heads, chunks_count + 1, 3 * window_overlap, head_dim)
        chunked_value_stride = padded_value.stride()
        chunked_value_stride = (
            chunked_value_stride[0],
            window_overlap * chunked_value_stride[1],
            chunked_value_stride[1],
            chunked_value_stride[2],
        )
        chunked_value = padded_value.as_strided(size=chunked_value_size, stride=chunked_value_stride)

        # 对 chunked_attn_probs 进行填充和对角化处理
        chunked_attn_probs = self._pad_and_diagonalize(chunked_attn_probs)

        # 使用 Einstein Summation 计算 context 向量，用于最终的输出
        context = torch.einsum("bcwd,bcdh->bcwh", (chunked_attn_probs, chunked_value))

        # 将 context 向量重塑成最终的形状，并进行维度转置，以匹配期望的输出形状
        return context.view(batch_size, num_heads, seq_len, head_dim).transpose(1, 2)
    def _get_global_attn_indices(is_index_global_attn):
        """计算在整个前向传递中需要使用的全局注意力索引"""
        # 计算每个样本中全局注意力索引的数量
        num_global_attn_indices = is_index_global_attn.long().sum(dim=1)

        # 批次中全局注意力索引的最大数量
        max_num_global_attn_indices = num_global_attn_indices.max()

        # 获取全局注意力索引的位置
        is_index_global_attn_nonzero = is_index_global_attn.nonzero(as_tuple=True)

        # 辅助变量，用于标识是否为全局注意力索引
        is_local_index_global_attn = torch.arange(
            max_num_global_attn_indices, device=is_index_global_attn.device
        ) < num_global_attn_indices.unsqueeze(dim=-1)

        # 非零值位置的全局注意力索引
        is_local_index_global_attn_nonzero = is_local_index_global_attn.nonzero(as_tuple=True)

        # 非全局注意力索引的零值位置
        is_local_index_no_global_attn_nonzero = (is_local_index_global_attn == 0).nonzero(as_tuple=True)
        return (
            max_num_global_attn_indices,
            is_index_global_attn_nonzero,
            is_local_index_global_attn_nonzero,
            is_local_index_no_global_attn_nonzero,
        )

    def _concat_with_global_key_attn_probs(
        self,
        key_vectors,
        query_vectors,
        max_num_global_attn_indices,
        is_index_global_attn_nonzero,
        is_local_index_global_attn_nonzero,
        is_local_index_no_global_attn_nonzero,
    ):
        batch_size = key_vectors.shape[0]

        # 创建仅包含全局键向量的张量
        key_vectors_only_global = key_vectors.new_zeros(
            batch_size, max_num_global_attn_indices, self.num_heads, self.head_dim
        )

        # 将全局注意力索引对应的键向量复制到新张量中
        key_vectors_only_global[is_local_index_global_attn_nonzero] = key_vectors[is_index_global_attn_nonzero]

        # 使用 Einstein Summation 表示计算注意力概率
        # (batch_size, seq_len, num_heads, max_num_global_attn_indices)
        attn_probs_from_global_key = torch.einsum("blhd,bshd->blhs", (query_vectors, key_vectors_only_global))

        # 由于 ONNX 导出仅支持连续索引，需要转置操作
        attn_probs_from_global_key = attn_probs_from_global_key.transpose(1, 3)

        # 将非全局注意力索引位置的值置为一个极小的数，用于遮盖
        attn_probs_from_global_key[
            is_local_index_no_global_attn_nonzero[0], is_local_index_no_global_attn_nonzero[1], :, :
        ] = torch.finfo(attn_probs_from_global_key.dtype).min

        # 再次进行转置，使形状与原始张量保持一致
        attn_probs_from_global_key = attn_probs_from_global_key.transpose(1, 3)

        return attn_probs_from_global_key

    def _compute_attn_output_with_global_indices(
        self,
        value_vectors,
        attn_probs,
        max_num_global_attn_indices,
        is_index_global_attn_nonzero,
        is_local_index_global_attn_nonzero,
        is_local_index_no_global_attn_nonzero,
    ):
    ):
        batch_size = attn_probs.shape[0]  # 获取批量大小

        # cut local attn probs to global only
        attn_probs_only_global = attn_probs.narrow(-1, 0, max_num_global_attn_indices)
        # get value vectors for global only
        value_vectors_only_global = value_vectors.new_zeros(
            batch_size, max_num_global_attn_indices, self.num_heads, self.head_dim
        )
        # 将全局注意力的位置上的值向量复制到新的张量中
        value_vectors_only_global[is_local_index_global_attn_nonzero] = value_vectors[is_index_global_attn_nonzero]

        # use `matmul` because `einsum` crashes sometimes with fp16
        # 使用`matmul`进行矩阵乘法计算，因为在fp16模式下，`einsum`有时会崩溃
        attn_output_only_global = torch.matmul(
            attn_probs_only_global.transpose(1, 2).clone(), value_vectors_only_global.transpose(1, 2).clone()
        ).transpose(1, 2)

        # reshape attn probs
        # 重新整形非全局注意力概率
        attn_probs_without_global = attn_probs.narrow(
            -1, max_num_global_attn_indices, attn_probs.size(-1) - max_num_global_attn_indices
        ).contiguous()

        # compute attn output with global
        # 使用滑动窗口方法计算包含全局注意力的注意力输出
        attn_output_without_global = self._sliding_chunks_matmul_attn_probs_value(
            attn_probs_without_global, value_vectors, self.one_sided_attn_window_size
        )
        return attn_output_only_global + attn_output_without_global

    # NOTE(review): this definition is cut off in the listing -- the parameter list below is never
    # closed and no body follows before the next `class` statement. The remainder of the method has
    # to be restored before the file can be parsed.
    def _compute_global_attn_output_from_hidden(
        self,
        hidden_states,
        max_num_global_attn_indices,
        layer_head_mask,
        is_local_index_global_attn_nonzero,
        is_index_global_attn_nonzero,
        is_local_index_no_global_attn_nonzero,
        is_index_masked,
class LEDEncoderAttention(nn.Module):
    """
    Encoder attention block: `LEDEncoderSelfAttention` followed by a linear output projection.
    """

    def __init__(self, config, layer_id):
        super().__init__()
        # Self-attention for the given layer index.
        self.longformer_self_attn = LEDEncoderSelfAttention(config, layer_id=layer_id)
        # Projection from `config.d_model` back to `config.d_model`.
        self.output = nn.Linear(config.d_model, config.d_model)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        is_index_masked: Optional[torch.Tensor] = None,
        is_index_global_attn: Optional[torch.Tensor] = None,
        is_global_attn: Optional[bool] = None,
        output_attentions: bool = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """
        Input shape: Batch x Time x Channel

        All arguments are forwarded unchanged to the self-attention module. The first element of its
        result is passed through the output projection; any further elements are returned as they are.
        """
        self_outputs = self.longformer_self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            layer_head_mask=layer_head_mask,
            is_index_masked=is_index_masked,
            is_index_global_attn=is_index_global_attn,
            is_global_attn=is_global_attn,
            output_attentions=output_attentions,
        )

        projected = self.output(self_outputs[0])
        return (projected,) + self_outputs[1:]


class LEDDecoderAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(
        self,
        embed_dim: int,
        num_heads: int,
        dropout: float = 0.0,
        is_decoder: bool = False,
        bias: bool = True,
    ):
        """
        Args:
            embed_dim: model dimension; must be divisible by `num_heads`.
            num_heads: number of attention heads.
            dropout: dropout probability, stored as a plain float.
            is_decoder: flag stored on the module.
            bias: whether the four linear projections carry a bias term.

        Raises:
            ValueError: if `embed_dim` is not divisible by `num_heads`.
        """
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads
        if self.head_dim * num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {num_heads})."
            )
        # 1 / sqrt(head_dim)
        self.scaling = self.head_dim**-0.5
        self.is_decoder = is_decoder

        # Key, value, query and output projections.
        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)

    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        """Reshape `(bsz, seq_len, embed_dim)` into a contiguous `(bsz, num_heads, seq_len, head_dim)` tensor."""
        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()

    def forward(
        self,
        hidden_states: torch.Tensor,
        key_value_states: Optional[torch.Tensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor]]]:
        """Forward pass of the decoder attention module."""
        # NOTE(review): the body of this method is omitted in the listing; as written it returns None.


# The class statement below restores a boundary that was missing from the listing: without it, the
# following `__init__`/`forward` pair was part of `LEDDecoderAttention` and replaced its constructor
# and forward pass, so `LEDDecoderAttention(embed_dim=..., num_heads=...)` could not be constructed.
class LEDEncoderLayer(nn.Module):
    """
    One encoder layer: `LEDEncoderAttention` and a two-layer feed-forward network, each followed by a
    residual connection and layer normalization.
    """

    def __init__(self, config: LEDConfig, layer_id: int):
        super().__init__()
        self.embed_dim = config.d_model
        # Self-attention for this layer index.
        self.self_attn = LEDEncoderAttention(config, layer_id)
        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
        # Dropout probabilities are stored as floats and applied functionally in `forward`.
        self.dropout = config.dropout
        self.activation_fn = ACT2FN[config.activation_function]
        self.activation_dropout = config.activation_dropout
        # Feed-forward network: d_model -> encoder_ffn_dim -> d_model.
        self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
        self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
        self.final_layer_norm = nn.LayerNorm(self.embed_dim)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        layer_head_mask: torch.Tensor,
        is_index_masked=None,
        is_index_global_attn=None,
        is_global_attn=None,
        output_attentions=False,
    ):
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape *(batch, seq_len, embed_dim)*
            attention_mask (`torch.FloatTensor`): attention mask of size
                *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values.
            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
                *(encoder_attention_heads,)*.

        Returns:
            A tuple whose first element is the layer output, followed by any additional elements
            returned by the attention module.
        """
        # Attention sub-block: attention -> dropout -> residual -> layer norm.
        residual = hidden_states
        attn_outputs = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            layer_head_mask=layer_head_mask,
            is_index_masked=is_index_masked,
            is_index_global_attn=is_index_global_attn,
            is_global_attn=is_global_attn,
            output_attentions=output_attentions,
        )
        hidden_states = attn_outputs[0]
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states
        hidden_states = self.self_attn_layer_norm(hidden_states)

        # Feed-forward sub-block: fc1 -> activation -> dropout -> fc2 -> dropout -> residual -> layer norm.
        residual = hidden_states
        hidden_states = self.activation_fn(self.fc1(hidden_states))
        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
        hidden_states = self.fc2(hidden_states)
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states
        hidden_states = self.final_layer_norm(hidden_states)

        # In half precision, clamp the output into the finite range if it contains inf or nan.
        if hidden_states.dtype == torch.float16 and (
            torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any()
        ):
            clamp_value = torch.finfo(hidden_states.dtype).max - 1000
            hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
        return (hidden_states,) + attn_outputs[1:]
class LEDDecoderLayer(nn.Module):
    """
    One decoder layer: two `LEDDecoderAttention` modules (`self_attn`, `encoder_attn`), a two-layer
    feed-forward network and one layer normalization per sub-block.
    """

    def __init__(self, config: LEDConfig):
        super().__init__()
        d_model = config.d_model
        self.embed_dim = d_model

        def build_attention():
            # Both attention modules are created with identical settings.
            return LEDDecoderAttention(
                embed_dim=d_model,
                num_heads=config.decoder_attention_heads,
                dropout=config.attention_dropout,
                is_decoder=True,
            )

        # The assignment order below is kept as in the original so that submodules register in the same order.
        self.self_attn = build_attention()

        # Dropout probabilities are stored as floats.
        self.dropout = config.dropout
        self.activation_fn = ACT2FN[config.activation_function]
        self.activation_dropout = config.activation_dropout

        self.self_attn_layer_norm = nn.LayerNorm(d_model)
        self.encoder_attn = build_attention()
        self.encoder_attn_layer_norm = nn.LayerNorm(d_model)
        # Feed-forward network: d_model -> decoder_ffn_dim -> d_model.
        self.fc1 = nn.Linear(d_model, config.decoder_ffn_dim)
        self.fc2 = nn.Linear(config.decoder_ffn_dim, d_model)
        self.final_layer_norm = nn.LayerNorm(d_model)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        cross_attn_layer_head_mask: Optional[torch.Tensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = True,
    ):
        """
        Placeholder: the forward logic is omitted in this listing, so the method ignores its
        arguments and returns `None`.
        """
        return None


class LEDClassificationHead(nn.Module):
    """Head for sentence-level classification tasks."""

    def __init__(
        self,
        input_dim: int,
        inner_dim: int,
        num_classes: int,
        pooler_dropout: float,
    ):
        """
        Args:
            input_dim: size of the incoming features.
            inner_dim: size of the intermediate projection.
            num_classes: number of output logits.
            pooler_dropout: dropout probability applied before each linear layer.
        """
        super().__init__()
        self.dense = nn.Linear(input_dim, inner_dim)
        self.dropout = nn.Dropout(p=pooler_dropout)
        self.out_proj = nn.Linear(inner_dim, num_classes)

    def forward(self, hidden_states: torch.Tensor):
        """Apply dropout -> dense -> tanh -> dropout -> out_proj and return the result."""
        projected = torch.tanh(self.dense(self.dropout(hidden_states)))
        return self.out_proj(self.dropout(projected))


class LEDPreTrainedModel(PreTrainedModel):
    """
    Common base for the LED models: sets the configuration class, the base-model prefix, the weight
    initialization and a small set of dummy inputs.
    """

    config_class = LEDConfig
    base_model_prefix = "led"
    supports_gradient_checkpointing = True

    def _init_weights(self, module):
        """Initialize `nn.Linear` and `nn.Embedding` weights from N(0, config.init_std); other modules are left untouched."""
        std = self.config.init_std
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            # The row of the padding index, if any, is reset to zero.
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()

    @property
    def dummy_property(self):
        # Placeholder without any function: it always evaluates to None. Kept only so that existing
        # attribute access keeps working.
        pass

    # `dummy_inputs` is exposed as a property (attribute access, no call), matching how the
    # `PreTrainedModel` base class exposes it. In the listing the `@property` decorator had ended
    # up on the placeholder above instead of on this accessor.
    @property
    def dummy_inputs(self):
        """Return a small dictionary with `input_ids` and the matching `attention_mask`."""
        pad_token = self.config.pad_token_id
        # Two sequences of length five; the second one ends with the padding token.
        input_ids = torch.tensor([[0, 6, 10, 4, 2], [0, 8, 12, 2, pad_token]], device=self.device)
        dummy_inputs = {
            # True wherever the token differs from the padding token.
            "attention_mask": input_ids.ne(pad_token),
            "input_ids": input_ids,
        }
        return dummy_inputs
# Declared as a dataclass; the fields below define the contents of the output object.
@dataclass
# Copied from transformers.models.longformer.modeling_longformer.LongformerBaseModelOutput with Longformer->LEDEncoder
# Base class for the outputs of LEDEncoder, with potential hidden states, local and global attentions.
class LEDEncoderBaseModelOutput(ModelOutput):
    """
    Base class for LEDEncoder's outputs, with potential hidden states, local and global attentions.
    """
    # Hidden state of the last layer; the only field without a default value.
    last_hidden_state: torch.FloatTensor
    # Optional tuple of hidden states (per the original note: one per layer plus the initial embedding output).
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    # Optional tuple of attention tensors; defaults to None.
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
    # Optional tuple of global-attention tensors; defaults to None.
    global_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
@dataclass
class LEDSeq2SeqModelOutput(ModelOutput):
    """
    Output class (a `ModelOutput`) for the model encoder's outputs that also contains pre-computed
    hidden states that can speed up sequential decoding.
    """

    # Every field defaults to None. Decoder-side fields come first, followed by the encoder-side
    # fields prefixed with `encoder_`.
    last_hidden_state: torch.FloatTensor = None
    past_key_values: Optional[List[torch.FloatTensor]] = None
    decoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    decoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
    cross_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
    encoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    encoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
    encoder_global_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None


@dataclass
class LEDSeq2SeqLMOutput(ModelOutput):
    """
    Output class (a `ModelOutput`) for sequence-to-sequence language model outputs.
    """

    # Every field defaults to None. `loss` and `logits` come first, followed by the same
    # decoder-side and encoder-side fields as in the other seq2seq output classes of this module.
    loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    past_key_values: Optional[List[torch.FloatTensor]] = None
    decoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    decoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
    cross_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
    encoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    encoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
    encoder_global_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None


@dataclass
class LEDSeq2SeqSequenceClassifierOutput(ModelOutput):
    """
    Output class (a `ModelOutput`) for the outputs of sequence-to-sequence sentence classification models.
    """

    # Every field defaults to None. `loss` and `logits` come first, followed by the same
    # decoder-side and encoder-side fields as in the other seq2seq output classes of this module.
    loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    past_key_values: Optional[List[torch.FloatTensor]] = None
    decoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    decoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
    cross_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
    encoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    encoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
    encoder_global_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None


@dataclass
class LEDSeq2SeqQuestionAnsweringModelOutput(ModelOutput):
    """
    Output class (a `ModelOutput`) for the outputs of sequence-to-sequence question answering models.
    """

    # Every field defaults to None. Instead of a single `logits` field this class carries
    # separate `start_logits` and `end_logits`.
    loss: Optional[torch.FloatTensor] = None
    start_logits: torch.FloatTensor = None
    end_logits: torch.FloatTensor = None
    past_key_values: Optional[List[torch.FloatTensor]] = None
    decoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    decoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
    cross_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
    # Optional last hidden state of the encoder; defaults to None.
    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
    # Optional tuple holding the encoder hidden states; defaults to None.
    encoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    # Optional tuple holding the encoder attention weights; defaults to None.
    encoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
    # Optional tuple holding the encoder global attention weights; defaults to None.
    encoder_global_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
# LED_START_DOCSTRING: introductory docstring text shared by the LED model classes. It is passed to
# `add_start_docstrings` on the model classes defined below.
LED_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. See the superclass documentation for the generic methods the library
    implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for general usage and behavior.

    Parameters:
        config ([`LEDConfig`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

# LED_GENERATION_EXAMPLE: summarization walkthrough that is appended to the docstring of
# `LEDForConditionalGeneration.forward` through `add_end_docstrings`.
# Explanatory remarks inside the example are written as `>>> # ...` lines: a bare `# ...` line placed after a
# `>>>` statement is parsed by doctest as that statement's expected output.
LED_GENERATION_EXAMPLE = r"""
    Summarization example:

    ```
    >>> import torch
    >>> from transformers import AutoTokenizer, LEDForConditionalGeneration

    >>> model = LEDForConditionalGeneration.from_pretrained("allenai/led-large-16384-arxiv")
    >>> tokenizer = AutoTokenizer.from_pretrained("allenai/led-large-16384-arxiv")

    >>> ARTICLE_TO_SUMMARIZE = '''Transformers (Vaswani et al., 2017) have achieved state-of-the-art
    ...     results in a wide range of natural language tasks including generative language modeling
    ...     (Dai et al., 2019; Radford et al., 2019) and discriminative ... language understanding (Devlin et al., 2019).
    ...     This success is partly due to the self-attention component which enables the network to capture contextual
    ...     information from the entire sequence. While powerful, the memory and computational requirements of
    ...     self-attention grow quadratically with sequence length, making it infeasible (or very expensive) to
    ...     process long sequences. To address this limitation, we present Longformer, a modified Transformer
    ...     architecture with a self-attention operation that scales linearly with the sequence length, making it
    ...     versatile for processing long documents (Fig 1). This is an advantage for natural language tasks such as
    ...     long document classification, question answering (QA), and coreference resolution, where existing approaches
    ...     partition or shorten the long context into smaller sequences that fall within the typical 512 token limit
    ...     of BERT-style pretrained models. Such partitioning could potentially result in loss of important
    ...     cross-partition information, and to mitigate this problem, existing methods often rely on complex
    ...     architectures to address such interactions. On the other hand, our proposed Longformer is able to build
    ...     contextual representations of the entire context using multiple layers of attention, reducing the need for
    ...     task-specific architectures.'''
    >>> inputs = tokenizer.encode(ARTICLE_TO_SUMMARIZE, return_tensors="pt")

    >>> # Put global attention on the first token only
    >>> global_attention_mask = torch.zeros_like(inputs)
    >>> global_attention_mask[:, 0] = 1

    >>> # Generate the summary with beam search, then decode it
    >>> summary_ids = model.generate(inputs, global_attention_mask=global_attention_mask, num_beams=3, max_length=32)
    >>> print(tokenizer.decode(summary_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=True))

    ```
"""
# LED_INPUTS_DOCSTRING: text attached to the `forward` methods below through
# `add_start_docstrings_to_model_forward`.
# NOTE(review): the string is empty in this file, so no argument documentation is attached -- the original
# text appears to have been dropped and should be restored.
LED_INPUTS_DOCSTRING = r"""
"""


class LEDEncoder(LEDPreTrainedModel):
    """
    Encoder side of LED. Holds the token embedding, a learned positional embedding, an embedding layer norm and
    `config.encoder_layers` [`LEDEncoderLayer`] modules.

    Args:
        config: LEDConfig
        embed_tokens (nn.Embedding, *optional*): token embedding to reuse; a new one is created when omitted.
    """

    def __init__(self, config: LEDConfig, embed_tokens: Optional[nn.Embedding] = None):
        """
        Build the encoder sub-modules and normalise `config.attention_window`.

        Raises:
            ValueError: if an integer `config.attention_window` is odd or not positive, or if a sequence-valued
                `config.attention_window` does not have `config.num_hidden_layers` entries.
        """
        super().__init__(config)

        # Plain hyper-parameters copied from the configuration.
        self.dropout = config.dropout
        self.layerdrop = config.encoder_layerdrop
        self.padding_idx = config.pad_token_id
        self.max_source_positions = config.max_encoder_position_embeddings
        embed_dim = config.d_model

        window = config.attention_window
        if not isinstance(window, int):
            # Per-layer windows were supplied: only their count is validated.
            if len(window) != config.num_hidden_layers:
                raise ValueError(
                    "`len(config.attention_window)` should equal `config.num_hidden_layers`. "
                    f"Expected {config.num_hidden_layers}, given {len(config.attention_window)}"
                )
        else:
            if window % 2 != 0:
                raise ValueError("`config.attention_window` has to be an even value")
            if window <= 0:
                raise ValueError("`config.attention_window` has to be positive")
            # A single window size is expanded in place to one entry per layer (this mutates `config`).
            config.attention_window = [window] * config.num_hidden_layers

        # Reuse the caller's embedding when given, otherwise allocate a fresh one.
        self.embed_tokens = (
            embed_tokens
            if embed_tokens is not None
            else nn.Embedding(config.vocab_size, embed_dim, self.padding_idx)
        )
        self.embed_positions = LEDLearnedPositionalEmbedding(self.max_source_positions, embed_dim)
        self.layers = nn.ModuleList(
            [LEDEncoderLayer(config, layer_idx) for layer_idx in range(config.encoder_layers)]
        )
        self.layernorm_embedding = nn.LayerNorm(embed_dim)

        # Gradient checkpointing is off by default.
        self.gradient_checkpointing = False

        # Initialize weights and apply final processing
        self.post_init()

    def _merge_to_attention_mask(self, attention_mask: torch.Tensor, global_attention_mask: torch.Tensor):
        """
        Fold the global-attention mask into the attention mask.

        The result uses the encoding expected by longformer self-attention: 0 for no attention, 1 for local
        attention and 2 for global attention. `global_attention_mask + 1` maps local positions to 1 and global
        positions to 2; multiplying by `attention_mask` keeps positions whose mask value is 0 at 0.
        """
        merged = global_attention_mask + 1
        if attention_mask is None:
            # Without an attention mask the shifted global mask is used on its own.
            return merged
        return attention_mask * merged

    def _pad_to_window_size(
        self,
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor,
        inputs_embeds: torch.Tensor,
        pad_token_id: int,
    ):
        """
        Pad the inputs along the sequence dimension up to a multiple of the attention window.

        The window is `config.attention_window` when it is an int, otherwise the largest entry of it.

        Returns:
            Tuple `(padding_len, input_ids, attention_mask, inputs_embeds)`. The tensors are returned unchanged
            when no padding is required.

        Raises:
            ValueError: if the attention window is odd.
        """
        window = self.config.attention_window
        if not isinstance(window, int):
            window = max(window)

        if window % 2 != 0:
            raise ValueError(f"`attention_window` should be an even value. Given {window}")

        # Batch size and sequence length come from whichever input was provided.
        reference = inputs_embeds if input_ids is None else input_ids
        batch_size, seq_len = reference.shape[:2]

        # Distance from `seq_len` to the next multiple of the window.
        padding_len = (window - seq_len % window) % window
        if padding_len <= 0:
            return padding_len, input_ids, attention_mask, inputs_embeds

        logger.warning_once(
            f"Input ids are automatically padded from {seq_len} to {seq_len + padding_len} to be a multiple of "
            f"`config.attention_window`: {window}"
        )
        if input_ids is not None:
            input_ids = nn.functional.pad(input_ids, (0, padding_len), value=pad_token_id)
        if inputs_embeds is not None:
            # Embed a block of pad tokens and append it to the provided embeddings.
            pad_ids = inputs_embeds.new_full(
                (batch_size, padding_len),
                self.config.pad_token_id,
                dtype=torch.long,
            )
            inputs_embeds = torch.cat([inputs_embeds, self.embed_tokens(pad_ids)], dim=-2)

        # no attention on the padding tokens
        attention_mask = nn.functional.pad(attention_mask, (0, padding_len), value=False)

        return padding_len, input_ids, attention_mask, inputs_embeds

    # NOTE(review): this signature is cut off in this listing -- the closing parenthesis and the method body are
    # missing, so only the parameter names and their `None` defaults are visible.
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        global_attention_mask=None,
        head_mask=None,
        inputs_embeds=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
class LEDDecoder(LEDPreTrainedModel):
    """
    Decoder side of LED. Holds the token embedding, a learned positional embedding, an embedding layer norm and
    `config.decoder_layers` [`LEDDecoderLayer`] modules.

    Args:
        config: LEDConfig
        embed_tokens (nn.Embedding, *optional*): token embedding to reuse; a new one is created when omitted.
    """

    def __init__(self, config: LEDConfig, embed_tokens: Optional[nn.Embedding] = None):
        """Build the decoder sub-modules from `config`."""
        super().__init__(config)

        # Plain hyper-parameters copied from the configuration.
        self.dropout = config.dropout
        self.layerdrop = config.decoder_layerdrop
        self.padding_idx = config.pad_token_id
        self.max_target_positions = config.max_decoder_position_embeddings
        hidden_dim = config.d_model

        # Reuse the caller's embedding when given, otherwise allocate a fresh one.
        self.embed_tokens = (
            embed_tokens
            if embed_tokens is not None
            else nn.Embedding(config.vocab_size, hidden_dim, self.padding_idx)
        )
        self.embed_positions = LEDLearnedPositionalEmbedding(self.max_target_positions, hidden_dim)
        self.layers = nn.ModuleList([LEDDecoderLayer(config) for _ in range(config.decoder_layers)])
        self.layernorm_embedding = nn.LayerNorm(hidden_dim)

        # Gradient checkpointing is off by default.
        self.gradient_checkpointing = False

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        global_attention_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        head_mask=None,
        cross_attn_head_mask=None,
        past_key_values=None,
        inputs_embeds=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        """
        Decoder forward pass (signature only).

        NOTE(review): the method has no statements besides this docstring in this listing, so calling it returns
        `None`. The parameter notes below are inferred from the parameter names and must be confirmed against
        the real implementation.

        Args:
            input_ids: decoder token ids.
            attention_mask: attention mask for the decoder inputs.
            global_attention_mask: global-attention mask.
            encoder_hidden_states: hidden states produced by the encoder.
            encoder_attention_mask: attention mask for `encoder_hidden_states`.
            head_mask: mask over attention heads.
            cross_attn_head_mask: mask over cross-attention heads.
            past_key_values: cached key/value states.
            inputs_embeds: pre-computed input embeddings, presumably an alternative to `input_ids`.
            use_cache: whether to use cached key/value states.
            output_attentions: whether to return attention weights.
            output_hidden_states: whether to return hidden states.
            return_dict: whether to return a structured output object.
        """
        # The forward computation itself is not part of this listing.



@add_start_docstrings(
    "The bare LED Model outputting raw hidden-states without any specific head on top.",
    LED_START_DOCSTRING,
)
class LEDModel(LEDPreTrainedModel):
    _tied_weights_keys = ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight"]

    def __init__(self, config: LEDConfig):
        """Create one token embedding and hand the same instance to both the encoder and the decoder."""
        super().__init__(config)

        self.shared = nn.Embedding(
            num_embeddings=config.vocab_size,
            embedding_dim=config.d_model,
            padding_idx=config.pad_token_id,
        )

        # Encoder and decoder both receive `self.shared` as their token embedding.
        self.encoder = LEDEncoder(config, self.shared)
        self.decoder = LEDDecoder(config, self.shared)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """Return the shared token embedding."""
        return self.shared

    def set_input_embeddings(self, value):
        """Replace the shared token embedding and point the encoder and the decoder at the new one."""
        self.shared = value
        self.encoder.embed_tokens = self.decoder.embed_tokens = self.shared

    def get_encoder(self):
        """Return the encoder sub-module."""
        return self.encoder

    def get_decoder(self):
        """Return the decoder sub-module."""
        return self.decoder

    @add_start_docstrings_to_model_forward(LED_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=Seq2SeqModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        global_attention_mask=None,
        encoder_outputs=None,
        decoder_input_ids=None,
        decoder_attention_mask=None,
        decoder_past_key_values=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        """
        Model forward pass (signature only).

        NOTE(review): the method has no statements besides this docstring in this listing, so calling it returns
        `None`. The parameter notes below are inferred from the parameter names and must be confirmed against
        the real implementation.

        Args:
            input_ids: encoder token ids.
            attention_mask: attention mask for `input_ids`.
            global_attention_mask: global-attention mask.
            encoder_outputs: pre-computed encoder outputs.
            decoder_input_ids: decoder token ids.
            decoder_attention_mask: attention mask for `decoder_input_ids`.
            decoder_past_key_values: cached decoder key/value states.
            use_cache: whether to use cached key/value states.
            output_attentions: whether to return attention weights.
            output_hidden_states: whether to return hidden states.
            return_dict: whether to return a structured output object.
        """
        # The forward computation itself is not part of this listing.
    # NOTE(review): second `forward` signature for this class. It is cut off in this listing (no closing
    # parenthesis, no body). The per-parameter notes are inferred from the names and annotations only.
    def forward(
        self,
        # Token ids of the input sequence.
        input_ids: Optional[torch.LongTensor] = None,
        # Attention mask for `input_ids`.
        attention_mask: Optional[torch.Tensor] = None,
        # Token ids fed to the decoder.
        decoder_input_ids: Optional[torch.LongTensor] = None,
        # Attention mask for the decoder inputs.
        decoder_attention_mask: Optional[torch.LongTensor] = None,
        # Mask over the encoder attention heads -- presumably selects which heads stay active; TODO confirm.
        head_mask: Optional[torch.Tensor] = None,
        # Mask over the decoder attention heads.
        decoder_head_mask: Optional[torch.Tensor] = None,
        # Mask over the cross-attention heads.
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        # Pre-computed encoder outputs.
        encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        # Global-attention mask; `LEDEncoder._merge_to_attention_mask` treats 1 as global and 0 as local attention.
        global_attention_mask: Optional[torch.FloatTensor] = None,
        # Cached key/value states.
        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        # Pre-computed input embeddings, presumably an alternative to `input_ids` -- TODO confirm.
        inputs_embeds: Optional[torch.FloatTensor] = None,
        # Pre-computed decoder input embeddings, presumably an alternative to `decoder_input_ids` -- TODO confirm.
        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
        # Whether to use cached key/value states.
        use_cache: Optional[bool] = None,
        # Whether to return attention weights.
        output_attentions: Optional[bool] = None,
        # Whether to return hidden states.
        output_hidden_states: Optional[bool] = None,
        # Whether to return a structured output object.
        return_dict: Optional[bool] = None,
# LED with a language-modeling head on top of `LEDModel`; the class docstring says it can be used for summarization.
@add_start_docstrings(
    "The LED Model with a language modeling head. Can be used for summarization.", LED_START_DOCSTRING
)
class LEDForConditionalGeneration(LEDPreTrainedModel):
    # Attribute name under which the base model is stored (`self.led`).
    base_model_prefix = "led"
    # Keys listed here are presumably not reported as missing when loading a checkpoint -- TODO confirm.
    _keys_to_ignore_on_load_missing = ["final_logits_bias"]
    # Parameters declared as tied (sharing weights).
    _tied_weights_keys = ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight", "lm_head.weight"]

    def __init__(self, config: LEDConfig):
        """Build the base `LEDModel`, a zero `final_logits_bias` buffer and a bias-free LM head."""
        super().__init__(config)
        self.led = LEDModel(config)
        # Both the bias buffer and the head are sized after the shared token embedding.
        vocab_size = self.led.shared.num_embeddings
        self.register_buffer("final_logits_bias", torch.zeros((1, vocab_size)))
        self.lm_head = nn.Linear(config.d_model, vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_encoder(self):
        """Return the encoder of the wrapped `LEDModel`."""
        return self.led.get_encoder()

    def get_decoder(self):
        """Return the decoder of the wrapped `LEDModel`."""
        return self.led.get_decoder()

    def resize_token_embeddings(self, new_num_tokens: int, pad_to_multiple_of: Optional[int] = None) -> nn.Embedding:
        """
        Resize the token embeddings through the parent class and resize `final_logits_bias` to match.

        Returns:
            The embedding returned by the parent implementation.
        """
        new_embeddings = super().resize_token_embeddings(new_num_tokens, pad_to_multiple_of)
        # The bias is sized from the resized embedding rather than from `new_num_tokens`, because
        # `pad_to_multiple_of` is forwarded to the parent and may affect the final row count.
        self._resize_final_logits_bias(new_embeddings.weight.shape[0])
        return new_embeddings

    def _resize_final_logits_bias(self, new_num_tokens: int) -> None:
        """Truncate `final_logits_bias` to `new_num_tokens` columns, or extend it with zeros."""
        old_num_tokens = self.final_logits_bias.shape[-1]
        if new_num_tokens <= old_num_tokens:
            new_bias = self.final_logits_bias[:, :new_num_tokens]
        else:
            extra_bias = torch.zeros((1, new_num_tokens - old_num_tokens), device=self.final_logits_bias.device)
            new_bias = torch.cat([self.final_logits_bias, extra_bias], dim=1)
        # Re-register so the buffer keeps its name while pointing at the resized tensor.
        self.register_buffer("final_logits_bias", new_bias)

    def get_output_embeddings(self):
        """Return the LM head."""
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        """Replace the LM head."""
        self.lm_head = new_embeddings

    @add_start_docstrings_to_model_forward(LED_INPUTS_DOCSTRING)
    # Fills the bare `Returns:` line of the docstring with the documentation of `Seq2SeqLMOutput`.
    @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
    # Appends the summarization example to the docstring.
    @add_end_docstrings(LED_GENERATION_EXAMPLE)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        decoder_head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        global_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        r"""
        NOTE(review): in this listing the body is the same code as `prepare_inputs_for_generation`. It trims
        `decoder_input_ids` to its last position when `past_key_values` is given and returns a dict of generation
        inputs; it never calls `self.led` or `self.lm_head`, and `labels` is unused. The actual forward computation
        appears to be missing -- confirm against the upstream implementation before relying on this method.

        Returns:

        """
        # With a cache only the newest decoder token is kept.
        if past_key_values is not None:
            decoder_input_ids = decoder_input_ids[:, -1:]

        return {
            "input_ids": None,  # encoder_outputs is defined. input_ids not needed
            "encoder_outputs": encoder_outputs,
            "past_key_values": past_key_values,
            "decoder_input_ids": decoder_input_ids,
            "attention_mask": attention_mask,
            "global_attention_mask": global_attention_mask,
            "head_mask": head_mask,
            "decoder_head_mask": decoder_head_mask,
            "cross_attn_head_mask": cross_attn_head_mask,
            "use_cache": use_cache,  # change this to avoid caching (presumably for debugging)
        }

    def prepare_inputs_for_generation(
        self,
        decoder_input_ids,
        past_key_values=None,
        attention_mask=None,
        global_attention_mask=None,
        head_mask=None,
        decoder_head_mask=None,
        cross_attn_head_mask=None,
        use_cache=None,
        encoder_outputs=None,
        **kwargs,
    ):
        """
        Assemble the keyword arguments for one generation step.

        When `past_key_values` is given, `decoder_input_ids` is cut down to its last position. `input_ids` is
        always passed as `None`; `encoder_outputs` is forwarded instead. Extra `kwargs` are ignored.

        Returns:
            dict mapping argument names to the values to use for the step.
        """
        # With a cache only the newest decoder token is kept.
        if past_key_values is not None:
            decoder_input_ids = decoder_input_ids[:, -1:]

        return {
            "input_ids": None,  # encoder_outputs is defined. input_ids not needed
            "encoder_outputs": encoder_outputs,
            "past_key_values": past_key_values,
            "decoder_input_ids": decoder_input_ids,
            "attention_mask": attention_mask,
            "global_attention_mask": global_attention_mask,
            "head_mask": head_mask,
            "decoder_head_mask": decoder_head_mask,
            "cross_attn_head_mask": cross_attn_head_mask,
            "use_cache": use_cache,  # change this to avoid caching (presumably for debugging)
        }

    def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
        """
        Derive decoder input ids from `labels` with `shift_tokens_right`, using the pad and decoder-start token
        ids of `self.config`.

        This is an instance method because it reads `self.config`; declared as a static method, `self` is not
        defined and the call fails with `NameError`.
        """
        return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id)

    @staticmethod
    def _reorder_cache(past_key_values, beam_idx):
        """
        Reorder the cached states along dimension 0 according to `beam_idx`.

        Only the first two entries of each layer's cache are re-indexed; the remaining entries are kept as they
        are (cached cross-attention states do not have to be reordered -> they are always the same).

        Returns:
            Tuple with one tuple of states per layer.
        """
        return tuple(
            tuple(state.index_select(0, beam_idx.to(state.device)) for state in layer_past[:2]) + layer_past[2:]
            for layer_past in past_key_values
        )
@add_start_docstrings(
    """
    LED model with a sequence classification/head on top (a linear layer on top of the pooled output) e.g. for GLUE
    tasks.
    """,
    LED_START_DOCSTRING,
)
class LEDForSequenceClassification(LEDPreTrainedModel):
    _tied_weights_keys = ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight"]

    def __init__(self, config: LEDConfig, **kwargs):
        """
        Emit a `FutureWarning` announcing the deprecation of this class, then build the base `LEDModel` and the
        classification head. Extra `kwargs` are forwarded to the parent constructor.
        """
        # The warning is raised before anything else is constructed.
        deprecation_message = (
            "The `transformers.LEDForSequenceClassification` class is deprecated and will be removed in version 5 of"
            " Transformers. No actual method were provided in the original paper on how to perfom"
            " sequence classification."
        )
        warnings.warn(deprecation_message, FutureWarning)

        super().__init__(config, **kwargs)
        self.led = LEDModel(config)

        # The head receives `config.d_model` twice, then the label count and the classifier dropout.
        model_dim = config.d_model
        self.classification_head = LEDClassificationHead(
            model_dim,
            model_dim,
            config.num_labels,
            config.classifier_dropout,
        )

        # Initialize weights and apply final processing
        self.post_init()

    # NOTE(review): the signature below is cut off in this listing -- the closing parenthesis and the method body
    # are missing, so only the parameters and their annotations are visible.
    @add_start_docstrings_to_model_forward(LED_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=Seq2SeqSequenceClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        decoder_head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        global_attention_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,



@add_start_docstrings(
    """
    LED Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layer
    on top of the hidden-states output to compute `span start logits` and `span end logits`).
    """,
    LED_START_DOCSTRING,
)
class LEDForQuestionAnswering(LEDPreTrainedModel):
    _tied_weights_keys = ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight"]

    def __init__(self, config):
        """
        Build the base `LEDModel` and a linear layer with two outputs.

        Note that `config.num_labels` is overwritten with 2 on the object passed in.
        """
        super().__init__(config)

        # Force two labels on the configuration, then mirror the value on the instance.
        config.num_labels = 2
        self.num_labels = config.num_labels

        self.led = LEDModel(config)

        # Projects from `config.hidden_size` to `config.num_labels` outputs.
        self.qa_outputs = nn.Linear(in_features=config.hidden_size, out_features=config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    # Forward pass of the question-answering model.
    # NOTE(review): the signature below is cut off in this listing -- the closing parenthesis and the method body
    # are missing, so only the parameters and their annotations are visible.
    @add_start_docstrings_to_model_forward(LED_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=Seq2SeqQuestionAnsweringModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        decoder_head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        global_attention_mask: Optional[torch.FloatTensor] = None,
        start_positions: Optional[torch.LongTensor] = None,
        end_positions: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,

`.\models\led\modeling_tf_led.py`

# coding=utf-8
# 版权声明
#
# 根据 Apache 许可证版本 2.0（"许可证"）授权；除非符合许可证的条款，否则不得使用此文件。
# 您可以在以下网址获取许可证副本：
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# 除非适用法律要求或书面同意，否则本软件根据"原样"分发，无任何明示或暗示的担保或条件。
# 有关详细信息，请参阅许可证。
""" TF 2.0 LED 模型。"""

from __future__ import annotations

import random
from dataclasses import dataclass
from typing import List, Optional, Tuple, Union

import numpy as np
import tensorflow as tf

from ...activations_tf import get_tf_activation
from ...modeling_tf_outputs import TFBaseModelOutputWithPastAndCrossAttentions

# Public API
from ...modeling_tf_utils import (
    TFModelInputType,
    TFPreTrainedModel,
    get_initializer,
    keras,
    keras_serializable,
    unpack_inputs,
)
from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax
from ...utils import (
    ModelOutput,
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
from .configuration_led import LEDConfig

# Module-level logger.
logger = logging.get_logger(__name__)

# Checkpoint and configuration names, presumably consumed by the docstring decorators imported above -- TODO confirm.
_CHECKPOINT_FOR_DOC = "allenai/led-base-16384"
_CONFIG_FOR_DOC = "LEDConfig"

# Large negative fill value; `_make_causal_mask` below uses it for the mask entries that are not set to 0.
LARGE_NEGATIVE = -1e8

# 从 transformers.models.bart.modeling_tf_bart.shift_tokens_right 复制而来
def shift_tokens_right(input_ids: tf.Tensor, pad_token_id: int, decoder_start_token_id: int):
    """
    Shift `input_ids` one position to the right and prepend `decoder_start_token_id`.

    The last token of every row is dropped. Any `-100` left in the shifted result is replaced by
    `pad_token_id`, and the result is asserted to contain only values greater than or equal to 0.
    """
    ids_dtype = input_ids.dtype
    pad_token_id = tf.cast(pad_token_id, ids_dtype)
    decoder_start_token_id = tf.cast(decoder_start_token_id, ids_dtype)

    # One start token per row: shape (batch_size, 1).
    batch_size = shape_list(input_ids)[0]
    first_column = tf.fill((batch_size, 1), tf.convert_to_tensor(decoder_start_token_id, ids_dtype))

    # Drop the last token of every row and put the start token in front.
    shifted = tf.concat([first_column, input_ids[:, :-1]], -1)

    # Positions holding -100 are overwritten with the pad token id.
    pad_filler = tf.fill(shape_list(shifted), tf.convert_to_tensor(pad_token_id, ids_dtype))
    shifted = tf.where(shifted == -100, pad_filler, shifted)

    # Route the result through an identity op under a control dependency so the check is actually run.
    non_negative_check = tf.debugging.assert_greater_equal(shifted, tf.constant(0, dtype=ids_dtype))
    with tf.control_dependencies([non_negative_check]):
        shifted = tf.identity(shifted)

    return shifted


# 从 transformers.models.bart.modeling_tf_bart._make_causal_mask 复制而来
def _make_causal_mask(input_ids_shape: tf.TensorShape, past_key_values_length: int = 0):
    """
    Build an additive causal mask of shape `(bsz, 1, tgt_len, tgt_len + past_key_values_length)`.

    Entries are `0.0` where attention is allowed and `LARGE_NEGATIVE` everywhere else.
    """
    bsz, tgt_len = input_ids_shape[0], input_ids_shape[1]

    # Start from a fully blocked square mask.
    blocked = tf.ones((tgt_len, tgt_len)) * LARGE_NEGATIVE
    positions = tf.range(shape_list(blocked)[-1])

    # Column vector of `position + 1`: row i may see column j whenever j < i + 1.
    row_limits = tf.reshape(positions + 1, (shape_list(blocked)[-1], 1))
    causal = tf.where(positions < row_limits, 0.0, blocked)

    if past_key_values_length > 0:
        # Prepend zero (i.e. unmasked) columns for the cached positions.
        visible_past = tf.zeros((tgt_len, past_key_values_length))
        causal = tf.concat([visible_past, causal], axis=-1)

    # Add two leading axes and repeat along the batch axis.
    return tf.tile(causal[None, None, :, :], (bsz, 1, 1, 1))
# 从transformers.models.bart.modeling_tf_bart._expand_mask复制而来的函数
def _expand_mask(mask: tf.Tensor, tgt_len: Optional[int] = None):
    """
    Expand an attention mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.

    The result is `(1 - mask) * LARGE_NEGATIVE`: entries equal to 1 in `mask` become 0 and entries equal
    to 0 become `LARGE_NEGATIVE`. When `tgt_len` is `None` the source length is used for it.
    """
    src_len = shape_list(mask)[1]
    if tgt_len is None:
        tgt_len = src_len

    # The constant's dtype decides the dtype of the returned mask.
    unit = tf.constant(1.0)
    float_mask = tf.cast(mask, dtype=unit.dtype)

    # Insert the two middle axes, then repeat along the target-length axis.
    tiled = tf.tile(float_mask[:, None, None, :], (1, 1, tgt_len, 1))

    return (unit - tiled) * LARGE_NEGATIVE


class TFLEDLearnedPositionalEmbedding(keras.layers.Embedding):
    """
    Embedding layer that learns position embeddings for up to `num_embeddings` positions.
    """

    def __init__(self, num_embeddings: int, embedding_dim: int, **kwargs):
        super().__init__(num_embeddings, embedding_dim, **kwargs)

    def call(self, input_shape: tf.TensorShape, past_key_values_length: int = 0):
        """
        Look up the embeddings of positions `past_key_values_length` to `past_key_values_length + seq_len - 1`.

        `input_shape` is expected to describe a `[bsz x seqlen]` tensor; only its second entry is read.
        """
        seq_len = input_shape[1]
        # Positions 0..seq_len-1, offset by the number of already-cached positions.
        offset_positions = tf.range(seq_len, delta=1, name="range") + past_key_values_length

        return super().call(tf.cast(offset_positions, dtype=tf.int32))


# 从transformers.models.longformer.modeling_tf_longformer.TFLongformerSelfAttention复制而来，将TFLongformer改为TFLEDEncoder
class TFLEDEncoderSelfAttention(keras.layers.Layer):
    # 初始化函数，接受配置、层ID等参数，并调用父类的初始化方法
    def __init__(self, config, layer_id, **kwargs):
        """
        Set up the local and global projections, the dropouts and the window size for one encoder layer.

        Args:
            config: Model configuration. Read here: `hidden_size`, `num_attention_heads`,
                `initializer_range`, `attention_probs_dropout_prob` and `attention_window`.
            layer_id: Index of this layer; selects the entry of `config.attention_window` to use.
            **kwargs: Forwarded to the parent layer constructor.

        Raises:
            ValueError: If `config.hidden_size` is not a multiple of `config.num_attention_heads`.
            AssertionError: If the attention window of this layer is odd or not positive.
        """
        super().__init__(**kwargs)
        self.config = config

        if config.hidden_size % config.num_attention_heads != 0:
            raise ValueError(
                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
                f"heads ({config.num_attention_heads})"
            )

        self.num_heads = config.num_attention_heads
        # Exact integer division; divisibility was checked just above.
        self.head_dim = config.hidden_size // config.num_attention_heads
        self.embed_dim = config.hidden_size

        def make_projection(name):
            # Every projection maps to `embed_dim` and shares the same initializer settings.
            return keras.layers.Dense(
                self.embed_dim,
                kernel_initializer=get_initializer(config.initializer_range),
                name=name,
            )

        # Projections used for the windowed (local) attention.
        self.query = make_projection("query")
        self.key = make_projection("key")
        self.value = make_projection("value")

        # Separate projection layers for tokens with global attention.
        self.query_global = make_projection("query_global")
        self.key_global = make_projection("key_global")
        self.value_global = make_projection("value_global")

        # Two dropout layers sharing the same rate.
        self.dropout = keras.layers.Dropout(config.attention_probs_dropout_prob)
        self.global_dropout = keras.layers.Dropout(config.attention_probs_dropout_prob)

        self.layer_id = layer_id

        attention_window = config.attention_window[self.layer_id]

        assert (
            attention_window % 2 == 0
        ), f"`attention_window` for layer {self.layer_id} has to be an even value. Given {attention_window}"
        assert (
            attention_window > 0
        ), f"`attention_window` for layer {self.layer_id} has to be positive. Given {attention_window}"

        # Half of the (even) window: the number of positions covered on each side.
        self.one_sided_attn_window_size = attention_window // 2
    # 如果模型尚未构建，则构建查询(query_global)、键(key_global)和值(value_global)的全局作用域
    def build(self, input_shape=None):
        """
        Build the projection sub-layers of this attention layer.

        `input_shape` is accepted but not read; every shape passed to the sub-layers is derived from
        `self.config.hidden_size`. Calling this method again after the layer is built does nothing.
        """
        if not self.built:
            # First pass: build the three global projections under fixed name scopes.
            with tf.name_scope("query_global"):
                self.query_global.build((self.config.hidden_size,))
            with tf.name_scope("key_global"):
                self.key_global.build((self.config.hidden_size,))
            with tf.name_scope("value_global"):
                self.value_global.build((self.config.hidden_size,))

        # Nothing more to do when this layer was already built.
        if self.built:
            return
        # Mark the layer as built before building the sub-layers.
        self.built = True

        # Build each projection that is set, under a name scope equal to the sub-layer's own name.
        if getattr(self, "query", None) is not None:
            with tf.name_scope(self.query.name):
                self.query.build([None, None, self.config.hidden_size])
        if getattr(self, "key", None) is not None:
            with tf.name_scope(self.key.name):
                self.key.build([None, None, self.config.hidden_size])
        if getattr(self, "value", None) is not None:
            with tf.name_scope(self.value.name):
                self.value.build([None, None, self.config.hidden_size])
        # NOTE(review): the three global projections were already passed to `build` above with a
        # different input shape; whether this second call has any effect is not visible here -- confirm.
        if getattr(self, "query_global", None) is not None:
            with tf.name_scope(self.query_global.name):
                self.query_global.build([None, None, self.config.hidden_size])
        if getattr(self, "key_global", None) is not None:
            with tf.name_scope(self.key_global.name):
                self.key_global.build([None, None, self.config.hidden_size])
        if getattr(self, "value_global", None) is not None:
            with tf.name_scope(self.value_global.name):
                self.value_global.build([None, None, self.config.hidden_size])

    @staticmethod
    def _mask_invalid_locations(input_tensor, window_overlap):
        """
        Overwrite two opposite triangular corner regions of `input_tensor` with `-inf`.

        The regions are laid out over axes 1 and 3 of `input_tensor` and repeated over axis 0; axis 2 is
        covered through broadcasting. All other entries are returned unchanged.
        """
        dims = shape_list(input_tensor)

        # Lower-triangular ones of shape (window_overlap, window_overlap + 1), flipped along the rows.
        corner = tf.reverse(
            tf.linalg.band_part(tf.ones(shape=(window_overlap, window_overlap + 1)), -1, 0),
            axis=[0],
        )

        # Grow the corner pattern with zeros to the full (dims[1], dims[3]) extent.
        grow_by = tf.convert_to_tensor([[0, dims[1] - window_overlap], [0, dims[3] - window_overlap - 1]])
        top_left = tf.pad(corner, grow_by)

        # Adding the pattern flipped along both axes marks the opposite corner as well.
        both_corners = top_left + tf.reverse(top_left, axis=[0, 1])

        # Lift the 2D marker to 4D and repeat it for every entry along axis 0.
        marker = tf.tile(both_corners[None, :, None, :], (dims[0], 1, 1, 1))

        minus_inf = -float("inf") * tf.ones_like(input_tensor)

        return tf.where(tf.math.greater(marker, 0), minus_inf, input_tensor)
    def _sliding_chunks_matmul_attn_probs_value(self, attn_probs, value, window_overlap):
        """
        Same as _sliding_chunks_query_key_matmul but for attn_probs and value tensors.

        Args:
            attn_probs: tensor whose first three dims match those of `value` and whose last dim is
                `2 * window_overlap + 1`.
            value: tensor of shape `(batch_size, seq_len, num_heads, head_dim)`; `seq_len` has to be a
                multiple of `2 * window_overlap`.
            window_overlap: one-sided window size.

        Returns:
            Tensor of shape `(batch_size, seq_len, num_heads, head_dim)`.
        """
        batch_size, seq_len, num_heads, head_dim = shape_list(value)
        probs_dims = shape_list(attn_probs)
        band_width = 2 * window_overlap + 1
        flat_heads = batch_size * num_heads

        # Input validation.
        tf.debugging.assert_equal(
            seq_len % (window_overlap * 2), 0, message="Seq_len has to be multiple of 2 * window_overlap"
        )
        tf.debugging.assert_equal(
            probs_dims[:3],
            [batch_size, seq_len, num_heads],
            message="value and attn_probs must have same dims (except head_dim)",
        )
        tf.debugging.assert_equal(
            probs_dims[3],
            band_width,
            message="attn_probs last dim has to be 2 * window_overlap + 1",
        )

        # The sequence is cut into `seq_len // window_overlap` chunks; this is that count minus one.
        chunks_count = seq_len // window_overlap - 1

        # Fold heads into the leading axis and cut the probabilities into chunks of `window_overlap` rows.
        probs_heads_first = tf.transpose(attn_probs, (0, 2, 1, 3))
        chunked_probs = tf.reshape(
            probs_heads_first,
            (flat_heads, seq_len // window_overlap, window_overlap, band_width),
        )

        # Fold heads into the leading axis for the values as well.
        value_heads_first = tf.transpose(value, (0, 2, 1, 3))
        flat_value = tf.reshape(value_heads_first, (flat_heads, seq_len, head_dim))

        # Pad `window_overlap` positions, filled with -1, on both ends of the sequence axis.
        seq_padding = tf.convert_to_tensor([[0, 0], [window_overlap, window_overlap], [0, 0]])
        padded_value = tf.pad(flat_value, seq_padding, constant_values=-1)

        # Slice the flattened padded values into overlapping frames of 3 * window_overlap positions.
        frame_size = 3 * window_overlap * head_dim
        frame_hop_size = (shape_list(padded_value)[1] * head_dim - frame_size) // chunks_count
        frames = tf.signal.frame(
            tf.reshape(padded_value, (flat_heads, -1)),
            frame_size,
            frame_hop_size,
        )
        expected_chunk_shape = [flat_heads, chunks_count + 1, 3 * window_overlap, head_dim]
        chunked_value = tf.reshape(frames, tuple(expected_chunk_shape))

        tf.debugging.assert_equal(
            shape_list(chunked_value),
            expected_chunk_shape,
            message="Chunked value has the wrong shape",
        )

        # Skew the probabilities so they line up with the framed values, then contract per chunk.
        skewed_probs = self._pad_and_diagonalize(chunked_probs)
        context = tf.einsum("bcwd,bcdh->bcwh", skewed_probs, chunked_value)

        # Unfold the heads again and move the sequence axis in front of the head axis.
        context = tf.reshape(context, (batch_size, num_heads, seq_len, head_dim))
        return tf.transpose(context, (0, 2, 1, 3))
    @staticmethod
    def _pad_and_transpose_last_two_dims(hidden_states_padded, paddings):
        """
        Pad `hidden_states_padded` and reshape it so that the sizes of its last two dims are swapped.

        Declared as a static method (like the sibling helpers `_pad_and_diagonalize` and `_chunk`): the
        function takes no `self`, so without the decorator a call through an instance would bind the
        instance to `hidden_states_padded`.

        Args:
            hidden_states_padded: tensor whose `shape_list`, after padding, has four entries.
            paddings: per-dimension padding amounts, passed to `tf.pad` unchanged.

        Returns:
            The padded tensor reshaped from `(batch_size, chunk_size, seq_length, hidden_dim)` to
            `(batch_size, chunk_size, hidden_dim, seq_length)`. Note that this is a `tf.reshape`, not a
            `tf.transpose`: the elements keep their flat order.
        """
        # No `constant_values` is given, so tf.pad's default fill value is used.
        hidden_states_padded = tf.pad(hidden_states_padded, paddings)
        batch_size, chunk_size, seq_length, hidden_dim = shape_list(hidden_states_padded)
        hidden_states_padded = tf.reshape(hidden_states_padded, (batch_size, chunk_size, hidden_dim, seq_length))

        return hidden_states_padded

    @staticmethod
    def _pad_and_diagonalize(chunked_hidden_states):
        """
        Shift every row one step further to the right than the row before it, turning columns into diagonals.

        Args:
            chunked_hidden_states: tensor of shape `(total_num_heads, num_chunks, window_overlap, hidden_dim)`.

        Returns:
            Tensor of shape `(total_num_heads, num_chunks, window_overlap, window_overlap + hidden_dim - 1)`.
        """
        total_num_heads, num_chunks, window_overlap, hidden_dim = shape_list(chunked_hidden_states)

        # Widen every row by `window_overlap + 1` columns: row length becomes hidden_dim + window_overlap + 1.
        right_pad = tf.convert_to_tensor([[0, 0], [0, 0], [0, 0], [0, window_overlap + 1]])
        widened = tf.pad(chunked_hidden_states, right_pad)

        # Flatten the rows: window_overlap * (hidden_dim + window_overlap + 1) elements per chunk.
        flattened = tf.reshape(widened, (total_num_heads, num_chunks, -1))

        # Drop the last `window_overlap` elements: window_overlap * (hidden_dim + window_overlap) remain.
        trimmed = flattened[:, :, :-window_overlap]

        # Re-cut into rows that are one element shorter than the widened rows, so the content of
        # row i ends up i positions further right.
        skewed = tf.reshape(
            trimmed,
            (total_num_heads, num_chunks, window_overlap, window_overlap + hidden_dim),
        )

        # Remove the last column.
        return skewed[:, :, :, :-1]

    @staticmethod
    def _chunk(hidden_states, window_overlap):
        """
        Convert `hidden_states` into overlapping chunks. Chunk size = 2w, overlap size = w.

        Args:
            hidden_states: tensor of shape `(batch_size, seq_length, hidden_dim)`.
            window_overlap: `w`, the overlap between two consecutive chunks.

        Returns:
            Tensor of shape `(batch_size, num_output_chunks, 2 * window_overlap, hidden_dim)` where
            `num_output_chunks = 2 * (seq_length // (2 * window_overlap)) - 1`.
        """
        batch_size, seq_length, hidden_dim = shape_list(hidden_states)
        num_output_chunks = 2 * (seq_length // (2 * window_overlap)) - 1

        # Frame size and hop are expressed in flattened elements: a frame holds 2w positions and
        # consecutive frames start w positions apart.
        frame_hop_size = window_overlap * hidden_dim
        frame_size = 2 * frame_hop_size
        hidden_states = tf.reshape(hidden_states, (batch_size, seq_length * hidden_dim))

        # Cut the flattened sequence into overlapping frames.
        chunked_hidden_states = tf.signal.frame(hidden_states, frame_size, frame_hop_size)

        # The message reports the expected shape in the same order as the comparison below.
        tf.debugging.assert_equal(
            shape_list(chunked_hidden_states),
            [batch_size, num_output_chunks, frame_size],
            message=(
                "Make sure chunking is correctly applied. Chunked hidden states should have output dimension"
                f" {[batch_size, num_output_chunks, frame_size]}, but got {shape_list(chunked_hidden_states)}."
            ),
        )

        # Split every frame back into (positions, hidden_dim).
        chunked_hidden_states = tf.reshape(
            chunked_hidden_states,
            (batch_size, num_output_chunks, 2 * window_overlap, hidden_dim),
        )

        return chunked_hidden_states

    @staticmethod
    def _get_global_attn_indices(is_index_global_attn):
        """
        Compute the global attention indices required throughout the forward pass.

        Returns:
            A 4-tuple of
            - the largest number of non-zero entries found in any row of `is_index_global_attn`,
            - the indices of all non-zero entries of `is_index_global_attn`,
            - the indices of the used slots in a `(rows, max count)` grid (slot < row count),
            - the indices of the unused slots in that grid.
        """
        # Non-zero count per row, cast to the dtype of a default integer constant.
        per_row_count = tf.math.count_nonzero(is_index_global_attn, axis=1)
        per_row_count = tf.cast(per_row_count, dtype=tf.constant(1).dtype)

        max_num_global_attn_indices = tf.reduce_max(per_row_count)

        global_positions = tf.where(is_index_global_attn)

        # slot_in_use[r, s] is True when slot s is below the count of row r.
        slot_ids = tf.range(max_num_global_attn_indices)
        slot_in_use = slot_ids < tf.expand_dims(per_row_count, axis=-1)

        used_slots = tf.where(slot_in_use)
        unused_slots = tf.where(tf.math.logical_not(slot_in_use))

        return (
            max_num_global_attn_indices,
            global_positions,
            used_slots,
            unused_slots,
        )

    def _concat_with_global_key_attn_probs(
        self,
        attn_scores,
        key_vectors,
        query_vectors,
        max_num_global_attn_indices,
        is_index_global_attn_nonzero,
        is_local_index_global_attn_nonzero,
        is_local_index_no_global_attn_nonzero,
    ):
        """
        Prepend the scores of every query against the global keys to `attn_scores`.

        The global keys are packed into `max_num_global_attn_indices` slots per batch entry; the scores
        of unused slots are set to -10000.0. The result is `attn_scores` with
        `max_num_global_attn_indices` extra columns in front of its last axis.
        """
        batch_size = shape_list(key_vectors)[0]

        # Pick the key vectors at the global positions and pack them into the per-row slots.
        selected_keys = tf.gather_nd(key_vectors, is_index_global_attn_nonzero)
        packed_shape = (
            batch_size,
            max_num_global_attn_indices,
            self.num_heads,
            self.head_dim,
        )
        packed_keys = tf.scatter_nd(is_local_index_global_attn_nonzero, selected_keys, shape=packed_shape)

        # (batch_size, seq_len, num_heads, max_num_global_attn_indices)
        scores_vs_global = tf.einsum("blhd,bshd->blhs", query_vectors, packed_keys)

        # Bring the slot axis next to the batch axis so (batch, slot) index pairs can address it:
        # (batch_size, max_num_global_attn_indices, seq_len, num_heads)
        slots_first = tf.transpose(scores_vs_global, (0, 3, 1, 2))

        # One (seq_len, num_heads) block of -10000.0 for every unused slot.
        num_unused = shape_list(is_local_index_no_global_attn_nonzero)[0]
        fill_shape = (num_unused,) + tuple(shape_list(slots_first)[-2:])
        fill_values = tf.cast(tf.ones(fill_shape) * -10000.0, dtype=slots_first.dtype)

        slots_first = tf.tensor_scatter_nd_update(
            slots_first,
            is_local_index_no_global_attn_nonzero,
            fill_values,
        )

        # Back to (batch_size, seq_len, num_heads, max_num_global_attn_indices).
        scores_vs_global = tf.transpose(slots_first, (0, 2, 3, 1))

        # Global columns go first, followed by the incoming scores.
        return tf.concat((scores_vs_global, attn_scores), axis=-1)

    def _compute_attn_output_with_global_indices(
        self,
        value_vectors,
        attn_probs,
        max_num_global_attn_indices,
        is_index_global_attn_nonzero,
        is_local_index_global_attn_nonzero,
    ):
        """
        Compute the attention output as the sum of a global part and a sliding-window part.

        The first `max_num_global_attn_indices` columns of `attn_probs` are applied to the packed global
        value vectors; the remaining columns go through `_sliding_chunks_matmul_attn_probs_value`.
        """
        batch_size = shape_list(attn_probs)[0]

        # Leading columns: probabilities assigned to the global slots.
        global_probs = attn_probs[:, :, :, :max_num_global_attn_indices]

        # Pick the value vectors at the global positions and pack them into the per-row slots.
        selected_values = tf.gather_nd(value_vectors, is_index_global_attn_nonzero)
        packed_shape = (
            batch_size,
            max_num_global_attn_indices,
            self.num_heads,
            self.head_dim,
        )
        packed_values = tf.scatter_nd(is_local_index_global_attn_nonzero, selected_values, shape=packed_shape)

        global_part = tf.einsum("blhs,bshd->blhd", global_probs, packed_values)

        # Remaining columns: the windowed probabilities.
        windowed_probs = attn_probs[:, :, :, max_num_global_attn_indices:]
        windowed_part = self._sliding_chunks_matmul_attn_probs_value(
            windowed_probs, value_vectors, self.one_sided_attn_window_size
        )

        return global_part + windowed_part

    def _compute_global_attn_output_from_hidden(
        self,
        attn_output,
        hidden_states,
        max_num_global_attn_indices,
        layer_head_mask,
        is_local_index_global_attn_nonzero,
        is_index_global_attn_nonzero,
        is_local_index_no_global_attn_nonzero,
        is_index_masked,
        training,
    def reshape_and_transpose(self, vector, batch_size):
        """
        Split `vector` into heads and fold the head axis into the leading axis.

        Returns a tensor of shape `(batch_size * num_heads, -1, head_dim)`.
        """
        # (batch_size, -1, num_heads, head_dim)
        per_head = tf.reshape(vector, (batch_size, -1, self.num_heads, self.head_dim))
        # (batch_size, num_heads, -1, head_dim)
        heads_first = tf.transpose(per_head, (0, 2, 1, 3))
        return tf.reshape(heads_first, (batch_size * self.num_heads, -1, self.head_dim))
class TFLEDEncoderAttention(keras.layers.Layer):
    """Encoder attention block: `TFLEDEncoderSelfAttention` followed by a dense output projection."""

    def __init__(self, config, layer_id, **kwargs):
        super().__init__(**kwargs)
        # Self-attention sub-layer for encoder layer `layer_id`.
        self.longformer_self_attn = TFLEDEncoderSelfAttention(config, layer_id=layer_id, name="longformer_self_attn")
        # Output projection with `config.d_model` units.
        self.output_dense = keras.layers.Dense(config.d_model, use_bias=True, name="output")
        self.config = config

    def call(self, inputs, training=False):
        """
        Run self-attention on the six packed inputs and project its first output.

        `inputs` is unpacked as `(hidden_states, attention_mask, layer_head_mask, is_index_masked,
        is_index_global_attn, is_global_attn)`. Returns a tuple whose first element is the projected
        attention output, followed by any further outputs of the self-attention layer.
        """
        (
            hidden_states,
            attention_mask,
            layer_head_mask,
            is_index_masked,
            is_index_global_attn,
            is_global_attn,
        ) = inputs

        attn_results = self.longformer_self_attn(
            [hidden_states, attention_mask, layer_head_mask, is_index_masked, is_index_global_attn, is_global_attn],
            training=training,
        )

        projected = self.output_dense(attn_results[0], training=training)

        return (projected,) + attn_results[1:]

    def build(self, input_shape=None):
        """Build both sub-layers once, each under a name scope equal to its own name."""
        if self.built:
            return
        self.built = True

        self_attn = getattr(self, "longformer_self_attn", None)
        if self_attn is not None:
            with tf.name_scope(self_attn.name):
                self_attn.build(None)

        projection = getattr(self, "output_dense", None)
        if projection is not None:
            with tf.name_scope(projection.name):
                projection.build([None, None, self.config.d_model])


class TFLEDDecoderAttention(keras.layers.Layer):
    """Multi-headed attention from "Attention Is All You Need"."""

    # 初始化解码器注意力层
    def __init__(
        self,
        embed_dim: int,
        num_heads: int,
        dropout: float = 0.0,
        is_decoder: bool = False,
        bias: bool = True,
        **kwargs,
    ):
        """
        Create the dropout layer and the four dense projections of the attention layer.

        Args:
            embed_dim: Size of the projections; must be divisible by `num_heads`.
            num_heads: Number of attention heads.
            dropout: Rate of the dropout layer.
            is_decoder: Stored on the instance as `self.is_decoder`.
            bias: Whether the dense projections use a bias.
            **kwargs: Forwarded to the parent layer constructor.
        """
        super().__init__(**kwargs)

        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.dropout = keras.layers.Dropout(dropout)

        # Per-head width, and the scaling factor head_dim ** -0.5 derived from it.
        self.head_dim = embed_dim // num_heads
        assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"
        self.scaling = self.head_dim**-0.5
        self.is_decoder = is_decoder

        # Key, query, value and output projections share size and bias setting.
        for proj_name in ("k_proj", "q_proj", "v_proj", "out_proj"):
            setattr(self, proj_name, keras.layers.Dense(embed_dim, use_bias=bias, name=proj_name))

    # 对张量进行形状变换
    def _shape(self, tensor: tf.Tensor, seq_len: int, bsz: int):
        """Reshape `tensor` to `(bsz, seq_len, num_heads, head_dim)` and swap the sequence and head axes."""
        split_heads = tf.reshape(tensor, (bsz, seq_len, self.num_heads, self.head_dim))
        return tf.transpose(split_heads, (0, 2, 1, 3))

    # 调用函数，用于前向传播
    def call(
        self,
        hidden_states: tf.Tensor,
        key_value_states: tf.Tensor | None = None,
        past_key_value: Tuple[Tuple[tf.Tensor]] | None = None,
        attention_mask: tf.Tensor | None = None,
        layer_head_mask: tf.Tensor | None = None,
        training=False,
    # 构建函数，用于构建模型的层次结构
    def build(self, input_shape=None):
        """
        Build the four projection layers once.

        `input_shape` is not read; every projection is built for inputs of shape
        `[None, None, self.embed_dim]`, under a name scope equal to the projection's own name.
        """
        # A second call is a no-op.
        if self.built:
            return
        self.built = True

        for attr_name in ("k_proj", "q_proj", "v_proj", "out_proj"):
            projection = getattr(self, attr_name, None)
            if projection is None:
                continue
            with tf.name_scope(projection.name):
                projection.build([None, None, self.embed_dim])
class TFLEDEncoderLayer(keras.layers.Layer):
    """Encoder layer: self-attention and a two-layer feed-forward block, each with residual + layer norm."""

    def __init__(self, config: LEDConfig, layer_id: int, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = config.d_model

        # Attention block and the layer norm applied after its residual connection.
        self.self_attn = TFLEDEncoderAttention(config, layer_id, name="self_attn")
        self.self_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm")

        # Dropout applied to the output of both sub-blocks.
        self.dropout = keras.layers.Dropout(config.dropout)

        # Feed-forward block: d_model -> encoder_ffn_dim -> d_model.
        self.activation_fn = get_tf_activation(config.activation_function)
        self.activation_dropout = keras.layers.Dropout(config.activation_dropout)
        self.fc1 = keras.layers.Dense(config.encoder_ffn_dim, name="fc1")
        self.fc2 = keras.layers.Dense(self.embed_dim, name="fc2")
        self.final_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")

        self.config = config

    def call(
        self,
        hidden_states: tf.Tensor,
        attention_mask: tf.Tensor,
        layer_head_mask: tf.Tensor,
        is_index_masked: tf.Tensor,
        is_index_global_attn: tf.Tensor,
        is_global_attn: bool,
        training=False,
    ):
        """
        Args:
            hidden_states (`tf.Tensor`): input to the layer of shape *(batch, seq_len, embed_dim)*
            attention_mask (`tf.Tensor`): attention mask of size *(batch, 1, tgt_len, src_len)* where padding
                elements are indicated by very large negative values.
            layer_head_mask (`tf.Tensor`): mask for attention heads in a given layer of size
                *(config.encoder_attention_heads,)*.

        Returns:
            A tuple whose first element is the new hidden states, followed by any further outputs of the
            attention block.
        """
        # --- self-attention sub-block ---
        attn_input = hidden_states
        attn_outputs = self.self_attn(
            [hidden_states, attention_mask, layer_head_mask, is_index_masked, is_index_global_attn, is_global_attn],
            training=training,
        )
        hidden_states = attn_outputs[0]

        # The attention block must not change the shape of its input.
        tf.debugging.assert_equal(
            shape_list(hidden_states),
            shape_list(attn_input),
            message=f"Self attn modified the shape of query {shape_list(attn_input)} to {shape_list(hidden_states)}",
        )

        hidden_states = attn_input + self.dropout(hidden_states, training=training)
        hidden_states = self.self_attn_layer_norm(hidden_states)

        # --- feed-forward sub-block ---
        ffn_input = hidden_states
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = self.activation_dropout(hidden_states, training=training)
        hidden_states = self.fc2(hidden_states)
        hidden_states = ffn_input + self.dropout(hidden_states, training=training)
        hidden_states = self.final_layer_norm(hidden_states)

        return (hidden_states,) + attn_outputs[1:]

    def build(self, input_shape=None):
        """Build every sub-layer once, each under a name scope equal to its own name."""
        if self.built:
            return
        self.built = True

        # (attribute name, shape passed to the sub-layer's build), in build order.
        build_plan = (
            ("self_attn", None),
            ("self_attn_layer_norm", [None, None, self.embed_dim]),
            ("fc1", [None, None, self.embed_dim]),
            ("fc2", [None, None, self.config.encoder_ffn_dim]),
            ("final_layer_norm", [None, None, self.embed_dim]),
        )
        for attr_name, build_shape in build_plan:
            sub_layer = getattr(self, attr_name, None)
            if sub_layer is None:
                continue
            with tf.name_scope(sub_layer.name):
                sub_layer.build(build_shape)
class TFLEDDecoderLayer(keras.layers.Layer):
    """One layer of the LED decoder.

    The layer owns a self-attention module, an encoder-attention module and a two-layer feed-forward
    network (`fc1`, `fc2`), plus one layer normalization for each of these three parts.

    Args:
        config (`LEDConfig`): Source of `d_model`, `decoder_attention_heads`, `attention_dropout`, `dropout`,
            `activation_function`, `activation_dropout` and `decoder_ffn_dim`.
        **kwargs: Passed through to `keras.layers.Layer.__init__`.
    """

    def __init__(self, config: LEDConfig, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = config.d_model

        # Self-attention module and the dropout / activation helpers.
        self.self_attn = TFLEDDecoderAttention(
            embed_dim=self.embed_dim,
            num_heads=config.decoder_attention_heads,
            dropout=config.attention_dropout,
            name="self_attn",
            is_decoder=True,
        )
        self.dropout = keras.layers.Dropout(config.dropout)
        self.activation_fn = get_tf_activation(config.activation_function)
        self.activation_dropout = keras.layers.Dropout(config.activation_dropout)
        self.self_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm")

        # Second attention module, registered under the name "encoder_attn", with its own layer norm.
        self.encoder_attn = TFLEDDecoderAttention(
            self.embed_dim,
            config.decoder_attention_heads,
            dropout=config.attention_dropout,
            name="encoder_attn",
            is_decoder=True,
        )
        self.encoder_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="encoder_attn_layer_norm")

        # Feed-forward network: d_model -> decoder_ffn_dim -> d_model, followed by the final layer norm.
        self.fc1 = keras.layers.Dense(config.decoder_ffn_dim, name="fc1")
        self.fc2 = keras.layers.Dense(self.embed_dim, name="fc2")
        self.final_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")

        # Kept because `build` reads `config.decoder_ffn_dim`.
        self.config = config

    def call(
        self,
        hidden_states,
        attention_mask: tf.Tensor | None = None,
        encoder_hidden_states: tf.Tensor | None = None,
        encoder_attention_mask: tf.Tensor | None = None,
        layer_head_mask: tf.Tensor | None = None,
        encoder_layer_head_mask: tf.Tensor | None = None,
        past_key_value: Tuple[tf.Tensor] | None = None,
        training=False,
        **kwargs
    ):
        """Forward pass of the decoder layer.

        Raises:
            NotImplementedError: Always. Only the signature of this method is present in this copy of the
                file; the body was left out.
        """
        # NOTE(review): the method previously ended right after its signature, which made the whole module
        # fail to parse. The real forward pass must be restored from the upstream implementation; until
        # then the method fails loudly instead of guessing at the computation.
        raise NotImplementedError(
            "TFLEDDecoderLayer.call is missing from this copy of the source; restore the upstream forward pass."
        )

    def build(self, input_shape=None):
        """Build every sub-layer inside its own name scope; later calls are no-ops.

        Args:
            input_shape: Accepted for signature compatibility; it is never read here.
        """
        if self.built:
            return
        self.built = True

        def _build_child(attr_name, make_shape):
            # A missing or `None` sub-layer is skipped; the shape is only computed for existing ones.
            child = getattr(self, attr_name, None)
            if child is None:
                return
            with tf.name_scope(child.name):
                child.build(make_shape())

        _build_child("self_attn", lambda: None)
        _build_child("self_attn_layer_norm", lambda: [None, None, self.embed_dim])
        _build_child("encoder_attn", lambda: None)
        _build_child("encoder_attn_layer_norm", lambda: [None, None, self.embed_dim])
        _build_child("fc1", lambda: [None, None, self.embed_dim])
        # fc2 receives the output of fc1, whose width is `decoder_ffn_dim`.
        _build_child("fc2", lambda: [None, None, self.config.decoder_ffn_dim])
        _build_child("final_layer_norm", lambda: [None, None, self.embed_dim])
# 定义 TFLEDPreTrainedModel 类，继承自 TFPreTrainedModel
class TFLEDPreTrainedModel(TFPreTrainedModel):
    """Common base class of the TF LED models.

    It fixes the configuration class and the base-model prefix, and extends the inherited input signature
    with a `global_attention_mask` entry.
    """

    config_class = LEDConfig
    base_model_prefix = "led"

    @property
    def input_signature(self):
        """Return the parent's input signature with a `global_attention_mask` spec added."""
        signature = super().input_signature
        global_mask_spec = tf.TensorSpec((None, None), tf.int32, name="global_attention_mask")
        signature["global_attention_mask"] = global_mask_spec
        return signature


# 使用 dataclass 装饰器定义 TFLEDEncoderBaseModelOutput 类
@dataclass
# The full class documentation is omitted here; per the original note this is a modified version of
# TFLongformerBaseModelOutput, used by TFLEDEncoder.
class TFLEDEncoderBaseModelOutput(ModelOutput):
    """
    Base class for Longformer's outputs, with potential hidden states, local and global attentions.
    """
    # Field `last_hidden_state`: a `tf.Tensor`, defaults to None.
    last_hidden_state: tf.Tensor = None

    # Field `hidden_states`: a tuple of tensors or None; returned when `output_hidden_states=True`.
    # Holds the hidden states produced at each layer plus the initial embedding output.
    hidden_states: Tuple[tf.Tensor, ...] | None = None

    # Field `attentions`: a tuple of tensors or None; returned when `output_attentions=True`.
    # Local attention weights of each layer, documented with shape
    # `(batch_size, num_heads, sequence_length, x + attention_window + 1)`.
    # These are the local attention weights after the attention softmax, used to compute the weighted
    # average in the self-attention heads.
    attentions: Tuple[tf.Tensor, ...] | None = None

    # Field `global_attentions`: a tuple of tensors or None; returned when `output_attentions=True`.
    # Global attention weights of each layer, documented with shape
    # `(batch_size, num_heads, sequence_length, x)`.
    # These are the global attention weights after the attention softmax, used to compute the weighted
    # average in the self-attention heads.
    global_attentions: Tuple[tf.Tensor, ...] | None = None
# 定义一个 TFLEDSeq2SeqModelOutput 类，继承自 ModelOutput 类，用于存储序列到序列模型的输出
@dataclass
class TFLEDSeq2SeqModelOutput(ModelOutput):
    """
    Base class for model encoder's outputs that also contains : pre-computed hidden states that can speed up sequential
    decoding.

    """

    # Last hidden state; a `tf.Tensor`, defaults to None.
    last_hidden_state: tf.Tensor = None
    # Cached past key/values; a list of tensors or None.
    past_key_values: List[tf.Tensor] | None = None
    # Decoder hidden states; a tuple of tensors or None.
    decoder_hidden_states: Tuple[tf.Tensor, ...] | None = None
    # Decoder attention weights; a tuple of tensors or None.
    decoder_attentions: Tuple[tf.Tensor, ...] | None = None
    # Cross-attention weights; a tuple of tensors or None.
    cross_attentions: Tuple[tf.Tensor, ...] | None = None
    # Last hidden state of the encoder; a tensor or None.
    encoder_last_hidden_state: tf.Tensor | None = None
    # Encoder hidden states; a tuple of tensors or None.
    encoder_hidden_states: Tuple[tf.Tensor, ...] | None = None
    # Encoder attention weights; a tuple of tensors or None.
    encoder_attentions: Tuple[tf.Tensor, ...] | None = None
    # Encoder global attention weights; a tuple of tensors or None.
    encoder_global_attentions: Tuple[tf.Tensor, ...] | None = None


# 定义一个 TFLEDSeq2SeqLMOutput 类，继承自 ModelOutput 类，用于存储序列到序列语言模型的输出
@dataclass
class TFLEDSeq2SeqLMOutput(ModelOutput):
    """
    Base class for sequence-to-sequence language models outputs.

    """

    # Loss; a tensor or None.
    loss: tf.Tensor | None = None
    # Prediction logits; a `tf.Tensor`, defaults to None.
    logits: tf.Tensor = None
    # Cached past key/values; a list of tensors or None.
    past_key_values: List[tf.Tensor] | None = None
    # Decoder hidden states; a tuple of tensors or None.
    decoder_hidden_states: Tuple[tf.Tensor, ...] | None = None
    # Decoder attention weights; a tuple of tensors or None.
    decoder_attentions: Tuple[tf.Tensor, ...] | None = None
    # Cross-attention weights; a tuple of tensors or None.
    cross_attentions: Tuple[tf.Tensor, ...] | None = None
    # Last hidden state of the encoder; a tensor or None.
    encoder_last_hidden_state: tf.Tensor | None = None
    # Encoder hidden states; a tuple of tensors or None.
    encoder_hidden_states: Tuple[tf.Tensor, ...] | None = None
    # Encoder attention weights; a tuple of tensors or None.
    encoder_attentions: Tuple[tf.Tensor, ...] | None = None
    # Encoder global attention weights; a tuple of tensors or None.
    encoder_global_attentions: Tuple[tf.Tensor, ...] | None = None


# Generic introduction shared by the LED model docstrings. It is a raw string and is emitted verbatim at
# runtime, so it must contain documentation text only: the annotation lines that had been injected into it
# were removed, the third bullet of the "three possibilities" list was restored, and the <Tip> block is
# now closed.
LED_START_DOCSTRING = r"""
    This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
    as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
    behavior.

    <Tip>

    TensorFlow models and layers in `transformers` accept two formats as input:

    - having all inputs as keyword arguments (like PyTorch models), or
    - having all inputs as a list, tuple or dict in the first positional argument.

    The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
    and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
    pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
    format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
    the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
    positional argument:

    - a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
    - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
    `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
    - a dictionary with one or several input Tensors associated to the input names given in the docstring:
    `model({"input_ids": input_ids, "token_type_ids": token_type_ids})`

    Note that when creating models and layers with
    [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
    about any of this, as you can just pass inputs like you would to any other Python function!

    </Tip>

    Args:
        config ([`LEDConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights.
"""
# Docstring for the model inputs; it carries no text in this copy of the file.
LED_INPUTS_DOCSTRING = r"""
"""

@keras_serializable
class TFLEDEncoder(keras.layers.Layer):
    """
    Transformer encoder consisting of *config.encoder_layers* self-attention layers. Each layer is a
    [`TFLEDEncoderLayer`].

    Args:
        config: LEDConfig
        embed_tokens (`keras.layers.Embedding`, *optional*):
            Embedding layer stored as `self.embed_tokens`; `_pad_to_window_size` uses it to embed padding ids.
    """

    config_class = LEDConfig

    def __init__(self, config: LEDConfig, embed_tokens: Optional[keras.layers.Embedding] = None, **kwargs):
        super().__init__(**kwargs)
        self.config = config
        self.dropout = keras.layers.Dropout(config.dropout)
        # Layerdrop is not applied: a non-zero setting only triggers a warning and the rate is pinned to 0.
        if config.encoder_layerdrop > 0:
            logger.warning("Layerdrop is currently disabled in TFLED models.")
        self.layerdrop = 0.0
        self.padding_idx = config.pad_token_id

        # A single integer window is validated and then expanded IN PLACE on the config to one value per
        # layer; a sequence must already provide exactly one value per layer.
        if isinstance(config.attention_window, int):
            assert config.attention_window % 2 == 0, "`config.attention_window` has to be an even value"
            assert config.attention_window > 0, "`config.attention_window` has to be positive"
            config.attention_window = [config.attention_window] * config.num_hidden_layers  # one value per layer
        else:
            assert len(config.attention_window) == config.num_hidden_layers, (
                "`len(config.attention_window)` should equal `config.num_hidden_layers`. "
                f"Expected {config.num_hidden_layers}, given {len(config.attention_window)}"
            )

        self.attention_window = config.attention_window
        self.embed_tokens = embed_tokens
        self.embed_positions = TFLEDLearnedPositionalEmbedding(
            config.max_encoder_position_embeddings,
            config.d_model,
            name="embed_positions",
        )
        # One encoder layer per configured layer; each receives its index and is named "layers.<index>".
        self.layers = [TFLEDEncoderLayer(config, i, name=f"layers.{i}") for i in range(config.encoder_layers)]
        self.layernorm_embedding = keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding")
        self.embed_dim = config.d_model

    def get_embed_tokens(self):
        """Return the token embedding layer currently in use."""
        return self.embed_tokens

    def set_embed_tokens(self, embed_tokens):
        """Replace the token embedding layer."""
        self.embed_tokens = embed_tokens

    @unpack_inputs
    def call(
        self,
        input_ids=None,
        inputs_embeds=None,
        attention_mask=None,
        global_attention_mask=None,
        head_mask=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        training=False,
    ):
        """Forward pass of the encoder.

        Raises:
            NotImplementedError: Always. Only the signature of this method is present in this copy of the
                file; the body was left out.
        """
        # NOTE(review): the method previously ended right after its signature, which made the whole module
        # fail to parse. The real forward pass must be restored from the upstream implementation; until
        # then the method fails loudly instead of guessing at the computation.
        raise NotImplementedError(
            "TFLEDEncoder.call is missing from this copy of the source; restore the upstream forward pass."
        )

    @tf.function
    def compute_hidden_states(self, hidden_states, padding_len):
        """Drop the last `padding_len` positions along axis 1; return the input unchanged if it is not positive."""
        return hidden_states[:, :-padding_len] if padding_len > 0 else hidden_states

    def _pad_to_window_size(
        self,
        input_ids,
        attention_mask,
        inputs_embeds,
        pad_token_id,
    ):
        """A helper function to pad tokens and mask to work with implementation of Longformer selfattention.

        The sequence length is rounded up to the next multiple of the (largest) attention window.

        Args:
            input_ids: Token ids, or None when `inputs_embeds` is given instead.
            attention_mask: Attention mask; it is always padded, so it must not be None.
            inputs_embeds: Pre-computed embeddings, or None.
            pad_token_id: Id used to fill the padded positions.

        Returns:
            A tuple `(padding_len, input_ids, attention_mask, inputs_embeds)` holding the number of padded
            positions and the padded inputs (entries that were None stay None).
        """
        # With per-layer windows, the largest one decides the padding.
        attention_window = (
            self.attention_window if isinstance(self.attention_window, int) else max(self.attention_window)
        )

        assert attention_window % 2 == 0, f"`attention_window` should be an even value. Given {attention_window}"

        input_shape = shape_list(input_ids) if input_ids is not None else shape_list(inputs_embeds)
        batch_size, seq_len = input_shape[:2]
        # The outer modulo yields 0 (not a full window) when the length is already a multiple.
        padding_len = (attention_window - seq_len % attention_window) % attention_window

        if padding_len > 0:
            logger.warning_once(
                f"Input ids are automatically padded from {seq_len} to {seq_len + padding_len} to be a multiple of "
                f"`config.attention_window`: {attention_window}"
            )

        # Pad only at the end of the second axis.
        paddings = tf.convert_to_tensor([[0, 0], [0, padding_len]])

        if input_ids is not None:
            input_ids = tf.pad(input_ids, paddings, constant_values=pad_token_id)

        if inputs_embeds is not None:
            if padding_len > 0:
                # Embed a block of pad ids and append it, so the embeddings get the same extra positions.
                input_ids_padding = tf.fill((batch_size, padding_len), pad_token_id)
                inputs_embeds_padding = self.embed_tokens(input_ids_padding)
                inputs_embeds = tf.concat([inputs_embeds, inputs_embeds_padding], axis=-2)

        # Padded positions are filled with False in the mask.
        attention_mask = tf.pad(attention_mask, paddings, constant_values=False)

        return (
            padding_len,
            input_ids,
            attention_mask,
            inputs_embeds,
        )

    def build(self, input_shape=None):
        """Build the position embedding, the embedding layer norm and every encoder layer; later calls are no-ops.

        Args:
            input_shape: Accepted for signature compatibility; it is never read here.
        """
        if self.built:
            return
        self.built = True
        if getattr(self, "embed_positions", None) is not None:
            with tf.name_scope(self.embed_positions.name):
                self.embed_positions.build(None)
        if getattr(self, "layernorm_embedding", None) is not None:
            with tf.name_scope(self.layernorm_embedding.name):
                self.layernorm_embedding.build([None, None, self.embed_dim])
        if getattr(self, "layers", None) is not None:
            for layer in self.layers:
                with tf.name_scope(layer.name):
                    layer.build(None)
# 使用 @keras_serializable 装饰器，将 TFLEDDecoder 类标记为可序列化的 Keras 层
@keras_serializable
class TFLEDDecoder(keras.layers.Layer):
    """
    Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`TFLEDDecoderLayer`]

    Args:
        config: LEDConfig
        embed_tokens: output embedding
    """

    config_class = LEDConfig

    def __init__(self, config: LEDConfig, embed_tokens: Optional[keras.layers.Embedding] = None, **kwargs):
        super().__init__(**kwargs)
        self.config = config
        self.padding_idx = config.pad_token_id
        self.embed_tokens = embed_tokens
        # Layerdrop is not applied: a non-zero setting only triggers a warning and the rate is pinned to 0.
        if config.decoder_layerdrop > 0:
            logger.warning("Layerdrop is currently disabled in TFLED models.")
        self.layerdrop = 0.0
        self.embed_positions = TFLEDLearnedPositionalEmbedding(
            config.max_decoder_position_embeddings,
            config.d_model,
            name="embed_positions",
        )
        # One decoder layer per configured layer, named "layers.<index>".
        self.layers = [TFLEDDecoderLayer(config, name=f"layers.{i}") for i in range(config.decoder_layers)]
        self.layernorm_embedding = keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding")

        self.dropout = keras.layers.Dropout(config.dropout)

    def set_embed_tokens(self, embed_tokens):
        """Replace the token embedding layer."""
        self.embed_tokens = embed_tokens

    # The decorator was described by the comment next to this method but was absent from the code; it is
    # applied here the same way as on the encoder's `call`.
    @unpack_inputs
    def call(
        self,
        input_ids=None,
        inputs_embeds=None,
        attention_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        head_mask=None,
        encoder_head_mask=None,
        past_key_values=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        training=False,
    ):
        """Forward pass of the decoder.

        Raises:
            NotImplementedError: Always. Only the signature of this method is present in this copy of the
                file; the body was left out.
        """
        # NOTE(review): the method previously ended right after its signature, which made the whole module
        # fail to parse. The real forward pass must be restored from the upstream implementation; until
        # then the method fails loudly instead of guessing at the computation.
        raise NotImplementedError(
            "TFLEDDecoder.call is missing from this copy of the source; restore the upstream forward pass."
        )

    def build(self, input_shape=None):
        """Build the position embedding, the embedding layer norm and every decoder layer; later calls are no-ops.

        Args:
            input_shape: Accepted for signature compatibility; it is never read here.
        """
        if self.built:
            return
        self.built = True

        if getattr(self, "embed_positions", None) is not None:
            with tf.name_scope(self.embed_positions.name):
                self.embed_positions.build(None)

        if getattr(self, "layernorm_embedding", None) is not None:
            with tf.name_scope(self.layernorm_embedding.name):
                self.layernorm_embedding.build([None, None, self.config.d_model])

        if getattr(self, "layers", None) is not None:
            for layer in self.layers:
                with tf.name_scope(layer.name):
                    layer.build(None)


# NOTE(review): the class statement of the main layer was missing from the file. Without it, the methods
# that follow (the `__init__` creating `self.shared`, `self.encoder` and `self.decoder`, the embedding
# accessors, `call` and `build`) were parsed as members of TFLEDDecoder and replaced its own `__init__`,
# `call` and `build`, while the name `TFLEDMainLayer` used by the model classes was undefined. The header
# is declared the same way as the encoder and decoder layers above.
@keras_serializable
class TFLEDMainLayer(keras.layers.Layer):
    config_class = LEDConfig

    # 初始化函数，用于创建一个新的LED模型实例
    def __init__(self, config: LEDConfig, **kwargs):
        """Create the shared token embedding and the encoder and decoder that both receive it.

        Args:
            config (`LEDConfig`): Provides `vocab_size`, `d_model` and `init_std` for the embedding and is
                handed on to the encoder and the decoder.
            **kwargs: Forwarded to the parent class initializer.
        """
        super().__init__(**kwargs)
        self.config = config

        embedding_init = keras.initializers.TruncatedNormal(stddev=self.config.init_std)
        shared_embedding = keras.layers.Embedding(
            input_dim=config.vocab_size,
            output_dim=config.d_model,
            embeddings_initializer=embedding_init,
            name="led.shared",
        )
        self.shared = shared_embedding
        # Extra attribute recording the name scope the shared weights are expected under (used when
        # loading/storing weights).
        self.shared.load_weight_prefix = "led.shared"

        # Encoder and decoder are given the very same embedding object.
        self.encoder = TFLEDEncoder(config, self.shared, name="encoder")
        self.decoder = TFLEDDecoder(config, self.shared, name="decoder")

    # 返回模型的输入嵌入层（共享的词嵌入层）
    def get_input_embeddings(self):
        """Return the embedding layer stored in `self.shared`."""
        shared_embedding = self.shared
        return shared_embedding

    # 设置模型的输入嵌入层为新的词嵌入层
    def set_input_embeddings(self, new_embeddings):
        """Install `new_embeddings` as the shared embedding of this layer, its encoder and its decoder."""
        self.shared = new_embeddings
        # Encoder first, then decoder; both end up pointing at the object now held in `self.shared`.
        for submodule_name in ("encoder", "decoder"):
            getattr(self, submodule_name).embed_tokens = self.shared

    # 使用装饰器将输入参数解包，处理模型的前向传播
    @unpack_inputs
    def call(
        self,
        input_ids=None,
        attention_mask=None,
        decoder_input_ids=None,
        decoder_attention_mask=None,
        head_mask=None,
        decoder_head_mask=None,
        encoder_outputs: Optional[Union[Tuple, TFLEDEncoderBaseModelOutput]] = None,
        global_attention_mask=None,
        past_key_values=None,
        inputs_embeds=None,
        decoder_inputs_embeds=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        training=False,
        **kwargs,
    ):
        """Run the encoder (unless its outputs are supplied), then the decoder, and combine both results.

        Supplied `encoder_outputs` are converted to match `return_dict`: a non-`TFLEDEncoderBaseModelOutput`
        value is wrapped into one when `return_dict` is truthy, and a non-tuple value is turned into a tuple
        via `to_tuple()` when it is falsy.

        Returns:
            `decoder_outputs + encoder_outputs` when `return_dict` is falsy, otherwise a
            `TFLEDSeq2SeqModelOutput` filled from the decoder and encoder outputs.
        """
        # Without any decoder input, caching is switched off.
        if decoder_input_ids is None and decoder_inputs_embeds is None:
            use_cache = False

        if encoder_outputs is None:
            encoder_outputs = self.encoder(
                input_ids=input_ids,
                attention_mask=attention_mask,
                global_attention_mask=global_attention_mask,
                head_mask=head_mask,
                inputs_embeds=inputs_embeds,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
                training=training,
            )
        elif return_dict:
            # A plain sequence was passed although a structured output is wanted: wrap it.
            if not isinstance(encoder_outputs, TFLEDEncoderBaseModelOutput):
                encoder_outputs = TFLEDEncoderBaseModelOutput(
                    last_hidden_state=encoder_outputs[0],
                    hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
                    attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
                )
        elif not isinstance(encoder_outputs, tuple):
            # A structured output was passed although a tuple is wanted: flatten it.
            encoder_outputs = encoder_outputs.to_tuple()

        # The encoder's attention mask and head mask are passed to the decoder as the encoder-side masks.
        decoder_outputs = self.decoder(
            decoder_input_ids,
            attention_mask=decoder_attention_mask,
            encoder_hidden_states=encoder_outputs[0],
            encoder_attention_mask=attention_mask,
            head_mask=decoder_head_mask,
            encoder_head_mask=head_mask,
            past_key_values=past_key_values,
            inputs_embeds=decoder_inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        if return_dict:
            return TFLEDSeq2SeqModelOutput(
                last_hidden_state=decoder_outputs.last_hidden_state,
                past_key_values=decoder_outputs.past_key_values,
                decoder_hidden_states=decoder_outputs.hidden_states,
                decoder_attentions=decoder_outputs.attentions,
                cross_attentions=decoder_outputs.cross_attentions,
                encoder_last_hidden_state=encoder_outputs.last_hidden_state,
                encoder_hidden_states=encoder_outputs.hidden_states,
                encoder_attentions=encoder_outputs.attentions,
                encoder_global_attentions=encoder_outputs.global_attentions,
            )

        return decoder_outputs + encoder_outputs
    def build(self, input_shape=None):
        """Build the shared embedding, the encoder and the decoder; later calls are no-ops.

        Args:
            input_shape: Accepted for signature compatibility; it is never read here.
        """
        # NOTE(review): these statements previously had no enclosing `def`, leaving `return` and `self` at
        # class level (a syntax error). The header matches the `build` methods of the sibling classes.
        if self.built:
            return
        self.built = True

        # The shared/tied weights are expected to be in the model base namespace.
        # Adding "/" to the END (not the start!) of a `tf.name_scope` puts it in the root namespace rather
        # than the current one.
        with tf.name_scope(self.shared.load_weight_prefix + "/" + self.shared.name + "/"):
            self.shared.build(None)

        if getattr(self, "encoder", None) is not None:
            with tf.name_scope(self.encoder.name):
                self.encoder.build(None)

        if getattr(self, "decoder", None) is not None:
            with tf.name_scope(self.decoder.name):
                self.decoder.build(None)
# 添加文档字符串，说明这是一个不带顶部特定头的裸 LED 模型输出原始隐藏状态。
# 使用 TFLEDPreTrainedModel 的子类化来定义 TFLEDModel 类
class TFLEDModel(TFLEDPreTrainedModel):
    """LED model without a task-specific head; every forward call is delegated to the main layer `self.led`."""

    def __init__(self, config, *inputs, **kwargs):
        """Initialise the parent class and create the `TFLEDMainLayer` registered under the name "led"."""
        super().__init__(config, *inputs, **kwargs)
        self.led = TFLEDMainLayer(config, name="led")

    def get_encoder(self):
        """Return the encoder held by the main layer."""
        main_layer = self.led
        return main_layer.encoder

    def get_decoder(self):
        """Return the decoder held by the main layer."""
        main_layer = self.led
        return main_layer.decoder

    # No docstring is written on `call`: the decorators below work on the method's documentation.
    @unpack_inputs
    @add_start_docstrings_to_model_forward(LED_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TFLEDSeq2SeqModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: tf.Tensor | None = None,
        decoder_input_ids: tf.Tensor | None = None,
        decoder_attention_mask: tf.Tensor | None = None,
        head_mask: tf.Tensor | None = None,
        decoder_head_mask: tf.Tensor | None = None,
        encoder_outputs: tf.Tensor | None = None,
        global_attention_mask: tf.Tensor | None = None,
        past_key_values: Tuple[Tuple[tf.Tensor]] | None = None,
        inputs_embeds: tf.Tensor | None = None,
        decoder_inputs_embeds: tf.Tensor | None = None,
        use_cache: bool | None = None,
        output_attentions: bool | None = None,
        output_hidden_states: bool | None = None,
        return_dict: bool | None = None,
        training: bool = False,
        **kwargs,
    ) -> Tuple[tf.Tensor] | TFLEDSeq2SeqModelOutput:
        # Every named argument is handed to the main layer by keyword; `**kwargs` is not forwarded.
        return self.led(
            input_ids=input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            encoder_outputs=encoder_outputs,
            global_attention_mask=global_attention_mask,
            head_mask=head_mask,
            decoder_head_mask=decoder_head_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            decoder_inputs_embeds=decoder_inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

    def serving_output(self, output):
        """Return a `TFLEDSeq2SeqModelOutput` derived from `output` according to the config flags.

        Optional entries are kept only when the matching flag (`use_cache`, `output_hidden_states`,
        `output_attentions`) is set on `self.config`; they are None otherwise. Hidden states and attentions
        are passed through `tf.convert_to_tensor`.
        """
        cfg = self.config
        # Only element 1 of the tuple built from the cached key/values is kept.
        pkv = tf.tuple(output.past_key_values)[1] if cfg.use_cache else None
        dec_hs = tf.convert_to_tensor(output.decoder_hidden_states) if cfg.output_hidden_states else None
        dec_attns = tf.convert_to_tensor(output.decoder_attentions) if cfg.output_attentions else None
        cross_attns = tf.convert_to_tensor(output.cross_attentions) if cfg.output_attentions else None
        enc_hs = tf.convert_to_tensor(output.encoder_hidden_states) if cfg.output_hidden_states else None
        enc_attns = tf.convert_to_tensor(output.encoder_attentions) if cfg.output_attentions else None
        enc_g_attns = tf.convert_to_tensor(output.encoder_global_attentions) if cfg.output_attentions else None

        return TFLEDSeq2SeqModelOutput(
            last_hidden_state=output.last_hidden_state,
            past_key_values=pkv,
            decoder_hidden_states=dec_hs,
            decoder_attentions=dec_attns,
            cross_attentions=cross_attns,
            encoder_last_hidden_state=output.encoder_last_hidden_state,
            encoder_hidden_states=enc_hs,
            encoder_attentions=enc_attns,
            encoder_global_attentions=enc_g_attns,
        )

    def build(self, input_shape=None):
        """Build the main layer inside its own name scope; later calls are no-ops.

        Args:
            input_shape: Accepted for signature compatibility; it is never read here.
        """
        if self.built:
            return
        self.built = True
        main_layer = getattr(self, "led", None)
        if main_layer is None:
            return
        with tf.name_scope(main_layer.name):
            main_layer.build(None)
# Copied from transformers.models.bart.modeling_tf_bart.BiasLayer
# BiasLayer wraps a bias in a layer of its own. It exists for serialization purposes:
# `keras.Model.save_weights` stores weights on a per-layer basis, so every weight has to be registered in a layer.
class BiasLayer(keras.layers.Layer):
    """
    Bias as a layer. It is used for serialization purposes: `keras.Model.save_weights` stores on a per-layer basis,
    so all weights have to be registered in a layer.
    """

    def __init__(self, shape, initializer, trainable, name, **kwargs):
        super().__init__(name=name, **kwargs)
        # Note: the name of this variable is NOT scoped when serialized, i.e. it is not in the format
        # "outer_layer/inner_layer/.../name:0" but simply "name:0". For further details, see
        # https://github.com/huggingface/transformers/pull/18833#issuecomment-1233090214
        # The bias is registered as a weight with the given shape, initializer and trainable flag.
        self.bias = self.add_weight(name=name, shape=shape, initializer=initializer, trainable=trainable)

    def call(self, x):
        # Return the input with the bias added.
        return x + self.bias


@add_start_docstrings(
    "The LED Model with a language modeling head. Can be used for summarization.",
    LED_START_DOCSTRING,
)
# LED model with a language-modeling head on top of `TFLEDMainLayer`. (No class docstring on purpose: the decorator
# above concatenates it into the generated `__doc__`.)
class TFLEDForConditionalGeneration(TFLEDPreTrainedModel):
    # Checkpoint keys that are ignored when they show up unexpectedly at load time.
    _keys_to_ignore_on_load_unexpected = [
        r"led.encoder.embed_tokens.weight",
        r"led.decoder.embed_tokens.weight",
    ]

    def __init__(self, config, *inputs, **kwargs):
        """Create the LED main layer and the non-trainable final logits bias from `config`."""
        super().__init__(config, *inputs, **kwargs)
        self.led = TFLEDMainLayer(config, name="led")
        self.use_cache = config.use_cache
        # final_bias_logits is registered as a buffer in pytorch, so not trainable for the sake of consistency.
        self.bias_layer = BiasLayer(
            name="final_logits_bias", shape=[1, config.vocab_size], initializer="zeros", trainable=False
        )

        # TODO (Joao): investigate why LED has numerical issues in XLA generate
        self.supports_xla_generation = False

    def get_decoder(self):
        """Return the decoder of the wrapped LED main layer."""
        return self.led.decoder

    def get_encoder(self):
        """Return the encoder of the wrapped LED main layer."""
        return self.led.encoder

    def get_bias(self):
        """Return a dict mapping `"final_logits_bias"` to the bias weight of `self.bias_layer`."""
        return {"final_logits_bias": self.bias_layer.bias}

    def set_bias(self, value):
        """
        Replace the bias layer with a new one sized and filled from `value["final_logits_bias"]`.

        The existing layer is replaced (rather than mutated) so that (de)serialization stays correct.
        """
        # The vocabulary size is taken from the last dimension of the provided bias.
        vocab_size = value["final_logits_bias"].shape[-1]
        self.bias_layer = BiasLayer(
            name="final_logits_bias", shape=[1, vocab_size], initializer="zeros", trainable=False
        )
        self.bias_layer.bias.assign(value["final_logits_bias"])

    def get_output_embeddings(self):
        """Return the input embeddings, which this model also uses as output embeddings."""
        return self.get_input_embeddings()

    def set_output_embeddings(self, value):
        """Set the output embeddings by setting the input embeddings."""
        self.set_input_embeddings(value)

    @unpack_inputs
    @add_start_docstrings_to_model_forward(LED_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=TFLEDSeq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        decoder_input_ids: np.ndarray | tf.Tensor | None = None,
        decoder_attention_mask: np.ndarray | tf.Tensor | None = None,
        head_mask: np.ndarray | tf.Tensor | None = None,
        decoder_head_mask: np.ndarray | tf.Tensor | None = None,
        encoder_outputs: TFLEDEncoderBaseModelOutput | None = None,
        global_attention_mask: np.ndarray | tf.Tensor | None = None,
        past_key_values: Tuple[Tuple[Union[np.ndarray, tf.Tensor]]] | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        decoder_inputs_embeds: np.ndarray | tf.Tensor | None = None,
        use_cache: bool | None = None,
        output_attentions: bool | None = None,
        output_hidden_states: bool | None = None,
        return_dict: bool | None = None,
        labels: tf.Tensor | None = None,
        training: bool = False,
    ) -> Tuple[tf.Tensor] | TFLEDSeq2SeqLMOutput:
        """
        Run the LED main layer, project its first output onto the shared embedding matrix, add the final logits bias
        and, when `labels` is given, compute the language-modeling loss.

        Returns:

        """
        # NOTE: the bare "Returns:" line above is a placeholder that `replace_return_docstrings` substitutes with
        # the generated description of the return type; it must stay on a line of its own.
        if labels is not None:
            # Caching is disabled whenever a loss is computed.
            use_cache = False
            if decoder_input_ids is None and decoder_inputs_embeds is None:
                # No decoder inputs were supplied: derive them by shifting the labels one position to the right.
                decoder_input_ids = shift_tokens_right(
                    labels, self.config.pad_token_id, self.config.decoder_start_token_id
                )

        outputs = self.led(
            input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            encoder_outputs=encoder_outputs,
            global_attention_mask=global_attention_mask,
            head_mask=head_mask,
            decoder_head_mask=decoder_head_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            decoder_inputs_embeds=decoder_inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        # Multiply the first output by the transposed shared embedding weights, then add the final logits bias.
        lm_logits = tf.matmul(outputs[0], self.led.shared.weights, transpose_b=True)
        lm_logits = self.bias_layer(lm_logits)
        masked_lm_loss = None if labels is None else self.hf_compute_loss(labels, lm_logits)

        if not return_dict:
            # Tuple output: logits followed by the remaining main-layer outputs, prefixed by the loss if present.
            output = (lm_logits,) + outputs[1:]
            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

        return TFLEDSeq2SeqLMOutput(
            loss=masked_lm_loss,
            logits=lm_logits,
            past_key_values=outputs.past_key_values,
            decoder_hidden_states=outputs.decoder_hidden_states,
            decoder_attentions=outputs.decoder_attentions,
            cross_attentions=outputs.cross_attentions,
            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
            encoder_hidden_states=outputs.encoder_hidden_states,
            encoder_attentions=outputs.encoder_attentions,
            encoder_global_attentions=outputs.encoder_global_attentions,
        )

    def serving_output(self, output):
        """
        Convert a model output into a `TFLEDSeq2SeqLMOutput` for serving.

        Optional fields are converted to tensors only when the matching config flag (`use_cache`,
        `output_hidden_states`, `output_attentions`) is set; otherwise they are `None`.
        """
        # Element 1 of the tuple-ized past key values is kept when caching is enabled.
        pkv = tf.tuple(output.past_key_values)[1] if self.config.use_cache else None
        dec_hs = tf.convert_to_tensor(output.decoder_hidden_states) if self.config.output_hidden_states else None
        dec_attns = tf.convert_to_tensor(output.decoder_attentions) if self.config.output_attentions else None
        cross_attns = tf.convert_to_tensor(output.cross_attentions) if self.config.output_attentions else None
        enc_hs = tf.convert_to_tensor(output.encoder_hidden_states) if self.config.output_hidden_states else None
        enc_attns = tf.convert_to_tensor(output.encoder_attentions) if self.config.output_attentions else None
        enc_g_attns = tf.convert_to_tensor(output.encoder_global_attentions) if self.config.output_attentions else None

        return TFLEDSeq2SeqLMOutput(
            logits=output.logits,
            past_key_values=pkv,
            decoder_hidden_states=dec_hs,
            decoder_attentions=dec_attns,
            cross_attentions=cross_attns,
            encoder_last_hidden_state=output.encoder_last_hidden_state,
            encoder_hidden_states=enc_hs,
            encoder_attentions=enc_attns,
            encoder_global_attentions=enc_g_attns,
        )

    def prepare_inputs_for_generation(
        self,
        decoder_input_ids,
        past_key_values=None,
        attention_mask=None,
        head_mask=None,
        decoder_head_mask=None,
        use_cache=None,
        encoder_outputs=None,
        **kwargs,
    ):
        """
        Assemble the keyword arguments for one generation step.

        When `past_key_values` is given, only the last decoder token is kept. Extra `**kwargs` are accepted and
        ignored.
        """
        # cut decoder_input_ids if past is used
        if past_key_values is not None:
            decoder_input_ids = decoder_input_ids[:, -1:]

        return {
            "input_ids": None,  # encoder_outputs is defined. input_ids not needed
            "encoder_outputs": encoder_outputs,
            "past_key_values": past_key_values,
            "decoder_input_ids": decoder_input_ids,
            "attention_mask": attention_mask,
            "head_mask": head_mask,
            "decoder_head_mask": decoder_head_mask,
            "use_cache": use_cache,  # change this to avoid caching (presumably for debugging)
        }

    def prepare_decoder_input_ids_from_labels(self, labels: tf.Tensor):
        """Build decoder input ids by shifting `labels` to the right with the configured pad/start token ids."""
        return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id)

    def hf_compute_loss(self, labels, logits):
        """Cross-entropy loss that ignores pad tokens."""
        # Per-element sparse categorical cross-entropy computed from logits, with no reduction applied.
        loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction=keras.losses.Reduction.NONE)

        if self.config.tf_legacy_loss:
            # Legacy path: flatten, drop the positions whose label is the pad token, return unreduced losses.
            melted_labels = tf.reshape(labels, (-1,))
            active_loss = tf.not_equal(melted_labels, self.config.pad_token_id)
            reduced_logits = tf.boolean_mask(tf.reshape(logits, (-1, shape_list(logits)[2])), active_loss)
            labels = tf.boolean_mask(melted_labels, active_loss)
            return loss_fn(labels, reduced_logits)

        # Clip negative labels to zero here to avoid NaNs and errors - those positions will get masked later anyway
        unmasked_loss = loss_fn(tf.nn.relu(labels), logits)
        # make sure only non-padding labels affect the loss
        loss_mask = tf.cast(labels != self.config.pad_token_id, dtype=unmasked_loss.dtype)
        masked_loss = unmasked_loss * loss_mask
        # Mean over the non-padding positions, returned with shape (1,).
        reduced_masked_loss = tf.reduce_sum(masked_loss) / tf.reduce_sum(loss_mask)
        return tf.reshape(reduced_masked_loss, (1,))

    def build(self, input_shape=None):
        """
        Build the `led` and `bias_layer` sub-layers once, each inside a name scope equal to its own name.

        Does nothing when `self.built` is already set. `input_shape` is accepted but not read.
        """
        if self.built:
            return
        self.built = True

        if getattr(self, "led", None) is not None:
            with tf.name_scope(self.led.name):
                self.led.build(None)

        if getattr(self, "bias_layer", None) is not None:
            with tf.name_scope(self.bias_layer.name):
                self.bias_layer.build(None)

Transformers-源码解析-六十三-

Transformers 源码解析（六十三）

.\models\layoutlmv3\processing_layoutlmv3.py

.\models\layoutlmv3\tokenization_layoutlmv3.py

.\models\layoutlmv3\tokenization_layoutlmv3_fast.py

.\models\layoutlmv3\__init__.py

.\models\layoutxlm\processing_layoutxlm.py

.\models\layoutxlm\tokenization_layoutxlm.py

.\models\layoutxlm\tokenization_layoutxlm_fast.py

.\models\layoutxlm\__init__.py

.\models\led\configuration_led.py

.\models\led\modeling_led.py

.\models\led\modeling_tf_led.py

`.\models\layoutlmv3\processing_layoutlmv3.py`

`.\models\layoutlmv3\tokenization_layoutlmv3.py`

`.\models\layoutlmv3\tokenization_layoutlmv3_fast.py`

`.\models\layoutlmv3\__init__.py`

`.\models\layoutxlm\processing_layoutxlm.py`

`.\models\layoutxlm\tokenization_layoutxlm.py`

`.\models\layoutxlm\tokenization_layoutxlm_fast.py`

`.\models\layoutxlm\__init__.py`

`.\models\led\configuration_led.py`

`.\models\led\modeling_led.py`

`.\models\led\modeling_tf_led.py`