Transformers Source Code Analysis (62)
.\models\layoutlmv2\processing_layoutlmv2.py
"""
Processor class for LayoutLMv2.
"""
import warnings
from typing import List, Optional, Union
from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
from ...utils import TensorType
class LayoutLMv2Processor(ProcessorMixin):
r"""
Constructs a LayoutLMv2 processor which combines a LayoutLMv2 image processor and a LayoutLMv2 tokenizer into a
single processor.
[`LayoutLMv2Processor`] offers all the functionalities you need to prepare data for the model.
It first uses [`LayoutLMv2ImageProcessor`] to resize document images to a fixed size, and optionally applies OCR to
get words and normalized bounding boxes. These are then provided to [`LayoutLMv2Tokenizer`] or
[`LayoutLMv2TokenizerFast`], which turns the words and bounding boxes into token-level `input_ids`,
`attention_mask`, `token_type_ids`, `bbox`. Optionally, one can provide integer `word_labels`, which are turned
into token-level `labels` for token classification tasks (such as FUNSD, CORD).
Args:
image_processor (`LayoutLMv2ImageProcessor`, *optional*):
An instance of [`LayoutLMv2ImageProcessor`]. The image processor is a required input.
tokenizer (`LayoutLMv2Tokenizer` or `LayoutLMv2TokenizerFast`, *optional*):
An instance of [`LayoutLMv2Tokenizer`] or [`LayoutLMv2TokenizerFast`]. The tokenizer is a required input.
"""
attributes = ["image_processor", "tokenizer"]
image_processor_class = "LayoutLMv2ImageProcessor"
tokenizer_class = ("LayoutLMv2Tokenizer", "LayoutLMv2TokenizerFast")
def __init__(self, image_processor=None, tokenizer=None, **kwargs):
feature_extractor = None
if "feature_extractor" in kwargs:
warnings.warn(
"The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
" instead.",
FutureWarning,
)
feature_extractor = kwargs.pop("feature_extractor")
image_processor = image_processor if image_processor is not None else feature_extractor
if image_processor is None:
raise ValueError("You need to specify an `image_processor`.")
if tokenizer is None:
raise ValueError("You need to specify a `tokenizer`.")
super().__init__(image_processor, tokenizer)
def __call__(
self,
images,
text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
text_pair: Optional[Union[PreTokenizedInput, List[PreTokenizedInput]]] = None,
boxes: Union[List[List[int]], List[List[List[int]]]] = None,
word_labels: Optional[Union[List[int], List[List[int]]]] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = False,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
return_tensors: Optional[Union[str, TensorType]] = None,
**kwargs,
):
"""
批量处理图像及其相关信息,将其转换为模型可以处理的格式。参数详细说明可以参考 `PreTrainedTokenizer.batch_decode` 方法的文档字符串。
"""
return self.tokenizer.batch_decode(*args, **kwargs)
def get_overflowing_images(self, images, overflow_to_sample_mapping):
images_with_overflow = []
for sample_idx in overflow_to_sample_mapping:
images_with_overflow.append(images[sample_idx])
if len(images_with_overflow) != len(overflow_to_sample_mapping):
raise ValueError(
"Expected length of images to be the same as the length of `overflow_to_sample_mapping`, but got"
f" {len(images_with_overflow)} and {len(overflow_to_sample_mapping)}"
)
return images_with_overflow
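# A minimal sketch of what this mapping does (page_0 / page_1 are hypothetical image objects):
# if sample 0 overflowed into two chunks, overflow_to_sample_mapping is [0, 0, 1], and
# get_overflowing_images([page_0, page_1], [0, 0, 1]) returns [page_0, page_0, page_1],
# i.e. every overflowing chunk keeps its own copy of the page image it came from.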
def decode(self, *args, **kwargs):
"""
This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer
to the docstring of this method for more information.
"""
return self.tokenizer.decode(*args, **kwargs)
@property
def model_input_names(self):
return ["input_ids", "bbox", "token_type_ids", "attention_mask", "image"]
@property
def feature_extractor_class(self):
warnings.warn(
"`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.",
FutureWarning,
)
return self.image_processor_class
@property
def feature_extractor(self):
warnings.warn(
"`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.",
FutureWarning,
)
return self.image_processor
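As a usage illustration, the following is a minimal sketch (not part of the source file) of the typical document-understanding flow with this processor. It assumes the public checkpoint `microsoft/layoutlmv2-base-uncased`, a scanned page loaded with PIL, and that `pytesseract` is installed so the image processor can run OCR:

from PIL import Image
from transformers import LayoutLMv2Processor

processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncased")
image = Image.open("document_page.png").convert("RGB")  # hypothetical scanned page

# The image processor resizes the page and runs OCR; the tokenizer then builds
# input_ids, attention_mask, token_type_ids and bbox from the OCR words and boxes.
encoding = processor(image, return_tensors="pt")
print(encoding.keys())  # dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'bbox', 'image'])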
.\models\layoutlmv2\tokenization_layoutlmv2.py
"""Tokenization class for LayoutLMv2."""
import collections
import os
import sys
import unicodedata
from typing import Dict, List, Optional, Tuple, Union
from ...tokenization_utils import AddedToken, PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
from ...tokenization_utils_base import (
BatchEncoding,
EncodedInput,
PreTokenizedInput,
TextInput,
TextInputPair,
TruncationStrategy,
)
from ...utils import PaddingStrategy, TensorType, add_end_docstrings, logging
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"microsoft/layoutlmv2-base-uncased": (
"https://huggingface.co/microsoft/layoutlmv2-base-uncased/resolve/main/vocab.txt"
),
"microsoft/layoutlmv2-large-uncased": (
"https://huggingface.co/microsoft/layoutlmv2-large-uncased/resolve/main/vocab.txt"
),
}
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"microsoft/layoutlmv2-base-uncased": 512,
"microsoft/layoutlmv2-large-uncased": 512,
}
PRETRAINED_INIT_CONFIGURATION = {
"microsoft/layoutlmv2-base-uncased": {"do_lower_case": True},
"microsoft/layoutlmv2-large-uncased": {"do_lower_case": True},
}
"""
"""
def load_vocab(vocab_file):
"""Loads a vocabulary file into a dictionary."""
vocab = collections.OrderedDict()
with open(vocab_file, "r", encoding="utf-8") as reader:
tokens = reader.readlines()
for index, token in enumerate(tokens):
token = token.rstrip("\n")
vocab[token] = index
return vocab
def whitespace_tokenize(text):
"""Runs basic whitespace cleaning and splitting on a piece of text."""
text = text.strip()
if not text:
return []
tokens = text.split()
return tokens
table = dict.fromkeys(i for i in range(sys.maxunicode) if unicodedata.category(chr(i)).startswith("P"))
def subfinder(mylist, pattern):
matches = []
indices = []
for idx, i in enumerate(range(len(mylist))):
if mylist[i] == pattern[0] and mylist[i : i + len(pattern)] == pattern:
matches.append(pattern)
indices.append(idx)
if matches:
return matches[0], indices[0]
else:
return None, 0
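# A quick sketch of subfinder's behaviour: it returns the first occurrence of `pattern`
# inside `mylist` together with its start index, e.g.
# subfinder(["a", "b", "c", "b", "c"], ["b", "c"])  ->  (["b", "c"], 1)
# and (None, 0) when the pattern does not occur.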
class LayoutLMv2Tokenizer(PreTrainedTokenizer):
r"""
"""
构建一个 LayoutLMv2 的分词器。基于 WordPiece。[`LayoutLMv2Tokenizer`] 可以用于将单词、单词级别边界框和可选的单词标签转换为
标记级别的 `input_ids`、`attention_mask`、`token_type_ids`、`bbox`,以及可选的 `labels`(用于标记分类)。
该分词器继承自 [`PreTrainedTokenizer`],其中包含大多数主要方法。用户应参考此超类以获取有关这些方法的更多信息。
[`LayoutLMv2Tokenizer`] 运行端到端的分词:标点符号分割和 WordPiece。它还将单词级别的边界框转换为标记级别的边界框。
"""
# 定义预训练模型所需的词汇文件名列表
vocab_files_names = VOCAB_FILES_NAMES
# 定义预训练模型所需的词汇文件映射
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
# 定义预训练模型输入的最大长度列表
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
# 定义预训练模型的初始化配置
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
def __init__(
self,
vocab_file,
do_lower_case=True,
do_basic_tokenize=True,
never_split=None,
unk_token="[UNK]",
sep_token="[SEP]",
pad_token="[PAD]",
cls_token="[CLS]",
mask_token="[MASK]",
cls_token_box=[0, 0, 0, 0],
sep_token_box=[1000, 1000, 1000, 1000],
pad_token_box=[0, 0, 0, 0],
pad_token_label=-100,
only_label_first_subword=True,
tokenize_chinese_chars=True,
strip_accents=None,
model_max_length: int = 512,
additional_special_tokens: Optional[List[str]] = None,
**kwargs,
):
# If sep_token is a string, wrap it in a special AddedToken object
sep_token = AddedToken(sep_token, special=True) if isinstance(sep_token, str) else sep_token
# If unk_token is a string, wrap it in a special AddedToken object
unk_token = AddedToken(unk_token, special=True) if isinstance(unk_token, str) else unk_token
# If pad_token is a string, wrap it in a special AddedToken object
pad_token = AddedToken(pad_token, special=True) if isinstance(pad_token, str) else pad_token
# If cls_token is a string, wrap it in a special AddedToken object
cls_token = AddedToken(cls_token, special=True) if isinstance(cls_token, str) else cls_token
# If mask_token is a string, wrap it in a special AddedToken object
mask_token = AddedToken(mask_token, special=True) if isinstance(mask_token, str) else mask_token
# Raise a ValueError if the given vocabulary file does not exist
if not os.path.isfile(vocab_file):
raise ValueError(
f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
" model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
)
# Load the vocabulary file and store it in self.vocab
self.vocab = load_vocab(vocab_file)
# Build an ordered id-to-token mapping in self.ids_to_tokens
self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
# Decide whether to run basic tokenization depending on do_basic_tokenize
self.do_basic_tokenize = do_basic_tokenize
if do_basic_tokenize:
# If basic tokenization is requested, create a BasicTokenizer instance
self.basic_tokenizer = BasicTokenizer(
do_lower_case=do_lower_case,
never_split=never_split,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
)
# Create a WordpieceTokenizer with the loaded vocabulary and the unk_token
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))
# Set the additional attributes
self.cls_token_box = cls_token_box
self.sep_token_box = sep_token_box
self.pad_token_box = pad_token_box
self.pad_token_label = pad_token_label
self.only_label_first_subword = only_label_first_subword
# Call the parent constructor to initialize the remaining arguments
super().__init__(
do_lower_case=do_lower_case,
do_basic_tokenize=do_basic_tokenize,
never_split=never_split,
unk_token=unk_token,
sep_token=sep_token,
pad_token=pad_token,
cls_token=cls_token,
mask_token=mask_token,
cls_token_box=cls_token_box,
sep_token_box=sep_token_box,
pad_token_box=pad_token_box,
pad_token_label=pad_token_label,
only_label_first_subword=only_label_first_subword,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
model_max_length=model_max_length,
additional_special_tokens=additional_special_tokens,
**kwargs,
)
@property
def do_lower_case(self):
# Return the lowercasing setting of the basic tokenizer
return self.basic_tokenizer.do_lower_case
@property
def vocab_size(self):
# Return the size of the vocabulary
return len(self.vocab)
def get_vocab(self):
# Return a dict containing the vocabulary plus the added special tokens
return dict(self.vocab, **self.added_tokens_encoder)
# Tokenize a piece of text and return the list of resulting tokens
def _tokenize(self, text):
split_tokens = []
# If basic tokenization is enabled
if self.do_basic_tokenize:
# Run the basic tokenizer, never splitting the special tokens
for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens):
# If the token is in the never-split set
if token in self.basic_tokenizer.never_split:
# Add it to the result as-is
split_tokens.append(token)
else:
# Otherwise, further split the token with the WordPiece tokenizer
split_tokens += self.wordpiece_tokenizer.tokenize(token)
else:
# Otherwise, tokenize the whole text directly with the WordPiece tokenizer
split_tokens = self.wordpiece_tokenizer.tokenize(text)
# Return the list of tokens
return split_tokens
# Convert a token into its id using the vocabulary
def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
return self.vocab.get(token, self.vocab.get(self.unk_token))
# Convert an id back into its token using the vocabulary
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
return self.ids_to_tokens.get(index, self.unk_token)
# Join a list of tokens into a single string, removing the "##" subword markers
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (string) in a single string."""
out_string = " ".join(tokens).replace(" ##", "").strip()
return out_string
# Build an input sequence with special tokens for sequence classification tasks
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. A BERT sequence has the following format:
- single sequence: `[CLS] X [SEP]`
- pair of sequences: `[CLS] A [SEP] B [SEP]`
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
# If there is only a single sequence
if token_ids_1 is None:
# Return the sequence with [CLS] and [SEP] special tokens added
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
# Define the [CLS] and [SEP] special token ids
cls = [self.cls_token_id]
sep = [self.sep_token_id]
# Return [CLS] + first sequence + [SEP] + second sequence + [SEP]
return cls + token_ids_0 + sep + token_ids_1 + sep
# Build the mask that marks the positions of special tokens in a token id sequence
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` method.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
# If special tokens are already present, delegate to the parent class to compute the mask
return super().get_special_tokens_mask(
token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
)
if token_ids_1 is not None:
# With a second sequence, the mask covers [CLS] + token_ids_0 + [SEP] + token_ids_1 + [SEP]
return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
# With a single sequence, the mask covers [CLS] + token_ids_0 + [SEP]
return [1] + ([0] * len(token_ids_0)) + [1]
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. A BERT sequence
pair mask has the following format:
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence    | second sequence |
If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
"""
# Get the separator and classification token ids
sep = [self.sep_token_id]
cls = [self.cls_token_id]
if token_ids_1 is None:
# With a single sequence, return a list of zeros of length len(cls + token_ids_0 + sep)
return len(cls + token_ids_0 + sep) * [0]
# With two sequences, return zeros for [CLS] + token_ids_0 + [SEP] and ones for token_ids_1 + [SEP]
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
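# A concrete illustration of the two methods above (a sketch, assuming the usual uncased BERT vocabulary
# where [CLS] = 101 and [SEP] = 102; the other ids are made up):
# tokenizer.build_inputs_with_special_tokens([7592], [2088])
#   -> [101, 7592, 102, 2088, 102]      # [CLS] A [SEP] B [SEP]
# tokenizer.create_token_type_ids_from_sequences([7592], [2088])
#   -> [0, 0, 0, 1, 1]                  # zeros cover [CLS] A [SEP], ones cover B [SEP]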
# Save the vocabulary to a file in the given directory and return the saved file path as a tuple
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
# Initialize the running index
index = 0
# If the save directory exists, build the vocabulary file path inside it
if os.path.isdir(save_directory):
vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
else:
# Otherwise treat the given path directly as the target file
vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory
# Open the vocabulary file for writing
with open(vocab_file, "w", encoding="utf-8") as writer:
# Iterate over the vocabulary sorted by index and write the tokens in order
for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
# If the stored index is not consecutive, log a warning
if index != token_index:
logger.warning(
f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
" Please check that the vocabulary is not corrupted!"
)
index = token_index
# Write the token to the file and advance the index
writer.write(token + "\n")
index += 1
# Return the path of the saved vocabulary file as a tuple
return (vocab_file,)
# Decorator that appends the shared encode kwargs docstrings to the __call__ method
@add_end_docstrings(LAYOUTLMV2_ENCODE_KWARGS_DOCSTRING, LAYOUTLMV2_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
def __call__(
self,
text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]],
text_pair: Optional[Union[PreTokenizedInput, List[PreTokenizedInput]]] = None,
boxes: Union[List[List[int]], List[List[List[int]]]] = None,
word_labels: Optional[Union[List[int], List[List[int]]]] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = None,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs,
):
@add_end_docstrings(LAYOUTLMV2_ENCODE_KWARGS_DOCSTRING, LAYOUTLMV2_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
# Batch-encode texts or text pairs and return a BatchEncoding
def batch_encode_plus(
self,
batch_text_or_text_pairs: Union[
List[TextInput],
List[TextInputPair],
List[PreTokenizedInput],
],
is_pair: bool = None,  # whether the inputs are text pairs
boxes: Optional[List[List[List[int]]]] = None,  # word-level bounding boxes (optional)
word_labels: Optional[Union[List[int], List[List[int]]]] = None,  # word-level labels (optional)
add_special_tokens: bool = True,  # whether to add special tokens
padding: Union[bool, str, PaddingStrategy] = False,  # padding strategy
truncation: Union[bool, str, TruncationStrategy] = None,  # truncation strategy
max_length: Optional[int] = None,  # maximum length (optional)
stride: int = 0,  # stride for overflowing tokens
pad_to_multiple_of: Optional[int] = None,  # pad to a multiple of this value (optional)
return_tensors: Optional[Union[str, TensorType]] = None,  # tensor type to return (optional)
return_token_type_ids: Optional[bool] = None,  # whether to return token type ids (optional)
return_attention_mask: Optional[bool] = None,  # whether to return the attention mask (optional)
return_overflowing_tokens: bool = False,  # whether to return overflowing tokens
return_special_tokens_mask: bool = False,  # whether to return the special tokens mask
return_offsets_mapping: bool = False,  # whether to return the offsets mapping
return_length: bool = False,  # whether to return the lengths
verbose: bool = True,  # whether to print verbose information
**kwargs,  # additional keyword arguments
) -> BatchEncoding:
# Resolve the padding and truncation strategies, handling backward-compatible legacy arguments
padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
padding=padding,
truncation=truncation,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
verbose=verbose,
**kwargs,
)
# Delegate to the internal batch-encoding method
return self._batch_encode_plus(
batch_text_or_text_pairs=batch_text_or_text_pairs,
is_pair=is_pair,
boxes=boxes,
word_labels=word_labels,
add_special_tokens=add_special_tokens,
padding_strategy=padding_strategy,
truncation_strategy=truncation_strategy,
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_length=return_length,
verbose=verbose,
**kwargs,
)
# The _batch_encode_plus method batch-encodes texts or text pairs and returns a BatchEncoding object
def _batch_encode_plus(
self,
batch_text_or_text_pairs: Union[
List[TextInput],
List[TextInputPair],
List[PreTokenizedInput],
],
is_pair: bool = None,  # whether the inputs are text pairs
boxes: Optional[List[List[List[int]]]] = None,  # word-level bounding boxes
word_labels: Optional[List[List[int]]] = None,  # word-level labels
add_special_tokens: bool = True,  # whether to add special tokens (e.g. [CLS], [SEP])
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,  # padding strategy, no padding by default
truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,  # truncation strategy, no truncation by default
max_length: Optional[int] = None,  # maximum length
stride: int = 0,  # stride, 0 by default
pad_to_multiple_of: Optional[int] = None,  # pad to a multiple of this value
return_tensors: Optional[Union[str, TensorType]] = None,  # tensor type to return
return_token_type_ids: Optional[bool] = None,  # whether to return token type ids
return_attention_mask: Optional[bool] = None,  # whether to return the attention mask
return_overflowing_tokens: bool = False,  # whether to return overflowing tokens
return_special_tokens_mask: bool = False,  # whether to return the special tokens mask
return_offsets_mapping: bool = False,  # whether to return the offsets mapping
return_length: bool = False,  # whether to return the lengths
verbose: bool = True,  # whether to print verbose information
**kwargs,  # additional keyword arguments
) -> BatchEncoding:  # the method returns a BatchEncoding object
# Offset mappings are only available with fast (Rust) tokenizers, so raise if they are requested
if return_offsets_mapping:
raise NotImplementedError(
"return_offset_mapping is not available when using Python tokenizers. "
"To use this feature, change your tokenizer to one deriving from "
"transformers.PreTrainedTokenizerFast."
)
# Prepare the whole batch for the model via _batch_prepare_for_model
batch_outputs = self._batch_prepare_for_model(
batch_text_or_text_pairs=batch_text_or_text_pairs,
is_pair=is_pair,
boxes=boxes,
word_labels=word_labels,
add_special_tokens=add_special_tokens,
padding_strategy=padding_strategy,
truncation_strategy=truncation_strategy,
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
return_attention_mask=return_attention_mask,
return_token_type_ids=return_token_type_ids,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_length=return_length,
return_tensors=return_tensors,
verbose=verbose,
)
# Wrap the batch outputs in a BatchEncoding object and return it
return BatchEncoding(batch_outputs)
# Append the shared kwargs docstrings to the method below
@add_end_docstrings(LAYOUTLMV2_ENCODE_KWARGS_DOCSTRING, LAYOUTLMV2_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
# Prepare a batch of texts or text pairs so that it can be consumed by the model
def _batch_prepare_for_model(
self,
batch_text_or_text_pairs,  # the batch of texts or text pairs
is_pair: bool = None,  # whether the inputs are text pairs
boxes: Optional[List[List[int]]] = None,  # word-level bounding boxes (optional)
word_labels: Optional[List[List[int]]] = None,  # word-level labels (optional)
add_special_tokens: bool = True,  # whether to add special tokens
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,  # padding strategy
truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,  # truncation strategy
max_length: Optional[int] = None,  # maximum length (optional)
stride: int = 0,  # stride for overflowing tokens
pad_to_multiple_of: Optional[int] = None,  # pad to a multiple of this value (optional)
return_tensors: Optional[str] = None,  # tensor type to return (optional)
return_token_type_ids: Optional[bool] = None,  # whether to return token type ids (optional)
return_attention_mask: Optional[bool] = None,  # whether to return the attention mask (optional)
return_overflowing_tokens: bool = False,  # whether to return overflowing tokens
return_special_tokens_mask: bool = False,  # whether to return the special tokens mask
return_length: bool = False,  # whether to return the lengths
verbose: bool = True,  # whether to print verbose information
) -> BatchEncoding:
"""
Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model. It
adds special tokens, truncates sequences if overflowing while taking into account the special tokens and
manages a moving window (with user defined stride) for overflowing tokens.
Args:
batch_ids_pairs: list of tokenized input ids or input ids pairs
"""
# Initialize an empty dictionary to store batch outputs
batch_outputs = {}
# Iterate over each example in the batch, consisting of text or text pairs and corresponding boxes
for idx, example in enumerate(zip(batch_text_or_text_pairs, boxes)):
batch_text_or_text_pair, boxes_example = example
# Determine if the current example is a single text or a pair of texts
if is_pair:
input_ids_or_pair = batch_text_or_text_pair[0] # First sequence of input ids
else:
input_ids_or_pair = batch_text_or_text_pair # Single sequence of input ids
# Prepare inputs for the model using the specified parameters
outputs = self.prepare_for_model(
input_ids_or_pair,
batch_text_or_text_pair[1] if is_pair else None, # Second sequence of input ids if it exists
boxes_example,
word_labels=word_labels[idx] if word_labels is not None else None,
add_special_tokens=add_special_tokens,
padding=PaddingStrategy.DO_NOT_PAD.value, # Do not pad here; it's done in batch
truncation=truncation_strategy.value,
max_length=max_length,
stride=stride,
pad_to_multiple_of=None, # Pad in batch afterward
return_attention_mask=False, # Do not return attention masks here; it's done in batch
return_token_type_ids=return_token_type_ids,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_length=return_length,
return_tensors=None, # Convert to tensors at the end
prepend_batch_axis=False,
verbose=verbose,
)
# Aggregate outputs into batch_outputs dictionary
for key, value in outputs.items():
if key not in batch_outputs:
batch_outputs[key] = []
batch_outputs[key].append(value)
# Perform padding across the batch
batch_outputs = self.pad(
batch_outputs,
padding=padding_strategy.value,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
return_attention_mask=return_attention_mask,
)
# Convert batch_outputs to BatchEncoding format
batch_outputs = BatchEncoding(batch_outputs, tensor_type=return_tensors)
# Return the final prepared batch_outputs
return batch_outputs
@add_end_docstrings(LAYOUTLMV2_ENCODE_KWARGS_DOCSTRING)
# The encode method turns the inputs into model-ready features and returns only the list of input ids
def encode(
self,
text: Union[TextInput, PreTokenizedInput],  # the main input, either raw text or pre-tokenized words
text_pair: Optional[PreTokenizedInput] = None,  # optional second input for sequence-pair tasks
boxes: Optional[List[List[int]]] = None,  # word-level bounding boxes
word_labels: Optional[List[int]] = None,  # word-level labels for token classification
add_special_tokens: bool = True,  # whether to add special tokens such as [CLS] and [SEP]
padding: Union[bool, str, PaddingStrategy] = False,  # padding behaviour
truncation: Union[bool, str, TruncationStrategy] = None,  # truncation behaviour
max_length: Optional[int] = None,  # maximum sequence length
stride: int = 0,  # stride of the sliding window for overflowing tokens
pad_to_multiple_of: Optional[int] = None,  # pad to a multiple of this value
return_tensors: Optional[Union[str, TensorType]] = None,  # tensor type to return (e.g. "pt" for PyTorch)
return_token_type_ids: Optional[bool] = None,  # whether to return token type ids
return_attention_mask: Optional[bool] = None,  # whether to return the attention mask
return_overflowing_tokens: bool = False,  # whether to return overflowing tokens
return_special_tokens_mask: bool = False,  # whether to return the special tokens mask
return_offsets_mapping: bool = False,  # whether to return character offset mappings
return_length: bool = False,  # whether to return the encoded length
verbose: bool = True,  # whether to print verbose information
**kwargs,  # additional keyword arguments
) -> List[int]:  # returns the list of encoded input ids
# Encode the inputs with encode_plus and get back the full encoding dictionary
encoded_inputs = self.encode_plus(
text=text,
text_pair=text_pair,
boxes=boxes,
word_labels=word_labels,
add_special_tokens=add_special_tokens,
padding=padding,
truncation=truncation,
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_length=return_length,
verbose=verbose,
**kwargs,
)
# Return the value of the "input_ids" key, i.e. the encoded input id list
return encoded_inputs["input_ids"]
# Append the shared kwargs docstrings documenting the parameters of encode_plus
@add_end_docstrings(LAYOUTLMV2_ENCODE_KWARGS_DOCSTRING, LAYOUTLMV2_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
def encode_plus(
self,
text: Union[TextInput, PreTokenizedInput],  # the main input, either raw text or pre-tokenized words
text_pair: Optional[PreTokenizedInput] = None,  # optional second input for sequence-pair tasks
boxes: Optional[List[List[int]]] = None,  # word-level bounding boxes
word_labels: Optional[List[int]] = None,  # word-level labels for token classification
add_special_tokens: bool = True,  # whether to add special tokens such as [CLS] and [SEP]
padding: Union[bool, str, PaddingStrategy] = False,  # padding behaviour
truncation: Union[bool, str, TruncationStrategy] = None,  # truncation behaviour
max_length: Optional[int] = None,  # maximum sequence length
stride: int = 0,  # stride of the sliding window for overflowing tokens
pad_to_multiple_of: Optional[int] = None,  # pad to a multiple of this value
return_tensors: Optional[Union[str, TensorType]] = None,  # tensor type to return (e.g. "pt" for PyTorch)
return_token_type_ids: Optional[bool] = None,  # whether to return token type ids
return_attention_mask: Optional[bool] = None,  # whether to return the attention mask
return_overflowing_tokens: bool = False,  # whether to return overflowing tokens
return_special_tokens_mask: bool = False,  # whether to return the special tokens mask
return_offsets_mapping: bool = False,  # whether to return character offset mappings
return_length: bool = False,  # whether to return the encoded length
verbose: bool = True,  # whether to print verbose information
**kwargs,  # additional keyword arguments
) -> BatchEncoding:
"""
Tokenize and prepare for the model a sequence or a pair of sequences. .. warning:: This method is deprecated,
`__call__` should be used instead.
Args:
text (`str`, `List[str]`, `List[List[str]]`):
The first sequence to be encoded. This can be a string, a list of strings or a list of list of strings.
text_pair (`List[str]` or `List[int]`, *optional*):
Optional second sequence to be encoded. This can be a list of strings (words of a single example) or a
list of list of strings (words of a batch of examples).
"""
# Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
# Resolve the padding and truncation strategies and the related arguments
padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
padding=padding,
truncation=truncation,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
verbose=verbose,
**kwargs,
)
# Call _encode_plus to perform the actual encoding
return self._encode_plus(
text=text,
boxes=boxes,
text_pair=text_pair,
word_labels=word_labels,
add_special_tokens=add_special_tokens,
padding_strategy=padding_strategy,
truncation_strategy=truncation_strategy,
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_length=return_length,
verbose=verbose,
**kwargs,
)
# _encode_plus takes the same arguments as encode_plus, with padding/truncation already resolved to strategy enums
def _encode_plus(
self,
text: Union[TextInput, PreTokenizedInput],
text_pair: Optional[PreTokenizedInput] = None,
boxes: Optional[List[List[int]]] = None,
word_labels: Optional[List[int]] = None,
add_special_tokens: bool = True,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs,
) -> BatchEncoding:
if return_offsets_mapping:
# Offset mappings are only available with fast (Rust) tokenizers, so raise if they are requested
raise NotImplementedError(
"return_offset_mapping is not available when using Python tokenizers. "
"To use this feature, change your tokenizer to one deriving from "
"transformers.PreTrainedTokenizerFast. "
"More information on available tokenizers at "
"https://github.com/huggingface/transformers/pull/2674"
)
# Prepare the single example for the model
return self.prepare_for_model(
text=text,
text_pair=text_pair,
boxes=boxes,
word_labels=word_labels,
add_special_tokens=add_special_tokens,
padding=padding_strategy.value,
truncation=truncation_strategy.value,
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
prepend_batch_axis=True,
return_attention_mask=return_attention_mask,
return_token_type_ids=return_token_type_ids,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_length=return_length,
verbose=verbose,
)
@add_end_docstrings(LAYOUTLMV2_ENCODE_KWARGS_DOCSTRING, LAYOUTLMV2_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
def prepare_for_model(
self,
text: Union[TextInput, PreTokenizedInput],
text_pair: Optional[PreTokenizedInput] = None,
boxes: Optional[List[List[int]]] = None,
word_labels: Optional[List[int]] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = None,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
prepend_batch_axis: bool = False,
**kwargs,
):
# Prepares the inputs for the model according to the given arguments
# (see LAYOUTLMV2_ENCODE_KWARGS_DOCSTRING and LAYOUTLMV2_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING for details);
# the method body is omitted in this walkthrough
pass
def truncate_sequences(
self,
ids: List[int],
token_boxes: List[List[int]],
pair_ids: Optional[List[int]] = None,
pair_token_boxes: Optional[List[List[int]]] = None,
labels: Optional[List[int]] = None,
num_tokens_to_remove: int = 0,
truncation_strategy: Union[str, TruncationStrategy] = "longest_first",
stride: int = 0,
):
# The private `_pad` method (its body is omitted here) pads encoded inputs up to a target length:
#   encoded_inputs: a dictionary of encoded inputs for a single example, or a BatchEncoding
#   max_length: optional maximum length to pad to
#   padding_strategy: the padding strategy, no padding by default
#   pad_to_multiple_of: optional value; the padded length will be a multiple of it
#   return_attention_mask: optional flag controlling whether an attention mask is returned
# Copied from transformers.models.bert.tokenization_bert.BasicTokenizer
class BasicTokenizer(object):
"""
构建一个BasicTokenizer对象,用于执行基本的分词(如标点符号分割、转换为小写等)。
Args:
do_lower_case (`bool`, *可选*, 默认为 `True`):
在分词时是否将输入转换为小写。
never_split (`Iterable`, *可选*):
在分词时永远不会被分割的token集合。仅在`do_basic_tokenize=True`时生效。
tokenize_chinese_chars (`bool`, *可选*, 默认为 `True`):
是否分词中文字符。
对于日语,这可能需要禁用(参见这个
[issue](https://github.com/huggingface/transformers/issues/328)).
strip_accents (`bool`, *可选*):
是否去除所有的重音符号。如果没有指定此选项,则会由`lowercase`的值来确定(与原始BERT一样)。
do_split_on_punc (`bool`, *可选*, 默认为 `True`):
在某些情况下,我们希望跳过基本的标点符号分割,以便后续的分词可以捕捉到单词的完整上下文,例如缩写词。
"""
def __init__(
self,
do_lower_case=True,
never_split=None,
tokenize_chinese_chars=True,
strip_accents=None,
do_split_on_punc=True,
):
if never_split is None:
never_split = []
self.do_lower_case = do_lower_case  # whether to lowercase the input
self.never_split = set(never_split)  # tokens that must never be split, stored as a set
self.tokenize_chinese_chars = tokenize_chinese_chars  # whether to split Chinese characters
self.strip_accents = strip_accents  # whether to strip accents
self.do_split_on_punc = do_split_on_punc  # whether to split on punctuation
def tokenize(self, text, never_split=None):
"""
Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
Args:
never_split (`List[str]`, *optional*)
Kept for backward compatibility purposes. Now implemented directly at the base class level (see
[`PreTrainedTokenizer.tokenize`]) List of token not to split.
"""
# union() returns a new set by concatenating the two sets.
# If never_split was provided, merge it with self.never_split into a single set of tokens to keep intact
never_split = self.never_split.union(set(never_split)) if never_split else self.never_split
# Clean the text, e.g. remove invalid characters and normalize whitespace
text = self._clean_text(text)
# This was added on November 1st, 2018 for the multilingual and Chinese
# models. This is also applied to the English models now, but it doesn't
# matter since the English models were not trained on any Chinese data
# and generally don't have any Chinese data in them (there are Chinese
# characters in the vocabulary because Wikipedia does have some Chinese
# words in the English Wikipedia.).
# If tokenize_chinese_chars is set, surround CJK characters with whitespace
if self.tokenize_chinese_chars:
text = self._tokenize_chinese_chars(text)
# Normalize the text with Unicode NFC so that different encodings of the same character compare equal
unicode_normalized_text = unicodedata.normalize("NFC", text)
# Split the text on whitespace to obtain the original tokens
orig_tokens = whitespace_tokenize(unicode_normalized_text)
split_tokens = []
# Process each token, lowercasing/stripping accents and splitting as needed
for token in orig_tokens:
if token not in never_split:
if self.do_lower_case:
# If lowercasing is enabled, lowercase the token
token = token.lower()
if self.strip_accents is not False:
# Unless strip_accents is explicitly False, remove accents from the token
token = self._run_strip_accents(token)
elif self.strip_accents:
# Otherwise strip accents only if explicitly requested
token = self._run_strip_accents(token)
# Split the processed token on punctuation and extend the result list
split_tokens.extend(self._run_split_on_punc(token, never_split))
# Re-join and re-split on whitespace to obtain the final output tokens
output_tokens = whitespace_tokenize(" ".join(split_tokens))
return output_tokens
def _run_strip_accents(self, text):
"""Strips accents from a piece of text."""
# Normalize the text with Unicode NFD so that accents are represented as separate combining characters
text = unicodedata.normalize("NFD", text)
output = []
# Skip every character whose Unicode category is Mn (Nonspacing_Mark); keep all other characters
for char in text:
cat = unicodedata.category(char)
if cat == "Mn":
continue
output.append(char)
# Join the remaining characters back into a string and return it
return "".join(output)
def _run_split_on_punc(self, text, never_split=None):
"""Splits punctuation on a piece of text."""
# If punctuation splitting is disabled or the text is in never_split, return it unchanged as a one-element list
if not self.do_split_on_punc or (never_split is not None and text in never_split):
return [text]
# Turn the text into a list of characters
chars = list(text)
i = 0
start_new_word = True
output = []
# Walk over the characters
while i < len(chars):
char = chars[i]
# A punctuation character becomes its own output item, and the next character starts a new word
if _is_punctuation(char):
output.append([char])
start_new_word = True
else:
# A non-punctuation character
if start_new_word:
output.append([])  # start a new (empty) word
start_new_word = False  # the following characters belong to this word
output[-1].append(char)  # append the character to the current word
i += 1
# Join each character list into a string and return the list of pieces
return ["".join(x) for x in output]
def _tokenize_chinese_chars(self, text):
"""Adds whitespace around any CJK character."""
output = []
# Walk over every character in the text
for char in text:
cp = ord(char)
# If the character is a CJK character, surround it with spaces
if self._is_chinese_char(cp):
output.append(" ")
output.append(char)
output.append(" ")
else:
output.append(char)  # otherwise keep the character as-is
# Join the character list back into a string and return it
return "".join(output)
def _is_chinese_char(self, cp):
"""Checks whether CP is the codepoint of a CJK character."""
# Check whether the codepoint falls inside one of the CJK Unicode blocks
if (
(cp >= 0x4E00 and cp <= 0x9FFF)
or (cp >= 0x3400 and cp <= 0x4DBF)  #
or (cp >= 0x20000 and cp <= 0x2A6DF)  #
or (cp >= 0x2A700 and cp <= 0x2B73F)  #
or (cp >= 0x2B740 and cp <= 0x2B81F)  #
or (cp >= 0x2B820 and cp <= 0x2CEAF)  #
or (cp >= 0xF900 and cp <= 0xFAFF)
or (cp >= 0x2F800 and cp <= 0x2FA1F)  #
):  #
return True  # the codepoint is a CJK character
return False  # otherwise it is not
def _clean_text(self, text):
"""Performs invalid character removal and whitespace cleanup on text."""
output = []
# Walk over every character in the text
for char in text:
cp = ord(char)
# Skip invalid characters and control characters
if cp == 0 or cp == 0xFFFD or _is_control(char):
continue
# Replace any whitespace character with a single space, keep everything else as-is
if _is_whitespace(char):
output.append(" ")
else:
output.append(char)
# Join the character list back into a string and return it
return "".join(output)
# Copied from transformers.models.bert.tokenization_bert.WordpieceTokenizer
class WordpieceTokenizer(object):
"""Runs WordPiece tokenization."""
def __init__(self, vocab, unk_token, max_input_chars_per_word=100):
# Initialize the WordpieceTokenizer with a vocabulary, the unknown token and the maximum characters per word
self.vocab = vocab
self.unk_token = unk_token
self.max_input_chars_per_word = max_input_chars_per_word
def tokenize(self, text):
"""
Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
tokenization using the given vocabulary.
For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`.
Args:
text: A single token or whitespace separated tokens. This should have
already been passed through *BasicTokenizer*.
Returns:
A list of wordpiece tokens.
"""
# Initialize the output token list
output_tokens = []
# Split the input text on whitespace and process each token in turn
for token in whitespace_tokenize(text):
# Turn the current token into a list of characters
chars = list(token)
# If the token is longer than the allowed maximum, emit the unknown token and move on
if len(chars) > self.max_input_chars_per_word:
output_tokens.append(self.unk_token)
continue
# Initialize the failure flag, the start position and the sub-token list
is_bad = False
start = 0
sub_tokens = []
# Keep consuming characters until the whole token has been processed
while start < len(chars):
end = len(chars)
cur_substr = None
# Shrink the candidate substring from the right until the longest piece present in the vocabulary is found
while start < end:
substr = "".join(chars[start:end])
# Pieces that do not start the word are prefixed with "##"
if start > 0:
substr = "##" + substr
# If the candidate is in the vocabulary, keep it and stop the inner loop
if substr in self.vocab:
cur_substr = substr
break
end -= 1
# If no piece was found in the vocabulary, mark the token as bad and stop
if cur_substr is None:
is_bad = True
break
# Record the piece that was found
sub_tokens.append(cur_substr)
# Continue from the end of the matched piece
start = end
# Emit either the unknown token or the collected word pieces
if is_bad:
output_tokens.append(self.unk_token)
else:
output_tokens.extend(sub_tokens)
# Return the final list of wordpiece tokens
return output_tokens
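# A minimal sketch of the greedy longest-match-first behaviour with a toy vocabulary:
# vocab = {"un": 0, "##aff": 1, "##able": 2, "[UNK]": 3}
# wp = WordpieceTokenizer(vocab=vocab, unk_token="[UNK]")
# wp.tokenize("unaffable")   -> ['un', '##aff', '##able']
# wp.tokenize("unreadable")  -> ['[UNK]']   # no matching pieces after "un", so the whole word maps to [UNK]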
.\models\layoutlmv2\tokenization_layoutlmv2_fast.py
"""
LayoutLMv2 的快速分词器类。覆盖了慢分词器类的两个方法:_batch_encode_plus 和 _encode_plus,其中使用了 Rust 分词器。
"""
import json
from typing import Dict, List, Optional, Tuple, Union
from tokenizers import normalizers
from ...tokenization_utils_base import (
BatchEncoding,
EncodedInput,
PaddingStrategy,
PreTokenizedInput,
TensorType,
TextInput,
TextInputPair,
TruncationStrategy,
)
from ...tokenization_utils_fast import PreTrainedTokenizerFast
from ...utils import add_end_docstrings, logging
from .tokenization_layoutlmv2 import (
LAYOUTLMV2_ENCODE_KWARGS_DOCSTRING,
LAYOUTLMV2_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING,
LayoutLMv2Tokenizer,
)
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"microsoft/layoutlmv2-base-uncased": (
"https://huggingface.co/microsoft/layoutlmv2-base-uncased/resolve/main/vocab.txt"
),
},
"tokenizer_file": {
"microsoft/layoutlmv2-base-uncased": (
"https://huggingface.co/microsoft/layoutlmv2-base-uncased/resolve/main/tokenizer.json"
),
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"microsoft/layoutlmv2-base-uncased": 512,
}
PRETRAINED_INIT_CONFIGURATION = {
"microsoft/layoutlmv2-base-uncased": {"do_lower_case": True},
}
class LayoutLMv2TokenizerFast(PreTrainedTokenizerFast):
r"""
Construct a "fast" LayoutLMv2 tokenizer (backed by HuggingFace's *tokenizers* library). Based on WordPiece.
This tokenizer inherits from [`PreTrainedTokenizerFast`], which contains most of the main methods. Users should
refer to this superclass for more information regarding those methods.
"""
# Vocabulary file names, taken from the module-level constant
vocab_files_names = VOCAB_FILES_NAMES
# Map from vocabulary file names to pretrained vocabulary file URLs
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
# Default initialization configuration of the pretrained models
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
# Maximum input sizes, taken from the positional embedding sizes of the pretrained models
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
# The corresponding slow tokenizer class
slow_tokenizer_class = LayoutLMv2Tokenizer
# Constructor of the LayoutLMv2TokenizerFast object
def __init__(
self,
vocab_file=None,
tokenizer_file=None,
do_lower_case=True,
unk_token="[UNK]",
sep_token="[SEP]",
pad_token="[PAD]",
cls_token="[CLS]",
mask_token="[MASK]",
cls_token_box=[0, 0, 0, 0],
sep_token_box=[1000, 1000, 1000, 1000],
pad_token_box=[0, 0, 0, 0],
pad_token_label=-100,
only_label_first_subword=True,
tokenize_chinese_chars=True,
strip_accents=None,
**kwargs,
):
# Call the parent constructor to set the shared attributes
super().__init__(
vocab_file,
tokenizer_file=tokenizer_file,
do_lower_case=do_lower_case,
unk_token=unk_token,
sep_token=sep_token,
pad_token=pad_token,
cls_token=cls_token,
mask_token=mask_token,
cls_token_box=cls_token_box,
sep_token_box=sep_token_box,
pad_token_box=pad_token_box,
pad_token_label=pad_token_label,
only_label_first_subword=only_label_first_subword,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
**kwargs,
)
# Read the serialized state of the backend tokenizer's normalizer
pre_tok_state = json.loads(self.backend_tokenizer.normalizer.__getstate__())
# If the normalizer's lowercase / strip_accents options disagree with the given arguments, rebuild the normalizer
if (
pre_tok_state.get("lowercase", do_lower_case) != do_lower_case
or pre_tok_state.get("strip_accents", strip_accents) != strip_accents
):
# Look up the normalizer class and update its parameters
pre_tok_class = getattr(normalizers, pre_tok_state.pop("type"))
pre_tok_state["lowercase"] = do_lower_case
pre_tok_state["strip_accents"] = strip_accents
# Instantiate the new normalizer object
self.backend_tokenizer.normalizer = pre_tok_class(**pre_tok_state)
# Set the instance attributes
self.do_lower_case = do_lower_case
self.cls_token_box = cls_token_box
self.sep_token_box = sep_token_box
self.pad_token_box = pad_token_box
self.pad_token_label = pad_token_label
self.only_label_first_subword = only_label_first_subword
# Use the decorator to append the documentation from LAYOUTLMV2_ENCODE_KWARGS_DOCSTRING and
# LAYOUTLMV2_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING to the method below
@add_end_docstrings(LAYOUTLMV2_ENCODE_KWARGS_DOCSTRING, LAYOUTLMV2_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
def batch_encode_plus(
self,
batch_text_or_text_pairs: Union[
List[TextInput],
List[TextInputPair],
List[PreTokenizedInput],
],
is_pair: bool = None,
boxes: Optional[List[List[List[int]]]] = None,
word_labels: Optional[Union[List[int], List[List[int]]]] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = None,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs,
) -> BatchEncoding:
# The batch_encode_plus method batch-encodes texts or text pairs and returns the encoded result.
# Backward compatibility for the legacy 'truncation_strategy' and 'pad_to_max_length' arguments:
# resolve the padding and truncation strategies together with the remaining parameters
padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
padding=padding,
truncation=truncation,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
verbose=verbose,
**kwargs,
)
# Delegate to the internal batch-encoding method and return its result
return self._batch_encode_plus(
batch_text_or_text_pairs=batch_text_or_text_pairs,
is_pair=is_pair,
boxes=boxes,
word_labels=word_labels,
add_special_tokens=add_special_tokens,
padding_strategy=padding_strategy,
truncation_strategy=truncation_strategy,
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_length=return_length,
verbose=verbose,
**kwargs,
)
def tokenize(self, text: str, pair: Optional[str] = None, add_special_tokens: bool = False, **kwargs) -> List[str]:
# Wrap the input text (and the optional pair) into a one-element batch
batched_input = [(text, pair)] if pair else [text]
# Encode the batch with the backend (Rust) tokenizer
encodings = self._tokenizer.encode_batch(
batched_input, add_special_tokens=add_special_tokens, is_pretokenized=False, **kwargs
)
# Return the tokens attribute of the first encoding, i.e. the list of tokens
return encodings[0].tokens
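# For example (a sketch, assuming the standard uncased vocabulary of microsoft/layoutlmv2-base-uncased):
# tokenizer.tokenize("hello world")  ->  ['hello', 'world']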
@add_end_docstrings(LAYOUTLMV2_ENCODE_KWARGS_DOCSTRING, LAYOUTLMV2_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
def encode_plus(
self,
text: Union[TextInput, PreTokenizedInput],
text_pair: Optional[PreTokenizedInput] = None,
boxes: Optional[List[List[int]]] = None,
word_labels: Optional[List[int]] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = None,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs,
) -> BatchEncoding:
# Encodes a single text (with an optional text pair) together with its boxes and word labels,
# adding special tokens by default, applying the padding and truncation settings,
# and returning the encoded result, optionally as tensors
"""
Tokenize and prepare for the model a sequence or a pair of sequences. .. warning:: This method is deprecated,
`__call__` should be used instead.
Args:
text (`str`, `List[str]`, `List[List[str]]`):
The first sequence to be encoded. This can be a string, a list of strings or a list of list of strings.
text_pair (`List[str]` or `List[int]`, *optional*):
Optional second sequence to be encoded. This can be a list of strings (words of a single example) or a
list of list of strings (words of a batch of examples).
"""
# Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
# Resolve the padding and truncation strategies and the related arguments
padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
padding=padding,
truncation=truncation,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
verbose=verbose,
**kwargs,
)
# Call the internal _encode_plus method to perform the encoding
return self._encode_plus(
text=text,
boxes=boxes,
text_pair=text_pair,
word_labels=word_labels,
add_special_tokens=add_special_tokens,
padding_strategy=padding_strategy,
truncation_strategy=truncation_strategy,
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_length=return_length,
verbose=verbose,
**kwargs,
)
def _batch_encode_plus(
self,
batch_text_or_text_pairs: Union[
List[TextInput],
List[TextInputPair],
List[PreTokenizedInput],
],
is_pair: bool = None,
boxes: Optional[List[List[List[int]]]] = None,
word_labels: Optional[List[List[int]]] = None,
add_special_tokens: bool = True,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[str] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
) -> BatchEncoding:
# (The body of _batch_encode_plus, which runs the batch through the Rust tokenizer and builds the
# token-level bounding boxes and labels, is omitted in this walkthrough.)
def _encode_plus(
self,
text: Union[TextInput, PreTokenizedInput],  # the input, either raw text or pre-tokenized words
text_pair: Optional[PreTokenizedInput] = None,  # optional second input for text pairs
boxes: Optional[List[List[int]]] = None,  # optional word-level bounding boxes
word_labels: Optional[List[int]] = None,  # optional word-level labels
add_special_tokens: bool = True,  # whether to add special tokens, True by default
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,  # padding strategy, no padding by default
truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,  # truncation strategy, no truncation by default
max_length: Optional[int] = None,  # optional maximum length
stride: int = 0,  # stride, 0 by default
pad_to_multiple_of: Optional[int] = None,  # optionally pad to a multiple of this value
return_tensors: Optional[bool] = None,  # optional tensor type to return
return_token_type_ids: Optional[bool] = None,  # whether to return token type ids
return_attention_mask: Optional[bool] = None,  # whether to return the attention mask
return_overflowing_tokens: bool = False,  # whether to return overflowing tokens, False by default
return_special_tokens_mask: bool = False,  # whether to return the special tokens mask, False by default
return_offsets_mapping: bool = False,  # whether to return the offsets mapping, False by default
return_length: bool = False,  # whether to return the length, False by default
verbose: bool = True,  # whether to print verbose information, True by default
**kwargs,  # additional keyword arguments
) -> BatchEncoding:
# Turn the single input into a batched input.
# There are two options:
# 1) only text, in which case text must be a list of strings
# 2) text + text_pair, in which case text is a string and text_pair a list of strings
batched_input = [(text, text_pair)] if text_pair else [text]
# Wrap the bounding boxes into a batch
batched_boxes = [boxes]
# Wrap the word-level labels into a batch
batched_word_labels = [word_labels] if word_labels is not None else None
# Encode the one-element batch with _batch_encode_plus
batched_output = self._batch_encode_plus(
batched_input,
is_pair=bool(text_pair is not None),  # whether the input is a text pair
boxes=batched_boxes,  # batched bounding boxes
word_labels=batched_word_labels,  # batched word-level labels
add_special_tokens=add_special_tokens,  # whether to add special tokens
padding_strategy=padding_strategy,  # padding strategy
truncation_strategy=truncation_strategy,  # truncation strategy
max_length=max_length,  # maximum length
stride=stride,  # stride
pad_to_multiple_of=pad_to_multiple_of,  # pad to a multiple of this value
return_tensors=return_tensors,  # tensor type to return
return_token_type_ids=return_token_type_ids,  # whether to return token type ids
return_attention_mask=return_attention_mask,  # whether to return the attention mask
return_overflowing_tokens=return_overflowing_tokens,  # whether to return overflowing tokens
return_special_tokens_mask=return_special_tokens_mask,  # whether to return the special tokens mask
return_offsets_mapping=return_offsets_mapping,  # whether to return the offsets mapping
return_length=return_length,  # whether to return the length
verbose=verbose,  # whether to print verbose information
**kwargs,  # additional keyword arguments
)
# If no tensors are requested and no overflowing tokens are returned, remove the leading batch axis
# from the output; overflowing tokens are returned as a batch, so in that case the batch axis is kept
if return_tensors is None and not return_overflowing_tokens:
batched_output = BatchEncoding(
{
key: value[0] if len(value) > 0 and isinstance(value[0], list) else value
for key, value in batched_output.items()
},
batched_output.encodings,
)
# Warn if the processed sequence is too long for the model
self._eventual_warn_about_too_long_sequence(batched_output["input_ids"], max_length, verbose)
# Return the processed output
return batched_output
def _pad(
self,
encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
max_length: Optional[int] = None,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
pad_to_multiple_of: Optional[int] = None,
return_attention_mask: Optional[bool] = None,
):
"""
Pad encoded inputs according to specified parameters.
Args:
encoded_inputs (Union[Dict[str, EncodedInput], BatchEncoding]):
Dictionary or batch encoding containing encoded inputs.
max_length (Optional[int], *optional*):
Maximum length to pad or truncate the sequences.
padding_strategy (PaddingStrategy):
Strategy for padding the sequences.
pad_to_multiple_of (Optional[int], *optional*):
Pad to a multiple of this value.
return_attention_mask (Optional[bool], *optional*):
Whether to return attention mask.
Returns:
Union[Dict[str, torch.Tensor], BatchEncoding]:
Padded and encoded inputs.
"""
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
"""
Build model inputs from a sequence or a pair of sequences by adding special tokens.
Args:
token_ids_0 (List[int]):
List of IDs for the first sequence.
token_ids_1 (List[int], *optional*):
Optional list of IDs for the second sequence.
Returns:
List[int]: List of input IDs with added special tokens.
"""
output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
if token_ids_1:
output += token_ids_1 + [self.sep_token_id]
return output
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create token type IDs from sequences for sequence-pair classification tasks.
Args:
token_ids_0 (List[int]):
List of IDs for the first sequence.
token_ids_1 (List[int], *optional*):
Optional list of IDs for the second sequence.
Returns:
List[int]: List of token type IDs indicating the sequence segments.
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
"""
Save the vocabulary of the tokenizer model.
Args:
save_directory (str):
Directory to save the vocabulary files.
filename_prefix (Optional[str], *optional*):
Prefix for the vocabulary filenames.
Returns:
Tuple[str]: Tuple containing the paths of the saved files.
"""
files = self._tokenizer.model.save(save_directory, name=filename_prefix)
return tuple(files)
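As a usage illustration, here is a minimal sketch (not part of the source file) of encoding pre-tokenized words with their normalized bounding boxes using the fast tokenizer; it assumes the public checkpoint `microsoft/layoutlmv2-base-uncased`, and the word boxes are made-up values on the 0-1000 scale:

from transformers import LayoutLMv2TokenizerFast

tokenizer = LayoutLMv2TokenizerFast.from_pretrained("microsoft/layoutlmv2-base-uncased")
words = ["hello", "world"]
boxes = [[637, 773, 693, 782], [698, 773, 733, 782]]  # one normalized box per word

encoding = tokenizer(words, boxes=boxes, return_tensors="pt")
print(encoding["input_ids"].shape)  # torch.Size([1, 4]) -> [CLS] hello world [SEP]
print(encoding["bbox"][0])          # [0,0,0,0] for [CLS], the word boxes, [1000,1000,1000,1000] for [SEP]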
.\models\layoutlmv2\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_tokenizers_available,
is_torch_available,
is_vision_available,
)
_import_structure = {
"configuration_layoutlmv2": ["LAYOUTLMV2_PRETRAINED_CONFIG_ARCHIVE_MAP", "LayoutLMv2Config"],
"processing_layoutlmv2": ["LayoutLMv2Processor"],
"tokenization_layoutlmv2": ["LayoutLMv2Tokenizer"],
}
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_layoutlmv2_fast"] = ["LayoutLMv2TokenizerFast"]
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["feature_extraction_layoutlmv2"] = ["LayoutLMv2FeatureExtractor"]
_import_structure["image_processing_layoutlmv2"] = ["LayoutLMv2ImageProcessor"]
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_layoutlmv2"] = [
"LAYOUTLMV2_PRETRAINED_MODEL_ARCHIVE_LIST",
"LayoutLMv2ForQuestionAnswering",
"LayoutLMv2ForSequenceClassification",
"LayoutLMv2ForTokenClassification",
"LayoutLMv2Layer",
"LayoutLMv2Model",
"LayoutLMv2PreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_layoutlmv2 import LAYOUTLMV2_PRETRAINED_CONFIG_ARCHIVE_MAP, LayoutLMv2Config
from .processing_layoutlmv2 import LayoutLMv2Processor
from .tokenization_layoutlmv2 import LayoutLMv2Tokenizer
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_layoutlmv2_fast import LayoutLMv2TokenizerFast
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .feature_extraction_layoutlmv2 import LayoutLMv2FeatureExtractor, LayoutLMv2ImageProcessor
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_layoutlmv2 import (
LAYOUTLMV2_PRETRAINED_MODEL_ARCHIVE_LIST,
LayoutLMv2ForQuestionAnswering,
LayoutLMv2ForSequenceClassification,
LayoutLMv2ForTokenClassification,
LayoutLMv2Layer,
LayoutLMv2Model,
LayoutLMv2PreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
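Because the module is registered lazily, importing the package does not eagerly load torch, tokenizers, or the vision dependencies; each submodule is only resolved when one of its symbols is first accessed. A minimal usage sketch, assuming `transformers` is installed:

```
# Importing the config requires neither torch nor Pillow; the lazy module only resolves
# configuration_layoutlmv2 at this point.
from transformers import LayoutLMv2Config

config = LayoutLMv2Config()
print(config.model_type)  # "layoutlmv2"
```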
.\models\layoutlmv3\configuration_layoutlmv3.py
""" LayoutLMv3 model configuration"""
from collections import OrderedDict
from typing import TYPE_CHECKING, Any, Mapping, Optional
from packaging import version
from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfig
from ...onnx.utils import compute_effective_axis_dimension
from ...utils import logging
if TYPE_CHECKING:
from ...processing_utils import ProcessorMixin
from ...utils import TensorType
logger = logging.get_logger(__name__)
LAYOUTLMV3_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"microsoft/layoutlmv3-base": "https://huggingface.co/microsoft/layoutlmv3-base/resolve/main/config.json",
}
class LayoutLMv3Config(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`LayoutLMv3Model`]. It is used to instantiate an
LayoutLMv3 model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the LayoutLMv3
[microsoft/layoutlmv3-base](https://huggingface.co/microsoft/layoutlmv3-base) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Example:
```
>>> from transformers import LayoutLMv3Config, LayoutLMv3Model
>>> # Initializing a LayoutLMv3 microsoft/layoutlmv3-base style configuration
>>> configuration = LayoutLMv3Config()
>>> # Initializing a model (with random weights) from the microsoft/layoutlmv3-base style configuration
>>> model = LayoutLMv3Model(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
model_type = "layoutlmv3"
def __init__(
self,
vocab_size=50265,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=2,
initializer_range=0.02,
layer_norm_eps=1e-5,
pad_token_id=1,
bos_token_id=0,
eos_token_id=2,
max_2d_position_embeddings=1024,
coordinate_size=128,
shape_size=128,
has_relative_attention_bias=True,
rel_pos_bins=32,
max_rel_pos=128,
rel_2d_pos_bins=64,
max_rel_2d_pos=256,
has_spatial_attention_bias=True,
text_embed=True,
visual_embed=True,
input_size=224,
num_channels=3,
patch_size=16,
classifier_dropout=None,
**kwargs,
):
super().__init__(
vocab_size=vocab_size,
hidden_size=hidden_size,
num_hidden_layers=num_hidden_layers,
num_attention_heads=num_attention_heads,
intermediate_size=intermediate_size,
hidden_act=hidden_act,
hidden_dropout_prob=hidden_dropout_prob,
attention_probs_dropout_prob=attention_probs_dropout_prob,
max_position_embeddings=max_position_embeddings,
type_vocab_size=type_vocab_size,
initializer_range=initializer_range,
layer_norm_eps=layer_norm_eps,
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
**kwargs,
)
self.max_2d_position_embeddings = max_2d_position_embeddings
self.coordinate_size = coordinate_size
self.shape_size = shape_size
self.has_relative_attention_bias = has_relative_attention_bias
self.rel_pos_bins = rel_pos_bins
self.max_rel_pos = max_rel_pos
self.has_spatial_attention_bias = has_spatial_attention_bias
self.rel_2d_pos_bins = rel_2d_pos_bins
self.max_rel_2d_pos = max_rel_2d_pos
self.text_embed = text_embed
self.visual_embed = visual_embed
self.input_size = input_size
self.num_channels = num_channels
self.patch_size = patch_size
self.classifier_dropout = classifier_dropout
class LayoutLMv3OnnxConfig(OnnxConfig):
torch_onnx_minimum_version = version.parse("1.12")
@property
def inputs(self) -> Mapping[str, Mapping[int, str]]:
if self.task in ["question-answering", "sequence-classification"]:
return OrderedDict(
[
("input_ids", {0: "batch", 1: "sequence"}),
("attention_mask", {0: "batch", 1: "sequence"}),
("bbox", {0: "batch", 1: "sequence"}),
("pixel_values", {0: "batch", 1: "num_channels", 2: "height", 3: "width"}),
]
)
else:
return OrderedDict(
[
("input_ids", {0: "batch", 1: "sequence"}),
("bbox", {0: "batch", 1: "sequence"}),
("attention_mask", {0: "batch", 1: "sequence"}),
("pixel_values", {0: "batch", 1: "num_channels"}),
]
)
@property
def atol_for_validation(self) -> float:
return 1e-5
@property
def default_onnx_opset(self) -> int:
return 12
def generate_dummy_inputs(
self,
processor: "ProcessorMixin",
batch_size: int = -1,
seq_length: int = -1,
is_pair: bool = False,
framework: Optional["TensorType"] = None,
num_channels: int = 3,
image_width: int = 40,
image_height: int = 40,
) -> Mapping[str, Any]:
"""
Generate inputs to provide to the ONNX exporter for the specific framework
Args:
processor ([`ProcessorMixin`]):
The processor associated with this model configuration.
batch_size (`int`, *optional*, defaults to -1):
The batch size to export the model for (-1 means dynamic axis).
seq_length (`int`, *optional*, defaults to -1):
The sequence length to export the model for (-1 means dynamic axis).
is_pair (`bool`, *optional*, defaults to `False`):
Indicate if the input is a pair (sentence 1, sentence 2).
framework (`TensorType`, *optional*, defaults to `None`):
The framework (PyTorch or TensorFlow) that the processor will generate tensors for.
num_channels (`int`, *optional*, defaults to 3):
The number of channels of the generated images.
image_width (`int`, *optional*, defaults to 40):
The width of the generated images.
image_height (`int`, *optional*, defaults to 40):
The height of the generated images.
Returns:
Mapping[str, Any]: holding the kwargs to provide to the model's forward function
"""
setattr(processor.image_processor, "apply_ocr", False)
batch_size = compute_effective_axis_dimension(
batch_size, fixed_dimension=OnnxConfig.default_fixed_batch, num_token_to_add=0
)
token_to_add = processor.tokenizer.num_special_tokens_to_add(is_pair)
seq_length = compute_effective_axis_dimension(
seq_length, fixed_dimension=OnnxConfig.default_fixed_sequence, num_token_to_add=token_to_add
)
dummy_text = [[" ".join([processor.tokenizer.unk_token]) * seq_length]] * batch_size
dummy_bboxes = [[[48, 84, 73, 128]]] * batch_size
dummy_image = self._generate_dummy_images(batch_size, num_channels, image_height, image_width)
inputs = dict(
processor(
dummy_image,
text=dummy_text,
boxes=dummy_bboxes,
return_tensors=framework,
)
)
return inputs
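To make the placeholder structure concrete, here is a toy reconstruction (hypothetical batch size, sequence length, and `[UNK]` string; no real processor or tokenizer involved) of the dummy text and boxes assembled above:

```
# Hypothetical sizes for illustration; the real values come from compute_effective_axis_dimension.
batch_size, seq_length, unk_token = 2, 8, "[UNK]"

# One string of repeated unknown tokens and one dummy word box per batch element; the processor
# later expands these into token-level `input_ids` and `bbox` tensors.
dummy_text = [[" ".join([unk_token]) * seq_length]] * batch_size
dummy_bboxes = [[[48, 84, 73, 128]]] * batch_size

print(dummy_text[0])    # ['[UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK]']
print(dummy_bboxes[0])  # [[48, 84, 73, 128]]
```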
.\models\layoutlmv3\feature_extraction_layoutlmv3.py
"""
Feature extractor class for LayoutLMv3.
"""
import warnings
from ...utils import logging
from .image_processing_layoutlmv3 import LayoutLMv3ImageProcessor
logger = logging.get_logger(__name__)
class LayoutLMv3FeatureExtractor(LayoutLMv3ImageProcessor):
def __init__(self, *args, **kwargs) -> None:
warnings.warn(
"The class LayoutLMv3FeatureExtractor is deprecated and will be removed in version 5 of Transformers."
" Please use LayoutLMv3ImageProcessor instead.",
FutureWarning,
)
super().__init__(*args, **kwargs)
.\models\layoutlmv3\image_processing_layoutlmv3.py
"""LayoutLMv3 的图像处理器类。"""
from typing import Dict, Iterable, Optional, Union
import numpy as np
from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
from ...image_transforms import resize, to_channel_dimension_format, to_pil_image
from ...image_utils import (
IMAGENET_STANDARD_MEAN,
IMAGENET_STANDARD_STD,
ChannelDimension,
ImageInput,
PILImageResampling,
infer_channel_dimension_format,
is_scaled_image,
make_list_of_images,
to_numpy_array,
valid_images,
validate_kwargs,
validate_preprocess_arguments,
)
from ...utils import TensorType, is_pytesseract_available, is_vision_available, logging, requires_backends
if is_vision_available():
import PIL
if is_pytesseract_available():
import pytesseract
logger = logging.get_logger(__name__)
def normalize_box(box, width, height):
"""将边界框的坐标归一化为 [0, 1000] 的范围内。
Args:
box (list): 边界框的坐标 [left, top, right, bottom]。
width (int): 图像宽度。
height (int): 图像高度。
Returns:
list: 归一化后的边界框坐标 [left_norm, top_norm, right_norm, bottom_norm]。
"""
return [
int(1000 * (box[0] / width)),
int(1000 * (box[1] / height)),
int(1000 * (box[2] / width)),
int(1000 * (box[3] / height)),
]
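A quick numeric check of the normalization (arbitrary pixel values): boxes from pages of any resolution end up in the same 0-1000 coordinate space the model expects.

```
# An 800x600 page with a word box of [40, 60, 120, 90] pixels.
box, width, height = [40, 60, 120, 90], 800, 600
normalized = [
    int(1000 * (box[0] / width)),
    int(1000 * (box[1] / height)),
    int(1000 * (box[2] / width)),
    int(1000 * (box[3] / height)),
]
print(normalized)  # [50, 100, 150, 150]
```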
def apply_tesseract(
image: np.ndarray,
lang: Optional[str],
tesseract_config: Optional[str],
input_data_format: Optional[Union[ChannelDimension, str]] = None,
):
"""对文档图像应用 Tesseract OCR,并返回识别的单词及归一化的边界框。
Args:
image (np.ndarray): 输入的图像数据。
lang (Optional[str]): OCR 使用的语言设置。
tesseract_config (Optional[str]): Tesseract 配置选项。
input_data_format (Optional[Union[ChannelDimension, str]]): 输入图像的通道格式。
Returns:
None
"""
pil_image = to_pil_image(image, input_data_format=input_data_format)
image_width, image_height = pil_image.size
data = pytesseract.image_to_data(pil_image, lang=lang, output_type="dict", config=tesseract_config)
words, left, top, width, height = data["text"], data["left"], data["top"], data["width"], data["height"]
irrelevant_indices = [idx for idx, word in enumerate(words) if not word.strip()]
words = [word for idx, word in enumerate(words) if idx not in irrelevant_indices]
left = [coord for idx, coord in enumerate(left) if idx not in irrelevant_indices]
top = [coord for idx, coord in enumerate(top) if idx not in irrelevant_indices]
width = [coord for idx, coord in enumerate(width) if idx not in irrelevant_indices]
height = [coord for idx, coord in enumerate(height) if idx not in irrelevant_indices]
actual_boxes = []
for x, y, w, h in zip(left, top, width, height):
actual_box = [x, y, x + w, y + h]
actual_boxes.append(actual_box)
normalized_boxes = []
for box in actual_boxes:
normalized_boxes.append(normalize_box(box, image_width, image_height))
assert len(words) == len(normalized_boxes), "Not as many words as there are bounding boxes"
return words, normalized_boxes
class LayoutLMv3ImageProcessor(BaseImageProcessor):
r"""
Constructs a LayoutLMv3 image processor.
Args:
do_resize (`bool`, *optional*, defaults to `True`):
Whether to resize the image's (height, width) dimensions to `(size["height"], size["width"])`. Can be
overridden by `do_resize` in `preprocess`.
size (`Dict[str, int]`, *optional*, defaults to `{"height": 224, "width": 224}`):
Size of the image after resizing. Can be overridden by `size` in `preprocess`.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
Resampling filter to use if resizing the image. Can be overridden by `resample` in `preprocess`.
do_rescale (`bool`, *optional*, defaults to `True`):
Whether to rescale the image's pixel values by the specified `rescale_value`. Can be overridden by
`do_rescale` in `preprocess`.
rescale_factor (`float`, *optional*, defaults to 1 / 255):
Value by which the image's pixel values are rescaled. Can be overridden by `rescale_factor` in
`preprocess`.
do_normalize (`bool`, *optional*, defaults to `True`):
Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
method.
image_mean (`Iterable[float]` or `float`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`):
Mean to use if normalizing the image. This is a float or list of floats the length of the number of
channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
image_std (`Iterable[float]` or `float`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
apply_ocr (`bool`, *optional*, defaults to `True`):
Whether to apply the Tesseract OCR engine to get words + normalized bounding boxes. Can be overridden by
the `apply_ocr` parameter in the `preprocess` method.
ocr_lang (`str`, *optional*):
The language, specified by its ISO code, to be used by the Tesseract OCR engine. By default, English is
used. Can be overridden by the `ocr_lang` parameter in the `preprocess` method.
tesseract_config (`str`, *optional*):
Any additional custom configuration flags that are forwarded to the `config` parameter when calling
Tesseract. For example: '--psm 6'. Can be overridden by the `tesseract_config` parameter in the
`preprocess` method.
"""
model_input_names = ["pixel_values"]
def __init__(
self,
do_resize: bool = True,
size: Dict[str, int] = None,
resample: PILImageResampling = PILImageResampling.BILINEAR,
do_rescale: bool = True,
rescale_value: float = 1 / 255,
do_normalize: bool = True,
image_mean: Union[float, Iterable[float]] = None,
image_std: Union[float, Iterable[float]] = None,
apply_ocr: bool = True,
ocr_lang: Optional[str] = None,
tesseract_config: Optional[str] = "",
**kwargs,
) -> None:
super().__init__(**kwargs)
size = size if size is not None else {"height": 224, "width": 224}
size = get_size_dict(size)
self.do_resize = do_resize
self.size = size
self.resample = resample
self.do_rescale = do_rescale
self.rescale_factor = rescale_value
self.do_normalize = do_normalize
self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN
self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD
self.apply_ocr = apply_ocr
self.ocr_lang = ocr_lang
self.tesseract_config = tesseract_config
self._valid_processor_keys = [
"images",
"do_resize",
"size",
"resample",
"do_rescale",
"rescale_factor",
"do_normalize",
"image_mean",
"image_std",
"apply_ocr",
"ocr_lang",
"tesseract_config",
"return_tensors",
"data_format",
"input_data_format",
]
def resize(
self,
image: np.ndarray,
size: Dict[str, int],
resample: PILImageResampling = PILImageResampling.BILINEAR,
data_format: Optional[Union[str, ChannelDimension]] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
) -> np.ndarray:
"""
Resize an image to `(size["height"], size["width"])`.
Args:
image (`np.ndarray`):
Image to resize.
size (`Dict[str, int]`):
Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
`PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BILINEAR`.
data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format for the output image. If unset, the channel dimension format of the input
image is used. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format for the input image. If unset, the channel dimension format is inferred
from the input image. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
Returns:
`np.ndarray`: The resized image.
"""
size = get_size_dict(size)
if "height" not in size or "width" not in size:
raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. Got {size.keys()}")
output_size = (size["height"], size["width"])
return resize(
image,
size=output_size,
resample=resample,
data_format=data_format,
input_data_format=input_data_format,
**kwargs,
)
def preprocess(
self,
images: ImageInput,
do_resize: bool = None,
size: Dict[str, int] = None,
resample=None,
do_rescale: bool = None,
rescale_factor: float = None,
do_normalize: bool = None,
image_mean: Union[float, Iterable[float]] = None,
image_std: Union[float, Iterable[float]] = None,
apply_ocr: bool = None,
ocr_lang: Optional[str] = None,
tesseract_config: Optional[str] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
data_format: ChannelDimension = ChannelDimension.FIRST,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
):
.\models\layoutlmv3\modeling_layoutlmv3.py
import collections
import math
from typing import Optional, Tuple, Union
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN
from ...modeling_outputs import (
BaseModelOutput,
QuestionAnsweringModelOutput,
SequenceClassifierOutput,
TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import apply_chunking_to_forward
from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
from .configuration_layoutlmv3 import LayoutLMv3Config
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "LayoutLMv3Config"
LAYOUTLMV3_PRETRAINED_MODEL_ARCHIVE_LIST = [
"microsoft/layoutlmv3-base",
"microsoft/layoutlmv3-large",
]
LAYOUTLMV3_START_DOCSTRING = r"""
This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
Parameters:
config ([`LayoutLMv3Config`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
LAYOUTLMV3_MODEL_INPUTS_DOCSTRING = r"""
"""
LAYOUTLMV3_DOWNSTREAM_INPUTS_DOCSTRING = r"""
"""
class LayoutLMv3PatchEmbeddings(nn.Module):
"""LayoutLMv3 image (patch) embeddings. This class also automatically interpolates the position embeddings for varying
image sizes."""
def __init__(self, config):
super().__init__()
image_size = (
config.input_size
if isinstance(config.input_size, collections.abc.Iterable)
else (config.input_size, config.input_size)
)
patch_size = (
config.patch_size
if isinstance(config.patch_size, collections.abc.Iterable)
else (config.patch_size, config.patch_size)
)
self.patch_shape = (image_size[0] // patch_size[0], image_size[1] // patch_size[1])
self.proj = nn.Conv2d(config.num_channels, config.hidden_size, kernel_size=patch_size, stride=patch_size)
def forward(self, pixel_values, position_embedding=None):
embeddings = self.proj(pixel_values)
if position_embedding is not None:
position_embedding = position_embedding.view(1, self.patch_shape[0], self.patch_shape[1], -1)
position_embedding = position_embedding.permute(0, 3, 1, 2)
patch_height, patch_width = embeddings.shape[2], embeddings.shape[3]
position_embedding = F.interpolate(position_embedding, size=(patch_height, patch_width), mode="bicubic")
embeddings = embeddings + position_embedding
embeddings = embeddings.flatten(2).transpose(1, 2)
return embeddings
"""
LayoutLMv3 text embeddings. Same as `RobertaEmbeddings` but with added spatial (layout) embeddings.
"""
def __init__(self, config):
super().__init__()
self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
self.padding_idx = config.pad_token_id
self.position_embeddings = nn.Embedding(
config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
)
self.x_position_embeddings = nn.Embedding(config.max_2d_position_embeddings, config.coordinate_size)
self.y_position_embeddings = nn.Embedding(config.max_2d_position_embeddings, config.coordinate_size)
self.h_position_embeddings = nn.Embedding(config.max_2d_position_embeddings, config.shape_size)
self.w_position_embeddings = nn.Embedding(config.max_2d_position_embeddings, config.shape_size)
def calculate_spatial_position_embeddings(self, bbox):
try:
left_position_embeddings = self.x_position_embeddings(bbox[:, :, 0])
upper_position_embeddings = self.y_position_embeddings(bbox[:, :, 1])
right_position_embeddings = self.x_position_embeddings(bbox[:, :, 2])
lower_position_embeddings = self.y_position_embeddings(bbox[:, :, 3])
except IndexError as e:
raise IndexError("The `bbox` coordinate values should be within 0-1000 range.") from e
h_position_embeddings = self.h_position_embeddings(torch.clip(bbox[:, :, 3] - bbox[:, :, 1], 0, 1023))
w_position_embeddings = self.w_position_embeddings(torch.clip(bbox[:, :, 2] - bbox[:, :, 0], 0, 1023))
spatial_position_embeddings = torch.cat(
[
left_position_embeddings,
upper_position_embeddings,
right_position_embeddings,
lower_position_embeddings,
h_position_embeddings,
w_position_embeddings,
],
dim=-1,
)
return spatial_position_embeddings
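Note that the six spatial embeddings are concatenated rather than summed, so their widths must add up to the hidden size; with the default configuration this works out exactly, which is why the result can later be added elementwise to the word embeddings:

```
# Default LayoutLMv3-base values (see the configuration earlier in this document).
coordinate_size, shape_size, hidden_size = 128, 128, 768

# left/upper/right/lower use coordinate_size each, height/width use shape_size each.
assert 4 * coordinate_size + 2 * shape_size == hidden_size
```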
def create_position_ids_from_input_ids(self, input_ids, padding_idx):
"""
Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding
symbols are ignored. This is modified from fairseq's `utils.make_positions`.
"""
mask = input_ids.ne(padding_idx).int()
incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask)) * mask
return incremental_indices.long() + padding_idx
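A small self-contained check (hypothetical input ids, `padding_idx=1` as in the RoBERTa-style config above) of the padding-aware position numbering:

```
import torch

# 1 is the pad token; real tokens are numbered from padding_idx + 1, pads keep padding_idx.
input_ids = torch.tensor([[0, 52, 37, 2, 1, 1]])
padding_idx = 1
mask = input_ids.ne(padding_idx).int()
position_ids = (torch.cumsum(mask, dim=1).type_as(mask)) * mask + padding_idx
print(position_ids)  # tensor([[2, 3, 4, 5, 1, 1]])
```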
def create_position_ids_from_inputs_embeds(self, inputs_embeds):
"""
We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.
"""
input_shape = inputs_embeds.size()[:-1]
sequence_length = input_shape[1]
position_ids = torch.arange(
self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
)
return position_ids.unsqueeze(0).expand(input_shape)
def forward(
self,
input_ids=None,
bbox=None,
token_type_ids=None,
position_ids=None,
inputs_embeds=None,
):
if position_ids is None:
if input_ids is not None:
position_ids = self.create_position_ids_from_input_ids(input_ids, self.padding_idx).to(
input_ids.device
)
else:
position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)
if input_ids is not None:
input_shape = input_ids.size()
else:
input_shape = inputs_embeds.size()[:-1]
if token_type_ids is None:
token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
if inputs_embeds is None:
inputs_embeds = self.word_embeddings(input_ids)
token_type_embeddings = self.token_type_embeddings(token_type_ids)
embeddings = inputs_embeds + token_type_embeddings
position_embeddings = self.position_embeddings(position_ids)
embeddings += position_embeddings
spatial_position_embeddings = self.calculate_spatial_position_embeddings(bbox)
embeddings = embeddings + spatial_position_embeddings
embeddings = self.LayerNorm(embeddings)
embeddings = self.dropout(embeddings)
return embeddings
class LayoutLMv3PreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = LayoutLMv3Config
base_model_prefix = "layoutlmv3"
def _init_weights(self, module):
"""Initialize the weights"""
if isinstance(module, (nn.Linear, nn.Conv2d)):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
class LayoutLMv3SelfAttention(nn.Module):
def __init__(self, config):
super().__init__()
if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
raise ValueError(
f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
f"heads ({config.num_attention_heads})"
)
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.query = nn.Linear(config.hidden_size, self.all_head_size)
self.key = nn.Linear(config.hidden_size, self.all_head_size)
self.value = nn.Linear(config.hidden_size, self.all_head_size)
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
self.has_relative_attention_bias = config.has_relative_attention_bias
self.has_spatial_attention_bias = config.has_spatial_attention_bias
def transpose_for_scores(self, x):
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
x = x.view(*new_x_shape)
return x.permute(0, 2, 1, 3)
def cogview_attention(self, attention_scores, alpha=32):
"""
https://arxiv.org/abs/2105.13290 Section 2.4 Stabilization of training: Precision Bottleneck Relaxation
(PB-Relax). A replacement of the original nn.Softmax(dim=-1)(attention_scores). Seems the new attention_probs
will result in a slower speed and a little bias. Can use torch.allclose(standard_attention_probs,
cogview_attention_probs, atol=1e-08) for comparison. The smaller atol (e.g., 1e-08), the better.
"""
scaled_attention_scores = attention_scores / alpha
max_value = scaled_attention_scores.amax(dim=(-1)).unsqueeze(-1)
new_attention_scores = (scaled_attention_scores - max_value) * alpha
return nn.Softmax(dim=-1)(new_attention_scores)
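Since softmax is invariant to subtracting a per-row constant, the PB-Relax rewrite above produces (up to floating-point error) the same probabilities as a plain softmax while keeping intermediate values small. A standalone sketch verifying that:

```
import torch

def cogview_softmax(scores, alpha=32):
    # Same trick as above: divide by alpha, subtract the row max, multiply back by alpha.
    scaled = scores / alpha
    max_value = scaled.amax(dim=-1, keepdim=True)
    return torch.softmax((scaled - max_value) * alpha, dim=-1)

scores = torch.randn(2, 4, 8, 8) * 50  # large scores where low-precision softmax could overflow
print(torch.allclose(torch.softmax(scores, dim=-1), cogview_softmax(scores), atol=1e-6))  # True
```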
def forward(
self,
hidden_states,
attention_mask=None,
head_mask=None,
output_attentions=False,
rel_pos=None,
rel_2d_pos=None,
):
mixed_query_layer = self.query(hidden_states)
key_layer = self.transpose_for_scores(self.key(hidden_states))
value_layer = self.transpose_for_scores(self.value(hidden_states))
query_layer = self.transpose_for_scores(mixed_query_layer)
attention_scores = torch.matmul(query_layer / math.sqrt(self.attention_head_size), key_layer.transpose(-1, -2))
if self.has_relative_attention_bias and self.has_spatial_attention_bias:
attention_scores += (rel_pos + rel_2d_pos) / math.sqrt(self.attention_head_size)
elif self.has_relative_attention_bias:
attention_scores += rel_pos / math.sqrt(self.attention_head_size)
if attention_mask is not None:
attention_scores = attention_scores + attention_mask
attention_probs = self.cogview_attention(attention_scores)
attention_probs = self.dropout(attention_probs)
if head_mask is not None:
attention_probs = attention_probs * head_mask
context_layer = torch.matmul(attention_probs, value_layer)
context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
context_layer = context_layer.view(*new_context_layer_shape)
outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
return outputs
class LayoutLMv3SelfOutput(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
class LayoutLMv3Attention(nn.Module):
def __init__(self, config):
super().__init__()
self.self = LayoutLMv3SelfAttention(config)
self.output = LayoutLMv3SelfOutput(config)
def forward(
self,
hidden_states,
attention_mask=None,
head_mask=None,
output_attentions=False,
rel_pos=None,
rel_2d_pos=None,
):
self_outputs = self.self(
hidden_states,
attention_mask,
head_mask,
output_attentions,
rel_pos=rel_pos,
rel_2d_pos=rel_2d_pos,
)
attention_output = self.output(self_outputs[0], hidden_states)
outputs = (attention_output,) + self_outputs[1:]
return outputs
class LayoutLMv3Layer(nn.Module):
def __init__(self, config):
super().__init__()
self.chunk_size_feed_forward = config.chunk_size_feed_forward
self.seq_len_dim = 1
self.attention = LayoutLMv3Attention(config)
self.intermediate = LayoutLMv3Intermediate(config)
self.output = LayoutLMv3Output(config)
def forward(
self,
hidden_states,
attention_mask=None,
head_mask=None,
output_attentions=False,
rel_pos=None,
rel_2d_pos=None,
):
self_attention_outputs = self.attention(
hidden_states,
attention_mask,
head_mask,
output_attentions=output_attentions,
rel_pos=rel_pos,
rel_2d_pos=rel_2d_pos,
)
attention_output = self_attention_outputs[0]
outputs = self_attention_outputs[1:]
layer_output = apply_chunking_to_forward(
self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
)
outputs = (layer_output,) + outputs
return outputs
def feed_forward_chunk(self, attention_output):
intermediate_output = self.intermediate(attention_output)
layer_output = self.output(intermediate_output, attention_output)
return layer_output
class LayoutLMv3Encoder(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
self.layer = nn.ModuleList([LayoutLMv3Layer(config) for _ in range(config.num_hidden_layers)])
self.gradient_checkpointing = False
self.has_relative_attention_bias = config.has_relative_attention_bias
self.has_spatial_attention_bias = config.has_spatial_attention_bias
if self.has_relative_attention_bias:
self.rel_pos_bins = config.rel_pos_bins
self.max_rel_pos = config.max_rel_pos
self.rel_pos_bias = nn.Linear(self.rel_pos_bins, config.num_attention_heads, bias=False)
if self.has_spatial_attention_bias:
self.max_rel_2d_pos = config.max_rel_2d_pos
self.rel_2d_pos_bins = config.rel_2d_pos_bins
self.rel_pos_x_bias = nn.Linear(self.rel_2d_pos_bins, config.num_attention_heads, bias=False)
self.rel_pos_y_bias = nn.Linear(self.rel_2d_pos_bins, config.num_attention_heads, bias=False)
def relative_position_bucket(self, relative_position, bidirectional=True, num_buckets=32, max_distance=128):
ret = 0
if bidirectional:
num_buckets //= 2
ret += (relative_position > 0).long() * num_buckets
n = torch.abs(relative_position)
else:
n = torch.max(-relative_position, torch.zeros_like(relative_position))
max_exact = num_buckets // 2
is_small = n < max_exact
val_if_large = max_exact + (
torch.log(n.float() / max_exact) / math.log(max_distance / max_exact) * (num_buckets - max_exact)
).to(torch.long)
val_if_large = torch.min(val_if_large, torch.full_like(val_if_large, num_buckets - 1))
ret += torch.where(is_small, n, val_if_large)
return ret
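The bucketing is the T5-style scheme: small offsets each get their own bucket, larger offsets share logarithmically spaced buckets, and (in the bidirectional case) the sign is encoded in the upper half of the buckets. A standalone copy of the logic for experimentation (not imported from the library):

```
import math
import torch

def relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128):
    ret = 0
    if bidirectional:
        num_buckets //= 2
        ret += (relative_position > 0).long() * num_buckets
        n = torch.abs(relative_position)
    else:
        n = torch.max(-relative_position, torch.zeros_like(relative_position))
    max_exact = num_buckets // 2
    is_small = n < max_exact
    val_if_large = max_exact + (
        torch.log(n.float() / max_exact) / math.log(max_distance / max_exact) * (num_buckets - max_exact)
    ).to(torch.long)
    val_if_large = torch.min(val_if_large, torch.full_like(val_if_large, num_buckets - 1))
    ret += torch.where(is_small, n, val_if_large)
    return ret

offsets = torch.tensor([-64, -8, -1, 0, 1, 8, 64])
print(relative_position_bucket(offsets))  # tensor([14,  8,  1,  0, 17, 24, 30])
```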
def _cal_1d_pos_emb(self, position_ids):
rel_pos_mat = position_ids.unsqueeze(-2) - position_ids.unsqueeze(-1)
rel_pos = self.relative_position_bucket(
rel_pos_mat,
num_buckets=self.rel_pos_bins,
max_distance=self.max_rel_pos,
)
rel_pos = self.rel_pos_bias.weight.t()[rel_pos].permute(0, 3, 1, 2)
rel_pos = rel_pos.contiguous()
return rel_pos
def _cal_2d_pos_emb(self, bbox):
position_coord_x = bbox[:, :, 0]
position_coord_y = bbox[:, :, 3]
rel_pos_x_2d_mat = position_coord_x.unsqueeze(-2) - position_coord_x.unsqueeze(-1)
rel_pos_y_2d_mat = position_coord_y.unsqueeze(-2) - position_coord_y.unsqueeze(-1)
rel_pos_x = self.relative_position_bucket(
rel_pos_x_2d_mat,
num_buckets=self.rel_2d_pos_bins,
max_distance=self.max_rel_2d_pos,
)
rel_pos_y = self.relative_position_bucket(
rel_pos_y_2d_mat,
num_buckets=self.rel_2d_pos_bins,
max_distance=self.max_rel_2d_pos,
)
rel_pos_x = self.rel_pos_x_bias.weight.t()[rel_pos_x].permute(0, 3, 1, 2)
rel_pos_y = self.rel_pos_y_bias.weight.t()[rel_pos_y].permute(0, 3, 1, 2)
rel_pos_x = rel_pos_x.contiguous()
rel_pos_y = rel_pos_y.contiguous()
rel_2d_pos = rel_pos_x + rel_pos_y
return rel_2d_pos
def forward(
self,
hidden_states,
bbox=None,
attention_mask=None,
head_mask=None,
output_attentions=False,
output_hidden_states=False,
return_dict=True,
position_ids=None,
patch_height=None,
patch_width=None,
):
all_hidden_states = () if output_hidden_states else None
all_self_attentions = () if output_attentions else None
rel_pos = self._cal_1d_pos_emb(position_ids) if self.has_relative_attention_bias else None
rel_2d_pos = self._cal_2d_pos_emb(bbox) if self.has_spatial_attention_bias else None
for i, layer_module in enumerate(self.layer):
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
layer_head_mask = head_mask[i] if head_mask is not None else None
if self.gradient_checkpointing and self.training:
layer_outputs = self._gradient_checkpointing_func(
layer_module.__call__,
hidden_states,
attention_mask,
layer_head_mask,
output_attentions,
rel_pos,
rel_2d_pos,
)
else:
layer_outputs = layer_module(
hidden_states,
attention_mask,
layer_head_mask,
output_attentions,
rel_pos=rel_pos,
rel_2d_pos=rel_2d_pos,
)
hidden_states = layer_outputs[0]
if output_attentions:
all_self_attentions = all_self_attentions + (layer_outputs[1],)
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
return tuple(
v
for v in [
hidden_states,
all_hidden_states,
all_self_attentions,
]
if v is not None
)
return BaseModelOutput(
last_hidden_state=hidden_states,
hidden_states=all_hidden_states,
attentions=all_self_attentions,
)
class LayoutLMv3Intermediate(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states
class LayoutLMv3Output(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
@add_start_docstrings(
"The bare LayoutLMv3 Model transformer outputting raw hidden-states without any specific head on top.",
LAYOUTLMV3_START_DOCSTRING,
)
class LayoutLMv3Model(LayoutLMv3PreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.config = config
if config.text_embed:
self.embeddings = LayoutLMv3TextEmbeddings(config)
if config.visual_embed:
self.patch_embed = LayoutLMv3PatchEmbeddings(config)
size = int(config.input_size / config.patch_size)
self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
self.pos_embed = nn.Parameter(torch.zeros(1, size * size + 1, config.hidden_size))
self.pos_drop = nn.Dropout(p=0.0)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
if self.config.has_relative_attention_bias or self.config.has_spatial_attention_bias:
self.init_visual_bbox(image_size=(size, size))
self.norm = nn.LayerNorm(config.hidden_size, eps=1e-6)
self.encoder = LayoutLMv3Encoder(config)
self.init_weights()
def get_input_embeddings(self):
return self.embeddings.word_embeddings
def set_input_embeddings(self, value):
self.embeddings.word_embeddings = value
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model.
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
See base class PreTrainedModel
"""
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
def init_visual_bbox(self, image_size=(14, 14), max_len=1000):
"""
Create the bounding boxes for the visual (patch) tokens.
"""
visual_bbox_x = torch.div(
torch.arange(0, max_len * (image_size[1] + 1), max_len), image_size[1], rounding_mode="trunc"
)
visual_bbox_y = torch.div(
torch.arange(0, max_len * (image_size[0] + 1), max_len), image_size[0], rounding_mode="trunc"
)
visual_bbox = torch.stack(
[
visual_bbox_x[:-1].repeat(image_size[0], 1),
visual_bbox_y[:-1].repeat(image_size[1], 1).transpose(0, 1),
visual_bbox_x[1:].repeat(image_size[0], 1),
visual_bbox_y[1:].repeat(image_size[1], 1).transpose(0, 1),
],
dim=-1,
).view(-1, 4)
cls_token_box = torch.tensor([[0 + 1, 0 + 1, max_len - 1, max_len - 1]])
self.visual_bbox = torch.cat([cls_token_box, visual_bbox], dim=0)
def calculate_visual_bbox(self, device, dtype, batch_size):
visual_bbox = self.visual_bbox.repeat(batch_size, 1, 1)
visual_bbox = visual_bbox.to(device).type(dtype)
return visual_bbox
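A standalone sketch (not imported from the library, shrunk to a 2x2 grid for readability) of the patch-box grid that `init_visual_bbox` builds: every patch gets the 0-1000 box of the image region it covers, and the method then prepends a near-full-page box `[1, 1, 999, 999]` for the visual [CLS] token.

```
import torch

image_size, max_len = (2, 2), 1000
x = torch.div(torch.arange(0, max_len * (image_size[1] + 1), max_len), image_size[1], rounding_mode="trunc")
y = torch.div(torch.arange(0, max_len * (image_size[0] + 1), max_len), image_size[0], rounding_mode="trunc")
visual_bbox = torch.stack(
    [
        x[:-1].repeat(image_size[0], 1),
        y[:-1].repeat(image_size[1], 1).transpose(0, 1),
        x[1:].repeat(image_size[0], 1),
        y[1:].repeat(image_size[1], 1).transpose(0, 1),
    ],
    dim=-1,
).view(-1, 4)
print(visual_bbox)
# tensor([[   0,    0,  500,  500],
#         [ 500,    0, 1000,  500],
#         [   0,  500,  500, 1000],
#         [ 500,  500, 1000, 1000]])
```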
def forward_image(self, pixel_values):
embeddings = self.patch_embed(pixel_values)
batch_size, seq_len, _ = embeddings.size()
cls_tokens = self.cls_token.expand(batch_size, -1, -1)
embeddings = torch.cat((cls_tokens, embeddings), dim=1)
if self.pos_embed is not None:
embeddings = embeddings + self.pos_embed
embeddings = self.pos_drop(embeddings)
embeddings = self.norm(embeddings)
return embeddings
@add_start_docstrings_to_model_forward(
LAYOUTLMV3_MODEL_INPUTS_DOCSTRING.format("batch_size, token_sequence_length")
)
@replace_return_docstrings(output_type=BaseModelOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
bbox: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
pixel_values: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
"""
Override of the forward method in the parent class with specific
docstrings added for layoutlmv3 model inputs and outputs.
"""
class LayoutLMv3ClassificationHead(nn.Module):
"""
Head for sentence-level classification tasks. Reference: RobertaClassificationHead
"""
def __init__(self, config, pool_feature=False):
super().__init__()
self.pool_feature = pool_feature
if pool_feature:
self.dense = nn.Linear(config.hidden_size * 3, config.hidden_size)
else:
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
classifier_dropout = (
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
self.dropout = nn.Dropout(classifier_dropout)
self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
def forward(self, x):
x = self.dropout(x)
x = self.dense(x)
x = torch.tanh(x)
x = self.dropout(x)
x = self.out_proj(x)
return x
@add_start_docstrings(
"""
LayoutLMv3 Model with a token classification head on top (a linear layer on top of the final hidden states) e.g.
for sequence labeling (information extraction) tasks such as [FUNSD](https://guillaumejaume.github.io/FUNSD/),
[SROIE](https://rrc.cvc.uab.es/?ch=13), [CORD](https://github.com/clovaai/cord) and
[Kleister-NDA](https://github.com/applicaai/kleister-nda).
""",
LAYOUTLMV3_START_DOCSTRING,
)
class LayoutLMv3ForTokenClassification(LayoutLMv3PreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.layoutlmv3 = LayoutLMv3Model(config)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
if config.num_labels < 10:
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
else:
self.classifier = LayoutLMv3ClassificationHead(config, pool_feature=False)
self.init_weights()
@add_start_docstrings_to_model_forward(
LAYOUTLMV3_DOWNSTREAM_INPUTS_DOCSTRING.format("batch_size, sequence_length")
)
@replace_return_docstrings(output_type=TokenClassifierOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
bbox: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
pixel_values: Optional[torch.LongTensor] = None,
) -> Union[Tuple, TokenClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
Returns:
Examples:
```
>>> from transformers import AutoProcessor, AutoModelForTokenClassification
>>> from datasets import load_dataset
>>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
>>> model = AutoModelForTokenClassification.from_pretrained("microsoft/layoutlmv3-base", num_labels=7)
>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
>>> example = dataset[0]
>>> image = example["image"]
>>> words = example["tokens"]
>>> boxes = example["bboxes"]
>>> word_labels = example["ner_tags"]
>>> encoding = processor(image, words, boxes=boxes, word_labels=word_labels, return_tensors="pt")
>>> outputs = model(**encoding)
>>> loss = outputs.loss
>>> logits = outputs.logits
```"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.layoutlmv3(
input_ids,
bbox=bbox,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
pixel_values=pixel_values,
)
if input_ids is not None:
input_shape = input_ids.size()
else:
input_shape = inputs_embeds.size()[:-1]
seq_length = input_shape[1]
sequence_output = outputs[0][:, :seq_length]
sequence_output = self.dropout(sequence_output)
logits = self.classifier(sequence_output)
loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
if not return_dict:
output = (logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
return TokenClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
"""
LayoutLMv3 Model with a span classification head on top for extractive question-answering tasks such as
[DocVQA](https://rrc.cvc.uab.es/?ch=17) (a linear layer on top of the text part of the hidden-states output to
compute `span start logits` and `span end logits`).
"""
@add_start_docstrings(
"""
LayoutLMv3 Model with a sequence classification head on top (a linear layer on top of the final hidden state of the
[CLS] token) e.g. for document image classification tasks such as the
[RVL-CDIP](https://www.cs.cmu.edu/~aharley/rvl-cdip/) dataset.
""",
LAYOUTLMV3_START_DOCSTRING,
)
class LayoutLMv3ForSequenceClassification(LayoutLMv3PreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.config = config
self.layoutlmv3 = LayoutLMv3Model(config)
self.classifier = LayoutLMv3ClassificationHead(config, pool_feature=False)
self.init_weights()
@add_start_docstrings_to_model_forward(
LAYOUTLMV3_DOWNSTREAM_INPUTS_DOCSTRING.format("batch_size, sequence_length")
)
@replace_return_docstrings(output_type=SequenceClassifierOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
start_positions: Optional[torch.LongTensor] = None,
end_positions: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
bbox: Optional[torch.LongTensor] = None,
pixel_values: Optional[torch.LongTensor] = None,
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
bbox: Optional[torch.LongTensor] = None,
pixel_values: Optional[torch.LongTensor] = None,
.\models\layoutlmv3\modeling_tf_layoutlmv3.py
from __future__ import annotations
import collections
import math
from typing import List, Optional, Tuple, Union
import tensorflow as tf
from ...activations_tf import get_tf_activation
from ...modeling_tf_outputs import (
TFBaseModelOutput,
TFQuestionAnsweringModelOutput,
TFSequenceClassifierOutput,
TFTokenClassifierOutput,
)
from ...modeling_tf_utils import (
TFPreTrainedModel,
TFQuestionAnsweringLoss,
TFSequenceClassificationLoss,
TFTokenClassificationLoss,
get_initializer,
keras,
keras_serializable,
unpack_inputs,
)
from ...tf_utils import check_embeddings_within_bounds
from ...utils import (
add_start_docstrings,
add_start_docstrings_to_model_forward,
replace_return_docstrings,
)
from .configuration_layoutlmv3 import LayoutLMv3Config
_CONFIG_FOR_DOC = "LayoutLMv3Config"
_DUMMY_INPUT_IDS = [
[7, 6, 1],
[1, 2, 0],
]
_DUMMY_BBOX = [
[[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]],
[[13, 14, 15, 16], [17, 18, 19, 20], [21, 22, 23, 24]],
]
TF_LAYOUTLMV3_PRETRAINED_MODEL_ARCHIVE_LIST = [
"microsoft/layoutlmv3-base",
"microsoft/layoutlmv3-large",
]
LARGE_NEGATIVE = -1e8
class TFLayoutLMv3PatchEmbeddings(keras.layers.Layer):
"""LayoutLMv3 图像(patch)嵌入层。"""
def __init__(self, config: LayoutLMv3Config, **kwargs):
super().__init__(**kwargs)
patch_sizes = (
config.patch_size
if isinstance(config.patch_size, collections.abc.Iterable)
else (config.patch_size, config.patch_size)
)
self.proj = keras.layers.Conv2D(
filters=config.hidden_size,
kernel_size=patch_sizes,
strides=patch_sizes,
padding="valid",
data_format="channels_last",
use_bias=True,
kernel_initializer=get_initializer(config.initializer_range),
name="proj",
)
self.hidden_size = config.hidden_size
self.num_patches = (config.input_size**2) // (patch_sizes[0] * patch_sizes[1])
self.config = config
def call(self, pixel_values: tf.Tensor) -> tf.Tensor:
pixel_values = tf.transpose(pixel_values, perm=[0, 2, 3, 1])
embeddings = self.proj(pixel_values)
embeddings = tf.reshape(embeddings, (-1, self.num_patches, self.hidden_size))
return embeddings
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "proj", None) is not None:
with tf.name_scope(self.proj.name):
self.proj.build([None, None, None, self.config.num_channels])
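As a quick arithmetic check of `num_patches` above: with the default `input_size` of 224 and `patch_size` of 16, the projection produces a 14x14 grid of patch embeddings.

```
input_size, patch_size = 224, 16
assert (input_size**2) // (patch_size * patch_size) == (input_size // patch_size) ** 2 == 196
```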
class TFLayoutLMv3TextEmbeddings(keras.layers.Layer):
"""
LayoutLMv3 text embeddings. Same as `RobertaEmbeddings` but with added spatial (layout) embeddings.
"""
def __init__(self, config: LayoutLMv3Config, **kwargs):
super().__init__(**kwargs)
self.word_embeddings = keras.layers.Embedding(
config.vocab_size,
config.hidden_size,
embeddings_initializer=get_initializer(config.initializer_range),
name="word_embeddings",
)
self.token_type_embeddings = keras.layers.Embedding(
config.type_vocab_size,
config.hidden_size,
embeddings_initializer=get_initializer(config.initializer_range),
name="token_type_embeddings",
)
self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = keras.layers.Dropout(config.hidden_dropout_prob)
self.padding_token_index = config.pad_token_id
self.position_embeddings = keras.layers.Embedding(
config.max_position_embeddings,
config.hidden_size,
embeddings_initializer=get_initializer(config.initializer_range),
name="position_embeddings",
)
self.x_position_embeddings = keras.layers.Embedding(
config.max_2d_position_embeddings,
config.coordinate_size,
embeddings_initializer=get_initializer(config.initializer_range),
name="x_position_embeddings",
)
self.y_position_embeddings = keras.layers.Embedding(
config.max_2d_position_embeddings,
config.coordinate_size,
embeddings_initializer=get_initializer(config.initializer_range),
name="y_position_embeddings",
)
self.h_position_embeddings = keras.layers.Embedding(
config.max_2d_position_embeddings,
config.shape_size,
embeddings_initializer=get_initializer(config.initializer_range),
name="h_position_embeddings",
)
self.w_position_embeddings = keras.layers.Embedding(
config.max_2d_position_embeddings,
config.shape_size,
embeddings_initializer=get_initializer(config.initializer_range),
name="w_position_embeddings",
)
self.max_2d_positions = config.max_2d_position_embeddings
self.config = config
def calculate_spatial_position_embeddings(self, bbox: tf.Tensor) -> tf.Tensor:
try:
left_position_ids = bbox[:, :, 0]
upper_position_ids = bbox[:, :, 1]
right_position_ids = bbox[:, :, 2]
lower_position_ids = bbox[:, :, 3]
except IndexError as exception:
raise IndexError("Bounding box is not of shape (batch_size, seq_length, 4).") from exception
try:
left_position_embeddings = self.x_position_embeddings(left_position_ids)
upper_position_embeddings = self.y_position_embeddings(upper_position_ids)
right_position_embeddings = self.x_position_embeddings(right_position_ids)
lower_position_embeddings = self.y_position_embeddings(lower_position_ids)
except IndexError as exception:
raise IndexError(
f"The `bbox` coordinate values should be within 0-{self.max_2d_positions} range."
) from exception
max_position_id = self.max_2d_positions - 1
h_position_embeddings = self.h_position_embeddings(
tf.clip_by_value(bbox[:, :, 3] - bbox[:, :, 1], 0, max_position_id)
)
w_position_embeddings = self.w_position_embeddings(
tf.clip_by_value(bbox[:, :, 2] - bbox[:, :, 0], 0, max_position_id)
)
spatial_position_embeddings = tf.concat(
[
left_position_embeddings,
upper_position_embeddings,
right_position_embeddings,
lower_position_embeddings,
h_position_embeddings,
w_position_embeddings,
],
axis=-1,
)
return spatial_position_embeddings
def create_position_ids_from_inputs_embeds(self, inputs_embds: tf.Tensor) -> tf.Tensor:
"""
We are provided embeddings directly. We cannot infer which are padded, so just generate sequential position
ids.
"""
input_shape = tf.shape(inputs_embds)
sequence_length = input_shape[1]
start_index = self.padding_token_index + 1
end_index = self.padding_token_index + sequence_length + 1
position_ids = tf.range(start_index, end_index, dtype=tf.int32)
batch_size = input_shape[0]
position_ids = tf.reshape(position_ids, (1, sequence_length))
position_ids = tf.tile(position_ids, (batch_size, 1))
return position_ids
def create_position_ids_from_input_ids(self, input_ids: tf.Tensor) -> tf.Tensor:
"""
Replace non-padding symbols with their position numbers. Position numbers begin at padding_token_index + 1.
"""
mask = tf.cast(tf.not_equal(input_ids, self.padding_token_index), input_ids.dtype)
position_ids = tf.cumsum(mask, axis=1) * mask
position_ids = position_ids + self.padding_token_index
return position_ids
def create_position_ids(self, input_ids: tf.Tensor, inputs_embeds: tf.Tensor) -> tf.Tensor:
if input_ids is None:
return self.create_position_ids_from_inputs_embeds(inputs_embeds)
else:
return self.create_position_ids_from_input_ids(input_ids)
def call(
self,
input_ids: tf.Tensor | None = None,
bbox: tf.Tensor = None,
token_type_ids: tf.Tensor | None = None,
position_ids: tf.Tensor | None = None,
inputs_embeds: tf.Tensor | None = None,
training: bool = False,
) -> tf.Tensor:
if position_ids is None:
position_ids = self.create_position_ids(input_ids, inputs_embeds)
if input_ids is not None:
input_shape = tf.shape(input_ids)
else:
input_shape = tf.shape(inputs_embeds)[:-1]
if token_type_ids is None:
token_type_ids = tf.zeros(input_shape, dtype=position_ids.dtype)
if inputs_embeds is None:
check_embeddings_within_bounds(input_ids, self.word_embeddings.input_dim)
inputs_embeds = self.word_embeddings(input_ids)
token_type_embeddings = self.token_type_embeddings(token_type_ids)
embeddings = inputs_embeds + token_type_embeddings
position_embeddings = self.position_embeddings(position_ids)
embeddings += position_embeddings
spatial_position_embeddings = self.calculate_spatial_position_embeddings(bbox)
embeddings += spatial_position_embeddings
embeddings = self.LayerNorm(embeddings)
embeddings = self.dropout(embeddings, training=training)
return embeddings
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "word_embeddings", None) is not None:
with tf.name_scope(self.word_embeddings.name):
self.word_embeddings.build(None)
if getattr(self, "token_type_embeddings", None) is not None:
with tf.name_scope(self.token_type_embeddings.name):
self.token_type_embeddings.build(None)
if getattr(self, "LayerNorm", None) is not None:
self.LayerNorm.build([None, None, self.config.hidden_size])
if getattr(self, "position_embeddings", None) is not None:
with tf.name_scope(self.position_embeddings.name):
self.position_embeddings.build(None)
if getattr(self, "x_position_embeddings", None) is not None:
with tf.name_scope(self.x_position_embeddings.name):
self.x_position_embeddings.build(None)
if getattr(self, "y_position_embeddings", None) is not None:
with tf.name_scope(self.y_position_embeddings.name):
self.y_position_embeddings.build(None)
if getattr(self, "h_position_embeddings", None) is not None:
with tf.name_scope(self.h_position_embeddings.name):
self.h_position_embeddings.build(None)
if getattr(self, "w_position_embeddings", None) is not None:
with tf.name_scope(self.w_position_embeddings.name):
self.w_position_embeddings.build(None)
class TFLayoutLMv3SelfAttention(keras.layers.Layer):
def __init__(self, config: LayoutLMv3Config, **kwargs):
super().__init__(**kwargs)
if config.hidden_size % config.num_attention_heads != 0:
raise ValueError(
f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
f"heads ({config.num_attention_heads})"
)
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.attention_score_normaliser = math.sqrt(self.attention_head_size)
self.query = keras.layers.Dense(
self.all_head_size,
kernel_initializer=get_initializer(config.initializer_range),
name="query",
)
self.key = keras.layers.Dense(
self.all_head_size,
kernel_initializer=get_initializer(config.initializer_range),
name="key",
)
self.value = keras.layers.Dense(
self.all_head_size,
kernel_initializer=get_initializer(config.initializer_range),
name="value",
)
self.dropout = keras.layers.Dropout(config.attention_probs_dropout_prob)
self.has_relative_attention_bias = config.has_relative_attention_bias
self.has_spatial_attention_bias = config.has_spatial_attention_bias
self.config = config
def transpose_for_scores(self, x: tf.Tensor):
shape = tf.shape(x)
new_shape = (
shape[0],
shape[1],
self.num_attention_heads,
self.attention_head_size,
)
x = tf.reshape(x, new_shape)
return tf.transpose(x, perm=[0, 2, 1, 3])
def cogview_attention(self, attention_scores: tf.Tensor, alpha: Union[float, int] = 32):
"""
https://arxiv.org/abs/2105.13290 Section 2.4 Stabilization of training: Precision Bottleneck Relaxation
(PB-Relax). A replacement of the original keras.layers.Softmax(axis=-1)(attention_scores). Seems the new
attention_probs will result in a slower speed and a little bias. Can use
tf.debugging.assert_near(standard_attention_probs, cogview_attention_probs, atol=1e-08) for comparison. The
smaller atol (e.g., 1e-08), the better.
"""
scaled_attention_scores = attention_scores / alpha
max_value = tf.expand_dims(tf.reduce_max(scaled_attention_scores, axis=-1), axis=-1)
new_attention_scores = (scaled_attention_scores - max_value) * alpha
return tf.math.softmax(new_attention_scores, axis=-1)
def call(
self,
hidden_states: tf.Tensor,
attention_mask: tf.Tensor | None,
head_mask: tf.Tensor | None,
output_attentions: bool,
rel_pos: tf.Tensor | None = None,
rel_2d_pos: tf.Tensor | None = None,
training: bool = False,
) -> Union[Tuple[tf.Tensor], Tuple[tf.Tensor, tf.Tensor]]:
key_layer = self.transpose_for_scores(self.key(hidden_states))
value_layer = self.transpose_for_scores(self.value(hidden_states))
query_layer = self.transpose_for_scores(self.query(hidden_states))
normalised_query_layer = query_layer / self.attention_score_normaliser
transposed_key_layer = tf.transpose(
key_layer, perm=[0, 1, 3, 2]
)
attention_scores = tf.matmul(normalised_query_layer, transposed_key_layer)
if self.has_relative_attention_bias and self.has_spatial_attention_bias:
attention_scores += (rel_pos + rel_2d_pos) / self.attention_score_normaliser
elif self.has_relative_attention_bias:
attention_scores += rel_pos / self.attention_score_normaliser
if attention_mask is not None:
attention_scores += attention_mask
attention_probs = self.cogview_attention(attention_scores)
attention_probs = self.dropout(attention_probs, training=training)
if head_mask is not None:
attention_probs = attention_probs * head_mask
context_layer = tf.matmul(attention_probs, value_layer)
context_layer = tf.transpose(
context_layer, perm=[0, 2, 1, 3]
)
shape = tf.shape(context_layer)
context_layer = tf.reshape(
context_layer, (shape[0], shape[1], self.all_head_size)
)
outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "query", None) is not None:
with tf.name_scope(self.query.name):
self.query.build([None, None, self.config.hidden_size])
if getattr(self, "key", None) is not None:
with tf.name_scope(self.key.name):
self.key.build([None, None, self.config.hidden_size])
if getattr(self, "value", None) is not None:
with tf.name_scope(self.value.name):
self.value.build([None, None, self.config.hidden_size])
class TFLayoutLMv3SelfOutput(keras.layers.Layer):
def __init__(self, config: LayoutLMv3Config, **kwargs):
super().__init__(**kwargs)
self.dense = keras.layers.Dense(
units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
hidden_states = self.dropout(inputs=hidden_states, training=training)
hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
class TFLayoutLMv3Attention(keras.layers.Layer):
def __init__(self, config: LayoutLMv3Config, **kwargs):
super().__init__(**kwargs)
self.self_attention = TFLayoutLMv3SelfAttention(config, name="self")
self.self_output = TFLayoutLMv3SelfOutput(config, name="output")
def call(
self,
hidden_states: tf.Tensor,
attention_mask: tf.Tensor | None,
head_mask: tf.Tensor | None,
output_attentions: bool,
rel_pos: tf.Tensor | None = None,
rel_2d_pos: tf.Tensor | None = None,
training: bool = False,
) -> Union[Tuple[tf.Tensor], Tuple[tf.Tensor, tf.Tensor]]:
self_outputs = self.self_attention(
hidden_states,
attention_mask,
head_mask,
output_attentions,
rel_pos,
rel_2d_pos,
training=training,
)
attention_output = self.self_output(self_outputs[0], hidden_states, training=training)
outputs = (attention_output,) + self_outputs[1:]
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "self_attention", None) is not None:
with tf.name_scope(self.self_attention.name):
self.self_attention.build(None)
if getattr(self, "self_output", None) is not None:
with tf.name_scope(self.self_output.name):
self.self_output.build(None)
class TFLayoutLMv3Intermediate(keras.layers.Layer):
def __init__(self, config: LayoutLMv3Config, **kwargs):
super().__init__(**kwargs)
self.dense = keras.layers.Dense(
units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = get_tf_activation(config.hidden_act)
else:
self.intermediate_act_fn = config.hidden_act
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
class TFLayoutLMv3Output(keras.layers.Layer):
def __init__(self, config: LayoutLMv3Config, **kwargs):
super().__init__(**kwargs)
self.dense = keras.layers.Dense(
units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
hidden_states = self.dropout(inputs=hidden_states, training=training)
hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.intermediate_size])
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
class TFLayoutLMv3Layer(keras.layers.Layer):
def __init__(self, config: LayoutLMv3Config, **kwargs):
super().__init__(**kwargs)
self.attention = TFLayoutLMv3Attention(config, name="attention")
self.intermediate = TFLayoutLMv3Intermediate(config, name="intermediate")
self.bert_output = TFLayoutLMv3Output(config, name="output")
def call(
self,
hidden_states: tf.Tensor,
attention_mask: tf.Tensor | None,
head_mask: tf.Tensor | None,
output_attentions: bool,
rel_pos: tf.Tensor | None = None,
rel_2d_pos: tf.Tensor | None = None,
training: bool = False,
) -> Union[Tuple[tf.Tensor], Tuple[tf.Tensor, tf.Tensor]]:
self_attention_outputs = self.attention(
hidden_states,
attention_mask,
head_mask,
output_attentions=output_attentions,
rel_pos=rel_pos,
rel_2d_pos=rel_2d_pos,
training=training,
)
attention_output = self_attention_outputs[0]
outputs = self_attention_outputs[1:]
intermediate_output = self.intermediate(attention_output)
layer_output = self.bert_output(intermediate_output, attention_output, training=training)
outputs = (layer_output,) + outputs
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "attention", None) is not None:
with tf.name_scope(self.attention.name):
self.attention.build(None)
if getattr(self, "intermediate", None) is not None:
with tf.name_scope(self.intermediate.name):
self.intermediate.build(None)
if getattr(self, "bert_output", None) is not None:
with tf.name_scope(self.bert_output.name):
self.bert_output.build(None)
class TFLayoutLMv3Encoder(keras.layers.Layer):
def __init__(self, config: LayoutLMv3Config, **kwargs):
super().__init__(**kwargs)
self.config = config
self.layer = [TFLayoutLMv3Layer(config, name=f"layer.{i}") for i in range(config.num_hidden_layers)]
self.has_relative_attention_bias = config.has_relative_attention_bias
self.has_spatial_attention_bias = config.has_spatial_attention_bias
if self.has_relative_attention_bias:
self.rel_pos_bins = config.rel_pos_bins
self.max_rel_pos = config.max_rel_pos
self.rel_pos_bias = keras.layers.Dense(
units=config.num_attention_heads,
kernel_initializer=get_initializer(config.initializer_range),
use_bias=False,
name="rel_pos_bias",
)
if self.has_spatial_attention_bias:
self.max_rel_2d_pos = config.max_rel_2d_pos
self.rel_2d_pos_bins = config.rel_2d_pos_bins
self.rel_pos_x_bias = keras.layers.Dense(
units=config.num_attention_heads,
kernel_initializer=get_initializer(config.initializer_range),
use_bias=False,
name="rel_pos_x_bias",
)
self.rel_pos_y_bias = keras.layers.Dense(
units=config.num_attention_heads,
kernel_initializer=get_initializer(config.initializer_range),
use_bias=False,
name="rel_pos_y_bias",
)
def relative_position_bucket(self, relative_positions: tf.Tensor, num_buckets: int, max_distance: int):
num_buckets = num_buckets // 2
buckets = tf.abs(relative_positions)
max_exact_buckets = num_buckets // 2
is_small = buckets < max_exact_buckets
buckets_log_ratio = tf.math.log(tf.cast(buckets, tf.float32) / max_exact_buckets)
distance_log_ratio = math.log(max_distance / max_exact_buckets)
buckets_big_offset = (
buckets_log_ratio / distance_log_ratio * (num_buckets - max_exact_buckets)
)
buckets_big = max_exact_buckets + buckets_big_offset
buckets_big = tf.cast(buckets_big, buckets.dtype)
buckets_big = tf.minimum(buckets_big, num_buckets - 1)
return (tf.cast(relative_positions > 0, buckets.dtype) * num_buckets) + tf.where(
is_small, buckets, buckets_big
)
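To get a feel for the bucketing: with, say, rel_pos_bins=32 and max_rel_pos=128, offsets with absolute value below 8 each get their own bucket, longer ones share logarithmically spaced buckets, and the sign selects the lower or upper half of the range. A hedged sketch; the bucket values in the comments were worked out by hand from the formula above and are illustrative only:
```
import tensorflow as tf

rel = tf.constant([[-64, -3, 0, 3, 64]], dtype=tf.int32)
# encoder.relative_position_bucket(rel, num_buckets=32, max_distance=128) should give
# approximately [[14, 3, 0, 19, 30]]:
#   -3 / +3  -> exact buckets 3 and 16 + 3 = 19 (a positive sign offsets by num_buckets // 2 = 16)
#   -64 / +64 -> log-spaced buckets 14 and 30
```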
def _cal_1d_pos_emb(self, position_ids: tf.Tensor):
return self._cal_pos_emb(self.rel_pos_bias, position_ids, self.rel_pos_bins, self.max_rel_pos)
def _cal_2d_pos_emb(self, bbox: tf.Tensor):
position_coord_x = bbox[:, :, 0]
position_coord_y = bbox[:, :, 3]
rel_pos_x = self._cal_pos_emb(
self.rel_pos_x_bias,
position_coord_x,
self.rel_2d_pos_bins,
self.max_rel_2d_pos,
)
rel_pos_y = self._cal_pos_emb(
self.rel_pos_y_bias,
position_coord_y,
self.rel_2d_pos_bins,
self.max_rel_2d_pos,
)
rel_2d_pos = rel_pos_x + rel_pos_y
return rel_2d_pos
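The spatial bias is computed from the left x-coordinate (`bbox[:, :, 0]`) and bottom y-coordinate (`bbox[:, :, 3]`) of every box: pairwise coordinate offsets are bucketized and mapped to per-head biases by the small Dense layers above. A rough sketch of the first step only; `_cal_pos_emb` itself is defined elsewhere in the file and is merely approximated here:
```
import tensorflow as tf

# Three boxes with left x-coordinates 0, 100 and 400 (assumed example values).
coord_x = tf.constant([[0, 100, 400]])
# Pairwise offsets: rel_x[b, i, j] = coord_x[b, j] - coord_x[b, i]
rel_x = tf.expand_dims(coord_x, axis=-2) - tf.expand_dims(coord_x, axis=-1)
print(rel_x.numpy())
# [[[   0  100  400]
#   [-100    0  300]
#   [-400 -300    0]]]
```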
def call(
self,
hidden_states: tf.Tensor,
bbox: tf.Tensor | None = None,
attention_mask: tf.Tensor | None = None,
head_mask: tf.Tensor | None = None,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
position_ids: tf.Tensor | None = None,
training: bool = False,
) -> Union[
TFBaseModelOutput,
Tuple[tf.Tensor],
Tuple[tf.Tensor, tf.Tensor],
Tuple[tf.Tensor, tf.Tensor, tf.Tensor],
]:
all_hidden_states = () if output_hidden_states else None
all_self_attentions = () if output_attentions else None
rel_pos = self._cal_1d_pos_emb(position_ids) if self.has_relative_attention_bias else None
rel_2d_pos = self._cal_2d_pos_emb(bbox) if self.has_spatial_attention_bias else None
for i, layer_module in enumerate(self.layer):
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
layer_head_mask = head_mask[i] if head_mask is not None else None
layer_outputs = layer_module(
hidden_states,
attention_mask,
layer_head_mask,
output_attentions,
rel_pos=rel_pos,
rel_2d_pos=rel_2d_pos,
training=training,
)
hidden_states = layer_outputs[0]
if output_attentions:
all_self_attentions = all_self_attentions + (layer_outputs[1],)
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if return_dict:
return TFBaseModelOutput(
last_hidden_state=hidden_states,
hidden_states=all_hidden_states,
attentions=all_self_attentions,
)
else:
return tuple(
value for value in [hidden_states, all_hidden_states, all_self_attentions] if value is not None
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "rel_pos_bias", None) is not None:
with tf.name_scope(self.rel_pos_bias.name):
self.rel_pos_bias.build([None, None, self.rel_pos_bins])
if getattr(self, "rel_pos_x_bias", None) is not None:
with tf.name_scope(self.rel_pos_x_bias.name):
self.rel_pos_x_bias.build([None, None, self.rel_2d_pos_bins])
if getattr(self, "rel_pos_y_bias", None) is not None:
with tf.name_scope(self.rel_pos_y_bias.name):
self.rel_pos_y_bias.build([None, None, self.rel_2d_pos_bins])
if getattr(self, "layer", None) is not None:
for layer in self.layer:
with tf.name_scope(layer.name):
layer.build(None)
@keras_serializable
class TFLayoutLMv3MainLayer(keras.layers.Layer):
config_class = LayoutLMv3Config
def __init__(self, config: LayoutLMv3Config, **kwargs):
super().__init__(**kwargs)
self.config = config
if config.text_embed:
self.embeddings = TFLayoutLMv3TextEmbeddings(config, name="embeddings")
if config.visual_embed:
self.patch_embed = TFLayoutLMv3PatchEmbeddings(config, name="patch_embed")
self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = keras.layers.Dropout(config.hidden_dropout_prob, name="dropout")
if config.has_relative_attention_bias or config.has_spatial_attention_bias:
image_size = config.input_size // config.patch_size
self.init_visual_bbox(image_size=(image_size, image_size))
self.norm = keras.layers.LayerNormalization(epsilon=1e-6, name="norm")
self.encoder = TFLayoutLMv3Encoder(config, name="encoder")
def build(self, input_shape=None):
if self.config.visual_embed:
image_size = self.config.input_size // self.config.patch_size
self.cls_token = self.add_weight(
shape=(1, 1, self.config.hidden_size),
initializer="zeros",
trainable=True,
dtype=tf.float32,
name="cls_token",
)
self.pos_embed = self.add_weight(
shape=(1, image_size * image_size + 1, self.config.hidden_size),
initializer="zeros",
trainable=True,
dtype=tf.float32,
name="pos_embed",
)
if self.built:
return
self.built = True
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
if getattr(self, "embeddings", None) is not None:
with tf.name_scope(self.embeddings.name):
self.embeddings.build(None)
if getattr(self, "patch_embed", None) is not None:
with tf.name_scope(self.patch_embed.name):
self.patch_embed.build(None)
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
if getattr(self, "dropout", None) is not None:
with tf.name_scope(self.dropout.name):
self.dropout.build(None)
if getattr(self, "norm", None) is not None:
with tf.name_scope(self.norm.name):
self.norm.build([None, None, self.config.hidden_size])
def get_input_embeddings(self) -> keras.layers.Layer:
return self.embeddings.word_embeddings
def set_input_embeddings(self, value: tf.Variable):
self.embeddings.word_embeddings.weight = value
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
raise NotImplementedError
def init_visual_bbox(self, image_size: Tuple[int, int], max_len: int = 1000):
height, width = image_size
visual_bbox_x = tf.range(0, max_len * (width + 1), max_len) // width
visual_bbox_x = tf.expand_dims(visual_bbox_x, axis=0)
visual_bbox_x = tf.tile(visual_bbox_x, [width, 1])
visual_bbox_y = tf.range(0, max_len * (height + 1), max_len) // height
visual_bbox_y = tf.expand_dims(visual_bbox_y, axis=1)
visual_bbox_y = tf.tile(visual_bbox_y, [1, height])
visual_bbox = tf.stack(
[visual_bbox_x[:, :-1], visual_bbox_y[:-1], visual_bbox_x[:, 1:], visual_bbox_y[1:]],
axis=-1,
)
visual_bbox = tf.reshape(visual_bbox, [-1, 4])
cls_token_box = tf.constant([[1, 1, max_len - 1, max_len - 1]], dtype=tf.int32)
self.visual_bbox = tf.concat([cls_token_box, visual_bbox], axis=0)
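For intuition, with a hypothetical 2x2 patch grid and max_len=1000 the construction above yields one normalized box per image patch plus a near-full-page box for the [CLS] token. A small sketch with assumed toy values:
```
import tensorflow as tf

height = width = 2          # assumed toy 2x2 patch grid
max_len = 1000
x = tf.range(0, max_len * (width + 1), max_len) // width    # [0, 500, 1000]
y = tf.range(0, max_len * (height + 1), max_len) // height  # [0, 500, 1000]
# Each patch box is [x_left, y_top, x_right, y_bottom] on the 0-1000 grid:
# [0, 0, 500, 500], [500, 0, 1000, 500], [0, 500, 500, 1000], [500, 500, 1000, 1000]
# plus the [CLS] box [1, 1, 999, 999] prepended.
```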
def calculate_visual_bbox(self, batch_size: int, dtype: tf.DType):
visual_bbox = tf.expand_dims(self.visual_bbox, axis=0)
visual_bbox = tf.tile(visual_bbox, [batch_size, 1, 1])
visual_bbox = tf.cast(visual_bbox, dtype=dtype)
return visual_bbox
def embed_image(self, pixel_values: tf.Tensor) -> tf.Tensor:
embeddings = self.patch_embed(pixel_values)
batch_size = tf.shape(embeddings)[0]
cls_tokens = tf.tile(self.cls_token, [batch_size, 1, 1])
embeddings = tf.concat([cls_tokens, embeddings], axis=1)
if getattr(self, "pos_embed", None) is not None:
embeddings += self.pos_embed
embeddings = self.norm(embeddings)
return embeddings
def get_extended_attention_mask(self, attention_mask: tf.Tensor) -> tf.Tensor:
n_dims = len(attention_mask.shape)
if n_dims == 3:
extended_attention_mask = tf.expand_dims(attention_mask, axis=1)
elif n_dims == 2:
extended_attention_mask = tf.expand_dims(attention_mask, axis=1)
extended_attention_mask = tf.expand_dims(extended_attention_mask, axis=1)
else:
raise ValueError(f"Wrong shape for attention_mask (shape {attention_mask.shape}).")
extended_attention_mask = tf.cast(extended_attention_mask, self.compute_dtype)
extended_attention_mask = (1.0 - extended_attention_mask) * LARGE_NEGATIVE
return extended_attention_mask
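In other words, a 2D padding mask of shape (batch, seq_len) becomes a rank-4 additive bias broadcast over heads and query positions. A minimal sketch, where -1e9 stands in for the module's LARGE_NEGATIVE constant:
```
import tensorflow as tf

attention_mask = tf.constant([[1.0, 1.0, 0.0]])            # 1 = attend, 0 = padding
extended = attention_mask[:, tf.newaxis, tf.newaxis, :]    # (batch, 1, 1, seq_len)
extended = (1.0 - extended) * -1e9                         # 0 for real tokens, -1e+09 for the padded one
print(extended.shape)                                      # (1, 1, 1, 3)
```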
def get_head_mask(self, head_mask: tf.Tensor | None) -> Union[tf.Tensor, List[tf.Tensor | None]]:
if head_mask is None:
return [None] * self.config.num_hidden_layers
n_dims = tf.rank(head_mask)
if n_dims == 1:
head_mask = tf.expand_dims(head_mask, axis=0)
head_mask = tf.expand_dims(head_mask, axis=0)
head_mask = tf.expand_dims(head_mask, axis=-1)
head_mask = tf.expand_dims(head_mask, axis=-1)
head_mask = tf.tile(
head_mask, [self.config.num_hidden_layers, 1, 1, 1, 1]
)
elif n_dims == 2:
head_mask = tf.expand_dims(head_mask, axis=1)
head_mask = tf.expand_dims(head_mask, axis=-1)
head_mask = tf.expand_dims(head_mask, axis=-1)
elif n_dims != 5:
raise ValueError(f"Wrong shape for head_mask (shape {head_mask.shape}).")
assert tf.rank(head_mask) == 5, f"Got head_mask rank of {tf.rank(head_mask)}, but require 5."
head_mask = tf.cast(head_mask, self.compute_dtype)
return head_mask
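For a 1D mask with one entry per head, the expansion above produces a rank-5 tensor that the encoder loop can index per layer. A hedged sketch of the same broadcasting, with toy sizes assumed:
```
import tensorflow as tf

num_hidden_layers, num_heads = 12, 3                      # assumed toy sizes
head_mask = tf.constant([1.0, 1.0, 0.0])                  # disable the third head everywhere
mask = head_mask[tf.newaxis, tf.newaxis, :, tf.newaxis, tf.newaxis]   # (1, 1, heads, 1, 1)
mask = tf.tile(mask, [num_hidden_layers, 1, 1, 1, 1])     # one slice per layer
print(mask.shape)                                         # (12, 1, 3, 1, 1)
```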
@unpack_inputs
def call(
self,
input_ids: tf.Tensor | None = None,
bbox: tf.Tensor | None = None,
attention_mask: tf.Tensor | None = None,
token_type_ids: tf.Tensor | None = None,
position_ids: tf.Tensor | None = None,
head_mask: tf.Tensor | None = None,
inputs_embeds: tf.Tensor | None = None,
pixel_values: tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: bool = False,
) -> Union[
TFBaseModelOutput,
Tuple[tf.Tensor],
Tuple[tf.Tensor, tf.Tensor],
Tuple[tf.Tensor, tf.Tensor, tf.Tensor],
Tuple[tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor],
Tuple[tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor],
Tuple[tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor],
]:
class TFLayoutLMv3PreTrainedModel(TFPreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = LayoutLMv3Config
base_model_prefix = "layoutlmv3"
@property
def input_signature(self):
sig = super().input_signature
sig["bbox"] = tf.TensorSpec((None, None, 4), tf.int32, name="bbox")
return sig
LAYOUTLMV3_START_DOCSTRING = r"""
This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
behavior.
<Tip>
TensorFlow models and layers in `transformers` accept two formats as input:
- having all inputs as keyword arguments (like PyTorch models), or
- having all inputs as a list, tuple or dict in the first positional argument.
The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
positional argument:
- a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
- a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
`model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
- a dictionary with one or several input Tensors associated to the input names given in the docstring:
`model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
Note that when creating models and layers with
[subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
about any of this, as you can just pass inputs like you would to any other Python function!
</Tip>
Parameters:
config ([`LayoutLMv3Config`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights.
"""
LAYOUTLMV3_INPUTS_DOCSTRING = r"""
"""
@add_start_docstrings(
"The bare LayoutLMv3 Model transformer outputting raw hidden-states without any specific head on top.",
LAYOUTLMV3_START_DOCSTRING,
)
class TFLayoutLMv3Model(TFLayoutLMv3PreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"position_ids"]
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.layoutlmv3 = TFLayoutLMv3MainLayer(config, name="layoutlmv3")
@unpack_inputs
@add_start_docstrings_to_model_forward(LAYOUTLMV3_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=TFBaseModelOutput, config_class=_CONFIG_FOR_DOC)
def call(
self,
input_ids: tf.Tensor | None = None,
bbox: tf.Tensor | None = None,
attention_mask: tf.Tensor | None = None,
token_type_ids: tf.Tensor | None = None,
position_ids: tf.Tensor | None = None,
head_mask: tf.Tensor | None = None,
inputs_embeds: tf.Tensor | None = None,
pixel_values: tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: bool = False,
) -> Union[
TFBaseModelOutput,
Tuple[tf.Tensor],
Tuple[tf.Tensor, tf.Tensor],
Tuple[tf.Tensor, tf.Tensor, tf.Tensor],
]:
r"""
Forward pass for the TFLayoutLMv3Model.
Args:
input_ids (tf.Tensor, optional): The input token IDs.
bbox (tf.Tensor, optional): The bounding boxes of tokens.
attention_mask (tf.Tensor, optional): The attention mask.
token_type_ids (tf.Tensor, optional): The token type IDs.
position_ids (tf.Tensor, optional): The position IDs.
head_mask (tf.Tensor, optional): The mask for attention heads.
inputs_embeds (tf.Tensor, optional): The embedded inputs.
pixel_values (tf.Tensor, optional): The pixel values of images.
output_attentions (bool, optional): Whether to output attentions.
output_hidden_states (bool, optional): Whether to output hidden states.
return_dict (bool, optional): Whether to return a dictionary.
training (bool, optional): Whether in training mode.
Returns:
Union[TFBaseModelOutput, Tuple[tf.Tensor], Tuple[tf.Tensor, tf.Tensor], Tuple[tf.Tensor, tf.Tensor, tf.Tensor]]:
The model outputs.
Examples:
Example usage of the bare TFLayoutLMv3Model to obtain last hidden states:
```
>>> from transformers import AutoProcessor, TFAutoModel
>>> from datasets import load_dataset
>>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
>>> model = TFAutoModel.from_pretrained("microsoft/layoutlmv3-base")
>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
>>> example = dataset[0]
>>> image = example["image"]
>>> words = example["tokens"]
>>> boxes = example["bboxes"]
>>> encoding = processor(image, words, boxes=boxes, return_tensors="tf")
>>> outputs = model(**encoding)
>>> last_hidden_states = outputs.last_hidden_state
```
"""
outputs = self.layoutlmv3(
input_ids=input_ids,
bbox=bbox,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
pixel_values=pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "layoutlmv3", None) is not None:
with tf.name_scope(self.layoutlmv3.name):
self.layoutlmv3.build(None)
class TFLayoutLMv3ClassificationHead(keras.layers.Layer):
"""
Placeholder for the classification head of the TFLayoutLMv3Model.
"""
Head for sentence-level classification tasks. Reference: RobertaClassificationHead
"""
# Constructor: sets up the layers of the classification head
def __init__(self, config: LayoutLMv3Config, **kwargs):
super().__init__(**kwargs)
# Fully connected layer with config.hidden_size outputs and tanh activation
self.dense = keras.layers.Dense(
config.hidden_size,
activation="tanh",
kernel_initializer=get_initializer(config.initializer_range),
name="dense",
)
# Dropout rate: classifier_dropout from the config if set, otherwise hidden_dropout_prob
classifier_dropout = (
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
self.dropout = keras.layers.Dropout(
classifier_dropout,
name="dropout",
)
# Fully connected layer with config.num_labels outputs used as the final projection
self.out_proj = keras.layers.Dense(
config.num_labels,
kernel_initializer=get_initializer(config.initializer_range),
name="out_proj",
)
# Keep a reference to the config
self.config = config
# Forward pass of the head
def call(self, inputs: tf.Tensor, training: bool = False) -> tf.Tensor:
# Apply dropout to the inputs
outputs = self.dropout(inputs, training=training)
# Project through the dense (tanh) layer
outputs = self.dense(outputs)
# Apply dropout again
outputs = self.dropout(outputs, training=training)
# Project to the final logits with out_proj
outputs = self.out_proj(outputs)
return outputs
# Build the sublayers of the head
def build(self, input_shape=None):
if self.built:
return
# Mark the layer as built
self.built = True
# Build the dense layer if it exists
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
# Build the dropout layer if it exists
if getattr(self, "dropout", None) is not None:
with tf.name_scope(self.dropout.name):
self.dropout.build(None)
# Build the out_proj layer if it exists
if getattr(self, "out_proj", None) is not None:
with tf.name_scope(self.out_proj.name):
self.out_proj.build([None, None, self.config.hidden_size])
"""
LayoutLMv3 Model with a sequence classification head on top (a linear layer on top of the final hidden state of the
[CLS] token) e.g. for document image classification tasks such as the
[RVL-CDIP](https://www.cs.cmu.edu/~aharley/rvl-cdip/) dataset.
"""
# TFLayoutLMv3ForSequenceClassification inherits from TFLayoutLMv3PreTrainedModel and TFSequenceClassificationLoss;
# it performs document image classification by placing a linear layer on top of the final hidden state of the [CLS] token.
@add_start_docstrings(
"""
LayoutLMv3 Model with a sequence classification head on top (a linear layer on top of the final hidden state of the
[CLS] token) e.g. for document image classification tasks such as the
[RVL-CDIP](https://www.cs.cmu.edu/~aharley/rvl-cdip/) dataset.
""",
LAYOUTLMV3_START_DOCSTRING,
)
class TFLayoutLMv3ForSequenceClassification(TFLayoutLMv3PreTrainedModel, TFSequenceClassificationLoss):
# Names of layers that may be unexpected or missing when this TF model is loaded from a PyTorch checkpoint
_keys_to_ignore_on_load_unexpected = [r"position_ids"]
def __init__(self, config: LayoutLMv3Config, **kwargs):
super().__init__(config, **kwargs)
self.config = config
# LayoutLMv3 backbone, named "layoutlmv3"
self.layoutlmv3 = TFLayoutLMv3MainLayer(config, name="layoutlmv3")
# Classification head, named "classifier"
self.classifier = TFLayoutLMv3ClassificationHead(config, name="classifier")
@unpack_inputs
@add_start_docstrings_to_model_forward(LAYOUTLMV3_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=TFSequenceClassifierOutput, config_class=_CONFIG_FOR_DOC)
# Forward pass: accepts the usual LayoutLMv3 inputs and returns a TFSequenceClassifierOutput or a tuple
def call(
self,
input_ids: tf.Tensor | None = None,
attention_mask: tf.Tensor | None = None,
token_type_ids: tf.Tensor | None = None,
position_ids: tf.Tensor | None = None,
head_mask: tf.Tensor | None = None,
inputs_embeds: tf.Tensor | None = None,
labels: tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
bbox: tf.Tensor | None = None,
pixel_values: tf.Tensor | None = None,
training: Optional[bool] = False,
) -> Union[
TFSequenceClassifierOutput,
Tuple[tf.Tensor],
Tuple[tf.Tensor, tf.Tensor],
Tuple[tf.Tensor, tf.Tensor, tf.Tensor],
Tuple[tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor],
# union of the possible return types
]:
"""
Returns:
Examples:
```
>>> from transformers import AutoProcessor, TFAutoModelForSequenceClassification
>>> from datasets import load_dataset
>>> import tensorflow as tf
>>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
>>> model = TFAutoModelForSequenceClassification.from_pretrained("microsoft/layoutlmv3-base")
>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
>>> example = dataset[0]
>>> image = example["image"]
>>> words = example["tokens"]
>>> boxes = example["bboxes"]
>>> encoding = processor(image, words, boxes=boxes, return_tensors="tf")
>>> sequence_label = tf.convert_to_tensor([1])
>>> outputs = model(**encoding, labels=sequence_label)
>>> loss = outputs.loss
>>> logits = outputs.logits
```"""
# If `return_dict` is not set, fall back to the default from the model config
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Run the LayoutLMv3 backbone on the inputs
outputs = self.layoutlmv3(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
bbox=bbox,
pixel_values=pixel_values,
training=training,
)
# Take the hidden state of the first ([CLS]) token as the sequence representation
sequence_output = outputs[0][:, 0, :]
# Classify the pooled representation
logits = self.classifier(sequence_output, training=training)
# Compute the loss only when labels are provided
loss = None if labels is None else self.hf_compute_loss(labels, logits)
if not return_dict:
# Tuple output: logits followed by the remaining backbone outputs, with the loss prepended when present
output = (logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
# Dict-style output with loss, logits, hidden states and attention weights
return TFSequenceClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
# Return early if the layers have already been built
if self.built:
return
self.built = True
# Build the backbone and the classification head under their own name scopes
if getattr(self, "layoutlmv3", None) is not None:
with tf.name_scope(self.layoutlmv3.name):
self.layoutlmv3.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build(None)
"""
LayoutLMv3 Model with a token classification head on top (a linear layer on top of the final hidden states) e.g.
for sequence labeling (information extraction) tasks such as [FUNSD](https://guillaumejaume.github.io/FUNSD/),
[SROIE](https://rrc.cvc.uab.es/?ch=13), [CORD](https://github.com/clovaai/cord) and
[Kleister-NDA](https://github.com/applicaai/kleister-nda).
This class inherits from TFLayoutLMv3PreTrainedModel and TFTokenClassificationLoss. It provides a token
classification model specifically tailored for layout-aware tasks.
Attributes:
_keys_to_ignore_on_load_unexpected (list): Names of layers to ignore when loading a TF model from a PT model.
Args:
config (LayoutLMv3Config): Configuration class instance defining the model architecture and hyperparameters.
"""
@add_start_docstrings(
"""
LayoutLMv3 Model with a token classification head on top (a linear layer on top of the final hidden states) e.g.
for sequence labeling (information extraction) tasks such as [FUNSD](https://guillaumejaume.github.io/FUNSD/),
[SROIE](https://rrc.cvc.uab.es/?ch=13), [CORD](https://github.com/clovaai/cord) and
[Kleister-NDA](https://github.com/applicaai/kleister-nda).
""",
LAYOUTLMV3_START_DOCSTRING,
)
class TFLayoutLMv3ForTokenClassification(TFLayoutLMv3PreTrainedModel, TFTokenClassificationLoss):
# names with a '.' represent the authorized unexpected/missing layers when a TF model is loaded from a PT model
_keys_to_ignore_on_load_unexpected = [r"position_ids"]
def __init__(self, config: LayoutLMv3Config, **kwargs):
super().__init__(config, **kwargs)
self.num_labels = config.num_labels
# Initialize the main layers of the LayoutLMv3 model
self.layoutlmv3 = TFLayoutLMv3MainLayer(config, name="layoutlmv3")
self.dropout = keras.layers.Dropout(config.hidden_dropout_prob, name="dropout")
# Initialize the classifier layer based on the number of labels in the configuration
if config.num_labels < 10:
self.classifier = keras.layers.Dense(
config.num_labels,
kernel_initializer=get_initializer(config.initializer_range),
name="classifier",
)
else:
self.classifier = TFLayoutLMv3ClassificationHead(config, name="classifier")
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(LAYOUTLMV3_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=TFTokenClassifierOutput, config_class=_CONFIG_FOR_DOC)
def call(
self,
input_ids: tf.Tensor | None = None,
bbox: tf.Tensor | None = None,
attention_mask: tf.Tensor | None = None,
token_type_ids: tf.Tensor | None = None,
position_ids: tf.Tensor | None = None,
head_mask: tf.Tensor | None = None,
inputs_embeds: tf.Tensor | None = None,
labels: tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
pixel_values: tf.Tensor | None = None,
training: Optional[bool] = False,
) -> Union[
TFTokenClassifierOutput,
Tuple[tf.Tensor],
Tuple[tf.Tensor, tf.Tensor],
Tuple[tf.Tensor, tf.Tensor, tf.Tensor],
Tuple[tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor],
# More return types depending on the inputs and configuration
]:
"""
Performs the forward pass of the model for token classification.
Args (depending on the input types):
input_ids (tf.Tensor, optional): Tensor of input token IDs.
bbox (tf.Tensor, optional): Tensor of bounding boxes for each token.
attention_mask (tf.Tensor, optional): Mask indicating which tokens should be attended to.
token_type_ids (tf.Tensor, optional): Type IDs to distinguish different sequences in the input.
position_ids (tf.Tensor, optional): Positional IDs to indicate the position of tokens.
head_mask (tf.Tensor, optional): Mask to hide certain heads in the self-attention layers.
inputs_embeds (tf.Tensor, optional): Embedded inputs if the input tokens are already embedded.
labels (tf.Tensor, optional): Labels for the token classification task.
output_attentions (bool, optional): Whether to output attentions.
output_hidden_states (bool, optional): Whether to output hidden states.
return_dict (bool, optional): Whether to return a dictionary instead of a tuple of outputs.
pixel_values (tf.Tensor, optional): Pixel values for image tokens if images are part of inputs.
training (bool, optional): Whether the model is in training mode.
Returns:
TFTokenClassifierOutput or Tuple of Tensors: Output depending on the configuration and inputs.
Raises:
ValueError: If the configuration is invalid or incompatible with the model.
"""
# If `return_dict` is not set, fall back to the default from the model config
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Run the LayoutLMv3 backbone
outputs = self.layoutlmv3(
input_ids,
bbox=bbox,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
pixel_values=pixel_values,
training=training,
)
# Take the shape of `input_ids` if provided, otherwise the shape of `inputs_embeds` without the embedding dim
if input_ids is not None:
input_shape = tf.shape(input_ids)
else:
input_shape = tf.shape(inputs_embeds)[:-1]
seq_length = input_shape[1]
# Keep only the text part of the hidden states (the first `seq_length` positions)
sequence_output = outputs[0][:, :seq_length]
# Apply dropout during training, then classify each token
sequence_output = self.dropout(sequence_output, training=training)
logits = self.classifier(sequence_output)
# Compute the loss only when labels are provided
loss = None if labels is None else self.hf_compute_loss(labels, logits)
if not return_dict:
# Tuple output: logits followed by the remaining backbone outputs, with the loss prepended when present
output = (logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
# Dict-style output with loss, logits, hidden states and attention weights
return TFTokenClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
# Return early if the layers have already been built
if self.built:
return
self.built = True
# Build the backbone, the dropout layer and the classifier under their own name scopes
if getattr(self, "layoutlmv3", None) is not None:
with tf.name_scope(self.layoutlmv3.name):
self.layoutlmv3.build(None)
if getattr(self, "dropout", None) is not None:
with tf.name_scope(self.dropout.name):
self.dropout.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
# The classifier expects inputs of shape [batch, seq_len, hidden_size]
self.classifier.build([None, None, self.config.hidden_size])
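The token classification head is used in the same way as the sequence classification example above. A hedged usage sketch; the checkpoint, dataset and num_labels below are assumed, mirroring a FUNSD-style setup:
```
>>> from transformers import AutoProcessor, TFAutoModelForTokenClassification
>>> from datasets import load_dataset
>>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
>>> model = TFAutoModelForTokenClassification.from_pretrained("microsoft/layoutlmv3-base", num_labels=7)
>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
>>> example = dataset[0]
>>> encoding = processor(example["image"], example["tokens"], boxes=example["bboxes"], word_labels=example["ner_tags"], return_tensors="tf")
>>> outputs = model(**encoding)
>>> loss, logits = outputs.loss, outputs.logits
```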
"""
LayoutLMv3 Model with a span classification head on top for extractive question-answering tasks such as
[DocVQA](https://rrc.cvc.uab.es/?ch=17) (a linear layer on top of the text part of the hidden-states output to
compute `span start logits` and `span end logits`).
"""
@add_start_docstrings(
"""
LayoutLMv3 Model with a span classification head on top for extractive question-answering tasks such as
[DocVQA](https://rrc.cvc.uab.es/?ch=17) (a linear layer on top of the text part of the hidden-states output to
compute `span start logits` and `span end logits`).
""",
LAYOUTLMV3_START_DOCSTRING,
)
class TFLayoutLMv3ForQuestionAnswering(TFLayoutLMv3PreTrainedModel, TFQuestionAnsweringLoss):
# names with a '.' represent the authorized unexpected/missing layers when a TF model is loaded from a PT model
_keys_to_ignore_on_load_unexpected = [r"position_ids"]
def __init__(self, config: LayoutLMv3Config, **kwargs):
super().__init__(config, **kwargs)
self.num_labels = config.num_labels
# Initialize the main LayoutLMv3 layer with the provided configuration
self.layoutlmv3 = TFLayoutLMv3MainLayer(config, name="layoutlmv3")
# Initialize the question answering classification head for LayoutLMv3
self.qa_outputs = TFLayoutLMv3ClassificationHead(config, name="qa_outputs")
@unpack_inputs
@add_start_docstrings_to_model_forward(LAYOUTLMV3_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=TFQuestionAnsweringModelOutput, config_class=_CONFIG_FOR_DOC)
def call(
self,
input_ids: tf.Tensor | None = None,
attention_mask: tf.Tensor | None = None,
token_type_ids: tf.Tensor | None = None,
position_ids: tf.Tensor | None = None,
head_mask: tf.Tensor | None = None,
inputs_embeds: tf.Tensor | None = None,
start_positions: tf.Tensor | None = None,
end_positions: tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
bbox: tf.Tensor | None = None,
pixel_values: tf.Tensor | None = None,
return_dict: Optional[bool] = None,
training: bool = False,
) -> Union[
TFQuestionAnsweringModelOutput,
Tuple[tf.Tensor],
Tuple[tf.Tensor, tf.Tensor],
Tuple[tf.Tensor, tf.Tensor, tf.Tensor],
Tuple[tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor],
]:
"""
Forward pass of the TFLayoutLMv3ForQuestionAnswering model.
Args:
input_ids: Tensor of input token IDs.
attention_mask: Tensor of attention mask.
token_type_ids: Tensor of token type IDs.
position_ids: Tensor of position IDs.
head_mask: Tensor of head masks.
inputs_embeds: Tensor of input embeddings.
start_positions: Tensor of start positions for QA.
end_positions: Tensor of end positions for QA.
output_attentions: Whether to output attentions.
output_hidden_states: Whether to output hidden states.
bbox: Tensor of bounding boxes.
pixel_values: Tensor of pixel values.
return_dict: Whether to return a dictionary of outputs.
training: Whether the model is in training mode.
Returns:
TFQuestionAnsweringModelOutput or tuple of output tensors.
"""
def build(self, input_shape=None):
"""
Builds the TFLayoutLMv3ForQuestionAnswering model.
Args:
input_shape: Shape of the input tensor.
"""
if self.built:
return
self.built = True
# Build the LayoutLMv3 main layer if it exists
if getattr(self, "layoutlmv3", None) is not None:
with tf.name_scope(self.layoutlmv3.name):
self.layoutlmv3.build(None)
# Build the QA classification head if it exists
if getattr(self, "qa_outputs", None) is not None:
with tf.name_scope(self.qa_outputs.name):
self.qa_outputs.build(None)
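For completeness, extractive question answering follows the same calling pattern. A hedged usage sketch; the question text and the answer span positions below are made-up example values:
```
>>> from transformers import AutoProcessor, TFAutoModelForQuestionAnswering
>>> from datasets import load_dataset
>>> import tensorflow as tf
>>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
>>> model = TFAutoModelForQuestionAnswering.from_pretrained("microsoft/layoutlmv3-base")
>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
>>> example = dataset[0]
>>> question = "what's his name?"
>>> encoding = processor(example["image"], question, example["tokens"], boxes=example["bboxes"], return_tensors="tf")
>>> start_positions = tf.convert_to_tensor([1])
>>> end_positions = tf.convert_to_tensor([3])
>>> outputs = model(**encoding, start_positions=start_positions, end_positions=end_positions)
>>> loss, start_logits, end_logits = outputs.loss, outputs.start_logits, outputs.end_logits
```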