Transformers 源码解析（六十一）

`.\models\kosmos2\processing_kosmos2.py`

# coding=utf-8
# 上面是指定文件编码为 UTF-8

# 版权声明，版权归 Microsoft Research 和 HuggingFace Inc. 团队所有
# 根据 Apache 许可证版本 2.0 使用本文件，除非符合许可证，否则不得使用本文件
# 可以在以下网址获取许可证的副本：http://www.apache.org/licenses/LICENSE-2.0
# 除非适用法律要求或书面同意，否则按"原样"分发软件，不提供任何形式的担保或条件
# 有关特定语言的权限，请参阅许可证文档

"""KOSMOS-2 的处理器类。"""

import copy  # 导入 copy 模块，用于复制对象
import math  # 导入 math 模块，用于数学运算
import re  # 导入 re 模块，用于正则表达式操作
from typing import List, Optional, Tuple, Union  # 导入类型提示相关的模块

from ...image_processing_utils import BatchFeature  # 导入批量特征处理工具
from ...image_utils import ImageInput, is_batched  # 导入图像输入和批处理检查函数
from ...processing_utils import ProcessorMixin  # 导入处理器混合类
from ...tokenization_utils import AddedToken  # 导入添加的标记类
from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, TextInput, TruncationStrategy  # 导入批编码相关类和策略
from ...utils import TensorType  # 导入张量类型

# Accepted shapes for a `bboxes` argument. A box is either a pair of patch indices (2 ints) or four float
# coordinates; boxes may be given in a flat list or in a list of lists.
BboxInput = Union[
    List[Tuple[int, int]],  # flat list of patch-index pairs
    List[Tuple[float, float, float, float]],  # flat list of 4-float coordinate boxes
    List[List[Tuple[int, int]]],  # nested lists of patch-index pairs
    List[List[Tuple[float, float, float, float]]],  # nested lists of 4-float boxes (previously declared with 3 floats)
]

class Kosmos2Processor(ProcessorMixin):
    """
    Constructs a KOSMOS-2 processor which wraps a KOSMOS-2 image processor and a KOSMOS-2 tokenizer into a single
    processor.

    [`Kosmos2Processor`] offers all the functionalities of [`CLIPImageProcessor`] and some functionalities of
    [`XLMRobertaTokenizerFast`]. See the docstring of [`~Kosmos2Processor.__call__`] and [`~Kosmos2Processor.decode`]
    for more information.

    Args:
        image_processor (`CLIPImageProcessor`):
            An instance of [`CLIPImageProcessor`]. The image processor is a required input.
        tokenizer (`XLMRobertaTokenizerFast`):
            An instance of [`XLMRobertaTokenizerFast`]. The tokenizer is a required input.
        num_patch_index_tokens (`int`, *optional*, defaults to 1024):
            The number of tokens that represent patch indices.
    """

    attributes = ["image_processor", "tokenizer"]  # names of the wrapped components
    image_processor_class = "CLIPImageProcessor"  # class name of the image processor
    tokenizer_class = ("XLMRobertaTokenizer", "XLMRobertaTokenizerFast")  # accepted tokenizer class names
        def __init__(self, image_processor, tokenizer, num_patch_index_tokens=1024):
            """
            Store the KOSMOS-2 tag tokens, register them (plus the patch-index tokens) on `tokenizer`, and
            initialise the parent class.

            Args:
                image_processor: passed through to the parent initialiser.
                tokenizer: receives the extra tokens via `add_tokens`; its `return_token_type_ids` is set to
                    `False`.
                num_patch_index_tokens (`int`, *optional*, defaults to 1024):
                    How many `<patch_index_XXXX>` tokens to generate.
            """
            # The tokenizer should not return token type ids.
            tokenizer.return_token_type_ids = False

            # End-of-document marker.
            self.eod_token = "</doc>"

            # Image span markers.
            self.boi_token, self.eoi_token = "<image>", "</image>"

            # End-of-chunk and end-of-line markers.
            self.eoc_token, self.eol_token = "</chunk>", "</line>"

            # Phrase span markers.
            self.bop_token, self.eop_token = "<phrase>", "</phrase>"

            # Object span markers.
            self.boo_token, self.eoo_token = "<object>", "</object>"

            # Separator placed between several boxes that belong to one phrase.
            self.dom_token = "<|delimiter_of_multi_objects|>"

            # Grounding marker.
            self.grd_token = "<grounding>"

            # All tag tokens, in registration order.
            self.tag_tokens = [
                self.eod_token,
                self.boi_token,
                self.eoi_token,
                self.eoc_token,
                self.eol_token,
                self.bop_token,
                self.eop_token,
                self.boo_token,
                self.eoo_token,
                self.dom_token,
                self.grd_token,
            ]

            self.num_patch_index_tokens = num_patch_index_tokens
            # One zero-padded token per patch index: "<patch_index_0000>", "<patch_index_0001>", ...
            patch_index_tokens = [f"<patch_index_{str(x).zfill(4)}>" for x in range(self.num_patch_index_tokens)]

            # Register tag tokens first, then patch-index tokens, all with the same `AddedToken` options.
            tokenizer.add_tokens(
                [
                    AddedToken(token, lstrip=True, rstrip=False, normalized=False)
                    for token in self.tag_tokens + patch_index_tokens
                ]
            )

            super().__init__(image_processor, tokenizer)

        def __call__(
            self,
            images: ImageInput = None,
            text: Union[TextInput, List[TextInput]] = None,
            bboxes: BboxInput = None,
            num_image_tokens: Optional[int] = 64,
            first_image_token_id: Optional[int] = None,
            add_special_tokens: bool = True,
            add_eos_token: bool = False,
            padding: Union[bool, str, PaddingStrategy] = False,
            truncation: Union[bool, str, TruncationStrategy] = None,
            max_length: Optional[int] = None,
            pad_to_multiple_of: Optional[int] = None,
            return_attention_mask: Optional[bool] = None,
            return_length: bool = False,
            verbose: bool = True,
            return_tensors: Optional[Union[str, TensorType]] = None,
            **kwargs,
    def _check_bboxes_for_single_text(self, bboxes):
        """
        Check `bboxes` for a single text example. It could be
            - `None`: no bounding box associated to a text.
            - A list with each element being the bounding boxes associated to one `<phrase> ... </phrase>` pair found
              in a text. This could be:
                  - `None`: no bounding box associated to a `<phrase> ... </phrase>` pair.
                  - A tuple of 2 integers: A single bounding box specified by patch indices.
                  - A tuple of 4 float point number: A single bounding box specified by (normalized) coordinates.
                  - A list containing the above 2 tuple types: Multiple bounding boxes for a
                   `<phrase> ... </phrase>` pair.
        """
        if bboxes is None:
            return  # 如果 bboxes 是 None，直接返回，表示没有边界框与文本关联

        elif not isinstance(bboxes, list):
            raise ValueError("`bboxes` (for a single text example) should be `None` or a list.")
            # 如果 bboxes 不是 list 类型，则引发 ValueError 异常，说明它应该是 None 或者一个列表

        # `bbox` is the bounding boxes for a single <phrase> </phrase> pair
        for bbox in bboxes:
            if bbox is None:
                continue  # 如果 bbox 是 None，则跳过当前循环，继续下一个 bbox

            elif not isinstance(bbox, list):
                bbox = [bbox]  # 如果 bbox 不是 list 类型，则转换成单元素的列表

            for element in bbox:
                if not isinstance(element, tuple) or not (
                    (len(element) == 2 and all(isinstance(x, int) for x in element))
                    or (len(element) == 4 and all(isinstance(x, float) for x in element))
                ):
                    raise ValueError(
                        "Each element in `bboxes` (for a single text example) should be either `None`, a tuple containing "
                        "2 integers or 4 float point numbers, or a list containing such tuples. Also "
                        "make sure the arguments `texts` and `bboxes` passed to `preprocess_text` are both in "
                        "batches or both for a single example."
                    )
                    # 检查每个 element 是否符合要求，如果不符合，则引发 ValueError 异常

    def _preprocess_single_example(self, text, image, bboxes, img_info_tokens):
        text = text.strip()  # 去除 text 的首尾空白字符
        if image is not None:
            # 在文本前添加 `<image> ... (fake) image tokens ... </image>`
            text = f"{img_info_tokens} {text}"

        # 在 `<phrase> phrase text </phrase>` 后面添加 `<object> <patch_idx_xxxx> <patch_idx_yyy> </object>`
        text = self._insert_patch_index_tokens(text, bboxes)
        return text
    ) -> Union[str, List[str]]:
        """Add image and bounding box information to `texts` as image and patch index tokens.

        Args:
            texts (`Union[TextInput, List[TextInput]]`): The texts to be processed.
            images (`ImageInput`, *optional*): The images associated to `texts`.
            bboxes (`Union[List[Tuple[int]], List[Tuple[float]], List[List[Tuple[int]]], List[List[Tuple[float]]]]`, *optional*):
                The bounding bboxes associated to `texts`.
            num_image_tokens (`int`, *optional*, defaults to 64):
                The number of image tokens (used as latent queries). This should corresponds to the `latent_query_num`
                attribute in `Kosmos2Config`.

        Returns:
            `Union[TextInput, List[TextInput]]`: The processed texts with image and patch index tokens.
        """
        # These are fake `<image>` tokens enclosed between (the actual) `<image>` token and `</image>`.
        # 创建一个包含指定数量 `<image>` token 的列表，并将它们用空格分隔成一个字符串
        img_tokens = [self.boi_token] * num_image_tokens
        img_info_tokens = " ".join([self.boi_token] + img_tokens + [self.eoi_token])

        # make batch to simplify processing logic
        # 如果 texts 是单个字符串，则转换成单元素列表
        batched = True
        if isinstance(texts, str):
            batched = False
            texts = [texts]

        # 如果 images 为 None，则将其初始化为与 texts 相同长度的 None 列表
        if images is None:
            images = [None] * len(texts)
        # 如果 images 不是批量输入，则转换为单元素列表
        elif not is_batched(images):
            images = [images]
        # 检查 texts 和 images 的数量是否相同，否则引发 ValueError
        if len(texts) != len(images):
            raise ValueError(
                f"The number of examples in `texts` and `images` should be the same. Got {len(texts)} v.s. {len(images)} instead."
            )

        # 如果 texts 不是批量输入，则检查单个文本的 bboxes 格式
        if not batched:
            self._check_bboxes_for_single_text(bboxes)
            bboxes = [bboxes]
        # 如果 texts 是批量输入且 bboxes 不为 None，则检查 bboxes 的格式
        elif bboxes is not None:
            if not isinstance(bboxes, list):
                raise ValueError("`bboxes` should be `None` or a list (as a batch) when `texts` is passed as a batch.")
            for x in bboxes:
                self._check_bboxes_for_single_text(x)
        # 如果 bboxes 为 None，则初始化为与 texts 相同长度的 None 列表
        else:
            bboxes = [None] * len(texts)

        # 检查 texts 和 bboxes 的数量是否相同，否则引发 ValueError
        if len(bboxes) != len(texts):
            raise ValueError(
                f"The number of examples in `texts` and `bboxes` should be the same. Got {len(texts)} v.s. {len(bboxes)} instead."
            )

        # 对每个文本、对应的图片和边界框进行预处理，返回结果列表
        result = [
            self._preprocess_single_example(text, image, bbox, img_info_tokens)
            for text, image, bbox in zip(texts, images, bboxes)
        ]
        # 如果 texts 不是批量输入，则将结果转换为单个元素
        # 反之，如果是批量输入，则保持结果列表形式
        if not batched:
            result = result[0]

        # 返回处理后的结果列表或单个文本
        return result

    # Copied from transformers.models.blip.processing_blip.BlipProcessor.batch_decode with BertTokenizerFast->PreTrainedTokenizer
    # 将所有参数转发给 PreTrainedTokenizer 的 batch_decode 方法
    def batch_decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please
        refer to the docstring of this method for more information.

        Args:
            *args: positional arguments passed through unchanged to `self.tokenizer.batch_decode`.
            **kwargs: keyword arguments passed through unchanged to `self.tokenizer.batch_decode`.

        Returns:
            Whatever `self.tokenizer.batch_decode` returns; no post-processing is applied here.
        """
        # Pure delegation: the processor adds no decoding logic of its own.
        return self.tokenizer.batch_decode(*args, **kwargs)

    # Copied from transformers.models.blip.processing_blip.BlipProcessor.decode with BertTokenizerFast->PreTrainedTokenizer
    # Forwards all arguments to PreTrainedTokenizer's decode method.
    def decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to
        the docstring of this method for more information.

        Args:
            *args: positional arguments passed through unchanged to `self.tokenizer.decode`.
            **kwargs: keyword arguments passed through unchanged to `self.tokenizer.decode`.

        Returns:
            Whatever `self.tokenizer.decode` returns; no post-processing is applied here.
        """
        # Pure delegation: the processor adds no decoding logic of its own.
        return self.tokenizer.decode(*args, **kwargs)

    # 对生成的文本进行后处理，可以清理文本并提取实体及其边界框
    def post_process_generation(self, text, cleanup_and_extract=True):
        """
        Post-process generated text.

        Everything up to and including the last `self.eoi_token` is discarded. With `cleanup_and_extract=True` the
        remainder is passed to `clean_text_and_extract_entities_with_bboxes` and its result is returned; otherwise
        the remainder is returned as is.
        """
        # `rpartition` yields the text after the last end-of-image token, or the whole text when there is none.
        _, _, caption = text.rpartition(self.eoi_token)
        if not cleanup_and_extract:
            return caption
        return clean_text_and_extract_entities_with_bboxes(caption)

    @property
    # Copied from transformers.models.blip.processing_blip.BlipProcessor.model_input_names
    def model_input_names(self):
        """Return the tokenizer's input names followed by the image processor's, with duplicates removed."""
        tokenizer_input_names = self.tokenizer.model_input_names
        image_processor_input_names = self.image_processor.model_input_names
        # `dict.fromkeys` drops repeated names while keeping first-seen order.
        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
    # 将补丁索引标记插入文本中的指定短语区域
    def _insert_patch_index_tokens(self, text: str, bboxes: Union[List[Tuple[int]], List[Tuple[float]]]) -> str:
        # 如果未提供边界框或边界框列表为空，则直接返回原始文本
        if bboxes is None or len(bboxes) == 0:
            return text

        # 找出文本中所有匹配的 `<phrase>...</phrase>` 对
        matched_phrases = list(re.finditer(r"<phrase>.+?</phrase>", string=text))
        # 检查匹配到的短语对数与边界框数量是否相等，若不相等则引发异常
        if len(matched_phrases) != len(bboxes):
            raise ValueError(
                f"The number of elements in `bboxes` should be the same as the number of `<phrase> ... </phrase>` pairs in `text`. Got {len(matched_phrases)} v.s. {len(bboxes)} instead."
            )

        # 插入对象的补丁索引标记到找到的 `<phrase>...</phrase>` 对中
        curr_pos = 0
        buffer = []
        for matched, bbox in zip(matched_phrases, bboxes):
            _, end = matched.span()
            buffer.append(text[curr_pos:end])
            curr_pos = end

            # 如果边界框为 None，则跳过当前短语的处理
            if bbox is None:
                continue

            # 如果边界框是单个元组，则转换为列表以便处理
            if isinstance(bbox, tuple):
                bbox = [bbox]

            # 检查边界框列表中是否有 None 值，若有则引发异常
            if not all(box is not None for box in bbox):
                raise ValueError(
                    "The multiple bounding boxes for a single phrase should not contain any `None` value."
                )

            patch_index_strings = []
            # 对于每个边界框，将其转换为补丁索引标记，并构建标记字符串列表
            for box in bbox:
                patch_index_1, patch_index_2 = self._convert_bbox_to_patch_index_tokens(box)
                patch_index_strings.append(f"{patch_index_1} {patch_index_2}")

            # 如果标记字符串列表为空，则跳过当前短语的处理
            if len(patch_index_strings) == 0:
                continue

            # 将补丁索引标记字符串插入到 `<object>...</object>` 标签中
            position_str = " <|delimiter_of_multi_objects|> ".join(patch_index_strings)
            buffer.append(f"<object> {position_str} </object>")

        # 处理剩余的文本部分并将其添加到缓冲区中
        if curr_pos < len(text):
            buffer.append(text[curr_pos:])

        # 将缓冲区中的文本片段合并为最终的修改后的文本
        text = "".join(buffer)
        return text

    # 将边界框转换为对应的补丁索引标记
    def _convert_bbox_to_patch_index_tokens(
        self, bbox: Union[Tuple[int, int], Tuple[float, float, float, float]]
    ) -> Tuple[str, str]:
        # 如果边界框长度为 2，则表示已经是补丁索引标记，直接使用
        if len(bbox) == 2:
            idx_1, idx_2 = bbox
        # 否则，根据 (归一化的) 坐标计算对应的补丁索引标记
        else:
            # 使用 self.tokenizer 获取 num_patches_per_side
            num_patches_per_side = int(math.sqrt(self.num_patch_index_tokens))
            idx_1, idx_2 = coordinate_to_patch_index(bbox, num_patches_per_side)

        # 构建补丁索引标记字符串并返回
        token_1 = f"<patch_index_{str(idx_1).zfill(4)}>"
        token_2 = f"<patch_index_{str(idx_2).zfill(4)}>"

        return token_1, token_2
# 将边界框转换为一对补丁索引。
def coordinate_to_patch_index(bbox: Tuple[float, float, float, float], num_patches_per_side: int) -> Tuple[int, int]:
    """Convert a bounding box to a pair of patch indices.

    Args:
        bbox (`Tuple[float, float, float, float]`):
            The 4 coordinates of the bounding box, with the format being (x1, y1, x2, y2) specifying the upper-left and
            lower-right corners of the box. It should have x2 > x1 and y2 > y1.
        num_patches_per_side (`int`): the number of patches along each side.

    Returns:
        `Tuple[int, int]`: A pair of patch indices representing the upper-left patch and lower-right patch.

    Raises:
        ValueError: if `x2 > x1` or `y2 > y1` does not hold.
    """
    left, top, right, bottom = bbox

    # Kept as negated `>` comparisons so that anything failing `x2 > x1` / `y2 > y1` (NaN included) is rejected.
    if not right > left or not bottom > top:
        raise ValueError("The coordinates in `bbox` should be `(x1, y1, x2, y2)` with `x2 > x1` and `y2 > y1`.")

    side = num_patches_per_side
    # Upper-left corner rounds down to its column/row; lower-right rounds up after shifting back by one patch.
    first_col, first_row = math.floor(left * side), math.floor(top * side)
    last_col, last_row = math.ceil(right * side - 1), math.ceil(bottom * side - 1)

    # Row-major flattening of (row, column) into a single index.
    return first_row * side + first_col, last_row * side + last_col


# 从 https://github.com/microsoft/unilm/blob/97e4923e97d3ee10b57e97013556e3fd0d207a9b/kosmos-2/demo/decode_string.py#L35C1-L75C38 复制（格式修改）
def patch_index_to_coordinate(ul_idx: int, lr_idx: int, num_patches_per_side: int):
    """
    Given a grid of length `num_patches_per_side` and the indices of the upper-left and lower-right corners of a
    bounding box, returns the normalized coordinates of the bounding box, in the form (x1, y1, x2, y2).

    Args:
        ul_idx (`int`): the index of the grid cell that corresponds to the upper-left corner of the bounding box.
        lr_idx (`int`): the index of the grid cell that corresponds to the lower-right corner of the bounding box.
        num_patches_per_side (`int`): the number of patches along each side.

    Returns:
        `Tuple[float]`: the normalized coordinates of the bounding box, in the form (x1, y1, x2, y2).
    """
    cell_size = 1.0 / num_patches_per_side

    # Row-major index -> (row, column) for both corners.
    ul_y, ul_x = divmod(ul_idx, num_patches_per_side)
    lr_y, lr_x = divmod(lr_idx, num_patches_per_side)

    # Same cell, same column or same row: span from the outer edge of the first cell to the outer edge of the
    # last one.
    if ul_idx == lr_idx or ul_x == lr_x or ul_y == lr_y:
        return (
            ul_x * cell_size,
            ul_y * cell_size,
            lr_x * cell_size + cell_size,
            lr_y * cell_size + cell_size,
        )

    # Otherwise both corners are placed at the centre of their cells.
    half_cell = cell_size / 2
    return (
        ul_x * cell_size + half_cell,
        ul_y * cell_size + half_cell,
        lr_x * cell_size + half_cell,
        lr_y * cell_size + half_cell,
    )
# 从给定的文本中提取带有补丁索引的实体信息
def extract_entities_with_patch_indices(text):
    """Extract entities contained in `text`. The bounding bboxes is given in the form of patch indices.

    This function is only intended to be used within `clean_text_and_extract_entities_with_bboxes` where further
    processing happens, including converting to normalized coordinates and whitespace character cleaning up.

    Args:
        text (`str`): text containing `<object>...</object>` sections, each optionally preceded by a
            `<phrase>...</phrase>` pair.

    Returns:
        A list of `(entity, (start, end), bboxes)` tuples. For an object with a phrase, `entity` is the phrase
        text, the span is the phrase's position in `text`, and `bboxes` holds every `(index_1, index_2)` pair of
        that object. An object without a phrase yields one tuple per pair, named after its patch indices and
        given the empty span at the start of the `<object>` tag.

    Examples:

    ```
    >>> text = "<grounding> An image of<phrase> a snowman</phrase><object><patch_index_0044><patch_index_0863></object> warming himself by<phrase> a fire</phrase><object><patch_index_0005><patch_index_0911></object>."
    >>> entities = extract_entities_with_patch_indices(text)
    >>> entities
    [(' a snowman', (31, 41), [(44, 863)]), (' a fire', (130, 137), [(5, 911)])]
    ```"""
    delimiter = "<|delimiter_of_multi_objects|>"

    # The `|` characters of the delimiter must be escaped: unescaped they are regex alternation, which makes
    # every `<object>` holding more than one box fail to match and silently disappear from the result.
    pattern = r"(?:(<phrase>([^<]+)</phrase>))?<object>((?:<patch_index_\d+><patch_index_\d+><\|delimiter_of_multi_objects\|>)*<patch_index_\d+><patch_index_\d+>)</object>"

    entities_with_patch_indices = []

    for match in re.finditer(pattern, text):
        phrase_tag, phrase, match_content = match.groups()
        if phrase_tag:
            # Span of the text between `<phrase>` and `</phrase>`.
            span = match.span(2)
        else:
            # No phrase: use the empty span at the start of `<object>`.
            phrase = None
            object_start = match.start(0)
            span = (object_start, object_start)

        # Each delimiter-separated chunk holds exactly two `<patch_index_N>` tokens.
        entity_bboxes = []
        for pair in match_content.split(delimiter):
            indices = re.findall(r"<patch_index_(\d+)>", pair)
            if len(indices) >= 2:
                entity_bboxes.append((int(indices[0]), int(indices[1])))

        if phrase:
            entities_with_patch_indices.append((phrase, span, entity_bboxes))
        else:
            for bbox in entity_bboxes:
                # Fake entity name built from the (un-padded) patch indices.
                entity = f"<patch_index_{bbox[0]}><patch_index_{bbox[1]}>"
                entities_with_patch_indices.append((entity, span, [bbox]))

    return entities_with_patch_indices


def adjust_entity_positions(entity, text):
    """Adjust the positions of the entities in `text` to be relative to the text with special fields removed.

    Args:
        entity: an `(entity_name, (start, end))` pair whose offsets refer to `text`.
        text: the original text, still containing `<...>` fields.

    Returns:
        `(entity_name, (adjusted_start, adjusted_end))`, where each offset is the length of the corresponding
        prefix of `text` after every `<...>` field has been removed from it.
    """
    name, (start, end) = entity

    def _length_without_fields(prefix_end):
        # Length of `text[:prefix_end]` once every `<...>` field is stripped out.
        return len(re.sub("<.*?>", "", text[:prefix_end]))

    return name, (_length_without_fields(start), _length_without_fields(end))
# 从文本中清除周围的空格和其中的实体
def _cleanup_spaces(text, entities):
    # 去除文本两侧的空格
    new_text = text.strip()
    # 计算文本开头的空格数量
    leading_spaces = len(text) - len(text.lstrip())

    # 处理实体列表中的每一个实体
    new_entities = []
    for entity_name, (start, end), bboxes in entities:
        # 计算实体名称开头的空格数量
        entity_name_leading_spaces = len(entity_name) - len(entity_name.lstrip())
        # 计算实体名称末尾的空格数量
        entity_name_trailing_spaces = len(entity_name) - len(entity_name.rstrip())

        # 调整实体的起始和结束位置，考虑到文本开头的空格
        start = start - leading_spaces + entity_name_leading_spaces
        end = end - leading_spaces - entity_name_trailing_spaces
        # 去除实体名称两侧的空格
        entity_name = entity_name.strip()

        # 将处理后的实体添加到新的实体列表中
        new_entities.append((entity_name, (start, end), bboxes))

    # 返回处理后的文本和实体列表
    return new_text, new_entities


# 从 https://github.com/microsoft/unilm/blob/97e4923e97d3ee10b57e97013556e3fd0d207a9b/kosmos-2/demo/decode_string.py#L77-L87 处复制并稍作格式修改
def clean_text_and_extract_entities_with_bboxes(text, num_patches_per_side=32):
    """Remove the tag tokens from `text`, extract entities in it with some cleaning up of white characters.

    Args:
        text: generated text containing tag tokens and patch-index tokens.
        num_patches_per_side (`int`, *optional*, defaults to 32):
            Grid side passed to `patch_index_to_coordinate` when converting patch indices to coordinates.

    Returns:
        `(clean_text, entities)`, where each entity is `(name, (start, end), bboxes_in_coords)` with offsets
        into `clean_text`.

    Examples:

    ```
    >>> text = "<grounding> An image of<phrase> a snowman</phrase><object><patch_index_0044><patch_index_0863></object> warming himself by<phrase> a fire</phrase><object><patch_index_0005><patch_index_0911></object>."
    >>> clean_text, entities = clean_text_and_extract_entities_with_bboxes(text)
    >>> clean_text
    'An image of a snowman warming himself by a fire.'

    >>> entities
    [('a snowman', (12, 21), [(0.390625, 0.046875, 0.984375, 0.828125)]), ('a fire', (41, 47), [(0.171875, 0.015625, 0.484375, 0.890625)])]
    ```"""
    entities = []
    for name, span, patch_bboxes in extract_entities_with_patch_indices(text):
        # Re-express the span relative to the text without `<...>` fields.
        adjusted_entity = adjust_entity_positions((name, span), text)
        # Patch-index pairs -> coordinate boxes.
        bboxes_in_coords = [
            patch_index_to_coordinate(ul_idx, lr_idx, num_patches_per_side) for ul_idx, lr_idx in patch_bboxes
        ]
        entities.append(adjusted_entity + (bboxes_in_coords,))

    # Strip every `<...>` field (tag tokens, patch-index tokens, ...), then tidy the whitespace.
    processed_text = re.sub("<.*?>", "", text)
    return _cleanup_spaces(processed_text, entities)

`.\models\kosmos2\__init__.py`

# coding=utf-8
# 文件编码声明，使用 UTF-8 编码格式
# Copyright 2023 Microsoft Research and The HuggingFace Inc. team. All rights reserved.
# 版权声明，版权归 Microsoft Research 和 HuggingFace Inc. 团队所有
#
# Licensed under the Apache License, Version 2.0 (the "License");
# 授权许可，采用 Apache 许可证 2.0 版本
# you may not use this file except in compliance with the License.
# 除非遵守许可证，否则不得使用此文件
# You may obtain a copy of the License at
# 可以在以下网址获取许可证的副本
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# 除非适用法律要求或书面同意，否则软件
# distributed under the License is distributed on an "AS IS" BASIS,
# 依据许可证分发的软件是基于"原样"分发的，
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# 没有任何明示或暗示的保证或条件
# See the License for the specific language governing permissions and
# 请参阅许可证以了解特定语言的权限
# limitations under the License.
# 许可证下的限制

from typing import TYPE_CHECKING
# 引入类型检查模块

from ...utils import (
    OptionalDependencyNotAvailable,
    _LazyModule,
    is_torch_available,
    is_vision_available,
)
# 从相对路径中引入依赖模块和函数

# Maps each submodule name to the public names it provides; handed to `_LazyModule` at the bottom of the file.
_import_structure = {
    "configuration_kosmos2": ["KOSMOS2_PRETRAINED_CONFIG_ARCHIVE_MAP", "Kosmos2Config"],
    "processing_kosmos2": ["Kosmos2Processor"],
}

# The modeling entries are only registered when torch is available; otherwise they are silently left out.
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    _import_structure["modeling_kosmos2"] = [
        "KOSMOS2_PRETRAINED_MODEL_ARCHIVE_LIST",
        "Kosmos2ForConditionalGeneration",
        "Kosmos2Model",
        "Kosmos2PreTrainedModel",
    ]

if TYPE_CHECKING:
    # Under static type checking, import the same names directly instead of going through the lazy module.
    from .configuration_kosmos2 import KOSMOS2_PRETRAINED_CONFIG_ARCHIVE_MAP, Kosmos2Config
    from .processing_kosmos2 import Kosmos2Processor

    # Same torch guard as above, applied to the real imports.
    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .modeling_kosmos2 import (
            KOSMOS2_PRETRAINED_MODEL_ARCHIVE_LIST,
            Kosmos2ForConditionalGeneration,
            Kosmos2Model,
            Kosmos2PreTrainedModel,
        )

else:
    import sys

    # At runtime, replace this module in `sys.modules` with a `_LazyModule` built from `_import_structure`.
    # NOTE(review): presumably this defers the submodule imports until first attribute access; confirm
    # against `_LazyModule`.
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure)

`.\models\layoutlm\configuration_layoutlm.py`

# coding=utf-8
# Copyright 2010, The Microsoft Research Asia LayoutLM Team authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" LayoutLM model configuration"""
# 导入所需的模块
from collections import OrderedDict
from typing import Any, List, Mapping, Optional

# 导入预训练模型配置类和预训练分词器
from ... import PretrainedConfig, PreTrainedTokenizer
# 导入ONNX相关的配置和补丁规范
from ...onnx import OnnxConfig, PatchingSpec
# 导入工具函数：张量类型、是否有torch可用、日志记录
from ...utils import TensorType, is_torch_available, logging

# Module-level logger.
logger = logging.get_logger(__name__)

# Maps each pretrained LayoutLM checkpoint name to the URL of its `config.json`.
LAYOUTLM_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "microsoft/layoutlm-base-uncased": (
        "https://huggingface.co/microsoft/layoutlm-base-uncased/resolve/main/config.json"
    ),
    "microsoft/layoutlm-large-uncased": (
        "https://huggingface.co/microsoft/layoutlm-large-uncased/resolve/main/config.json"
    ),
}

# LayoutLM配置类，继承自PretrainedConfig
class LayoutLMConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`LayoutLMModel`]. It is used to instantiate a
    LayoutLM model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the LayoutLM
    [microsoft/layoutlm-base-uncased](https://huggingface.co/microsoft/layoutlm-base-uncased) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 30522): vocabulary size.
        hidden_size (`int`, *optional*, defaults to 768): hidden layer size.
        num_hidden_layers (`int`, *optional*, defaults to 12): number of hidden layers.
        num_attention_heads (`int`, *optional*, defaults to 12): number of attention heads.
        intermediate_size (`int`, *optional*, defaults to 3072): intermediate layer size.
        hidden_act (`str`, *optional*, defaults to `"gelu"`): hidden layer activation function.
        hidden_dropout_prob (`float`, *optional*, defaults to 0.1): dropout probability of the hidden layers.
        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
            dropout probability of the attention probabilities.
        max_position_embeddings (`int`, *optional*, defaults to 512): maximum position embedding length.
        type_vocab_size (`int`, *optional*, defaults to 2): token type vocabulary size.
        initializer_range (`float`, *optional*, defaults to 0.02): initializer range.
        layer_norm_eps (`float`, *optional*, defaults to 1e-12): layer normalization epsilon.
        pad_token_id (`int`, *optional*, defaults to 0): id of the padding token; forwarded to the parent class.
        position_embedding_type (`str`, *optional*, defaults to `"absolute"`): position embedding type.
        use_cache (`bool`, *optional*, defaults to `True`): whether to use the cache.
        max_2d_position_embeddings (`int`, *optional*, defaults to 1024): maximum 2D position embedding length.
        **kwargs: any further keyword arguments, forwarded to the parent class.

    Examples:

    ```
    >>> from transformers import LayoutLMConfig, LayoutLMModel

    >>> # Initializing a LayoutLM configuration
    >>> configuration = LayoutLMConfig()

    >>> # Initializing a model (with random weights) from the configuration
    >>> model = LayoutLMModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "layoutlm"

    def __init__(
        self,
        vocab_size=30522,
        hidden_size=768,
        num_hidden_layers=12,
        num_attention_heads=12,
        intermediate_size=3072,
        hidden_act="gelu",
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        max_position_embeddings=512,
        type_vocab_size=2,
        initializer_range=0.02,
        layer_norm_eps=1e-12,
        pad_token_id=0,
        position_embedding_type="absolute",
        use_cache=True,
        max_2d_position_embeddings=1024,
        **kwargs,
    ):
        # The parent class receives the padding id and every extra keyword argument.
        super().__init__(pad_token_id=pad_token_id, **kwargs)

        # Sizes.
        self.vocab_size = vocab_size
        self.type_vocab_size = type_vocab_size
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads

        # Activation and regularisation.
        self.hidden_act = hidden_act
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.layer_norm_eps = layer_norm_eps
        self.initializer_range = initializer_range

        # Position embeddings (1D and 2D).
        self.position_embedding_type = position_embedding_type
        self.max_position_embeddings = max_position_embeddings
        self.max_2d_position_embeddings = max_2d_position_embeddings

        self.use_cache = use_cache
# ONNX export configuration for LayoutLM; subclasses OnnxConfig.
class LayoutLMOnnxConfig(OnnxConfig):
    """ONNX export settings for LayoutLM: declares the dynamic input axes and builds dummy inputs with a `bbox`."""

    def __init__(
        self,
        config: PretrainedConfig,
        task: str = "default",
        patching_specs: List[PatchingSpec] = None,
    ):
        """
        Forward all arguments to `OnnxConfig.__init__` and record the largest 2-D position index.

        Args:
            config: Model configuration; `config.max_2d_position_embeddings` is read here.
            task: Export task name, passed through to the parent class.
            patching_specs: Optional patching specs, passed through to the parent class.
        """
        super().__init__(config, task=task, patching_specs=patching_specs)
        # One less than the number of 2-D position embeddings.
        self.max_2d_positions = config.max_2d_position_embeddings - 1

    @property
    def inputs(self) -> Mapping[str, Mapping[int, str]]:
        """Ordered mapping from each model input name to its dynamic axes (axis 0: "batch", axis 1: "sequence")."""
        input_names = ("input_ids", "bbox", "attention_mask", "token_type_ids")
        # A fresh axes dict is built for every input so the entries never share state.
        return OrderedDict((name, {0: "batch", 1: "sequence"}) for name in input_names)

    def generate_dummy_inputs(
        self,
        tokenizer: PreTrainedTokenizer,
        batch_size: int = -1,
        seq_length: int = -1,
        is_pair: bool = False,
        framework: Optional[TensorType] = None,
    ) -> Mapping[str, Any]:
        """
        Generate inputs to provide to the ONNX exporter for the specific framework.

        Args:
            tokenizer: The tokenizer associated with this model configuration.
            batch_size: The batch size (int) to export the model for (-1 means dynamic axis).
            seq_length: The sequence length (int) to export the model for (-1 means dynamic axis).
            is_pair: Indicate if the input is a pair (sentence 1, sentence 2).
            framework: The framework (optional) the tokenizer will generate tensors for.

        Returns:
            Mapping[str, Tensor] holding the kwargs to provide to the model's forward function.

        Raises:
            NotImplementedError: If `framework` is not `TensorType.PYTORCH`.
            ValueError: If PyTorch is not installed.
        """
        # The parent class produces the text inputs; only `bbox` is added here.
        input_dict = super().generate_dummy_inputs(
            tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair, framework=framework
        )

        if framework != TensorType.PYTORCH:
            raise NotImplementedError("Exporting LayoutLM to ONNX is currently only supported for PyTorch.")

        if not is_torch_available():
            raise ValueError("Cannot generate dummy inputs without PyTorch installed.")
        import torch

        # One fixed dummy bounding box, reused for every token of every sample.
        dummy_box = [48, 84, 73, 128]
        num_samples, num_tokens = input_dict["input_ids"].shape
        # (num_tokens, 4) rows of the same box, tiled along a new leading batch dimension.
        input_dict["bbox"] = torch.tensor([dummy_box] * num_tokens).tile(num_samples, 1, 1)
        return input_dict

`.\models\layoutlm\modeling_layoutlm.py`

# coding=utf-8
# 版权归 Microsoft Research Asia LayoutLM Team 作者和 HuggingFace Inc. 团队所有。
#
# 根据 Apache 许可证 2.0 版本（"许可证"）授权;
# 除非符合许可证，否则不得使用此文件。
# 您可以在以下网址获取许可证副本：
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# 除非适用法律要求或书面同意，否则按"原样"分发的软件，
# 没有任何明示或暗示的保证或条件。
# 请查阅许可证获取具体的法律授权和限制。
""" PyTorch LayoutLM 模型。"""


import math
from typing import Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...modeling_outputs import (
    BaseModelOutputWithPastAndCrossAttentions,
    BaseModelOutputWithPoolingAndCrossAttentions,
    MaskedLMOutput,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutput,
    TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
from .configuration_layoutlm import LayoutLMConfig


# Module-level logger obtained from the library's logging helper.
logger = logging.get_logger(__name__)

# Names referenced when generating documentation (config class name and example checkpoint).
_CONFIG_FOR_DOC = "LayoutLMConfig"
_CHECKPOINT_FOR_DOC = "microsoft/layoutlm-base-uncased"

# Identifiers of the pretrained LayoutLM checkpoints listed for this model.
LAYOUTLM_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "layoutlm-base-uncased",
    "layoutlm-large-uncased",
]


# Alias: LayoutLM uses the stock PyTorch LayerNorm under its own name.
LayoutLMLayerNorm = nn.LayerNorm


class LayoutLMEmbeddings(nn.Module):
    """Construct the embeddings from word, position, 2-D layout (bounding box) and token-type embeddings."""

    def __init__(self, config):
        """Create the embedding tables, LayerNorm, dropout and the default position-id buffer from `config`."""
        super().__init__()
        hidden_size = config.hidden_size
        num_2d_positions = config.max_2d_position_embeddings

        # The creation order below fixes the parameter registration order; do not shuffle it.
        self.word_embeddings = nn.Embedding(config.vocab_size, hidden_size, padding_idx=config.pad_token_id)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, hidden_size)
        self.x_position_embeddings = nn.Embedding(num_2d_positions, hidden_size)
        self.y_position_embeddings = nn.Embedding(num_2d_positions, hidden_size)
        self.h_position_embeddings = nn.Embedding(num_2d_positions, hidden_size)
        self.w_position_embeddings = nn.Embedding(num_2d_positions, hidden_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, hidden_size)

        self.LayerNorm = LayoutLMLayerNorm(hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        # Non-persistent buffer with the default position ids 0..max_position_embeddings-1, shape (1, max_len).
        default_positions = torch.arange(config.max_position_embeddings).expand((1, -1))
        self.register_buffer("position_ids", default_positions, persistent=False)

    def forward(
        self,
        input_ids=None,
        bbox=None,
        token_type_ids=None,
        position_ids=None,
        inputs_embeds=None,
    ):
        """
        Embed the inputs and return the normalized, dropped-out sum of all embedding terms.

        Args:
            input_ids: Token ids; used for the word lookup when `inputs_embeds` is not given.
            bbox: Box coordinates indexed as `bbox[:, :, k]` for k in 0..3 (x0, y0, x1, y1).
            token_type_ids: Token-type ids; defaults to zeros of the input shape.
            position_ids: Position ids; defaults to the first `seq_length` entries of the buffer.
            inputs_embeds: Pre-computed word embeddings that replace the `input_ids` lookup.

        Returns:
            The embeddings after LayerNorm and dropout.

        Raises:
            IndexError: If one of the four corner lookups fails (e.g. a coordinate outside the table).
        """
        # Shape and device come from `input_ids` when present, otherwise from `inputs_embeds`.
        if input_ids is None:
            input_shape = inputs_embeds.size()[:-1]
            device = inputs_embeds.device
        else:
            input_shape = input_ids.size()
            device = input_ids.device
        seq_length = input_shape[1]

        if position_ids is None:
            position_ids = self.position_ids[:, :seq_length]
        if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)

        position_term = self.position_embeddings(position_ids)

        # Only the four corner lookups are guarded; an IndexError here is re-raised with a clearer message.
        try:
            x0 = bbox[:, :, 0]
            left_term = self.x_position_embeddings(x0)
            y0 = bbox[:, :, 1]
            upper_term = self.y_position_embeddings(y0)
            x1 = bbox[:, :, 2]
            right_term = self.x_position_embeddings(x1)
            y1 = bbox[:, :, 3]
            lower_term = self.y_position_embeddings(y1)
        except IndexError as e:
            raise IndexError("The `bbox` coordinate values should be within 0-1000 range.") from e

        # Box height and width are embedded with their own tables.
        height_term = self.h_position_embeddings(y1 - y0)
        width_term = self.w_position_embeddings(x1 - x0)
        token_type_term = self.token_type_embeddings(token_type_ids)

        # Accumulate left-to-right, starting from the word embeddings, to keep the summation order fixed.
        embeddings = inputs_embeds
        for term in (
            position_term,
            left_term,
            upper_term,
            right_term,
            lower_term,
            height_term,
            width_term,
            token_type_term,
        ):
            embeddings = embeddings + term

        return self.dropout(self.LayerNorm(embeddings))
# Copied from transformers.models.bert.modeling_bert.BertSelfAttention with Bert->LayoutLM
class LayoutLMSelfAttention(nn.Module):
    """Multi-head scaled dot-product attention (self- or cross-attention) with optional relative position scores."""

    def __init__(self, config, position_embedding_type=None):
        """
        Build the query/key/value projections, the attention dropout and (optionally) the distance embedding.

        Args:
            config: Configuration providing `hidden_size`, `num_attention_heads`,
                `attention_probs_dropout_prob`, `max_position_embeddings`, `is_decoder` and, optionally,
                `position_embedding_type`.
            position_embedding_type: Overrides `config.position_embedding_type` when given.

        Raises:
            ValueError: If `hidden_size` is not a multiple of `num_attention_heads` and the config has no
                `embedding_size` attribute.
        """
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
                f"heads ({config.num_attention_heads})"
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

        # Explicit argument wins, then the config value, then "absolute".
        self.position_embedding_type = position_embedding_type or getattr(
            config, "position_embedding_type", "absolute"
        )
        # Relative variants need an embedding for every signed distance in [-(max-1), max-1].
        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
            self.max_position_embeddings = config.max_position_embeddings
            self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)

        self.is_decoder = config.is_decoder

    def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
        """Split the last dimension into (heads, head_size) and move the head axis to position 1."""
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        """
        Compute attention over `hidden_states` (or over `encoder_hidden_states` for cross-attention).

        The previous revision of this method was a `pass` stub that returned `None`, which made the
        enclosing attention module fail when indexing the result. The body follows the BERT
        self-attention implementation this class is copied from.

        Args:
            hidden_states: Query-side input; projected with `self.query`.
            attention_mask: Additive mask added to the raw scores (replaced by `encoder_attention_mask`
                when cross-attending).
            head_mask: Multiplicative mask applied to the attention probabilities.
            encoder_hidden_states: When given, keys/values are computed from it (cross-attention).
            encoder_attention_mask: Additive mask used for cross-attention.
            past_key_value: Cached `(key, value)` pair; reused for cross-attention, concatenated along
                the sequence axis for self-attention.
            output_attentions: Whether to also return the attention probabilities.

        Returns:
            `(context,)`, plus the attention probabilities if `output_attentions`, plus the
            `(key, value)` cache as last element if `self.is_decoder`.
        """
        mixed_query_layer = self.query(hidden_states)

        # If this is instantiated as a cross-attention module, the keys and values come from an encoder;
        # the attention mask needs to be such that the encoder's padding tokens are not attended to.
        is_cross_attention = encoder_hidden_states is not None

        if is_cross_attention and past_key_value is not None:
            # Reuse the cached cross-attention keys/values.
            key_layer = past_key_value[0]
            value_layer = past_key_value[1]
            attention_mask = encoder_attention_mask
        elif is_cross_attention:
            key_layer = self.transpose_for_scores(self.key(encoder_hidden_states))
            value_layer = self.transpose_for_scores(self.value(encoder_hidden_states))
            attention_mask = encoder_attention_mask
        elif past_key_value is not None:
            # Append the new keys/values to the cached ones along the sequence axis.
            key_layer = self.transpose_for_scores(self.key(hidden_states))
            value_layer = self.transpose_for_scores(self.value(hidden_states))
            key_layer = torch.cat([past_key_value[0], key_layer], dim=2)
            value_layer = torch.cat([past_key_value[1], value_layer], dim=2)
        else:
            key_layer = self.transpose_for_scores(self.key(hidden_states))
            value_layer = self.transpose_for_scores(self.value(hidden_states))

        query_layer = self.transpose_for_scores(mixed_query_layer)

        # Must be read before `past_key_value` is overwritten with the new cache below.
        use_cache = past_key_value is not None
        if self.is_decoder:
            past_key_value = (key_layer, value_layer)

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))

        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
            query_length, key_length = query_layer.shape[2], key_layer.shape[2]
            if use_cache:
                # With a cache, the single new query sits at the last key position.
                position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=hidden_states.device).view(
                    -1, 1
                )
            else:
                position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1)
            position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1)
            distance = position_ids_l - position_ids_r

            # Shift distances so that they index into the (2 * max - 1)-row table.
            positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)
            positional_embedding = positional_embedding.to(dtype=query_layer.dtype)  # fp16 compatibility

            if self.position_embedding_type == "relative_key":
                relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
                attention_scores = attention_scores + relative_position_scores
            elif self.position_embedding_type == "relative_key_query":
                relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
                relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
                attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key

        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
        if attention_mask is not None:
            # The mask is additive (large negative values at masked positions).
            attention_scores = attention_scores + attention_mask

        # Normalize the attention scores to probabilities.
        attention_probs = nn.functional.softmax(attention_scores, dim=-1)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs = self.dropout(attention_probs)

        # Mask heads if we want to.
        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        context_layer = torch.matmul(attention_probs, value_layer)

        # Merge the head axis back into the hidden dimension.
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(new_context_layer_shape)

        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)

        if self.is_decoder:
            outputs = outputs + (past_key_value,)
        return outputs


# Copied from transformers.models.bert.modeling_bert.BertSelfOutput with Bert->LayoutLM
class LayoutLMSelfOutput(nn.Module):
    """Post-attention block: linear projection, dropout, then LayerNorm over the residual sum."""

    def __init__(self, config):
        """Create the hidden-to-hidden projection, the LayerNorm and the dropout from `config`."""
        super().__init__()
        width = config.hidden_size
        self.dense = nn.Linear(width, width)
        self.LayerNorm = nn.LayerNorm(width, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        """
        Project `hidden_states`, apply dropout, add `input_tensor` and normalize.

        Args:
            hidden_states: Tensor fed through the dense layer and dropout.
            input_tensor: Residual added before LayerNorm.

        Returns:
            The normalized residual sum.
        """
        projected = self.dropout(self.dense(hidden_states))
        # Residual connection first, normalization second.
        return self.LayerNorm(projected + input_tensor)
# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->LayoutLM
class LayoutLMAttention(nn.Module):
    """Attention sub-layer: a `LayoutLMSelfAttention` followed by a `LayoutLMSelfOutput`, with head pruning."""

    def __init__(self, config, position_embedding_type=None):
        """Create the attention and output sub-modules and start with an empty set of pruned heads."""
        super().__init__()
        self.self = LayoutLMSelfAttention(config, position_embedding_type=position_embedding_type)
        self.output = LayoutLMSelfOutput(config)
        self.pruned_heads = set()

    def prune_heads(self, heads):
        """
        Remove the given attention heads from the projections and update the bookkeeping.

        Args:
            heads: Collection of head indices to prune; an empty collection is a no-op.
        """
        if not len(heads):
            return

        attn = self.self
        # Resolve which heads are still prunable and which weight rows/columns they occupy.
        heads, index = find_pruneable_heads_and_indices(
            heads, attn.num_attention_heads, attn.attention_head_size, self.pruned_heads
        )

        # Shrink the q/k/v projections, and the output projection along dim=1.
        attn.query = prune_linear_layer(attn.query, index)
        attn.key = prune_linear_layer(attn.key, index)
        attn.value = prune_linear_layer(attn.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Keep the head count, the total head size and the pruned-head record in sync.
        attn.num_attention_heads -= len(heads)
        attn.all_head_size = attn.attention_head_size * attn.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        """
        Run the attention module, then the output block with `hidden_states` as residual.

        Returns:
            A tuple whose first element is the output-block result, followed by every extra element
            returned by the attention module (everything after its first element).
        """
        attn_results = self.self(
            hidden_states,
            attention_mask,
            head_mask,
            encoder_hidden_states,
            encoder_attention_mask,
            past_key_value,
            output_attentions,
        )
        merged = self.output(attn_results[0], hidden_states)
        # Pass through whatever else the attention module returned.
        return (merged,) + attn_results[1:]


# Copied from transformers.models.bert.modeling_bert.BertIntermediate
class LayoutLMIntermediate(nn.Module):
    """Feed-forward expansion: linear layer from hidden to intermediate size followed by the activation."""

    def __init__(self, config):
        """Create the expansion layer and resolve the activation from `config.hidden_act`."""
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        activation = config.hidden_act
        # A string is looked up in ACT2FN; anything else is used as the activation directly.
        self.intermediate_act_fn = ACT2FN[activation] if isinstance(activation, str) else activation

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Return `activation(dense(hidden_states))`."""
        return self.intermediate_act_fn(self.dense(hidden_states))


# Copied from transformers.models.bert.modeling_bert.BertOutput with Bert->LayoutLM
class LayoutLMOutput(nn.Module):
    """Feed-forward contraction: project back to hidden size, dropout, then LayerNorm over the residual sum."""

    def __init__(self, config):
        """Create the intermediate-to-hidden projection, the LayerNorm and the dropout from `config`."""
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        """
        Project `hidden_states`, apply dropout, add `input_tensor` and normalize.

        Args:
            hidden_states: Intermediate-size tensor fed through the dense layer and dropout.
            input_tensor: Residual added before LayerNorm.

        Returns:
            The normalized residual sum.
        """
        contracted = self.dropout(self.dense(hidden_states))
        return self.LayerNorm(contracted + input_tensor)
# Copied from transformers.models.bert.modeling_bert.BertLayer with Bert->LayoutLM
class LayoutLMLayer(nn.Module):
    """One transformer layer: self-attention, optional cross-attention, then the chunked feed-forward block."""

    def __init__(self, config):
        """
        Build the attention, optional cross-attention and feed-forward sub-modules.

        Raises:
            ValueError: If `config.add_cross_attention` is set while `config.is_decoder` is not.
        """
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        # Dimension along which the feed-forward block is chunked.
        self.seq_len_dim = 1
        self.attention = LayoutLMAttention(config)
        self.is_decoder = config.is_decoder
        self.add_cross_attention = config.add_cross_attention
        if self.add_cross_attention:
            if not self.is_decoder:
                raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
            # Cross-attention always uses absolute position embeddings.
            self.crossattention = LayoutLMAttention(config, position_embedding_type="absolute")
        self.intermediate = LayoutLMIntermediate(config)
        self.output = LayoutLMOutput(config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        """
        Run the layer.

        Returns:
            A tuple starting with the layer output, followed by the extra outputs of the attention
            modules, and — when `self.is_decoder` — ending with the present key/value cache.

        Raises:
            ValueError: If `encoder_hidden_states` is given to a decoder layer that has no
                `crossattention` module.
        """
        has_cache = past_key_value is not None

        # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
        self_attn_results = self.attention(
            hidden_states,
            attention_mask,
            head_mask,
            output_attentions=output_attentions,
            past_key_value=past_key_value[:2] if has_cache else None,
        )
        attention_output = self_attn_results[0]

        if self.is_decoder:
            # if decoder, the last output is tuple of self-attn cache
            extras = self_attn_results[1:-1]
            present_key_value = self_attn_results[-1]
        else:
            # add self attentions if we output attention weights
            extras = self_attn_results[1:]

        if self.is_decoder and encoder_hidden_states is not None:
            if not hasattr(self, "crossattention"):
                raise ValueError(
                    f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers"
                    " by setting `config.add_cross_attention=True`"
                )

            # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple
            cross_attn_results = self.crossattention(
                attention_output,
                attention_mask,
                head_mask,
                encoder_hidden_states,
                encoder_attention_mask,
                past_key_value[-2:] if has_cache else None,
                output_attentions,
            )
            attention_output = cross_attn_results[0]
            # add cross attentions if we output attention weights
            extras = extras + cross_attn_results[1:-1]
            # add cross-attn cache to positions 3,4 of present_key_value tuple
            present_key_value = present_key_value + cross_attn_results[-1]

        # Apply the feed-forward block through the chunking helper.
        layer_output = apply_chunking_to_forward(
            self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
        )
        result = (layer_output,) + extras

        # if decoder, return the attn key/values as the last output
        if self.is_decoder:
            result = result + (present_key_value,)

        return result

    def feed_forward_chunk(self, attention_output):
        """Apply the intermediate and output blocks to `attention_output`, which also serves as the residual."""
        expanded = self.intermediate(attention_output)
        return self.output(expanded, attention_output)
# Copied from transformers.models.bert.modeling_bert.BertEncoder with Bert->LayoutLM
class LayoutLMEncoder(nn.Module):
    """Stack of `config.num_hidden_layers` `LayoutLMLayer` modules applied in sequence."""

    def __init__(self, config):
        """Store `config`, build the layer stack and disable gradient checkpointing by default."""
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList([LayoutLMLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = False,
        output_hidden_states: Optional[bool] = False,
        return_dict: Optional[bool] = True,
    ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
        """
        Run every layer in order and collect the requested outputs.

        Returns:
            A `BaseModelOutputWithPastAndCrossAttentions` when `return_dict` is truthy; otherwise a
            tuple of the non-`None` values among (last hidden state, cache, all hidden states, self
            attentions, cross attentions), in that order.
        """
        checkpointing = self.gradient_checkpointing and self.training
        with_cross_attention = self.config.add_cross_attention

        # Collectors are tuples when the output was requested and None otherwise.
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None
        all_cross_attentions = () if output_attentions and with_cross_attention else None

        # Caching cannot be combined with gradient checkpointing: warn once and turn it off.
        if checkpointing and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
            )
            use_cache = False

        next_decoder_cache = () if use_cache else None
        for i, layer_module in enumerate(self.layer):
            # Record the input of each layer.
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_args = (
                hidden_states,
                attention_mask,
                head_mask[i] if head_mask is not None else None,
                encoder_hidden_states,
                encoder_attention_mask,
                past_key_values[i] if past_key_values is not None else None,
                output_attentions,
            )
            if checkpointing:
                layer_outputs = self._gradient_checkpointing_func(layer_module.__call__, *layer_args)
            else:
                layer_outputs = layer_module(*layer_args)

            hidden_states = layer_outputs[0]
            if use_cache:
                # The cache is the last element of the layer output.
                next_decoder_cache += (layer_outputs[-1],)
            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)
                if with_cross_attention:
                    all_cross_attentions = all_cross_attentions + (layer_outputs[2],)

        # Record the output of the last layer as well.
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            candidates = [
                hidden_states,
                next_decoder_cache,
                all_hidden_states,
                all_self_attentions,
                all_cross_attentions,
            ]
            return tuple(item for item in candidates if item is not None)

        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=next_decoder_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
            cross_attentions=all_cross_attentions,
        )
# Copied from transformers.models.bert.modeling_bert.BertPooler
class LayoutLMPooler(nn.Module):
    """Pool a sequence by passing the hidden state at index 0 of dim 1 through a linear layer and tanh."""

    def __init__(self, config):
        """Create the hidden-to-hidden projection and the tanh activation."""
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Return `tanh(dense(hidden_states[:, 0]))`."""
        # "Pooling" here means simply taking the first position along dim 1.
        first_position = hidden_states[:, 0]
        return self.activation(self.dense(first_position))


# Copied from transformers.models.bert.modeling_bert.BertPredictionHeadTransform with Bert->LayoutLM
class LayoutLMPredictionHeadTransform(nn.Module):
    """Transform applied before the prediction decoder: linear layer, activation, then LayerNorm."""

    def __init__(self, config):
        """Create the projection, resolve the activation from `config.hidden_act` and create the LayerNorm."""
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        activation = config.hidden_act
        # A string is looked up in ACT2FN; anything else is used as the activation directly.
        self.transform_act_fn = ACT2FN[activation] if isinstance(activation, str) else activation
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Return `LayerNorm(activation(dense(hidden_states)))`."""
        activated = self.transform_act_fn(self.dense(hidden_states))
        return self.LayerNorm(activated)


# Copied from transformers.models.bert.modeling_bert.BertLMPredictionHead with Bert->LayoutLM
class LayoutLMLMPredictionHead(nn.Module):
    """Language-modeling head: the prediction transform followed by a hidden-to-vocabulary decoder."""

    def __init__(self, config):
        """Create the transform, the bias-free decoder and a separate bias parameter linked to the decoder."""
        super().__init__()
        self.transform = LayoutLMPredictionHeadTransform(config)

        # The decoder is created without its own bias; a standalone per-token bias is attached below.
        # NOTE(review): upstream describes the decoder weights as shared with the input embeddings —
        # that tying is not performed in this class; confirm where it happens.
        self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        self.bias = nn.Parameter(torch.zeros(config.vocab_size))

        # Need a link between the two variables so that the bias is correctly resized with
        # `resize_token_embeddings`.
        self.decoder.bias = self.bias

    def forward(self, hidden_states):
        """Return `decoder(transform(hidden_states))`."""
        return self.decoder(self.transform(hidden_states))


# Copied from transformers.models.bert.modeling_bert.BertOnlyMLMHead with Bert->LayoutLM
class LayoutLMOnlyMLMHead(nn.Module):
    """Thin wrapper exposing a `LayoutLMLMPredictionHead` under the attribute name `predictions`."""

    def __init__(self, config):
        """Create the wrapped prediction head from `config`."""
        super().__init__()
        self.predictions = LayoutLMLMPredictionHead(config)

    def forward(self, sequence_output: torch.Tensor) -> torch.Tensor:
        """Return the prediction scores computed by the wrapped head for `sequence_output`."""
        return self.predictions(sequence_output)


class LayoutLMPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    # Configuration class used by this model family.
    config_class = LayoutLMConfig
    # Identifiers of the known pretrained checkpoints.
    pretrained_model_archive_map = LAYOUTLM_PRETRAINED_MODEL_ARCHIVE_LIST
    # Prefix string identifying the base model.
    base_model_prefix = "layoutlm"
    # Gradient checkpointing is supported by this model family.
    supports_gradient_checkpointing = True

    def _init_weights(self, module):
        """
        Initialize the weights of a single sub-module in place.

        Linear and embedding weights are drawn from a normal distribution with mean 0 and standard
        deviation `config.initializer_range`; linear biases and the embedding row at `padding_idx`
        are zeroed; LayerNorm is reset to weight 1 and bias 0. Other module types are left untouched.

        Args:
            module: The sub-module to initialize.
        """
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            # Zero the padding row (if any) after the random init.
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, LayoutLMLayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)


# Shared class-level docstring prepended to the LayoutLM model classes.
LAYOUTLM_START_DOCSTRING = r"""
    The LayoutLM model was proposed in [LayoutLM: Pre-training of Text and Layout for Document Image
    Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei and
    Ming Zhou.

    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
    it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`LayoutLMConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

# Shared forward-input docstring. `{0}` is filled in through `str.format` with the leading input
# shape (e.g. "batch_size, sequence_length"), so the text must not contain any other braces.
LAYOUTLM_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        bbox (`torch.LongTensor` of shape `({0}, 4)`, *optional*):
            Bounding boxes of each input sequence tokens. Selected in the range `[0,
            config.max_2d_position_embeddings-1]`. Each bounding box should be a normalized version in (x0, y0, x1, y1)
            format, where (x0, y0) corresponds to the position of the upper left corner in the bounding box, and (x1,
            y1) represents the position of the lower right corner. See [Overview](#Overview) for normalization.
        attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: `1` for
            tokens that are NOT MASKED, `0` for MASKED tokens.

            [What are attention masks?](../glossary#attention-mask)
        token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`: `0` corresponds to a *sentence A* token, `1` corresponds to a *sentence B* token

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: `1`
            indicates the head is **not masked**, `0` indicates the head is **masked**.
        inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
            model's internal embedding lookup matrix.
        output_attentions (`bool`, *optional*):
            If set to `True`, the attentions tensors of all attention layers are returned. See `attentions` under
            returned tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            If set to `True`, the hidden states of all layers are returned. See `hidden_states` under returned tensors
            for more detail.
        return_dict (`bool`, *optional*):
            If set to `True`, the model will return a [`~utils.ModelOutput`] instead of a plain tuple.
"""

@add_start_docstrings(
    "The bare LayoutLM Model transformer outputting raw hidden-states without any specific head on top.",
    LAYOUTLM_START_DOCSTRING,
)
class LayoutLMModel(LayoutLMPreTrainedModel):
    def __init__(self, config):
        """
        Build the embeddings, encoder and pooler sub-modules from `config`.

        Args:
            config: Model configuration object; stored on the instance as `self.config`.
        """
        super().__init__(config)
        self.config = config

        self.embeddings = LayoutLMEmbeddings(config)
        self.encoder = LayoutLMEncoder(config)
        self.pooler = LayoutLMPooler(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """
        Returns:
            The `word_embeddings` module held by `self.embeddings`.
        """
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        """
        Args:
            value: The module that replaces `self.embeddings.word_embeddings`.
        """
        self.embeddings.word_embeddings = value

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model.

        Args:
            heads_to_prune (dict): Dictionary of {layer_num: list of heads to prune in this layer}.
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @add_start_docstrings_to_model_forward(LAYOUTLM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @replace_return_docstrings(output_type=BaseModelOutputWithPoolingAndCrossAttentions, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        bbox: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        """
        Embed the inputs, run the encoder and pool the resulting sequence.

        The previous body delegated to `super().forward(...)`, but no parent class implements
        `forward`, so the sub-modules created in `__init__` were never used. The computation
        is performed here instead. With `return_dict` falsy a tuple
        `(sequence_output, pooled_output, *extra encoder outputs)` is returned, otherwise a
        `BaseModelOutputWithPoolingAndCrossAttentions`.

        Args:
            input_ids (torch.LongTensor, optional): The input IDs of the tokens.
            bbox (torch.LongTensor, optional): The bounding boxes of each token in the input.
                All-zero boxes are used when omitted.
            attention_mask (torch.FloatTensor, optional): The attention mask for the input.
                All positions are attended to when omitted.
            token_type_ids (torch.LongTensor, optional): The token type IDs for the input.
                All zeros when omitted.
            position_ids (torch.LongTensor, optional): The position IDs for positional embeddings.
            head_mask (torch.FloatTensor, optional): The mask for heads in the multi-head attention mechanism,
                either 1-D (one value per head) or 2-D (one row per layer).
            inputs_embeds (torch.FloatTensor, optional): The embedded input sequences; mutually exclusive
                with `input_ids`.
            encoder_hidden_states (torch.FloatTensor, optional): Accepted for interface compatibility;
                not used by this method.
            encoder_attention_mask (torch.FloatTensor, optional): Accepted for interface compatibility;
                not used by this method.
            output_attentions (bool, optional): Whether to output attentions.
            output_hidden_states (bool, optional): Whether to output hidden states.
            return_dict (bool, optional): Whether to return a dictionary-like output object.

        Raises:
            ValueError: If both or neither of `input_ids` and `inputs_embeds` are given.

        Returns:
        """
        # Fall back to the configuration for every flag the caller left unset.
        if output_attentions is None:
            output_attentions = self.config.output_attentions
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states
        if return_dict is None:
            return_dict = self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            input_shape = input_ids.size()
            device = input_ids.device
        elif inputs_embeds is not None:
            # Drop the trailing hidden dimension to recover the token grid shape.
            input_shape = inputs_embeds.size()[:-1]
            device = inputs_embeds.device
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if attention_mask is None:
            attention_mask = torch.ones(input_shape, device=device)
        if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
        if bbox is None:
            bbox = torch.zeros(input_shape + (4,), dtype=torch.long, device=device)

        # Turn the 0/1 mask into an additive mask that broadcasts over heads and query
        # positions: 0 where attention is allowed, the most negative value elsewhere.
        model_dtype = next(self.parameters()).dtype
        extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2).to(dtype=model_dtype)
        extended_attention_mask = (1.0 - extended_attention_mask) * torch.finfo(model_dtype).min

        if head_mask is not None:
            if head_mask.dim() == 1:
                # One value per head: broadcast across layers, batch and both sequence axes.
                head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
                head_mask = head_mask.expand(self.config.num_hidden_layers, -1, -1, -1, -1)
            elif head_mask.dim() == 2:
                # One row per layer: add batch and sequence axes.
                head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)
            head_mask = head_mask.to(dtype=model_dtype)
        else:
            head_mask = [None] * self.config.num_hidden_layers

        # NOTE(review): LayoutLMEmbeddings / LayoutLMEncoder are defined outside this block;
        # the keyword names below follow the upstream Transformers implementation — confirm.
        embedding_output = self.embeddings(
            input_ids=input_ids,
            bbox=bbox,
            position_ids=position_ids,
            token_type_ids=token_type_ids,
            inputs_embeds=inputs_embeds,
        )
        encoder_outputs = self.encoder(
            embedding_output,
            extended_attention_mask,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = encoder_outputs[0]
        pooled_output = self.pooler(sequence_output)

        if not return_dict:
            return (sequence_output, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPoolingAndCrossAttentions(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
            cross_attentions=encoder_outputs.cross_attentions,
        )
# Defines LayoutLMForMaskedLM, a subclass of LayoutLMPreTrainedModel
class LayoutLMForMaskedLM(LayoutLMPreTrainedModel):
    # Keys of the weights that need to be shared (tied)
    _tied_weights_keys = ["cls.predictions.decoder.bias", "cls.predictions.decoder.weight"]

    # Constructor; takes a config object
    def __init__(self, config):
        # Run the parent-class constructor
        super().__init__(config)

        # Create the LayoutLMModel backbone
        self.layoutlm = LayoutLMModel(config)
        # Create the LayoutLMOnlyMLMHead head
        self.cls = LayoutLMOnlyMLMHead(config)

        # Call post_init (not defined in this block)
        self.post_init()

    # Returns layoutlm.embeddings.word_embeddings
    def get_input_embeddings(self):
        return self.layoutlm.embeddings.word_embeddings

    # Returns cls.predictions.decoder
    def get_output_embeddings(self):
        return self.cls.predictions.decoder

    # Replaces cls.predictions.decoder with new_embeddings
    def set_output_embeddings(self, new_embeddings):
        self.cls.predictions.decoder = new_embeddings

    # NOTE(review): the original comments here mention a decorator that adds the forward
    # docstring and one that replaces the return docstring, but neither is applied in this
    # excerpt. The `forward` signature below is also never closed and has no body; restore
    # both from the original module before running this code.
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        bbox: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        # (placeholder left by the annotation: the forward docstring would go here)
        # `return_dict` is the boolean that selects a dictionary-style output


# Defines LayoutLMForSequenceClassification, a subclass of LayoutLMPreTrainedModel
class LayoutLMForSequenceClassification(LayoutLMPreTrainedModel):
    # Constructor; takes a config object
    def __init__(self, config):
        # Run the parent-class constructor
        super().__init__(config)
        # Number of classes, read from config.num_labels
        self.num_labels = config.num_labels
        # Create the LayoutLMModel backbone
        self.layoutlm = LayoutLMModel(config)
        # Dropout layer using config.hidden_dropout_prob
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # Linear layer mapping hidden_size features to num_labels outputs
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Call post_init (not defined in this block)
        self.post_init()

    # Returns layoutlm.embeddings.word_embeddings
    def get_input_embeddings(self):
        return self.layoutlm.embeddings.word_embeddings

    # NOTE(review): the original comments here mention two docstring decorators that are not
    # applied in this excerpt. `forward` appears twice below, with different parameter lists;
    # neither signature is closed and neither has a body. Restore the method from the
    # original module before running this code.
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        bbox: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        # (placeholder left by the annotation: the forward docstring would go here)
    # Second, shorter `forward` signature (no encoder_* parameters); also left unclosed
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        bbox: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
@add_start_docstrings(
    """
    LayoutLM Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
    sequence labeling (information extraction) tasks such as the [FUNSD](https://guillaumejaume.github.io/FUNSD/)
    dataset and the [SROIE](https://rrc.cvc.uab.es/?ch=13) dataset.
    """,
    LAYOUTLM_START_DOCSTRING,
)
class LayoutLMForTokenClassification(LayoutLMPreTrainedModel):
    """
    LayoutLM backbone followed by dropout and a linear layer that scores every token against
    `config.num_labels` classes. Inherits from LayoutLMPreTrainedModel.
    """

    def __init__(self, config):
        """
        Create the backbone, the dropout layer and the per-token classifier.

        Args:
            config (LayoutLMConfig): Configuration providing `num_labels`, `hidden_dropout_prob`
                and `hidden_size`.
        """
        super().__init__(config)
        label_count = config.num_labels
        self.num_labels = label_count
        self.layoutlm = LayoutLMModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, label_count)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """
        Return the backbone's `embeddings.word_embeddings` module.
        """
        backbone_embeddings = self.layoutlm.embeddings
        return backbone_embeddings.word_embeddings

    @add_start_docstrings_to_model_forward(LAYOUTLM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @replace_return_docstrings(output_type=TokenClassifierOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        bbox: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        """
        Run the backbone, apply dropout and the classifier, and optionally compute the loss.

        Args:
            input_ids (torch.LongTensor, optional): Input token IDs.
            bbox (torch.LongTensor, optional): Bounding boxes of the tokens.
            attention_mask (torch.FloatTensor, optional): Attention mask.
            token_type_ids (torch.LongTensor, optional): Token type IDs.
            position_ids (torch.LongTensor, optional): Position IDs.
            head_mask (torch.FloatTensor, optional): Attention head mask.
            inputs_embeds (torch.FloatTensor, optional): Pre-computed input embeddings.
            labels (torch.LongTensor, optional): Per-token labels; when given, a cross-entropy
                loss over the flattened logits and labels is computed.
            output_attentions (bool, optional): Whether to output attention weights.
            output_hidden_states (bool, optional): Whether to output hidden states.
            return_dict (bool, optional): Whether to return an output object instead of a tuple;
                defaults to `config.use_return_dict` when left as None.

        Returns:
            TokenClassifierOutput: when `return_dict` is truthy; otherwise a tuple starting with
            the loss (if computed) followed by the logits and the backbone outputs from index 2 on.
        """
        # An explicit argument wins; otherwise use the configured default.
        if return_dict is None:
            return_dict = self.config.use_return_dict

        backbone_outputs = self.layoutlm(
            input_ids=input_ids,
            bbox=bbox,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # The first backbone output is the per-token sequence output.
        token_states = self.dropout(backbone_outputs[0])
        logits = self.classifier(token_states)

        loss = None
        if labels is not None:
            # Flatten so that every token is one classification example.
            flat_logits = logits.view(-1, self.num_labels)
            flat_labels = labels.view(-1)
            criterion = CrossEntropyLoss()
            loss = criterion(flat_logits, flat_labels)

        if return_dict:
            return TokenClassifierOutput(
                loss=loss,
                logits=logits,
                hidden_states=backbone_outputs.hidden_states,
                attentions=backbone_outputs.attentions,
            )

        # Tuple form: logits plus whatever the backbone returned after its first two items.
        tuple_output = (logits,) + backbone_outputs[2:]
        if loss is None:
            return tuple_output
        return (loss,) + tuple_output
@add_start_docstrings(
    """
    LayoutLM Model with a span classification head on top for extractive question-answering tasks such as
    [DocVQA](https://rrc.cvc.uab.es/?ch=17) (a linear layer on top of the final hidden-states output to compute `span
    start logits` and `span end logits`).
    """,
    LAYOUTLM_START_DOCSTRING,
)
class LayoutLMForQuestionAnswering(LayoutLMPreTrainedModel):
    """
    LayoutLMForQuestionAnswering extends LayoutLMPreTrainedModel for question answering tasks.
    It includes a span classification head on top for tasks like DocVQA.
    """

    def __init__(self, config, has_visual_segment_embedding=True):
        """
        Initializes LayoutLMForQuestionAnswering model with the provided configuration.

        Args:
            config: The configuration object that defines the model architecture.
            has_visual_segment_embedding: Accepted for interface compatibility; this constructor
                does not read it. Default is True.
        """
        super().__init__(config)
        self.num_labels = config.num_labels

        # Initialize LayoutLM model backbone
        self.layoutlm = LayoutLMModel(config)

        # Linear layer for question answering outputs
        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """
        Returns the `word_embeddings` module of the LayoutLM backbone's embeddings.
        """
        return self.layoutlm.embeddings.word_embeddings

    @replace_return_docstrings(output_type=QuestionAnsweringModelOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        bbox: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        start_positions: Optional[torch.LongTensor] = None,
        end_positions: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        """
        Compute span start/end logits for every token and, when both target positions are
        given, the averaged start/end cross-entropy loss.

        The signature was previously left unclosed with no body; the computation follows the
        upstream Transformers implementation. NOTE(review): this assumes `config.num_labels`
        is 2, so that the linear head yields one start and one end score per token — confirm.

        Args:
            input_ids, bbox, attention_mask, token_type_ids, position_ids, head_mask,
            inputs_embeds, output_attentions, output_hidden_states:
                Forwarded unchanged to the LayoutLM backbone.
            start_positions (torch.LongTensor, optional): Index of the first token of the labelled span.
                Positions outside the sequence are clamped and ignored by the loss.
            end_positions (torch.LongTensor, optional): Index of the last token of the labelled span.
                Positions outside the sequence are clamped and ignored by the loss.
            return_dict (bool, optional): Whether to return an output object instead of a tuple;
                defaults to `config.use_return_dict` when left as None.

        Returns:
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.layoutlm(
            input_ids=input_ids,
            bbox=bbox,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        # Split the last dimension of the head output into separate start and end scores.
        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        total_loss = None
        if start_positions is not None and end_positions is not None:
            # Position tensors may carry an extra trailing dimension; drop it.
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # Targets beyond the sequence are clamped to `ignored_index`, which the loss skips.
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        if not return_dict:
            output = (start_logits, end_logits) + outputs[2:]
            return ((total_loss,) + output) if total_loss is not None else output

        return QuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

`.\models\layoutlm\modeling_tf_layoutlm.py`

# coding=utf-8
# Copyright 2018 The Microsoft Research Asia LayoutLM Team Authors and the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" TF 2.0 LayoutLM model."""


from __future__ import annotations

import math
import warnings
from typing import Dict, Optional, Tuple, Union

import numpy as np
import tensorflow as tf

from ...activations_tf import get_tf_activation
from ...modeling_tf_outputs import (
    TFBaseModelOutputWithPastAndCrossAttentions,
    TFBaseModelOutputWithPoolingAndCrossAttentions,
    TFMaskedLMOutput,
    TFQuestionAnsweringModelOutput,
    TFSequenceClassifierOutput,
    TFTokenClassifierOutput,
)
from ...modeling_tf_utils import (
    TFMaskedLanguageModelingLoss,
    TFModelInputType,
    TFPreTrainedModel,
    TFQuestionAnsweringLoss,
    TFSequenceClassificationLoss,
    TFTokenClassificationLoss,
    get_initializer,
    keras,
    keras_serializable,
    unpack_inputs,
)
from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax
from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
from .configuration_layoutlm import LayoutLMConfig


# Module-level logger obtained from the library's `logging` utilities
logger = logging.get_logger(__name__)

# Name of the configuration class as a string; presumably consumed by the docstring
# decorators imported above — verify against its uses further down the file
_CONFIG_FOR_DOC = "LayoutLMConfig"

# Identifiers of the pretrained LayoutLM checkpoints (presumably model-hub names — confirm)
TF_LAYOUTLM_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "microsoft/layoutlm-base-uncased",
    "microsoft/layoutlm-large-uncased",
]


class TFLayoutLMEmbeddings(keras.layers.Layer):
    """Construct the embeddings from word, position, token_type and 2-D (bounding box) position embeddings."""

    def __init__(self, config: LayoutLMConfig, **kwargs):
        """
        Store the sizes read from `config` and create the LayerNorm and dropout sub-layers.

        Args:
            config: Model configuration; `hidden_size`, `max_position_embeddings`,
                `max_2d_position_embeddings`, `initializer_range`, `layer_norm_eps` and
                `hidden_dropout_prob` are read here.
            **kwargs: Forwarded to `keras.layers.Layer.__init__`.
        """
        super().__init__(**kwargs)

        self.config = config
        self.hidden_size = config.hidden_size
        self.max_position_embeddings = config.max_position_embeddings
        self.max_2d_position_embeddings = config.max_2d_position_embeddings
        self.initializer_range = config.initializer_range
        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
        self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)

    def _add_embedding_table(self, scope: str, num_rows: int, weight_name: str = "embeddings"):
        """Create a `[num_rows, hidden_size]` weight inside the name scope `scope`."""
        with tf.name_scope(scope):
            return self.add_weight(
                name=weight_name,
                shape=[num_rows, self.hidden_size],
                initializer=get_initializer(self.initializer_range),
            )

    def build(self, input_shape=None):
        """
        Create the embedding tables and build the LayerNorm sub-layer.

        Each table is created under its own name scope. Without the scopes, the six tables
        named "embeddings" would be indistinguishable by variable name.
        """
        max_2d = self.max_2d_position_embeddings
        self.weight = self._add_embedding_table("word_embeddings", self.config.vocab_size, weight_name="weight")
        self.token_type_embeddings = self._add_embedding_table("token_type_embeddings", self.config.type_vocab_size)
        self.position_embeddings = self._add_embedding_table("position_embeddings", self.max_position_embeddings)
        self.x_position_embeddings = self._add_embedding_table("x_position_embeddings", max_2d)
        self.y_position_embeddings = self._add_embedding_table("y_position_embeddings", max_2d)
        self.h_position_embeddings = self._add_embedding_table("h_position_embeddings", max_2d)
        self.w_position_embeddings = self._add_embedding_table("w_position_embeddings", max_2d)

        # NOTE(review): the `built` guard sits after the weight creation, as in the original
        # code, so it only protects the LayerNorm build below — confirm this order is intended.
        if self.built:
            return
        self.built = True

        if getattr(self, "LayerNorm", None) is not None:
            with tf.name_scope(self.LayerNorm.name):
                self.LayerNorm.build([None, None, self.config.hidden_size])

    def call(
        self,
        input_ids: tf.Tensor = None,
        bbox: tf.Tensor = None,
        position_ids: tf.Tensor = None,
        token_type_ids: tf.Tensor = None,
        inputs_embeds: tf.Tensor = None,
        training: bool = False,
    ) -> tf.Tensor:
        """
        Applies embedding based on inputs tensor.

        Args:
            input_ids: Token ids; looked up in the word embedding table when given.
            bbox: Boxes indexed as `bbox[:, :, 0..3]` = (x0, y0, x1, y1); all zeros when omitted.
            position_ids: Position indices; default to `0 .. sequence_length - 1`.
            token_type_ids: Token type indices; default to all zeros.
            inputs_embeds: Pre-computed word embeddings, used when `input_ids` is None.
            training: Forwarded to the dropout layer.

        Returns:
            final_embeddings (`tf.Tensor`): output embedding tensor.
        """
        assert not (input_ids is None and inputs_embeds is None)

        if input_ids is not None:
            check_embeddings_within_bounds(input_ids, self.config.vocab_size)
            inputs_embeds = tf.gather(params=self.weight, indices=input_ids)

        # Everything but the trailing hidden dimension.
        input_shape = shape_list(inputs_embeds)[:-1]

        if token_type_ids is None:
            token_type_ids = tf.fill(dims=input_shape, value=0)

        if position_ids is None:
            position_ids = tf.expand_dims(tf.range(start=0, limit=input_shape[-1]), axis=0)

        if bbox is None:
            bbox = tf.fill(input_shape + [4], value=0)

        try:
            left_position_embeddings = tf.gather(self.x_position_embeddings, bbox[:, :, 0])
            upper_position_embeddings = tf.gather(self.y_position_embeddings, bbox[:, :, 1])
            right_position_embeddings = tf.gather(self.x_position_embeddings, bbox[:, :, 2])
            lower_position_embeddings = tf.gather(self.y_position_embeddings, bbox[:, :, 3])
        except IndexError as e:
            raise IndexError("The `bbox`coordinate values should be within 0-1000 range.") from e

        # Box height (y1 - y0) and width (x1 - x0) index their own tables.
        h_position_embeddings = tf.gather(self.h_position_embeddings, bbox[:, :, 3] - bbox[:, :, 1])
        w_position_embeddings = tf.gather(self.w_position_embeddings, bbox[:, :, 2] - bbox[:, :, 0])

        position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids)
        token_type_embeds = tf.gather(params=self.token_type_embeddings, indices=token_type_ids)

        # The final embedding is the sum of all nine components.
        final_embeddings = (
            inputs_embeds
            + position_embeds
            + token_type_embeds
            + left_position_embeddings
            + upper_position_embeddings
            + right_position_embeddings
            + lower_position_embeddings
            + h_position_embeddings
            + w_position_embeddings
        )
        final_embeddings = self.LayerNorm(inputs=final_embeddings)
        final_embeddings = self.dropout(inputs=final_embeddings, training=training)

        return final_embeddings
# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention with Bert->LayoutLM
class TFLayoutLMSelfAttention(keras.layers.Layer):
    """Multi-head scaled dot-product attention with optional cross-attention and key/value caching."""

    def __init__(self, config: LayoutLMConfig, **kwargs):
        """
        Create the query/key/value projections and the attention dropout.

        Raises:
            ValueError: If `config.hidden_size` is not a multiple of `config.num_attention_heads`.
        """
        super().__init__(**kwargs)

        if config.hidden_size % config.num_attention_heads != 0:
            raise ValueError(
                f"The hidden size ({config.hidden_size}) is not a multiple of the number "
                f"of attention heads ({config.num_attention_heads})"
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size
        # Scores are divided by sqrt(head size) before the softmax.
        self.sqrt_att_head_size = math.sqrt(self.attention_head_size)

        self.query = keras.layers.Dense(
            units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query"
        )
        self.key = keras.layers.Dense(
            units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key"
        )
        self.value = keras.layers.Dense(
            units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value"
        )
        self.dropout = keras.layers.Dropout(rate=config.attention_probs_dropout_prob)

        self.is_decoder = config.is_decoder
        self.config = config

    def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor:
        """Split the last dimension into heads and move the head axis in front of the sequence axis."""
        # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size]
        tensor = tf.reshape(tensor=tensor, shape=(batch_size, -1, self.num_attention_heads, self.attention_head_size))

        # Transpose the tensor from [batch_size, seq_length, num_attention_heads, attention_head_size] to [batch_size, num_attention_heads, seq_length, attention_head_size]
        return tf.transpose(tensor, perm=[0, 2, 1, 3])

    def call(
        self,
        hidden_states: tf.Tensor,
        attention_mask: tf.Tensor,
        head_mask: tf.Tensor,
        encoder_hidden_states: tf.Tensor,
        encoder_attention_mask: tf.Tensor,
        past_key_value: Tuple[tf.Tensor],
        output_attentions: bool,
        training: bool = False,
    ) -> Tuple[tf.Tensor]:
        """
        Compute attention over `hidden_states` (or over `encoder_hidden_states` for cross-attention).

        The body was previously missing (only placeholder comments); it is restored from the
        BERT layer this class is copied from.

        Returns:
            A tuple `(attention_output,)`, extended with the attention probabilities when
            `output_attentions` is true and with the `(key, value)` pair when the layer is a decoder.
        """
        batch_size = shape_list(hidden_states)[0]
        mixed_query_layer = self.query(inputs=hidden_states)

        # When encoder states are given this is cross-attention: keys and values come from
        # the encoder, and the encoder's mask decides which positions may be attended to.
        is_cross_attention = encoder_hidden_states is not None

        if is_cross_attention and past_key_value is not None:
            # Reuse the cached cross-attention keys and values.
            key_layer = past_key_value[0]
            value_layer = past_key_value[1]
            attention_mask = encoder_attention_mask
        elif is_cross_attention:
            key_layer = self.transpose_for_scores(self.key(inputs=encoder_hidden_states), batch_size)
            value_layer = self.transpose_for_scores(self.value(inputs=encoder_hidden_states), batch_size)
            attention_mask = encoder_attention_mask
        elif past_key_value is not None:
            # Self-attention with a cache: append the new keys/values along the sequence axis.
            key_layer = self.transpose_for_scores(self.key(inputs=hidden_states), batch_size)
            value_layer = self.transpose_for_scores(self.value(inputs=hidden_states), batch_size)
            key_layer = tf.concat([past_key_value[0], key_layer], axis=2)
            value_layer = tf.concat([past_key_value[1], value_layer], axis=2)
        else:
            key_layer = self.transpose_for_scores(self.key(inputs=hidden_states), batch_size)
            value_layer = self.transpose_for_scores(self.value(inputs=hidden_states), batch_size)

        query_layer = self.transpose_for_scores(mixed_query_layer, batch_size)

        if self.is_decoder:
            # Hand the keys/values back so the caller can cache them for the next step.
            past_key_value = (key_layer, value_layer)

        # Raw scores: query times transposed key, shape (batch_size, num_heads, seq_len_q, seq_len_k).
        attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
        dk = tf.cast(self.sqrt_att_head_size, dtype=attention_scores.dtype)
        attention_scores = tf.divide(attention_scores, dk)

        if attention_mask is not None:
            # The mask is additive; it is expected to be precomputed by the caller.
            attention_scores = tf.add(attention_scores, attention_mask)

        # Normalize the scores to probabilities.
        attention_probs = stable_softmax(logits=attention_scores, axis=-1)

        # Dropout here removes entire tokens to attend to.
        attention_probs = self.dropout(inputs=attention_probs, training=training)

        if head_mask is not None:
            attention_probs = tf.multiply(attention_probs, head_mask)

        attention_output = tf.matmul(attention_probs, value_layer)
        attention_output = tf.transpose(attention_output, perm=[0, 2, 1, 3])

        # Merge the heads back: (batch_size, seq_len_q, all_head_size).
        attention_output = tf.reshape(tensor=attention_output, shape=(batch_size, -1, self.all_head_size))
        outputs = (attention_output, attention_probs) if output_attentions else (attention_output,)

        if self.is_decoder:
            outputs = outputs + (past_key_value,)
        return outputs

    def build(self, input_shape=None):
        """Build the query, key and value projections, each under its own name scope (runs once)."""
        if self.built:
            return
        self.built = True
        if getattr(self, "query", None) is not None:
            with tf.name_scope(self.query.name):
                self.query.build([None, None, self.config.hidden_size])
        if getattr(self, "key", None) is not None:
            with tf.name_scope(self.key.name):
                self.key.build([None, None, self.config.hidden_size])
        if getattr(self, "value", None) is not None:
            with tf.name_scope(self.value.name):
                self.value.build([None, None, self.config.hidden_size])
# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput with Bert->LayoutLM
class TFLayoutLMSelfOutput(keras.layers.Layer):
    """Dense projection, dropout, then a residual add followed by layer normalization."""

    def __init__(self, config: LayoutLMConfig, **kwargs):
        super().__init__(**kwargs)

        # Projection with `hidden_size` output units
        self.dense = keras.layers.Dense(
            units=config.hidden_size,
            kernel_initializer=get_initializer(config.initializer_range),
            name="dense",
        )
        # Normalization applied after the residual addition
        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
        # Dropout applied to the projected states
        self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
        self.config = config

    def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
        """Return `LayerNorm(dropout(dense(hidden_states)) + input_tensor)`."""
        projected = self.dense(inputs=hidden_states)
        regularized = self.dropout(inputs=projected, training=training)
        return self.LayerNorm(inputs=regularized + input_tensor)

    def build(self, input_shape=None):
        """Build the dense and LayerNorm sub-layers under their own name scopes (runs once)."""
        if self.built:
            return
        self.built = True
        # Both sub-layers are built for inputs whose last dimension is `hidden_size`.
        for attr_name in ("dense", "LayerNorm"):
            sublayer = getattr(self, attr_name, None)
            if sublayer is None:
                continue
            with tf.name_scope(sublayer.name):
                sublayer.build([None, None, self.config.hidden_size])


# Copied from transformers.models.bert.modeling_tf_bert.TFBertAttention with Bert->LayoutLM
class TFLayoutLMAttention(keras.layers.Layer):
    """Self-attention layer followed by its output layer."""

    def __init__(self, config: LayoutLMConfig, **kwargs):
        super().__init__(**kwargs)

        # Attention sub-layer (named "self") and the output sub-layer (named "output")
        self.self_attention = TFLayoutLMSelfAttention(config, name="self")
        self.dense_output = TFLayoutLMSelfOutput(config, name="output")

    def prune_heads(self, heads):
        """Head pruning is not implemented for this layer."""
        raise NotImplementedError

    def call(
        self,
        input_tensor: tf.Tensor,
        attention_mask: tf.Tensor,
        head_mask: tf.Tensor,
        encoder_hidden_states: tf.Tensor,
        encoder_attention_mask: tf.Tensor,
        past_key_value: Tuple[tf.Tensor],
        output_attentions: bool,
        training: bool = False,
    ) -> Tuple[tf.Tensor]:
        """
        Run the attention sub-layer and pass its first output through the output sub-layer.

        Returns:
            A tuple whose first item is the processed attention output, followed by any
            further items the attention sub-layer returned.
        """
        attn_results = self.self_attention(
            hidden_states=input_tensor,
            attention_mask=attention_mask,
            head_mask=head_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            training=training,
        )
        # `input_tensor` is handed to the output sub-layer as its `input_tensor` argument.
        processed = self.dense_output(hidden_states=attn_results[0], input_tensor=input_tensor, training=training)
        # Keep whatever else the attention sub-layer returned after its first item.
        return (processed,) + attn_results[1:]

    def build(self, input_shape=None):
        """Build both sub-layers, each under its own name scope (runs once)."""
        if self.built:
            return
        self.built = True

        for attr_name in ("self_attention", "dense_output"):
            sublayer = getattr(self, attr_name, None)
            if sublayer is None:
                continue
            with tf.name_scope(sublayer.name):
                sublayer.build(None)
# 从 transformers.models.bert.modeling_tf_bert.TFBertIntermediate 复制并修改为 LayoutLM
class TFLayoutLMIntermediate(keras.layers.Layer):
    """Dense projection to ``config.intermediate_size`` followed by an activation."""

    def __init__(self, config: LayoutLMConfig, **kwargs):
        """Create the dense layer and resolve the activation from ``config.hidden_act``."""
        super().__init__(**kwargs)

        self.dense = keras.layers.Dense(
            units=config.intermediate_size,
            kernel_initializer=get_initializer(config.initializer_range),
            name="dense",
        )

        # A string is resolved through `get_tf_activation`; anything else is used as given.
        activation = config.hidden_act
        self.intermediate_act_fn = get_tf_activation(activation) if isinstance(activation, str) else activation
        self.config = config

    def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
        """Apply the dense layer, then the activation, and return the result."""
        return self.intermediate_act_fn(self.dense(inputs=hidden_states))

    def build(self, input_shape=None):
        """Build the dense layer once under its own name scope.

        Args:
            input_shape: Accepted for interface compatibility; not used.
        """
        if self.built:
            return
        self.built = True

        dense = getattr(self, "dense", None)
        if dense is not None:
            with tf.name_scope(dense.name):
                # The dense layer is built for inputs whose last dimension is `config.hidden_size`.
                dense.build([None, None, self.config.hidden_size])


# Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput with Bert->LayoutLM
class TFLayoutLMOutput(keras.layers.Layer):
    """Dense projection to ``config.hidden_size``, dropout, then LayerNorm over the residual sum."""

    def __init__(self, config: LayoutLMConfig, **kwargs):
        """Create the dense, LayerNorm and dropout sub-layers from ``config``."""
        super().__init__(**kwargs)

        self.dense = keras.layers.Dense(
            units=config.hidden_size,
            kernel_initializer=get_initializer(config.initializer_range),
            name="dense",
        )
        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
        self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
        self.config = config

    def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
        """Project ``hidden_states``, apply dropout, add ``input_tensor`` and normalize.

        Args:
            hidden_states: Tensor fed to the dense layer.
            input_tensor: Tensor added to the projected result before LayerNorm.
            training: Forwarded to the dropout layer.

        Returns:
            The output of ``self.LayerNorm``.
        """
        projected = self.dense(inputs=hidden_states)
        projected = self.dropout(inputs=projected, training=training)
        return self.LayerNorm(inputs=projected + input_tensor)

    def build(self, input_shape=None):
        """Build the dense and LayerNorm sub-layers once, each under its own name scope.

        Args:
            input_shape: Accepted for interface compatibility; not used.
        """
        if self.built:
            return
        self.built = True

        dense = getattr(self, "dense", None)
        if dense is not None:
            with tf.name_scope(dense.name):
                # The dense layer consumes inputs of width `config.intermediate_size`.
                dense.build([None, None, self.config.intermediate_size])

        layer_norm = getattr(self, "LayerNorm", None)
        if layer_norm is not None:
            with tf.name_scope(layer_norm.name):
                layer_norm.build([None, None, self.config.hidden_size])


# Copied from transformers.models.bert.modeling_tf_bert.TFBertLayer with Bert->LayoutLM
class TFLayoutLMLayer(keras.layers.Layer):
    """One transformer block: self-attention, optional cross-attention, then the feed-forward pair.

    Cross-attention is only created when ``config.add_cross_attention`` is set, and
    that in turn requires ``config.is_decoder``.
    """

    # Store the decoder / cross-attention flags and create the sub-layers.
    def __init__(self, config: LayoutLMConfig, **kwargs):
        super().__init__(**kwargs)

        # Self-attention sub-layer.
        self.attention = TFLayoutLMAttention(config, name="attention")

        # Whether this layer behaves as a decoder layer.
        self.is_decoder = config.is_decoder

        # Whether a cross-attention sub-layer is requested.
        self.add_cross_attention = config.add_cross_attention

        if self.add_cross_attention:
            # Cross-attention is only allowed on decoder layers.
            if not self.is_decoder:
                raise ValueError(f"{self} should be used as a decoder model if cross attention is added")

            # Cross-attention sub-layer (same class as self-attention).
            self.crossattention = TFLayoutLMAttention(config, name="crossattention")

        # Feed-forward expansion.
        self.intermediate = TFLayoutLMIntermediate(config, name="intermediate")

        # Feed-forward output; stored as `bert_output` but named "output".
        self.bert_output = TFLayoutLMOutput(config, name="output")

    # Forward pass of the block.
    #
    # Returns a tuple that starts with the layer output, followed by the extra
    # elements returned by the attention sub-layers, and, for decoder layers,
    # ends with the present key/value cache.
    def call(
        self,
        hidden_states: tf.Tensor,                          # input hidden states
        attention_mask: tf.Tensor,                         # attention mask
        head_mask: tf.Tensor,                              # head mask for this layer
        encoder_hidden_states: tf.Tensor | None,           # encoder hidden states, or None
        encoder_attention_mask: tf.Tensor | None,          # encoder attention mask, or None
        past_key_value: Tuple[tf.Tensor] | None,           # cached key/value tuple, or None
        output_attentions: bool,                           # forwarded to the attention sub-layers
        training: bool = False,                            # training-mode flag, defaults to False
    ) -> Tuple[tf.Tensor]:
        # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
        # Take the first two cached entries for self-attention when a cache is given.
        self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
        # Self-attention never receives encoder states here.
        self_attention_outputs = self.attention(
            input_tensor=hidden_states,
            attention_mask=attention_mask,
            head_mask=head_mask,
            encoder_hidden_states=None,
            encoder_attention_mask=None,
            past_key_value=self_attn_past_key_value,
            output_attentions=output_attentions,
            training=training,
        )
        # First element is the attention output tensor.
        attention_output = self_attention_outputs[0]

        # For a decoder, the last element is treated as the self-attention cache.
        if self.is_decoder:
            # Keep only the elements between the output tensor and the cache.
            outputs = self_attention_outputs[1:-1]
            # The last element becomes the present key/value cache.
            present_key_value = self_attention_outputs[-1]
        else:
            # Keep everything after the output tensor.
            outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights


        cross_attn_present_key_value = None
        # Cross-attention runs only for decoder layers that were given encoder states.
        if self.is_decoder and encoder_hidden_states is not None:
            # The cross-attention sub-layer only exists when it was requested at construction time.
            if not hasattr(self, "crossattention"):
                raise ValueError(
                    f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers"
                    " by setting `config.add_cross_attention=True`"
                )

            # The cross-attention cache is taken from the last two entries of the cached tuple.
            cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
            # Cross-attention consumes the self-attention output.
            cross_attention_outputs = self.crossattention(
                input_tensor=attention_output,
                attention_mask=attention_mask,
                head_mask=head_mask,
                encoder_hidden_states=encoder_hidden_states,
                encoder_attention_mask=encoder_attention_mask,
                past_key_value=cross_attn_past_key_value,
                output_attentions=output_attentions,
                training=training,
            )
            # The cross-attention output replaces the self-attention output.
            attention_output = cross_attention_outputs[0]
            # Append the middle elements of the cross-attention result (first and last excluded).
            outputs = outputs + cross_attention_outputs[1:-1]

            # Extend the present cache with the cross-attention cache.
            cross_attn_present_key_value = cross_attention_outputs[-1]
            present_key_value = present_key_value + cross_attn_present_key_value

        # Feed-forward expansion of the attention output.
        intermediate_output = self.intermediate(hidden_states=attention_output)
        # Feed-forward output, with the attention output passed as `input_tensor`.
        layer_output = self.bert_output(
            hidden_states=intermediate_output, input_tensor=attention_output, training=training
        )
        # The layer output goes first, followed by the collected extras.
        outputs = (layer_output,) + outputs

        # Decoder layers append the present key/value cache as the last element.
        if self.is_decoder:
            outputs = outputs + (present_key_value,)

        return outputs

    # Build the sub-layers once, each under its own name scope; returns early if already built.
    def build(self, input_shape=None):
        if self.built:
            return
        # Mark as built before building the children.
        self.built = True

        # Self-attention sub-layer.
        if getattr(self, "attention", None) is not None:
            with tf.name_scope(self.attention.name):
                self.attention.build(None)

        # Feed-forward expansion.
        if getattr(self, "intermediate", None) is not None:
            with tf.name_scope(self.intermediate.name):
                self.intermediate.build(None)

        # Feed-forward output.
        if getattr(self, "bert_output", None) is not None:
            with tf.name_scope(self.bert_output.name):
                self.bert_output.build(None)

        # Cross-attention sub-layer; the attribute only exists when it was created in `__init__`.
        if getattr(self, "crossattention", None) is not None:
            with tf.name_scope(self.crossattention.name):
                self.crossattention.build(None)
# Copied from transformers.models.bert.modeling_tf_bert.TFBertEncoder with Bert->LayoutLM
class TFLayoutLMEncoder(keras.layers.Layer):
    """Stack of ``config.num_hidden_layers`` ``TFLayoutLMLayer`` blocks applied in sequence."""

    def __init__(self, config: LayoutLMConfig, **kwargs):
        """Create the layer stack; the blocks are named ``layer_._0``, ``layer_._1``, and so on."""
        super().__init__(**kwargs)
        self.config = config
        self.layer = [TFLayoutLMLayer(config, name=f"layer_._{i}") for i in range(config.num_hidden_layers)]

    def call(
        self,
        hidden_states: tf.Tensor,
        attention_mask: tf.Tensor,
        head_mask: tf.Tensor,
        encoder_hidden_states: tf.Tensor | None,
        encoder_attention_mask: tf.Tensor | None,
        past_key_values: Tuple[Tuple[tf.Tensor]] | None,
        use_cache: Optional[bool],
        output_attentions: bool,
        output_hidden_states: bool,
        return_dict: bool,
        training: bool = False,
    ) -> Union[TFBaseModelOutputWithPastAndCrossAttentions, Tuple[tf.Tensor]]:
        """Run every layer in order, optionally collecting per-layer outputs.

        Args:
            hidden_states: Input to the first layer.
            attention_mask: Forwarded to every layer.
            head_mask: Indexed per layer (``head_mask[i]``).
            encoder_hidden_states: Forwarded to every layer.
            encoder_attention_mask: Forwarded to every layer.
            past_key_values: Indexed per layer when not ``None``.
            use_cache: When truthy, the last element of each layer's output is collected.
            output_attentions: When truthy, attention outputs are collected.
            output_hidden_states: When truthy, the input of every layer and the final
                output are collected.
            return_dict: Selects between the output object and a plain tuple.
            training: Forwarded to every layer.

        Returns:
            A ``TFBaseModelOutputWithPastAndCrossAttentions`` when ``return_dict`` is
            truthy; otherwise a tuple of the non-``None`` values among the final
            hidden states, collected hidden states, attentions and cross-attentions.
        """
        hidden_trace = () if output_hidden_states else None
        attn_trace = () if output_attentions else None
        cross_attn_trace = () if output_attentions and self.config.add_cross_attention else None
        cache = () if use_cache else None

        for idx, block in enumerate(self.layer):
            if output_hidden_states:
                # Record the input of this layer.
                hidden_trace += (hidden_states,)

            layer_past = None if past_key_values is None else past_key_values[idx]

            block_out = block(
                hidden_states=hidden_states,
                attention_mask=attention_mask,
                head_mask=head_mask[idx],
                encoder_hidden_states=encoder_hidden_states,
                encoder_attention_mask=encoder_attention_mask,
                past_key_value=layer_past,
                output_attentions=output_attentions,
                training=training,
            )
            hidden_states = block_out[0]

            if use_cache:
                cache += (block_out[-1],)

            if output_attentions:
                attn_trace += (block_out[1],)
                if self.config.add_cross_attention and encoder_hidden_states is not None:
                    cross_attn_trace += (block_out[2],)

        if output_hidden_states:
            # Record the output of the last layer as well.
            hidden_trace += (hidden_states,)

        if return_dict:
            return TFBaseModelOutputWithPastAndCrossAttentions(
                last_hidden_state=hidden_states,
                past_key_values=cache,
                hidden_states=hidden_trace,
                attentions=attn_trace,
                cross_attentions=cross_attn_trace,
            )

        # The tuple form does not include the cache.
        candidates = [hidden_states, hidden_trace, attn_trace, cross_attn_trace]
        return tuple(item for item in candidates if item is not None)

    def build(self, input_shape=None):
        """Build every layer of the stack once, each under its own name scope.

        Args:
            input_shape: Accepted for interface compatibility; not used.
        """
        if self.built:
            return
        self.built = True

        blocks = getattr(self, "layer", None)
        if blocks is None:
            return
        for block in blocks:
            with tf.name_scope(block.name):
                block.build(None)
# Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler with Bert->LayoutLM
class TFLayoutLMPooler(keras.layers.Layer):
    """Pool a sequence by passing its first position through a tanh-activated dense layer."""

    def __init__(self, config: LayoutLMConfig, **kwargs):
        """Create the dense layer (``config.hidden_size`` units, ``tanh`` activation)."""
        super().__init__(**kwargs)

        self.dense = keras.layers.Dense(
            units=config.hidden_size,
            kernel_initializer=get_initializer(config.initializer_range),
            activation="tanh",
            name="dense",
        )
        self.config = config

    def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
        """Return the dense layer applied to ``hidden_states[:, 0]``."""
        return self.dense(inputs=hidden_states[:, 0])

    def build(self, input_shape=None):
        """Build the dense layer once under its own name scope.

        Args:
            input_shape: Accepted for interface compatibility; not used.
        """
        if self.built:
            return
        self.built = True

        dense = getattr(self, "dense", None)
        if dense is not None:
            with tf.name_scope(dense.name):
                dense.build([None, None, self.config.hidden_size])


# Copied from transformers.models.bert.modeling_tf_bert.TFBertPredictionHeadTransform with Bert->LayoutLM
class TFLayoutLMPredictionHeadTransform(keras.layers.Layer):
    """Dense projection, activation and LayerNorm applied in that order."""

    def __init__(self, config: LayoutLMConfig, **kwargs):
        """Create the dense layer, resolve the activation and create the LayerNorm."""
        super().__init__(**kwargs)

        self.dense = keras.layers.Dense(
            units=config.hidden_size,
            kernel_initializer=get_initializer(config.initializer_range),
            name="dense",
        )

        # A string is resolved through `get_tf_activation`; anything else is used as given.
        activation = config.hidden_act
        self.transform_act_fn = get_tf_activation(activation) if isinstance(activation, str) else activation

        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
        self.config = config

    def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
        """Return ``LayerNorm(activation(dense(hidden_states)))``."""
        activated = self.transform_act_fn(self.dense(inputs=hidden_states))
        return self.LayerNorm(inputs=activated)

    def build(self, input_shape=None):
        """Build the dense and LayerNorm sub-layers once, each under its own name scope.

        Args:
            input_shape: Accepted for interface compatibility; not used.
        """
        if self.built:
            return
        self.built = True

        width = [None, None, self.config.hidden_size]
        for attr_name in ("dense", "LayerNorm"):
            sublayer = getattr(self, attr_name, None)
            if sublayer is not None:
                with tf.name_scope(sublayer.name):
                    # Each sub-layer gets its own shape list.
                    sublayer.build(list(width))


# Copied from transformers.models.bert.modeling_tf_bert.TFBertLMPredictionHead with Bert->LayoutLM
class TFLayoutLMLMPredictionHead(keras.layers.Layer):
    """Language-modeling head: transform, multiply by the embedding matrix, add a bias.

    The projection reuses ``input_embeddings.weight`` (transposed) instead of
    owning a separate matrix; only the bias of shape ``(config.vocab_size,)``
    is created by this layer.
    """

    def __init__(self, config: LayoutLMConfig, input_embeddings: keras.layers.Layer, **kwargs):
        """Store the configuration and the shared embedding layer, and create the transform.

        Args:
            config: Model configuration; ``hidden_size`` and ``vocab_size`` are read from it.
            input_embeddings: Layer whose ``weight`` attribute is used as the projection matrix.
            **kwargs: Forwarded to ``keras.layers.Layer``.
        """
        super().__init__(**kwargs)

        self.config = config
        self.hidden_size = config.hidden_size

        self.transform = TFLayoutLMPredictionHeadTransform(config, name="transform")

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        self.input_embeddings = input_embeddings

    def build(self, input_shape=None):
        """Create the bias and build the transform once.

        The ``built`` guard comes first so that a repeated ``build()`` call does not
        create a second bias variable, matching the other layers in this file.

        Args:
            input_shape: Accepted for interface compatibility; not used.
        """
        if self.built:
            return
        self.built = True

        # The bias is created before the transform is built, as in the original order.
        self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias")

        if getattr(self, "transform", None) is not None:
            with tf.name_scope(self.transform.name):
                self.transform.build(None)

    def get_output_embeddings(self) -> keras.layers.Layer:
        """Return the shared embedding layer used for the output projection."""
        return self.input_embeddings

    def set_output_embeddings(self, value: tf.Variable):
        """Replace the shared embedding weight and update its recorded vocabulary size.

        Args:
            value: New weight; its first dimension becomes ``input_embeddings.vocab_size``.
        """
        self.input_embeddings.weight = value
        self.input_embeddings.vocab_size = shape_list(value)[0]

    def get_bias(self) -> Dict[str, tf.Variable]:
        """Return the bias wrapped in a dict under the key ``"bias"``."""
        return {"bias": self.bias}

    def set_bias(self, value: Dict[str, tf.Variable]):
        """Replace the bias and update ``config.vocab_size`` from its length.

        Args:
            value: Mapping that holds the new bias under the key ``"bias"``.
        """
        self.bias = value["bias"]
        self.config.vocab_size = shape_list(value["bias"])[0]

    def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
        """Compute vocabulary scores for every position.

        Args:
            hidden_states: Tensor whose second dimension is read as the sequence length
                and which is reshaped to rows of width ``self.hidden_size``.

        Returns:
            Tensor reshaped to ``[-1, seq_length, config.vocab_size]`` with the bias added.
        """
        hidden_states = self.transform(hidden_states=hidden_states)

        # Remember the sequence length so the flattened result can be restored.
        seq_length = shape_list(hidden_states)[1]

        # Flatten to 2-D so a single matmul against the embedding matrix suffices.
        hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.hidden_size])
        hidden_states = tf.matmul(a=hidden_states, b=self.input_embeddings.weight, transpose_b=True)

        hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.config.vocab_size])
        hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias)

        return hidden_states
# Copied from transformers.models.bert.modeling_tf_bert.TFBertMLMHead with Bert->LayoutLM
class TFLayoutLMMLMHead(keras.layers.Layer):
    """Thin wrapper that owns the masked-language-modeling prediction head."""

    def __init__(self, config: LayoutLMConfig, input_embeddings: keras.layers.Layer, **kwargs):
        """Create the prediction head (named ``predictions``) from ``config`` and ``input_embeddings``."""
        super().__init__(**kwargs)
        self.predictions = TFLayoutLMLMPredictionHead(config, input_embeddings, name="predictions")

    def call(self, sequence_output: tf.Tensor) -> tf.Tensor:
        """Return the prediction head applied to ``sequence_output``."""
        return self.predictions(hidden_states=sequence_output)

    def build(self, input_shape=None):
        """Build the prediction head once under its own name scope.

        Args:
            input_shape: Accepted for interface compatibility; not used.
        """
        if self.built:
            return
        self.built = True

        head = getattr(self, "predictions", None)
        if head is not None:
            with tf.name_scope(head.name):
                head.build(None)


@keras_serializable
class TFLayoutLMMainLayer(keras.layers.Layer):
    config_class = LayoutLMConfig

    def __init__(self, config: LayoutLMConfig, add_pooling_layer: bool = True, **kwargs):
        """Create the embeddings, the encoder and, optionally, the pooler.

        Args:
            config: Model configuration passed to every sub-layer.
            add_pooling_layer: When falsy, ``self.pooler`` is set to ``None``.
            **kwargs: Forwarded to ``keras.layers.Layer``.
        """
        super().__init__(**kwargs)

        self.config = config
        self.embeddings = TFLayoutLMEmbeddings(config, name="embeddings")
        self.encoder = TFLayoutLMEncoder(config, name="encoder")

        if add_pooling_layer:
            self.pooler = TFLayoutLMPooler(config, name="pooler")
        else:
            self.pooler = None

    def get_input_embeddings(self) -> keras.layers.Layer:
        """Return the embeddings layer (``self.embeddings``)."""
        return self.embeddings

    def set_input_embeddings(self, value: tf.Variable):
        """Replace the embedding weight and update the recorded vocabulary size.

        Args:
            value: New weight; its first dimension becomes ``embeddings.vocab_size``.
        """
        embeddings = self.embeddings
        embeddings.weight = value
        embeddings.vocab_size = shape_list(value)[0]

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel

        Raises:
            NotImplementedError: Always; head pruning is not implemented for this layer.
        """
        # Head pruning is not implemented.
        raise NotImplementedError

    @unpack_inputs
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        bbox: np.ndarray | tf.Tensor | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        position_ids: np.ndarray | tf.Tensor | None = None,
        head_mask: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        encoder_hidden_states: np.ndarray | tf.Tensor | None = None,
        encoder_attention_mask: np.ndarray | tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: bool = False,
    # Build the embeddings, encoder and optional pooler sub-layers (does nothing once already built).
    def build(self, input_shape=None):
        """Build the embeddings, encoder and pooler once, each under its own name scope.

        Sub-layers that are missing or ``None`` (for instance a disabled pooler)
        are skipped.

        Args:
            input_shape: Accepted for interface compatibility; not used.
        """
        if not self.built:
            # Flag first so a repeated call returns without rebuilding anything.
            self.built = True

            for attr_name in ("embeddings", "encoder", "pooler"):
                sublayer = getattr(self, attr_name, None)
                if sublayer is None:
                    continue
                with tf.name_scope(sublayer.name):
                    sublayer.build(None)
"""
This model class extends TFPreTrainedModel and provides methods for weights initialization, downloading pretrained models,
and handling input signatures.
"""

class TFLayoutLMPreTrainedModel(TFPreTrainedModel):
    """
    Common base for the LayoutLM TensorFlow models.

    It pins the configuration class and the base-model prefix, and extends the
    inherited input signature with the ``bbox`` input.
    """

    # Configuration class associated with these models.
    config_class = LayoutLMConfig
    # Prefix used for the base model.
    base_model_prefix = "layoutlm"

    @property
    def input_signature(self):
        """
        Return the parent input signature with an extra ``"bbox"`` entry.

        Returns:
            The object returned by the parent property, with ``"bbox"`` set to a
            ``tf.TensorSpec`` of shape ``(None, None, 4)`` and dtype ``tf.int32``.
        """
        specs = super().input_signature
        bbox_spec = tf.TensorSpec(shape=(None, None, 4), dtype=tf.int32, name="bbox")
        specs["bbox"] = bbox_spec
        return specs


# Docstring fragment handed to `add_start_docstrings` on the model classes.
# In this file it contains only a newline.
LAYOUTLM_START_DOCSTRING = r"""
"""

# Docstring fragment for the forward pass; it is `.format(...)`-ed with the input
# shape description before being attached. In this file it contains only a newline.
LAYOUTLM_INPUTS_DOCSTRING = r"""
"""
@add_start_docstrings(
    "The bare LayoutLM Model transformer outputting raw hidden-states without any specific head on top.",
    LAYOUTLM_START_DOCSTRING,
)
class TFLayoutLMModel(TFLayoutLMPreTrainedModel):
    # Bare model: wraps `TFLayoutLMMainLayer` and returns its outputs unchanged.

    def __init__(self, config: LayoutLMConfig, *inputs, **kwargs):
        """Create the main layer under the name ``layoutlm``.

        Args:
            config: Model configuration.
            *inputs: Forwarded to the parent constructor.
            **kwargs: Forwarded to the parent constructor.
        """
        super().__init__(config, *inputs, **kwargs)

        self.layoutlm = TFLayoutLMMainLayer(config, name="layoutlm")

    # The `Returns:` line of the docstring below is filled in by
    # `replace_return_docstrings`, so it is intentionally left bare.
    @unpack_inputs
    @add_start_docstrings_to_model_forward(LAYOUTLM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @replace_return_docstrings(
        output_type=TFBaseModelOutputWithPoolingAndCrossAttentions, config_class=_CONFIG_FOR_DOC
    )
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        bbox: np.ndarray | tf.Tensor | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        position_ids: np.ndarray | tf.Tensor | None = None,
        head_mask: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        encoder_hidden_states: np.ndarray | tf.Tensor | None = None,
        encoder_attention_mask: np.ndarray | tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: Optional[bool] = False,
    ) -> Union[TFBaseModelOutputWithPoolingAndCrossAttentions, Tuple[tf.Tensor]]:
        r"""
        Returns:

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, TFLayoutLMModel
        >>> import tensorflow as tf

        >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/layoutlm-base-uncased")
        >>> model = TFLayoutLMModel.from_pretrained("microsoft/layoutlm-base-uncased")

        >>> words = ["Hello", "world"]
        >>> normalized_word_boxes = [637, 773, 693, 782], [698, 773, 733, 782]

        >>> token_boxes = []
        >>> for word, box in zip(words, normalized_word_boxes):
        ...     word_tokens = tokenizer.tokenize(word)
        ...     token_boxes.extend([box] * len(word_tokens))
        >>> # add bounding boxes of cls + sep tokens
        >>> token_boxes = [[0, 0, 0, 0]] + token_boxes + [[1000, 1000, 1000, 1000]]

        >>> encoding = tokenizer(" ".join(words), return_tensors="tf")
        >>> input_ids = encoding["input_ids"]
        >>> attention_mask = encoding["attention_mask"]
        >>> token_type_ids = encoding["token_type_ids"]
        >>> bbox = tf.convert_to_tensor([token_boxes])

        >>> outputs = model(
        ...     input_ids=input_ids, bbox=bbox, attention_mask=attention_mask, token_type_ids=token_type_ids
        ... )

        >>> last_hidden_states = outputs.last_hidden_state
        ```"""
        # NOTE(review): `encoder_hidden_states` and `encoder_attention_mask` are accepted
        # but not forwarded to the main layer; confirm whether that is intended.
        outputs = self.layoutlm(
            input_ids=input_ids,
            bbox=bbox,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        return outputs

    def build(self, input_shape=None):
        """Build the main layer once under its own name scope.

        Args:
            input_shape: Accepted for interface compatibility; not used.
        """
        if self.built:
            return
        self.built = True
        if getattr(self, "layoutlm", None) is not None:
            with tf.name_scope(self.layoutlm.name):
                self.layoutlm.build(None)
@add_start_docstrings("""LayoutLM Model with a `language modeling` head on top.""", LAYOUTLM_START_DOCSTRING)
class TFLayoutLMForMaskedLM(TFLayoutLMPreTrainedModel, TFMaskedLanguageModelingLoss):
    # names with a '.' represents the authorized unexpected/missing layers when a TF model is loaded from a PT model
    _keys_to_ignore_on_load_unexpected = [
        r"pooler",
        r"cls.seq_relationship",
        r"cls.predictions.decoder.weight",
        r"nsp___cls",
    ]

    def __init__(self, config: LayoutLMConfig, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)

        # Masked LM expects bi-directional self-attention, so warn when the config is flagged as a decoder.
        if config.is_decoder:
            logger.warning(
                "If you want to use `TFLayoutLMForMaskedLM` make sure `config.is_decoder=False` for "
                "bi-directional self-attention."
            )

        # Main LayoutLM layer, created with its pooling layer enabled.
        self.layoutlm = TFLayoutLMMainLayer(config, add_pooling_layer=True, name="layoutlm")

        # MLM head; it receives the main layer's embeddings as `input_embeddings`.
        self.mlm = TFLayoutLMMLMHead(config, input_embeddings=self.layoutlm.embeddings, name="mlm___cls")

    def get_lm_head(self) -> keras.layers.Layer:
        """Return the `predictions` layer of the MLM head."""
        return self.mlm.predictions

    def get_prefix_bias_name(self) -> str:
        """Return the name path `<model>/<mlm head>/<predictions layer>` (deprecated, emits a `FutureWarning`)."""
        warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning)
        return self.name + "/" + self.mlm.name + "/" + self.mlm.predictions.name

    @unpack_inputs
    @add_start_docstrings_to_model_forward(LAYOUTLM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @replace_return_docstrings(output_type=TFMaskedLMOutput, config_class=_CONFIG_FOR_DOC)
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        bbox: np.ndarray | tf.Tensor | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        position_ids: np.ndarray | tf.Tensor | None = None,
        head_mask: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: np.ndarray | tf.Tensor | None = None,
        training: Optional[bool] = False,
    ):
        r"""
        labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`

        Returns:

        Examples:

        ```
        >>> from transformers import AutoTokenizer, TFLayoutLMForMaskedLM
        >>> import tensorflow as tf

        >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/layoutlm-base-uncased")
        >>> model = TFLayoutLMForMaskedLM.from_pretrained("microsoft/layoutlm-base-uncased")

        >>> words = ["Hello", "[MASK]"]
        >>> normalized_word_boxes = [637, 773, 693, 782], [698, 773, 733, 782]

        >>> token_boxes = []
        >>> for word, box in zip(words, normalized_word_boxes):
        ...     word_tokens = tokenizer.tokenize(word)
        ...     token_boxes.extend([box] * len(word_tokens))
        >>> # add bounding boxes of cls + sep tokens
        >>> token_boxes = [[0, 0, 0, 0]] + token_boxes + [[1000, 1000, 1000, 1000]]

        >>> encoding = tokenizer(" ".join(words), return_tensors="tf")
        >>> input_ids = encoding["input_ids"]
        >>> attention_mask = encoding["attention_mask"]
        >>> token_type_ids = encoding["token_type_ids"]
        >>> bbox = tf.convert_to_tensor([token_boxes])

        >>> labels = tokenizer("Hello world", return_tensors="tf")["input_ids"]

        >>> outputs = model(
        ...     input_ids=input_ids,
        ...     bbox=bbox,
        ...     attention_mask=attention_mask,
        ...     token_type_ids=token_type_ids,
        ...     labels=labels,
        ... )

        >>> loss = outputs.loss
        ```
        """
        # Forward pass through the main LayoutLM layer.
        outputs = self.layoutlm(
            input_ids=input_ids,
            bbox=bbox,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        # The first output is fed to the MLM head to obtain the prediction scores.
        sequence_output = outputs[0]
        prediction_scores = self.mlm(sequence_output=sequence_output, training=training)

        # The loss is only computed when labels are provided.
        loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=prediction_scores)

        # Tuple output: scores followed by everything after the first two entries, loss first when present.
        if not return_dict:
            output = (prediction_scores,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TFMaskedLMOutput(
            loss=loss,
            logits=prediction_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def build(self, input_shape=None):
        """Build the `layoutlm` and `mlm` sub-layers under their own name scopes, at most once."""
        if self.built:
            return
        # Mark as built before building the sub-layers; later calls return early.
        self.built = True
        if getattr(self, "layoutlm", None) is not None:
            with tf.name_scope(self.layoutlm.name):
                self.layoutlm.build(None)
        if getattr(self, "mlm", None) is not None:
            with tf.name_scope(self.mlm.name):
                self.mlm.build(None)
"""
LayoutLM Model transformer with a sequence classification/regression head on top (a linear layer on top of the
pooled output) e.g. for GLUE tasks.
"""
# LayoutLM model for sequence classification or regression: a linear layer on top of the pooled output, e.g. for GLUE tasks.
@add_start_docstrings(
    """
    LayoutLM Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    """,
    LAYOUTLM_START_DOCSTRING,
)
class TFLayoutLMForSequenceClassification(TFLayoutLMPreTrainedModel, TFSequenceClassificationLoss):
    # names with a '.' represents the authorized unexpected/missing layers when a TF model is loaded from a PT model
    _keys_to_ignore_on_load_unexpected = [r"mlm___cls", r"nsp___cls", r"cls.predictions", r"cls.seq_relationship"]
    _keys_to_ignore_on_load_missing = [r"dropout"]

    def __init__(self, config: LayoutLMConfig, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)

        self.num_labels = config.num_labels

        self.layoutlm = TFLayoutLMMainLayer(config, name="layoutlm")
        self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
        # One output unit per label.
        self.classifier = keras.layers.Dense(
            units=config.num_labels,
            kernel_initializer=get_initializer(config.initializer_range),
            name="classifier",
        )
        # Kept so that `build` can read `hidden_size`.
        self.config = config

    @unpack_inputs
    @add_start_docstrings_to_model_forward(LAYOUTLM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @replace_return_docstrings(output_type=TFSequenceClassifierOutput, config_class=_CONFIG_FOR_DOC)
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        bbox: np.ndarray | tf.Tensor | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        position_ids: np.ndarray | tf.Tensor | None = None,
        head_mask: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: np.ndarray | tf.Tensor | None = None,
        training: Optional[bool] = False,
    ):
        r"""
        labels (`tf.Tensor` or `np.ndarray`, *optional*):
            Labels for computing the sequence classification/regression loss. When given, the loss is computed by
            `hf_compute_loss` from these labels and the classifier logits.

        Returns:
        """
        outputs = self.layoutlm(
            input_ids=input_ids,
            bbox=bbox,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
        # The classification head sits on the pooled output, taken here as the second entry of the main
        # layer's outputs (NOTE(review): confirm against TFLayoutLMMainLayer's output ordering).
        pooled_output = outputs[1]
        pooled_output = self.dropout(inputs=pooled_output, training=training)
        logits = self.classifier(inputs=pooled_output)
        # The loss is only computed when labels are provided.
        loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=logits)

        # Tuple output: logits followed by everything after the first two entries, loss first when present.
        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TFSequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def build(self, input_shape=None):
        """Build the `layoutlm` and `classifier` sub-layers under their own name scopes, at most once."""
        if self.built:
            return
        self.built = True
        if getattr(self, "layoutlm", None) is not None:
            with tf.name_scope(self.layoutlm.name):
                self.layoutlm.build(None)
        if getattr(self, "classifier", None) is not None:
            with tf.name_scope(self.classifier.name):
                self.classifier.build([None, None, self.config.hidden_size])


@add_start_docstrings(
    """
    LayoutLM Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
    Named-Entity-Recognition (NER) tasks.
    """,
    LAYOUTLM_START_DOCSTRING,
)
class TFLayoutLMForTokenClassification(TFLayoutLMPreTrainedModel, TFTokenClassificationLoss):
    # names with a '.' represents the authorized unexpected/missing layers when a TF model is loaded from a PT model
    _keys_to_ignore_on_load_unexpected = [
        r"pooler",
        r"mlm___cls",
        r"nsp___cls",
        r"cls.predictions",
        r"cls.seq_relationship",
    ]
    _keys_to_ignore_on_load_missing = [r"dropout"]

    def __init__(self, config: LayoutLMConfig, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)

        self.num_labels = config.num_labels

        # Main LayoutLM layer, created with its pooling layer enabled.
        self.layoutlm = TFLayoutLMMainLayer(config, add_pooling_layer=True, name="layoutlm")
        self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
        # One output unit per label.
        self.classifier = keras.layers.Dense(
            units=config.num_labels,
            kernel_initializer=get_initializer(config.initializer_range),
            name="classifier",
        )
        # Kept so that `build` can read `hidden_size`.
        self.config = config

    @unpack_inputs
    @add_start_docstrings_to_model_forward(LAYOUTLM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @replace_return_docstrings(output_type=TFTokenClassifierOutput, config_class=_CONFIG_FOR_DOC)
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        bbox: np.ndarray | tf.Tensor | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        position_ids: np.ndarray | tf.Tensor | None = None,
        head_mask: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: np.ndarray | tf.Tensor | None = None,
        training: Optional[bool] = False,
    ):
        r"""
        labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.

        Returns:

        Examples:

        ```
        >>> import tensorflow as tf
        >>> from transformers import AutoTokenizer, TFLayoutLMForTokenClassification

        >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/layoutlm-base-uncased")
        >>> model = TFLayoutLMForTokenClassification.from_pretrained("microsoft/layoutlm-base-uncased")

        >>> words = ["Hello", "world"]
        >>> normalized_word_boxes = [637, 773, 693, 782], [698, 773, 733, 782]

        >>> token_boxes = []
        >>> for word, box in zip(words, normalized_word_boxes):
        ...     word_tokens = tokenizer.tokenize(word)
        ...     token_boxes.extend([box] * len(word_tokens))
        >>> # add bounding boxes of cls + sep tokens
        >>> token_boxes = [[0, 0, 0, 0]] + token_boxes + [[1000, 1000, 1000, 1000]]

        >>> encoding = tokenizer(" ".join(words), return_tensors="tf")
        >>> input_ids = encoding["input_ids"]
        >>> attention_mask = encoding["attention_mask"]
        >>> token_type_ids = encoding["token_type_ids"]
        >>> bbox = tf.convert_to_tensor([token_boxes])
        >>> token_labels = tf.convert_to_tensor([1, 1, 0, 0])

        >>> outputs = model(
        ...     input_ids=input_ids,
        ...     bbox=bbox,
        ...     attention_mask=attention_mask,
        ...     token_type_ids=token_type_ids,
        ...     labels=token_labels,
        ... )

        >>> loss = outputs.loss
        >>> logits = outputs.logits
        ```"""
        outputs = self.layoutlm(
            input_ids=input_ids,
            bbox=bbox,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
        # The first output goes through dropout and then the classifier to produce the logits.
        sequence_output = outputs[0]
        sequence_output = self.dropout(inputs=sequence_output, training=training)
        logits = self.classifier(inputs=sequence_output)
        # The loss is only computed when labels are provided.
        loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=logits)

        # Tuple output: logits followed by everything after the first two entries, loss first when present.
        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TFTokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def build(self, input_shape=None):
        """Build the `layoutlm` and `classifier` sub-layers under their own name scopes, at most once."""
        if self.built:
            return
        self.built = True
        if getattr(self, "layoutlm", None) is not None:
            with tf.name_scope(self.layoutlm.name):
                self.layoutlm.build(None)
        if getattr(self, "classifier", None) is not None:
            with tf.name_scope(self.classifier.name):
                self.classifier.build([None, None, self.config.hidden_size])
"""
LayoutLM Model with a span classification head on top for extractive question-answering tasks such as
[DocVQA](https://rrc.cvc.uab.es/?ch=17) (a linear layer on top of the final hidden-states output to compute `span
start logits` and `span end logits`).
"""
# 使用 LayoutLM 模型，其顶部有一个用于抽取式问答任务的跨度分类头部，例如 [DocVQA](https://rrc.cvc.uab.es/?ch=17)。
# 这个头部是在最终隐藏状态输出之上的线性层，用于计算“跨度起始 logits” 和 “跨度终止 logits”。

class TFLayoutLMForQuestionAnswering(TFLayoutLMPreTrainedModel, TFQuestionAnsweringLoss):
    """
    LayoutLM 用于问答的 TensorFlow 模型，继承自 TFLayoutLMPreTrainedModel 和 TFQuestionAnsweringLoss。
    """
    
    # names with a '.' represents the authorized unexpected/missing layers when a TF model is loaded from a PT model
    _keys_to_ignore_on_load_unexpected = [
        r"pooler",
        r"mlm___cls",
        r"nsp___cls",
        r"cls.predictions",
        r"cls.seq_relationship",
    ]
    """
    在从 PyTorch 模型加载 TF 模型时，带有 '.' 的名称表示授权的意外/丢失的层。
    """

    def __init__(self, config: LayoutLMConfig, *inputs, **kwargs):
        """
        初始化 LayoutLMForQuestionAnswering 模型。

        Args:
            config (LayoutLMConfig): LayoutLM 模型的配置对象。
            *inputs: 可变数量的输入。
            **kwargs: 其他关键字参数。
        """
        super().__init__(config, *inputs, **kwargs)
        self.num_labels = config.num_labels

        # 初始化 LayoutLM 主层
        self.layoutlm = TFLayoutLMMainLayer(config, add_pooling_layer=True, name="layoutlm")
        
        # 初始化用于问答输出的全连接层
        self.qa_outputs = keras.layers.Dense(
            units=config.num_labels,
            kernel_initializer=get_initializer(config.initializer_range),
            name="qa_outputs",
        )
        self.config = config

    @unpack_inputs
    @add_start_docstrings_to_model_forward(LAYOUTLM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @replace_return_docstrings(output_type=TFQuestionAnsweringModelOutput, config_class=_CONFIG_FOR_DOC)
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        bbox: np.ndarray | tf.Tensor | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        position_ids: np.ndarray | tf.Tensor | None = None,
        head_mask: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        start_positions: np.ndarray | tf.Tensor | None = None,
        end_positions: np.ndarray | tf.Tensor | None = None,
        training: Optional[bool] = False,
    ):
        """
        LayoutLM 问答模型的前向传播方法。

        Args:
            input_ids (TFModelInputType, optional): 输入的 token IDs.
            bbox (np.ndarray or tf.Tensor, optional): 边界框信息.
            attention_mask (np.ndarray or tf.Tensor, optional): 注意力掩码.
            token_type_ids (np.ndarray or tf.Tensor, optional): token 类型 IDs.
            position_ids (np.ndarray or tf.Tensor, optional): 位置 IDs.
            head_mask (np.ndarray or tf.Tensor, optional): 头部掩码.
            inputs_embeds (np.ndarray or tf.Tensor, optional): 嵌入的输入.
            output_attentions (bool, optional): 是否输出注意力权重.
            output_hidden_states (bool, optional): 是否输出隐藏状态.
            return_dict (bool, optional): 是否返回字典类型的输出.
            start_positions (np.ndarray or tf.Tensor, optional): 起始位置.
            end_positions (np.ndarray or tf.Tensor, optional): 结束位置.
            training (bool, optional): 是否处于训练模式.

        Returns:
            TFQuestionAnsweringModelOutput: LayoutLM 问答模型的输出对象。
        """
        # 省略了具体的前向传播逻辑，用文档字符串和装饰器指定了输入输出的详细描述
        pass

    def build(self, input_shape=None):
        """
        构建模型。

        Args:
            input_shape: 输入的形状信息，可选。

        Notes:
            如果已经构建过，则直接返回。
            构建 LayoutLM 和 qa_outputs 层。
        """
        if self.built:
            return
        self.built = True
        if getattr(self, "layoutlm", None) is not None:
            with tf.name_scope(self.layoutlm.name):
                self.layoutlm.build(None)
        if getattr(self, "qa_outputs", None) is not None:
            with tf.name_scope(self.qa_outputs.name):
                self.qa_outputs.build([None, None, self.config.hidden_size])

`.\models\layoutlm\tokenization_layoutlm.py`

# coding=utf-8
# Copyright 2018 The Microsoft Research Asia LayoutLM Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Tokenization class for model LayoutLM."""

import collections  # 导入 collections 模块，用于处理数据集合
import os  # 导入 os 模块，用于处理操作系统相关功能
import unicodedata  # 导入 unicodedata 模块，用于 Unicode 字符数据的处理
from typing import List, Optional, Tuple  # 导入类型提示相关的功能

from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace  # 导入其他模块中的相关功能
from ...utils import logging  # 导入日志记录工具

logger = logging.get_logger(__name__)  # Logger for this module

VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}  # File name used for the vocabulary file

PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "microsoft/layoutlm-base-uncased": (
            "https://huggingface.co/microsoft/layoutlm-base-uncased/resolve/main/vocab.txt"
        ),
        "microsoft/layoutlm-large-uncased": (
            "https://huggingface.co/microsoft/layoutlm-large-uncased/resolve/main/vocab.txt"
        ),
    }
}  # Maps each pretrained checkpoint name to the URL of its vocab.txt

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "microsoft/layoutlm-base-uncased": 512,
    "microsoft/layoutlm-large-uncased": 512,
}  # Per-checkpoint sizes; assigned to `max_model_input_sizes` on the tokenizer class

PRETRAINED_INIT_CONFIGURATION = {
    "microsoft/layoutlm-base-uncased": {"do_lower_case": True},
    "microsoft/layoutlm-large-uncased": {"do_lower_case": True},
}  # Per-checkpoint init configuration; assigned to `pretrained_init_configuration` on the tokenizer class

# Copied from transformers.models.bert.tokenization_bert.load_vocab
def load_vocab(vocab_file):
    """Read a vocabulary file (one token per line) into an ordered token -> line-index mapping."""
    with open(vocab_file, "r", encoding="utf-8") as reader:
        # Only the trailing newline is stripped, so other whitespace stays part of the token.
        return collections.OrderedDict((line.rstrip("\n"), position) for position, line in enumerate(reader))

# Copied from transformers.models.bert.tokenization_bert.whitespace_tokenize
def whitespace_tokenize(text):
    """Strip surrounding whitespace from `text` and split it on runs of whitespace."""
    stripped = text.strip()
    # Empty (or whitespace-only) input yields no tokens.
    return stripped.split() if stripped else []

# Copied from transformers.models.bert.tokenization_bert.BertTokenizer with Bert->LayoutLM,BERT->LayoutLM
class LayoutLMTokenizer(PreTrainedTokenizer):
    r"""
    Construct a LayoutLM tokenizer. Based on WordPiece.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            File containing the vocabulary.
        do_lower_case (`bool`, *optional*, defaults to `True`):
            Whether or not to lowercase the input when tokenizing.
        do_basic_tokenize (`bool`, *optional*, defaults to `True`):
            Whether or not to do basic tokenization before WordPiece.
        never_split (`Iterable`, *optional*):
            Collection of tokens which will never be split during tokenization. Only has an effect when
            `do_basic_tokenize=True`
        unk_token (`str`, *optional*, defaults to `"[UNK]"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
            sequence classification or for a text and a question for question answering. It is also used as the last
            token of a sequence built with special tokens.
        pad_token (`str`, *optional*, defaults to `"[PAD]"`):
            The token used for padding, for example when batching sequences of different lengths.
        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
            The classifier token which is used when doing sequence classification (classification of the whole sequence
            instead of per-token classification). It is the first token of the sequence when built with special tokens.
        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
            The token used for masking values. This is the token used when training this model with masked language
            modeling. This is the token which the model will try to predict.
        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
            Whether or not to tokenize Chinese characters.

            This should likely be deactivated for Japanese (see this
            [issue](https://github.com/huggingface/transformers/issues/328)).
        strip_accents (`bool`, *optional*):
            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
            value for `lowercase` (as in the original LayoutLM).
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES

    def __init__(
        self,
        vocab_file,
        do_lower_case=True,
        do_basic_tokenize=True,
        never_split=None,
        unk_token="[UNK]",
        sep_token="[SEP]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        mask_token="[MASK]",
        tokenize_chinese_chars=True,
        strip_accents=None,
        **kwargs,
    ):
        # Fail early with an explicit message when the vocabulary path is not an existing file.
        if not os.path.isfile(vocab_file):
            raise ValueError(
                f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
                " model use `tokenizer = LayoutLMTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
            )
        # token -> id mapping, plus the reverse id -> token mapping.
        self.vocab = load_vocab(vocab_file)
        self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
        self.do_basic_tokenize = do_basic_tokenize
        # The basic tokenizer only exists when basic tokenization is enabled.
        if do_basic_tokenize:
            self.basic_tokenizer = BasicTokenizer(
                do_lower_case=do_lower_case,
                never_split=never_split,
                tokenize_chinese_chars=tokenize_chinese_chars,
                strip_accents=strip_accents,
            )

        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))

        # The vocabulary and sub-tokenizers are set up before the parent initializer runs.
        super().__init__(
            do_lower_case=do_lower_case,
            do_basic_tokenize=do_basic_tokenize,
            never_split=never_split,
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            tokenize_chinese_chars=tokenize_chinese_chars,
            strip_accents=strip_accents,
            **kwargs,
        )

    @property
    def do_lower_case(self):
        """The `do_lower_case` flag of the basic tokenizer."""
        return self.basic_tokenizer.do_lower_case

    @property
    def vocab_size(self):
        """Number of entries in the base vocabulary (added tokens are not counted)."""
        return len(self.vocab)

    def get_vocab(self):
        """Return the base vocabulary merged with the added-tokens encoder as a plain dict."""
        return dict(self.vocab, **self.added_tokens_encoder)

    def _tokenize(self, text, split_special_tokens=False):
        """Split `text` into WordPiece tokens, optionally running basic tokenization first."""
        split_tokens = []
        if self.do_basic_tokenize:
            # Special tokens are protected from splitting unless `split_special_tokens` is set.
            for token in self.basic_tokenizer.tokenize(
                text, never_split=self.all_special_tokens if not split_special_tokens else None
            ):
                # Tokens in the basic tokenizer's never-split set are kept whole.
                if token in self.basic_tokenizer.never_split:
                    split_tokens.append(token)
                else:
                    split_tokens += self.wordpiece_tokenizer.tokenize(token)
        else:
            split_tokens = self.wordpiece_tokenizer.tokenize(text)
        return split_tokens

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        # Unknown tokens fall back to the id of `unk_token`.
        return self.vocab.get(token, self.vocab.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        # Unknown ids fall back to `unk_token`.
        return self.ids_to_tokens.get(index, self.unk_token)

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) in a single string."""
        # Joining with spaces and then dropping " ##" glues continuation pieces back onto their word.
        out_string = " ".join(tokens).replace(" ##", "").strip()
        return out_string

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences by concatenating and adding special tokens. A
        LayoutLM sequence has the following format:

        - single sequence: `[CLS] X [SEP]`
        - pair of sequences: `[CLS] A [SEP] B [SEP]`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of input IDs with the special tokens added.
        """
        cls = [self.cls_token_id]
        sep = [self.sep_token_id]
        if token_ids_1 is None:
            return cls + token_ids_0 + sep
        return cls + token_ids_0 + sep + token_ids_1 + sep

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve a mask marking the special-token positions of the sequence(s) that
        `build_inputs_with_special_tokens` would produce.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list already contains special tokens; in that case the parent class
                implementation is used.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        # Layout mirrors `[CLS] A [SEP]` and `[CLS] A [SEP] B [SEP]`.
        if token_ids_1 is not None:
            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
        return [1] + ([0] * len(token_ids_0)) + [1]

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A LayoutLM sequence
        pair mask has the following format:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (`List[int]`):
                List of IDs for the first sequence.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of token type IDs according to the given sequence(s).
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]

        # Single sequence: zeros covering `[CLS] A [SEP]`.
        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]

        # Pair: zeros covering `[CLS] A [SEP]`, then ones covering `B [SEP]`.
        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Write the vocabulary, one token per line in index order, to a file.

        Args:
            save_directory (`str`):
                An existing directory to write `vocab.txt` into; any other value is used as the output file path
                itself.
            filename_prefix (`str`, *optional*):
                Optional prefix (followed by `-`) prepended to the file name.

        Returns:
            `Tuple[str]`: A one-element tuple with the path of the written vocabulary file.
        """
        index = 0

        if os.path.isdir(save_directory):
            vocab_file = os.path.join(
                save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
            )
        else:
            vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory

        with open(vocab_file, "w", encoding="utf-8") as writer:
            for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
                # Warn on gaps in the index sequence and resynchronise the running counter.
                if index != token_index:
                    logger.warning(
                        f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
                        " Please check that the vocabulary is not corrupted!"
                    )
                    index = token_index
                writer.write(token + "\n")
                index += 1

        return (vocab_file,)
# Copied from transformers.models.bert.tokenization_bert.BasicTokenizer
# 从transformers.models.bert.tokenization_bert.BasicTokenizer复制而来

class BasicTokenizer(object):
    """
    Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.).

    Args:
        do_lower_case (`bool`, *optional*, defaults to `True`):
            Whether or not to lowercase the input when tokenizing.
        never_split (`Iterable`, *optional*):
            Collection of tokens which will never be split during tokenization. Only has an effect when
            `do_basic_tokenize=True`
        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
            Whether or not to tokenize Chinese characters. This should likely be deactivated for Japanese.
        strip_accents (`bool`, *optional*):
            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
            value for `lowercase` (as in the original BERT).
        do_split_on_punc (`bool`, *optional*, defaults to `True`):
            In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
            the full context of the words, such as contractions.
    """

    def __init__(
        self,
        do_lower_case=True,
        never_split=None,
        tokenize_chinese_chars=True,
        strip_accents=None,
        do_split_on_punc=True,
    ):
        if never_split is None:
            never_split = []
        self.do_lower_case = do_lower_case
        # Stored as a set so membership tests during tokenization are cheap.
        self.never_split = set(never_split)
        self.tokenize_chinese_chars = tokenize_chinese_chars
        self.strip_accents = strip_accents
        self.do_split_on_punc = do_split_on_punc

    def tokenize(self, text, never_split=None):
        """
        Basic tokenization of a piece of text. For sub-word tokenization, see WordpieceTokenizer.

        Args:
            text (`str`):
                The text to tokenize.
            never_split (`Iterable`, *optional*):
                Extra tokens that must not be split, merged with the set given at construction time.

        Returns:
            `List[str]`: The whitespace/punctuation-level tokens.
        """
        # `union` builds a new set, so the instance-level collection is never mutated by a call.
        never_split = self.never_split.union(set(never_split)) if never_split else self.never_split
        text = self._clean_text(text)

        # Surround CJK ideographs with spaces so that each one becomes its own token.
        if self.tokenize_chinese_chars:
            text = self._tokenize_chinese_chars(text)

        # NFC keeps the same character from appearing under several different code point sequences.
        unicode_normalized_text = unicodedata.normalize("NFC", text)
        orig_tokens = whitespace_tokenize(unicode_normalized_text)

        split_tokens = []
        for token in orig_tokens:
            if token not in never_split:
                if self.do_lower_case:
                    token = token.lower()
                    # When lowercasing, accents are stripped unless explicitly disabled (`False`).
                    if self.strip_accents is not False:
                        token = self._run_strip_accents(token)
                elif self.strip_accents:
                    token = self._run_strip_accents(token)
            split_tokens.extend(self._run_split_on_punc(token, never_split))

        # Re-split on whitespace: the steps above may have produced empty or space-containing pieces.
        output_tokens = whitespace_tokenize(" ".join(split_tokens))
        return output_tokens

    def _run_strip_accents(self, text):
        """Strip accents from `text` by dropping combining marks after NFD decomposition.

        Args:
            text (str): The text to process.

        Returns:
            str: The text without characters of Unicode category `Mn`.
        """
        decomposed = unicodedata.normalize("NFD", text)
        return "".join(char for char in decomposed if unicodedata.category(char) != "Mn")

    def _run_split_on_punc(self, text, never_split=None):
        """Split `text` on punctuation, keeping each punctuation character as its own piece.

        Args:
            text (str): The text to split.
            never_split (set, *optional*): Tokens that must be returned unsplit.

        Returns:
            list: The pieces of `text`. `[text]` is returned untouched when punctuation splitting is disabled
            or when `text` is listed in `never_split`.
        """
        if not self.do_split_on_punc or (never_split is not None and text in never_split):
            return [text]

        start_new_word = True
        output = []
        for char in text:
            if _is_punctuation(char):
                # Punctuation is always a piece of its own and ends the current word.
                output.append([char])
                start_new_word = True
            else:
                if start_new_word:
                    output.append([])
                start_new_word = False
                output[-1].append(char)

        return ["".join(piece) for piece in output]

    def _tokenize_chinese_chars(self, text):
        """Add a space on both sides of every CJK character.

        Args:
            text (str): The text to process.

        Returns:
            str: The text with each CJK character surrounded by spaces.
        """
        output = []
        for char in text:
            if self._is_chinese_char(ord(char)):
                output.append(" ")
                output.append(char)
                output.append(" ")
            else:
                output.append(char)
        return "".join(output)

    def _is_chinese_char(self, cp):
        """Check whether `cp` is the code point of a CJK character.

        Args:
            cp (int): A Unicode code point.

        Returns:
            bool: `True` if the code point falls in one of the CJK ideograph ranges below.
        """
        # "CJK character" here means anything in the CJK Unified Ideographs Unicode blocks:
        #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
        #
        # Note that the CJK blocks do NOT contain all Japanese and Korean characters. Modern Korean Hangul,
        # as well as Japanese Hiragana and Katakana, live in different blocks; those scripts write
        # space-separated words, so they are handled like every other language.
        return (
            (cp >= 0x4E00 and cp <= 0x9FFF)
            or (cp >= 0x3400 and cp <= 0x4DBF)
            or (cp >= 0x20000 and cp <= 0x2A6DF)
            or (cp >= 0x2A700 and cp <= 0x2B73F)
            or (cp >= 0x2B740 and cp <= 0x2B81F)
            or (cp >= 0x2B820 and cp <= 0x2CEAF)
            or (cp >= 0xF900 and cp <= 0xFAFF)
            or (cp >= 0x2F800 and cp <= 0x2FA1F)
        )

    def _clean_text(self, text):
        """Remove invalid characters from `text` and normalize its whitespace.

        Args:
            text (str): The text to clean.

        Returns:
            str: The text without NUL, U+FFFD and control characters, with every whitespace character
            replaced by a single space.
        """
        output = []
        for char in text:
            cp = ord(char)
            if cp == 0 or cp == 0xFFFD or _is_control(char):
                continue
            if _is_whitespace(char):
                output.append(" ")
            else:
                output.append(char)
        return "".join(output)
# Copied from transformers.models.bert.tokenization_bert.WordpieceTokenizer
class WordpieceTokenizer(object):
    """Runs WordPiece tokenization."""

    def __init__(self, vocab, unk_token, max_input_chars_per_word=100):
        # Vocabulary to match against, replacement for unmatchable words, and the per-word length cap.
        self.max_input_chars_per_word = max_input_chars_per_word
        self.unk_token = unk_token
        self.vocab = vocab

    def tokenize(self, text):
        """
        Split text into word pieces with a greedy longest-match-first search over the vocabulary.

        For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`.

        Args:
            text: A single token or whitespace separated tokens. This should have
                already been passed through *BasicTokenizer*.

        Returns:
            A list of wordpiece tokens. A word that is longer than `max_input_chars_per_word`, or that cannot
            be fully covered by vocabulary entries, is replaced by the unknown token.
        """
        result = []
        for word in whitespace_tokenize(text):
            length = len(word)
            if length > self.max_input_chars_per_word:
                result.append(self.unk_token)
                continue

            pieces = []
            begin = 0
            while begin < length:
                matched_stop = None
                # Try the longest remaining candidate first, shrinking from the right.
                for stop in range(length, begin, -1):
                    candidate = word[begin:stop]
                    if begin > 0:
                        # Every piece after the first carries the continuation marker.
                        candidate = "##" + candidate
                    if candidate in self.vocab:
                        pieces.append(candidate)
                        matched_stop = stop
                        break
                if matched_stop is None:
                    # Nothing matched at this position: the whole word becomes unknown.
                    result.append(self.unk_token)
                    break
                begin = matched_stop
            else:
                result.extend(pieces)
        return result

`.\models\layoutlm\tokenization_layoutlm_fast.py`

# coding=utf-8
# 设置脚本编码为 UTF-8

# Copyright 2018 The Microsoft Research Asia LayoutLM Team Authors.
# 版权声明，指出此代码的版权信息

# Licensed under the Apache License, Version 2.0 (the "License");
# 使用 Apache License, Version 2.0 授权许可

# you may not use this file except in compliance with the License.
# 除非遵循本许可证，否则不能使用本文件

# You may obtain a copy of the License at
# 可以在以下网址获取许可证副本

#     http://www.apache.org/licenses/LICENSE-2.0
# http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# 根据适用法律规定或书面同意的情况下，软件

# distributed under the License is distributed on an "AS IS" BASIS,
# 分发时遵循"原样"分发

# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# 没有明示或暗示的任何保证或条件

# See the License for the specific language governing permissions and
# 详见许可证，获取特定语言的权限以及

# limitations under the License.
# 许可下的限制

""" Tokenization class for model LayoutLM."""

# LayoutLM 模型的分词器类

import json
# 导入 json 模块，用于处理 JSON 格式数据
from typing import List, Optional, Tuple
# 导入 typing 模块，用于类型提示

from tokenizers import normalizers
# 从 tokenizers 库中导入 normalizers 模块

from ...tokenization_utils_fast import PreTrainedTokenizerFast
# 从 ...tokenization_utils_fast 中导入 PreTrainedTokenizerFast 类
from ...utils import logging
# 从 ...utils 中导入 logging 模块
from .tokenization_layoutlm import LayoutLMTokenizer
# 从当前目录下的 tokenization_layoutlm 模块中导入 LayoutLMTokenizer 类


logger = logging.get_logger(__name__)
# Module-level logger named after this module

VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"}
# File names used for the vocabulary file and the serialized tokenizer file

PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "microsoft/layoutlm-base-uncased": (
            "https://huggingface.co/microsoft/layoutlm-base-uncased/resolve/main/vocab.txt"
        ),
        "microsoft/layoutlm-large-uncased": (
            "https://huggingface.co/microsoft/layoutlm-large-uncased/resolve/main/vocab.txt"
        ),
    },
    "tokenizer_file": {
        "microsoft/layoutlm-base-uncased": (
            "https://huggingface.co/microsoft/layoutlm-base-uncased/resolve/main/tokenizer.json"
        ),
        "microsoft/layoutlm-large-uncased": (
            "https://huggingface.co/microsoft/layoutlm-large-uncased/resolve/main/tokenizer.json"
        ),
    },
}
# Download URLs of the vocabulary and tokenizer files, keyed by file kind and then by checkpoint name

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "microsoft/layoutlm-base-uncased": 512,
    "microsoft/layoutlm-large-uncased": 512,
}
# Positional embedding size of each pretrained checkpoint

PRETRAINED_INIT_CONFIGURATION = {
    "microsoft/layoutlm-base-uncased": {"do_lower_case": True},
    "microsoft/layoutlm-large-uncased": {"do_lower_case": True},
}
# Tokenizer init arguments (e.g. lowercasing) of each pretrained checkpoint


# Copied from transformers.models.bert.tokenization_bert_fast.BertTokenizerFast with Bert->LayoutLM,BERT->LayoutLM
# 从 transformers.models.bert.tokenization_bert_fast.BertTokenizerFast 复制并修改为 LayoutLMTokenizerFast

class LayoutLMTokenizerFast(PreTrainedTokenizerFast):
    r"""
    Construct a "fast" LayoutLM tokenizer (backed by HuggingFace's *tokenizers* library). Based on WordPiece.

    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`, *optional*):
            Path to the vocabulary file.
        tokenizer_file (`str`, *optional*):
            Path to the serialized tokenizer file.
        do_lower_case (`bool`, *optional*, defaults to `True`):
            Whether or not to lowercase the input when tokenizing.
        unk_token (`str`, *optional*, defaults to `"[UNK]"`):
            The unknown token.
        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
            The separator token, used when building a sequence from multiple sequences.
        pad_token (`str`, *optional*, defaults to `"[PAD]"`):
            The token used for padding, for example when batching sequences of different lengths.
        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
            The classifier token; it is the first token of a sequence built with special tokens.
        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
            The token used for masking values in masked language modeling.
        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
            Whether or not to tokenize Chinese characters.
        strip_accents (`bool`, *optional*):
            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
            value for `lowercase`.
    """

    # Class-level constants describing the files and pretrained checkpoints of this tokenizer.
    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    slow_tokenizer_class = LayoutLMTokenizer

    def __init__(
        self,
        vocab_file=None,
        tokenizer_file=None,
        do_lower_case=True,
        unk_token="[UNK]",
        sep_token="[SEP]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        mask_token="[MASK]",
        tokenize_chinese_chars=True,
        strip_accents=None,
        **kwargs,
    ):
        super().__init__(
            vocab_file,
            tokenizer_file=tokenizer_file,
            do_lower_case=do_lower_case,
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            tokenize_chinese_chars=tokenize_chinese_chars,
            strip_accents=strip_accents,
            **kwargs,
        )

        # The backend normalizer serializes its state as JSON; decode it to compare against the arguments.
        normalizer_state = json.loads(self.backend_tokenizer.normalizer.__getstate__())
        if (
            normalizer_state.get("lowercase", do_lower_case) != do_lower_case
            or normalizer_state.get("strip_accents", strip_accents) != strip_accents
            or normalizer_state.get("handle_chinese_chars", tokenize_chinese_chars) != tokenize_chinese_chars
        ):
            # The loaded normalizer disagrees with the requested options: rebuild it with the same type
            # but with the options passed to this constructor.
            normalizer_class = getattr(normalizers, normalizer_state.pop("type"))
            normalizer_state["lowercase"] = do_lower_case
            normalizer_state["strip_accents"] = strip_accents
            normalizer_state["handle_chinese_chars"] = tokenize_chinese_chars
            self.backend_tokenizer.normalizer = normalizer_class(**normalizer_state)

        self.do_lower_case = do_lower_case

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. A LayoutLM sequence has the following format:

        - single sequence: `[CLS] X [SEP]`
        - pair of sequences: `[CLS] A [SEP] B [SEP]`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
        output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]

        # A second sequence, when given, is appended and closed with its own separator.
        if token_ids_1 is not None:
            output += token_ids_1 + [self.sep_token_id]

        return output

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A LayoutLM sequence
        pair mask has the following format:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (`List[int]`):
                List of IDs for the first sequence.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of token type IDs according to the given sequence(s).
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]

        # Single sequence: zeros covering `[CLS] X [SEP]`.
        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]

        # Pair: zeros for `[CLS] A [SEP]`, then ones for `B [SEP]`.
        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Save the vocabulary of the tokenizer model to the specified directory.

        Args:
            save_directory (str):
                Directory path where the vocabulary will be saved.
            filename_prefix (str, *optional*):
                Optional prefix for the saved files.

        Returns:
            Tuple[str]: Tuple containing the filenames saved.
        """
        # Saving is delegated to the backend tokenizer model, which returns the written file names.
        files = self._tokenizer.model.save(save_directory, name=filename_prefix)
        return tuple(files)

`.\models\layoutlm\__init__.py`

# 导入必要的模块和函数
from typing import TYPE_CHECKING

# 从工具模块中导入所需的类和函数
from ...utils import (
    OptionalDependencyNotAvailable,
    _LazyModule,
    is_tf_available,
    is_tokenizers_available,
    is_torch_available,
)

# Map each submodule name to the public names it exports. This dict is passed to `_LazyModule` at the end
# of the file.
_import_structure = {
    "configuration_layoutlm": ["LAYOUTLM_PRETRAINED_CONFIG_ARCHIVE_MAP", "LayoutLMConfig", "LayoutLMOnnxConfig"],
    "tokenization_layoutlm": ["LayoutLMTokenizer"],
}

# Register the fast tokenizer only when the `tokenizers` library is available. The exception raised in the
# `try` is caught immediately below and merely skips the `else` branch.
try:
    if not is_tokenizers_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # `tokenizers` is available: expose the fast tokenizer
    _import_structure["tokenization_layoutlm_fast"] = ["LayoutLMTokenizerFast"]

# Register the PyTorch models only when `torch` is available (same skip-on-missing pattern as above)
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # `torch` is available: expose the PyTorch model classes
    _import_structure["modeling_layoutlm"] = [
        "LAYOUTLM_PRETRAINED_MODEL_ARCHIVE_LIST",
        "LayoutLMForMaskedLM",
        "LayoutLMForSequenceClassification",
        "LayoutLMForTokenClassification",
        "LayoutLMForQuestionAnswering",
        "LayoutLMModel",
        "LayoutLMPreTrainedModel",
    ]

# Register the TensorFlow models only when TensorFlow is available (same skip-on-missing pattern as above)
try:
    if not is_tf_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # TensorFlow is available: expose the TensorFlow model classes
    _import_structure["modeling_tf_layoutlm"] = [
        "TF_LAYOUTLM_PRETRAINED_MODEL_ARCHIVE_LIST",
        "TFLayoutLMForMaskedLM",
        "TFLayoutLMForSequenceClassification",
        "TFLayoutLMForTokenClassification",
        "TFLayoutLMForQuestionAnswering",
        "TFLayoutLMMainLayer",
        "TFLayoutLMModel",
        "TFLayoutLMPreTrainedModel",
    ]

# Under static type checking the real imports are performed so that checkers see the concrete names.
# At runtime the `else` branch at the bottom replaces this module with a lazy proxy instead.
if TYPE_CHECKING:
    from .configuration_layoutlm import LAYOUTLM_PRETRAINED_CONFIG_ARCHIVE_MAP, LayoutLMConfig, LayoutLMOnnxConfig
    from .tokenization_layoutlm import LayoutLMTokenizer

    # Fast tokenizer: imported only when the `tokenizers` library is available
    try:
        if not is_tokenizers_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .tokenization_layoutlm_fast import LayoutLMTokenizerFast

    # PyTorch models: imported only when `torch` is available
    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .modeling_layoutlm import (
            LAYOUTLM_PRETRAINED_MODEL_ARCHIVE_LIST,  # list of pretrained model archives
            LayoutLMForMaskedLM,  # LayoutLM model for masked language modeling
            LayoutLMForQuestionAnswering,  # LayoutLM model for question answering
            LayoutLMForSequenceClassification,  # LayoutLM model for sequence classification
            LayoutLMForTokenClassification,  # LayoutLM model for token classification
            LayoutLMModel,  # base LayoutLM model
            LayoutLMPreTrainedModel,  # base class of the pretrained LayoutLM models
        )

    # TensorFlow models: imported only when TensorFlow is available
    try:
        if not is_tf_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .modeling_tf_layoutlm import (
            TF_LAYOUTLM_PRETRAINED_MODEL_ARCHIVE_LIST,  # list of pretrained TensorFlow model archives
            TFLayoutLMForMaskedLM,  # TensorFlow LayoutLM model for masked language modeling
            TFLayoutLMForQuestionAnswering,  # TensorFlow LayoutLM model for question answering
            TFLayoutLMForSequenceClassification,  # TensorFlow LayoutLM model for sequence classification
            TFLayoutLMForTokenClassification,  # TensorFlow LayoutLM model for token classification
            TFLayoutLMMainLayer,  # TensorFlow LayoutLM main layer
            TFLayoutLMModel,  # base TensorFlow LayoutLM model
            TFLayoutLMPreTrainedModel,  # base class of the pretrained TensorFlow LayoutLM models
        )
else:
    # `sys` is needed only here, to swap this module's entry in `sys.modules`; without this import the
    # assignment below fails with a NameError as soon as the package is imported.
    import sys

    # Replace this module with a `_LazyModule` built from `_import_structure`.
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)

`.\models\layoutlmv2\configuration_layoutlmv2.py`

# coding=utf-8
# Copyright Microsoft Research and The HuggingFace Inc. team. All rights reserved.
# 标明代码文件使用UTF-8编码，版权信息声明
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Apache License, Version 2.0授权声明，许可信息的获取网址链接
""" LayoutLMv2 model configuration"""
# LayoutLMv2模型配置

from ...configuration_utils import PretrainedConfig
# 从transformers包中导入PretrainedConfig类
from ...utils import is_detectron2_available, logging
# 从transformers包中导入is_detectron2_available函数和logging模块

logger = logging.get_logger(__name__)
# Module-level logger named after this module

LAYOUTLMV2_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "layoutlmv2-base-uncased": "https://huggingface.co/microsoft/layoutlmv2-base-uncased/resolve/main/config.json",
    "layoutlmv2-large-uncased": "https://huggingface.co/microsoft/layoutlmv2-large-uncased/resolve/main/config.json",
    # See all LayoutLMv2 models at https://huggingface.co/models?filter=layoutlmv2
}
# Maps each pretrained LayoutLMv2 model name to the URL of its configuration file

# soft dependency
if is_detectron2_available():
    import detectron2
    # `detectron2` is bound only when it is available; code that references it otherwise raises NameError

class LayoutLMv2Config(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`LayoutLMv2Model`]. It is used to instantiate an
    LayoutLMv2 model according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the LayoutLMv2
    [microsoft/layoutlmv2-base-uncased](https://huggingface.co/microsoft/layoutlmv2-base-uncased) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Example:

    ```
    >>> from transformers import LayoutLMv2Config, LayoutLMv2Model

    >>> # Initializing a LayoutLMv2 microsoft/layoutlmv2-base-uncased style configuration
    >>> configuration = LayoutLMv2Config()

    >>> # Initializing a model (with random weights) from the microsoft/layoutlmv2-base-uncased style configuration
    >>> model = LayoutLMv2Model(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "layoutlmv2"

    def __init__(
        self,
        vocab_size=30522,  # vocabulary size
        hidden_size=768,  # hidden layer size
        num_hidden_layers=12,  # number of hidden layers
        num_attention_heads=12,  # number of attention heads
        intermediate_size=3072,  # intermediate layer size
        hidden_act="gelu",  # hidden layer activation function
        hidden_dropout_prob=0.1,  # dropout probability of the hidden layers
        attention_probs_dropout_prob=0.1,  # dropout probability of the attention probabilities
        max_position_embeddings=512,  # maximum number of position embeddings
        type_vocab_size=2,  # token type vocabulary size
        initializer_range=0.02,  # initializer range
        layer_norm_eps=1e-12,  # layer normalization epsilon
        pad_token_id=0,  # id of the padding token
        max_2d_position_embeddings=1024,  # maximum number of 2D position embeddings
        max_rel_pos=128,  # maximum relative position
        rel_pos_bins=32,  # number of relative position bins
        fast_qkv=True,  # whether to use the fast QKV computation
        max_rel_2d_pos=256,  # maximum 2D relative position
        rel_2d_pos_bins=64,  # number of 2D relative position bins
        convert_sync_batchnorm=True,  # whether to convert to synchronized batch normalization
        image_feature_pool_shape=[7, 7, 256],  # shape of the image feature pool
        coordinate_size=128,  # coordinate size
        shape_size=128,  # shape size
        has_relative_attention_bias=True,  # whether there is a relative attention bias
        has_spatial_attention_bias=True,  # whether there is a spatial attention bias
        has_visual_segment_embedding=False,  # whether there is a visual segment embedding
        detectron2_config_args=None,  # detectron2 config overrides; defaults are used when None
        **kwargs,
    ):
        super().__init__(
            vocab_size=vocab_size,
            hidden_size=hidden_size,
            num_hidden_layers=num_hidden_layers,
            num_attention_heads=num_attention_heads,
            intermediate_size=intermediate_size,
            hidden_act=hidden_act,
            hidden_dropout_prob=hidden_dropout_prob,
            attention_probs_dropout_prob=attention_probs_dropout_prob,
            max_position_embeddings=max_position_embeddings,
            type_vocab_size=type_vocab_size,
            initializer_range=initializer_range,
            layer_norm_eps=layer_norm_eps,
            pad_token_id=pad_token_id,
            **kwargs,
        )
        # LayoutLMv2-specific attributes
        self.max_2d_position_embeddings = max_2d_position_embeddings
        self.max_rel_pos = max_rel_pos
        self.rel_pos_bins = rel_pos_bins
        self.fast_qkv = fast_qkv
        self.max_rel_2d_pos = max_rel_2d_pos
        self.rel_2d_pos_bins = rel_2d_pos_bins
        self.convert_sync_batchnorm = convert_sync_batchnorm
        # Copy the sequence: the signature default is a single list object created once, so storing it
        # directly would make every default-constructed config share (and be able to mutate) the same list.
        self.image_feature_pool_shape = list(image_feature_pool_shape)
        self.coordinate_size = coordinate_size
        self.shape_size = shape_size
        self.has_relative_attention_bias = has_relative_attention_bias
        self.has_spatial_attention_bias = has_spatial_attention_bias
        self.has_visual_segment_embedding = has_visual_segment_embedding
        # Fall back to the built-in detectron2 defaults when no overrides were given.
        self.detectron2_config_args = (
            detectron2_config_args if detectron2_config_args is not None else self.get_default_detectron2_config()
        )

    @classmethod
    def get_default_detectron2_config(cls):
        """Return a fresh dict of default detectron2 settings, keyed by dotted config path."""
        return {
            "MODEL.MASK_ON": True,
            "MODEL.PIXEL_STD": [57.375, 57.120, 58.395],
            "MODEL.BACKBONE.NAME": "build_resnet_fpn_backbone",
            "MODEL.FPN.IN_FEATURES": ["res2", "res3", "res4", "res5"],
            "MODEL.ANCHOR_GENERATOR.SIZES": [[32], [64], [128], [256], [512]],
            "MODEL.RPN.IN_FEATURES": ["p2", "p3", "p4", "p5", "p6"],
            "MODEL.RPN.PRE_NMS_TOPK_TRAIN": 2000,
            "MODEL.RPN.PRE_NMS_TOPK_TEST": 1000,
            "MODEL.RPN.POST_NMS_TOPK_TRAIN": 1000,
            # NOTE(review): unlike the neighbouring keys this one has no `RPN` segment - confirm it is intended.
            "MODEL.POST_NMS_TOPK_TEST": 1000,
            "MODEL.ROI_HEADS.NAME": "StandardROIHeads",
            "MODEL.ROI_HEADS.NUM_CLASSES": 5,
            "MODEL.ROI_HEADS.IN_FEATURES": ["p2", "p3", "p4", "p5"],
            "MODEL.ROI_BOX_HEAD.NAME": "FastRCNNConvFCHead",
            "MODEL.ROI_BOX_HEAD.NUM_FC": 2,
            "MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION": 14,
            "MODEL.ROI_MASK_HEAD.NAME": "MaskRCNNConvUpsampleHead",
            "MODEL.ROI_MASK_HEAD.NUM_CONV": 4,
            "MODEL.ROI_MASK_HEAD.POOLER_RESOLUTION": 7,
            "MODEL.RESNETS.DEPTH": 101,
            "MODEL.RESNETS.SIZES": [[32], [64], [128], [256], [512]],
            "MODEL.RESNETS.ASPECT_RATIOS": [[0.5, 1.0, 2.0]],
            "MODEL.RESNETS.OUT_FEATURES": ["res2", "res3", "res4", "res5"],
            "MODEL.RESNETS.NUM_GROUPS": 32,
            "MODEL.RESNETS.WIDTH_PER_GROUP": 8,
            "MODEL.RESNETS.STRIDE_IN_1X1": False,
        }

    def get_detectron2_config(self):
        """Build a detectron2 config object and apply `self.detectron2_config_args` to it.

        Each key of `detectron2_config_args` is a dotted path: every segment but the last is resolved with
        `getattr`, and the last segment is set to the value with `setattr`.

        Returns:
            The object returned by `detectron2.config.get_cfg()`, with the overrides applied.
        """
        # `detectron2` is imported at module level only when it is available.
        detectron2_config = detectron2.config.get_cfg()
        for k, v in self.detectron2_config_args.items():
            attributes = k.split(".")
            to_set = detectron2_config
            # Walk down to the node that owns the final attribute.
            for attribute in attributes[:-1]:
                to_set = getattr(to_set, attribute)
            setattr(to_set, attributes[-1], v)

        return detectron2_config

`.\models\layoutlmv2\feature_extraction_layoutlmv2.py`

# coding=utf-8
# Copyright 2021 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Feature extractor class for LayoutLMv2.
"""

import warnings

from ...utils import logging
from .image_processing_layoutlmv2 import LayoutLMv2ImageProcessor

# Module-level logger, obtained for this module's `__name__`
logger = logging.get_logger(__name__)

# LayoutLMv2FeatureExtractor 类，继承自 LayoutLMv2ImageProcessor 类
class LayoutLMv2FeatureExtractor(LayoutLMv2ImageProcessor):
    """
    Deprecated subclass of `LayoutLMv2ImageProcessor`.

    Construction emits a `FutureWarning` and then defers entirely to the parent constructor.
    """

    def __init__(self, *args, **kwargs) -> None:
        deprecation_message = (
            "The class LayoutLMv2FeatureExtractor is deprecated and will be removed in version 5 of Transformers."
            " Please use LayoutLMv2ImageProcessor instead."
        )
        warnings.warn(deprecation_message, FutureWarning)
        # Every positional and keyword argument is forwarded unchanged to the parent class.
        super().__init__(*args, **kwargs)

`.\models\layoutlmv2\image_processing_layoutlmv2.py`

# 定义一个名为 normalize_box 的函数，用于将边界框归一化为相对于图像宽高的千分比
def normalize_box(box, width, height):
    """
    Rescale a box to integer coordinates on a 0-1000 grid.

    Args:
        box: Indexable with at least four entries; entries 0 and 2 are divided by `width`,
            entries 1 and 3 by `height`.
        width: Divisor for the entries at index 0 and 2.
        height: Divisor for the entries at index 1 and 3.

    Returns:
        A list of four ints, each `int(1000 * (coordinate / extent))` (truncated towards zero).
    """
    extents = (width, height, width, height)
    return [int(1000 * (box[idx] / extent)) for idx, extent in enumerate(extents)]

# 定义一个名为 apply_tesseract 的函数，用于在文档图像上应用 Tesseract OCR，并返回识别的单词和归一化的边界框
def apply_tesseract(
    image: np.ndarray,
    lang: Optional[str],
    tesseract_config: Optional[str] = None,
    input_data_format: Optional[Union[str, ChannelDimension]] = None,
):
    """Applies Tesseract OCR on a document image, and returns recognized words + normalized bounding boxes.

    Args:
        image: Image array; converted to a PIL image with `to_pil_image` before OCR.
        lang: Forwarded as `lang` to `pytesseract.image_to_data`.
        tesseract_config: Forwarded as `config` to `pytesseract.image_to_data`; `None` is replaced by `""`.
        input_data_format: Forwarded to `to_pil_image`.

    Returns:
        A tuple `(words, normalized_boxes)`: the non-blank recognized strings and, for each of them, the box
        `[left, top, left + width, top + height]` passed through `normalize_box` with the PIL image size.
    """
    tesseract_config = tesseract_config if tesseract_config is not None else ""

    # Run OCR on the PIL version of the image and pull out the per-token columns.
    pil_image = to_pil_image(image, input_data_format=input_data_format)
    image_width, image_height = pil_image.size
    data = pytesseract.image_to_data(pil_image, lang=lang, output_type="dict", config=tesseract_config)
    words, left, top, width, height = data["text"], data["left"], data["top"], data["width"], data["height"]

    # A set gives O(1) membership tests, so filtering stays linear in the number of tokens
    # (a list here made every filter below quadratic).
    irrelevant_indices = {idx for idx, word in enumerate(words) if not word.strip()}

    def _keep_relevant(values):
        # Drop the entries that sit at the index of a blank / whitespace-only word.
        return [value for idx, value in enumerate(values) if idx not in irrelevant_indices]

    words = _keep_relevant(words)
    left = _keep_relevant(left)
    top = _keep_relevant(top)
    width = _keep_relevant(width)
    height = _keep_relevant(height)

    # Turn (left, top, width, height) into corner format (left, top, right, bottom).
    actual_boxes = [[x, y, x + w, y + h] for x, y, w, h in zip(left, top, width, height)]

    # Finally, normalize the corner boxes with the image size.
    normalized_boxes = [normalize_box(box, image_width, image_height) for box in actual_boxes]

    assert len(words) == len(normalized_boxes), "Not as many words as there are bounding boxes"

    return words, normalized_boxes
    r"""
    Constructs a LayoutLMv2 image processor.
    构造一个 LayoutLMv2 图像处理器。

    Args:
        do_resize (`bool`, *optional*, defaults to `True`):
            Whether to resize the image's (height, width) dimensions to `(size["height"], size["width"])`. Can be
            overridden by `do_resize` in `preprocess`.
            是否将图像的 (height, width) 尺寸调整为 `(size["height"], size["width"])`。可以在 `preprocess` 中通过 `do_resize` 覆盖。

        size (`Dict[str, int]` *optional*, defaults to `{"height": 224, "width": 224}`):
            Size of the image after resizing. Can be overridden by `size` in `preprocess`.
            调整大小后的图像尺寸。可以在 `preprocess` 中通过 `size` 覆盖。

        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`):
            Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the
            `preprocess` method.
            如果调整图像大小，要使用的重采样滤波器。可以在 `preprocess` 方法中通过 `resample` 参数覆盖。

        apply_ocr (`bool`, *optional*, defaults to `True`):
            Whether to apply the Tesseract OCR engine to get words + normalized bounding boxes. Can be overridden by
            `apply_ocr` in `preprocess`.
            是否应用 Tesseract OCR 引擎来获取单词 + 标准化边界框。可以在 `preprocess` 中通过 `apply_ocr` 覆盖。

        ocr_lang (`str`, *optional*):
            The language, specified by its ISO code, to be used by the Tesseract OCR engine. By default, English is
            used. Can be overridden by `ocr_lang` in `preprocess`.
            Tesseract OCR 引擎使用的语言，使用 ISO 代码指定。默认使用英语。可以在 `preprocess` 中通过 `ocr_lang` 覆盖。

        tesseract_config (`str`, *optional*, defaults to `""`):
            Any additional custom configuration flags that are forwarded to the `config` parameter when calling
            Tesseract. For example: '--psm 6'. Can be overridden by `tesseract_config` in `preprocess`.
            调用 Tesseract 时转发给 `config` 参数的任何额外自定义配置标志。例如：'--psm 6'。可以在 `preprocess` 中通过 `tesseract_config` 覆盖。
    """

    model_input_names = ["pixel_values"]

    def __init__(
        self,
        do_resize: bool = True,
        size: Dict[str, int] = None,
        resample: PILImageResampling = PILImageResampling.BILINEAR,
        apply_ocr: bool = True,
        ocr_lang: Optional[str] = None,
        tesseract_config: Optional[str] = "",
        **kwargs,
    ) -> None:
        """
        Store the preprocessing defaults on the instance.

        Args:
            do_resize: Stored as `self.do_resize`.
            size: Target size dictionary; `{"height": 224, "width": 224}` is used when `None`. The value is
                passed through `get_size_dict` before being stored as `self.size`.
            resample: Stored as `self.resample`.
            apply_ocr: Stored as `self.apply_ocr`.
            ocr_lang: Stored as `self.ocr_lang`.
            tesseract_config: Stored as `self.tesseract_config`.
            **kwargs: Forwarded to the parent class constructor.
        """
        super().__init__(**kwargs)

        if size is None:
            size = {"height": 224, "width": 224}
        normalized_size = get_size_dict(size)

        self.do_resize = do_resize
        self.size = normalized_size
        self.resample = resample
        self.apply_ocr = apply_ocr
        self.ocr_lang = ocr_lang
        self.tesseract_config = tesseract_config

        # List of key names kept on the instance as `_valid_processor_keys`.
        configurable_keys = ["do_resize", "size", "resample", "apply_ocr", "ocr_lang", "tesseract_config"]
        self._valid_processor_keys = [
            "images",
            *configurable_keys,
            "return_tensors",
            "data_format",
            "input_data_format",
        ]

    # Copied from transformers.models.vit.image_processing_vit.ViTImageProcessor.resize
    def resize(
        self,
        image: np.ndarray,
        size: Dict[str, int],
        resample: PILImageResampling = PILImageResampling.BILINEAR,
        data_format: Optional[Union[str, ChannelDimension]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,
    ) -> np.ndarray:
        """
        Resize an image to `(size["height"], size["width"])`.

        Args:
            image (`np.ndarray`):
                Image to resize.
            size (`Dict[str, int]`):
                Size specification. It is normalized with `get_size_dict` and must then contain both the
                `"height"` and `"width"` keys.
            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
                Resampling filter forwarded to the module-level `resize` function.
            data_format (`ChannelDimension` or `str`, *optional*):
                Channel dimension format for the output image, forwarded to the module-level `resize`.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                Channel dimension format of the input image, forwarded to the module-level `resize`.
            **kwargs:
                Extra keyword arguments forwarded to the module-level `resize`.

        Returns:
            `np.ndarray`: The resized image.

        Raises:
            ValueError: If the normalized `size` lacks `"height"` or `"width"`.
        """
        size = get_size_dict(size)
        has_both_sides = "height" in size and "width" in size
        if not has_both_sides:
            raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. Got {size.keys()}")

        target_height_width = (size["height"], size["width"])
        # Inside the method body, `resize` resolves to the module-level function, not to this method.
        return resize(
            image,
            size=target_height_width,
            resample=resample,
            data_format=data_format,
            input_data_format=input_data_format,
            **kwargs,
        )

`.\models\layoutlmv2\modeling_layoutlmv2.py`

# coding=utf-8
# 版权 2021 Microsoft Research The HuggingFace Inc. team. 保留所有权利。
#
# 根据 Apache 许可证 2.0 版本授权使用此文件；
# 您不得在未遵守许可证的情况下使用此文件。
# 您可以在以下网址获取许可证的副本：
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# 除非适用法律要求或书面同意，软件
# 根据许可证“按原样”分发，
# 没有任何明示或暗示的担保或条件。
# 有关特定语言的详细信息，请参见许可证。
""" PyTorch LayoutLMv2 模型。"""

import math
from typing import Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

# 导入自定义的激活函数映射
from ...activations import ACT2FN
# 导入模型输出类
from ...modeling_outputs import (
    BaseModelOutput,
    BaseModelOutputWithPooling,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutput,
    TokenClassifierOutput,
)
# 导入模型工具类
from ...modeling_utils import PreTrainedModel
# 导入 PyTorch 实用工具
from ...pytorch_utils import apply_chunking_to_forward
# 导入通用工具函数
from ...utils import (
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    is_detectron2_available,
    logging,
    replace_return_docstrings,
    requires_backends,
)
# 导入 LayoutLMv2 配置类
from .configuration_layoutlmv2 import LayoutLMv2Config

# 检查是否有 detectron2 可用（软依赖）
if is_detectron2_available():
    import detectron2
    from detectron2.modeling import META_ARCH_REGISTRY

# Module-level logger instance
logger = logging.get_logger(__name__)

# Checkpoint identifier used in the documentation
_CHECKPOINT_FOR_DOC = "microsoft/layoutlmv2-base-uncased"
# Config class name used in the documentation
_CONFIG_FOR_DOC = "LayoutLMv2Config"

# List of LayoutLMv2 pretrained model identifiers
LAYOUTLMV2_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "microsoft/layoutlmv2-base-uncased",
    "microsoft/layoutlmv2-large-uncased",
    # See all LayoutLMv2 models at https://huggingface.co/models?filter=layoutlmv2
]

class LayoutLMv2Embeddings(nn.Module):
    """Holds the word, 1D position, 2D position (x / y / height / width) and token-type embedding tables."""

    def __init__(self, config):
        """Create the embedding tables, LayerNorm, dropout and the `position_ids` buffer from `config`."""
        super().__init__()
        hidden_size = config.hidden_size
        max_2d_positions = config.max_2d_position_embeddings

        # Token and 1D position lookup tables.
        self.word_embeddings = nn.Embedding(config.vocab_size, hidden_size, padding_idx=config.pad_token_id)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, hidden_size)

        # 2D layout tables: x / y use `coordinate_size` channels, height / width use `shape_size` channels.
        self.x_position_embeddings = nn.Embedding(max_2d_positions, config.coordinate_size)
        self.y_position_embeddings = nn.Embedding(max_2d_positions, config.coordinate_size)
        self.h_position_embeddings = nn.Embedding(max_2d_positions, config.shape_size)
        self.w_position_embeddings = nn.Embedding(max_2d_positions, config.shape_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, hidden_size)

        self.LayerNorm = nn.LayerNorm(hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        # `position_ids` is a (1, max_position_embeddings) range registered as a non-persistent buffer,
        # so it is not part of the state dict.
        position_ids = torch.arange(config.max_position_embeddings).expand((1, -1))
        self.register_buffer("position_ids", position_ids, persistent=False)

    def _calc_spatial_position_embeddings(self, bbox):
        """
        Embed the four box coordinates plus box height and width and concatenate them on the last axis.

        Args:
            bbox: Integer tensor indexed as `bbox[:, :, k]` for `k` in 0..3; the entries are used as
                left, upper, right and lower coordinates respectively.

        Returns:
            Concatenation (last dimension) of the left, upper, right, lower, height and width embeddings.

        Raises:
            IndexError: If looking up one of the four coordinates raises `IndexError`.
        """
        try:
            left = self.x_position_embeddings(bbox[:, :, 0])
            upper = self.y_position_embeddings(bbox[:, :, 1])
            right = self.x_position_embeddings(bbox[:, :, 2])
            lower = self.y_position_embeddings(bbox[:, :, 3])
        except IndexError as e:
            raise IndexError("The `bbox` coordinate values should be within 0-1000 range.") from e

        # Height = lower - upper, width = right - left; these lookups are outside the guarded block.
        box_height = bbox[:, :, 3] - bbox[:, :, 1]
        box_width = bbox[:, :, 2] - bbox[:, :, 0]
        pieces = (
            left,
            upper,
            right,
            lower,
            self.h_position_embeddings(box_height),
            self.w_position_embeddings(box_width),
        )
        return torch.cat(pieces, dim=-1)
        # LayoutLMv2SelfAttention 类的初始化函数
        def __init__(self, config):
            """
            Set up the projection layers and flags of the self-attention module from `config`.

            Raises:
                ValueError: If `config.hidden_size` is not divisible by `config.num_attention_heads` and
                    `config` has no `embedding_size` attribute.
            """
            super().__init__()
            divisible = config.hidden_size % config.num_attention_heads == 0
            if not divisible and not hasattr(config, "embedding_size"):
                raise ValueError(
                    f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
                    f"heads ({config.num_attention_heads})"
                )

            self.fast_qkv = config.fast_qkv
            self.num_attention_heads = config.num_attention_heads
            # Per-head width, and the combined width over all heads.
            self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
            self.all_head_size = self.num_attention_heads * self.attention_head_size

            self.has_relative_attention_bias = config.has_relative_attention_bias
            self.has_spatial_attention_bias = config.has_spatial_attention_bias

            if not config.fast_qkv:
                # Three separate projections for query, key and value.
                self.query = nn.Linear(config.hidden_size, self.all_head_size)
                self.key = nn.Linear(config.hidden_size, self.all_head_size)
                self.value = nn.Linear(config.hidden_size, self.all_head_size)
            else:
                # One bias-free fused projection; query and value get separate learnable biases.
                self.qkv_linear = nn.Linear(config.hidden_size, 3 * self.all_head_size, bias=False)
                self.q_bias = nn.Parameter(torch.zeros(1, 1, self.all_head_size))
                self.v_bias = nn.Parameter(torch.zeros(1, 1, self.all_head_size))

            self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

        # 将输入张量变形以便进行注意力计算
        def transpose_for_scores(self, x):
            """
            Split the last dimension of `x` into `(num_attention_heads, attention_head_size)` and permute
            the result with axes order `(0, 2, 1, 3)`.
            """
            leading_dims = x.size()[:-1]
            per_head = x.view(*leading_dims, self.num_attention_heads, self.attention_head_size)
            return per_head.permute(0, 2, 1, 3)

        # 计算 QKV 的函数，根据是否使用快速 QKV 模式选择不同的处理方式
        def compute_qkv(self, hidden_states):
            """
            Project `hidden_states` to query, key and value tensors.

            With `self.fast_qkv` unset, three separate linear layers are applied. Otherwise the fused
            `qkv_linear` output is split into three chunks along the last dimension, and `q_bias` / `v_bias`
            are added to the query and value chunks (the key chunk gets no bias).

            Returns:
                Tuple `(q, k, v)`.
            """
            if not self.fast_qkv:
                return self.query(hidden_states), self.key(hidden_states), self.value(hidden_states)

            q, k, v = torch.chunk(self.qkv_linear(hidden_states), 3, dim=-1)
            q_bias, v_bias = self.q_bias, self.v_bias
            if q.ndimension() != q_bias.ndimension():
                # Reshape the biases so their rank matches the projected tensors.
                bias_shape = (1,) * (q.ndimension() - 1) + (-1,)
                q_bias = q_bias.view(*bias_shape)
                v_bias = v_bias.view(*bias_shape)
            return q + q_bias, k, v + v_bias

        # 前向传播函数，接收隐藏状态、注意力掩码、头部掩码等输入，返回处理后的结果
        def forward(
            self,
            hidden_states,
            attention_mask=None,
            head_mask=None,
            output_attentions=False,
            rel_pos=None,
            rel_2d_pos=None,
        ):
            """
            Compute multi-head self-attention over `hidden_states`.

            Args:
                hidden_states: Input passed to `self.compute_qkv`.
                attention_mask: Optional tensor converted to bool; positions where it is `True` (non-zero)
                    are filled with the minimum value of the score dtype before the softmax. When `None`,
                    no masking is applied.
                head_mask: Optional tensor multiplied into the attention probabilities.
                output_attentions: When truthy, the attention probabilities are returned as well.
                rel_pos: Added in place to the scores when `self.has_relative_attention_bias` is set.
                rel_2d_pos: Added in place to the scores when `self.has_spatial_attention_bias` is set.

            Returns:
                `(context_layer,)`, or `(context_layer, attention_probs)` when `output_attentions` is truthy.
            """
            q, k, v = self.compute_qkv(hidden_states)

            # (B, L, H*D) -> (B, H, L, D)
            query_layer = self.transpose_for_scores(q)
            key_layer = self.transpose_for_scores(k)
            value_layer = self.transpose_for_scores(v)

            # Scale the queries (rather than the scores) by sqrt(head size).
            query_layer = query_layer / math.sqrt(self.attention_head_size)
            # [BSZ, NAT, L, L]
            attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
            if self.has_relative_attention_bias:
                attention_scores += rel_pos
            if self.has_spatial_attention_bias:
                attention_scores += rel_2d_pos

            # The fill value is taken from the dtype of the scores *before* the cast to float32,
            # matching the previous evaluation order.
            mask_fill_value = torch.finfo(attention_scores.dtype).min
            attention_scores = attention_scores.float()
            if attention_mask is not None:
                # Previously a `None` mask (the declared default) crashed on `.to(...)`.
                attention_scores = attention_scores.masked_fill_(attention_mask.to(torch.bool), mask_fill_value)

            attention_probs = nn.functional.softmax(attention_scores, dim=-1, dtype=torch.float32).type_as(value_layer)
            # Dropout is applied to the attention probabilities themselves.
            attention_probs = self.dropout(attention_probs)

            if head_mask is not None:
                attention_probs = attention_probs * head_mask

            context_layer = torch.matmul(attention_probs, value_layer)
            # (B, H, L, D) -> (B, L, H*D)
            context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
            new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
            context_layer = context_layer.view(*new_context_layer_shape)

            outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
            return outputs
# 定义 LayoutLMv2Attention 类，继承自 nn.Module
class LayoutLMv2Attention(nn.Module):
    """Self-attention sub-block: `LayoutLMv2SelfAttention` followed by `LayoutLMv2SelfOutput`."""

    def __init__(self, config):
        super().__init__()
        # `self.self` is the attention module itself; `self.output` post-processes its result.
        self.self = LayoutLMv2SelfAttention(config)
        self.output = LayoutLMv2SelfOutput(config)

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        output_attentions=False,
        rel_pos=None,
        rel_2d_pos=None,
    ):
        """
        Run `self.self` on the inputs, then feed its first output together with `hidden_states` to
        `self.output`.

        Returns:
            Tuple whose first item is the result of `self.output`, followed by any remaining items
            returned by `self.self`.
        """
        attn_results = self.self(
            hidden_states,
            attention_mask,
            head_mask,
            output_attentions,
            rel_pos=rel_pos,
            rel_2d_pos=rel_2d_pos,
        )
        projected = self.output(attn_results[0], hidden_states)
        # Anything after the first item is passed through untouched.
        return (projected,) + attn_results[1:]


# 定义 LayoutLMv2SelfOutput 类，继承自 nn.Module
class LayoutLMv2SelfOutput(nn.Module):
    """Linear projection, dropout, residual addition and LayerNorm applied after self-attention."""

    def __init__(self, config):
        super().__init__()
        width = config.hidden_size
        self.dense = nn.Linear(width, width)
        self.LayerNorm = nn.LayerNorm(width, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, input_tensor):
        """Return `LayerNorm(dropout(dense(hidden_states)) + input_tensor)`."""
        projected = self.dropout(self.dense(hidden_states))
        # Residual connection with `input_tensor`, then normalization.
        return self.LayerNorm(projected + input_tensor)


# 从 transformers.models.bert.modeling_bert.BertIntermediate 复制过来，替换 Bert 为 LayoutLMv2
class LayoutLMv2Intermediate(nn.Module):
    """Linear expansion from `hidden_size` to `intermediate_size` followed by an activation."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        # A string `hidden_act` is looked up in ACT2FN; any other value is used as the activation directly.
        activation = config.hidden_act
        self.intermediate_act_fn = ACT2FN[activation] if isinstance(activation, str) else activation

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Return `intermediate_act_fn(dense(hidden_states))`."""
        return self.intermediate_act_fn(self.dense(hidden_states))


# 从 transformers.models.bert.modeling_bert.BertOutput 复制过来，替换 Bert 为 LayoutLMv2
class LayoutLMv2Output(nn.Module):
    """Linear projection from `intermediate_size` back to `hidden_size`, dropout, residual and LayerNorm."""

    def __init__(self, config):
        super().__init__()
        width = config.hidden_size
        self.dense = nn.Linear(config.intermediate_size, width)
        self.LayerNorm = nn.LayerNorm(width, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        """Return `LayerNorm(dropout(dense(hidden_states)) + input_tensor)`."""
        projected = self.dropout(self.dense(hidden_states))
        # Residual connection with `input_tensor`, then normalization.
        return self.LayerNorm(projected + input_tensor)


# LayoutLMv2Layer (subclass of nn.Module): one layer made of an attention sub-block followed by a chunked feed-forward sub-block
class LayoutLMv2Layer(nn.Module):
    """One layer: a `LayoutLMv2Attention` block followed by an intermediate/output feed-forward block."""

    def __init__(self, config):
        super().__init__()
        # Arguments later handed to `apply_chunking_to_forward`.
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.attention = LayoutLMv2Attention(config)
        self.intermediate = LayoutLMv2Intermediate(config)
        self.output = LayoutLMv2Output(config)

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        output_attentions=False,
        rel_pos=None,
        rel_2d_pos=None,
    ):
        """
        Apply attention, then the feed-forward block through `apply_chunking_to_forward`.

        Returns:
            Tuple whose first item is the feed-forward output, followed by every item after the first
            that `self.attention` returned.
        """
        attention_results = self.attention(
            hidden_states,
            attention_mask,
            head_mask,
            output_attentions=output_attentions,
            rel_pos=rel_pos,
            rel_2d_pos=rel_2d_pos,
        )
        attention_output = attention_results[0]
        # Items after the first are forwarded unchanged.
        trailing = attention_results[1:]

        layer_output = apply_chunking_to_forward(
            self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
        )
        return (layer_output,) + trailing

    def feed_forward_chunk(self, attention_output):
        """Return `self.output(self.intermediate(attention_output), attention_output)`."""
        expanded = self.intermediate(attention_output)
        return self.output(expanded, attention_output)
def relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128):
    """
    Adapted from Mesh Tensorflow:
    https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593

    Map relative positions to bucket indices for relative attention. With `bidirectional=True` the buckets
    are split in two halves and positive positions are shifted into the upper half; otherwise positive
    positions are clamped to distance 0. Half of the (remaining) buckets cover exact small distances, the
    other half cover logarithmically growing ranges, capped at the last bucket.

    Args:
        relative_position: Integer tensor of relative positions.
        bidirectional: Whether positive and negative positions get separate bucket ranges.
        num_buckets: Total number of buckets.
        max_distance: Distance used as the upper reference of the logarithmic range.

    Returns:
        Tensor with the same shape as `relative_position` holding bucket indices in `[0, num_buckets)`.
    """
    if bidirectional:
        num_buckets //= 2
        # Positive positions land in the upper half of the buckets.
        sign_offset = (relative_position > 0).long() * num_buckets
        distance = torch.abs(relative_position)
    else:
        sign_offset = 0
        distance = torch.max(-relative_position, torch.zeros_like(relative_position))
    # `distance` is now non-negative.

    # First half of the buckets: one bucket per exact distance.
    max_exact = num_buckets // 2

    # Second half: logarithmic bins, capped at the last bucket.
    log_ratio = torch.log(distance.float() / max_exact) / math.log(max_distance / max_exact)
    coarse_bucket = max_exact + (log_ratio * (num_buckets - max_exact)).to(torch.long)
    coarse_bucket = torch.min(coarse_bucket, torch.full_like(coarse_bucket, num_buckets - 1))

    return sign_offset + torch.where(distance < max_exact, distance, coarse_bucket)
    # 初始化函数，接收一个配置参数对象
    def __init__(self, config):
        """
        Build the stack of `LayoutLMv2Layer` modules and the optional relative-position bias projections.

        `rel_pos_bias` is created only when `config.has_relative_attention_bias` is set;
        `rel_pos_x_bias` / `rel_pos_y_bias` only when `config.has_spatial_attention_bias` is set.
        """
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList(LayoutLMv2Layer(config) for _ in range(config.num_hidden_layers))

        self.has_relative_attention_bias = config.has_relative_attention_bias
        self.has_spatial_attention_bias = config.has_spatial_attention_bias

        if self.has_relative_attention_bias:
            # 1D bias: one bias-free projection from bucket index to one value per attention head.
            self.rel_pos_bins = config.rel_pos_bins
            self.max_rel_pos = config.max_rel_pos
            self.rel_pos_bias = nn.Linear(self.rel_pos_bins, config.num_attention_heads, bias=False)

        if self.has_spatial_attention_bias:
            # 2D bias: separate bias-free projections for the x and y directions.
            self.max_rel_2d_pos = config.max_rel_2d_pos
            self.rel_2d_pos_bins = config.rel_2d_pos_bins
            self.rel_pos_x_bias = nn.Linear(self.rel_2d_pos_bins, config.num_attention_heads, bias=False)
            self.rel_pos_y_bias = nn.Linear(self.rel_2d_pos_bins, config.num_attention_heads, bias=False)

        self.gradient_checkpointing = False

    # 计算一维位置嵌入
    def _calculate_1d_position_embeddings(self, position_ids):
        """
        Compute the per-head bias derived from pairwise differences of `position_ids`.

        The pairwise differences `position_ids[..., None, :] - position_ids[..., :, None]` are bucketed
        with `relative_position_bucket`, used to index the transposed weight of `self.rel_pos_bias`, and
        the result is permuted with axes order `(0, 3, 1, 2)` and made contiguous.
        """
        pairwise_offsets = position_ids.unsqueeze(-2) - position_ids.unsqueeze(-1)
        bucket_ids = relative_position_bucket(
            pairwise_offsets,
            num_buckets=self.rel_pos_bins,
            max_distance=self.max_rel_pos,
        )
        # Row `b` of the transposed weight holds the per-head bias values for bucket `b`.
        bias_table = self.rel_pos_bias.weight.t()
        return bias_table[bucket_ids].permute(0, 3, 1, 2).contiguous()

    # 计算二维位置嵌入
    def _calculate_2d_position_embeddings(self, bbox):
        """
        Compute the per-head bias derived from pairwise differences of box coordinates.

        The x direction uses `bbox[:, :, 0]` and the y direction uses `bbox[:, :, 3]`. For each direction
        the pairwise differences are bucketed with `relative_position_bucket`, looked up in the transposed
        weight of `self.rel_pos_x_bias` / `self.rel_pos_y_bias`, permuted with axes order `(0, 3, 1, 2)`
        and made contiguous. The two results are summed.
        """
        x_coord = bbox[:, :, 0]
        y_coord = bbox[:, :, 3]
        bucket_options = {"num_buckets": self.rel_2d_pos_bins, "max_distance": self.max_rel_2d_pos}

        x_buckets = relative_position_bucket(x_coord.unsqueeze(-2) - x_coord.unsqueeze(-1), **bucket_options)
        y_buckets = relative_position_bucket(y_coord.unsqueeze(-2) - y_coord.unsqueeze(-1), **bucket_options)

        x_bias = self.rel_pos_x_bias.weight.t()[x_buckets].permute(0, 3, 1, 2).contiguous()
        y_bias = self.rel_pos_y_bias.weight.t()[y_buckets].permute(0, 3, 1, 2).contiguous()
        return x_bias + y_bias

    # 前向传播函数，处理模型的输入并返回输出
    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        output_attentions=False,
        output_hidden_states=False,
        return_dict=True,
        bbox=None,
        position_ids=None,
    ):
        """
        Run `hidden_states` through every module in `self.layer`.

        Args:
            hidden_states: Input to the first layer.
            attention_mask: Passed unchanged to every layer.
            head_mask: Optional indexable; `head_mask[i]` is passed to layer `i`.
            output_attentions: When truthy, the second output of every layer is collected.
            output_hidden_states: When truthy, the input of every layer and the final output are collected.
            return_dict: When falsy, a plain tuple is returned instead of a `BaseModelOutput`.
            bbox: Used for the 2D bias when `self.has_spatial_attention_bias` is set.
            position_ids: Used for the 1D bias when `self.has_relative_attention_bias` is set.

        Returns:
            A `BaseModelOutput`, or (when `return_dict` is falsy) a tuple of the last hidden state, the
            collected hidden states and the collected attentions, with `None` entries dropped.
        """
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        # The bias tensors are computed once and shared by all layers.
        rel_pos = self._calculate_1d_position_embeddings(position_ids) if self.has_relative_attention_bias else None
        rel_2d_pos = self._calculate_2d_position_embeddings(bbox) if self.has_spatial_attention_bias else None

        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_head_mask = head_mask[i] if head_mask is not None else None

            if self.gradient_checkpointing and self.training:
                # Route the call through the checkpointing function while training.
                layer_outputs = self._gradient_checkpointing_func(
                    layer_module.__call__,
                    hidden_states,
                    attention_mask,
                    layer_head_mask,
                    output_attentions,
                    rel_pos=rel_pos,
                    rel_2d_pos=rel_2d_pos,
                )
            else:
                layer_outputs = layer_module(
                    hidden_states,
                    attention_mask,
                    layer_head_mask,
                    output_attentions,
                    rel_pos=rel_pos,
                    rel_2d_pos=rel_2d_pos,
                )

            hidden_states = layer_outputs[0]
            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        # Also record the output of the last layer.
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(
                v
                for v in [
                    hidden_states,
                    all_hidden_states,
                    all_self_attentions,
                ]
                if v is not None
            )
        return BaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )
class LayoutLMv2PreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    # Configuration class used by this model family.
    config_class = LayoutLMv2Config
    # Archive list of pretrained checkpoints.
    pretrained_model_archive_map = LAYOUTLMV2_PRETRAINED_MODEL_ARCHIVE_LIST
    # Same name the task heads in this file use for the wrapped base model (`self.layoutlmv2`).
    base_model_prefix = "layoutlmv2"

    def _init_weights(self, module):
        """Initialize the weights of a single sub-module in place.

        Linear and embedding weights are drawn from a normal distribution with mean 0 and standard deviation
        `config.initializer_range`; linear biases and the embedding row at `padding_idx` are zeroed. LayerNorm
        modules get a zero bias and a weight of 1.0. Any other module type is left untouched.
        """
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            bias = module.bias
            if bias is not None:
                bias.data.zero_()
            return

        if isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            pad = module.padding_idx
            if pad is not None:
                # Keep the padding row at zero.
                module.weight.data[pad].zero_()
            return

        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)


def my_convert_sync_batchnorm(module, process_group=None):
    """Recursively replace batch-norm layers in `module` with `torch.nn.SyncBatchNorm`.

    Same as `nn.modules.SyncBatchNorm.convert_sync_batchnorm`, but it additionally converts
    `detectron2.layers.FrozenBatchNorm2d` layers.

    Args:
        module: The module (tree) to convert.
        process_group: Process group handed to every created `SyncBatchNorm`.

    Returns:
        The converted module. Regular batch-norm modules and `FrozenBatchNorm2d` modules are replaced by new
        `SyncBatchNorm` instances; any other module is returned itself, with its children converted.
    """
    # Regular batch-norm layers are delegated to the stock PyTorch converter.
    if isinstance(module, torch.nn.modules.batchnorm._BatchNorm):
        return nn.modules.SyncBatchNorm.convert_sync_batchnorm(module, process_group)

    if isinstance(module, detectron2.layers.FrozenBatchNorm2d):
        # Build an equivalent SyncBatchNorm and carry over the frozen statistics and affine parameters.
        converted = torch.nn.SyncBatchNorm(
            num_features=module.num_features,
            eps=module.eps,
            affine=True,
            track_running_stats=True,
            process_group=process_group,
        )
        converted.weight = torch.nn.Parameter(module.weight)
        converted.bias = torch.nn.Parameter(module.bias)
        converted.running_mean = module.running_mean
        converted.running_var = module.running_var
        converted.num_batches_tracked = torch.tensor(0, dtype=torch.long, device=module.running_mean.device)
    else:
        converted = module

    # Convert the children and re-attach them under the same names.
    for child_name, child_module in module.named_children():
        converted.add_module(child_name, my_convert_sync_batchnorm(child_module, process_group))
    return converted


class LayoutLMv2VisualBackbone(nn.Module):
    """Visual backbone of LayoutLMv2: a detectron2 FPN backbone followed by an average-pooling layer."""

    def __init__(self, config):
        """Build the backbone, the pixel normalization buffers and the pooling layer from `config`.

        Note that `config.image_feature_pool_shape` is extended in place with the backbone channel count when
        it only holds two entries.
        """
        super().__init__()
        self.cfg = config.get_detectron2_config()

        # Instantiate the configured detectron2 meta-architecture and keep only its backbone.
        detectron_model = META_ARCH_REGISTRY.get(self.cfg.MODEL.META_ARCHITECTURE)(self.cfg)
        assert isinstance(detectron_model.backbone, detectron2.modeling.backbone.FPN)
        self.backbone = detectron_model.backbone

        # Per-channel normalization constants, stored as non-persistent buffers of shape (C, 1, 1).
        pixel_mean = self.cfg.MODEL.PIXEL_MEAN
        pixel_std = self.cfg.MODEL.PIXEL_STD
        assert len(pixel_mean) == len(pixel_std)
        num_channels = len(pixel_mean)
        self.register_buffer("pixel_mean", torch.Tensor(pixel_mean).view(num_channels, 1, 1), persistent=False)
        self.register_buffer("pixel_std", torch.Tensor(pixel_std).view(num_channels, 1, 1), persistent=False)

        # Key of the backbone output used as the visual feature map.
        self.out_feature_key = "p2"

        if torch.are_deterministic_algorithms_enabled():
            # With deterministic algorithms enabled, a fixed-kernel average pool is used instead of the
            # adaptive one; the kernel is derived from a hard-coded 224x224 input and the backbone stride.
            logger.warning("using `AvgPool2d` instead of `AdaptiveAvgPool2d`")
            input_height, input_width = 224, 224
            stride = self.backbone.output_shape()[self.out_feature_key].stride
            kernel_size = (
                math.ceil(math.ceil(input_height / stride) / config.image_feature_pool_shape[0]),
                math.ceil(math.ceil(input_width / stride) / config.image_feature_pool_shape[1]),
            )
            self.pool = nn.AvgPool2d(kernel_size)
        else:
            self.pool = nn.AdaptiveAvgPool2d(config.image_feature_pool_shape[:2])

        # Complete a two-entry pool shape with the backbone channel count, then check that they agree.
        backbone_channels = self.backbone.output_shape()[self.out_feature_key].channels
        if len(config.image_feature_pool_shape) == 2:
            config.image_feature_pool_shape.append(backbone_channels)
        assert backbone_channels == config.image_feature_pool_shape[2]

    def forward(self, images):
        """Normalize `images`, run the backbone and return the pooled features.

        `images` is either a tensor or an object exposing a `.tensor` attribute. The pooled feature map is
        flattened from dimension 2 onwards and dimensions 1 and 2 are swapped before returning.
        """
        pixels = images if torch.is_tensor(images) else images.tensor
        normalized = (pixels - self.pixel_mean) / self.pixel_std
        feature_maps = self.backbone(normalized)
        pooled = self.pool(feature_maps[self.out_feature_key])
        return pooled.flatten(start_dim=2).transpose(1, 2).contiguous()

    def synchronize_batch_norm(self):
        """Convert the backbone's batch-norm layers to `SyncBatchNorm`, synchronized within one rank group.

        Raises:
            RuntimeError: If `torch.distributed` is not available/initialized, or if the world size is not a
                multiple of `torch.cuda.device_count()`.
        """
        dist = torch.distributed
        distributed_ready = dist.is_available() and dist.is_initialized() and dist.get_rank() > -1
        if not distributed_ready:
            raise RuntimeError("Make sure torch.distributed is set up properly.")

        self_rank = dist.get_rank()
        # The local CUDA device count is used as the number of ranks per group.
        node_size = torch.cuda.device_count()
        world_size = dist.get_world_size()
        if world_size % node_size != 0:
            raise RuntimeError("Make sure the number of processes can be divided by the number of nodes")

        # One process group per block of `node_size` consecutive global ranks; every group is created here,
        # not only the one this process belongs to.
        num_groups = world_size // node_size
        ranks_per_group = [list(range(g * node_size, (g + 1) * node_size)) for g in range(num_groups)]
        sync_bn_groups = [dist.new_group(ranks=ranks) for ranks in ranks_per_group]

        own_group = sync_bn_groups[self_rank // node_size]
        self.backbone = my_convert_sync_batchnorm(self.backbone, process_group=own_group)
# Class-level docstring fragment shared by the LayoutLMv2 model classes in this file; it is passed to
# `add_start_docstrings` together with each class's own summary.
LAYOUTLMV2_START_DOCSTRING = r"""
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
    it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`LayoutLMv2Config`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

# Docstring template for the `forward` methods: it is `.format(...)`-ed with a shape string and handed to
# `add_start_docstrings_to_model_forward`.
# NOTE(review): the template is empty here - presumably the argument documentation was dropped from this
# listing; confirm against the upstream file.
LAYOUTLMV2_INPUTS_DOCSTRING = r"""
"""


class LayoutLMv2Pooler(nn.Module):
    """Pool a sequence of hidden states into one vector per example using the first token."""

    def __init__(self, config):
        super().__init__()
        hidden_size = config.hidden_size
        self.dense = nn.Linear(hidden_size, hidden_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states):
        """Return `tanh(dense(hidden_states[:, 0]))`."""
        # The model is "pooled" by simply taking the hidden state of the first token.
        return self.activation(self.dense(hidden_states[:, 0]))


@add_start_docstrings(
    "The bare LayoutLMv2 Model transformer outputting raw hidden-states without any specific head on top.",
    LAYOUTLMV2_START_DOCSTRING,
)
class LayoutLMv2Model(LayoutLMv2PreTrainedModel):
    def __init__(self, config):
        # Check that the detectron2 backend is available before building anything.
        requires_backends(self, "detectron2")
        super().__init__(config)
        self.config = config
        self.has_visual_segment_embedding = config.has_visual_segment_embedding
        # Text-side embeddings (the word / position / spatial / token-type lookups below go through it).
        self.embeddings = LayoutLMv2Embeddings(config)

        # Visual branch: backbone features are projected to the hidden size.
        self.visual = LayoutLMv2VisualBackbone(config)
        self.visual_proj = nn.Linear(config.image_feature_pool_shape[-1], config.hidden_size)
        if self.has_visual_segment_embedding:
            # A single learned vector of size `hidden_size`, added to the visual embeddings.
            self.visual_segment_embedding = nn.Parameter(nn.Embedding(1, config.hidden_size).weight[0])
        self.visual_LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.visual_dropout = nn.Dropout(config.hidden_dropout_prob)

        self.encoder = LayoutLMv2Encoder(config)
        self.pooler = LayoutLMv2Pooler(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        # Return the word embedding module.
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        # Replace the word embedding module.
        self.embeddings.word_embeddings = value

    def _calc_text_embeddings(self, input_ids, bbox, position_ids, token_type_ids, inputs_embeds=None):
        """Compute the text embeddings.

        The result is the sum of word, position, spatial-position and token-type embeddings, followed by the
        embeddings' LayerNorm and dropout.

        Either `input_ids` or `inputs_embeds` must be given. When `position_ids` / `token_type_ids` are
        `None`, defaults are built on the device of whichever input was provided (the previous code always
        dereferenced `input_ids` and therefore failed when only `inputs_embeds` was passed).
        """
        if input_ids is not None:
            input_shape = input_ids.size()
            device = input_ids.device
        else:
            # Drop the trailing embedding dimension.
            input_shape = inputs_embeds.size()[:-1]
            device = inputs_embeds.device

        seq_length = input_shape[1]

        # Default positions: 0 .. seq_length - 1, broadcast over the input shape.
        if position_ids is None:
            position_ids = torch.arange(seq_length, dtype=torch.long, device=device)
            position_ids = position_ids.unsqueeze(0).expand(input_shape)

        # Default token types: all zeros.
        if token_type_ids is None:
            if input_ids is not None:
                token_type_ids = torch.zeros_like(input_ids)
            else:
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

        if inputs_embeds is None:
            inputs_embeds = self.embeddings.word_embeddings(input_ids)

        position_embeddings = self.embeddings.position_embeddings(position_ids)
        spatial_position_embeddings = self.embeddings._calc_spatial_position_embeddings(bbox)
        token_type_embeddings = self.embeddings.token_type_embeddings(token_type_ids)

        embeddings = inputs_embeds + position_embeddings + spatial_position_embeddings + token_type_embeddings
        embeddings = self.embeddings.LayerNorm(embeddings)
        embeddings = self.embeddings.dropout(embeddings)
        return embeddings

    def _calc_img_embeddings(self, image, bbox, position_ids):
        """Compute the visual embeddings.

        The projected backbone features are summed with position and spatial-position embeddings (plus the
        visual segment embedding when enabled), followed by the visual LayerNorm and dropout.
        """
        visual_embeddings = self.visual_proj(self.visual(image))

        position_embeddings = self.embeddings.position_embeddings(position_ids)
        spatial_position_embeddings = self.embeddings._calc_spatial_position_embeddings(bbox)

        embeddings = visual_embeddings + position_embeddings + spatial_position_embeddings
        if self.has_visual_segment_embedding:
            embeddings += self.visual_segment_embedding

        embeddings = self.visual_LayerNorm(embeddings)
        embeddings = self.visual_dropout(embeddings)
        return embeddings

    @add_start_docstrings_to_model_forward(LAYOUTLMV2_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
    @replace_return_docstrings(output_type=BaseModelOutput, config_class=_CONFIG_FOR_DOC)
    # NOTE(review): as listed, this body only validates the `input_ids` / `inputs_embeds` pair and returns
    # the input shape; it never calls the embeddings, the visual backbone, the encoder or the pooler. It
    # looks like the body of an input-shape helper - confirm against the upstream file before relying on it.
    # NOTE(review): the method has no docstring; confirm the docstring decorators above tolerate that.
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        bbox: Optional[torch.LongTensor] = None,
        image: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        # `input_ids` and `inputs_embeds` are mutually exclusive.
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        # Shape of the token ids.
        elif input_ids is not None:
            return input_ids.size()
        # Shape of the embeddings without the trailing embedding dimension.
        elif inputs_embeds is not None:
            return inputs_embeds.size()[:-1]
        else:
            # At least one of the two must be provided.
            raise ValueError("You have to specify either input_ids or inputs_embeds")
@add_start_docstrings(
    """
    LayoutLMv2 Model with a sequence classification head on top (a linear layer on top of the concatenation of the
    final hidden state of the [CLS] token, average-pooled initial visual embeddings and average-pooled final visual
    embeddings, e.g. for document image classification tasks such as the
    [RVL-CDIP](https://www.cs.cmu.edu/~aharley/rvl-cdip/) dataset.
    """,
    LAYOUTLMV2_START_DOCSTRING,
)
class LayoutLMv2ForSequenceClassification(LayoutLMv2PreTrainedModel):
    """
    LayoutLMv2 model with a sequence classification head on top (a linear layer over the concatenation of the
    final hidden state of the [CLS] token, the average-pooled initial visual embeddings and the average-pooled
    final visual embeddings), e.g. for document image classification tasks such as the
    [RVL-CDIP](https://www.cs.cmu.edu/~aharley/rvl-cdip/) dataset.
    """

    def __init__(self, config):
        """
        Build the LayoutLMv2 sequence classification model.

        Args:
            config (LayoutLMv2Config): Model configuration object.

        """
        super().__init__(config)
        num_labels = config.num_labels
        self.num_labels = num_labels
        self.layoutlmv2 = LayoutLMv2Model(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # The classifier input is three times the hidden size.
        self.classifier = nn.Linear(3 * config.hidden_size, num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """
        Return the input (word) embedding layer.

        Returns:
            torch.nn.Embedding: The word embedding layer of the wrapped LayoutLMv2 model.

        """
        text_embeddings = self.layoutlmv2.embeddings
        return text_embeddings.word_embeddings

    @add_start_docstrings_to_model_forward(LAYOUTLMV2_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @replace_return_docstrings(output_type=SequenceClassifierOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        bbox: Optional[torch.LongTensor] = None,
        image: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        """
        Forward pass of the LayoutLMv2 sequence classification model.

        Args:
            input_ids (torch.LongTensor, optional): Input token IDs. Defaults to None.
            bbox (torch.LongTensor, optional): Bounding box information. Defaults to None.
            image (torch.FloatTensor, optional): Image features. Defaults to None.
            attention_mask (torch.FloatTensor, optional): Attention mask. Defaults to None.
            token_type_ids (torch.LongTensor, optional): Token type IDs. Defaults to None.
            position_ids (torch.LongTensor, optional): Position IDs. Defaults to None.
            head_mask (torch.FloatTensor, optional): Head mask. Defaults to None.
            inputs_embeds (torch.FloatTensor, optional): Input embeddings. Defaults to None.
            labels (torch.LongTensor, optional): Labels. Defaults to None.
            output_attentions (bool, optional): Whether to output attentions. Defaults to None.
            output_hidden_states (bool, optional): Whether to output hidden states. Defaults to None.
            return_dict (bool, optional): Whether to return a dict-style output. Defaults to None.

        Returns:
            SequenceClassifierOutput: Output object of the sequence classification task.

        """
        # (remaining code omitted in this listing)

@add_start_docstrings(
    """
    LayoutLMv2 Model with a token classification head on top (a linear layer on top of the text part of the hidden
    states) e.g. for sequence labeling (information extraction) tasks such as
    [FUNSD](https://guillaumejaume.github.io/FUNSD/), [SROIE](https://rrc.cvc.uab.es/?ch=13),
    [CORD](https://github.com/clovaai/cord) and [Kleister-NDA](https://github.com/applicaai/kleister-nda).
    """,
    LAYOUTLMV2_START_DOCSTRING,
)
class LayoutLMv2ForTokenClassification(LayoutLMv2PreTrainedModel):
    """
    LayoutLMv2 model with a token classification head on top (a linear layer on top of the text part of the
    hidden states), e.g. for sequence labeling (information extraction) tasks such as FUNSD, SROIE, CORD and
    Kleister-NDA.
    """

    def __init__(self, config):
        """
        Build the LayoutLMv2 token classification model.

        Args:
            config (LayoutLMv2Config): Model configuration object.

        """
        super().__init__(config)
        num_labels = config.num_labels
        self.num_labels = num_labels
        self.layoutlmv2 = LayoutLMv2Model(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """
        Return the input (word) embedding layer.

        Returns:
            torch.nn.Embedding: The word embedding layer of the wrapped LayoutLMv2 model.

        """
        text_embeddings = self.layoutlmv2.embeddings
        return text_embeddings.word_embeddings

    @add_start_docstrings_to_model_forward(LAYOUTLMV2_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    # 使用装饰器替换返回文档字符串，设置输出类型为 TokenClassifierOutput，配置类为 _CONFIG_FOR_DOC
    @replace_return_docstrings(output_type=TokenClassifierOutput, config_class=_CONFIG_FOR_DOC)
    # 前向传播函数，接受多个输入参数和可选参数
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,  # 输入 token IDs，类型为长整型张量，可选
        bbox: Optional[torch.LongTensor] = None,  # 边界框信息，类型为长整型张量，可选
        image: Optional[torch.FloatTensor] = None,  # 图像数据，类型为浮点数张量，可选
        attention_mask: Optional[torch.FloatTensor] = None,  # 注意力掩码，类型为浮点数张量，可选
        token_type_ids: Optional[torch.LongTensor] = None,  # token 类型 IDs，类型为长整型张量，可选
        position_ids: Optional[torch.LongTensor] = None,  # 位置 IDs，类型为长整型张量，可选
        head_mask: Optional[torch.FloatTensor] = None,  # 头部掩码，类型为浮点数张量，可选
        inputs_embeds: Optional[torch.FloatTensor] = None,  # 输入嵌入，类型为浮点数张量，可选
        labels: Optional[torch.LongTensor] = None,  # 标签，类型为长整型张量，可选
        output_attentions: Optional[bool] = None,  # 是否输出注意力信息，可选布尔值
        output_hidden_states: Optional[bool] = None,  # 是否输出隐藏状态，可选布尔值
        return_dict: Optional[bool] = None,  # 是否返回字典形式的输出，可选布尔值
"""
LayoutLMv2 Model with a span classification head on top for extractive question-answering tasks such as
[DocVQA](https://rrc.cvc.uab.es/?ch=17) (a linear layer on top of the text part of the hidden-states output to
compute `span start logits` and `span end logits`).
"""
# The class decorator is restored here in the same form the sibling head classes use. The listing had
# reduced it to comments plus an unterminated `LAYOUTLMV2_START_DOCSTRING = """`, which would have rebound
# the shared docstring constant and turned the class below into part of a string literal.
@add_start_docstrings(
    """
    LayoutLMv2 Model with a span classification head on top for extractive question-answering tasks such as
    [DocVQA](https://rrc.cvc.uab.es/?ch=17) (a linear layer on top of the text part of the hidden-states output to
    compute `span start logits` and `span end logits`).
    """,
    LAYOUTLMV2_START_DOCSTRING,
)
class LayoutLMv2ForQuestionAnswering(LayoutLMv2PreTrainedModel):
    def __init__(self, config, has_visual_segment_embedding=True):
        # Build the question-answering model.
        #
        # Args:
        #     config: Model configuration object. Its `has_visual_segment_embedding` attribute is
        #         overwritten in place with the argument of the same name.
        #     has_visual_segment_embedding: Whether the wrapped base model uses a visual segment embedding.
        super().__init__(config)
        self.num_labels = config.num_labels
        # The flag is written onto the config before the base model is built, because the base model
        # reads it from the config.
        config.has_visual_segment_embedding = has_visual_segment_embedding
        self.layoutlmv2 = LayoutLMv2Model(config)
        # Linear output layer for the QA head: `hidden_size` inputs, `num_labels` outputs.
        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        # Return the word embedding layer of the wrapped LayoutLMv2 model.
        return self.layoutlmv2.embeddings.word_embeddings

    # 引用 LAYOUTLMV2_INPUTS_DOCSTRING，添加到模型前向方法的文档字符串中
    @add_start_docstrings_to_model_forward(LAYOUTLMV2_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    # 替换模型前向方法的返回文档字符串，使用 QuestionAnsweringModelOutput 类型，引用 _CONFIG_FOR_DOC 配置类
    @replace_return_docstrings(output_type=QuestionAnsweringModelOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        bbox: Optional[torch.LongTensor] = None,
        image: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        start_positions: Optional[torch.LongTensor] = None,
        end_positions: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
"""

Transformers-源码解析-六十一-

Transformers 源码解析（六十一）

.\models\kosmos2\processing_kosmos2.py

.\models\kosmos2\__init__.py

.\models\layoutlm\configuration_layoutlm.py

.\models\layoutlm\modeling_layoutlm.py

.\models\layoutlm\modeling_tf_layoutlm.py

.\models\layoutlm\tokenization_layoutlm.py

.\models\layoutlm\tokenization_layoutlm_fast.py

.\models\layoutlm\__init__.py

.\models\layoutlmv2\configuration_layoutlmv2.py

.\models\layoutlmv2\feature_extraction_layoutlmv2.py

.\models\layoutlmv2\image_processing_layoutlmv2.py

.\models\layoutlmv2\modeling_layoutlmv2.py

`.\models\kosmos2\processing_kosmos2.py`

`.\models\kosmos2\__init__.py`

`.\models\layoutlm\configuration_layoutlm.py`

`.\models\layoutlm\modeling_layoutlm.py`

`.\models\layoutlm\modeling_tf_layoutlm.py`

`.\models\layoutlm\tokenization_layoutlm.py`

`.\models\layoutlm\tokenization_layoutlm_fast.py`

`.\models\layoutlm\__init__.py`

`.\models\layoutlmv2\configuration_layoutlmv2.py`

`.\models\layoutlmv2\feature_extraction_layoutlmv2.py`

`.\models\layoutlmv2\image_processing_layoutlmv2.py`

`.\models\layoutlmv2\modeling_layoutlmv2.py`