Transformers Source Code Analysis (67)
.\models\longt5\modeling_longt5.py
""" PyTorch LongT5 model."""
import copy
import math
import warnings
from typing import Any, List, Optional, Tuple, Union
import torch
from torch import nn
from torch.nn import CrossEntropyLoss
from ...activations import ACT2FN
from ...modeling_outputs import (
BaseModelOutput,
BaseModelOutputWithPastAndCrossAttentions,
Seq2SeqLMOutput,
Seq2SeqModelOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import ALL_LAYERNORM_LAYERS, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import (
DUMMY_INPUTS,
DUMMY_MASK,
add_start_docstrings,
add_start_docstrings_to_model_forward,
is_torch_fx_proxy,
logging,
replace_return_docstrings,
)
from .configuration_longt5 import LongT5Config
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "LongT5Config"
_CHECKPOINT_FOR_DOC = "google/long-t5-local-base"
LONGT5_PRETRAINED_MODEL_ARCHIVE_LIST = [
"google/long-t5-local-base",
"google/long-t5-local-large",
"google/long-t5-tglobal-base",
"google/long-t5-tglobal-large",
]
def _pad_to_multiple(x: torch.Tensor, block_len: int, dim: int, pad_value: int = 0) -> torch.Tensor:
"""
Pad a tensor so that a sequence length will be a multiple of `block_len`.
Args:
x (torch.Tensor): Input tensor to be padded.
block_len (int): Desired block length for padding.
dim (int): Dimension along which padding will be applied.
pad_value (int, optional): Value used for padding. Defaults to 0.
Returns:
torch.Tensor: Padded tensor.
"""
pad_len = -x.shape[dim] % block_len
if not all(x.shape):
new_shape = list(x.shape)
new_shape[dim] += pad_len
return torch.zeros(new_shape, dtype=x.dtype, device=x.device)
pad = [(0, 0)] * x.ndim
pad[dim] = (0, pad_len)
pad = sum(pad[::-1], ())
x = nn.functional.pad(x, pad=pad, mode="constant", value=pad_value)
return x
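# A minimal sanity check (hypothetical, not part of the original file): a length-5
# sequence padded to a multiple of block_len=4 gains 3 trailing pad positions.
#
#   x = torch.ones(2, 5, 8)
#   _pad_to_multiple(x, block_len=4, dim=1).shape  # -> torch.Size([2, 8, 8])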
def _split_into_blocks(x: torch.Tensor, block_len: int, dim: int) -> torch.Tensor:
"""
Split an input tensor into blocks of a given `block_len` along the given `dim`.
If the dimension length is not a multiple of `block_len`, it will be padded first
with selected `pad_value`.
Args:
x (torch.Tensor): Input tensor to be split into blocks.
block_len (int): Length of each block.
dim (int): Dimension along which to split the tensor.
Returns:
torch.Tensor: Tensor split into blocks.
"""
if x.shape[dim] % block_len != 0:
x = _pad_to_multiple(x, block_len, dim, pad_value=0)
num_blocks = x.shape[dim] // block_len
output_shape = x.shape[:dim] + (num_blocks, block_len) + x.shape[(dim + 1) :]
if 0 in output_shape:
return torch.empty(output_shape, dtype=x.dtype, device=x.device)
return x.reshape(output_shape)
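# Hypothetical example: the (possibly padded) sequence dimension is reshaped into
# (num_blocks, block_len) while every other dimension stays untouched.
#
#   x = torch.ones(2, 5, 8)
#   _split_into_blocks(x, block_len=4, dim=1).shape  # -> torch.Size([2, 2, 4, 8])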
def _concatenate_3_blocks(x: torch.Tensor, block_dim: int, sequence_dim: int, pad_value: int = 0) -> torch.Tensor:
"""Concatenate three consecutive blocks for each input block for local attentiont.
For more information, see: https://arxiv.org/pdf/2112.07916.pdf.
"""
num_blocks = x.shape[block_dim]
pad = [(0, 0)] * x.ndim
pad[block_dim] = (1, 1)
pad = sum(pad[::-1], ())
x = nn.functional.pad(x, pad=pad, mode="constant", value=pad_value)
blocks_list: List[torch.Tensor] = []
for i in range(3):
indices = [slice(0, None)] * x.ndim
indices[block_dim] = slice(i, i + num_blocks)
indices = tuple(indices)
blocks_list.append(x[indices])
return torch.cat(blocks_list, dim=sequence_dim)
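# Hypothetical example: each block is concatenated with its left and right
# neighbors (zero blocks at the edges), tripling the per-block sequence length.
#
#   blocked = torch.ones(2, 2, 4, 8)  # (batch, num_blocks, block_len, dim)
#   _concatenate_3_blocks(blocked, block_dim=1, sequence_dim=2).shape
#   # -> torch.Size([2, 2, 12, 8])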
def _make_3block_relative_position_ids(block_len: int) -> torch.Tensor:
"""Makes 3-blocked relative position ids for local attention."""
position_ids = torch.arange(3 * block_len, dtype=torch.int32)
center_position_ids = position_ids[block_len:-block_len]
relative_position_ids = position_ids.unsqueeze(0) - center_position_ids.unsqueeze(1)
return relative_position_ids
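# Hypothetical example with block_len=2: each of the two center positions gets
# its signed offset to all 3 * block_len positions of the concatenated window.
#
#   _make_3block_relative_position_ids(2)
#   # tensor([[-2, -1,  0,  1,  2,  3],
#   #         [-3, -2, -1,  0,  1,  2]], dtype=torch.int32)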
def _mask_local_attention_mask(local_attention_mask: torch.Tensor, block_len: int) -> torch.Tensor:
"""Mask local attention mask to enforce that tokens are not allowed to attend tokens farther than ``local_radius."""
relative_position_ids = _make_3block_relative_position_ids(block_len)
locality_mask = torch.abs(relative_position_ids) < block_len
locality_mask = locality_mask[None, None, :, :]
locality_mask = locality_mask.to(local_attention_mask.device)
return torch.logical_and(local_attention_mask, locality_mask)
def _get_local_attention_mask(attention_mask: torch.Tensor, block_len: int, device: torch.device) -> torch.Tensor:
"""Prepare attention mask to be applied for a local attention."""
_blocked_attention_mask = _split_into_blocks(attention_mask, block_len, dim=1)
_3blocked_attention_mask = _concatenate_3_blocks(_blocked_attention_mask, block_dim=1, sequence_dim=2)
_blocked_attention_mask = _blocked_attention_mask.unsqueeze(-1)
_3blocked_attention_mask = _3blocked_attention_mask.unsqueeze(-2)
local_attention_mask = torch.logical_and(_blocked_attention_mask, _3blocked_attention_mask)
local_attention_mask = _mask_local_attention_mask(local_attention_mask, block_len)
return local_attention_mask.unsqueeze(1).to(device)
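# Hypothetical shape walk-through: a (batch, seq_len) padding mask becomes a
# (batch, 1, num_blocks, block_len, 3 * block_len) boolean mask that is True only
# where both positions are real tokens within `local_radius` of each other.
#
#   mask = torch.ones(2, 8)
#   _get_local_attention_mask(mask, block_len=4, device=mask.device).shape
#   # -> torch.Size([2, 1, 2, 4, 12])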
def _make_global_fixed_block_ids(
attention_mask: torch.Tensor, global_block_size: int
) -> Tuple[torch.Tensor, torch.Tensor]:
"""获取每个输入标记对应的“固定块”全局ID。
这个实现是从以下地址采用的 Flaxformr 原始实现的简化版本:
https://github.com/google/flaxformer/blob/main/flaxformer/architectures/longt5/long_attention.py.
在我们的场景中,由于我们仅用于解码器,孤立的标记(即不组成整个固定块的标记)将被分配到前一个块。
原始序列中的填充标记由 -1 表示。
"""
batch_size, seq_len = attention_mask.shape[:2]
def handle_orphan_tokens(block_ids: torch.Tensor) -> torch.Tensor:
block_ends = (torch.arange(seq_len) % global_block_size) == global_block_size - 1
block_ends = block_ends.to(block_ids.device)
true_block_ends = torch.logical_and(block_ends, block_ids >= 0)
full_blocks = true_block_ends.sum(-1).unsqueeze(-1).type(block_ids.dtype) - 1
block_ids = torch.where(block_ids < full_blocks, block_ids, full_blocks)
return block_ids
fixed_block_mask = torch.ones_like(attention_mask, device=attention_mask.device) / global_block_size
fixed_block_mask = torch.cumsum(fixed_block_mask, axis=1) - fixed_block_mask
mask = torch.where(attention_mask != 0.0, 1.0, -1000.0).type(attention_mask.dtype)
global_block_ids = torch.floor(mask + fixed_block_mask - 1.0).type(attention_mask.dtype)
_global_block_ids_lower_bound = torch.tensor(-1, dtype=global_block_ids.dtype, device=global_block_ids.device)
global_block_ids = torch.where(
global_block_ids > _global_block_ids_lower_bound, global_block_ids, _global_block_ids_lower_bound
)
global_block_ids = (global_block_ids * attention_mask) + (attention_mask - 1)
global_block_ids = handle_orphan_tokens(global_block_ids)
num_globals = seq_len // global_block_size
if num_globals > 0:
_sequence_block_ids_max = torch.max(global_block_ids, dim=-1).values.repeat(num_globals, 1).transpose(0, 1)
else:
_sequence_block_ids_max = torch.zeros(
batch_size, 0, dtype=global_block_ids.dtype, device=global_block_ids.device
)
global_segment_ids = torch.cumsum(torch.ones(batch_size, num_globals), dim=-1) - 1
global_segment_ids = global_segment_ids.to(attention_mask.device)
global_segment_ids = torch.where(global_segment_ids <= _sequence_block_ids_max, 1, 0)
return global_block_ids.type(torch.int), global_segment_ids.type(torch.int)
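# Hypothetical example with global_block_size=2 and one padded position: the
# orphan token of the final, incomplete block is folded into the preceding block
# and the padding position is marked with -1.
#
#   mask = torch.tensor([[1.0, 1.0, 1.0, 1.0, 1.0, 0.0]])
#   _make_global_fixed_block_ids(mask, global_block_size=2)
#   # -> (tensor([[ 0,  0,  1,  1,  1, -1]], dtype=torch.int32),
#   #     tensor([[1, 1, 0]], dtype=torch.int32))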
def _make_side_relative_position_ids(attention_mask: torch.Tensor, global_block_size: int) -> torch.Tensor:
"""Create the relative position tensor for local -> global attention."""
block_ids, global_segment_ids = _make_global_fixed_block_ids(attention_mask, global_block_size)
global_seq_len = global_segment_ids.shape[-1]
global_positions = torch.arange(global_seq_len, device=block_ids.device)
side_relative_position = global_positions - block_ids[..., None]
return side_relative_position.type(torch.int64)
def _create_global_aggregates(
hidden_states: torch.Tensor, block_ids: torch.Tensor, global_seq_len: int
) -> torch.Tensor:
"""Compute individual block aggregates by summing over individual blocks."""
block_ids = block_ids.where(
block_ids >= 0, torch.tensor(global_seq_len, dtype=block_ids.dtype, device=block_ids.device)
)
one_hot_block_ids = nn.functional.one_hot(block_ids.type(torch.int64), global_seq_len + 1)[:, :, :-1]
return torch.einsum("...nd,...ng->...gd", hidden_states, one_hot_block_ids.type(hidden_states.dtype))
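# Hypothetical example: token states are summed into their assigned global block;
# tokens with block id -1 (padding) are routed to a throwaway extra column that
# the `[:, :, :-1]` slice above drops.
#
#   hidden = torch.ones(1, 4, 3)                 # (batch, seq_len, dim)
#   block_ids = torch.tensor([[0, 0, 1, -1]])
#   _create_global_aggregates(hidden, block_ids, global_seq_len=2).shape
#   # -> torch.Size([1, 2, 3]); block 0 sums two tokens, block 1 sums one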
class LongT5LayerNorm(nn.Module):
def __init__(self, hidden_size, eps=1e-6):
"""
Construct a layernorm module in the LongT5 style. No bias and no subtraction of mean.
"""
super().__init__()
self.weight = nn.Parameter(torch.ones(hidden_size))
self.variance_epsilon = eps
def forward(self, hidden_states):
variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
if self.weight.dtype in [torch.float16, torch.bfloat16]:
hidden_states = hidden_states.to(self.weight.dtype)
return self.weight * hidden_states
try:
from apex.normalization import FusedRMSNorm
LongT5LayerNorm = FusedRMSNorm
logger.info("Discovered apex.normalization.FusedRMSNorm - will use it instead of LongT5LayerNorm")
except ImportError:
pass
except Exception:
logger.warning("discovered apex but it failed to load, falling back to LongT5LayerNorm")
pass
ALL_LAYERNORM_LAYERS.append(LongT5LayerNorm)
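# A minimal equivalence sketch (assumed values, default initialization, no apex):
# this layer norm is a T5-style RMSNorm, x / sqrt(mean(x**2) + eps) times a
# learned weight, with no mean subtraction and no bias.
#
#   norm = LongT5LayerNorm(hidden_size=4)
#   x = torch.randn(2, 3, 4)
#   ref = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + 1e-6)
#   torch.allclose(norm(x), ref)  # -> True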
class LongT5DenseActDense(nn.Module):
def __init__(self, config: LongT5Config):
super().__init__()
self.wi = nn.Linear(config.d_model, config.d_ff, bias=False)
self.wo = nn.Linear(config.d_ff, config.d_model, bias=False)
self.dropout = nn.Dropout(config.dropout_rate)
self.act = ACT2FN[config.dense_act_fn]
def forward(self, hidden_states):
hidden_states = self.wi(hidden_states)
hidden_states = self.act(hidden_states)
hidden_states = self.dropout(hidden_states)
if (
isinstance(self.wo.weight, torch.Tensor)
and hidden_states.dtype != self.wo.weight.dtype
and self.wo.weight.dtype != torch.int8
):
hidden_states = hidden_states.to(self.wo.weight.dtype)
hidden_states = self.wo(hidden_states)
return hidden_states
class LongT5DenseGatedActDense(nn.Module):
def __init__(self, config: LongT5Config):
super().__init__()
self.wi_0 = nn.Linear(config.d_model, config.d_ff, bias=False)
self.wi_1 = nn.Linear(config.d_model, config.d_ff, bias=False)
self.wo = nn.Linear(config.d_ff, config.d_model, bias=False)
self.dropout = nn.Dropout(config.dropout_rate)
self.act = ACT2FN[config.dense_act_fn]
def forward(self, hidden_states):
hidden_gelu = self.act(self.wi_0(hidden_states))
hidden_linear = self.wi_1(hidden_states)
hidden_states = hidden_gelu * hidden_linear
hidden_states = self.dropout(hidden_states)
hidden_states = self.wo(hidden_states)
return hidden_states
class LongT5LayerFF(nn.Module):
def __init__(self, config: LongT5Config):
super().__init__()
if config.is_gated_act:
self.DenseReluDense = LongT5DenseGatedActDense(config)
else:
self.DenseReluDense = LongT5DenseActDense(config)
self.layer_norm = LongT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
self.dropout = nn.Dropout(config.dropout_rate)
def forward(self, hidden_states):
forwarded_states = self.layer_norm(hidden_states)
forwarded_states = self.DenseReluDense(forwarded_states)
hidden_states = hidden_states + self.dropout(forwarded_states)
return hidden_states
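# In short, the feed-forward block applies a pre-norm residual update:
# hidden_states = hidden_states + Dropout(FFN(LayerNorm(hidden_states))).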
class LongT5Attention(nn.Module):
def __init__(self, config: LongT5Config, has_relative_attention_bias=False):
super().__init__()
self.is_decoder = config.is_decoder
self.has_relative_attention_bias = has_relative_attention_bias
self.relative_attention_num_buckets = config.relative_attention_num_buckets
self.relative_attention_max_distance = config.relative_attention_max_distance
self.d_model = config.d_model
self.key_value_proj_dim = config.d_kv
self.n_heads = config.num_heads
self.dropout = config.dropout_rate
self.inner_dim = self.n_heads * self.key_value_proj_dim
self.q = nn.Linear(self.d_model, self.inner_dim, bias=False)
self.k = nn.Linear(self.d_model, self.inner_dim, bias=False)
self.v = nn.Linear(self.d_model, self.inner_dim, bias=False)
self.o = nn.Linear(self.inner_dim, self.d_model, bias=False)
if self.has_relative_attention_bias:
self.relative_attention_bias = nn.Embedding(self.relative_attention_num_buckets, self.n_heads)
self.pruned_heads = set()
self.gradient_checkpointing = False
def prune_heads(self, heads):
if len(heads) == 0:
return
heads, index = find_pruneable_heads_and_indices(
heads, self.n_heads, self.key_value_proj_dim, self.pruned_heads
)
self.q = prune_linear_layer(self.q, index)
self.k = prune_linear_layer(self.k, index)
self.v = prune_linear_layer(self.v, index)
self.o = prune_linear_layer(self.o, index, dim=1)
self.n_heads = self.n_heads - len(heads)
self.inner_dim = self.key_value_proj_dim * self.n_heads
self.pruned_heads = self.pruned_heads.union(heads)
@staticmethod
def _relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128):
"""
Adapted from Mesh Tensorflow:
https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593
Translate relative position to a bucket number for relative attention. The relative position is defined as
memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for
small absolute relative_position and larger buckets for larger absolute relative_positions. All relative
positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket.
This should allow for more graceful generalization to longer sequences than the model has been trained on
Args:
relative_position: an int32 Tensor - represents the relative position between memory and query
bidirectional: a boolean - indicates if attention is bidirectional or unidirectional
num_buckets: an integer - number of buckets to divide the range of relative positions
max_distance: an integer - maximum allowed distance for relative positions
Returns:
a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
"""
relative_buckets = 0
if bidirectional:
num_buckets //= 2
relative_buckets += (relative_position > 0).to(torch.long) * num_buckets
relative_position = torch.abs(relative_position)
else:
relative_position = -torch.min(relative_position, torch.zeros_like(relative_position))
max_exact = num_buckets // 2
is_small = relative_position < max_exact
relative_position_if_large = max_exact + (
torch.log(relative_position.float() / max_exact)
/ math.log(max_distance / max_exact)
* (num_buckets - max_exact)
).to(torch.long)
relative_position_if_large = torch.min(
relative_position_if_large, torch.full_like(relative_position_if_large, num_buckets - 1)
)
relative_buckets += torch.where(is_small, relative_position, relative_position_if_large)
return relative_buckets
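# A hypothetical worked example with the defaults (bidirectional=True,
# num_buckets=32, max_distance=128): half the buckets cover each sign, offsets
# below max_exact=8 map one-to-one, and larger offsets map logarithmically.
#
#   rel = torch.tensor([0, 1, -7, 8, 200, -300])
#   LongT5Attention._relative_position_bucket(rel)
#   # -> tensor([ 0, 17,  7, 24, 31, 15])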
def compute_bias(self, query_length, key_length, device=None):
"""Compute binned relative position bias"""
if device is None:
device = self.relative_attention_bias.weight.device
context_position = torch.arange(query_length, dtype=torch.long, device=device)[:, None]
memory_position = torch.arange(key_length, dtype=torch.long, device=device)[None, :]
relative_position = memory_position - context_position
relative_position_bucket = self._relative_position_bucket(
relative_position,
bidirectional=(not self.is_decoder),
num_buckets=self.relative_attention_num_buckets,
max_distance=self.relative_attention_max_distance,
)
values = self.relative_attention_bias(relative_position_bucket)
values = values.permute([2, 0, 1]).unsqueeze(0)
return values
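# The returned bias has shape (1, n_heads, query_length, key_length) and is
# added to the raw attention scores before the softmax, one scalar per head and
# relative-position bucket.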
def forward(
self,
hidden_states,
mask=None,
key_value_states=None,
position_bias=None,
past_key_value=None,
layer_head_mask=None,
query_length=None,
use_cache=False,
output_attentions=False,
):
...  # forward body omitted in this excerpt
class LongT5LocalAttention(nn.Module):
def __init__(self, config: LongT5Config, has_relative_attention_bias: bool = False) -> None:
super().__init__()
self.is_decoder = config.is_decoder
self.has_relative_attention_bias = has_relative_attention_bias
self.relative_attention_num_buckets = config.relative_attention_num_buckets
self.relative_attention_max_distance = config.relative_attention_max_distance
self.d_model = config.d_model
self.key_value_proj_dim = config.d_kv
self.n_heads = config.num_heads
self.local_radius = config.local_radius
self.block_len = self.local_radius + 1
self.dropout = config.dropout_rate
self.inner_dim = self.n_heads * self.key_value_proj_dim
self.q = nn.Linear(self.d_model, self.inner_dim, bias=False)
self.k = nn.Linear(self.d_model, self.inner_dim, bias=False)
self.v = nn.Linear(self.d_model, self.inner_dim, bias=False)
self.o = nn.Linear(self.inner_dim, self.d_model, bias=False)
if self.has_relative_attention_bias:
self.relative_attention_bias = nn.Embedding(self.relative_attention_num_buckets, self.n_heads)
self.pruned_heads = set()
self.gradient_checkpointing = False
def prune_heads(self, heads):
if len(heads) == 0:
return
heads, index = find_pruneable_heads_and_indices(
heads, self.n_heads, self.key_value_proj_dim, self.pruned_heads
)
self.q = prune_linear_layer(self.q, index)
self.k = prune_linear_layer(self.k, index)
self.v = prune_linear_layer(self.v, index)
self.o = prune_linear_layer(self.o, index, dim=1)
self.n_heads = self.n_heads - len(heads)
self.inner_dim = self.key_value_proj_dim * self.n_heads
self.pruned_heads = self.pruned_heads.union(heads)
@staticmethod
def _relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128):
"""
Adapted from Mesh Tensorflow:
https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593
Translate relative position to a bucket number for relative attention. The relative position is defined as
memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for
small absolute relative_position and larger buckets for larger absolute relative_positions. All relative
positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket.
This should allow for more graceful generalization to longer sequences than the model has been trained on
Args:
relative_position: an int32 Tensor - represents the relative position between memory and query (memory_position - query_position)
bidirectional: a boolean - indicates if attention is bidirectional or unidirectional
num_buckets: an integer - number of buckets to divide the range of relative positions
max_distance: an integer - maximum allowed distance; relative positions beyond it all map to the same bucket
Returns:
a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
"""
relative_buckets = 0
if bidirectional:
num_buckets //= 2
relative_buckets += (relative_position > 0).to(torch.long) * num_buckets
relative_position = torch.abs(relative_position)
else:
relative_position = -torch.min(relative_position, torch.zeros_like(relative_position))
max_exact = num_buckets // 2
is_small = relative_position < max_exact
relative_position_if_large = max_exact + (
torch.log(relative_position.float() / max_exact)
/ math.log(max_distance / max_exact)
* (num_buckets - max_exact)
).to(torch.long)
relative_position_if_large = torch.min(
relative_position_if_large, torch.full_like(relative_position_if_large, num_buckets - 1)
)
relative_buckets += torch.where(is_small, relative_position, relative_position_if_large)
return relative_buckets
def compute_bias(self, block_length: int):
"""Compute binned relative position bias"""
target_device = (
self.relative_attention_bias.weight.device
if self.relative_attention_bias.weight.device.type != "meta"
else None
)
memory_position = torch.arange(3 * block_length, dtype=torch.long, device=target_device)
context_position = memory_position[block_length:-block_length]
relative_position = memory_position[None, :] - context_position[:, None]
relative_position_bucket = self._relative_position_bucket(
relative_position,
bidirectional=(not self.is_decoder),
num_buckets=self.relative_attention_num_buckets,
max_distance=self.relative_attention_max_distance,
)
values = self.relative_attention_bias(relative_position_bucket)
values = values.permute([2, 0, 1]).unsqueeze(0).unsqueeze(0)
return values
def forward(
self,
hidden_states,
mask=None,
position_bias=None,
layer_head_mask=None,
output_attentions=False,
):
...  # forward body omitted in this excerpt
class LongT5TransientGlobalAttention(nn.Module):
def __init__(self, config: LongT5Config, has_relative_attention_bias: bool = False) -> None:
super().__init__()
self.is_decoder = config.is_decoder
self.has_relative_attention_bias = has_relative_attention_bias
self.relative_attention_num_buckets = config.relative_attention_num_buckets
self.relative_attention_max_distance = config.relative_attention_max_distance
self.d_model = config.d_model
self.key_value_proj_dim = config.d_kv
self.n_heads = config.num_heads
self.local_radius = config.local_radius
self.block_len = self.local_radius + 1
self.global_block_size = config.global_block_size
self.dropout = config.dropout_rate
self.inner_dim = self.n_heads * self.key_value_proj_dim
self.q = nn.Linear(self.d_model, self.inner_dim, bias=False)
self.k = nn.Linear(self.d_model, self.inner_dim, bias=False)
self.v = nn.Linear(self.d_model, self.inner_dim, bias=False)
self.o = nn.Linear(self.inner_dim, self.d_model, bias=False)
if self.has_relative_attention_bias:
self.relative_attention_bias = nn.Embedding(self.relative_attention_num_buckets, self.n_heads)
self.pruned_heads = set()
if self.has_relative_attention_bias:
self.global_relative_attention_bias = nn.Embedding(self.relative_attention_num_buckets, self.n_heads)
self.global_input_layer_norm = LongT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
def prune_heads(self, heads):
if len(heads) == 0:
return
heads, index = find_pruneable_heads_and_indices(
heads, self.n_heads, self.key_value_proj_dim, self.pruned_heads
)
self.q = prune_linear_layer(self.q, index)
self.k = prune_linear_layer(self.k, index)
self.v = prune_linear_layer(self.v, index)
self.o = prune_linear_layer(self.o, index, dim=1)
self.n_heads = self.n_heads - len(heads)
self.inner_dim = self.key_value_proj_dim * self.n_heads
self.pruned_heads = self.pruned_heads.union(heads)
@staticmethod
def _relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128):
"""
Adapted from Mesh Tensorflow:
https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593
Translate relative position to a bucket number for relative attention. The relative position is defined as
memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for
small absolute relative_position and larger buckets for larger absolute relative_positions. All relative
positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket.
This should allow for more graceful generalization to longer sequences than the model has been trained on
Args:
relative_position: an int32 Tensor - represents the relative position between memory and query
bidirectional: a boolean - whether the attention is bidirectional or unidirectional
num_buckets: an integer - number of buckets to quantize relative positions into
max_distance: an integer - maximum distance that a relative position can have
Returns:
a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
"""
relative_buckets = 0
if bidirectional:
num_buckets //= 2
relative_buckets += (relative_position > 0).to(torch.long) * num_buckets
relative_position = torch.abs(relative_position)
else:
relative_position = -torch.min(relative_position, torch.zeros_like(relative_position))
max_exact = num_buckets // 2
is_small = relative_position < max_exact
relative_position_if_large = max_exact + (
torch.log(relative_position.float() / max_exact)
/ math.log(max_distance / max_exact)
* (num_buckets - max_exact)
).to(torch.long)
relative_position_if_large = torch.min(
relative_position_if_large, torch.full_like(relative_position_if_large, num_buckets - 1)
)
relative_buckets += torch.where(is_small, relative_position, relative_position_if_large)
return relative_buckets
def compute_bias(self, block_length: int):
"""Compute binned relative position bias"""
target_device = (
self.relative_attention_bias.weight.device
if self.relative_attention_bias.weight.device.type != "meta"
else None
)
memory_position = torch.arange(3 * block_length, dtype=torch.long, device=target_device)
context_position = memory_position[block_length:-block_length]
relative_position = memory_position[None, :] - context_position[:, None]
relative_position_bucket = self._relative_position_bucket(
relative_position,
bidirectional=(not self.is_decoder),
num_buckets=self.relative_attention_num_buckets,
max_distance=self.relative_attention_max_distance,
)
values = self.relative_attention_bias(relative_position_bucket)
values = values.permute([2, 0, 1]).unsqueeze(0).unsqueeze(0)
return values
def compute_side_bias(self, mask: torch.Tensor, global_segment_ids: torch.Tensor) -> torch.Tensor:
side_attention_mask = torch.eq(mask[..., None], global_segment_ids[:, None, :])[:, None, ...]
attention_side_bias = torch.where(side_attention_mask > 0, 0.0, -1e10)
side_relative_position = _make_side_relative_position_ids(mask, self.global_block_size)
side_relative_position_bucket = self._relative_position_bucket(
side_relative_position,
bidirectional=(not self.is_decoder),
num_buckets=self.relative_attention_num_buckets,
max_distance=self.relative_attention_max_distance,
)
side_bias = self.global_relative_attention_bias(side_relative_position_bucket)
side_bias = side_bias.permute([0, 3, 1, 2])
attention_side_bias = attention_side_bias + side_bias
return attention_side_bias
def forward(
self,
hidden_states,
mask=None,
position_bias=None,
layer_head_mask=None,
output_attentions=False,
):
...  # forward body omitted in this excerpt
class LongT5LayerSelfAttention(nn.Module):
def __init__(self, config, has_relative_attention_bias=False):
super().__init__()
self.SelfAttention = LongT5Attention(config, has_relative_attention_bias=has_relative_attention_bias)
self.layer_norm = LongT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
self.dropout = nn.Dropout(config.dropout_rate)
def forward(
self,
hidden_states,
attention_mask=None,
position_bias=None,
layer_head_mask=None,
past_key_value=None,
use_cache=False,
output_attentions=False,
):
normed_hidden_states = self.layer_norm(hidden_states)
attention_output = self.SelfAttention(
normed_hidden_states,
mask=attention_mask,
position_bias=position_bias,
layer_head_mask=layer_head_mask,
past_key_value=past_key_value,
use_cache=use_cache,
output_attentions=output_attentions,
)
hidden_states = hidden_states + self.dropout(attention_output[0])
outputs = (hidden_states,) + attention_output[1:]
return outputs
class LongT5LayerLocalSelfAttention(nn.Module):
"""用于编码器的局部自注意力"""
def __init__(self, config, has_relative_attention_bias=False):
super().__init__()
self.LocalSelfAttention = LongT5LocalAttention(config, has_relative_attention_bias=has_relative_attention_bias)
self.layer_norm = LongT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
self.dropout = nn.Dropout(config.dropout_rate)
def forward(
self,
hidden_states,
attention_mask=None,
position_bias=None,
layer_head_mask=None,
output_attentions=False,
**kwargs: Any,
):
normed_hidden_states = self.layer_norm(hidden_states)
attention_output = self.LocalSelfAttention(
normed_hidden_states,
mask=attention_mask,
position_bias=position_bias,
layer_head_mask=layer_head_mask,
output_attentions=output_attentions,
)
hidden_states = hidden_states + self.dropout(attention_output[0])
outputs = (hidden_states,) + attention_output[1:]
return outputs
class LongT5LayerTransientGlobalSelfAttention(nn.Module):
"""用于编码器的瞬时全局自注意力"""
def __init__(self, config, has_relative_attention_bias=False):
super().__init__()
self.TransientGlobalSelfAttention = LongT5TransientGlobalAttention(
config, has_relative_attention_bias=has_relative_attention_bias
)
self.layer_norm = LongT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
self.dropout = nn.Dropout(config.dropout_rate)
def forward(
self,
hidden_states,
attention_mask=None,
position_bias=None,
layer_head_mask=None,
output_attentions=False,
**kwargs: Any,
):
normed_hidden_states = self.layer_norm(hidden_states)
attention_output = self.TransientGlobalSelfAttention(
normed_hidden_states,
mask=attention_mask,
position_bias=position_bias,
layer_head_mask=layer_head_mask,
output_attentions=output_attentions,
)
hidden_states = hidden_states + self.dropout(attention_output[0])
outputs = (hidden_states,) + attention_output[1:]
return outputs
class LongT5LayerCrossAttention(nn.Module):
def __init__(self, config):
super().__init__()
self.EncDecAttention = LongT5Attention(config, has_relative_attention_bias=False)
self.layer_norm = LongT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
self.dropout = nn.Dropout(config.dropout_rate)
def forward(
self,
hidden_states,
key_value_states,
attention_mask=None,
position_bias=None,
layer_head_mask=None,
past_key_value=None,
use_cache=False,
query_length=None,
output_attentions=False,
):
normed_hidden_states = self.layer_norm(hidden_states)
attention_output = self.EncDecAttention(
normed_hidden_states,
mask=attention_mask,
key_value_states=key_value_states,
position_bias=position_bias,
layer_head_mask=layer_head_mask,
past_key_value=past_key_value,
use_cache=use_cache,
query_length=query_length,
output_attentions=output_attentions,
)
layer_output = hidden_states + self.dropout(attention_output[0])
outputs = (layer_output,) + attention_output[1:]
return outputs
class LongT5Block(nn.Module):
def __init__(self, config, has_relative_attention_bias=False):
super().__init__()
self.is_decoder = config.is_decoder
if config.is_decoder:
attention_layer = LongT5LayerSelfAttention
elif config.encoder_attention_type == "local":
attention_layer = LongT5LayerLocalSelfAttention
elif config.encoder_attention_type == "transient-global":
attention_layer = LongT5LayerTransientGlobalSelfAttention
else:
raise ValueError(
"For encoder attention mechanism, either `local` or `transient-global` attention type is expected, "
f"but got {config.encoder_attention_type}."
)
self.layer = nn.ModuleList()
self.layer.append(attention_layer(config, has_relative_attention_bias=has_relative_attention_bias))
if self.is_decoder:
self.layer.append(LongT5LayerCrossAttention(config))
self.layer.append(LongT5LayerFF(config))
def forward(
self,
hidden_states,
attention_mask=None,
position_bias=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
encoder_decoder_position_bias=None,
layer_head_mask=None,
cross_attn_layer_head_mask=None,
past_key_value=None,
use_cache=False,
output_attentions=False,
return_dict=True,
):
# Dispatch to each sub-layer in turn: self-attention first, then (for decoder
# blocks) cross-attention, and finally the feed-forward layer. This is a
# simplified version of the dispatch; the full implementation also splits
# `past_key_value` between self- and cross-attention and clamps fp16 overflows.
self_attention_outputs = self.layer[0](
hidden_states,
attention_mask=attention_mask,
position_bias=position_bias,
layer_head_mask=layer_head_mask,
past_key_value=past_key_value,
use_cache=use_cache,
output_attentions=output_attentions,
)
hidden_states = self_attention_outputs[0]
attention_outputs = self_attention_outputs[1:]
if self.is_decoder and encoder_hidden_states is not None:
cross_attention_outputs = self.layer[1](
hidden_states,
key_value_states=encoder_hidden_states,
attention_mask=encoder_attention_mask,
position_bias=encoder_decoder_position_bias,
layer_head_mask=cross_attn_layer_head_mask,
use_cache=use_cache,
output_attentions=output_attentions,
)
hidden_states = cross_attention_outputs[0]
attention_outputs = attention_outputs + cross_attention_outputs[1:]
hidden_states = self.layer[-1](hidden_states)
return (hidden_states,) + attention_outputs
"""
模型的基类,LongT5PreTrainedModel,继承自父类 T5PreTrainedModel,并针对 LongT5 进行特定设置和调整。
"""
config_class = LongT5Config
base_model_prefix = "transformer"
supports_gradient_checkpointing = True
_no_split_modules = ["LongT5Block"]
@property
def dummy_inputs(self):
input_ids = torch.tensor(DUMMY_INPUTS)
input_mask = torch.tensor(DUMMY_MASK)
dummy_inputs = {
"decoder_input_ids": input_ids,
"input_ids": input_ids,
"decoder_attention_mask": input_mask,
}
return dummy_inputs
def _shift_right(self, input_ids):
decoder_start_token_id = self.config.decoder_start_token_id
pad_token_id = self.config.pad_token_id
if decoder_start_token_id is None:
raise ValueError(
"self.model.config.decoder_start_token_id has to be defined. In LongT5 it is usually set to the pad_token_id. "
"See LongT5 docs for more information."
)
if is_torch_fx_proxy(input_ids):
shifted_input_ids = torch.full(input_ids.shape[:-1] + (1,), decoder_start_token_id)
shifted_input_ids = torch.cat([shifted_input_ids, input_ids[..., :-1]], dim=-1)
else:
shifted_input_ids = input_ids.new_zeros(input_ids.shape)
shifted_input_ids[..., 1:] = input_ids[..., :-1].clone()
shifted_input_ids[..., 0] = decoder_start_token_id
if pad_token_id is None:
raise ValueError("self.model.config.pad_token_id has to be defined.")
shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)
return shifted_input_ids
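# Hypothetical example: labels are shifted one step to the right, the
# decoder_start_token_id (the pad id, 0, for LongT5 checkpoints) is prepended,
# and any -100 ignore index remaining after the shift is replaced by the pad token.
#
#   labels                    -> [[ 42,  43,  44, -100]]
#   self._shift_right(labels) -> [[  0,  42,  43,  44]]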
class LongT5Stack(LongT5PreTrainedModel):
def __init__(self, config, embed_tokens=None):
super().__init__(config)
self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model)
if embed_tokens is not None:
self.embed_tokens.weight = embed_tokens.weight
self.is_decoder = config.is_decoder
self.local_radius = config.local_radius
self.block_len = self.local_radius + 1
self.block = nn.ModuleList(
[LongT5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)]
)
self.final_layer_norm = LongT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
self.dropout = nn.Dropout(config.dropout_rate)
self.gradient_checkpointing = False
self.post_init()
def get_input_embeddings(self):
return self.embed_tokens
def set_input_embeddings(self, new_embeddings):
self.embed_tokens = new_embeddings
def forward(
self,
input_ids=None,
attention_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
inputs_embeds=None,
head_mask=None,
cross_attn_head_mask=None,
past_key_values=None,
use_cache=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
):
...  # forward body omitted in this excerpt
LONGT5_START_DOCSTRING = r"""
The LongT5 model was proposed in [LongT5: Efficient Text-To-Text Transformer for Long
Sequences](https://arxiv.org/abs/2112.07916) by Mandy Guo, Joshua Ainslie, David Uthus, Santiago Ontanon, Jianmo
Ni, Yun-Hsuan Sung and Yinfei Yang. It's an encoder-decoder transformer pre-trained in a text-to-text denoising
generative setting. LongT5 model is an extension of T5 model, and it enables using one of the two different
efficient attention mechanisms - (1) Local attention, or (2) Transient-Global attention.
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`LongT5Config`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
LONGT5_INPUTS_DOCSTRING = r"""
"""
LONGT5_ENCODER_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. LongT5 is a model with relative position embeddings so you
should be able to pad the inputs on both the right and the left.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for detail.
To know more on how to prepare `input_ids` for pretraining take a look at [LONGT5
Training](./longt5#training).
attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This is
useful if you want more control over how to convert `input_ids` indices into associated vectors than the
model's internal embedding lookup matrix.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more
detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
__HEAD_MASK_WARNING_MSG = """
The input argument `head_mask` was split into two arguments `head_mask` and `decoder_head_mask`. Currently,
`decoder_head_mask` is set to copy `head_mask`, but this feature is deprecated and will be removed in future versions.
If you do not want to use any `decoder_head_mask` now, please set `decoder_head_mask = torch.ones(num_layers, num_heads)`.
"""
@add_start_docstrings(
"The bare LONGT5 Model transformer outputting raw hidden-states without any specific head on top.",
LONGT5_START_DOCSTRING,
)
class LongT5Model(LongT5PreTrainedModel):
_keys_to_ignore_on_load_unexpected = [
r"decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight",
]
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
def __init__(self, config: LongT5Config):
super().__init__(config)
self.shared = nn.Embedding(config.vocab_size, config.d_model)
encoder_config = copy.deepcopy(config)
encoder_config.is_decoder = False
encoder_config.use_cache = False
encoder_config.is_encoder_decoder = False
self.encoder = LongT5Stack(encoder_config, self.shared)
decoder_config = copy.deepcopy(config)
decoder_config.is_decoder = True
decoder_config.is_encoder_decoder = False
decoder_config.num_layers = config.num_decoder_layers
self.decoder = LongT5Stack(decoder_config, self.shared)
self.post_init()
def get_input_embeddings(self):
return self.shared
def set_input_embeddings(self, new_embeddings):
self.shared = new_embeddings
self.encoder.set_input_embeddings(new_embeddings)
self.decoder.set_input_embeddings(new_embeddings)
def _tie_weights(self):
if self.config.tie_word_embeddings:
self._tie_or_clone_weights(self.encoder.embed_tokens, self.shared)
self._tie_or_clone_weights(self.decoder.embed_tokens, self.shared)
def get_encoder(self):
return self.encoder
def get_decoder(self):
return self.decoder
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
@add_start_docstrings_to_model_forward(LONGT5_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=Seq2SeqModelOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
decoder_input_ids: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.BoolTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
decoder_head_mask: Optional[torch.FloatTensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
inputs_embeds: Optional[torch.Tensor] = None,
decoder_inputs_embeds: Optional[torch.Tensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.FloatTensor], Seq2SeqModelOutput]:
...  # forward body omitted in this excerpt
@add_start_docstrings("""LONGT5 Model with a `language modeling` head on top.""", LONGT5_START_DOCSTRING)
class LongT5ForConditionalGeneration(LongT5PreTrainedModel):
_keys_to_ignore_on_load_unexpected = [
r"decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight",
]
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"]
def __init__(self, config: LongT5Config):
super().__init__(config)
self.model_dim = config.d_model
self.shared = nn.Embedding(config.vocab_size, config.d_model)
encoder_config = copy.deepcopy(config)
encoder_config.is_decoder = False
encoder_config.use_cache = False
encoder_config.is_encoder_decoder = False
self.encoder = LongT5Stack(encoder_config, self.shared)
decoder_config = copy.deepcopy(config)
decoder_config.is_decoder = True
decoder_config.is_encoder_decoder = False
decoder_config.num_layers = config.num_decoder_layers
self.decoder = LongT5Stack(decoder_config, self.shared)
self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)
self.post_init()
def get_input_embeddings(self):
return self.shared
def set_input_embeddings(self, new_embeddings):
self.shared = new_embeddings
self.encoder.set_input_embeddings(new_embeddings)
self.decoder.set_input_embeddings(new_embeddings)
def _tie_weights(self):
if self.config.tie_word_embeddings:
self._tie_or_clone_weights(self.encoder.embed_tokens, self.shared)
self._tie_or_clone_weights(self.decoder.embed_tokens, self.shared)
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings
def get_output_embeddings(self):
return self.lm_head
def get_encoder(self):
return self.encoder
def get_decoder(self):
return self.decoder
@add_start_docstrings_to_model_forward(LONGT5_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
decoder_input_ids: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.BoolTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
decoder_head_mask: Optional[torch.FloatTensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None,
past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.FloatTensor], Seq2SeqLMOutput]:
...  # forward body omitted in this excerpt
def prepare_inputs_for_generation(
self,
input_ids,
past_key_values=None,
attention_mask=None,
head_mask=None,
decoder_head_mask=None,
cross_attn_head_mask=None,
use_cache=None,
encoder_outputs=None,
**kwargs,
):
# Cut decoder_input_ids when past key values are used: only the new tokens
# need to be fed through the decoder.
if past_key_values is not None:
past_length = past_key_values[0][0].shape[2]
if input_ids.shape[1] > past_length:
remove_prefix_length = past_length
else:
remove_prefix_length = input_ids.shape[1] - 1
input_ids = input_ids[:, remove_prefix_length:]
return {
"decoder_input_ids": input_ids,
"past_key_values": past_key_values,
"encoder_outputs": encoder_outputs,
"attention_mask": attention_mask,
"head_mask": head_mask,
"decoder_head_mask": decoder_head_mask,
"cross_attn_head_mask": cross_attn_head_mask,
"use_cache": use_cache,
}
def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
return self._shift_right(labels)
def _reorder_cache(self, past_key_values, beam_idx):
if past_key_values is None:
logger.warning("You might want to consider setting `use_cache=True` to speed up decoding")
return past_key_values
reordered_decoder_past = ()
for layer_past_states in past_key_values:
reordered_layer_past_states = ()
for layer_past_state in layer_past_states:
reordered_layer_past_states = reordered_layer_past_states + (
layer_past_state.index_select(0, beam_idx.to(layer_past_state.device)),
)
assert reordered_layer_past_states[0].shape == layer_past_states[0].shape
assert len(reordered_layer_past_states) == len(layer_past_states)
reordered_decoder_past = reordered_decoder_past + (reordered_layer_past_states,)
return reordered_decoder_past
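# Hypothetical note: during beam search, `beam_idx` picks which batch rows of
# every cached key/value tensor survive the beam reshuffle, e.g.
# beam_idx = torch.tensor([0, 0, 2]) duplicates beam 0 and keeps beam 2.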
class LongT5EncoderModel(LongT5PreTrainedModel):
_tied_weights_keys = ["encoder.embed_tokens.weight"]
_keys_to_ignore_on_load_unexpected = [r"decoder"]
def __init__(self, config: LongT5Config):
super().__init__(config)
self.shared = nn.Embedding(config.vocab_size, config.d_model)
encoder_config = copy.deepcopy(config)
encoder_config.use_cache = False
encoder_config.is_encoder_decoder = False
self.encoder = LongT5Stack(encoder_config, self.shared)
self.post_init()
def get_input_embeddings(self):
return self.shared
def set_input_embeddings(self, new_embeddings):
self.shared = new_embeddings
self.encoder.set_input_embeddings(new_embeddings)
def _tie_weights(self):
if self.config.tie_word_embeddings:
self._tie_or_clone_weights(self.encoder.embed_tokens, self.shared)
def get_encoder(self):
return self.encoder
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
@add_start_docstrings_to_model_forward(LONGT5_ENCODER_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=BaseModelOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.FloatTensor], BaseModelOutput]:
r"""
返回一个元组,包含 torch.FloatTensor 或 BaseModelOutput 类型的对象。
Returns:
返回模型的输出结果。
Example:
示例代码展示如何使用这个模型:
```
>>> from transformers import AutoTokenizer, LongT5ForConditionalGeneration
>>> tokenizer = AutoTokenizer.from_pretrained("google/long-t5-local-base")
>>> model = LongT5EncoderModel.from_pretrained("google/long-t5-local-base")
>>> input_ids = tokenizer(
... 100 * "Studies have been shown that owning a dog is good for you ", return_tensors="pt"
... ).input_ids # Batch size 1
>>> outputs = model(input_ids=input_ids)
>>> last_hidden_states = outputs.last_hidden_state
```
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
encoder_outputs = self.encoder(
input_ids=input_ids,
attention_mask=attention_mask,
inputs_embeds=inputs_embeds,
head_mask=head_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
return encoder_outputs
.\models\longt5\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_flax_available, is_torch_available
_import_structure = {
"configuration_longt5": ["LONGT5_PRETRAINED_CONFIG_ARCHIVE_MAP", "LongT5Config", "LongT5OnnxConfig"],
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_longt5"] = [
"LONGT5_PRETRAINED_MODEL_ARCHIVE_LIST",
"LongT5EncoderModel",
"LongT5ForConditionalGeneration",
"LongT5Model",
"LongT5PreTrainedModel",
]
try:
if not is_flax_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_flax_longt5"] = [
"FlaxLongT5ForConditionalGeneration",
"FlaxLongT5Model",
"FlaxLongT5PreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_longt5 import LONGT5_PRETRAINED_CONFIG_ARCHIVE_MAP, LongT5Config, LongT5OnnxConfig
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_longt5 import (
LONGT5_PRETRAINED_MODEL_ARCHIVE_LIST,
LongT5EncoderModel,
LongT5ForConditionalGeneration,
LongT5Model,
LongT5PreTrainedModel,
)
try:
if not is_flax_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_flax_longt5 import (
FlaxLongT5ForConditionalGeneration,
FlaxLongT5Model,
FlaxLongT5PreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\luke\configuration_luke.py
""" LUKE configuration"""
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
LUKE_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"studio-ousia/luke-base": "https://huggingface.co/studio-ousia/luke-base/resolve/main/config.json",
"studio-ousia/luke-large": "https://huggingface.co/studio-ousia/luke-large/resolve/main/config.json",
}
class LukeConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`LukeModel`]. It is used to instantiate a LUKE
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the LUKE
[studio-ousia/luke-base](https://huggingface.co/studio-ousia/luke-base) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Examples:
```
>>> from transformers import LukeConfig, LukeModel
>>> # Initializing a LUKE configuration
>>> configuration = LukeConfig()
>>> # Initializing a model from the configuration
>>> model = LukeModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
model_type = "luke"
def __init__(
self,
vocab_size=50267,
entity_vocab_size=500000,
hidden_size=768,
entity_emb_size=256,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=2,
initializer_range=0.02,
layer_norm_eps=1e-12,
use_entity_aware_attention=True,
classifier_dropout=None,
pad_token_id=1,
bos_token_id=0,
eos_token_id=2,
**kwargs,
):
"""
Initialize a LUKE configuration with default values.
Args:
vocab_size (int): Size of the token vocabulary.
entity_vocab_size (int): Size of the entity vocabulary.
hidden_size (int): Size of the encoder layers and the pooler layer.
entity_emb_size (int): Dimensionality of the entity embeddings.
num_hidden_layers (int): Number of hidden layers in the Transformer encoder.
num_attention_heads (int): Number of attention heads for each attention layer in the Transformer encoder.
intermediate_size (int): Size of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
hidden_act (str): The non-linear activation function (function or string) in the encoder and pooler.
hidden_dropout_prob (float): The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob (float): The dropout ratio for the attention probabilities.
max_position_embeddings (int): The maximum sequence length that this model might ever be used with.
type_vocab_size (int): The vocabulary size of the "type" (i.e., token type IDs) embeddings.
initializer_range (float): The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (float): The epsilon used by the layer normalization layers.
use_entity_aware_attention (bool): Whether to use entity-aware attention in the model.
classifier_dropout (float or None): The dropout probability for the classifier layer (None means no dropout).
pad_token_id (int): The ID of the padding token in the token vocabulary.
bos_token_id (int): The ID of the beginning-of-sequence token in the token vocabulary.
eos_token_id (int): The ID of the end-of-sequence token in the token vocabulary.
**kwargs: Additional configuration arguments.
"""
super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
self.vocab_size = vocab_size
self.entity_vocab_size = entity_vocab_size
self.hidden_size = hidden_size
self.entity_emb_size = entity_emb_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.hidden_act = hidden_act
self.intermediate_size = intermediate_size
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.type_vocab_size = type_vocab_size
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.use_entity_aware_attention = use_entity_aware_attention
self.classifier_dropout = classifier_dropout
.\models\luke\convert_luke_original_pytorch_checkpoint_to_pytorch.py
"""Convert LUKE checkpoint."""
import argparse
import json
import os
import torch
from transformers import LukeConfig, LukeModel, LukeTokenizer, RobertaTokenizer
from transformers.tokenization_utils_base import AddedToken
@torch.no_grad()
def convert_luke_checkpoint(checkpoint_path, metadata_path, entity_vocab_path, pytorch_dump_folder_path, model_size):
with open(metadata_path) as metadata_file:
metadata = json.load(metadata_file)
config = LukeConfig(use_entity_aware_attention=True, **metadata["model_config"])
state_dict = torch.load(checkpoint_path, map_location="cpu")
entity_vocab = load_entity_vocab(entity_vocab_path)
tokenizer = RobertaTokenizer.from_pretrained(metadata["model_config"]["bert_model_name"])
entity_token_1 = AddedToken("<ent>", lstrip=False, rstrip=False)
entity_token_2 = AddedToken("<ent2>", lstrip=False, rstrip=False)
tokenizer.add_special_tokens({"additional_special_tokens": [entity_token_1, entity_token_2]})
config.vocab_size += 2
print(f"Saving tokenizer to {pytorch_dump_folder_path}")
tokenizer.save_pretrained(pytorch_dump_folder_path)
with open(os.path.join(pytorch_dump_folder_path, LukeTokenizer.vocab_files_names["entity_vocab_file"]), "w") as f:
json.dump(entity_vocab, f)
tokenizer = LukeTokenizer.from_pretrained(pytorch_dump_folder_path)
word_emb = state_dict["embeddings.word_embeddings.weight"]
ent_emb = word_emb[tokenizer.convert_tokens_to_ids(["@"])[0]].unsqueeze(0)
ent2_emb = word_emb[tokenizer.convert_tokens_to_ids(["#"])[0]].unsqueeze(0)
state_dict["embeddings.word_embeddings.weight"] = torch.cat([word_emb, ent_emb, ent2_emb])
for layer_index in range(config.num_hidden_layers):
for matrix_name in ["query.weight", "query.bias"]:
prefix = f"encoder.layer.{layer_index}.attention.self."
state_dict[prefix + "w2e_" + matrix_name] = state_dict[prefix + matrix_name]
state_dict[prefix + "e2w_" + matrix_name] = state_dict[prefix + matrix_name]
state_dict[prefix + "e2e_" + matrix_name] = state_dict[prefix + matrix_name]
entity_emb = state_dict["entity_embeddings.entity_embeddings.weight"]
entity_emb[entity_vocab["[MASK2]"]] = entity_emb[entity_vocab["[MASK]"]]
model = LukeModel(config=config).eval()
missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)
if not (len(missing_keys) == 1 and missing_keys[0] == "embeddings.position_ids"):
raise ValueError(f"Missing keys {', '.join(missing_keys)}. Expected only missing embeddings.position_ids")
if not (all(key.startswith("entity_predictions") or key.startswith("lm_head") for key in unexpected_keys)):
raise ValueError(
"Unexpected keys"
f" {', '.join([key for key in unexpected_keys if not (key.startswith('entity_predictions') or key.startswith('lm_head'))])}"
)
tokenizer = LukeTokenizer.from_pretrained(pytorch_dump_folder_path, task="entity_classification")
text = (
"Top seed Ana Ivanovic said on Thursday she could hardly believe her luck as a fortuitous netcord helped the"
" new world number one avoid a humiliating second- round exit at Wimbledon ."
)
span = (39, 42)
encoding = tokenizer(text, entity_spans=[span], add_prefix_space=True, return_tensors="pt")
outputs = model(**encoding)
if model_size == "large":
expected_shape = torch.Size((1, 42, 1024))
expected_slice = torch.tensor(
[[0.0133, 0.0865, 0.0095], [0.3093, -0.2576, -0.7418], [-0.1720, -0.2117, -0.2869]]
)
else:
expected_shape = torch.Size((1, 42, 768))
expected_slice = torch.tensor([[0.0037, 0.1368, -0.0091], [0.1099, 0.3329, -0.1095], [0.0765, 0.5335, 0.1179]])
if not (outputs.last_hidden_state.shape == expected_shape):
raise ValueError(
f"Outputs.last_hidden_state.shape is {outputs.last_hidden_state.shape}, Expected shape is {expected_shape}"
)
if not torch.allclose(outputs.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4):
raise ValueError
if model_size == "large":
expected_shape = torch.Size((1, 1, 1024))
expected_slice = torch.tensor([[0.0466, -0.0106, -0.0179]])
else:
expected_shape = torch.Size((1, 1, 768))
expected_slice = torch.tensor([[0.1457, 0.1044, 0.0174]])
if not (outputs.entity_last_hidden_state.shape == expected_shape):
raise ValueError(
f"Outputs.entity_last_hidden_state.shape is {outputs.entity_last_hidden_state.shape}, Expected shape is"
f" {expected_shape}"
)
if not torch.allclose(outputs.entity_last_hidden_state[0, :3, :3], expected_slice, atol=1e-4):
raise ValueError
print("Saving PyTorch model to {}".format(pytorch_dump_folder_path))
model.save_pretrained(pytorch_dump_folder_path)
def load_entity_vocab(entity_vocab_path):
entity_vocab = {}
with open(entity_vocab_path, "r", encoding="utf-8") as f:
for index, line in enumerate(f):
title, _ = line.rstrip().split("\t")
entity_vocab[title] = index
return entity_vocab
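A quick sketch of how this loader behaves, assuming the `title<TAB>count` row format used by the released LUKE entity vocabulary (the file contents below are made up for illustration):
```
# Hypothetical two-row entity_vocab.tsv; the count column is discarded by the loader.
import os
import tempfile

with tempfile.NamedTemporaryFile("w", suffix=".tsv", delete=False, encoding="utf-8") as f:
    f.write("[PAD]\t0\n[MASK]\t12345\n")
    tsv_path = f.name

print(load_entity_vocab(tsv_path))  # {'[PAD]': 0, '[MASK]': 1}
os.remove(tsv_path)
```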
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--checkpoint_path", type=str, help="Path to a pytorch_model.bin file.")
parser.add_argument(
"--metadata_path", default=None, type=str, help="Path to a metadata.json file, defining the configuration."
)
parser.add_argument(
"--entity_vocab_path",
default=None,
type=str,
help="Path to an entity_vocab.tsv file, containing the entity vocabulary.",
)
parser.add_argument(
"--pytorch_dump_folder_path", default=None, type=str, help="Path to where to dump the output PyTorch model."
)
parser.add_argument(
"--model_size", default="base", type=str, choices=["base", "large"], help="Size of the model to be converted."
)
args = parser.parse_args()
convert_luke_checkpoint(
args.checkpoint_path,
args.metadata_path,
args.entity_vocab_path,
args.pytorch_dump_folder_path,
args.model_size,
)
.\models\luke\modeling_luke.py
"""PyTorch LUKE model."""
import math
from dataclasses import dataclass
from typing import Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN, gelu
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import apply_chunking_to_forward
from ...utils import (
ModelOutput,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_luke import LukeConfig
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "LukeConfig"
_CHECKPOINT_FOR_DOC = "studio-ousia/luke-base"
LUKE_PRETRAINED_MODEL_ARCHIVE_LIST = [
"studio-ousia/luke-base",
"studio-ousia/luke-large",
]
@dataclass
class BaseLukeModelOutputWithPooling(BaseModelOutputWithPooling):
"""
Base class for outputs of the LUKE model.
"""
Args:
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
模型最后一层输出的隐藏状态序列。
entity_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, entity_length, hidden_size)`):
实体的最后一层隐藏状态序列。
pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
经过线性层和Tanh激活函数处理过的序列中第一个标记(分类标记)的最后一层隐藏状态。
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
元组,包含模型每一层的隐藏状态(从嵌入层开始,每层一个张量),形状为 `(batch_size, sequence_length, hidden_size)`。
当 `output_hidden_states=True` 时返回。
entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
元组,包含实体的每一层隐藏状态(从嵌入层开始,每层一个张量),形状为 `(batch_size, entity_length, hidden_size)`。
当 `output_hidden_states=True` 时返回。
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
元组,包含每一层的注意力权重张量,形状为 `(batch_size, num_heads, sequence_length + entity_length, sequence_length + entity_length)`。
注意力权重经过 softmax 处理,用于计算自注意力头中的加权平均值。
entity_last_hidden_state: torch.FloatTensor = None
entity_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
@dataclass
class BaseLukeModelOutput(BaseModelOutput):
"""
Base class for model's outputs, with potential hidden states and attentions.
Args:
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
entity_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, entity_length, hidden_size)`):
Sequence of entity hidden-states at the output of the last layer of the model.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
layer plus the initial entity embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
entity_last_hidden_state: torch.FloatTensor = None
entity_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
@dataclass
class LukeMaskedLMOutput(ModelOutput):
    """
    Base class for model's outputs for masked language modeling and masked entity prediction.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            The sum of masked language modeling (MLM) loss and entity prediction loss.
        mlm_loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Masked language modeling (MLM) loss.
        mep_loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Masked entity prediction (MEP) loss.
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        entity_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the entity prediction head (scores for each entity vocabulary token before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
            plus the initial embedding outputs.
        entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
            layer plus the initial entity embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
            the self-attention heads.
    """

    loss: Optional[torch.FloatTensor] = None
    mlm_loss: Optional[torch.FloatTensor] = None
    mep_loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    entity_logits: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    entity_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
@dataclass
class EntityClassificationOutput(ModelOutput):
"""
    Outputs for entity classification models.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Classification loss.
        logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
            Classification scores (before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
            plus the initial embedding outputs.
        entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
            layer plus the initial entity embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
            the self-attention heads.
"""
loss: Optional[torch.FloatTensor] = None
logits: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
entity_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
"""
Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
分类损失。
如果提供了 `labels` 参数,则返回分类损失。
logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
分类分数(SoftMax 之前的输出)。
形状为 `(batch_size, config.num_labels)` 的分类分数。
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
模型隐藏状态,包括每一层的输出和初始嵌入的输出。
形状为 `(batch_size, sequence_length, hidden_size)` 的元组,第一个元素是嵌入的输出,后续元素是每一层的输出。
entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
实体的隐藏状态,包括每一层的输出和初始实体嵌入的输出。
形状为 `(batch_size, entity_length, hidden_size)` 的元组,第一个元素是实体嵌入的输出,后续元素是每一层的输出。
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
注意力权重,用于计算自注意力头中的加权平均值。
形状为 `(batch_size, num_heads, sequence_length, sequence_length)` 的元组,每个元素对应一个层的注意力权重。
"""
loss: Optional[torch.FloatTensor] = None
logits: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
entity_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
@dataclass
class EntitySpanClassificationOutput(ModelOutput):
"""
实体跨度分类模型的输出结果。
Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, 当提供 `labels` 时返回):
分类损失值。
logits (`torch.FloatTensor` of shape `(batch_size, entity_length, config.num_labels)`):
分类分数(SoftMax 之前的)。
hidden_states (`tuple(torch.FloatTensor)`, *optional*, 当 `output_hidden_states=True` 或 `config.output_hidden_states=True` 时返回):
由两部分组成的元组 `torch.FloatTensor` (一个用于嵌入的输出 + 一个用于每层输出),形状为 `(batch_size, sequence_length, hidden_size)`。
模型在每层输出结束时的隐藏状态,加上初始嵌入输出。
entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, 当 `output_hidden_states=True` 或 `config.output_hidden_states=True` 时返回):
由两部分组成的元组 `torch.FloatTensor` (一个用于嵌入的输出 + 一个用于每层输出),形状为 `(batch_size, entity_length, hidden_size)`。
模型在每层输出结束时实体的隐藏状态,加上初始实体嵌入输出。
attentions (`tuple(torch.FloatTensor)`, *optional*, 当 `output_attentions=True` 或 `config.output_attentions=True` 时返回):
由每一层的 `torch.FloatTensor` 组成的元组,形状为 `(batch_size, num_heads, sequence_length, sequence_length)`。
注意力 softmax 后的注意力权重,用于计算自注意力头中的加权平均值。
"""
@dataclass
class LukeSequenceClassifierOutput(ModelOutput):
"""
句子分类模型的输出结果。
"""
"""
Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
分类(或回归,如果 `config.num_labels==1`)的损失值。
如果提供 `labels`,则返回损失值。
logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
分类(或回归,如果 `config.num_labels==1`)的分数(SoftMax 之前)。
模型的输出分数。
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
模型在每一层的隐藏状态以及可选的初始嵌入输出的元组。
形状为 `(batch_size, sequence_length, hidden_size)`。
entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
实体的隐藏状态,包括每一层的输出以及初始实体嵌入输出的元组。
形状为 `(batch_size, entity_length, hidden_size)`。
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
注意力权重的元组,每个层级一个。
形状为 `(batch_size, num_heads, sequence_length, sequence_length)`。
在自注意力机制中用于计算加权平均值的注意力权重。
"""
loss: Optional[torch.FloatTensor] = None
logits: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
entity_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
@dataclass
class LukeTokenClassifierOutput(ModelOutput):
"""
Base class for outputs of token classification models.
Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Classification loss.
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`):
            Classification scores (before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of
            the model at the output of each layer plus the optional initial embedding outputs.
        entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
            layer plus the initial entity embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
            the self-attention heads.
"""
loss: Optional[torch.FloatTensor] = None
logits: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
entity_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
@dataclass
class LukeQuestionAnsweringModelOutput(ModelOutput):
"""
Outputs of question answering models.
"""
loss: Optional[torch.FloatTensor] = None
start_logits: torch.FloatTensor = None
end_logits: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
entity_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
@dataclass
class LukeMultipleChoiceModelOutput(ModelOutput):
"""
    Outputs for multiple choice models.

    Args:
        loss (`torch.FloatTensor` of shape *(1,)*, *optional*, returned when `labels` is provided):
            Classification loss.
        logits (`torch.FloatTensor` of shape `(batch_size, num_choices)`):
            `num_choices` is the second dimension of the input tensors (see `input_ids` above). Classification scores
            (before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of
            the model at the output of each layer plus the optional initial embedding outputs.
        entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
            layer plus the initial entity embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
            the self-attention heads.
"""
loss: Optional[torch.FloatTensor] = None
logits: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
entity_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
class LukeEmbeddings(nn.Module):
"""
与 BertEmbeddings 类似,但稍作修改以支持位置嵌入索引。
"""
def __init__(self, config):
super().__init__()
self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.padding_idx = config.pad_token_id
self.position_embeddings = nn.Embedding(
config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
)
def forward(
self,
input_ids=None,
token_type_ids=None,
position_ids=None,
inputs_embeds=None,
):
if position_ids is None:
if input_ids is not None:
position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx).to(input_ids.device)
else:
position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)
if input_ids is not None:
input_shape = input_ids.size()
else:
input_shape = inputs_embeds.size()[:-1]
if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=position_ids.device)
if inputs_embeds is None:
inputs_embeds = self.word_embeddings(input_ids)
position_embeddings = self.position_embeddings(position_ids)
token_type_embeddings = self.token_type_embeddings(token_type_ids)
embeddings = inputs_embeds + position_embeddings + token_type_embeddings
embeddings = self.LayerNorm(embeddings)
embeddings = self.dropout(embeddings)
return embeddings
def create_position_ids_from_inputs_embeds(self, inputs_embeds):
"""
We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.
Args:
inputs_embeds: torch.Tensor
Returns: torch.Tensor
"""
input_shape = inputs_embeds.size()[:-1]
sequence_length = input_shape[1]
position_ids = torch.arange(
self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
)
return position_ids.unsqueeze(0).expand(input_shape)
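For contrast with the sequential ids above, here is a standalone sketch of the padding-aware logic that the `create_position_ids_from_input_ids` helper referenced earlier implements (the helper itself lives elsewhere in the library; `_position_ids_from_input_ids` below is my own name): pad positions keep `padding_idx`, real tokens count upward from `padding_idx + 1`.
```
import torch

def _position_ids_from_input_ids(input_ids, padding_idx):
    # Running count of non-pad tokens, shifted past the padding index.
    mask = input_ids.ne(padding_idx).int()
    return (torch.cumsum(mask, dim=1) * mask).long() + padding_idx

ids = torch.tensor([[0, 5, 6, 1, 1]])  # assume padding_idx = 1
print(_position_ids_from_input_ids(ids, padding_idx=1))
# tensor([[2, 3, 4, 1, 1]]) -> pad positions keep padding_idx
```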
class LukeEntityEmbeddings(nn.Module):
def __init__(self, config: LukeConfig):
super().__init__()
self.config = config
self.entity_embeddings = nn.Embedding(config.entity_vocab_size, config.entity_emb_size, padding_idx=0)
if config.entity_emb_size != config.hidden_size:
self.entity_embedding_dense = nn.Linear(config.entity_emb_size, config.hidden_size, bias=False)
self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(
self, entity_ids: torch.LongTensor, position_ids: torch.LongTensor, token_type_ids: torch.LongTensor = None
):
if token_type_ids is None:
token_type_ids = torch.zeros_like(entity_ids)
entity_embeddings = self.entity_embeddings(entity_ids)
if self.config.entity_emb_size != self.config.hidden_size:
entity_embeddings = self.entity_embedding_dense(entity_embeddings)
position_embeddings = self.position_embeddings(position_ids.clamp(min=0))
position_embedding_mask = (position_ids != -1).type_as(position_embeddings).unsqueeze(-1)
position_embeddings = position_embeddings * position_embedding_mask
position_embeddings = torch.sum(position_embeddings, dim=-2)
position_embeddings = position_embeddings / position_embedding_mask.sum(dim=-2).clamp(min=1e-7)
token_type_embeddings = self.token_type_embeddings(token_type_ids)
embeddings = entity_embeddings + position_embeddings + token_type_embeddings
embeddings = self.LayerNorm(embeddings)
embeddings = self.dropout(embeddings)
return embeddings
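A tiny numeric check of the span averaging above: entity `position_ids` are padded with -1 inside each span, and the mean is taken over the valid positions only.
```
import torch

position_ids = torch.tensor([[[3, 4, -1]]])             # one entity covering tokens 3-4
emb = torch.nn.Embedding(10, 4)
pos = emb(position_ids.clamp(min=0))                    # (1, 1, 3, 4)
mask = (position_ids != -1).type_as(pos).unsqueeze(-1)  # zeroes out the -1 slot
pooled = (pos * mask).sum(dim=-2) / mask.sum(dim=-2).clamp(min=1e-7)
print(pooled.shape)  # torch.Size([1, 1, 4]) -> mean of the embeddings for positions 3 and 4
```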
class LukeSelfAttention(nn.Module):
    def __init__(self, config):
super().__init__()
if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
raise ValueError(
f"The hidden size {config.hidden_size,} is not a multiple of the number of attention "
f"heads {config.num_attention_heads}."
)
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.use_entity_aware_attention = config.use_entity_aware_attention
self.query = nn.Linear(config.hidden_size, self.all_head_size)
self.key = nn.Linear(config.hidden_size, self.all_head_size)
self.value = nn.Linear(config.hidden_size, self.all_head_size)
if self.use_entity_aware_attention:
self.w2e_query = nn.Linear(config.hidden_size, self.all_head_size)
self.e2w_query = nn.Linear(config.hidden_size, self.all_head_size)
self.e2e_query = nn.Linear(config.hidden_size, self.all_head_size)
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
def transpose_for_scores(self, x):
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
x = x.view(*new_x_shape)
return x.permute(0, 2, 1, 3)
def forward(
self,
word_hidden_states,
entity_hidden_states,
attention_mask=None,
head_mask=None,
        output_attentions=False,
    ):
        pass
class LukeSelfOutput(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
class LukeAttention(nn.Module):
def __init__(self, config):
super().__init__()
self.self = LukeSelfAttention(config)
self.output = LukeSelfOutput(config)
self.pruned_heads = set()
def prune_heads(self, heads):
raise NotImplementedError("LUKE does not support the pruning of attention heads")
def forward(
self,
word_hidden_states,
entity_hidden_states,
attention_mask=None,
head_mask=None,
output_attentions=False,
):
word_size = word_hidden_states.size(1)
self_outputs = self.self(
word_hidden_states,
entity_hidden_states,
attention_mask,
head_mask,
output_attentions,
)
if entity_hidden_states is None:
concat_self_outputs = self_outputs[0]
concat_hidden_states = word_hidden_states
else:
concat_self_outputs = torch.cat(self_outputs[:2], dim=1)
concat_hidden_states = torch.cat([word_hidden_states, entity_hidden_states], dim=1)
attention_output = self.output(concat_self_outputs, concat_hidden_states)
word_attention_output = attention_output[:, :word_size, :]
if entity_hidden_states is None:
entity_attention_output = None
else:
entity_attention_output = attention_output[:, word_size:, :]
outputs = (word_attention_output, entity_attention_output) + self_outputs[2:]
return outputs
class LukeIntermediate(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states
class LukeOutput(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
class LukeLayer(nn.Module):
def __init__(self, config):
super().__init__()
self.chunk_size_feed_forward = config.chunk_size_feed_forward
self.seq_len_dim = 1
self.attention = LukeAttention(config)
self.intermediate = LukeIntermediate(config)
self.output = LukeOutput(config)
def forward(
self,
word_hidden_states,
entity_hidden_states,
attention_mask=None,
head_mask=None,
output_attentions=False,
):
word_size = word_hidden_states.size(1)
self_attention_outputs = self.attention(
word_hidden_states,
entity_hidden_states,
attention_mask,
head_mask,
output_attentions=output_attentions,
)
if entity_hidden_states is None:
concat_attention_output = self_attention_outputs[0]
else:
concat_attention_output = torch.cat(self_attention_outputs[:2], dim=1)
outputs = self_attention_outputs[2:]
layer_output = apply_chunking_to_forward(
self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, concat_attention_output
)
word_layer_output = layer_output[:, :word_size, :]
if entity_hidden_states is None:
entity_layer_output = None
else:
entity_layer_output = layer_output[:, word_size:, :]
outputs = (word_layer_output, entity_layer_output) + outputs
return outputs
def feed_forward_chunk(self, attention_output):
intermediate_output = self.intermediate(attention_output)
layer_output = self.output(intermediate_output, attention_output)
return layer_output
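`apply_chunking_to_forward` splits the chosen dimension into chunks and runs the feed-forward on each slice to bound peak memory; the result is identical to a direct call. A toy check, assuming the utility's `(forward_fn, chunk_size, chunk_dim, *tensors)` signature:
```
import torch
from transformers.pytorch_utils import apply_chunking_to_forward

def double(x):
    return x * 2

x = torch.ones(1, 4, 3)
chunked = apply_chunking_to_forward(double, 2, 1, x)  # chunks of size 2 along dim 1
print(torch.equal(chunked, double(x)))  # True
```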
class LukeEncoder(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
self.layer = nn.ModuleList([LukeLayer(config) for _ in range(config.num_hidden_layers)])
self.gradient_checkpointing = False
def forward(
self,
word_hidden_states,
entity_hidden_states,
attention_mask=None,
head_mask=None,
output_attentions=False,
output_hidden_states=False,
return_dict=True,
):
all_word_hidden_states = () if output_hidden_states else None
all_entity_hidden_states = () if output_hidden_states else None
all_self_attentions = () if output_attentions else None
for i, layer_module in enumerate(self.layer):
if output_hidden_states:
all_word_hidden_states = all_word_hidden_states + (word_hidden_states,)
all_entity_hidden_states = all_entity_hidden_states + (entity_hidden_states,)
layer_head_mask = head_mask[i] if head_mask is not None else None
if self.gradient_checkpointing and self.training:
layer_outputs = self._gradient_checkpointing_func(
layer_module.__call__,
word_hidden_states,
entity_hidden_states,
attention_mask,
layer_head_mask,
output_attentions,
)
else:
layer_outputs = layer_module(
word_hidden_states,
entity_hidden_states,
attention_mask,
layer_head_mask,
output_attentions,
)
word_hidden_states = layer_outputs[0]
if entity_hidden_states is not None:
entity_hidden_states = layer_outputs[1]
if output_attentions:
all_self_attentions = all_self_attentions + (layer_outputs[2],)
if output_hidden_states:
all_word_hidden_states = all_word_hidden_states + (word_hidden_states,)
all_entity_hidden_states = all_entity_hidden_states + (entity_hidden_states,)
if not return_dict:
return tuple(
v
for v in [
word_hidden_states,
all_word_hidden_states,
all_self_attentions,
entity_hidden_states,
all_entity_hidden_states,
]
if v is not None
)
return BaseLukeModelOutput(
last_hidden_state=word_hidden_states,
hidden_states=all_word_hidden_states,
attentions=all_self_attentions,
entity_last_hidden_state=entity_hidden_states,
entity_hidden_states=all_entity_hidden_states,
)
class LukePooler(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.activation = nn.Tanh()
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
first_token_tensor = hidden_states[:, 0]
pooled_output = self.dense(first_token_tensor)
pooled_output = self.activation(pooled_output)
return pooled_output
class EntityPredictionHeadTransform(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.entity_emb_size)
if isinstance(config.hidden_act, str):
self.transform_act_fn = ACT2FN[config.hidden_act]
else:
self.transform_act_fn = config.hidden_act
self.LayerNorm = nn.LayerNorm(config.entity_emb_size, eps=config.layer_norm_eps)
def forward(self, hidden_states):
hidden_states = self.dense(hidden_states)
hidden_states = self.transform_act_fn(hidden_states)
hidden_states = self.LayerNorm(hidden_states)
return hidden_states
class EntityPredictionHead(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
self.transform = EntityPredictionHeadTransform(config)
self.decoder = nn.Linear(config.entity_emb_size, config.entity_vocab_size, bias=False)
self.bias = nn.Parameter(torch.zeros(config.entity_vocab_size))
def forward(self, hidden_states):
hidden_states = self.transform(hidden_states)
hidden_states = self.decoder(hidden_states) + self.bias
return hidden_states
class LukePreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = LukeConfig
base_model_prefix = "luke"
supports_gradient_checkpointing = True
_no_split_modules = ["LukeAttention", "LukeEntityEmbeddings"]
def _init_weights(self, module: nn.Module):
"""Initialize the weights"""
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
if module.embedding_dim == 1:
module.weight.data.zero_()
else:
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
LUKE_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`LukeConfig`]): Model configuration class with all the parameters of the
model. Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
LUKE_INPUTS_DOCSTRING = r"""
"""
@add_start_docstrings(
"The bare LUKE model transformer outputting raw hidden-states for both word tokens and entities without any"
" specific head on top.",
LUKE_START_DOCSTRING,
)
class LukeModel(LukePreTrainedModel):
def __init__(self, config: LukeConfig, add_pooling_layer: bool = True):
super().__init__(config)
self.config = config
self.embeddings = LukeEmbeddings(config)
self.entity_embeddings = LukeEntityEmbeddings(config)
self.encoder = LukeEncoder(config)
self.pooler = LukePooler(config) if add_pooling_layer else None
self.post_init()
def get_input_embeddings(self):
return self.embeddings.word_embeddings
def set_input_embeddings(self, value):
self.embeddings.word_embeddings = value
def get_entity_embeddings(self):
return self.entity_embeddings.entity_embeddings
def set_entity_embeddings(self, value):
self.entity_embeddings.entity_embeddings = value
def _prune_heads(self, heads_to_prune):
raise NotImplementedError("LUKE does not support the pruning of attention heads")
@add_start_docstrings_to_model_forward(LUKE_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=BaseLukeModelOutputWithPooling, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
entity_ids: Optional[torch.LongTensor] = None,
entity_attention_mask: Optional[torch.FloatTensor] = None,
entity_token_type_ids: Optional[torch.LongTensor] = None,
entity_position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
"""
Executes the forward pass for the model.
Arguments:
input_ids (`torch.LongTensor`, *optional*):
Indices of input sequence tokens in the vocabulary.
attention_mask (`torch.FloatTensor`, *optional*):
Mask to avoid performing attention on padding tokens.
token_type_ids (`torch.LongTensor`, *optional*):
Segment token indices to indicate first and second portions of the inputs.
position_ids (`torch.LongTensor`, *optional*):
Indices of positions of each input sequence tokens in the position embeddings.
entity_ids (`torch.LongTensor`, *optional*):
Indices of entity sequence tokens in the vocabulary.
entity_attention_mask (`torch.FloatTensor`, *optional*):
Mask to avoid performing attention on padding tokens in entity tokens.
entity_token_type_ids (`torch.LongTensor`, *optional*):
Segment token indices to indicate first and second portions of the entity inputs.
entity_position_ids (`torch.LongTensor`, *optional*):
Indices of positions of each entity sequence tokens in the position embeddings.
head_mask (`torch.FloatTensor`, *optional*):
Mask to nullify selected heads of the self-attention modules.
inputs_embeds (`torch.FloatTensor`, *optional*):
Optionally instead of input_ids, you can pass pre-computed embeddings.
output_attentions (`bool`, *optional*):
Whether to output the attentions tensors.
output_hidden_states (`bool`, *optional*):
Whether to output the hidden states tensors.
return_dict (`bool`, *optional*):
Whether to return a dict instead of a tuple.
        Returns:
            [`BaseLukeModelOutputWithPooling`] or `tuple(torch.FloatTensor)`.
"""
pass
def get_extended_attention_mask(
self, word_attention_mask: torch.LongTensor, entity_attention_mask: Optional[torch.LongTensor]
):
"""
Makes broadcastable attention and causal masks so that future and masked tokens are ignored.
Arguments:
word_attention_mask (`torch.LongTensor`):
Attention mask for word tokens with ones indicating tokens to attend to, zeros for tokens to ignore.
entity_attention_mask (`torch.LongTensor`, *optional*):
Attention mask for entity tokens with ones indicating tokens to attend to, zeros for tokens to ignore.
Returns:
            `torch.Tensor` The extended attention mask, with the same dtype as `attention_mask.dtype`.
"""
attention_mask = word_attention_mask
if entity_attention_mask is not None:
attention_mask = torch.cat([attention_mask, entity_attention_mask], dim=-1)
if attention_mask.dim() == 3:
extended_attention_mask = attention_mask[:, None, :, :]
elif attention_mask.dim() == 2:
extended_attention_mask = attention_mask[:, None, None, :]
else:
raise ValueError(f"Wrong shape for attention_mask (shape {attention_mask.shape})")
extended_attention_mask = extended_attention_mask.to(dtype=self.dtype)
extended_attention_mask = (1.0 - extended_attention_mask) * torch.finfo(self.dtype).min
return extended_attention_mask
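A quick numeric check of the mask arithmetic above (float32 assumed): attended positions become an additive 0, padded positions become a large negative number that vanishes under softmax.
```
import torch

attention_mask = torch.tensor([[1, 1, 0]])
ext = attention_mask[:, None, None, :].to(torch.float32)
ext = (1.0 - ext) * torch.finfo(torch.float32).min
print(ext)  # 0 where attended, ~-3.4e38 (additive -inf) on the padding position
```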
@add_start_docstrings(
"""
    The LUKE model with a language modeling head and entity prediction head on top for masked language modeling and
    masked entity prediction.
""",
LUKE_START_DOCSTRING,
)
class LukeForMaskedLM(LukePreTrainedModel):
_tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias", "entity_predictions.decoder.weight"]
def __init__(self, config):
super().__init__(config)
self.luke = LukeModel(config)
self.lm_head = LukeLMHead(config)
self.entity_predictions = EntityPredictionHead(config)
self.loss_fn = nn.CrossEntropyLoss()
self.post_init()
def tie_weights(self):
super().tie_weights()
self._tie_or_clone_weights(self.entity_predictions.decoder, self.luke.entity_embeddings.entity_embeddings)
def get_output_embeddings(self):
return self.lm_head.decoder
def set_output_embeddings(self, new_embeddings):
self.lm_head.decoder = new_embeddings
@add_start_docstrings_to_model_forward(LUKE_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=LukeMaskedLMOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
entity_ids: Optional[torch.LongTensor] = None,
entity_attention_mask: Optional[torch.LongTensor] = None,
entity_token_type_ids: Optional[torch.LongTensor] = None,
entity_position_ids: Optional[torch.LongTensor] = None,
labels: Optional[torch.LongTensor] = None,
entity_labels: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        pass
@add_start_docstrings(
"""
The LUKE model with a classification head on top (a linear layer on top of the hidden state of the first entity
token) for entity classification tasks, such as Open Entity.
""",
LUKE_START_DOCSTRING,
)
class LukeForEntityClassification(LukePreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.luke = LukeModel(config)
self.num_labels = config.num_labels
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.post_init()
@add_start_docstrings_to_model_forward(LUKE_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=EntityClassificationOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
entity_ids: Optional[torch.LongTensor] = None,
entity_attention_mask: Optional[torch.FloatTensor] = None,
entity_token_type_ids: Optional[torch.LongTensor] = None,
entity_position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        pass
@add_start_docstrings(
"""
The LUKE model with a classification head on top (a linear layer on top of the hidden states of the two entity
tokens) for entity pair classification tasks, such as TACRED.
""",
LUKE_START_DOCSTRING,
)
class LukeForEntityPairClassification(LukePreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.luke = LukeModel(config)
self.num_labels = config.num_labels
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size * 2, config.num_labels, False)
self.post_init()
@add_start_docstrings_to_model_forward(LUKE_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=EntityPairClassificationOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
entity_ids: Optional[torch.LongTensor] = None,
entity_attention_mask: Optional[torch.FloatTensor] = None,
entity_token_type_ids: Optional[torch.LongTensor] = None,
entity_position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        pass
@add_start_docstrings(
"""
The LUKE model with a span classification head on top (a linear layer on top of the hidden states output) for tasks
such as named entity recognition.
""",
LUKE_START_DOCSTRING,
)
class LukeForEntitySpanClassification(LukePreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.luke = LukeModel(config)
self.num_labels = config.num_labels
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size * 3, config.num_labels)
self.post_init()
@add_start_docstrings_to_model_forward(LUKE_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=EntitySpanClassificationOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
entity_ids: Optional[torch.LongTensor] = None,
entity_attention_mask: Optional[torch.LongTensor] = None,
entity_token_type_ids: Optional[torch.LongTensor] = None,
entity_position_ids: Optional[torch.LongTensor] = None,
entity_start_positions: Optional[torch.LongTensor] = None,
entity_end_positions: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        pass
@add_start_docstrings(
"""
The LUKE Model transformer with a sequence classification/regression head on top (a linear layer on top of the
pooled output) e.g. for GLUE tasks.
""",
LUKE_START_DOCSTRING,
)
class LukeForSequenceClassification(LukePreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.luke = LukeModel(config)
self.dropout = nn.Dropout(
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.post_init()
@add_start_docstrings_to_model_forward(LUKE_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=LukeSequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
entity_ids: Optional[torch.LongTensor] = None,
entity_attention_mask: Optional[torch.FloatTensor] = None,
entity_token_type_ids: Optional[torch.LongTensor] = None,
entity_position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        pass
@add_start_docstrings(
"""
The LUKE Model with a token classification head on top (a linear layer on top of the hidden-states output). To
solve Named-Entity Recognition (NER) task using LUKE, `LukeForEntitySpanClassification` is more suitable than this
class.
""",
LUKE_START_DOCSTRING,
)
class LukeForTokenClassification(LukePreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.luke = LukeModel(config, add_pooling_layer=False)
self.dropout = nn.Dropout(
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.post_init()
@add_start_docstrings_to_model_forward(LUKE_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=LukeTokenClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
entity_ids: Optional[torch.LongTensor] = None,
entity_attention_mask: Optional[torch.FloatTensor] = None,
entity_token_type_ids: Optional[torch.LongTensor] = None,
entity_position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, LukeTokenClassifierOutput]:
r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.luke(
input_ids=input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
entity_ids=entity_ids,
entity_attention_mask=entity_attention_mask,
entity_token_type_ids=entity_token_type_ids,
entity_position_ids=entity_position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=True,
)
sequence_output = outputs.last_hidden_state
sequence_output = self.dropout(sequence_output)
logits = self.classifier(sequence_output)
loss = None
if labels is not None:
labels = labels.to(logits.device)
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
if not return_dict:
return tuple(
v
for v in [loss, logits, outputs.hidden_states, outputs.entity_hidden_states, outputs.attentions]
if v is not None
)
return LukeTokenClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
entity_hidden_states=outputs.entity_hidden_states,
attentions=outputs.attentions,
)
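A hedged usage sketch for the token classification head (the base checkpoint carries no fine-tuned classifier, so the logits are untrained; `num_labels=5` and the dummy labels are illustrative):
```
import torch
from transformers import LukeForTokenClassification, LukeTokenizer

tokenizer = LukeTokenizer.from_pretrained("studio-ousia/luke-base")
model = LukeForTokenClassification.from_pretrained("studio-ousia/luke-base", num_labels=5)

inputs = tokenizer("Beyoncé lives in Los Angeles", return_tensors="pt")
labels = torch.zeros_like(inputs["input_ids"])  # dummy label per token
outputs = model(**inputs, labels=labels)
print(outputs.loss, outputs.logits.shape)  # scalar loss, (1, seq_len, 5)
```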
@add_start_docstrings(
"""
The LUKE Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
LUKE_START_DOCSTRING,
)
class LukeForQuestionAnswering(LukePreTrainedModel):
"""
LUKE模型,用于支持抽取式问答任务(如SQuAD),在隐藏状态输出的顶部增加一个用于计算“起始位置logits”和“结束位置logits”的线性层。
继承自LukePreTrainedModel。
"""
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.luke = LukeModel(config, add_pooling_layer=False)
self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
self.post_init()
@add_start_docstrings_to_model_forward(LUKE_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=LukeQuestionAnsweringModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
entity_ids: Optional[torch.LongTensor] = None,
entity_attention_mask: Optional[torch.FloatTensor] = None,
entity_token_type_ids: Optional[torch.LongTensor] = None,
entity_position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
start_positions: Optional[torch.LongTensor] = None,
end_positions: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
"""
正向传播方法,接收多种输入并生成相应的输出。
参数与返回值的详细说明参见LUKE_INPUTS_DOCSTRING。
"""
pass
@add_start_docstrings(
"""
The LUKE Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
softmax) e.g. for RocStories/SWAG tasks.
""",
LUKE_START_DOCSTRING,
)
class LukeForMultipleChoice(LukePreTrainedModel):
"""
LUKE模型,用于多选分类任务(例如RocStories/SWAG),在汇总输出的顶部增加一个线性层和softmax激活函数。
继承自LukePreTrainedModel。
"""
def __init__(self, config):
super().__init__(config)
self.luke = LukeModel(config)
self.dropout = nn.Dropout(
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
self.classifier = nn.Linear(config.hidden_size, 1)
self.post_init()
@add_start_docstrings_to_model_forward(LUKE_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=LukeMultipleChoiceModelOutput,
config_class=_CONFIG_FOR_DOC,
    )
    def forward(self, *args, **kwargs):
        pass
.\models\luke\tokenization_luke.py
@lru_cache()
def bytes_to_unicode():
    """
    Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control
    characters the bpe code barfs on.

    The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab
    if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for
    decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want
    lookup tables between utf-8 bytes and unicode strings.
    """
    # Printable/visible byte ranges map to themselves
    bs = (
        list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
    )
    cs = bs[:]
    n = 0
    # Every remaining byte (whitespace/control characters) is assigned a fresh code point above 255
    for b in range(2**8):
        if b not in bs:
            bs.append(b)
            cs.append(2**8 + n)
            n += 1
    # Convert the code points to the corresponding unicode characters
    cs = [chr(n) for n in cs]
    # Return a dictionary mapping each byte in bs to its unicode character in cs
    return dict(zip(bs, cs))
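Two spot checks of the mapping: printable bytes map to themselves, while control/whitespace bytes are shifted to fresh code points at 256 and above.
```
b2u = bytes_to_unicode()
print(b2u[ord("A")])  # 'A'  (printable byte keeps its own character)
print(b2u[0])         # 'Ā'  (chr(256): the first remapped control byte)
```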
# Copied from transformers.models.roberta.tokenization_roberta.get_pairs
def get_pairs(word):
    """
    Return set of symbol pairs in a word.

    Word is represented as tuple of symbols (symbols being variable-length strings).
    """
    pairs = set()
    prev_char = word[0]
    # Walk the word from its second symbol on, pairing each symbol with its predecessor
    for char in word[1:]:
        pairs.add((prev_char, char))
        prev_char = char
    return pairs
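For example, the word `("h", "e", "l", "l", "o")` yields four adjacent symbol pairs (duplicates collapse because a set is returned):
```
print(get_pairs(("h", "e", "l", "l", "o")))
# {('h', 'e'), ('e', 'l'), ('l', 'l'), ('l', 'o')}
```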
# The LUKE tokenizer class, inheriting from PreTrainedTokenizer
class LukeTokenizer(PreTrainedTokenizer):
"""
Constructs a LUKE tokenizer, derived from the GPT-2 tokenizer, using byte-level Byte-Pair-Encoding.
This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
be encoded differently whether it is at the beginning of the sentence (without space) or not:
```
>>> from transformers import LukeTokenizer
>>> tokenizer = LukeTokenizer.from_pretrained("studio-ousia/luke-base")
>>> tokenizer("Hello world")["input_ids"]
[0, 31414, 232, 2]
>>> tokenizer(" Hello world")["input_ids"]
[0, 20920, 232, 2]
```
You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer or when you
call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance.
<Tip>
When used with `is_split_into_words=True`, this tokenizer will add a space before each word (even the first one).
</Tip>
This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
this superclass for more information regarding those methods. It also creates entity sequences, namely
`entity_ids`, `entity_attention_mask`, `entity_token_type_ids`, and `entity_position_ids` to be used by the LUKE
model.
"""
    # Class attributes
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["input_ids", "attention_mask"]
    # Initializer: sets up the vocab/merges/entity-vocab files, the optional task, and the entity special tokens
def __init__(
self,
vocab_file,
merges_file,
entity_vocab_file,
task=None,
max_entity_length=32,
max_mention_length=30,
entity_token_1="<ent>",
entity_token_2="<ent2>",
entity_unk_token="[UNK]",
entity_pad_token="[PAD]",
entity_mask_token="[MASK]",
entity_mask2_token="[MASK2]",
errors="replace",
bos_token="<s>",
eos_token="</s>",
sep_token="</s>",
cls_token="<s>",
unk_token="<unk>",
pad_token="<pad>",
mask_token="<mask>",
add_prefix_space=False,
**kwargs,
):
        # Initialize the parent class
super().__init__(
vocab_file=vocab_file,
merges_file=merges_file,
bos_token=bos_token,
eos_token=eos_token,
sep_token=sep_token,
cls_token=cls_token,
unk_token=unk_token,
pad_token=pad_token,
mask_token=mask_token,
add_prefix_space=add_prefix_space,
**kwargs,
)
@property
    # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.vocab_size
def vocab_size(self):
return len(self.encoder)
    # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.get_vocab
    # Returns the encoder vocabulary merged with any added special tokens
def get_vocab(self):
vocab = dict(self.encoder).copy()
vocab.update(self.added_tokens_encoder)
return vocab
    # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.bpe with Roberta->Luke, RoBERTa->LUKE
def bpe(self, token):
if token in self.cache:
return self.cache[token]
word = tuple(token)
pairs = get_pairs(word)
if not pairs:
return token
while True:
            # Pick the pair with the lowest BPE merge rank as the next bigram to merge
bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
if bigram not in self.bpe_ranks:
break
first, second = bigram
new_word = []
i = 0
while i < len(word):
try:
j = word.index(first, i)
except ValueError:
new_word.extend(word[i:])
break
else:
new_word.extend(word[i:j])
i = j
                # If the bigram matches at this position, merge it into a single token
if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
new_word.append(first + second)
i += 2
else:
new_word.append(word[i])
i += 1
new_word = tuple(new_word)
word = new_word
if len(word) == 1:
break
else:
pairs = get_pairs(word)
word = " ".join(word)
self.cache[token] = word
return word
    # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer._tokenize with Roberta->Luke, RoBERTa->LUKE
def _tokenize(self, text):
"""Tokenize a string."""
bpe_tokens = []
for token in re.findall(self.pat, text):
token = "".join(
self.byte_encoder[b] for b in token.encode("utf-8")
) # 将所有字节映射为 Unicode 字符串,避免 BPE 的控制标记(在我们的情况下是空格)
bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
return bpe_tokens
    # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer._convert_token_to_id with Roberta->Luke, RoBERTa->LUKE
def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
return self.encoder.get(token, self.encoder.get(self.unk_token))
    # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer._convert_id_to_token with Roberta->Luke, RoBERTa->LUKE
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
return self.decoder.get(index)
    # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.convert_tokens_to_string with Roberta->Luke, RoBERTa->LUKE
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (string) in a single string."""
        # Join the tokens into a single string
        text = "".join(tokens)
        # Decode the byte-level characters back into UTF-8 text
        text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
return text
# Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.build_inputs_with_special_tokens with Roberta->Luke, RoBERTa->LUKE
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
adding special tokens. A LUKE sequence has the following format:
- single sequence: `<s> X </s>`
- pair of sequences: `<s> A </s></s> B </s>`
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
if token_ids_1 is None:
# Single sequence: the input IDs wrapped in the special tokens `<s>` and `</s>`
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
cls = [self.cls_token_id]
sep = [self.sep_token_id]
# Sequence pair: `<s>`, the first sequence, a double `</s>` separator, the second sequence, and a final `</s>`
return cls + token_ids_0 + sep + sep + token_ids_1 + sep
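For illustration, assuming the RoBERTa/LUKE convention of `cls_token_id = 0` and `sep_token_id = 2` (the content ids are hypothetical), the two formats look like this:

# Hypothetical content ids; 0 and 2 follow the RoBERTa/LUKE cls/sep convention.
token_ids_0 = [31414, 232]     # first sequence
token_ids_1 = [713, 16, 1531]  # second sequence

single = [0] + token_ids_0 + [2]                       # <s> A </s>
pair = [0] + token_ids_0 + [2, 2] + token_ids_1 + [2]  # <s> A </s></s> B </s>
print(single)  # [0, 31414, 232, 2]
print(pair)    # [0, 31414, 232, 2, 2, 713, 16, 1531, 2]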
# Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.get_special_tokens_mask with Roberta->Luke, RoBERTa->LUKE
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` method.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
# If special tokens are already present, delegate to the parent method to get the mask
return super().get_special_tokens_mask(
token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
)
if token_ids_1 is None:
# Single-sequence mask: 1 for the leading special token, 0s for the sequence tokens, and a trailing 1
return [1] + ([0] * len(token_ids_0)) + [1]
# Sequence-pair mask: 1 for `<s>`, 0s for the first sequence, 1, 1 for the double separator, 0s for the second sequence, and a final 1
return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
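A quick sketch of the two mask shapes, for a first sequence of 3 tokens and a second of 2 (the lengths are arbitrary):

# Mask layout for len(token_ids_0) == 3 and len(token_ids_1) == 2.
mask_single = [1] + [0] * 3 + [1]                   # <s> x x x </s>
mask_pair = [1] + [0] * 3 + [1, 1] + [0] * 2 + [1]  # <s> A </s></s> B </s>
print(mask_single)  # [1, 0, 0, 0, 1]
print(mask_pair)    # [1, 0, 0, 0, 1, 1, 0, 0, 1]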
# Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.create_token_type_ids_from_sequences with Roberta->Luke, RoBERTa->LUKE
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed, to be used in a sequence-pair classification task. LUKE does not
make use of token type ids, therefore a list of zeros is returned.
Args:
token_ids_0 (`List[int]`):
List of IDs for the first sequence.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of zeros.
"""
sep = [self.sep_token_id]  # separator token ID
cls = [self.cls_token_id]  # classification (CLS) token ID
if token_ids_1 is None:
# Single sequence: a list of zeros as long as the sequence plus its special tokens
return len(cls + token_ids_0 + sep) * [0]
# Sequence pair: a list of zeros as long as both sequences plus all their special tokens
return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
# Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.prepare_for_tokenization with Roberta->Luke, RoBERTa->LUKE
def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space)
# If the text is pre-split into words or a prefix space is requested, and the text is non-empty and does not start with whitespace, prepend a space
if (is_split_into_words or add_prefix_space) and (len(text) > 0 and not text[0].isspace()):
text = " " + text
return (text, kwargs)
@add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
def __call__(
self,
text: Union[TextInput, List[TextInput]],
text_pair: Optional[Union[TextInput, List[TextInput]]] = None,
entity_spans: Optional[Union[EntitySpanInput, List[EntitySpanInput]]] = None,
entity_spans_pair: Optional[Union[EntitySpanInput, List[EntitySpanInput]]] = None,
entities: Optional[Union[EntityInput, List[EntityInput]]] = None,
entities_pair: Optional[Union[EntityInput, List[EntityInput]]] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = None,
max_length: Optional[int] = None,
max_entity_length: Optional[int] = None,
stride: int = 0,
is_split_into_words: Optional[bool] = False,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs,
):
# The decorator above appends ENCODE_KWARGS_DOCSTRING and ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING to this method's docstring; the body is elided here
pass
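Although the body is elided in this walkthrough, `__call__` is the tokenizer's public entry point. A minimal usage sketch, assuming the standard `studio-ousia/luke-base` checkpoint and character-level spans for the two mentions:

from transformers import LukeTokenizer

tokenizer = LukeTokenizer.from_pretrained("studio-ousia/luke-base")
text = "Beyoncé lives in Los Angeles."
entity_spans = [(0, 7), (17, 28)]  # character spans of "Beyoncé" and "Los Angeles"
encoding = tokenizer(text, entity_spans=entity_spans, return_tensors="pt")
# Besides input_ids/attention_mask, the output carries entity_ids, entity_attention_mask,
# and entity_position_ids for the two spans.
print(list(encoding.keys()))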
# `_encode_plus` encodes a single text (pair) and generates the entity-related features
def _encode_plus(
self,
text: Union[TextInput],  # input text: a string, or a list of strings
text_pair: Optional[Union[TextInput]] = None,  # optional second text
entity_spans: Optional[EntitySpanInput] = None,  # optional entity spans for the first text
entity_spans_pair: Optional[EntitySpanInput] = None,  # optional entity spans for the second text
entities: Optional[EntityInput] = None,  # optional entity names for the first text
entities_pair: Optional[EntityInput] = None,  # optional entity names for the second text
add_special_tokens: bool = True,  # whether to add special tokens such as CLS and SEP
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,  # padding strategy; defaults to no padding
truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,  # truncation strategy; defaults to no truncation
max_length: Optional[int] = None,  # maximum sequence length
max_entity_length: Optional[int] = None,  # maximum entity sequence length
stride: int = 0,  # stride; defaults to 0
is_split_into_words: Optional[bool] = False,  # whether the input is already split into words
pad_to_multiple_of: Optional[int] = None,  # pad to a multiple of this value
return_tensors: Optional[Union[str, TensorType]] = None,  # type of tensors to return
return_token_type_ids: Optional[bool] = None,  # whether to return token type IDs
return_attention_mask: Optional[bool] = None,  # whether to return the attention mask
return_overflowing_tokens: bool = False,  # whether to return overflowing tokens
return_special_tokens_mask: bool = False,  # whether to return the special-tokens mask
return_offsets_mapping: bool = False,  # whether to return the offsets mapping
return_length: bool = False,  # whether to return the encoded length
verbose: bool = True,  # whether to log verbose information
**kwargs,  # additional keyword arguments
) -> BatchEncoding:
# Offsets mapping is not available for slow (Python) tokenizers, so raise NotImplementedError
if return_offsets_mapping:
raise NotImplementedError(
"return_offset_mapping is not available when using Python tokenizers. "
"To use this feature, change your tokenizer to one deriving from "
"transformers.PreTrainedTokenizerFast. "
"More information on available tokenizers at "
"https://github.com/huggingface/transformers/pull/2674"
)
# Pre-split input is likewise unsupported, so raise NotImplementedError
if is_split_into_words:
raise NotImplementedError("is_split_into_words is not supported in this tokenizer.")
# Build the parts of the input sequence via _create_input_sequence
(
first_ids,
second_ids,
first_entity_ids,
second_entity_ids,
first_entity_token_spans,
second_entity_token_spans,
) = self._create_input_sequence(
text=text,
text_pair=text_pair,
entities=entities,
entities_pair=entities_pair,
entity_spans=entity_spans,
entity_spans_pair=entity_spans_pair,
**kwargs,
)
# prepare_for_model will create the attention_mask and token_type_ids
return self.prepare_for_model(
first_ids,
pair_ids=second_ids,
entity_ids=first_entity_ids,
pair_entity_ids=second_entity_ids,
entity_token_spans=first_entity_token_spans,
pair_entity_token_spans=second_entity_token_spans,
add_special_tokens=add_special_tokens,
padding=padding_strategy.value,
truncation=truncation_strategy.value,
max_length=max_length,
max_entity_length=max_entity_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
prepend_batch_axis=True,
return_attention_mask=return_attention_mask,
return_token_type_ids=return_token_type_ids,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_length=return_length,
verbose=verbose,
)
# Batch-encode texts together with their entity information and return the encoded results
def _batch_encode_plus(
self,
batch_text_or_text_pairs: Union[List[TextInput], List[TextInputPair]],
batch_entity_spans_or_entity_spans_pairs: Optional[
Union[List[EntitySpanInput], List[Tuple[EntitySpanInput, EntitySpanInput]]]
] = None,
batch_entities_or_entities_pairs: Optional[
Union[List[EntityInput], List[Tuple[EntityInput, EntityInput]]]
] = None,
add_special_tokens: bool = True,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
max_length: Optional[int] = None,
max_entity_length: Optional[int] = None,
stride: int = 0,
is_split_into_words: Optional[bool] = False,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs,
):
...
# Validate the format of the entity inputs
def _check_entity_input_format(self, entities: Optional[EntityInput], entity_spans: Optional[EntitySpanInput]):
# entity_spans must be a list; otherwise raise a ValueError
if not isinstance(entity_spans, list):
raise ValueError("entity_spans should be given as a list")
# If entity_spans is non-empty, its first element must be a tuple; otherwise raise a ValueError
elif len(entity_spans) > 0 and not isinstance(entity_spans[0], tuple):
raise ValueError(
"entity_spans should be given as a list of tuples containing the start and end character indices"
)
# When entities are provided, validate them as well
if entities is not None:
# entities must be a list; otherwise raise a ValueError
if not isinstance(entities, list):
raise ValueError("If you specify entities, they should be given as a list")
# If entities is non-empty, its first element must be a string; otherwise raise a ValueError
if len(entities) > 0 and not isinstance(entities[0], str):
raise ValueError("If you specify entities, they should be given as a list of entity names")
# entities and entity_spans must have the same length; otherwise raise a ValueError
if len(entities) != len(entity_spans):
raise ValueError("If you specify entities, entities and entity_spans must be the same length")
# Build the input sequences from the texts, the entity information, and any extra keyword arguments
def _create_input_sequence(
self,
text: Union[TextInput],
text_pair: Optional[Union[TextInput]] = None,
entities: Optional[EntityInput] = None,
entities_pair: Optional[EntityInput] = None,
entity_spans: Optional[EntitySpanInput] = None,
entity_spans_pair: Optional[EntitySpanInput] = None,
**kwargs,
):
...
# The ENCODE_KWARGS_DOCSTRING and ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING annotations are appended to the method below
@add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
# `_batch_prepare_for_model` prepares a batch of inputs for the model
def _batch_prepare_for_model(
self,
# Each sample in the batch is a tuple of a list of input ids and a (possibly None) second list
batch_ids_pairs: List[Tuple[List[int], None]],
# Each sample in the batch is a tuple of two optional lists of entity ids
batch_entity_ids_pairs: List[Tuple[Optional[List[int]], Optional[List[int]]]],
# Each sample in the batch is a tuple of two optional lists of entity token spans
batch_entity_token_spans_pairs: List[Tuple[Optional[List[Tuple[int, int]]], Optional[List[Tuple[int, int]]]]],
# Whether to add special tokens
add_special_tokens: bool = True,
# Padding strategy; defaults to no padding
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
# Truncation strategy; defaults to no truncation
truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
# Maximum sequence length
max_length: Optional[int] = None,
# Maximum entity sequence length
max_entity_length: Optional[int] = None,
# Stride; defaults to 0
stride: int = 0,
# Pad to a multiple of this value; defaults to no multiple padding
pad_to_multiple_of: Optional[int] = None,
# Type of tensors to return; defaults to no tensor conversion
return_tensors: Optional[str] = None,
# Whether to return token_type_ids
return_token_type_ids: Optional[bool] = None,
# Whether to return the attention_mask
return_attention_mask: Optional[bool] = None,
# Whether to return overflowing tokens
return_overflowing_tokens: bool = False,
# Whether to return the special-tokens mask
return_special_tokens_mask: bool = False,
# Whether to return length information
return_length: bool = False,
# Whether to log verbose information; defaults to True
verbose: bool = True,
) -> BatchEncoding:
"""
Prepares a sequence of input ids, or a pair of sequences of input ids, so that it can be used by the model. It
adds special tokens, truncates sequences if overflowing while taking into account the special tokens, and
manages a moving window (with user-defined stride) for overflowing tokens.
Args:
batch_ids_pairs: list of tokenized input ids or input ids pairs
batch_entity_ids_pairs: list of entity ids or entity ids pairs
batch_entity_token_spans_pairs: list of entity spans or entity spans pairs
max_entity_length: The maximum length of the entity sequence.
"""
# Initialize an empty dictionary to store batch outputs
batch_outputs = {}
# Iterate over input sequences and corresponding entity information
for input_ids, entity_ids, entity_token_span_pairs in zip(
batch_ids_pairs, batch_entity_ids_pairs, batch_entity_token_spans_pairs
):
# Unpack input sequences into first and second parts
first_ids, second_ids = input_ids
# Unpack entity ids into first and second parts
first_entity_ids, second_entity_ids = entity_ids
# Unpack entity token spans into first and second parts
first_entity_token_spans, second_entity_token_spans = entity_token_span_pairs
# Prepare inputs for the model using specified parameters
outputs = self.prepare_for_model(
first_ids,
second_ids,
entity_ids=first_entity_ids,
pair_entity_ids=second_entity_ids,
entity_token_spans=first_entity_token_spans,
pair_entity_token_spans=second_entity_token_spans,
add_special_tokens=add_special_tokens,
padding=PaddingStrategy.DO_NOT_PAD.value, # Specify padding strategy
truncation=truncation_strategy.value, # Specify truncation strategy
max_length=max_length, # Maximum length of the sequences
max_entity_length=max_entity_length, # Maximum length of the entity sequence
stride=stride, # Stride for handling overflowing tokens
pad_to_multiple_of=None, # We pad in batch afterward
return_attention_mask=False, # We pad in batch afterward
return_token_type_ids=return_token_type_ids,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_length=return_length,
return_tensors=None, # Convert batch to tensors at the end
prepend_batch_axis=False,
verbose=verbose,
)
# Aggregate outputs from each batch iteration
for key, value in outputs.items():
if key not in batch_outputs:
batch_outputs[key] = []
batch_outputs[key].append(value)
# Perform padding on batch outputs using specified parameters
batch_outputs = self.pad(
batch_outputs,
padding=padding_strategy.value, # Specify padding strategy
max_length=max_length, # Maximum length of the sequences
pad_to_multiple_of=pad_to_multiple_of,
return_attention_mask=return_attention_mask,
)
# Convert batch outputs to BatchEncoding format
batch_outputs = BatchEncoding(batch_outputs, tensor_type=return_tensors)
# Return the processed batch outputs
return batch_outputs
# Prepare the input data for the model, applying the requested processing and conversions
def prepare_for_model(
self,
ids: List[int],
pair_ids: Optional[List[int]] = None,
entity_ids: Optional[List[int]] = None,
pair_entity_ids: Optional[List[int]] = None,
entity_token_spans: Optional[List[Tuple[int, int]]] = None,
pair_entity_token_spans: Optional[List[Tuple[int, int]]] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = None,
max_length: Optional[int] = None,
max_entity_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
prepend_batch_axis: bool = False,
**kwargs,
):
# Preprocess the inputs: add special tokens, pad, truncate, and so on (body elided)
...
# Pad the encoded inputs so the batch has a consistent length
def pad(
self,
encoded_inputs: Union[
BatchEncoding,
List[BatchEncoding],
Dict[str, EncodedInput],
Dict[str, List[EncodedInput]],
List[Dict[str, EncodedInput]],
],
padding: Union[bool, str, PaddingStrategy] = True,
max_length: Optional[int] = None,
max_entity_length: Optional[int] = None,
pad_to_multiple_of: Optional[int] = None,
return_attention_mask: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
verbose: bool = True,
):
# Pad the encoded inputs so they share the same length or satisfy the requested padding constraints (body elided)
...
# Internal method: low-level padding of the encoded inputs
def _pad(
self,
encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
max_length: Optional[int] = None,
max_entity_length: Optional[int] = None,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
pad_to_multiple_of: Optional[int] = None,
return_attention_mask: Optional[bool] = None,
):
...
# Save the vocabulary files to the given directory
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
# If the save directory does not exist, log an error and return
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
# Build the vocabulary file path
vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
# Build the merges file path
merge_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
)
# Write the vocabulary file
with open(vocab_file, "w", encoding="utf-8") as f:
f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
# Initialize the running index
index = 0
# Write the merges file
with open(merge_file, "w", encoding="utf-8") as writer:
writer.write("#version: 0.2\n")
# Iterate over the BPE merges sorted by rank and write them in order
for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
# If the BPE merge indices are not consecutive, log a warning
if index != token_index:
logger.warning(
f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
" Please check that the tokenizer is not corrupted!"
)
index = token_index
# Write the BPE token pair to the file
writer.write(" ".join(bpe_tokens) + "\n")
index += 1
# Build the entity vocabulary file path
entity_vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["entity_vocab_file"]
)
# Write the entity vocabulary file
with open(entity_vocab_file, "w", encoding="utf-8") as f:
f.write(json.dumps(self.entity_vocab, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
# Return the tuple of saved file paths
return vocab_file, merge_file, entity_vocab_file
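A short usage sketch, assuming the standard LUKE vocabulary file names (`vocab.json`, `merges.txt`, `entity_vocab.json`):

import os
from transformers import LukeTokenizer

tokenizer = LukeTokenizer.from_pretrained("studio-ousia/luke-base")
os.makedirs("./luke-vocab", exist_ok=True)
files = tokenizer.save_vocabulary("./luke-vocab")
print(files)  # ('./luke-vocab/vocab.json', './luke-vocab/merges.txt', './luke-vocab/entity_vocab.json')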
.\models\luke\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
_import_structure = {
"configuration_luke": ["LUKE_PRETRAINED_CONFIG_ARCHIVE_MAP", "LukeConfig"],
"tokenization_luke": ["LukeTokenizer"],
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_luke"] = [
"LUKE_PRETRAINED_MODEL_ARCHIVE_LIST",
"LukeForEntityClassification",
"LukeForEntityPairClassification",
"LukeForEntitySpanClassification",
"LukeForMultipleChoice",
"LukeForQuestionAnswering",
"LukeForSequenceClassification",
"LukeForTokenClassification",
"LukeForMaskedLM",
"LukeModel",
"LukePreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_luke import LUKE_PRETRAINED_CONFIG_ARCHIVE_MAP, LukeConfig
from .tokenization_luke import LukeTokenizer
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_luke import (
LUKE_PRETRAINED_MODEL_ARCHIVE_LIST,
LukeForEntityClassification,
LukeForEntityPairClassification,
LukeForEntitySpanClassification,
LukeForMaskedLM,
LukeForMultipleChoice,
LukeForQuestionAnswering,
LukeForSequenceClassification,
LukeForTokenClassification,
LukeModel,
LukePreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\lxmert\configuration_lxmert.py
""" LXMERT model configuration"""
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
LXMERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"unc-nlp/lxmert-base-uncased": "https://huggingface.co/unc-nlp/lxmert-base-uncased/resolve/main/config.json",
}
class LxmertConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`LxmertModel`] or a [`TFLxmertModel`]. It is used
to instantiate a LXMERT model according to the specified arguments, defining the model architecture. Instantiating
a configuration with the defaults will yield a similar configuration to that of the Lxmert
[unc-nlp/lxmert-base-uncased](https://huggingface.co/unc-nlp/lxmert-base-uncased) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
"""
model_type = "lxmert"
attribute_map = {}
def __init__(
self,
vocab_size=30522,
hidden_size=768,
num_attention_heads=12,
num_qa_labels=9500,
num_object_labels=1600,
num_attr_labels=400,
intermediate_size=3072,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=2,
initializer_range=0.02,
layer_norm_eps=1e-12,
l_layers=9,
x_layers=5,
r_layers=5,
visual_feat_dim=2048,
visual_pos_dim=4,
visual_loss_normalizer=6.67,
task_matched=True,
task_mask_lm=True,
task_obj_predict=True,
task_qa=True,
visual_obj_loss=True,
visual_attr_loss=True,
visual_feat_loss=True,
**kwargs,
):
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.num_attention_heads = num_attention_heads
self.hidden_act = hidden_act
self.intermediate_size = intermediate_size
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.type_vocab_size = type_vocab_size
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.num_qa_labels = num_qa_labels
self.num_object_labels = num_object_labels
self.num_attr_labels = num_attr_labels
self.l_layers = l_layers
self.x_layers = x_layers
self.r_layers = r_layers
self.visual_feat_dim = visual_feat_dim
self.visual_pos_dim = visual_pos_dim
self.visual_loss_normalizer = visual_loss_normalizer
self.task_matched = task_matched
self.task_mask_lm = task_mask_lm
self.task_obj_predict = task_obj_predict
self.task_qa = task_qa
self.visual_obj_loss = visual_obj_loss
self.visual_attr_loss = visual_attr_loss
self.visual_feat_loss = visual_feat_loss
self.num_hidden_layers = {"vision": r_layers, "cross_encoder": x_layers, "language": l_layers}
super().__init__(**kwargs)
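A brief sketch showing how the three layer counts end up in the `num_hidden_layers` dict; instantiating the model from a config yields randomly initialized weights:

from transformers import LxmertConfig, LxmertModel

config = LxmertConfig(l_layers=9, x_layers=5, r_layers=5)
print(config.num_hidden_layers)  # {'vision': 5, 'cross_encoder': 5, 'language': 9}
model = LxmertModel(config)  # randomly initialized; no pretrained weights are loaded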
.\models\lxmert\convert_lxmert_original_tf_checkpoint_to_pytorch.py
"""Convert LXMERT checkpoint."""
import argparse
import torch
from transformers import LxmertConfig, LxmertForPreTraining, load_tf_weights_in_lxmert
from transformers.utils import logging
logging.set_verbosity_info()
def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path):
config = LxmertConfig.from_json_file(config_file)
print(f"Building PyTorch model from configuration: {config}")
model = LxmertForPreTraining(config)
load_tf_weights_in_lxmert(model, config, tf_checkpoint_path)
print(f"Save PyTorch model to {pytorch_dump_path}")
torch.save(model.state_dict(), pytorch_dump_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path."
)
parser.add_argument(
"--config_file",
default=None,
type=str,
required=True,
help="The config json file corresponding to the pre-trained model. \nThis specifies the model architecture.",
)
parser.add_argument(
"--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
)
args = parser.parse_args()
convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.config_file, args.pytorch_dump_path)
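For reference, the conversion can also be driven directly from Python by calling the function defined above; the paths below are placeholders:

# Equivalent to invoking the script from the command line; the paths are placeholders.
convert_tf_checkpoint_to_pytorch(
    tf_checkpoint_path="./lxmert_tf/model.ckpt",
    config_file="./lxmert_tf/config.json",
    pytorch_dump_path="./lxmert_pytorch.bin",
)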
.\models\lxmert\modeling_lxmert.py
import math
import os
import warnings
from dataclasses import dataclass
from typing import Dict, Optional, Tuple, Union
import torch
from torch import nn
from torch.nn import CrossEntropyLoss, SmoothL1Loss
from ...activations import ACT2FN, gelu
from ...modeling_utils import PreTrainedModel
from ...utils import (
ModelOutput,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_lxmert import LxmertConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "unc-nlp/lxmert-base-uncased"
_CONFIG_FOR_DOC = "LxmertConfig"
LXMERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"unc-nlp/lxmert-base-uncased",
]
class GeLU(nn.Module):
"""
实现Gaussian Error Linear Unit (GELU)激活函数的PyTorch模块。
"""
def __init__(self):
super().__init__()
def forward(self, x):
"""
对输入张量应用GELU激活函数。
Args:
x (torch.Tensor): 输入张量
Returns:
torch.Tensor: 经过GELU激活函数后的张量
"""
return gelu(x)
@dataclass
class LxmertModelOutput(ModelOutput):
"""
LXMERT模型的输出,包含语言编码器、视觉编码器和跨模态编码器的最后隐藏状态、汇总输出和注意力概率。
(注意:在LXMERT中,视觉编码器称为“关系-语义”编码器)
"""
pass
Args:
language_output (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
最后一层语言编码器的隐藏状态序列。
vision_output (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
最后一层视觉编码器的隐藏状态序列。
pooled_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
序列中第一个令牌(分类、CLS令牌)的最后一层隐藏状态,通过一个线性层和Tanh激活函数进一步处理。
language_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
语言编码器的隐藏状态元组,形状为 `(batch_size, sequence_length, hidden_size)`。
vision_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
视觉编码器的隐藏状态元组,形状为 `(batch_size, sequence_length, hidden_size)`。
language_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
注意力权重元组,形状为 `(batch_size, num_heads, sequence_length, sequence_length)`,经过注意力softmax后得到,用于计算自注意力头中的加权平均值。
vision_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
注意力权重元组,形状为 `(batch_size, num_heads, sequence_length, sequence_length)`,经过注意力softmax后得到,用于计算自注意力头中的加权平均值。
cross_encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
注意力权重元组,形状为 `(batch_size, num_heads, sequence_length, sequence_length)`,经过注意力softmax后得到,用于计算交叉编码器注意力头中的加权平均值。
"""
language_output: Optional[torch.FloatTensor] = None
vision_output: Optional[torch.FloatTensor] = None
pooled_output: Optional[torch.FloatTensor] = None
language_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
vision_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
# Declare an optional Tuple[torch.FloatTensor] field language_attentions, initialized to None
language_attentions: Optional[Tuple[torch.FloatTensor]] = None
# Declare an optional Tuple[torch.FloatTensor] field vision_attentions, initialized to None
vision_attentions: Optional[Tuple[torch.FloatTensor]] = None
# Declare an optional Tuple[torch.FloatTensor] field cross_encoder_attentions, initialized to None
cross_encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
# Output class storing the results of LXMERT question answering
@dataclass
class LxmertForQuestionAnsweringOutput(ModelOutput):
"""
Output type of [`LxmertForQuestionAnswering`].
Args:
loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
Total loss as the sum of the masked language modeling loss and the next sequence prediction
(classification) loss.
question_answering_score (`torch.FloatTensor` of shape `(batch_size, n_qa_answers)`, *optional*):
Prediction scores of the question answering objective (classification).
language_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for input features + one for the output of each cross-modality layer) of
shape `(batch_size, sequence_length, hidden_size)`.
vision_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for input features + one for the output of each cross-modality layer) of
shape `(batch_size, sequence_length, hidden_size)`.
language_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attention weights after the attention softmax, used to compute the weighted average in
the self-attention heads.
vision_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attention weights after the attention softmax, used to compute the weighted average in
the self-attention heads.
cross_encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attention weights after the attention softmax, used to compute the weighted average in
the self-attention heads.
"""
# Loss, an optional float tensor
loss: Optional[torch.FloatTensor] = None
# Question answering score, an optional float tensor
question_answering_score: Optional[torch.FloatTensor] = None
# Language hidden states, an optional tuple of tensors
language_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
# Vision hidden states, an optional tuple of tensors
vision_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
# Language attention weights, an optional tuple of tensors
language_attentions: Optional[Tuple[torch.FloatTensor]] = None
# Vision attention weights, an optional tuple of tensors
vision_attentions: Optional[Tuple[torch.FloatTensor]] = None
# cross_encoder_attentions may be a tuple of torch.FloatTensor, or None
cross_encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
@dataclass
class LxmertForPreTrainingOutput(ModelOutput):
"""
Output type of [`LxmertForPreTraining`].
Args:
loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
Total loss as the sum of the masked language modeling loss and the next sequence prediction
(classification) loss.
prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
cross_relationship_score (`torch.FloatTensor` of shape `(batch_size, 2)`):
Prediction scores of the textual matching objective (classification) head (scores of True/False
continuation before SoftMax).
question_answering_score (`torch.FloatTensor` of shape `(batch_size, n_qa_answers)`):
Prediction scores of question answering objective (classification).
language_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for input features + one for the output of each cross-modality layer) of
shape `(batch_size, sequence_length, hidden_size)`.
vision_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for input features + one for the output of each cross-modality layer) of
shape `(batch_size, sequence_length, hidden_size)`.
language_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
the self-attention heads.
vision_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
the self-attention heads.
cross_encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
the self-attention heads.
"""
# Loss, an optional float tensor
loss: Optional[torch.FloatTensor] = None
# Prediction logits, an optional float tensor
prediction_logits: Optional[torch.FloatTensor] = None
# Cross-relationship score, an optional float tensor
cross_relationship_score: Optional[torch.FloatTensor] = None
# Question answering score, an optional float tensor
question_answering_score: Optional[torch.FloatTensor] = None
# Language hidden states, an optional tuple of float tensors
language_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
# Vision hidden states, an optional tuple of float tensors
vision_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
# Language attentions, an optional tuple of float tensors
language_attentions: Optional[Tuple[torch.FloatTensor]] = None
# Vision attentions, an optional tuple of float tensors
vision_attentions: Optional[Tuple[torch.FloatTensor]] = None
# Cross-encoder attentions, an optional tuple of float tensors
cross_encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
def load_tf_weights_in_lxmert(model, config, tf_checkpoint_path):
"""Load tf checkpoints in a pytorch model."""
try:
import re  # regular expressions
import numpy as np  # NumPy
import tensorflow as tf  # TensorFlow
except ImportError:
logger.error(
"Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
"https://www.tensorflow.org/install/ for installation instructions."
)
raise
tf_path = os.path.abspath(tf_checkpoint_path)  # absolute path of the TensorFlow checkpoint
logger.info(f"Converting TensorFlow checkpoint from {tf_path}")  # log the checkpoint path being converted
# Load weights from TF model
init_vars = tf.train.list_variables(tf_path)  # list all variables in the TensorFlow model with their shapes
names = []
arrays = []
for name, shape in init_vars:
logger.info(f"Loading TF weight {name} with shape {shape}")  # log the name and shape of each TF weight
array = tf.train.load_variable(tf_path, name)  # load the variable's value from the TF model
names.append(name)  # collect the variable name
arrays.append(array)  # collect the variable value
for name, array in zip(names, arrays):
name = name.split("/")
# adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
# which are not required for using pretrained model
if any(
n
in [
"adam_v",
"adam_m",
"AdamWeightDecayOptimizer",
"AdamWeightDecayOptimizer_1",
"global_step",
]
for n in name
):
logger.info(f"Skipping {'/'.join(name)}") # 记录日志:跳过特定的TensorFlow变量
continue
pointer = model
for m_name in name:
if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
scope_names = re.split(r"_(\d+)", m_name)
else:
scope_names = [m_name]
# Map the variable-name prefix to the corresponding attribute of the PyTorch model
if scope_names[0] == "kernel" or scope_names[0] == "gamma":
pointer = getattr(pointer, "weight")
elif scope_names[0] == "output_bias" or scope_names[0] == "beta":
pointer = getattr(pointer, "bias")
elif scope_names[0] == "output_weights":
pointer = getattr(pointer, "weight")
elif scope_names[0] == "squad":
pointer = getattr(pointer, "classifier")
else:
try:
pointer = getattr(pointer, scope_names[0])
except AttributeError:
logger.info(f"Skipping {'/'.join(name)}") # 记录日志:跳过特定的PyTorch变量
continue
if len(scope_names) >= 2:
num = int(scope_names[1])
pointer = pointer[num]  # index into the nested module list
# Handle special variable-name suffixes and set the matching PyTorch pointer
if m_name[-11:] == "_embeddings":
pointer = getattr(pointer, "weight")
elif m_name == "kernel":
array = np.transpose(array)  # transpose the weight array
try:
assert pointer.shape == array.shape  # the PyTorch parameter and the TF array must have the same shape
except AssertionError as e:
e.args += (pointer.shape, array.shape)
raise
logger.info(f"Initialize PyTorch weight {name}") # 记录日志:初始化PyTorch权重的名称
pointer.data = torch.from_numpy(array) # 使用NumPy数组初始化PyTorch模型的权重
return model # 返回加载了TensorFlow权重的PyTorch模型
"""Construct the embeddings from word, position and token_type embeddings."""
# 初始化函数,接受一个配置对象config作为参数
def __init__(self, config):
super().__init__()
# Word embedding layer mapping vocabulary indices to hidden-size embeddings; padding_idx=0 pads with zeros
self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=0)
# Position embedding layer mapping position indices to hidden-size embeddings; padding_idx=0 pads with zeros
self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size, padding_idx=0)
# Token type embedding layer mapping token type indices to hidden-size embeddings; padding_idx=0 pads with zeros
self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size, padding_idx=0)
# self.LayerNorm is not snake-cased to stay consistent with the TensorFlow variable names, so any TensorFlow
# checkpoint file can be loaded; eps=1e-12 is a tiny constant for numerical stability
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=1e-12)
# Dropout layer that randomly zeroes units during training to reduce overfitting
self.dropout = nn.Dropout(config.hidden_dropout_prob)
# Forward pass taking vocabulary IDs (input_ids), token type IDs (token_type_ids), and precomputed embeddings (inputs_embeds)
def forward(self, input_ids, token_type_ids=None, inputs_embeds=None):
# If input_ids is provided, take its shape and device; otherwise use those of inputs_embeds
if input_ids is not None:
input_shape = input_ids.size()
device = input_ids.device
else:
input_shape = inputs_embeds.size()[:-1]
device = inputs_embeds.device
seq_length = input_shape[1]
# Build the position ID tensor for the sequence length; dtype=torch.long, placed on the chosen device
position_ids = torch.arange(seq_length, dtype=torch.long, device=device)
position_ids = position_ids.unsqueeze(0).expand(input_shape)
# If token_type_ids is None, default to an all-zeros tensor of long dtype on the device of self.position_ids
if token_type_ids is None:
token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
# If inputs_embeds is None, look up the word embeddings for input_ids
if inputs_embeds is None:
inputs_embeds = self.word_embeddings(input_ids)
# Look up the position embeddings from the position IDs
position_embeddings = self.position_embeddings(position_ids)
# Look up the token type embeddings from the token type IDs
token_type_embeddings = self.token_type_embeddings(token_type_ids)
# Sum the word, position, and token type embeddings to get the final embedding
embeddings = inputs_embeds + position_embeddings + token_type_embeddings
# Apply LayerNorm to the summed embeddings
embeddings = self.LayerNorm(embeddings)
# Apply dropout to the normalized embeddings
embeddings = self.dropout(embeddings)
return embeddings
# LxmertAttention implements the attention mechanism used throughout the LXMERT model (self- and cross-attention)
class LxmertAttention(nn.Module):
def __init__(self, config, ctx_dim=None):
super().__init__()
# The hidden size must be divisible by the number of attention heads
if config.hidden_size % config.num_attention_heads != 0:
raise ValueError(
f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
f"heads ({config.num_attention_heads})"
)
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.head_size = self.num_attention_heads * self.attention_head_size
# If no context dimension is given, use the hidden size from the config
if ctx_dim is None:
ctx_dim = config.hidden_size
# Linear projections for the query, key, and value
self.query = nn.Linear(config.hidden_size, self.head_size)
self.key = nn.Linear(ctx_dim, self.head_size)
self.value = nn.Linear(ctx_dim, self.head_size)
# Dropout applied to the attention probabilities
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
# Reshape the input tensor x for multi-head attention
def transpose_for_scores(self, x):
new_x_shape = x.size()[:-1] + (
self.num_attention_heads,
self.attention_head_size,
)
x = x.view(new_x_shape)
return x.permute(0, 2, 1, 3)
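The reshape performed by `transpose_for_scores` is easiest to see on concrete shapes; a standalone sketch with the base config's sizes:

import torch

batch, seq_len, hidden = 2, 4, 768
num_heads, head_size = 12, 64  # hidden == num_heads * head_size

x = torch.randn(batch, seq_len, hidden)
x = x.view(batch, seq_len, num_heads, head_size)  # split the hidden dim into heads
x = x.permute(0, 2, 1, 3)                         # move the head dim in front of the sequence dim
print(x.shape)  # torch.Size([2, 12, 4, 64]) -> (batch, heads, seq, head_size)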
# Forward pass performing the attention computation
def forward(self, hidden_states, context, attention_mask=None, output_attentions=False):
# Project the hidden states and context into mixed query, key, and value tensors
mixed_query_layer = self.query(hidden_states)
mixed_key_layer = self.key(context)
mixed_value_layer = self.value(context)
# Reshape the mixed query, key, and value tensors for multi-head attention
query_layer = self.transpose_for_scores(mixed_query_layer)
key_layer = self.transpose_for_scores(mixed_key_layer)
value_layer = self.transpose_for_scores(mixed_value_layer)
# Raw attention scores: dot product of queries and keys, scaled by the square root of the head size
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
attention_scores = attention_scores / math.sqrt(self.attention_head_size)
# If an attention mask is provided, add it to the attention scores
if attention_mask is not None:
attention_scores = attention_scores + attention_mask
# Normalize the attention scores into probabilities via softmax
attention_probs = nn.functional.softmax(attention_scores, dim=-1)
# Apply dropout to the attention probabilities
attention_probs = self.dropout(attention_probs)
# Context tensor: weighted sum of the value layer with the attention probabilities
context_layer = torch.matmul(attention_probs, value_layer)
context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
# Reshape the context tensor back to the expected output shape
new_context_layer_shape = context_layer.size()[:-2] + (self.head_size,)
context_layer = context_layer.view(new_context_layer_shape)
# Include the attention weights in the output if requested
outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
return outputs
class LxmertAttentionOutput(nn.Module):
# Initializer setting up the parameters and layers of this module
def __init__(self, config):
# Call the parent class initializer
super().__init__()
# Linear layer with both input and output sizes equal to config.hidden_size
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
# LayerNorm layer normalizing the hidden states
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=1e-12)
# Dropout layer that randomly zeroes part of the input tensor during training
self.dropout = nn.Dropout(config.hidden_dropout_prob)
# Forward pass defining the module's computation
def forward(self, hidden_states, input_tensor):
# Apply the linear transformation to the hidden states
hidden_states = self.dense(hidden_states)
# Apply dropout to the transformed hidden states to reduce overfitting
hidden_states = self.dropout(hidden_states)
# Add the residual input tensor and apply LayerNorm
hidden_states = self.LayerNorm(hidden_states + input_tensor)
# Return the processed hidden states
return hidden_states
class LxmertCrossAttentionLayer(nn.Module):
def __init__(self, config):
super().__init__()
# Cross-attention layer: an attention module followed by its output module
self.att = LxmertAttention(config)
self.output = LxmertAttentionOutput(config)
def forward(self, input_tensor, ctx_tensor, ctx_att_mask=None, output_attentions=False):
# Run the attention module and collect its outputs
output = self.att(input_tensor, ctx_tensor, ctx_att_mask, output_attentions=output_attentions)
if output_attentions:
# Grab the attention probabilities when they are requested
attention_probs = output[1]
# Combine the attention output with the input tensor through the output module
attention_output = self.output(output[0], input_tensor)
# Include the attention weights in the result only when requested
outputs = (attention_output, attention_probs) if output_attentions else (attention_output,)
return outputs
class LxmertSelfAttentionLayer(nn.Module):
def __init__(self, config):
super().__init__()
# Self-attention layer: an attention module followed by its output module
self.self = LxmertAttention(config)
self.output = LxmertAttentionOutput(config)
def forward(self, input_tensor, attention_mask, output_attentions=False):
# Self-attention forward pass over the input tensor and attention mask
# Note: the keys and queries of self-attention are the same (i.e., the input tensor)
output = self.self(
input_tensor,
input_tensor,
attention_mask,
output_attentions=output_attentions,
)
if output_attentions:
# Grab the attention probabilities when they are requested
attention_probs = output[1]
# Combine the attention output with the input tensor through the output module
attention_output = self.output(output[0], input_tensor)
# Include the attention weights in the result only when requested
outputs = (attention_output, attention_probs) if output_attentions else (attention_output,)
return outputs
class LxmertIntermediate(nn.Module):
def __init__(self, config):
super().__init__()
# Intermediate layer: a linear transformation followed by the activation function
self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
self.intermediate_act_fn = ACT2FN[config.hidden_act]
def forward(self, hidden_states):
# Intermediate forward pass: linear transformation, then the activation function
hidden_states = self.dense(hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states
class LxmertOutput(nn.Module):
def __init__(self, config):
super().__init__()
# Output layer: linear transformation, LayerNorm, and dropout
self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=1e-12)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states, input_tensor):
# Output forward pass: linear transformation, dropout, then LayerNorm over the residual sum
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
class LxmertLayer(nn.Module):
def __init__(self, config):
super().__init__()
# A full LXMERT layer: self-attention, intermediate, and output sub-modules
self.attention = LxmertSelfAttentionLayer(config)
self.intermediate = LxmertIntermediate(config)
self.output = LxmertOutput(config)
# Forward pass taking hidden states plus an optional attention mask and an output-attentions flag
def forward(self, hidden_states, attention_mask=None, output_attentions=False):
# Run the self-attention layer
outputs = self.attention(hidden_states, attention_mask, output_attentions=output_attentions)
# Take the attention output from the attention layer's results
attention_output = outputs[0]
# Feed the attention output through the intermediate layer
intermediate_output = self.intermediate(attention_output)
# Feed the intermediate output through the output layer to get the final layer output
layer_output = self.output(intermediate_output, attention_output)
# If attention outputs were requested, append them to the result tuple
outputs = (layer_output,) + outputs[1:]  # add attentions if we output them
# Return all outputs (the final layer output plus any attention weights)
return outputs
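Putting the pieces together, a minimal sketch that runs a single `LxmertLayer` on random hidden states (the module lives in `transformers.models.lxmert.modeling_lxmert`):

import torch
from transformers import LxmertConfig
from transformers.models.lxmert.modeling_lxmert import LxmertLayer

config = LxmertConfig()
layer = LxmertLayer(config)
hidden = torch.randn(2, 8, config.hidden_size)  # (batch, seq_len, hidden)
(layer_out,) = layer(hidden, attention_mask=None, output_attentions=False)
print(layer_out.shape)  # torch.Size([2, 8, 768])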
class LxmertXLayer(nn.Module):
def __init__(self, config):
super().__init__()
# The cross-attention Layer
self.visual_attention = LxmertCrossAttentionLayer(config)
# Self-attention Layers
self.lang_self_att = LxmertSelfAttentionLayer(config)
self.visn_self_att = LxmertSelfAttentionLayer(config)
# Intermediate and Output Layers (FFNs)
self.lang_inter = LxmertIntermediate(config)
self.lang_output = LxmertOutput(config)
self.visn_inter = LxmertIntermediate(config)
self.visn_output = LxmertOutput(config)
def cross_att(
self,
lang_input,
lang_attention_mask,
visual_input,
visual_attention_mask,
output_x_attentions=False,
):
# Cross Attention between language and visual inputs
lang_att_output = self.visual_attention(
lang_input,
visual_input,
ctx_att_mask=visual_attention_mask,
output_attentions=output_x_attentions,
)
# Cross Attention between visual and language inputs
visual_att_output = self.visual_attention(
visual_input,
lang_input,
ctx_att_mask=lang_attention_mask,
output_attentions=False,
)
return lang_att_output, visual_att_output
def self_att(self, lang_input, lang_attention_mask, visual_input, visual_attention_mask):
# Self Attention for language input
lang_att_output = self.lang_self_att(lang_input, lang_attention_mask, output_attentions=False)
# Self Attention for visual input
visual_att_output = self.visn_self_att(visual_input, visual_attention_mask, output_attentions=False)
return lang_att_output[0], visual_att_output[0]
def output_fc(self, lang_input, visual_input):
# Feed-forward layers for language input
lang_inter_output = self.lang_inter(lang_input)
# Feed-forward layers for visual input
visual_inter_output = self.visn_inter(visual_input)
# Output layers for language input
lang_output = self.lang_output(lang_inter_output, lang_input)
# Output layers for visual input
visual_output = self.visn_output(visual_inter_output, visual_input)
return lang_output, visual_output
# Forward pass: run cross attention, then self attention, then the output layers over the language and visual features
def forward(
self,
lang_feats,  # input language features
lang_attention_mask,  # language attention mask
visual_feats,  # input visual features
visual_attention_mask,  # visual attention mask
output_attentions=False,  # whether to return the attention matrices; defaults to False
):
# Run cross attention to get the language and visual attention outputs
lang_att_output, visual_att_output = self.cross_att(
lang_input=lang_feats,
lang_attention_mask=lang_attention_mask,
visual_input=visual_feats,
visual_attention_mask=visual_attention_mask,
output_x_attentions=output_attentions,
)
# Everything after the first element of the language attention output is attention probabilities
attention_probs = lang_att_output[1:]
# Run self attention on the cross-attention outputs with their respective masks
lang_att_output, visual_att_output = self.self_att(
lang_att_output[0],
lang_attention_mask,
visual_att_output[0],
visual_attention_mask,
)
# Feed the attended language and visual features through the output fully connected layers
lang_output, visual_output = self.output_fc(lang_att_output, visual_att_output)
# The return format depends on whether attention matrices were requested
return (
(
lang_output,  # language output
visual_output,  # visual output
attention_probs[0],  # the first attention matrix (when attentions are returned)
)
if output_attentions  # if attention matrices were requested
else (lang_output, visual_output)  # otherwise return only the language and visual outputs
)
# The LXMERT encoder, which processes the multi-modal inputs
class LxmertEncoder(nn.Module):
def __init__(self, config):
super().__init__()
# Object-level visual feature encoding layer
self.visn_fc = LxmertVisualFeatureEncoder(config)
self.config = config
# Number of layers of each type
self.num_l_layers = config.l_layers
self.num_x_layers = config.x_layers
self.num_r_layers = config.r_layers
# Layer initialization
# Using self.layer instead of self.l_layer to support loading BERT weights
self.layer = nn.ModuleList([LxmertLayer(config) for _ in range(self.num_l_layers)])
self.x_layers = nn.ModuleList([LxmertXLayer(config) for _ in range(self.num_x_layers)])
self.r_layers = nn.ModuleList([LxmertLayer(config) for _ in range(self.num_r_layers)])
def forward(
self,
lang_feats,
lang_attention_mask,
visual_feats,
visual_pos,
visual_attention_mask=None,
output_attentions=None,
):
vision_hidden_states = ()
language_hidden_states = ()
# If attention outputs are requested (via the argument or the config), start the attention tuples as empty; otherwise set them to None
vision_attentions = () if output_attentions or self.config.output_attentions else None
language_attentions = () if output_attentions or self.config.output_attentions else None
cross_encoder_attentions = () if output_attentions or self.config.output_attentions else None
visual_feats = self.visn_fc(visual_feats, visual_pos)
# Run the language layers
for layer_module in self.layer:
# Forward pass through each language layer
l_outputs = layer_module(lang_feats, lang_attention_mask, output_attentions=output_attentions)
lang_feats = l_outputs[0]
# Append each layer's hidden state to the language hidden-states tuple
language_hidden_states = language_hidden_states + (lang_feats,)
# If attention weights are being recorded, append this layer's weights to the language attentions tuple
if language_attentions is not None:
language_attentions = language_attentions + (l_outputs[1],)
# Run the relational (visual) layers
for layer_module in self.r_layers:
# Forward pass through each relational layer
v_outputs = layer_module(visual_feats, visual_attention_mask, output_attentions=output_attentions)
visual_feats = v_outputs[0]
# Append each layer's hidden state to the vision hidden-states tuple
vision_hidden_states = vision_hidden_states + (visual_feats,)
# If attention weights are being recorded, append this layer's weights to the vision attentions tuple
if vision_attentions is not None:
vision_attentions = vision_attentions + (v_outputs[1],)
# Run the cross-modality layers
for layer_module in self.x_layers:
# Forward pass through each cross-modality layer
x_outputs = layer_module(
lang_feats,
lang_attention_mask,
visual_feats,
visual_attention_mask,
output_attentions=output_attentions,
)
lang_feats, visual_feats = x_outputs[:2]
# Append each layer's hidden states to both the vision and language hidden-state tuples
vision_hidden_states = vision_hidden_states + (visual_feats,)
language_hidden_states = language_hidden_states + (lang_feats,)
# If attention weights are being recorded, append this layer's weights to the cross-encoder attentions tuple
if cross_encoder_attentions is not None:
cross_encoder_attentions = cross_encoder_attentions + (x_outputs[2],)
visual_encoder_outputs = (
vision_hidden_states,
vision_attentions if output_attentions else None,
)
lang_encoder_outputs = (
language_hidden_states,
language_attentions if output_attentions else None,
)
# Return the visual encoder outputs, the language encoder outputs, and (if requested) the cross-encoder attentions
return (
visual_encoder_outputs,
lang_encoder_outputs,
cross_encoder_attentions if output_attentions else None,
)
class LxmertVisualObjHead(nn.Module):
def __init__(self, config):
super().__init__()
hid_dim = config.hidden_size
self.vis_fc = nn.Sequential(
nn.Linear(hid_dim, hid_dim * 2),
GeLU(),
nn.LayerNorm(hid_dim * 2, eps=1e-12),
)
def forward(self, hidden_states):
# Predict the visual features by transforming and normalizing through the fully connected stack
visual_feats = self.vis_fc(hidden_states)
return visual_feats
def __init__(self, config):
super().__init__()
self.transform = LxmertPredictionHeadTransform(config)
# Decide the use of visual losses
visual_losses = {}
if config.visual_obj_loss:
visual_losses["obj"] = {"shape": (-1,), "num": config.num_object_labels}
if config.visual_attr_loss:
visual_losses["attr"] = {"shape": (-1,), "num": config.num_attr_labels}
if config.visual_feat_loss:
visual_losses["feat"] = {
"shape": (-1, config.visual_feat_dim),
"num": config.visual_feat_dim,
}
self.visual_losses = visual_losses
# visual_losses stores the configuration of each visual loss type in use
# The output weights are the same as the input embeddings, but there is
# an output-only bias for each token.
self.decoder_dict = nn.ModuleDict(
{key: nn.Linear(config.hidden_size, self.visual_losses[key]["num"]) for key in self.visual_losses}
)
# nn.ModuleDict maps each visual-loss key to a Linear layer that projects
# hidden states to that loss type's output size
def forward(self, hidden_states):
hidden_states = self.transform(hidden_states)
# Transform the input hidden states
output = {}
for key in self.visual_losses:
output[key] = self.decoder_dict[key](hidden_states)
# Compute each loss type's output with its corresponding Linear layer
return output
class LxmertPreTrainingHeads(nn.Module):
def __init__(self, config, lxmert_model_embedding_weights):
super(LxmertPreTrainingHeads, self).__init__()
# Prediction head: language modeling
self.predictions = LxmertLMPredictionHead(config, lxmert_model_embedding_weights)
# Prediction head: sequence relationship
self.seq_relationship = nn.Linear(config.hidden_size, 2)
def forward(self, sequence_output, pooled_output):
# Score the language-modeling predictions
prediction_scores = self.predictions(sequence_output)
# Score the sequence relationship
seq_relationship_score = self.seq_relationship(pooled_output)
return prediction_scores, seq_relationship_score
class LxmertPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = LxmertConfig
load_tf_weights = load_tf_weights_in_lxmert
base_model_prefix = "lxmert"
def _init_weights(self, module):
"""Initialize the weights"""
if isinstance(module, nn.Linear):
# Initialize linear-layer weights from a normal distribution
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
# Initialize embedding weights from a normal distribution
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, nn.LayerNorm):
# Initialize the LayerNorm bias to zero and the weight to 1.0
module.bias.data.zero_()
module.weight.data.fill_(1.0)
LXMERT_START_DOCSTRING = r"""
The LXMERT model was proposed in [LXMERT: Learning Cross-Modality Encoder Representations from
Transformers](https://arxiv.org/abs/1908.07490) by Hao Tan and Mohit Bansal. It's a vision and language transformer
model, pretrained on a variety of multi-modal datasets comprising GQA, VQAv2.0, MSCOCO captions, and Visual
Genome, using a combination of masked language modeling, region of interest feature regression, cross entropy loss
for question answering attribute prediction, and object tag prediction.
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`LxmertConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
LXMERT_INPUTS_DOCSTRING = r"""
Args:
batch_size (int): The batch size of the input data.
sequence_length (int): The length of the input sequences.
"""
@add_start_docstrings(
"The bare Lxmert Model transformer outputting raw hidden-states without any specific head on top.",
LXMERT_START_DOCSTRING,
)
class LxmertModel(LxmertPreTrainedModel):
def __init__(self, config):
super().__init__(config)
# Initialize the embeddings module with the provided configuration
self.embeddings = LxmertEmbeddings(config)
# Initialize the encoder module with the provided configuration
self.encoder = LxmertEncoder(config)
# Initialize the pooler module with the provided configuration
self.pooler = LxmertPooler(config)
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
# Return the word embeddings from the embeddings module
return self.embeddings.word_embeddings
def set_input_embeddings(self, new_embeddings):
# Update the word embeddings in the embeddings module with new_embeddings
self.embeddings.word_embeddings = new_embeddings
@add_start_docstrings_to_model_forward(LXMERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=LxmertModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
visual_feats: Optional[torch.FloatTensor] = None,
visual_pos: Optional[torch.FloatTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
visual_attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
# Forward pass through the LxmertModel (body elided)
...
class LxmertForPreTraining(LxmertPreTrainedModel):
def __init__(self, config):
super().__init__(config)
# Configuration
self.config = config
self.num_qa_labels = config.num_qa_labels  # number of QA labels from the config
self.visual_loss_normalizer = config.visual_loss_normalizer  # normalizer for the visual losses from the config
# Use of pretraining tasks
self.task_mask_lm = config.task_mask_lm  # whether to run the masked language modeling task
self.task_obj_predict = config.task_obj_predict  # whether to run the object prediction task
self.task_matched = config.task_matched  # whether to run the matching task
self.task_qa = config.task_qa  # whether to run the question answering task
# Lxmert backbone
self.lxmert = LxmertModel(config)  # initialize the LXMERT backbone
# Pre-training heads
self.cls = LxmertPreTrainingHeads(config, self.lxmert.embeddings.word_embeddings.weight)  # initialize the pre-training heads
if self.task_obj_predict:
self.obj_predict_head = LxmertVisualObjHead(config)  # object prediction head, only when that task is enabled
if self.task_qa:
self.answer_head = LxmertVisualAnswerHead(config, self.num_qa_labels)  # QA head, only when that task is enabled
# Weight initialization
# Initialize weights and apply final processing
self.post_init()  # run post-initialization: weight init and final processing
# Loss functions
self.loss_fcts = {
"l2": SmoothL1Loss(reduction="none"), # 平滑的L1损失函数,不进行降维
"visual_ce": CrossEntropyLoss(reduction="none"), # 视觉交叉熵损失函数,不进行降维
"ce": CrossEntropyLoss(), # 交叉熵损失函数,进行降维
}
visual_losses = {}
if config.visual_obj_loss:
visual_losses["obj"] = {
"shape": (-1,), # 形状为一维向量
"num": config.num_object_labels, # 目标标签数量
"loss": "visual_ce", # 使用视觉交叉熵损失
}
if config.visual_attr_loss:
visual_losses["attr"] = {
"shape": (-1,), # 形状为一维向量
"num": config.num_attr_labels, # 属性标签数量
"loss": "visual_ce", # 使用视觉交叉熵损失
}
if config.visual_feat_loss:
visual_losses["feat"] = {
"shape": (-1, config.visual_feat_dim), # 形状为二维张量,其中维度为视觉特征维度
"num": config.visual_feat_dim, # 视觉特征的维度
"loss": "l2", # 使用平滑的L1损失
}
self.visual_losses = visual_losses # 存储视觉损失的配置信息
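The (elided) pretraining forward consumes this dictionary roughly as follows; a minimal, self-contained sketch rather than the exact upstream code (the tensors and the class count are made-up stand-ins, and the real reduction also applies a per-region confidence mask before scaling by `visual_loss_normalizer`):

import torch
from torch.nn import CrossEntropyLoss
# Stand-ins: batch of 1 image, 36 regions, 1600 object classes.
scores = torch.randn(1, 36, 1600)  # logits from the object head
label = torch.randint(0, 1600, (1, 36))  # gold object ids
cfg = {"shape": (-1,), "num": 1600, "loss": "visual_ce"}
loss_fct = CrossEntropyLoss(reduction="none")  # the "visual_ce" entry
per_elem = loss_fct(scores.view(-1, cfg["num"]), label.view(*cfg["shape"]))
visual_loss = per_elem.mean() * 6.67  # stand-in for the confidence-masked, normalized reduction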
def resize_num_qa_labels(self, num_labels):
"""
Build a resized question answering linear layer Module from a provided new linear layer. Increasing the size
will add newly initialized weights. Reducing the size will remove weights from the end.
Args:
num_labels (`int`, *optional*):
New number of labels in the linear layer weight matrix. Increasing the size will add newly initialized
weights at the end. Reducing the size will remove weights from the end. If not provided or `None`, just
returns a pointer to the qa labels `torch.nn.Linear` module of the model without doing anything.
Returns:
`torch.nn.Linear`: Pointer to the resized Linear layer or the old Linear layer
"""
cur_qa_logit_layer = self.get_qa_logit_layer()
if num_labels is None or cur_qa_logit_layer is None:
return
new_qa_logit_layer = self._resize_qa_labels(num_labels)
self.config.num_qa_labels = num_labels
self.num_qa_labels = num_labels
return new_qa_logit_layer
def _resize_qa_labels(self, num_labels):
"""
Resize the current question answering prediction linear layer to the given number of labels.
Args:
num_labels (`int`): New number of labels in the linear layer weight matrix
Returns:
`nn.Module`: The resized question answering prediction linear layer
"""
cur_qa_logit_layer = self.get_qa_logit_layer()
new_qa_logit_layer = self._get_resized_qa_labels(cur_qa_logit_layer, num_labels)
self._set_qa_logit_layer(new_qa_logit_layer)
return self.get_qa_logit_layer()
def get_qa_logit_layer(self) -> nn.Module:
"""
Returns the linear layer that produces the question answering logits.
Returns:
`nn.Module`: A torch module mapping the question answering hidden states to logits, or `None` if LXMERT
does not have a visual answering head.
"""
if hasattr(self, "answer_head"):
return self.answer_head.logit_fc[-1]
def _set_qa_logit_layer(self, qa_logit_layer):
"""
Set the question answering prediction linear layer.
Args:
qa_logit_layer (`nn.Module`): The new question answering prediction linear layer
"""
self.answer_head.logit_fc[-1] = qa_logit_layer
def _get_resized_qa_labels(self, cur_qa_logit_layer, num_labels):
# If num_labels is None, return the current layer unchanged
if num_labels is None:
return cur_qa_logit_layer
# Read the current label count and hidden dimension from the weight matrix
cur_qa_labels, hidden_dim = cur_qa_logit_layer.weight.size()
# If the label count already matches, return the layer as-is
if cur_qa_labels == num_labels:
return cur_qa_logit_layer
# Build the new output layer, with a bias only if the current layer has one
if getattr(cur_qa_logit_layer, "bias", None) is not None:
new_qa_logit_layer = nn.Linear(hidden_dim, num_labels)
else:
new_qa_logit_layer = nn.Linear(hidden_dim, num_labels, bias=False)
# Place the new layer on the same device as the current one
new_qa_logit_layer.to(cur_qa_logit_layer.weight.device)
# Initialize the weights of the new layer
self._init_weights(new_qa_logit_layer)
# Copy over as many label rows as fit from the previous weights
num_labels_to_copy = min(cur_qa_labels, num_labels)
new_qa_logit_layer.weight.data[:num_labels_to_copy, :] = cur_qa_logit_layer.weight.data[:num_labels_to_copy, :]
if getattr(cur_qa_logit_layer, "bias", None) is not None:
new_qa_logit_layer.bias.data[:num_labels_to_copy] = cur_qa_logit_layer.bias.data[:num_labels_to_copy]
# Return the newly built linear layer
return new_qa_logit_layer
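A hedged usage sketch of the resizing path above (the new label count is arbitrary; the default config is assumed to enable `task_qa`, so the answer head exists):

from transformers import LxmertConfig, LxmertForPreTraining
config = LxmertConfig()  # assumption: defaults enable task_qa
model = LxmertForPreTraining(config)
layer = model.resize_num_qa_labels(3129)  # arbitrary new label count
assert layer.out_features == 3129
assert model.config.num_qa_labels == 3129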
@add_start_docstrings_to_model_forward(LXMERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=LxmertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
visual_feats: Optional[torch.FloatTensor] = None,
visual_pos: Optional[torch.FloatTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
visual_attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
obj_labels: Optional[Dict[str, Tuple[torch.FloatTensor, torch.FloatTensor]]] = None,
matched_label: Optional[torch.LongTensor] = None,
ans: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
**kwargs,
):
# Forward pass of the model, taking the inputs above plus an optional return-type flag
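For reference, the visual targets fed to this forward are keyed like `visual_losses`, each value a `(label, confidence mask)` pair; a hedged sketch with made-up shapes (1 image, 36 regions, 2048-dim features):

import torch
# Hypothetical obj_labels for a single image with 36 regions:
obj_labels = {
"obj": (torch.zeros(1, 36, dtype=torch.long), torch.ones(1, 36)),  # object ids + mask
"attr": (torch.zeros(1, 36, dtype=torch.long), torch.ones(1, 36)),  # attribute ids + mask
"feat": (torch.randn(1, 36, 2048), torch.ones(1, 36)),  # RoI features + mask
}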
@add_start_docstrings(
"""Lxmert Model with a visual-answering head on top for downstream QA tasks""",
LXMERT_START_DOCSTRING,
)
class LxmertForQuestionAnswering(LxmertPreTrainedModel):
def __init__(self, config):
super().__init__(config)
# Configuration
self.config = config  # store the model configuration
self.num_qa_labels = config.num_qa_labels  # number of question-answering labels
self.visual_loss_normalizer = config.visual_loss_normalizer  # normalizer for the visual losses
# Lxmert backbone
self.lxmert = LxmertModel(config)  # LXMERT backbone
self.answer_head = LxmertVisualAnswerHead(config, self.num_qa_labels)  # visual answering head
# Weight initialization
# Initialize weights and apply final processing
self.post_init()  # run post-initialization: weight init and final processing
# Loss function
self.loss = CrossEntropyLoss()  # cross-entropy loss over the answer logits
def resize_num_qa_labels(self, num_labels):
"""
Build a resized question answering linear layer Module from a provided new linear layer. Increasing the size
will add newly initialized weights. Reducing the size will remove weights from the end.
Args:
num_labels (`int`, *optional*):
New number of labels in the linear layer weight matrix. Increasing the size will add newly initialized
weights at the end. Reducing the size will remove weights from the end. If not provided or `None`, just
returns a pointer to the qa labels `torch.nn.Linear` module of the model without doing anything.
Returns:
`torch.nn.Linear`: Pointer to the resized Linear layer or the old Linear layer
"""
cur_qa_logit_layer = self.get_qa_logit_layer()  # current QA logit layer
if num_labels is None or cur_qa_logit_layer is None:
return  # nothing to do without num_labels or an existing layer
new_qa_logit_layer = self._resize_qa_labels(num_labels)  # resize the layer
self.config.num_qa_labels = num_labels  # update the config
self.num_qa_labels = num_labels  # update the instance attribute
return new_qa_logit_layer  # return the resized layer
def _resize_qa_labels(self, num_labels):
cur_qa_logit_layer = self.get_qa_logit_layer()  # current QA logit layer
new_qa_logit_layer = self._get_resized_qa_labels(cur_qa_logit_layer, num_labels)  # build the resized layer
self._set_qa_logit_layer(new_qa_logit_layer)  # install it in the answer head
return self.get_qa_logit_layer()  # return the installed layer
def get_qa_logit_layer(self) -> nn.Module:
"""
Returns the linear layer that produces question answering logits
Returns:
`nn.Module`: A torch module mapping the question answering prediction hidden states. `None`: A NoneType
object if Lxmert does not have the visual answering head.
"""
if hasattr(self, "answer_head"):
return self.answer_head.logit_fc[-1] # 返回最后一个问题回答对数层
def _set_qa_logit_layer(self, qa_logit_layer):
self.answer_head.logit_fc[-1] = qa_logit_layer  # install the new logit layer
def _get_resized_qa_labels(self, cur_qa_logit_layer, num_labels):
# If num_labels is None, return the current layer unchanged
if num_labels is None:
return cur_qa_logit_layer
# Read the current label count and hidden dimension from the weight matrix
cur_qa_labels, hidden_dim = cur_qa_logit_layer.weight.size()
# If the label count already matches, return the layer as-is
if cur_qa_labels == num_labels:
return cur_qa_logit_layer
# Build the new output layer, with a bias only if the current layer has one
if getattr(cur_qa_logit_layer, "bias", None) is not None:
new_qa_logit_layer = nn.Linear(hidden_dim, num_labels)
else:
new_qa_logit_layer = nn.Linear(hidden_dim, num_labels, bias=False)
# Place the new layer on the same device as the current one
new_qa_logit_layer.to(cur_qa_logit_layer.weight.device)
# Initialize the weights of the new layer
self._init_weights(new_qa_logit_layer)
# Copy over as many label rows as fit from the previous weights
num_labels_to_copy = min(cur_qa_labels, num_labels)
new_qa_logit_layer.weight.data[:num_labels_to_copy, :] = cur_qa_logit_layer.weight.data[:num_labels_to_copy, :]
if getattr(cur_qa_logit_layer, "bias", None) is not None:
new_qa_logit_layer.bias.data[:num_labels_to_copy] = cur_qa_logit_layer.bias.data[:num_labels_to_copy]
# Return the newly built linear layer
return new_qa_logit_layer
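The row-copy above can be seen in isolation with a bare `nn.Linear`; a standalone sketch (sizes are illustrative):

import torch
from torch import nn
old = nn.Linear(16, 4)  # 4 labels, hidden dim 16
new = nn.Linear(16, 6)  # resized to 6 labels
n = min(old.out_features, new.out_features)
new.weight.data[:n, :] = old.weight.data[:n, :]
new.bias.data[:n] = old.bias.data[:n]
assert torch.equal(new.weight[:4], old.weight)  # leading rows preserved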
@add_start_docstrings_to_model_forward(LXMERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=LxmertForQuestionAnsweringOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
visual_feats: Optional[torch.FloatTensor] = None,
visual_pos: Optional[torch.FloatTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
visual_attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[LxmertForQuestionAnsweringOutput, Tuple[torch.FloatTensor]]:
r"""
labels (`Torch.Tensor` of shape `(batch_size)`, *optional*):
A one-hot representation of the correct answer
"""
# Decide whether to return dict-style output
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Run the LXMERT backbone
lxmert_output = self.lxmert(
input_ids=input_ids,  # input token IDs
visual_feats=visual_feats,  # visual features
visual_pos=visual_pos,  # visual position encodings
token_type_ids=token_type_ids,  # token type IDs
attention_mask=attention_mask,  # attention mask over the text tokens
visual_attention_mask=visual_attention_mask,  # attention mask over the visual features
inputs_embeds=inputs_embeds,  # precomputed input embeddings
output_hidden_states=output_hidden_states,  # whether to output hidden states
output_attentions=output_attentions,  # whether to output attentions
return_dict=return_dict,  # whether to return dict-style output
)
# Pooled cross-modality output from the backbone
pooled_output = lxmert_output[2]
# Score the pooled output with the answer head
answer_score = self.answer_head(pooled_output)
# The loss is only computed when labels are provided
loss = None
if labels is not None:
loss = self.loss(answer_score.view(-1, self.num_qa_labels), labels.view(-1))
# Tuple-style output when return_dict is disabled
if not return_dict:
output = (answer_score,) + lxmert_output[3:]
return (loss,) + output if loss is not None else output
# Otherwise build the dedicated output object
return LxmertForQuestionAnsweringOutput(
loss=loss,  # loss value
question_answering_score=answer_score,  # question answering scores
language_hidden_states=lxmert_output.language_hidden_states,  # language hidden states
vision_hidden_states=lxmert_output.vision_hidden_states,  # vision hidden states
language_attentions=lxmert_output.language_attentions,  # language attentions
vision_attentions=lxmert_output.vision_attentions,  # vision attentions
cross_encoder_attentions=lxmert_output.cross_encoder_attentions,  # cross-encoder attentions
)
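A minimal end-to-end sketch of the forward contract above. Hedged: the visual tensors are random stand-ins for a detector's RoI output, and `unc-nlp/lxmert-base-uncased` is the commonly used public checkpoint:

import torch
from transformers import LxmertTokenizer, LxmertForQuestionAnswering
tokenizer = LxmertTokenizer.from_pretrained("unc-nlp/lxmert-base-uncased")
model = LxmertForQuestionAnswering.from_pretrained("unc-nlp/lxmert-base-uncased")
inputs = tokenizer("What color is the cat?", return_tensors="pt")
visual_feats = torch.randn(1, 36, 2048)  # stand-in RoI features: (batch, regions, visual_feat_dim)
visual_pos = torch.rand(1, 36, 4)  # stand-in normalized boxes: (batch, regions, 4)
outputs = model(**inputs, visual_feats=visual_feats, visual_pos=visual_pos)
print(outputs.question_answering_score.shape)  # (1, num_qa_labels)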