Transformers 源码解析（三十二）

`.\models\data2vec\modeling_data2vec_text.py`

# 设置文件编码为UTF-8，确保支持中文等多种字符集
# 版权声明，告知代码的版权归属于The HuggingFace Inc.团队
#
# 根据Apache许可证2.0版授权使用本文件
# 除非符合许可证的要求，否则不得使用本文件
# 可以从以下网址获取许可证的副本：
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# 除非适用法律要求或书面同意，否则本软件按“原样”分发，
# 没有任何明示或暗示的保证或条件
# 请查看许可证了解具体语言的权限和限制
"""PyTorch Data2VecText model."""

# 导入数学库，用于数学运算
import math
# 导入类型提示工具，用于函数参数和返回值的类型注释
from typing import List, Optional, Tuple, Union

# 导入PyTorch相关库
import torch
# 导入PyTorch中的checkpoint工具
import torch.utils.checkpoint
# 导入PyTorch中的神经网络模块
from torch import nn
# 导入PyTorch中的损失函数：二分类交叉熵损失、多分类交叉熵损失、均方误差损失
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

# 导入激活函数映射表和GELU激活函数
from ...activations import ACT2FN, gelu
# 导入模型输出类，包括基础输出、带过去和交叉注意力的基础输出、带池化和交叉注意力的基础输出、因果语言模型输出和交叉注意力、掩码语言模型输出、多选模型输出、问答模型输出、序列分类器输出、标记分类器输出
from ...modeling_outputs import (
    BaseModelOutputWithPastAndCrossAttentions,
    BaseModelOutputWithPoolingAndCrossAttentions,
    CausalLMOutputWithCrossAttentions,
    MaskedLMOutput,
    MultipleChoiceModelOutput,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutput,
    TokenClassifierOutput,
)
# 导入模型工具类，包括预训练模型和一些工具函数
from ...modeling_utils import PreTrainedModel
# 导入PyTorch工具类，应用前向传播分块、找到可修剪头和索引、修剪线性层
from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
# 导入通用工具，包括添加代码示例文档字符串、添加起始文档字符串、将起始文档字符串添加到模型前向方法、日志记录、替换返回文档字符串
from ...utils import (
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
# 导入Data2VecText的配置类
from .configuration_data2vec_text import Data2VecTextConfig

# 获取当前模块的日志记录器
logger = logging.get_logger(__name__)

# Index constant named for the position at which hidden states start.
# NOTE(review): not referenced anywhere in the visible part of this file — confirm it is still needed.
_HIDDEN_STATES_START_POSITION = 2

# Checkpoint name handed to the docstring decorators for generated code samples
_CHECKPOINT_FOR_DOC = "facebook/data2vec-text-base"
# Config class name handed to the docstring decorators
_CONFIG_FOR_DOC = "Data2VecTextConfig"

# Names of the pretrained Data2VecText checkpoints
DATA2VEC_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "facebook/data2vec-text-base",
    # See all Data2VecText models at https://huggingface.co/models?filter=data2vec-text
]

# 从transformers.models.roberta.modeling_roberta.RobertaEmbeddings复制并修改为Data2VecText
class Data2VecTextForTextEmbeddings(nn.Module):
    """
    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.

    The module sums word embeddings, token-type embeddings and (when `position_embedding_type` is `"absolute"`)
    position embeddings, then applies LayerNorm followed by dropout.
    """

    # Copied from transformers.models.bert.modeling_bert.BertEmbeddings.__init__
    def __init__(self, config):
        """
        Build the embedding tables, the normalization/dropout layers and the non-persistent id buffers.

        Args:
            config: configuration object. This constructor reads `vocab_size`, `hidden_size`, `pad_token_id`,
                `max_position_embeddings`, `type_vocab_size`, `layer_norm_eps`, `hidden_dropout_prob` and,
                optionally, `position_embedding_type` (falls back to `"absolute"` when absent).
        """
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        # This table is replaced further down by one that uses `padding_idx`; it is kept so that the
        # construction sequence of the module stays exactly as before.
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)

        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable names and be able to load
        # any TensorFlow checkpoint file
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
        # Non-persistent buffers: a (1, max_position_embeddings) range of ids and an all-zero tensor of the
        # same shape that serves as the default `token_type_ids`.
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )
        self.register_buffer(
            "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
        )

        self.padding_idx = config.pad_token_id
        # Final position table: the row at `padding_idx` is the padding row.
        self.position_embeddings = nn.Embedding(
            config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
        )

    # NOTE(review): the `def` line of this method was lost in this copy of the file. The parameter list below is
    # rebuilt from the names the body reads; order and defaults follow the RoBERTa embeddings this class is
    # copied from — confirm against callers.
    def forward(
        self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0
    ):
        """
        Compute the summed, normalized embeddings for a batch.

        Args:
            input_ids: token ids; dimension 1 is treated as the sequence axis. May be `None` when
                `inputs_embeds` is given.
            token_type_ids: segment ids. When `None`, the all-zero buffer registered in `__init__` is sliced to
                the sequence length and expanded over the batch.
            position_ids: position ids. When `None`, they are derived from `input_ids` (via
                `create_position_ids_from_input_ids`) or, failing that, from `inputs_embeds`.
            inputs_embeds: pre-computed word embeddings used instead of looking up `input_ids`.
            past_key_values_length: forwarded to `create_position_ids_from_input_ids`.

        Returns:
            torch.Tensor: the embeddings after LayerNorm and dropout.
        """
        if position_ids is None:
            if input_ids is not None:
                # Create the position ids from the input token ids. Any padded tokens remain padded.
                position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length)
            else:
                position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)

        if input_ids is not None:
            input_shape = input_ids.size()
        else:
            # Drop the last (embedding) dimension to recover the id-shaped leading dimensions.
            input_shape = inputs_embeds.size()[:-1]

        seq_length = input_shape[1]

        # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually
        # occurs when it is auto-generated; the registered buffer helps users when tracing the model without
        # passing token_type_ids (issue #5664).
        if token_type_ids is None:
            if hasattr(self, "token_type_ids"):
                buffered_token_type_ids = self.token_type_ids[:, :seq_length]
                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
                token_type_ids = buffered_token_type_ids_expanded
            else:
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)

        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)
        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        embeddings = inputs_embeds + token_type_embeddings

        if self.position_embedding_type == "absolute":
            position_embeddings = self.position_embeddings(position_ids)
            embeddings += position_embeddings

        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings

    def create_position_ids_from_inputs_embeds(self, inputs_embeds):
        """
        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position
        ids.

        Args:
            inputs_embeds: torch.Tensor

        Returns: torch.Tensor
        """
        # All dimensions except the last (embedding) one; dimension 1 is the sequence length.
        input_shape = inputs_embeds.size()[:-1]
        sequence_length = input_shape[1]

        # Sequential ids in [padding_idx + 1, padding_idx + 1 + sequence_length), so the padding row is never used.
        position_ids = torch.arange(
            self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
        )
        # Broadcast the single row of ids over the leading dimensions of the input.
        return position_ids.unsqueeze(0).expand(input_shape)
# Copied from transformers.models.roberta.modeling_roberta.RobertaSelfAttention with Roberta->Data2VecText
class Data2VecTextSelfAttention(nn.Module):
    def __init__(self, config, position_embedding_type=None):
        """
        Set up the query/key/value projections and, for relative position types, the distance embedding.

        Args:
            config: configuration object; reads `hidden_size`, `num_attention_heads`,
                `attention_probs_dropout_prob`, `is_decoder`, optionally `position_embedding_type`, and
                `max_position_embeddings` when a relative position type is selected.
            position_embedding_type: overrides the value from `config` when truthy.

        Raises:
            ValueError: if `hidden_size` is not divisible by `num_attention_heads` and the config has no
                `embedding_size` attribute.
        """
        super().__init__()
        hidden_size = config.hidden_size
        head_count = config.num_attention_heads
        # A config that carries `embedding_size` is exempt from the divisibility requirement.
        if hidden_size % head_count != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
                f"heads ({config.num_attention_heads})"
            )

        self.num_attention_heads = head_count
        self.attention_head_size = int(hidden_size / head_count)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        # Projections from the hidden size to the concatenated per-head size
        self.query = nn.Linear(hidden_size, self.all_head_size)
        self.key = nn.Linear(hidden_size, self.all_head_size)
        self.value = nn.Linear(hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

        # An explicit (truthy) argument wins; otherwise fall back to the config, then to "absolute".
        if position_embedding_type:
            self.position_embedding_type = position_embedding_type
        else:
            self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")

        # Relative position types need one embedding per signed distance: 2 * max - 1 entries.
        if self.position_embedding_type in ("relative_key", "relative_key_query"):
            self.max_position_embeddings = config.max_position_embeddings
            self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)

        self.is_decoder = config.is_decoder

    # Reshape the tensor for multi-head attention computation
    def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
        """
        Split the last dimension of `x` into heads and swap axes 1 and 2.

        The last dimension is viewed as `(num_attention_heads, attention_head_size)`; the result is then
        permuted with `(0, 2, 1, 3)`, so the viewed tensor must have exactly four dimensions.
        """
        leading_dims = x.size()[:-1]
        per_head = x.view(*leading_dims, self.num_attention_heads, self.attention_head_size)
        return per_head.permute(0, 2, 1, 3)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        output_attentions: Optional[bool] = False,
# Copied from transformers.models.bert.modeling_bert.BertSelfOutput
class Data2VecTextSelfOutput(nn.Module):
    """Post-attention projection: dense -> dropout -> residual add -> LayerNorm."""

    def __init__(self, config):
        """Create the layers from `config.hidden_size`, `config.layer_norm_eps` and `config.hidden_dropout_prob`."""
        super().__init__()
        width = config.hidden_size
        # Square projection that keeps the hidden size
        self.dense = nn.Linear(width, width)
        self.LayerNorm = nn.LayerNorm(width, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        """
        Project `hidden_states`, apply dropout, add `input_tensor` as a residual and normalize the sum.

        Args:
            hidden_states: tensor fed through the dense layer.
            input_tensor: residual added before LayerNorm.

        Returns:
            torch.Tensor: the normalized sum.
        """
        projected = self.dropout(self.dense(hidden_states))
        return self.LayerNorm(projected + input_tensor)
# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->Data2VecText
# 定义 Data2VecTextAttention 类，继承自 nn.Module，用于处理 Data2VecText 模型的自注意力机制

class Data2VecTextAttention(nn.Module):
    """Self-attention module followed by its output projection, with bookkeeping for pruned heads."""

    def __init__(self, config, position_embedding_type=None):
        """
        Args:
            config: configuration forwarded to both sub-modules.
            position_embedding_type: forwarded to `Data2VecTextSelfAttention`.
        """
        super().__init__()
        self.self = Data2VecTextSelfAttention(config, position_embedding_type=position_embedding_type)
        self.output = Data2VecTextSelfOutput(config)
        # Heads removed so far; handed back to `find_pruneable_heads_and_indices` on every call.
        self.pruned_heads = set()

    def prune_heads(self, heads):
        """
        Remove the given attention heads from the query/key/value and output projections.

        Args:
            heads: collection of head indices to prune; an empty collection is a no-op.
        """
        if len(heads) == 0:
            return

        attn = self.self
        heads, index = find_pruneable_heads_and_indices(
            heads, attn.num_attention_heads, attn.attention_head_size, self.pruned_heads
        )

        # Prune the three input projections with the default `dim`, then the output projection along dim 1.
        for proj_name in ("query", "key", "value"):
            setattr(attn, proj_name, prune_linear_layer(getattr(attn, proj_name), index))
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Keep the head count and total head size in sync with the pruned layers.
        attn.num_attention_heads = attn.num_attention_heads - len(heads)
        attn.all_head_size = attn.attention_head_size * attn.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        """
        Run the attention sub-module, then combine its first output with `hidden_states`.

        All arguments are passed positionally, in order, to `self.self`.

        Returns:
            tuple: `(attention_output, *rest)` where `rest` is every element after the first one returned by
            `self.self`.
        """
        raw_outputs = self.self(
            hidden_states,
            attention_mask,
            head_mask,
            encoder_hidden_states,
            encoder_attention_mask,
            past_key_value,
            output_attentions,
        )
        context, extras = raw_outputs[0], raw_outputs[1:]
        # `hidden_states` is the residual input of the output sub-module.
        attention_output = self.output(context, hidden_states)
        return (attention_output,) + extras


# Copied from transformers.models.bert.modeling_bert.BertIntermediate
# 定义 Data2VecTextIntermediate 类，继承自 nn.Module，用于处理 Data2VecText 模型的中间层

class Data2VecTextIntermediate(nn.Module):
    """Feed-forward expansion: a dense layer from `hidden_size` to `intermediate_size` plus an activation."""

    def __init__(self, config):
        """
        Args:
            config: reads `hidden_size`, `intermediate_size` and `hidden_act`. A string `hidden_act` is looked
                up in `ACT2FN`; any other value is used as the activation as-is.
        """
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        activation = config.hidden_act
        self.intermediate_act_fn = ACT2FN[activation] if isinstance(activation, str) else activation

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Apply the dense layer and then the activation to `hidden_states`."""
        return self.intermediate_act_fn(self.dense(hidden_states))


# Copied from transformers.models.bert.modeling_bert.BertOutput
# 定义 Data2VecTextOutput 类，继承自 nn.Module，用于处理 Data2VecText 模型的输出层
class Data2VecTextOutput(nn.Module):
    """Feed-forward contraction: dense (`intermediate_size` -> `hidden_size`) -> dropout -> residual -> LayerNorm."""

    def __init__(self, config):
        """Create the layers from `intermediate_size`, `hidden_size`, `layer_norm_eps` and `hidden_dropout_prob`."""
        super().__init__()
        out_width = config.hidden_size
        self.dense = nn.Linear(config.intermediate_size, out_width)
        self.LayerNorm = nn.LayerNorm(out_width, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        """
        Project `hidden_states` back to the hidden size, apply dropout, add `input_tensor` and normalize.

        Args:
            hidden_states: tensor fed through the dense layer.
            input_tensor: residual added before LayerNorm.

        Returns:
            torch.Tensor: the normalized sum.
        """
        contracted = self.dropout(self.dense(hidden_states))
        return self.LayerNorm(contracted + input_tensor)
# 从 transformers.models.bert.modeling_bert.BertLayer 复制并修改为 Data2VecTextLayer
class Data2VecTextLayer(nn.Module):
    """One transformer layer: self-attention, optional cross-attention, then a chunked feed-forward block."""

    def __init__(self, config):
        """
        Args:
            config: reads `chunk_size_feed_forward`, `is_decoder` and `add_cross_attention`, and is forwarded
                to the sub-modules.

        Raises:
            ValueError: if `add_cross_attention` is set while `is_decoder` is not.
        """
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        # Dimension along which `apply_chunking_to_forward` is asked to chunk
        self.seq_len_dim = 1
        self.attention = Data2VecTextAttention(config)
        self.is_decoder = config.is_decoder
        self.add_cross_attention = config.add_cross_attention
        if self.add_cross_attention and not self.is_decoder:
            raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
        if self.add_cross_attention:
            # Cross-attention always uses absolute position embeddings.
            self.crossattention = Data2VecTextAttention(config, position_embedding_type="absolute")
        self.intermediate = Data2VecTextIntermediate(config)
        self.output = Data2VecTextOutput(config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        """
        Run the layer.

        Args:
            hidden_states: input of the layer.
            attention_mask: passed to self-attention and to cross-attention.
            head_mask: passed to self-attention and to cross-attention.
            encoder_hidden_states: when given and the layer is a decoder, cross-attention is applied.
            encoder_attention_mask: passed to cross-attention.
            past_key_value: cached entries; the first two are used for self-attention, the last two for
                cross-attention.
            output_attentions: forwarded to the attention modules.

        Returns:
            tuple: `(layer_output, *attention_extras)`, followed by the present key/value cache when the layer
            is a decoder.

        Raises:
            ValueError: if `encoder_hidden_states` is given to a decoder layer without cross-attention.
        """
        # Self-attention cache lives in the first two entries of `past_key_value`.
        self_attn_cache = None if past_key_value is None else past_key_value[:2]
        self_attention_outputs = self.attention(
            hidden_states,
            attention_mask,
            head_mask,
            output_attentions=output_attentions,
            past_key_value=self_attn_cache,
        )
        attention_output = self_attention_outputs[0]

        if self.is_decoder:
            # For a decoder the last element is the self-attention cache, not an attention map.
            present_key_value = self_attention_outputs[-1]
            outputs = self_attention_outputs[1:-1]
        else:
            outputs = self_attention_outputs[1:]

        if self.is_decoder and encoder_hidden_states is not None:
            if not hasattr(self, "crossattention"):
                raise ValueError(
                    f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers"
                    " by setting `config.add_cross_attention=True`"
                )

            # Cross-attention cache lives in the last two entries of `past_key_value`.
            cross_attn_cache = None if past_key_value is None else past_key_value[-2:]
            cross_attention_outputs = self.crossattention(
                attention_output,
                attention_mask,
                head_mask,
                encoder_hidden_states,
                encoder_attention_mask,
                cross_attn_cache,
                output_attentions,
            )
            attention_output = cross_attention_outputs[0]
            outputs = outputs + cross_attention_outputs[1:-1]
            # Append the cross-attention cache after the self-attention cache.
            present_key_value = present_key_value + cross_attention_outputs[-1]

        layer_output = apply_chunking_to_forward(
            self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
        )
        outputs = (layer_output,) + outputs

        if self.is_decoder:
            outputs = outputs + (present_key_value,)
        return outputs

    def feed_forward_chunk(self, attention_output):
        """Apply the intermediate module, then the output module with `attention_output` as the residual."""
        return self.output(self.intermediate(attention_output), attention_output)
# 从 transformers.models.bert.modeling_bert.BertEncoder 复制而来，将 Bert 替换为 Data2VecText
class Data2VecTextEncoder(nn.Module):
    """Stack of `config.num_hidden_layers` `Data2VecTextLayer` modules applied one after another."""

    def __init__(self, config):
        """Keep `config`, build the layer stack and start with gradient checkpointing switched off."""
        super().__init__()
        self.config = config
        layer_stack = [Data2VecTextLayer(config) for _ in range(config.num_hidden_layers)]
        self.layer = nn.ModuleList(layer_stack)
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = False,
        output_hidden_states: Optional[bool] = False,
        return_dict: Optional[bool] = True,
    ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
        """
        Run every layer in order and collect the requested extras.

        Args:
            hidden_states: input of the first layer.
            attention_mask, encoder_hidden_states, encoder_attention_mask: passed unchanged to every layer.
            head_mask: indexed per layer when not `None`.
            past_key_values: indexed per layer when not `None`.
            use_cache: when truthy, the last element of every layer's output is collected as the new cache.
                Forced to `False` (with a one-time warning) under gradient checkpointing in training mode.
            output_attentions: when truthy, element 1 of every layer's output is collected, plus element 2
                when `config.add_cross_attention` is set.
            output_hidden_states: when truthy, the input of every layer and the final output are collected.
            return_dict: selects between the output object and a plain tuple.

        Returns:
            A `BaseModelOutputWithPastAndCrossAttentions`, or, when `return_dict` is falsy, a tuple of the
            non-`None` values among (last hidden state, cache, hidden states, self-attentions,
            cross-attentions), in that order.
        """
        checkpointing = self.gradient_checkpointing and self.training
        if checkpointing and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
            )
            use_cache = False

        # Each accumulator is an empty tuple when its output was requested and None otherwise.
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None
        all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
        next_decoder_cache = () if use_cache else None

        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                # Record the input of this layer.
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_head_mask = None if head_mask is None else head_mask[i]
            past_key_value = None if past_key_values is None else past_key_values[i]
            layer_args = (
                hidden_states,
                attention_mask,
                layer_head_mask,
                encoder_hidden_states,
                encoder_attention_mask,
                past_key_value,
                output_attentions,
            )

            if checkpointing:
                layer_outputs = self._gradient_checkpointing_func(layer_module.__call__, *layer_args)
            else:
                layer_outputs = layer_module(*layer_args)

            hidden_states = layer_outputs[0]
            if use_cache:
                next_decoder_cache = next_decoder_cache + (layer_outputs[-1],)
            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)
                if self.config.add_cross_attention:
                    all_cross_attentions = all_cross_attentions + (layer_outputs[2],)

        if output_hidden_states:
            # Record the output of the last layer as well.
            all_hidden_states = all_hidden_states + (hidden_states,)

        if return_dict:
            return BaseModelOutputWithPastAndCrossAttentions(
                last_hidden_state=hidden_states,
                past_key_values=next_decoder_cache,
                hidden_states=all_hidden_states,
                attentions=all_self_attentions,
                cross_attentions=all_cross_attentions,
            )

        candidates = [
            hidden_states,
            next_decoder_cache,
            all_hidden_states,
            all_self_attentions,
            all_cross_attentions,
        ]
        return tuple(item for item in candidates if item is not None)
# Copied from transformers.models.bert.modeling_bert.BertPooler
class Data2VecTextPooler(nn.Module):
    """Pool a sequence by passing the hidden state at index 0 of dimension 1 through dense + tanh."""

    def __init__(self, config):
        """Create a square dense layer of size `config.hidden_size` and a tanh activation."""
        super().__init__()
        width = config.hidden_size
        self.dense = nn.Linear(width, width)
        self.activation = nn.Tanh()

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """
        Args:
            hidden_states: tensor indexed as `[:, 0]` to select the first position.

        Returns:
            torch.Tensor: `tanh(dense(hidden_states[:, 0]))`.
        """
        # "Pooling" here is simply taking the hidden state of the first token.
        return self.activation(self.dense(hidden_states[:, 0]))


class Data2VecTextPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = Data2VecTextConfig
    base_model_prefix = "data2vec_text"
    supports_gradient_checkpointing = True
    _no_split_modules = ["Data2VecTextForTextEmbeddings", "Data2VecTextLayer"]

    def _init_weights(self, module):
        """
        Initialize the weights of a single sub-module in place.

        `nn.Linear` and `nn.Embedding` weights are drawn from a normal distribution with mean 0 and standard
        deviation `config.initializer_range`; linear biases and the embedding padding row are zeroed.
        `nn.LayerNorm` gets a zero bias and a weight of ones. Any other module type is left untouched.
        """
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
            return

        if isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            pad_row = module.padding_idx
            if pad_row is not None:
                # Keep the padding row at zero after the random fill.
                module.weight.data[pad_row].zero_()
            return

        if isinstance(module, nn.LayerNorm):
            # Either parameter may be missing or None; only touch the ones that exist.
            norm_bias = getattr(module, "bias", None)
            if norm_bias is not None:
                norm_bias.data.zero_()
            norm_weight = getattr(module, "weight", None)
            if norm_weight is not None:
                norm_weight.data.fill_(1.0)


# Shared introduction prepended to the model class docstrings through `add_start_docstrings`.
DATA2VECTEXT_START_DOCSTRING = r"""
    Data2VecText was proposed in [data2vec: A General Framework for Self-supervised Learning in Speech, Vision and
    Language](https://arxiv.org/pdf/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu and
    Michael Auli.

    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`Data2VecTextConfig`]): Model configuration class with all the parameters of the
            model. Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

# NOTE(review): a multi-line string constant documenting the forward-pass input arguments belongs here. The
# forward decorators further down reference it as `DATA2VECTEXT_INPUTS_DOCSTRING`, but only a one-line
# description of that constant survived in this copy of the file — restore its definition before use.

# NOTE(review): the `class` statement was missing between the decorator and the docstring in this copy of the
# file. The class name matches the `Data2VecTextModel(config, add_pooling_layer=False)` call further down; the
# base class matches the sibling head models and the `super().__init__(config)` / `post_init()` calls.
@add_start_docstrings(
    "The bare Data2VecText Model for text transformer outputting raw hidden-states without any specific head on top.",
    DATA2VECTEXT_START_DOCSTRING,
)
class Data2VecTextModel(Data2VecTextPreTrainedModel):
    """
    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in *Attention is
    all you need*_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz
    Kaiser and Illia Polosukhin.

    To behave as a decoder, the model needs to be initialized with the `is_decoder` argument of the configuration set
    to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and
    `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.

    .. _*Attention is all you need*: https://arxiv.org/abs/1706.03762

    """

    # 初始化函数，用于初始化模型
    def __init__(self, config, add_pooling_layer=True):
        """
        Build the embeddings, the encoder and, optionally, the pooler.

        Args:
            config: model configuration, stored on the instance and forwarded to every sub-module.
            add_pooling_layer: when falsy, `self.pooler` is set to `None` instead of a `Data2VecTextPooler`.
        """
        super().__init__(config)
        self.config = config

        self.embeddings = Data2VecTextForTextEmbeddings(config)
        self.encoder = Data2VecTextEncoder(config)

        if add_pooling_layer:
            self.pooler = Data2VecTextPooler(config)
        else:
            self.pooler = None

        # Hook provided by the base class; must run after all sub-modules exist.
        self.post_init()

    # 获取输入词嵌入的方法
    def get_input_embeddings(self):
        """Return the `word_embeddings` attribute of `self.embeddings`."""
        embedding_block = self.embeddings
        return embedding_block.word_embeddings

    # 设置输入词嵌入的方法
    def set_input_embeddings(self, value):
        """Replace the `word_embeddings` attribute of `self.embeddings` with `value`."""
        setattr(self.embeddings, "word_embeddings", value)

    # 剪枝模型中注意力头部的方法
    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer_index, head_list in heads_to_prune.items():
            # Delegate to the attention module of the addressed encoder layer.
            attention = self.encoder.layer[layer_index].attention
            attention.prune_heads(head_list)

    # 覆盖的前向传播方法，实现模型的前向计算
    @add_start_docstrings_to_model_forward(DATA2VECTEXT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=BaseModelOutputWithPoolingAndCrossAttentions,
        config_class=_CONFIG_FOR_DOC,
    )
    # 从 transformers.models.bert.modeling_bert.BertModel.forward 复制过来的
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
@add_start_docstrings(
    """Data2VecText Model with a `language modeling` head on top for CLM fine-tuning.""", DATA2VECTEXT_START_DOCSTRING
)
class Data2VecTextForCausalLM(Data2VecTextPreTrainedModel):
    # Parameter names declared as tied weights; the list is consumed by the base class.
    _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]

    def __init__(self, config):
        """Build the backbone (without pooling layer) and the language-modeling head from `config`."""
        super().__init__(config)

        if not config.is_decoder:
            logger.warning("If you want to use `Data2VecTextForCausalLM` as a standalone, add `is_decoder=True`.")

        # Backbone without the pooling layer: the LM head works on per-token hidden states.
        self.data2vec_text = Data2VecTextModel(config, add_pooling_layer=False)
        self.lm_head = Data2VecTextLMHead(config)

        # Initialize weights and apply final processing.
        self.post_init()

    def get_output_embeddings(self):
        """Return the decoder (output projection) module of the LM head."""
        return self.lm_head.decoder

    def set_output_embeddings(self, new_embeddings):
        """Replace the decoder (output projection) module of the LM head."""
        self.lm_head.decoder = new_embeddings

    @add_start_docstrings_to_model_forward(DATA2VECTEXT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, CausalLMOutputWithCrossAttentions]:
        r"""
        encoder_hidden_states (`torch.FloatTensor`, *optional*):
            Forwarded unchanged to the backbone model.
        encoder_attention_mask (`torch.FloatTensor`, *optional*):
            Forwarded unchanged to the backbone model.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the left-to-right language modeling loss (next token prediction). The logits at
            position `i` are scored against the label at position `i + 1`. Tokens with label `-100` are ignored by
            the cross-entropy loss.
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*):
            Cached key/value states, forwarded unchanged to the backbone model.
        use_cache (`bool`, *optional*):
            Forwarded to the backbone model. Forced to `False` when `labels` is provided.

        Returns:

        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        if labels is not None:
            # A key/value cache is only useful for incremental decoding, not when computing a training loss.
            use_cache = False

        outputs = self.data2vec_text(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]
        prediction_scores = self.lm_head(sequence_output)

        lm_loss = None
        if labels is not None:
            # Next-token prediction: drop the last prediction and the first label so that
            # the prediction at position i is compared with the token at position i + 1.
            shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous()
            labels = labels[:, 1:].contiguous()
            loss_fct = CrossEntropyLoss()

            # Keep labels on the same device as the logits.
            labels = labels.to(shifted_prediction_scores.device)
            lm_loss = loss_fct(shifted_prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))

        if not return_dict:
            # Same tuple layout as the other heads in this file: logits followed by outputs[2:].
            output = (prediction_scores,) + outputs[2:]
            return ((lm_loss,) + output) if lm_loss is not None else output

        return CausalLMOutputWithCrossAttentions(
            loss=lm_loss,
            logits=prediction_scores,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            cross_attentions=outputs.cross_attentions,
        )

    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, **model_kwargs):
        """
        Build the keyword arguments for one generation step.

        Returns a dict with `input_ids` (trimmed when a cache is given), `attention_mask` (all ones when not
        provided) and `past_key_values`.
        """
        input_shape = input_ids.shape
        # Without an explicit mask, attend to every position.
        if attention_mask is None:
            attention_mask = input_ids.new_ones(input_shape)

        # With a cache, only the tokens that are not cached yet need to be fed to the model.
        if past_key_values is not None:
            past_length = past_key_values[0][0].shape[2]

            # Some generation methods already pass only the last input ID.
            if input_ids.shape[1] > past_length:
                remove_prefix_length = past_length
            else:
                # Default to the old behavior: keep only the final ID.
                remove_prefix_length = input_ids.shape[1] - 1

            input_ids = input_ids[:, remove_prefix_length:]

        return {"input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": past_key_values}

    def _reorder_cache(self, past_key_values, beam_idx):
        """Reorder every cached tensor along dim 0 according to `beam_idx`; returns a tuple of per-layer tuples."""
        reordered_past = ()
        for layer_past in past_key_values:
            reordered_past += (
                # beam_idx is moved to each tensor's device before indexing.
                tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
            )
        return reordered_past
# 给 Data2VecTextForMaskedLM 类添加文档字符串，描述其作为一个在顶部带有语言建模头部的 data2vec 模型
@add_start_docstrings("""data2vec Model with a `language modeling` head on top.""", DATA2VECTEXT_START_DOCSTRING)
class Data2VecTextForMaskedLM(Data2VecTextPreTrainedModel):
    # Parameter names declared as tied weights; the list is consumed by the base class.
    _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]

    def __init__(self, config):
        """Build the backbone (without pooling layer) and the language-modeling head from `config`."""
        super().__init__(config)

        # Masked LM relies on bi-directional attention, so a decoder config is flagged.
        if config.is_decoder:
            logger.warning(
                "If you want to use `Data2VecTextForMaskedLM` make sure `config.is_decoder=False` for "
                "bi-directional self-attention."
            )

        self.data2vec_text = Data2VecTextModel(config, add_pooling_layer=False)
        self.lm_head = Data2VecTextLMHead(config)

        # Initialize weights and apply final processing.
        self.post_init()

    def get_output_embeddings(self):
        """Return the decoder (output projection) module of the LM head."""
        return self.lm_head.decoder

    def set_output_embeddings(self, new_embeddings):
        """Replace the decoder (output projection) module of the LM head."""
        self.lm_head.decoder = new_embeddings

    @add_start_docstrings_to_model_forward(DATA2VECTEXT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=MaskedLMOutput,
        config_class=_CONFIG_FOR_DOC,
        mask="<mask>",
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, MaskedLMOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are ignored (masked);
            the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        """
        # Fall back to the configured default when the caller does not choose an output format.
        if return_dict is None:
            return_dict = self.config.use_return_dict

        backbone_outputs = self.data2vec_text(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        # The first backbone output is fed to the LM head to obtain per-token vocabulary scores.
        vocab_logits = self.lm_head(backbone_outputs[0])

        if labels is None:
            mlm_loss = None
        else:
            criterion = CrossEntropyLoss()
            # Labels must live on the same device as the logits.
            targets = labels.to(vocab_logits.device)
            mlm_loss = criterion(vocab_logits.view(-1, self.config.vocab_size), targets.view(-1))

        if return_dict:
            return MaskedLMOutput(
                loss=mlm_loss,
                logits=vocab_logits,
                hidden_states=backbone_outputs.hidden_states,
                attentions=backbone_outputs.attentions,
            )

        # Tuple output: optional loss, then logits, then backbone outputs from index 2 onwards.
        tuple_output = (vocab_logits,) + backbone_outputs[2:]
        if mlm_loss is None:
            return tuple_output
        return (mlm_loss,) + tuple_output
# 从transformers.models.roberta.modeling_roberta.RobertaLMHead复制并将Roberta改为Data2VecText
class Data2VecTextLMHead(nn.Module):
    """Data2VecText Head for masked language modeling."""

    def __init__(self, config):
        """Create the dense layer, layer norm and vocabulary decoder sized from `config`."""
        super().__init__()
        hidden_size = config.hidden_size
        vocab_size = config.vocab_size

        self.dense = nn.Linear(hidden_size, hidden_size)
        self.layer_norm = nn.LayerNorm(hidden_size, eps=config.layer_norm_eps)

        # Projection from hidden states to vocabulary scores; its bias is the separately
        # registered, zero-initialized `self.bias` parameter (both names refer to one object).
        self.decoder = nn.Linear(hidden_size, vocab_size)
        self.bias = nn.Parameter(torch.zeros(vocab_size))
        self.decoder.bias = self.bias

    def forward(self, features, **kwargs):
        """Apply dense -> GELU -> LayerNorm -> decoder to `features` and return the result."""
        transformed = self.layer_norm(gelu(self.dense(features)))
        return self.decoder(transformed)

    def _tie_weights(self):
        """Re-link `self.bias` and `self.decoder.bias` when they have become different objects."""
        # If the decoder bias lives on the "meta" device, point it at `self.bias`;
        # otherwise adopt the decoder's bias as `self.bias`.
        if self.decoder.bias.device.type != "meta":
            self.bias = self.decoder.bias
        else:
            self.decoder.bias = self.bias


@add_start_docstrings(
    """
    Data2VecText模型变换器，顶部带有序列分类/回归头（汇总输出的线性层），例如用于GLUE任务。
    """,
    DATA2VECTEXT_START_DOCSTRING,
)
class Data2VecTextForSequenceClassification(Data2VecTextPreTrainedModel):
    def __init__(self, config):
        """Build the backbone (without pooling layer) and the classification head from `config`."""
        super().__init__(config)
        self.num_labels = config.num_labels
        self.config = config

        self.data2vec_text = Data2VecTextModel(config, add_pooling_layer=False)
        self.classifier = Data2VecTextClassificationHead(config)

        # Initialize weights and apply final processing.
        self.post_init()

    @add_start_docstrings_to_model_forward(DATA2VECTEXT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=SequenceClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SequenceClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        # Fall back to the configured default when the caller does not choose an output format.
        if return_dict is None:
            return_dict = self.config.use_return_dict

        backbone_outputs = self.data2vec_text(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        # The classification head consumes the first backbone output.
        logits = self.classifier(backbone_outputs[0])

        loss = None
        if labels is not None:
            # Labels must live on the same device as the logits.
            labels = labels.to(logits.device)

            # Infer the problem type once, from the label count and label dtype, and store it on the config.
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    inferred_type = "regression"
                elif self.num_labels > 1 and labels.dtype in (torch.long, torch.int):
                    inferred_type = "single_label_classification"
                else:
                    inferred_type = "multi_label_classification"
                self.config.problem_type = inferred_type

            problem_type = self.config.problem_type
            if problem_type == "regression":
                criterion = MSELoss()
                if self.num_labels == 1:
                    # Single target: compare squeezed logits with squeezed labels.
                    loss = criterion(logits.squeeze(), labels.squeeze())
                else:
                    loss = criterion(logits, labels)
            elif problem_type == "single_label_classification":
                criterion = CrossEntropyLoss()
                loss = criterion(logits.view(-1, self.num_labels), labels.view(-1))
            elif problem_type == "multi_label_classification":
                criterion = BCEWithLogitsLoss()
                loss = criterion(logits, labels)

        if return_dict:
            return SequenceClassifierOutput(
                loss=loss,
                logits=logits,
                hidden_states=backbone_outputs.hidden_states,
                attentions=backbone_outputs.attentions,
            )

        # Tuple output: optional loss, then logits, then backbone outputs from index 2 onwards.
        tuple_output = (logits,) + backbone_outputs[2:]
        if loss is None:
            return tuple_output
        return (loss,) + tuple_output
# 添加文档字符串描述模型基础信息和任务应用场景
@add_start_docstrings(
    """
    Data2VecText Model with a multiple choice classification head on top (a linear layer on top of the pooled output
    and a softmax) e.g. for RocStories/SWAG tasks.
    """,
    DATA2VECTEXT_START_DOCSTRING,
)
class Data2VecTextForMultipleChoice(Data2VecTextPreTrainedModel):
    def __init__(self, config):
        """Build the backbone (with its default pooling layer), dropout and the per-choice scoring layer."""
        super().__init__(config)

        self.data2vec_text = Data2VecTextModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # One scalar score per (example, choice) pair; scores are regrouped per example in `forward`.
        self.classifier = nn.Linear(config.hidden_size, 1)

        # Initialize weights and apply final processing.
        self.post_init()

    @add_start_docstrings_to_model_forward(
        DATA2VECTEXT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")
    )
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=MultipleChoiceModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, MultipleChoiceModelOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        # The number of choices is the second dimension of whichever input representation was given.
        num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]

        # Merge the batch and choice dimensions so the backbone sees one sequence per row.
        flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
        flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
        flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
        flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
        flat_inputs_embeds = (
            inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
            if inputs_embeds is not None
            else None
        )

        outputs = self.data2vec_text(
            flat_input_ids,
            position_ids=flat_position_ids,
            token_type_ids=flat_token_type_ids,
            attention_mask=flat_attention_mask,
            head_mask=head_mask,
            inputs_embeds=flat_inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        # The second backbone output is the pooled representation.
        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        # Regroup the per-row scores into one row of `num_choices` scores per example.
        reshaped_logits = logits.view(-1, num_choices)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            # Labels must live on the same device as the logits.
            labels = labels.to(reshaped_logits.device)
            loss = loss_fct(reshaped_logits, labels)

        if not return_dict:
            # Tuple output: optional loss, then logits, then backbone outputs from index 2 onwards.
            output = (reshaped_logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return MultipleChoiceModelOutput(
            loss=loss,
            logits=reshaped_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
@add_start_docstrings(
    """
    Data2VecText Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g.
    for Named-Entity-Recognition (NER) tasks.
    """,
    DATA2VECTEXT_START_DOCSTRING,
)
class Data2VecTextForTokenClassification(Data2VecTextPreTrainedModel):
    def __init__(self, config):
        """Build the backbone (without pooling layer), dropout and the per-token classifier from `config`."""
        super().__init__(config)
        self.num_labels = config.num_labels

        self.data2vec_text = Data2VecTextModel(config, add_pooling_layer=False)
        # Use the dedicated classifier dropout when configured, otherwise the hidden dropout probability.
        classifier_dropout = (
            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(classifier_dropout)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing.
        self.post_init()

    @add_start_docstrings_to_model_forward(DATA2VECTEXT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TokenClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, TokenClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.data2vec_text(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # The first backbone output holds the per-token hidden states.
        sequence_output = outputs[0]

        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()

            # Labels must live on the same device as the logits.
            labels = labels.to(logits.device)
            # Flatten so that every token is scored as an independent classification example.
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            # Tuple output: optional loss, then logits, then backbone outputs from index 2 onwards.
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
# 从transformers.models.roberta.modeling_roberta.RobertaClassificationHead复制并修改为Data2VecTextClassificationHead
class Data2VecTextClassificationHead(nn.Module):
    """Head for sentence-level classification tasks."""

    def __init__(self, config):
        """Create the dense layer, dropout and output projection sized from `config`."""
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)

        # Prefer the dedicated classifier dropout; fall back to the hidden dropout probability.
        dropout_prob = config.classifier_dropout
        if dropout_prob is None:
            dropout_prob = config.hidden_dropout_prob
        self.dropout = nn.Dropout(dropout_prob)

        # Projects from hidden_size to num_labels.
        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, features, **kwargs):
        """Classify from the first position of `features`: dropout -> dense -> tanh -> dropout -> out_proj."""
        # Index 0 along dim 1 is the first token (equivalent to [CLS]).
        first_token = features[:, 0, :]
        activated = torch.tanh(self.dense(self.dropout(first_token)))
        return self.out_proj(self.dropout(activated))


@add_start_docstrings(
    """
    Data2VecText模型的问题回答任务头部，用于像SQuAD这样的抽取式问答任务（在隐藏状态输出之上使用线性层来计算“起始位置logits”和“结束位置logits”）。
    """,
    DATA2VECTEXT_START_DOCSTRING,
)
class Data2VecTextForQuestionAnswering(Data2VecTextPreTrainedModel):
    def __init__(self, config):
        """Build the backbone (without pooling layer) and the span-logit projection from `config`."""
        super().__init__(config)
        self.num_labels = config.num_labels

        self.data2vec_text = Data2VecTextModel(config, add_pooling_layer=False)
        # Projects each hidden state to `num_labels` values; `forward` unpacks them as start and end logits.
        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing.
        self.post_init()

    @add_start_docstrings_to_model_forward(DATA2VECTEXT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=QuestionAnsweringModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        start_positions: Optional[torch.LongTensor] = None,
        end_positions: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, QuestionAnsweringModelOutput]:
        r"""
        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        """
        # Fall back to the configured default when the caller does not choose an output format.
        if return_dict is None:
            return_dict = self.config.use_return_dict

        backbone_outputs = self.data2vec_text(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Project the first backbone output, then split the last dimension into start/end logits.
        span_logits = self.qa_outputs(backbone_outputs[0])
        start_part, end_part = span_logits.split(1, dim=-1)
        start_logits = start_part.squeeze(-1).contiguous()
        end_logits = end_part.squeeze(-1).contiguous()

        if start_positions is None or end_positions is None:
            total_loss = None
        else:
            # Drop a trailing dimension if the positions carry one.
            if start_positions.dim() > 1:
                start_positions = start_positions.squeeze(-1)
            if end_positions.dim() > 1:
                end_positions = end_positions.squeeze(-1)

            # Positions beyond the sequence are clamped onto the sequence length,
            # which is then used as the loss's ignore index so they contribute nothing.
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            criterion = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = criterion(start_logits, start_positions)
            end_loss = criterion(end_logits, end_positions)
            # The total loss is the mean of the start and end losses.
            total_loss = (start_loss + end_loss) / 2

        if return_dict:
            return QuestionAnsweringModelOutput(
                loss=total_loss,
                start_logits=start_logits,
                end_logits=end_logits,
                hidden_states=backbone_outputs.hidden_states,
                attentions=backbone_outputs.attentions,
            )

        # Tuple output: optional loss, then both logit tensors, then backbone outputs from index 2 onwards.
        tuple_output = (start_logits, end_logits) + backbone_outputs[2:]
        if total_loss is None:
            return tuple_output
        return (total_loss,) + tuple_output
def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
    """
    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
    are ignored. This is modified from fairseq's `utils.make_positions`.

    Args:
        input_ids: torch.Tensor of token IDs; positions are counted along dim 1.
        padding_idx: int, ID of the padding token.
        past_key_values_length: int, optional offset added to every non-padding position.

    Returns:
        torch.Tensor of dtype long with the same shape as `input_ids`; padding entries hold `padding_idx`.
    """
    # 1 where the token is real, 0 where it is padding.
    non_padding = input_ids.ne(padding_idx).int()
    # Running count of real tokens along each row, kept in the mask's dtype.
    running_count = torch.cumsum(non_padding, dim=1).type_as(non_padding)
    # Shift by the cached length, then zero out the padding slots again.
    offset_positions = (running_count + past_key_values_length) * non_padding
    return offset_positions.long() + padding_idx

`.\models\data2vec\modeling_data2vec_vision.py`

# coding=utf-8
# 声明版权信息，此文件版权归 Meta Platforms 和 The HuggingFace Inc. 团队所有
#
# 根据 Apache 许可证版本 2.0 进行许可，除非符合许可证的要求，否则不得使用此文件
# 您可以在以下网址获取许可证的副本：
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# 除非适用法律要求或书面同意，否则按“原样”分发本软件
# 本软件不附带任何明示或暗示的担保或条件
# 有关具体的语言授权，请参阅许可证
""" PyTorch Data2VecVision 模型。"""


import collections.abc  # 导入 collections.abc 模块
import math  # 导入 math 模块
from dataclasses import dataclass  # 从 dataclasses 模块导入 dataclass 装饰器
from typing import List, Optional, Tuple, Union  # 导入类型提示

import torch  # 导入 PyTorch 库
import torch.utils.checkpoint  # 导入 PyTorch 的 checkpoint 模块
from torch import nn  # 从 PyTorch 导入 nn 模块
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss  # 从 nn 导入三种损失函数

from ...activations import ACT2FN  # 从本地导入 ACT2FN 激活函数
from ...modeling_outputs import (  # 导入模型输出相关的类
    BaseModelOutput,
    BaseModelOutputWithPooling,
    ImageClassifierOutput,
    SemanticSegmenterOutput,
)
from ...modeling_utils import PreTrainedModel  # 从 modeling_utils 导入 PreTrainedModel 类
from ...pytorch_utils import (  # 导入 PyTorch 工具函数
    find_pruneable_heads_and_indices,
    meshgrid,
    prune_linear_layer,
)
from ...utils import (  # 导入通用实用函数
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
from .configuration_data2vec_vision import Data2VecVisionConfig  # 导入 Data2VecVisionConfig 配置类


logger = logging.get_logger(__name__)  # Module-level logger for this file

# General docstring
_CONFIG_FOR_DOC = "Data2VecVisionConfig"  # Config class name referenced in generated docstrings

# Base docstring
_CHECKPOINT_FOR_DOC = "facebook/data2vec-vision-base"  # Base checkpoint name used in docstring examples
_EXPECTED_OUTPUT_SHAPE = [1, 197, 768]  # Expected output shape quoted in the docstring example (1 x 197 x 768)

# Image classification docstring
_IMAGE_CLASS_CHECKPOINT = "facebook/data2vec-vision-base-ft1k"  # Checkpoint used for the image-classification example
_IMAGE_CLASS_EXPECTED_OUTPUT = "remote control, remote"  # Expected output quoted in the image-classification example

DATA2VEC_VISION_PRETRAINED_MODEL_ARCHIVE_LIST = [  # Archive list of pretrained Data2VecVision checkpoints
    "facebook/data2vec-vision-base-ft1k",
    # See all Data2VecVision models at https://huggingface.co/models?filter=data2vec-vision
]


@dataclass
# Copied from transformers.models.beit.modeling_beit.BeitModelOutputWithPooling with Beit->Data2VecVision
class Data2VecVisionModelOutputWithPooling(BaseModelOutputWithPooling):
    """
    Class for outputs of [`Data2VecVisionModel`].

    It adds no fields of its own; everything below is inherited from the base output class.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
            Average of the last layer hidden states of the patch tokens (excluding the *[CLS]* token) if
            *config.use_mean_pooling* is set to True. If set to False, then the final hidden state of the *[CLS]* token
            will be returned.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """
# Copied from transformers.models.beit.modeling_beit.drop_path
def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
    """
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Each sample along dim 0 is either zeroed entirely or kept and rescaled by `1 / (1 - drop_prob)`. The input is
    returned untouched when `drop_prob` is 0 or when not training.

    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
    argument.
    """
    if drop_prob == 0.0 or not training:
        return input

    survival_prob = 1 - drop_prob
    # One random draw per sample, broadcast over every remaining dimension (works for any rank, not just 2D ConvNets).
    per_sample_shape = (input.shape[0], *([1] * (input.ndim - 1)))
    uniform_noise = torch.rand(per_sample_shape, dtype=input.dtype, device=input.device)
    # floor(survival_prob + U[0, 1)) is 1 with probability survival_prob and 0 otherwise.
    keep_mask = torch.floor(survival_prob + uniform_noise)
    return input.div(survival_prob) * keep_mask


# Copied from transformers.models.beit.modeling_beit.BeitDropPath with Beit->Data2VecVision
class Data2VecVisionDropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""

    def __init__(self, drop_prob: Optional[float] = None) -> None:
        super().__init__()
        # Stored as given; it is only read when the module is called or printed.
        self.drop_prob = drop_prob

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # Delegate to the module-level `drop_path`, using this module's own train/eval flag.
        return drop_path(hidden_states, drop_prob=self.drop_prob, training=self.training)

    def extra_repr(self) -> str:
        # Shown inside the module's printed representation.
        return f"p={self.drop_prob}"


# Copied from transformers.models.beit.modeling_beit.BeitEmbeddings with Beit->Data2VecVision
class Data2VecVisionEmbeddings(nn.Module):
    """
    Construct the CLS token, position and patch embeddings. Optionally, also the mask token.

    """

    def __init__(self, config: Data2VecVisionConfig) -> None:
        super().__init__()

        # Learnable [CLS] token that is prepended to every sequence of patch embeddings.
        self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))

        # Learnable mask token, only created when the config enables it.
        if config.use_mask_token:
            self.mask_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
        else:
            self.mask_token = None

        self.patch_embeddings = Data2VecVisionPatchEmbeddings(config)
        num_patches = self.patch_embeddings.num_patches

        # Optional absolute position embeddings: one slot for [CLS] plus one per patch.
        if config.use_absolute_position_embeddings:
            self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 1, config.hidden_size))
        else:
            self.position_embeddings = None

        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(
        self, pixel_values: torch.Tensor, bool_masked_pos: Optional[torch.BoolTensor] = None
    ) -> Tuple[torch.Tensor, Tuple[int, int]]:
        """
        Embed `pixel_values` into a token sequence with a leading [CLS] token.

        Args:
            pixel_values: image batch handed to the patch-embedding module.
            bool_masked_pos: optional mask over the patch positions; patch embeddings where it is set are replaced by
                the mask token. Requires the module to have been built with `config.use_mask_token=True`.

        Returns:
            A tuple `(embeddings, (patch_height, patch_width))`: the embedded sequence after dropout, and the patch
            grid size reported by the patch-embedding module.

        Raises:
            ValueError: if `bool_masked_pos` is given but the module has no mask token.
        """
        # The patch part of the position embeddings (everything after the [CLS] slot) is added inside the
        # patch-embedding module.
        patch_position_embeddings = (
            self.position_embeddings[:, 1:, :] if self.position_embeddings is not None else None
        )
        embeddings, (patch_height, patch_width) = self.patch_embeddings(pixel_values, patch_position_embeddings)
        batch_size, seq_len, _ = embeddings.size()

        if bool_masked_pos is not None:
            if self.mask_token is None:
                # Fail with an actionable message instead of an AttributeError on `None.expand`.
                raise ValueError(
                    "`bool_masked_pos` was provided but this model has no mask token. "
                    "Set `config.use_mask_token=True` to enable masking."
                )
            # Replace the masked visual tokens by the mask token.
            mask_tokens = self.mask_token.expand(batch_size, seq_len, -1)
            w = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens)
            embeddings = embeddings * (1 - w) + mask_tokens * w

        # Broadcast the [CLS] token over the batch and give it its own position embedding (slot 0).
        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
        if self.position_embeddings is not None:
            cls_tokens = cls_tokens + self.position_embeddings[:, :1, :]

        # Prepend [CLS] along the sequence dimension, then apply dropout.
        embeddings = torch.cat((cls_tokens, embeddings), dim=1)
        embeddings = self.dropout(embeddings)

        return embeddings, (patch_height, patch_width)
# Copied from transformers.models.beit.modeling_beit.BeitPatchEmbeddings with Beit->Data2VecVision
class Data2VecVisionPatchEmbeddings(nn.Module):
    """
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    """

    def __init__(self, config):
        super().__init__()

        def _as_pair(value):
            # Scalars are duplicated into (value, value); iterables are used as given.
            return value if isinstance(value, collections.abc.Iterable) else (value, value)

        image_size = _as_pair(config.image_size)
        patch_size = _as_pair(config.patch_size)

        # Number of whole patches that fit along each image axis.
        grid_height = image_size[0] // patch_size[0]
        grid_width = image_size[1] // patch_size[1]

        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = config.num_channels
        self.num_patches = grid_width * grid_height
        self.patch_shape = (grid_height, grid_width)

        # A strided convolution whose kernel equals the patch size projects each patch to `hidden_size` channels.
        self.projection = nn.Conv2d(
            config.num_channels, config.hidden_size, kernel_size=patch_size, stride=patch_size
        )

    def forward(self, pixel_values: torch.Tensor, position_embedding: Optional[torch.Tensor] = None) -> torch.Tensor:
        """
        Project `pixel_values` to patch embeddings, optionally adding (resized) position embeddings.

        Returns the flattened embeddings of shape `(batch_size, patch_height * patch_width, hidden_size)` together with
        the `(patch_height, patch_width)` grid produced by the projection.

        Raises:
            ValueError: if the channel dimension of `pixel_values` differs from the configured `num_channels`.
        """
        _, num_channels, _, _ = pixel_values.shape
        if num_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
            )

        embeddings = self.projection(pixel_values)
        patch_height, patch_width = embeddings.shape[-2:]

        if position_embedding is not None:
            # Lay the position embeddings out on the configured patch grid (channels first), then resize them with
            # bicubic interpolation to the grid actually produced for this input.
            grid_height, grid_width = self.patch_shape
            position_grid = position_embedding.view(1, grid_height, grid_width, -1).permute(0, 3, 1, 2)
            position_grid = nn.functional.interpolate(
                position_grid, size=(patch_height, patch_width), mode="bicubic"
            )
            embeddings = embeddings + position_grid

        # (batch, hidden, h, w) -> (batch, h * w, hidden)
        return embeddings.flatten(2).transpose(1, 2), (patch_height, patch_width)


# Copied from transformers.models.beit.modeling_beit.BeitSelfAttention with Beit->Data2VecVision
class Data2VecVisionSelfAttention(nn.Module):
    """
    Multi-head self-attention, optionally with a learned relative position bias.

    When `window_size` is given the module owns a [`Data2VecVisionRelativePositionBias`] whose output is added to the
    attention scores. Independently of that, a bias passed to `forward` is added to the scores as well.
    """

    def __init__(self, config: Data2VecVisionConfig, window_size: Optional[tuple] = None) -> None:
        super().__init__()

        # The hidden size must split evenly across heads, unless the config carries an `embedding_size` attribute.
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size {config.hidden_size} is not a multiple of the number of attention "
                f"heads {config.num_attention_heads}."
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        # Query/key/value projections; note that the key projection has no bias term.
        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=False)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

        # Per-layer relative position bias, only when a window size is provided.
        if window_size:
            self.relative_position_bias = Data2VecVisionRelativePositionBias(config, window_size=window_size)
        else:
            self.relative_position_bias = None

    def transpose_for_scores(self, x):
        """Split the last dimension into heads: `(batch, seq, all_head_size)` -> `(batch, heads, seq, head_size)`."""
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(*new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        relative_position_bias: Optional["Data2VecVisionRelativePositionBias"] = None,
    ) -> Union[Tuple[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]]:
        """
        Compute self-attention over `hidden_states`.

        Args:
            hidden_states: input sequence; its last dimension is projected to queries, keys and values.
            head_mask: optional mask multiplied into the attention probabilities.
            output_attentions: whether to also return the attention probabilities.
            relative_position_bias: optional bias added directly to the attention scores, so it has to be
                broadcastable to them. Despite the annotation, the encoder in this file passes the tensor obtained by
                calling its shared bias module rather than the module itself.

        Returns:
            `(context_layer,)`, or `(context_layer, attention_probs)` when `output_attentions` is set.
        """
        mixed_query_layer = self.query(hidden_states)

        key_layer = self.transpose_for_scores(self.key(hidden_states))
        value_layer = self.transpose_for_scores(self.value(hidden_states))
        query_layer = self.transpose_for_scores(mixed_query_layer)

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))

        # Scale by the square root of the head size.
        attention_scores = attention_scores / math.sqrt(self.attention_head_size)

        # Add the layer's own relative position bias if present (unsqueezed to broadcast over the batch).
        if self.relative_position_bias is not None:
            attention_scores = attention_scores + self.relative_position_bias().unsqueeze(0)

        # Add the externally supplied (shared) relative position bias if provided.
        if relative_position_bias is not None:
            attention_scores = attention_scores + relative_position_bias

        # Normalize the attention scores to probabilities.
        attention_probs = nn.functional.softmax(attention_scores, dim=-1)

        # Dropout on the probabilities effectively drops entire tokens to attend to.
        attention_probs = self.dropout(attention_probs)

        # Mask heads if requested.
        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        context_layer = torch.matmul(attention_probs, value_layer)

        # Merge the heads back: (batch, heads, seq, head_size) -> (batch, seq, all_head_size).
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(*new_context_layer_shape)

        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)

        return outputs
# Copied from transformers.models.beit.modeling_beit.BeitSelfOutput with Beit->Data2VecVision
class Data2VecVisionSelfOutput(nn.Module):
    """
    The residual connection is defined in Data2VecVisionLayer instead of here (as is the case with other models), due to
    the layernorm applied before each block.
    """

    def __init__(self, config: Data2VecVisionConfig) -> None:
        super().__init__()
        # hidden_size -> hidden_size projection followed by dropout.
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor, gamma=None) -> torch.Tensor:
        # `input_tensor` and `gamma` are accepted but not used here.
        projected = self.dense(hidden_states)
        return self.dropout(projected)


# Copied from transformers.models.beit.modeling_beit.BeitAttention with Beit->Data2VecVision
class Data2VecVisionAttention(nn.Module):
    """Self-attention followed by its output projection, with support for pruning attention heads."""

    def __init__(self, config: Data2VecVisionConfig, window_size: Optional[tuple] = None) -> None:
        super().__init__()
        self.attention = Data2VecVisionSelfAttention(config, window_size=window_size)
        self.output = Data2VecVisionSelfOutput(config)
        # Heads that have already been pruned from this layer.
        self.pruned_heads = set()

    def prune_heads(self, heads):
        """Remove the given attention heads from the query/key/value and output projections."""
        if len(heads) == 0:
            return

        attn = self.attention
        heads, index = find_pruneable_heads_and_indices(
            heads, attn.num_attention_heads, attn.attention_head_size, self.pruned_heads
        )

        # Prune the three input projections, and the output projection along its input dimension (dim=1).
        attn.query = prune_linear_layer(attn.query, index)
        attn.key = prune_linear_layer(attn.key, index)
        attn.value = prune_linear_layer(attn.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Update the head bookkeeping and remember what was pruned.
        attn.num_attention_heads = attn.num_attention_heads - len(heads)
        attn.all_head_size = attn.attention_head_size * attn.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        relative_position_bias: Optional["Data2VecVisionRelativePositionBias"] = None,
    ) -> Union[Tuple[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]]:
        attn_results = self.attention(hidden_states, head_mask, output_attentions, relative_position_bias)
        context, *extras = attn_results

        projected = self.output(context, hidden_states)

        # `extras` holds the attention probabilities when they were requested, otherwise it is empty.
        return (projected, *extras)


# Copied from transformers.models.beit.modeling_beit.BeitIntermediate with Beit->Data2VecVision
class Data2VecVisionIntermediate(nn.Module):
    """Linear expansion from `hidden_size` to `intermediate_size` followed by the configured activation."""

    def __init__(self, config: Data2VecVisionConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        # `hidden_act` is either a key into ACT2FN or the activation callable itself.
        activation = config.hidden_act
        self.intermediate_act_fn = ACT2FN[activation] if isinstance(activation, str) else activation

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        expanded = self.dense(hidden_states)
        return self.intermediate_act_fn(expanded)
# Copied from transformers.models.beit.modeling_beit.BeitOutput with Beit->Data2VecVision
class Data2VecVisionOutput(nn.Module):
    """Linear projection from `intermediate_size` back to `hidden_size`, followed by dropout."""

    def __init__(self, config: Data2VecVisionConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        projected = self.dense(hidden_states)
        return self.dropout(projected)


# Copied from transformers.models.beit.modeling_beit.BeitLayer with Beit->Data2VecVision,BEiT->Data2VecVision
class Data2VecVisionLayer(nn.Module):
    """This corresponds to the Block class in the timm implementation."""

    def __init__(
        self, config: Data2VecVisionConfig, window_size: Optional[tuple] = None, drop_path_rate: float = 0.0
    ) -> None:
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1

        self.attention = Data2VecVisionAttention(config, window_size=window_size)
        self.intermediate = Data2VecVisionIntermediate(config)
        self.output = Data2VecVisionOutput(config)
        self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        # Stochastic depth is only instantiated for a positive rate; otherwise the branch is a no-op.
        if drop_path_rate > 0.0:
            self.drop_path = Data2VecVisionDropPath(drop_path_rate)
        else:
            self.drop_path = nn.Identity()
        self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

        # Optional learnable per-channel scales for the two residual branches.
        scale_init = config.layer_scale_init_value
        if scale_init > 0:
            self.lambda_1 = nn.Parameter(scale_init * torch.ones((config.hidden_size)), requires_grad=True)
            self.lambda_2 = nn.Parameter(scale_init * torch.ones((config.hidden_size)), requires_grad=True)
        else:
            self.lambda_1 = None
            self.lambda_2 = None

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        relative_position_bias: Optional["Data2VecVisionRelativePositionBias"] = None,
    ) -> Union[Tuple[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]]:
        # In Data2VecVision, layernorm is applied before self-attention.
        attn_results = self.attention(
            self.layernorm_before(hidden_states),
            head_mask,
            output_attentions=output_attentions,
            relative_position_bias=relative_position_bias,
        )
        # `attn_extras` carries the attention probabilities when they were requested.
        attention_output, *attn_extras = attn_results

        if self.lambda_1 is not None:
            attention_output = self.lambda_1 * attention_output

        # First residual connection.
        residual = self.drop_path(attention_output) + hidden_states

        # Layernorm is also applied after self-attention, ahead of the feed-forward part.
        mlp_output = self.output(self.intermediate(self.layernorm_after(residual)))

        if self.lambda_2 is not None:
            mlp_output = self.lambda_2 * mlp_output

        # Second residual connection.
        layer_output = self.drop_path(mlp_output) + residual

        return (layer_output, *attn_extras)
# Copied from transformers.models.beit.modeling_beit.BeitRelativePositionBias with Beit->Data2VecVision
class Data2VecVisionRelativePositionBias(nn.Module):
    """
    Learned relative position bias for a `window_size = (Wh, Ww)` grid of patch tokens plus one extra leading token.

    A table with one row per relative offset (plus three special rows) and one column per attention head is indexed
    by a precomputed `(Wh*Ww + 1, Wh*Ww + 1)` lookup to produce one bias matrix per head.
    """

    def __init__(self, config: Data2VecVisionConfig, window_size: tuple) -> None:
        super().__init__()
        self.window_size = window_size
        # Size of the bias table: every (dh, dw) offset inside the window, plus 3 special entries
        self.num_relative_distance = (2 * window_size[0] - 1) * (2 * window_size[1] - 1) + 3
        self.relative_position_bias_table = nn.Parameter(
            torch.zeros(self.num_relative_distance, config.num_attention_heads)
        )  # 2*Wh-1 * 2*Ww-1, nH
        # cls to token & token 2 cls & cls to cls

        # get pair-wise relative position index for each token inside the window
        coords_h = torch.arange(window_size[0])
        coords_w = torch.arange(window_size[1])
        coords = torch.stack(meshgrid([coords_h, coords_w], indexing="ij"))  # 2, Wh, Ww
        coords_flatten = torch.flatten(coords, 1)  # 2, Wh*Ww
        relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]  # 2, Wh*Ww, Wh*Ww
        relative_coords = relative_coords.permute(1, 2, 0).contiguous()  # Wh*Ww, Wh*Ww, 2
        relative_coords[:, :, 0] += window_size[0] - 1  # shift to start from 0
        relative_coords[:, :, 1] += window_size[1] - 1
        # scale the row offset so that (row, col) offsets sum to a unique flat index
        relative_coords[:, :, 0] *= 2 * window_size[1] - 1
        relative_position_index = torch.zeros(
            size=(window_size[0] * window_size[1] + 1,) * 2, dtype=relative_coords.dtype
        )
        relative_position_index[1:, 1:] = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
        # The three special table rows: row 0, then column 0, then entry (0, 0). The order of these assignments
        # matters because each one overwrites part of the previous one.
        relative_position_index[0, 0:] = self.num_relative_distance - 3
        relative_position_index[0:, 0] = self.num_relative_distance - 2
        relative_position_index[0, 0] = self.num_relative_distance - 1

        # Registered as a non-persistent buffer (persistent=False)
        self.register_buffer("relative_position_index", relative_position_index, persistent=False)

    def forward(self) -> torch.Tensor:
        """Return the bias tensor of shape `(nH, Wh*Ww + 1, Wh*Ww + 1)` looked up from the table."""
        # Gather one table row per (query, key) pair, then restore the 2D layout
        relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view(
            self.window_size[0] * self.window_size[1] + 1, self.window_size[0] * self.window_size[1] + 1, -1
        )  # Wh*Ww+1, Wh*Ww+1, nH

        return relative_position_bias.permute(2, 0, 1).contiguous()  # nH, Wh*Ww+1, Wh*Ww+1


# Copied from transformers.models.beit.modeling_beit.BeitEncoder with Beit->Data2VecVision
class Data2VecVisionEncoder(nn.Module):
    """
    Stack of [`Data2VecVisionLayer`] modules, optionally with one relative position bias shared by all layers.
    """

    def __init__(self, config: Data2VecVisionConfig, window_size: Optional[tuple] = None) -> None:
        super().__init__()
        self.config = config
        # A single bias module shared by every layer, only when the config asks for it.
        if config.use_shared_relative_position_bias:
            self.relative_position_bias = Data2VecVisionRelativePositionBias(config, window_size=window_size)
        else:
            self.relative_position_bias = None

        # Stochastic depth decay rule: the drop rate grows linearly from 0 to `config.drop_path_rate` over the layers.
        dpr = torch.linspace(0, config.drop_path_rate, config.num_hidden_layers).tolist()
        self.layer = nn.ModuleList(
            [
                Data2VecVisionLayer(
                    config,
                    # Layers only build their own bias when per-layer relative position bias is enabled.
                    window_size=window_size if config.use_relative_position_bias else None,
                    drop_path_rate=dpr[i],
                )
                for i in range(config.num_hidden_layers)
            ]
        )
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ) -> Union[tuple, BaseModelOutput]:
        """
        Run `hidden_states` through every layer.

        Args:
            hidden_states: input sequence for the first layer.
            head_mask: optional per-layer head mask; entry `i` is passed to layer `i`.
            output_attentions: whether to collect the attention probabilities of every layer.
            output_hidden_states: whether to collect the input of every layer plus the final output.
            return_dict: return a [`BaseModelOutput`] when True, otherwise a tuple of the non-`None` results.
        """
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        # The shared bias does not depend on the layer, so compute it once and reuse it for every layer, on both the
        # regular and the gradient-checkpointing path.
        relative_position_bias = self.relative_position_bias() if self.relative_position_bias is not None else None

        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_head_mask = head_mask[i] if head_mask is not None else None

            if self.gradient_checkpointing and self.training:
                # Pass the shared bias here too; leaving it out would silently drop it under checkpointing.
                layer_outputs = self._gradient_checkpointing_func(
                    layer_module.__call__,
                    hidden_states,
                    layer_head_mask,
                    output_attentions,
                    relative_position_bias,
                )
            else:
                layer_outputs = layer_module(hidden_states, layer_head_mask, output_attentions, relative_position_bias)

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )
# Adapted from the BEiT counterpart (BeitPreTrainedModel -> Data2VecVisionPreTrainedModel, beit -> data2vec_vision)
class Data2VecVisionPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = Data2VecVisionConfig  # configuration class used by these models
    base_model_prefix = "data2vec_vision"  # attribute prefix of the base model
    main_input_name = "pixel_values"  # name of the main model input
    supports_gradient_checkpointing = True

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Conv2d, nn.ConvTranspose2d)):
            # Normal(0, initializer_range) weights and zero bias.
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
            return

        if isinstance(module, nn.Embedding):
            # Normal(0, initializer_range) weights; the padding row, if any, is zeroed.
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
            return

        if isinstance(module, nn.LayerNorm):
            # Start LayerNorm with zero bias and unit scale.
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)


# Introductory docstring attached to model classes via `add_start_docstrings`: general usage notes plus the
# description of the `config` parameter.
DATA2VEC_VISION_START_DOCSTRING = r"""
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
    as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`Data2VecVisionConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

# Argument documentation attached to `forward` via `add_start_docstrings_to_model_forward`. The text is emitted
# verbatim into the generated docs, so it must contain only the argument descriptions (no source comments).
DATA2VEC_VISION_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
            [`BeitImageProcessor.__call__`] for details.

        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.

        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.

        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@add_start_docstrings(
    "The bare Data2VecVision Model transformer outputting raw hidden-states without any specific head on top.",
    DATA2VEC_VISION_START_DOCSTRING,
)
# Copied from transformers.models.beit.modeling_beit.BeitModel with BEIT->DATA2VEC_VISION,Beit->Data2VecVision,True->False
class Data2VecVisionModel(Data2VecVisionPreTrainedModel):
    def __init__(self, config: Data2VecVisionConfig, add_pooling_layer: bool = False) -> None:
        super().__init__(config)
        self.config = config

        # Patch embeddings first: the encoder takes the patch grid shape as its window size.
        self.embeddings = Data2VecVisionEmbeddings(config)
        patch_grid = self.embeddings.patch_embeddings.patch_shape
        self.encoder = Data2VecVisionEncoder(config, window_size=patch_grid)

        # With mean pooling the final normalisation lives in the pooler, so the model-level
        # layernorm becomes a no-op.
        if config.use_mean_pooling:
            self.layernorm = nn.Identity()
        else:
            self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

        # The pooler is optional.
        if add_pooling_layer:
            self.pooler = Data2VecVisionPooler(config)
        else:
            self.pooler = None

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        # The patch embedding module is the model's input embedding layer.
        return self.embeddings.patch_embeddings

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer_index, head_indices in heads_to_prune.items():
            self.encoder.layer[layer_index].attention.prune_heads(head_indices)

    @add_start_docstrings_to_model_forward(DATA2VEC_VISION_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=Data2VecVisionModelOutputWithPooling,
        config_class=_CONFIG_FOR_DOC,
        modality="vision",
        expected_output=_EXPECTED_OUTPUT_SHAPE,
    )
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        bool_masked_pos: Optional[torch.BoolTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, Data2VecVisionModelOutputWithPooling]:
        r"""
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        """
        # Explicit arguments win; otherwise fall back to the values stored on the config.
        if output_attentions is None:
            output_attentions = self.config.output_attentions
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states
        if return_dict is None:
            return_dict = self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        # Prepare head mask if needed
        # 1.0 in head_mask indicate we keep the head
        # attention_probs has shape bsz x n_heads x N x N
        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        # The embeddings also return a (height, width) pair, which is not used here.
        embedding_output, (_patch_rows, _patch_cols) = self.embeddings(pixel_values, bool_masked_pos)

        encoder_outputs = self.encoder(
            embedding_output,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = self.layernorm(encoder_outputs[0])
        pooled_output = None if self.pooler is None else self.pooler(sequence_output)

        if return_dict:
            return Data2VecVisionModelOutputWithPooling(
                last_hidden_state=sequence_output,
                pooler_output=pooled_output,
                hidden_states=encoder_outputs.hidden_states,
                attentions=encoder_outputs.attentions,
            )

        # Tuple output: the pooled output is only included when a pooler produced one.
        if pooled_output is None:
            head_outputs = (sequence_output,)
        else:
            head_outputs = (sequence_output, pooled_output)
        return head_outputs + encoder_outputs[1:]
# Copied from transformers.models.beit.modeling_beit.BeitPooler with Beit->Data2VecVision
class Data2VecVisionPooler(nn.Module):
    """Pool the final hidden states into one vector per sample.

    When `config.use_mean_pooling` is set, the patch tokens (every position except index 0) are
    averaged and layer-normalised; otherwise the hidden state at index 0 (the [CLS] token) is
    returned unchanged.
    """

    def __init__(self, config: Data2VecVisionConfig) -> None:
        super().__init__()
        # The LayerNorm only exists in mean-pooling mode; `None` selects [CLS] pooling in `forward`.
        if config.use_mean_pooling:
            self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        else:
            self.layernorm = None

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        if self.layernorm is None:
            # Pool by simply taking the final hidden state of the [CLS] token
            return hidden_states[:, 0]

        # Mean pool the final hidden states of the patch tokens, skipping position 0
        mean_patch_token = hidden_states[:, 1:, :].mean(1)
        return self.layernorm(mean_patch_token)


@add_start_docstrings(
    """
    Data2VecVision Model transformer with an image classification head on top (a linear layer on top of the average of
    the final hidden states of the patch tokens) e.g. for ImageNet.
    """,
    DATA2VEC_VISION_START_DOCSTRING,
)
# Copied from transformers.models.beit.modeling_beit.BeitForImageClassification with BEIT->DATA2VEC_VISION,Beit->Data2VecVision,beit->data2vec_vision
class Data2VecVisionForImageClassification(Data2VecVisionPreTrainedModel):
    def __init__(self, config: Data2VecVisionConfig) -> None:
        super().__init__(config)

        self.num_labels = config.num_labels
        # The backbone is built with its pooling layer; `forward` classifies the pooled output.
        self.data2vec_vision = Data2VecVisionModel(config, add_pooling_layer=True)

        # Classifier head; with no labels configured the pooled output is passed through unchanged.
        if config.num_labels > 0:
            self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        else:
            self.classifier = nn.Identity()

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(DATA2VEC_VISION_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_IMAGE_CLASS_CHECKPOINT,
        output_type=ImageClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
        expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
    )
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, ImageClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        if return_dict is None:
            return_dict = self.config.use_return_dict

        outputs = self.data2vec_vision(
            pixel_values,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # The pooled output is a named field on the output object, or element 1 of the tuple.
        pooled_output = outputs.pooler_output if return_dict else outputs[1]
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            # Infer the problem type once from the label count / dtype and store it on the config.
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and labels.dtype in (torch.long, torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            problem_type = self.config.problem_type
            if problem_type == "regression":
                criterion = MSELoss()
                if self.num_labels == 1:
                    # Single-target regression: compare squeezed predictions and targets.
                    loss = criterion(logits.squeeze(), labels.squeeze())
                else:
                    loss = criterion(logits, labels)
            elif problem_type == "single_label_classification":
                criterion = CrossEntropyLoss()
                loss = criterion(logits.view(-1, self.num_labels), labels.view(-1))
            elif problem_type == "multi_label_classification":
                criterion = BCEWithLogitsLoss()
                loss = criterion(logits, labels)

        if return_dict:
            return ImageClassifierOutput(
                loss=loss,
                logits=logits,
                hidden_states=outputs.hidden_states,
                attentions=outputs.attentions,
            )

        # Tuple output: logits followed by the backbone extras, with the loss prepended when present.
        output = (logits,) + outputs[2:]
        return output if loss is None else (loss,) + output
# Copied from transformers.models.beit.modeling_beit.BeitConvModule with Beit->Data2VecVision
class Data2VecVisionConvModule(nn.Module):
    """
    Convolution -> batch normalisation -> ReLU, bundled as a single module.

    Convolution layers are commonly followed by a norm layer (e.g., BatchNorm) and an activation layer (e.g., ReLU);
    this block packages the three so callers do not have to wire them up each time.

    Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: Union[int, Tuple[int, int]],
        padding: Union[int, Tuple[int, int], str] = 0,
        bias: bool = False,
        dilation: Union[int, Tuple[int, int]] = 1,
    ) -> None:
        super().__init__()
        # Sub-modules are registered in the order conv, bn, activation.
        conv_options = {
            "in_channels": in_channels,
            "out_channels": out_channels,
            "kernel_size": kernel_size,
            "padding": padding,
            "bias": bias,
            "dilation": dilation,
        }
        self.conv = nn.Conv2d(**conv_options)
        self.bn = nn.BatchNorm2d(out_channels)
        self.activation = nn.ReLU()

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        # Apply the three stages in sequence: convolution, batch norm, ReLU.
        return self.activation(self.bn(self.conv(input)))


# Copied from transformers.models.beit.modeling_beit.BeitPyramidPoolingBlock with Beit->Data2VecVision
class Data2VecVisionPyramidPoolingBlock(nn.Module):
    """Adaptive average pooling to `pool_scale` followed by a 1x1 `Data2VecVisionConvModule`."""

    def __init__(self, pool_scale: int, in_channels: int, channels: int) -> None:
        super().__init__()
        pooling = nn.AdaptiveAvgPool2d(pool_scale)
        projection = Data2VecVisionConvModule(in_channels, channels, kernel_size=1)
        # The stages are kept in a plain list and registered under the names "0" and "1".
        self.layers = [pooling, projection]
        self.add_module("0", pooling)
        self.add_module("1", projection)

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        # Run the input through each stage in order: pooling, then the conv module.
        features = input
        for stage in self.layers:
            features = stage(features)
        return features


# Copied from transformers.models.beit.modeling_beit.BeitPyramidPoolingModule with Beit->Data2VecVision
class Data2VecVisionPyramidPoolingModule(nn.Module):
    """
    Pyramid Pooling Module (PPM) used in PSPNet.

    Args:
        pool_scales (tuple[int]): Pooling scales used in Pooling Pyramid
            Module.
        in_channels (int): Input channels.
        channels (int): Channels after modules, before conv_seg.
        align_corners (bool): align_corners argument of F.interpolate.

    Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.
    """

    def __init__(self, pool_scales: Tuple[int, ...], in_channels: int, channels: int, align_corners: bool) -> None:
        super().__init__()
        self.pool_scales = pool_scales
        self.align_corners = align_corners
        self.in_channels = in_channels
        self.channels = channels

        # One pooling block per scale, kept in a plain list ...
        self.blocks = [
            Data2VecVisionPyramidPoolingBlock(pool_scale=scale, in_channels=in_channels, channels=channels)
            for scale in pool_scales
        ]
        # ... and registered as sub-modules named after their index.
        for index, pooling_block in enumerate(self.blocks):
            self.add_module(str(index), pooling_block)

    def forward(self, x: torch.Tensor) -> List[torch.Tensor]:
        """Return one tensor per pooling block, each resized bilinearly to the spatial size of `x`."""
        target_size = x.size()[2:]
        return [
            nn.functional.interpolate(
                pooling_block(x), size=target_size, mode="bilinear", align_corners=self.align_corners
            )
            for pooling_block in self.blocks
        ]
# Copied from transformers.models.beit.modeling_beit.BeitUperHead with Beit->Data2VecVision
class Data2VecVisionUperHead(nn.Module):
    """
    Unified Perceptual Parsing for Scene Understanding. This head is the implementation of
    [UPerNet](https://arxiv.org/abs/1807.10221).

    Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.
    """

    def __init__(self, config: Data2VecVisionConfig) -> None:
        super().__init__()

        hidden_size = config.hidden_size
        self.pool_scales = config.pool_scales  # e.g. (1, 2, 3, 6)
        self.in_channels = [hidden_size] * 4  # e.g. [768, 768, 768, 768]
        self.channels = hidden_size
        self.align_corners = False
        self.classifier = nn.Conv2d(self.channels, config.num_labels, kernel_size=1)

        top_channels = self.in_channels[-1]

        # PSP Module
        self.psp_modules = Data2VecVisionPyramidPoolingModule(
            self.pool_scales,
            top_channels,
            self.channels,
            align_corners=self.align_corners,
        )
        # The bottleneck consumes the top feature map concatenated with one PPM output per scale.
        self.bottleneck = Data2VecVisionConvModule(
            top_channels + len(self.pool_scales) * self.channels,
            self.channels,
            kernel_size=3,
            padding=1,
        )

        # FPN Module
        self.lateral_convs = nn.ModuleList()
        self.fpn_convs = nn.ModuleList()
        for level_channels in self.in_channels[:-1]:  # skip the top layer
            lateral = Data2VecVisionConvModule(level_channels, self.channels, kernel_size=1)
            smoothing = Data2VecVisionConvModule(self.channels, self.channels, kernel_size=3, padding=1)
            self.lateral_convs.append(lateral)
            self.fpn_convs.append(smoothing)

        self.fpn_bottleneck = Data2VecVisionConvModule(
            len(self.in_channels) * self.channels,
            self.channels,
            kernel_size=3,
            padding=1,
        )

    def psp_forward(self, inputs):
        """Run the pyramid pooling module on the last input and fuse the results with the bottleneck."""
        top_features = inputs[-1]
        # Concatenate the raw top features with every pooled branch along the channel dimension.
        stacked = torch.cat([top_features, *self.psp_modules(top_features)], dim=1)
        return self.bottleneck(stacked)

    def forward(self, encoder_hidden_states: torch.Tensor) -> torch.Tensor:
        # build laterals: one lateral conv per lower level, plus the PSP output as the top level
        laterals = [conv(encoder_hidden_states[level]) for level, conv in enumerate(self.lateral_convs)]
        laterals.append(self.psp_forward(encoder_hidden_states))

        # build top-down path: add each level, resized, into the level directly below it
        num_levels = len(laterals)
        for upper in range(num_levels - 1, 0, -1):
            lower = upper - 1
            resized = nn.functional.interpolate(
                laterals[upper], size=laterals[lower].shape[2:], mode="bilinear", align_corners=self.align_corners
            )
            laterals[lower] = laterals[lower] + resized

        # build outputs: every level except the top goes through its FPN conv
        fpn_outs = [fpn_conv(lateral) for fpn_conv, lateral in zip(self.fpn_convs, laterals)]
        # append psp feature
        fpn_outs.append(laterals[-1])

        # Resize every higher level to the spatial size of level 0 before concatenation.
        full_size = fpn_outs[0].shape[2:]
        for level in range(num_levels - 1, 0, -1):
            fpn_outs[level] = nn.functional.interpolate(
                fpn_outs[level], size=full_size, mode="bilinear", align_corners=self.align_corners
            )

        fused = self.fpn_bottleneck(torch.cat(fpn_outs, dim=1))
        return self.classifier(fused)
# Copied from transformers.models.beit.modeling_beit.BeitFCNHead with Beit->Data2VecVision
class Data2VecVisionFCNHead(nn.Module):
    """
    Fully Convolution Networks for Semantic Segmentation. This head is an implementation of FCNNet.

    Args:
        config (Data2VecVisionConfig): Configuration.
        in_index (int): Index into the encoder hidden states of the feature map fed to the head. Default: 2.
        kernel_size (int): The kernel size for convs in the head. Default: 3.
        dilation (int): The dilation rate for convs in the head. Default: 1.

    Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.
    """

    def __init__(
        self,
        config: Data2VecVisionConfig,
        in_index: int = 2,
        kernel_size: int = 3,
        dilation: Union[int, Tuple[int, int]] = 1,
    ) -> None:
        super().__init__()
        self.in_channels = config.hidden_size
        self.channels = config.auxiliary_channels
        self.num_convs = config.auxiliary_num_convs
        self.concat_input = config.auxiliary_concat_input
        self.in_index = in_index

        conv_padding = (kernel_size // 2) * dilation

        def make_conv(input_channels: int) -> nn.Module:
            # Every conv in the stack shares kernel size, padding and dilation.
            return Data2VecVisionConvModule(
                input_channels, self.channels, kernel_size=kernel_size, padding=conv_padding, dilation=dilation
            )

        # The first conv maps in_channels -> channels; the remaining ones keep `channels`.
        conv_stack = [make_conv(self.in_channels)]
        conv_stack.extend(make_conv(self.channels) for _ in range(self.num_convs - 1))
        # With zero convs configured, the stack is replaced by an identity mapping.
        self.convs = nn.Sequential(*conv_stack) if self.num_convs != 0 else nn.Identity()

        if self.concat_input:
            # Fuses the head input with the conv-stack output after channel-wise concatenation.
            self.conv_cat = Data2VecVisionConvModule(
                self.in_channels + self.channels, self.channels, kernel_size=kernel_size, padding=kernel_size // 2
            )

        self.classifier = nn.Conv2d(self.channels, config.num_labels, kernel_size=1)

    def forward(self, encoder_hidden_states: torch.Tensor) -> torch.Tensor:
        # just take the relevant feature maps
        feature_map = encoder_hidden_states[self.in_index]
        output = self.convs(feature_map)
        if self.concat_input:
            output = self.conv_cat(torch.cat([feature_map, output], dim=1))
        return self.classifier(output)


@add_start_docstrings(
    """
    带有语义分割头部的 Data2VecVision 模型变压器，例如用于 ADE20k、CityScapes 等数据集。
    """,
    DATA2VEC_VISION_START_DOCSTRING,
)
# 从 transformers.models.beit.modeling_beit.BeitForSemanticSegmentation 复制而来，将 BEIT->DATA2VEC_VISION,Beit->Data2VecVision,microsoft/beit-base-finetuned-ade-640-640->facebook/data2vec-vision-base,beit->data2vec_vision
class Data2VecVisionForSemanticSegmentation(Data2VecVisionPreTrainedModel):
    def __init__(self, config: Data2VecVisionConfig) -> None:
        """Build the backbone (without pooling layer), the four FPN branches and the segmentation head(s).

        Raises:
            ValueError: if `config.out_indices` does not contain exactly 4 entries.
        """
        super().__init__(config)

        self.num_labels = config.num_labels
        self.data2vec_vision = Data2VecVisionModel(config, add_pooling_layer=False)

        # FPNs
        if len(self.config.out_indices) != 4:
            raise ValueError(
                "Data2VecVisionForSemanticSegmentation requires config.out_indices to be a list of 4 integers, "
                "specifying which features to use from the backbone. One can use [3, 5, 7, 11] in case of "
                "a base-sized architecture."
            )

        hidden_size = config.hidden_size
        # Branch 1: two stride-2 transposed convolutions with BatchNorm + GELU in between.
        self.fpn1 = nn.Sequential(
            nn.ConvTranspose2d(hidden_size, hidden_size, kernel_size=2, stride=2),
            nn.BatchNorm2d(hidden_size),
            nn.GELU(),
            nn.ConvTranspose2d(hidden_size, hidden_size, kernel_size=2, stride=2),
        )
        # Branch 2: a single stride-2 transposed convolution.
        self.fpn2 = nn.Sequential(nn.ConvTranspose2d(hidden_size, hidden_size, kernel_size=2, stride=2))
        # Branch 3: features pass through unchanged.
        self.fpn3 = nn.Identity()
        # Branch 4: stride-2 max pooling.
        self.fpn4 = nn.MaxPool2d(kernel_size=2, stride=2)

        # Semantic segmentation head(s); the auxiliary head is only built when enabled in the config.
        self.decode_head = Data2VecVisionUperHead(config)
        if config.use_auxiliary_head:
            self.auxiliary_head = Data2VecVisionFCNHead(config)
        else:
            self.auxiliary_head = None

        # Initialize weights and apply final processing
        self.post_init()

    def compute_loss(self, logits, auxiliary_logits, labels):
        """Return the cross-entropy segmentation loss.

        Both logit tensors are bilinearly resized to the last two dimensions of `labels` before the
        loss is computed. When `auxiliary_logits` is given, its loss is added to the main loss,
        scaled by `config.auxiliary_loss_weight`. Label positions equal to
        `config.semantic_loss_ignore_index` are ignored.
        """
        label_size = labels.shape[-2:]

        def upsample(tensor):
            # upsample logits to the images' original size
            return nn.functional.interpolate(tensor, size=label_size, mode="bilinear", align_corners=False)

        criterion = CrossEntropyLoss(ignore_index=self.config.semantic_loss_ignore_index)

        # compute weighted loss
        loss = criterion(upsample(logits), labels)
        if auxiliary_logits is None:
            return loss

        auxiliary_loss = criterion(upsample(auxiliary_logits), labels)
        loss += self.config.auxiliary_loss_weight * auxiliary_loss
        return loss

    @add_start_docstrings_to_model_forward(DATA2VEC_VISION_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=SemanticSegmenterOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,

`.\models\data2vec\modeling_tf_data2vec_vision.py`

# coding=utf-8
# 版权 2022 Meta Platforms 和 The HuggingFace Inc. 团队。保留所有权利。
#
# 根据 Apache 许可证 2.0 版本（"许可证"）获得许可；
# 除非符合许可证的规定，否则不得使用此文件。
# 您可以在以下网址获取许可证的副本：
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# 除非适用法律要求或书面同意，否则本软件按"原样"分发，
# 没有任何明示或暗示的担保或条件。
# 有关特定语言的权限，请参阅许可证。
""" TF 2.0 Data2Vec Vision model."""

from __future__ import annotations

import collections.abc
import math
from dataclasses import dataclass
from typing import List, Optional, Tuple, Union

import numpy as np
import tensorflow as tf

from ...activations_tf import get_tf_activation
from ...modeling_tf_outputs import (
    TFBaseModelOutput,
    TFBaseModelOutputWithPooling,
    TFSemanticSegmenterOutput,
    TFSequenceClassifierOutput,
)
from ...modeling_tf_utils import (
    TFModelInputType,
    TFPreTrainedModel,
    TFSequenceClassificationLoss,
    get_initializer,
    keras,
    keras_serializable,
    unpack_inputs,
)
from ...tf_utils import shape_list, stable_softmax
from ...utils import (
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
from .configuration_data2vec_vision import Data2VecVisionConfig

# Module-level logger
logger = logging.get_logger(__name__)

# General docstring
_CONFIG_FOR_DOC = "Data2VecVisionConfig"

# Base docstring: checkpoint name and expected output shape used in documentation examples
_CHECKPOINT_FOR_DOC = "facebook/data2vec-vision-base"
_EXPECTED_OUTPUT_SHAPE = [1, 197, 768]

# Image classification docstring: checkpoint name and expected label used in documentation examples
_IMAGE_CLASS_CHECKPOINT = "facebook/data2vec-vision-base-ft1k"
_IMAGE_CLASS_EXPECTED_OUTPUT = "remote control, remote"

# Archive list of pretrained Data2VecVision checkpoints
TF_DATA2VEC_VISION_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "facebook/data2vec-vision-base-ft1k",
    # See all Data2VecVision models at https://huggingface.co/models?filter=data2vec-vision
]

@dataclass
class TFData2VecVisionModelOutputWithPooling(TFBaseModelOutputWithPooling):
    """
    Class for outputs of [`TFData2VecVisionModel`].

    Args:
        last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        pooler_output (`tf.Tensor` of shape `(batch_size, hidden_size)`):
            Average of the last layer hidden states of the patch tokens (excluding the *[CLS]* token) if
            *config.use_mean_pooling* is set to True. If set to False, then the final hidden state of the *[CLS]* token
            will be returned.
        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` holding the hidden states of every layer of the model, including the initial
            embedding output.
        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    # Every field defaults to None.
    last_hidden_state: tf.Tensor = None
    pooler_output: tf.Tensor = None
    hidden_states: Tuple[tf.Tensor] | None = None
    attentions: Tuple[tf.Tensor] | None = None
class TFData2VecVisionDropPath(keras.layers.Layer):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    In training mode each sample along the first axis is either zeroed or rescaled by
    `1 / (1 - drop_path)`; outside training the input is returned unchanged.

    References:
        (1) github.com:rwightman/pytorch-image-models
    """

    def __init__(self, drop_path, **kwargs):
        super().__init__(**kwargs)
        # Value subtracted from 1 to obtain the keep probability in `call`.
        self.drop_path = drop_path

    def call(self, x, training=None):
        # Outside training the layer is the identity.
        if not training:
            return x

        keep_prob = 1 - self.drop_path
        x_shape = tf.shape(x)
        # One random draw per sample, broadcast over all remaining axes.
        mask_shape = (x_shape[0],) + (1,) * (len(x_shape) - 1)
        # floor(keep_prob + U[0, 1)) yields a 0/1 mask.
        keep_mask = tf.floor(keep_prob + tf.random.uniform(mask_shape, 0, 1))
        return (x / keep_prob) * keep_mask


class TFData2VecVisionEmbeddings(keras.layers.Layer):
    """
    Construct the CLS token, position and patch embeddings. Optionally, also the mask token.
    """

    def __init__(self, config: Data2VecVisionConfig, **kwargs):
        super().__init__(**kwargs)
        self.config = config

        self.patch_embeddings = TFData2VecVisionPatchEmbeddings(config, name="patch_embeddings")
        self.num_patches = self.patch_embeddings.num_patches

        self.dropout = keras.layers.Dropout(config.hidden_dropout_prob)

    def build(self, input_shape=None):
        """Create the layer's weights and build the patch embedding sub-layer.

        The `built` guard runs before any weight is created so that calling `build` more than once
        does not add duplicate `cls_token` / `mask_token` / `position_embeddings` variables.
        """
        if self.built:
            return
        self.built = True

        self.cls_token = self.add_weight(
            shape=(1, 1, self.config.hidden_size),
            initializer=tf.random_normal_initializer(stddev=self.config.initializer_range),
            trainable=True,
            name="cls_token",
        )

        # The mask token only exists when enabled in the config.
        if self.config.use_mask_token:
            self.mask_token = self.add_weight(
                shape=(1, 1, self.config.hidden_size),
                initializer=tf.random_normal_initializer(stddev=self.config.initializer_range),
                trainable=True,
                name="mask_token",
            )
        else:
            self.mask_token = None

        # Absolute position embeddings cover every patch plus the CLS token (num_patches + 1 positions).
        if self.config.use_absolute_position_embeddings:
            self.position_embeddings = self.add_weight(
                shape=(1, self.num_patches + 1, self.config.hidden_size),
                initializer=tf.random_normal_initializer(stddev=self.config.initializer_range),
                trainable=True,
                name="position_embeddings",
            )
        else:
            self.position_embeddings = None

        # Build the sub-layer inside its own name scope.
        if getattr(self, "patch_embeddings", None) is not None:
            with tf.name_scope(self.patch_embeddings.name):
                self.patch_embeddings.build(None)

    def call(self, pixel_values: tf.Tensor, bool_masked_pos: tf.Tensor | None = None) -> tf.Tensor:
        """Embed `pixel_values` into a token sequence with the CLS token prepended.

        Args:
            pixel_values: input passed to the patch embedding layer.
            bool_masked_pos: optional mask; positions where it is set have their patch embedding
                replaced by the mask token.

        Returns:
            The embeddings after the optional position embeddings and dropout have been applied.
        """
        embeddings = self.patch_embeddings(pixel_values)
        batch_size, seq_len, projection_dim = shape_list(embeddings)

        # Repeat the (1, 1, hidden_size) CLS token once per sample in the batch.
        cls_tokens = tf.tile(self.cls_token, (batch_size, 1, 1))

        if bool_masked_pos is not None:
            mask_tokens = tf.broadcast_to(self.mask_token, (batch_size, seq_len, projection_dim))
            # replace the masked visual tokens by mask_tokens
            w = bool_masked_pos[..., None]
            w = tf.cast(w, mask_tokens.dtype)
            # since TF doesn't support eager tensor assignment, blend with multiply/add instead
            embeddings = embeddings * (1 - w) + mask_tokens * w

        # Prepend the CLS token along the sequence axis.
        embeddings = tf.concat([cls_tokens, embeddings], axis=1)
        if self.position_embeddings is not None:
            embeddings = embeddings + self.position_embeddings
        embeddings = self.dropout(embeddings)

        return embeddings
class TFData2VecVisionPatchEmbeddings(keras.layers.Layer):
    """
    Image to Patch Embedding.

    Projects an image batch with a strided `Conv2D` whose kernel size and stride both equal the patch size, then
    flattens the resulting spatial grid into a sequence of patch embeddings.
    """

    def __init__(self, config: Data2VecVisionConfig, **kwargs):
        """
        Args:
            config: Model configuration; `image_size`, `patch_size`, `num_channels` and `hidden_size` are read.
            **kwargs: Forwarded to `keras.layers.Layer`.
        """
        super().__init__(**kwargs)
        self.config = config

        image_size, patch_size = config.image_size, config.patch_size
        num_channels, hidden_size = config.num_channels, config.hidden_size

        # Both sizes may be given as a single int or as an iterable pair; normalise ints to a 2-tuple.
        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
        # Number of whole patches that fit into the image, and the patch grid shape.
        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
        patch_shape = (image_size[0] // patch_size[0], image_size[1] // patch_size[1])

        self.image_size = image_size
        self.patch_size = patch_size
        self.num_patches = num_patches
        self.patch_shape = patch_shape
        self.num_channels = num_channels

        # Non-overlapping convolution: one output position per patch.
        self.projection = keras.layers.Conv2D(
            filters=hidden_size,
            kernel_size=patch_size,
            strides=patch_size,
            padding="valid",
            data_format="channels_last",
            kernel_initializer="glorot_uniform",  # upstream note: following torch.nn.Linear
            bias_initializer="zeros",
            name="projection",
        )

    def call(self, pixel_values: tf.Tensor, training: bool = False) -> tf.Tensor:
        """
        Project `pixel_values` into a sequence of patch embeddings.

        Args:
            pixel_values: Channels-first image batch; its shape is unpacked as
                `(batch_size, num_channels, height, width)`.
            training: Accepted for interface compatibility; not used by this method.

        Returns:
            Tensor reshaped to `(batch_size, num_patches, -1)`.

        Raises:
            ValueError: In eager mode, when the channel count or the spatial size differs from the configuration.
        """
        batch_size, num_channels, height, width = shape_list(pixel_values)

        # Static validation is only possible when concrete Python values are available (eager execution).
        if tf.executing_eagerly():
            if num_channels != self.num_channels:
                raise ValueError(
                    "Make sure that the channel dimension of the pixel values match with the one set in the"
                    " configuration."
                )
            if height != self.image_size[0] or width != self.image_size[1]:
                raise ValueError(
                    f"Input image size ({height}*{width}) doesn't match model"
                    f" ({self.image_size[0]}*{self.image_size[1]})."
                )

        # When running on CPU, `keras.layers.Conv2D` doesn't support the `NCHW` format, so convert `NCHW` -> `NHWC`.
        # shape = (batch_size, in_height, in_width, in_channels=num_channels)
        pixel_values = tf.transpose(pixel_values, perm=(0, 2, 3, 1))

        projection = self.projection(pixel_values)

        # Collapse the two spatial dimensions into a single sequence dimension.
        num_patches = (width // self.patch_size[1]) * (height // self.patch_size[0])

        return tf.reshape(tensor=projection, shape=(batch_size, num_patches, -1))

    def build(self, input_shape=None):
        """Build the projection layer under its own name scope; does nothing if already built."""
        if self.built:
            return
        self.built = True
        if getattr(self, "projection", None) is not None:
            with tf.name_scope(self.projection.name):
                # Channels-last input with unknown batch and spatial dimensions.
                self.projection.build([None, None, None, self.num_channels])


class TFData2VecVisionSelfAttention(keras.layers.Layer):
    """
    Multi-head self-attention with an optional per-layer relative position bias.

    Query and value projections use a bias term; the key projection is created with `use_bias=False`.
    """

    def __init__(self, config: Data2VecVisionConfig, window_size: Optional[tuple] = None, **kwargs):
        """
        Args:
            config: Model configuration.
            window_size: When truthy, a `TFData2VecVisionRelativePositionBias` owned by this layer is created for
                that window; otherwise no per-layer bias is used.
            **kwargs: Forwarded to `keras.layers.Layer`.

        Raises:
            ValueError: If `config.hidden_size` is not divisible by `config.num_attention_heads`.
        """
        super().__init__(**kwargs)

        if config.hidden_size % config.num_attention_heads != 0:
            raise ValueError(
                f"The hidden size ({config.hidden_size}) is not a multiple of the number "
                f"of attention heads ({config.num_attention_heads})"
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size
        self.sqrt_att_head_size = math.sqrt(self.attention_head_size)

        self.query = keras.layers.Dense(
            units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query"
        )
        # The key projection deliberately has no bias term.
        self.key = keras.layers.Dense(
            units=self.all_head_size,
            kernel_initializer=get_initializer(config.initializer_range),
            name="key",
            use_bias=False,
        )
        self.value = keras.layers.Dense(
            units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value"
        )
        self.dropout = keras.layers.Dropout(rate=config.attention_probs_dropout_prob)

        if window_size:
            self.relative_position_bias = TFData2VecVisionRelativePositionBias(
                config, window_size=window_size, name="relative_position_bias"
            )
        else:
            self.relative_position_bias = None
        self.config = config

    def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor:
        """Split the last axis into heads and move the head axis in front of the sequence axis."""
        # Reshape from [batch_size, seq_length, all_head_size]
        # to [batch_size, seq_length, num_attention_heads, attention_head_size]
        tensor = tf.reshape(tensor=tensor, shape=(batch_size, -1, self.num_attention_heads, self.attention_head_size))

        # Transpose to [batch_size, num_attention_heads, seq_length, attention_head_size]
        return tf.transpose(tensor, perm=[0, 2, 1, 3])

    def call(
        self,
        hidden_states: tf.Tensor,
        head_mask: tf.Tensor,
        output_attentions: bool,
        relative_position_bias: Optional["TFData2VecVisionRelativePositionBias"] = None,
        training: bool = False,
    ) -> Tuple[tf.Tensor]:
        """
        Compute self-attention over `hidden_states`.

        Args:
            hidden_states: Input sequence.
            head_mask: Optional multiplicative mask applied to the attention probabilities; may be `None`.
            output_attentions: Whether to also return the attention probabilities.
            relative_position_bias: Optional shared bias that is added directly to the attention scores.
                NOTE(review): annotated as the bias layer type, but it is used as a tensor here -- confirm.
            training: Forwarded to the dropout layer.

        Returns:
            `(attention_output,)`, or `(attention_output, attention_probs)` when `output_attentions` is true.
        """
        batch_size = shape_list(hidden_states)[0]
        mixed_query_layer = self.query(inputs=hidden_states)
        mixed_key_layer = self.key(inputs=hidden_states)
        mixed_value_layer = self.value(inputs=hidden_states)
        query_layer = self.transpose_for_scores(mixed_query_layer, batch_size)
        key_layer = self.transpose_for_scores(mixed_key_layer, batch_size)
        value_layer = self.transpose_for_scores(mixed_value_layer, batch_size)

        # Dot product between "query" and "key" gives the raw attention scores:
        # (batch size, num_heads, seq_len_q, seq_len_k)
        attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
        # Scale by sqrt(attention_head_size) -- the per-head dimension, not the number of heads.
        attention_scores = attention_scores / self.sqrt_att_head_size

        # Per-layer relative position bias, if this layer owns one.
        if self.relative_position_bias is not None:
            # The bias layer ignores its input, so a dummy 0.0 is passed.
            attention_scores = attention_scores + self.relative_position_bias(0.0)[None, ...]

        # Shared relative position bias supplied by the caller, if any.
        if relative_position_bias is not None:
            attention_scores = attention_scores + relative_position_bias

        # Normalise the attention scores to probabilities.
        attention_probs = stable_softmax(logits=attention_scores, axis=-1)

        # Dropout on the attention probabilities.
        attention_probs = self.dropout(inputs=attention_probs, training=training)

        if head_mask is not None:
            attention_probs = tf.multiply(attention_probs, head_mask)

        attention_output = tf.matmul(attention_probs, value_layer)
        # Move the head axis back behind the sequence axis before merging heads.
        attention_output = tf.transpose(attention_output, perm=[0, 2, 1, 3])

        # (batch_size, seq_len_q, all_head_size)
        attention_output = tf.reshape(tensor=attention_output, shape=(batch_size, -1, self.all_head_size))
        outputs = (attention_output, attention_probs) if output_attentions else (attention_output,)

        return outputs

    def build(self, input_shape=None):
        """Build the projections and the optional bias layer under their name scopes; no-op if already built."""
        if self.built:
            return
        self.built = True
        if getattr(self, "query", None) is not None:
            with tf.name_scope(self.query.name):
                self.query.build([None, None, self.config.hidden_size])
        if getattr(self, "key", None) is not None:
            with tf.name_scope(self.key.name):
                self.key.build([None, None, self.config.hidden_size])
        if getattr(self, "value", None) is not None:
            with tf.name_scope(self.value.name):
                self.value.build([None, None, self.config.hidden_size])
        if getattr(self, "relative_position_bias", None) is not None:
            with tf.name_scope(self.relative_position_bias.name):
                self.relative_position_bias.build(None)
class TFData2VecVisionSelfOutput(keras.layers.Layer):
    """
    Output projection of the self-attention block.

    The residual connection is defined in TFData2VecVisionLayer instead of here (as is the case with other models), due
    to the layernorm applied before each block.
    """

    def __init__(self, config: Data2VecVisionConfig, **kwargs):
        super().__init__(**kwargs)
        self.config = config

        # Linear projection back to `hidden_size`, followed by dropout.
        self.dense = keras.layers.Dense(
            units=config.hidden_size,
            kernel_initializer=get_initializer(config.initializer_range),
            name="dense",
        )
        self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)

    def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, gamma=None, training: bool = False) -> tf.Tensor:
        """
        Apply the dense projection and dropout to `hidden_states`.

        `input_tensor` and `gamma` are accepted but not used by this method.
        """
        projected = self.dense(inputs=hidden_states)
        return self.dropout(inputs=projected, training=training)

    def build(self, input_shape=None):
        """Build the dense layer under its own name scope; does nothing if already built."""
        if self.built:
            return
        self.built = True

        dense = getattr(self, "dense", None)
        if dense is None:
            return
        with tf.name_scope(dense.name):
            dense.build([None, None, self.config.hidden_size])


class TFData2VecVisionAttention(keras.layers.Layer):
    """Self-attention followed by its output projection."""

    def __init__(self, config: Data2VecVisionConfig, window_size: Optional[tuple] = None, **kwargs):
        super().__init__(**kwargs)

        # `window_size` is handed straight to the self-attention sub-layer.
        self.attention = TFData2VecVisionSelfAttention(config, window_size=window_size, name="attention")
        self.dense_output = TFData2VecVisionSelfOutput(config, name="output")

    def prune_heads(self, heads):
        """Head pruning is not implemented for this layer."""
        raise NotImplementedError

    def call(
        self,
        input_tensor: tf.Tensor,
        head_mask: tf.Tensor,
        output_attentions: bool,
        relative_position_bias: Optional["TFData2VecVisionRelativePositionBias"] = None,
        training: bool = False,
    ) -> Tuple[tf.Tensor]:
        """
        Run self-attention on `input_tensor` and project the result.

        Returns:
            A tuple whose first element is the projected attention output; any further elements returned by the
            self-attention sub-layer (the attention probabilities, when requested) are appended unchanged.
        """
        attn_results = self.attention(
            hidden_states=input_tensor,
            head_mask=head_mask,
            output_attentions=output_attentions,
            relative_position_bias=relative_position_bias,
            training=training,
        )
        projected = self.dense_output(hidden_states=attn_results[0], input_tensor=input_tensor, training=training)

        # Keep whatever the attention sub-layer returned after its first element.
        return (projected, *attn_results[1:])

    def build(self, input_shape=None):
        """Build both sub-layers under their own name scopes; does nothing if already built."""
        if self.built:
            return
        self.built = True

        for sublayer in (getattr(self, "attention", None), getattr(self, "dense_output", None)):
            if sublayer is not None:
                with tf.name_scope(sublayer.name):
                    sublayer.build(None)


# Copied from transformers.models.vit.modeling_tf_vit.TFViTIntermediate with ViT->Data2VecVision
class TFData2VecVisionIntermediate(keras.layers.Layer):
    """Dense expansion to `config.intermediate_size` followed by the configured activation."""

    def __init__(self, config: Data2VecVisionConfig, **kwargs):
        super().__init__(**kwargs)

        self.dense = keras.layers.Dense(
            units=config.intermediate_size,
            kernel_initializer=get_initializer(config.initializer_range),
            name="dense",
        )

        # `hidden_act` is either the name of an activation (resolved here) or an activation callable used as-is.
        activation = config.hidden_act
        self.intermediate_act_fn = get_tf_activation(activation) if isinstance(activation, str) else activation

        self.config = config

    def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
        """Apply the dense layer and then the activation function to `hidden_states`."""
        return self.intermediate_act_fn(self.dense(inputs=hidden_states))

    def build(self, input_shape=None):
        """Build the dense layer under its own name scope; does nothing if already built."""
        if self.built:
            return
        self.built = True

        dense = getattr(self, "dense", None)
        if dense is None:
            return
        with tf.name_scope(dense.name):
            # The dense layer consumes `hidden_size` features.
            dense.build([None, None, self.config.hidden_size])
class TFData2VecVisionOutput(keras.layers.Layer):
    """Dense projection from `intermediate_size` back to `hidden_size`, followed by dropout."""

    def __init__(self, config: Data2VecVisionConfig, **kwargs):
        super().__init__(**kwargs)
        self.config = config

        self.dense = keras.layers.Dense(
            units=config.hidden_size,
            kernel_initializer=get_initializer(config.initializer_range),
            name="dense",
        )
        self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)

    def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor:
        """Project `hidden_states` and apply dropout (`training` is forwarded to the dropout layer)."""
        projected = self.dense(inputs=hidden_states)
        return self.dropout(inputs=projected, training=training)

    def build(self, input_shape=None):
        """Build the dense layer under its own name scope; does nothing if already built."""
        if self.built:
            return
        self.built = True

        dense = getattr(self, "dense", None)
        if dense is None:
            return
        with tf.name_scope(dense.name):
            # This layer's input is the intermediate (expanded) representation.
            dense.build([None, None, self.config.intermediate_size])


class TFData2VecVisionLayer(keras.layers.Layer):
    """This corresponds to the Block class in the timm implementation."""

    def __init__(
        self, config: Data2VecVisionConfig, window_size: Optional[tuple] = None, drop_path_rate: float = 0.0, **kwargs
    ):
        """
        Args:
            config: Model configuration.
            window_size: Forwarded to the attention sub-layer.
            drop_path_rate: When greater than 0, a `TFData2VecVisionDropPath` with this rate is used on both residual
                branches; otherwise a linear (pass-through) activation takes its place.
            **kwargs: Forwarded to `keras.layers.Layer`.
        """
        super().__init__(**kwargs)
        self.config = config

        self.attention = TFData2VecVisionAttention(config, window_size=window_size, name="attention")
        self.intermediate = TFData2VecVisionIntermediate(config, name="intermediate")
        self.data2vec_output = TFData2VecVisionOutput(config, name="output")

        # Layer norms applied before the attention block and before the feed-forward block respectively.
        self.layernorm_before = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm_before")
        self.layernorm_after = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm_after")
        self.drop_path = (
            TFData2VecVisionDropPath(drop_path_rate, name="drop_path")
            if drop_path_rate > 0.0
            else keras.layers.Activation("linear", name="drop_path")
        )
        # Initial value of the layer-scale weights; a non-positive value disables layer scale (see `build`).
        self.init_values = config.layer_scale_init_value

    def build(self, input_shape: tf.TensorShape = None):
        """
        Create the optional layer-scale weights and build all sub-layers.

        The `built` guard runs before any weight is created, so calling `build` again neither registers duplicate
        `lambda_1`/`lambda_2` variables nor resets their values.
        """
        if self.built:
            return

        if self.init_values > 0:
            # Layer-scale vectors, one entry per hidden unit, initialised to `init_values`.
            self.lambda_1 = self.add_weight(
                shape=(self.config.hidden_size,),
                initializer="ones",
                trainable=True,
                name="lambda_1",
            )
            self.lambda_2 = self.add_weight(
                shape=(self.config.hidden_size,),
                initializer="ones",
                trainable=True,
                name="lambda_2",
            )
            self.lambda_1.assign(self.init_values * tf.ones((self.config.hidden_size,)))
            self.lambda_2.assign(self.init_values * tf.ones((self.config.hidden_size,)))
        else:
            self.lambda_1, self.lambda_2 = None, None

        self.built = True

        if getattr(self, "attention", None) is not None:
            with tf.name_scope(self.attention.name):
                self.attention.build(None)

        if getattr(self, "intermediate", None) is not None:
            with tf.name_scope(self.intermediate.name):
                self.intermediate.build(None)

        if getattr(self, "data2vec_output", None) is not None:
            with tf.name_scope(self.data2vec_output.name):
                self.data2vec_output.build(None)

        if getattr(self, "layernorm_before", None) is not None:
            with tf.name_scope(self.layernorm_before.name):
                self.layernorm_before.build([None, None, self.config.hidden_size])

        if getattr(self, "layernorm_after", None) is not None:
            with tf.name_scope(self.layernorm_after.name):
                self.layernorm_after.build([None, None, self.config.hidden_size])

        if getattr(self, "drop_path", None) is not None:
            with tf.name_scope(self.drop_path.name):
                self.drop_path.build(None)

    def call(
        self,
        hidden_states: tf.Tensor,
        head_mask: tf.Tensor,
        output_attentions: bool,
        relative_position_bias: Optional["TFData2VecVisionRelativePositionBias"] = None,
        training: bool = False,
    ) -> Tuple[tf.Tensor]:
        """
        Run one pre-norm transformer block.

        Args:
            hidden_states: Input sequence.
            head_mask: Forwarded to the attention sub-layer.
            output_attentions: Whether the attention probabilities are returned as well.
            relative_position_bias: Optional shared bias forwarded to the attention sub-layer.
            training: Forwarded to the attention sub-layer.

        Returns:
            `(layer_output,)` followed by any extra outputs of the attention sub-layer.
        """
        self_attention_outputs = self.attention(
            # In Data2VecVision, layernorm is applied before self-attention.
            input_tensor=self.layernorm_before(inputs=hidden_states),
            head_mask=head_mask,
            output_attentions=output_attentions,
            relative_position_bias=relative_position_bias,
            training=training,
        )
        attention_output = self_attention_outputs[0]
        outputs = self_attention_outputs[1:]  # attention probabilities, if they were requested

        # Optional layer scale on the attention branch.
        if self.lambda_1 is not None:
            attention_output = self.lambda_1 * attention_output

        # First residual connection.
        hidden_states = self.drop_path(attention_output) + hidden_states

        # The second layernorm precedes the feed-forward block.
        layer_output = self.layernorm_after(hidden_states)

        layer_output = self.intermediate(layer_output)
        layer_output = self.data2vec_output(layer_output)

        # Optional layer scale on the feed-forward branch.
        if self.lambda_2 is not None:
            layer_output = self.lambda_2 * layer_output

        # Second residual connection.
        layer_output = self.drop_path(layer_output) + hidden_states

        outputs = (layer_output,) + outputs

        return outputs
# Taken and modified from here:
# https://github.com/leondgarse/keras_cv_attention_models/blob/main/keras_cv_attention_models/beit/beit.py#L28
# 定义一个自定义的Keras层，用于处理数据2Vec视觉任务的相对位置偏置
class TFData2VecVisionRelativePositionBias(keras.layers.Layer):
    """Learned relative position bias table, gathered through a precomputed pairwise index."""

    def __init__(self, config: Data2VecVisionConfig, window_size: tuple, **kwargs) -> None:
        """
        Args:
            config: Model configuration; `num_attention_heads` is read when the table is created.
            window_size: Pair of window dimensions, e.g. `(14, 14)`.
            **kwargs: Forwarded to `keras.layers.Layer`.
        """
        super().__init__(**kwargs)
        self.config = config

        self.window_size = window_size
        # +3 for cls_token_pos_len
        # window_size can be something like (14, 14)
        self.num_relative_distance = (2 * window_size[0] - 1) * (2 * window_size[1] - 1) + 3

        self.relative_position_index = self.get_position_index()

    def build(self, input_shape):
        """Create the trainable bias table; does nothing if the layer is already built."""
        # Guard against repeated builds so the table is not registered (and re-initialised) twice.
        if self.built:
            return

        self.relative_position_bias_table = self.add_weight(
            shape=(self.num_relative_distance, self.config.num_attention_heads),
            initializer="zeros",
            trainable=True,
            name="relative_position_bias_table",
        )  # [2*Wh-1 * 2*Ww-1, nH]
        # cls to token & token 2 cls & cls to cls

        super().build(input_shape)

    def get_position_index(self):
        """Return the pairwise index into the bias table, including the three extra CLS-related entries."""
        # get pair-wise relative position index for each token inside the window
        xx, yy = tf.meshgrid(range(self.window_size[0]), range(self.window_size[1]))
        coords = tf.stack([yy, xx], axis=0)  # [2, Wh, Ww]
        coords_flatten = tf.reshape(coords, [2, -1])  # [2, Wh*Ww]

        relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]  # [2, Wh*Ww, Wh*Ww]
        relative_coords = tf.transpose(relative_coords, perm=[1, 2, 0])  # [Wh*Ww, Wh*Ww, 2]

        # Shift both offsets to be non-negative and combine them into a single flat index.
        xx = (relative_coords[:, :, 0] + self.window_size[0] - 1) * (2 * self.window_size[1] - 1)
        yy = relative_coords[:, :, 1] + self.window_size[1] - 1
        relative_coords = tf.stack([xx, yy], axis=-1)

        relative_position_index = tf.reduce_sum(relative_coords, axis=-1)  # [Wh*Ww, Wh*Ww]

        # The last three table rows are reserved for the extra top row, left column and corner added below
        # (cls to token, token to cls and cls to cls per the comment in `build`).
        top = tf.ones((1, relative_position_index.shape[1]), dtype=relative_position_index.dtype) * (
            self.num_relative_distance - 3
        )
        left = tf.ones((relative_position_index.shape[0], 1), dtype=relative_position_index.dtype) * (
            self.num_relative_distance - 2
        )
        corner = tf.ones((1, 1), dtype=relative_position_index.dtype) * (self.num_relative_distance - 1)

        left_corner = tf.concat([corner, left], axis=0)
        relative_position_index = tf.concat([top, relative_position_index], axis=0)
        relative_position_index = tf.concat([left_corner, relative_position_index], axis=1)  # [Wh*Ww + 1, Wh*Ww + 1]
        return relative_position_index

    def call(self, inputs=None) -> tf.Tensor:
        """Gather the bias for every index pair and move the head axis first; `inputs` is ignored."""
        relative_position_bias = tf.gather(self.relative_position_bias_table, self.relative_position_index, axis=0)
        return tf.transpose(relative_position_bias, [2, 0, 1])


class TFData2VecVisionEncoder(keras.layers.Layer):
    """Stack of `TFData2VecVisionLayer` blocks with an optional relative position bias shared by all layers."""

    def __init__(self, config: Data2VecVisionConfig, window_size: Optional[tuple] = None, **kwargs):
        """
        Args:
            config: Model configuration.
            window_size: Window used for the shared bias and, when `config.use_relative_position_bias` is set, for
                the per-layer biases.
            **kwargs: Forwarded to `keras.layers.Layer`.
        """
        super().__init__(**kwargs)
        self.config = config

        # One bias layer shared across all blocks, only when the configuration asks for it.
        if config.use_shared_relative_position_bias:
            self.relative_position_bias = TFData2VecVisionRelativePositionBias(
                config, window_size=window_size, name="relative_position_bias"
            )
        else:
            self.relative_position_bias = None

        # Drop path rates are evenly spaced from 0 to `config.drop_path_rate` across the layers.
        dpr = list(tf.linspace(0.0, config.drop_path_rate, config.num_hidden_layers))
        self.layer = [
            TFData2VecVisionLayer(
                config,
                window_size=window_size if config.use_relative_position_bias else None,
                drop_path_rate=dpr[i],
                name=f"layer_._{i}",
            )
            for i in range(config.num_hidden_layers)
        ]

    def call(
        self,
        hidden_states: tf.Tensor,
        head_mask: tf.Tensor | None = None,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ) -> Union[tuple, TFBaseModelOutput]:
        """
        Run `hidden_states` through every layer in order.

        Args:
            hidden_states: Input sequence.
            head_mask: Optional indexable of per-layer head masks; entry `i` is passed to layer `i`.
            output_attentions: Whether to collect the attention probabilities of every layer.
            output_hidden_states: Whether to collect the hidden states before each layer and after the last one.
            return_dict: Return a `TFBaseModelOutput` when true, otherwise a tuple of the non-`None` results.
        """
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        # The shared bias does not depend on the layer index, so compute it once instead of once per layer.
        # The bias layer ignores its input, hence the dummy 0.0.
        relative_position_bias = (
            self.relative_position_bias(0.0) if self.relative_position_bias is not None else None
        )

        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_head_mask = head_mask[i] if head_mask is not None else None

            layer_outputs = layer_module(hidden_states, layer_head_mask, output_attentions, relative_position_bias)

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        # Also record the output of the final layer.
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)

        return TFBaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )

    def build(self, input_shape=None):
        """Build the shared bias (if any) and every layer under their name scopes; no-op if already built."""
        if self.built:
            return
        self.built = True

        if getattr(self, "relative_position_bias", None) is not None:
            with tf.name_scope(self.relative_position_bias.name):
                self.relative_position_bias.build(None)

        if getattr(self, "layer", None) is not None:
            for layer in self.layer:
                with tf.name_scope(layer.name):
                    layer.build(None)
# 声明一个自定义层 TFData2VecVisionMainLayer，使用 keras_serializable 装饰器标记为可序列化
@keras_serializable
class TFData2VecVisionMainLayer(keras.layers.Layer):
    """Core Data2VecVision stack: embeddings, encoder, final layernorm and an optional pooler."""

    config_class = Data2VecVisionConfig

    def __init__(self, config: Data2VecVisionConfig, add_pooling_layer: bool = True, **kwargs):
        """
        Args:
            config: Model configuration.
            add_pooling_layer: Whether to create a `TFData2VecVisionPooler`; when false `self.pooler` is `None`.
            **kwargs: Forwarded to `keras.layers.Layer`.
        """
        super().__init__(**kwargs)

        self.config = config
        self.add_pooling_layer = add_pooling_layer

        self.embeddings = TFData2VecVisionEmbeddings(config, name="embeddings")
        # The encoder's window size is the patch grid shape computed by the patch embeddings.
        self.encoder = TFData2VecVisionEncoder(
            config, window_size=self.embeddings.patch_embeddings.patch_shape, name="encoder"
        )
        # With mean pooling no layernorm is applied here (`tf.identity`); otherwise a real LayerNormalization is used.
        self.layernorm = (
            tf.identity
            if config.use_mean_pooling
            else keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm")
        )

        self.pooler = TFData2VecVisionPooler(config, name="pooler") if add_pooling_layer else None

    def get_input_embeddings(self) -> keras.layers.Layer:
        """Return the patch embedding layer."""
        return self.embeddings.patch_embeddings

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        raise NotImplementedError

    # NOTE(review): the comment in the original listing says an `unpack_inputs` decorator wraps this method, but no
    # decorator is present on it here -- confirm against the upstream source before relying on input unpacking.
    def call(
        self,
        pixel_values: tf.Tensor | None = None,
        bool_masked_pos: tf.Tensor | None = None,
        head_mask: tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: bool = False,
        **kwargs
    ) -> Union[tuple, TFData2VecVisionModelOutputWithPooling]:
        """
        Embed, encode and optionally pool `pixel_values`.

        Args:
            pixel_values: Input images; required.
            bool_masked_pos: Optional mask over patch positions, forwarded to the embeddings.
            head_mask: Must be `None`; head masking is not implemented.
            output_attentions: Falls back to `config.output_attentions` when `None`.
            output_hidden_states: Falls back to `config.output_hidden_states` when `None`.
            return_dict: Falls back to `config.use_return_dict` when `None`.
            training: Forwarded to the embeddings and the encoder.
            **kwargs: Accepted for compatibility; not used by this method.

        Returns:
            A `TFData2VecVisionModelOutputWithPooling`, or a tuple when `return_dict` is false.

        Raises:
            ValueError: If `pixel_values` is `None`.
            NotImplementedError: If `head_mask` is provided.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        # Prepare head mask if needed
        # 1.0 in head_mask indicate we keep the head
        # attention_probs has shape bsz x n_heads x N x N
        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
        if head_mask is not None:
            raise NotImplementedError
        else:
            # One `None` entry per layer so the encoder can index it uniformly.
            head_mask = [None] * self.config.num_hidden_layers

        embedding_output = self.embeddings(pixel_values, bool_masked_pos, training=training)

        encoder_outputs = self.encoder(
            embedding_output,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        sequence_output = encoder_outputs[0]
        sequence_output = self.layernorm(sequence_output)
        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None

        if not return_dict:
            head_outputs = (sequence_output, pooled_output) if pooled_output is not None else (sequence_output,)
            return head_outputs + encoder_outputs[1:]

        return TFData2VecVisionModelOutputWithPooling(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )

    def build(self, input_shape=None):
        """Build all sub-layers under their own name scopes; does nothing if already built."""
        if self.built:
            return
        self.built = True
        if getattr(self, "embeddings", None) is not None:
            with tf.name_scope(self.embeddings.name):
                self.embeddings.build(None)
        if getattr(self, "encoder", None) is not None:
            with tf.name_scope(self.encoder.name):
                self.encoder.build(None)
        if getattr(self, "layernorm", None) is not None:
            # `self.layernorm` may be the plain `tf.identity` function; only objects exposing `name` are built.
            if hasattr(self.layernorm, "name"):
                with tf.name_scope(self.layernorm.name):
                    self.layernorm.build((None, self.config.hidden_size))
        if getattr(self, "pooler", None) is not None:
            with tf.name_scope(self.pooler.name):
                self.pooler.build(None)
class TFData2VecVisionPooler(keras.layers.Layer):
    """
    Reduces a sequence of hidden states to one vector per example.

    When `config.use_mean_pooling` is set, every token after the first is averaged and the result is layer-normalized;
    otherwise the hidden state of the first token is returned unchanged.
    """

    def __init__(self, config: Data2VecVisionConfig, **kwargs):
        super().__init__(**kwargs)
        # The normalization layer only exists in mean-pooling mode.
        if config.use_mean_pooling:
            self.layernorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm")
        else:
            self.layernorm = None
        self.config = config

    def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
        """Return the pooled representation of `hidden_states`."""
        if self.layernorm is None:
            # No mean pooling: take the hidden state at position 0 (the [CLS] token).
            return hidden_states[:, 0]

        # Mean pooling: average everything after position 0 (the patch tokens), then normalize.
        mean_of_patches = tf.reduce_mean(hidden_states[:, 1:, :], axis=1)
        return self.layernorm(mean_of_patches)

    def build(self, input_shape=None):
        """Build the optional layer-normalization sub-layer; `input_shape` is not used."""
        if self.built:
            return
        self.built = True

        norm = getattr(self, "layernorm", None)
        if norm is None or not hasattr(norm, "name"):
            return
        with tf.name_scope(norm.name):
            norm.build((None, self.config.hidden_size))


class TFData2VecVisionPreTrainedModel(TFPreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    # Configuration class associated with this model family.
    config_class = Data2VecVisionConfig
    # Prefix for the base model; the model classes in this file store their main layer under this same name.
    base_model_prefix = "data2vec_vision"
    # Name of the primary input argument of the models.
    main_input_name = "pixel_values"
    # Regex patterns for checkpoint keys; presumably matching keys are not reported as unexpected when loading
    # weights - confirm against TFPreTrainedModel.
    _keys_to_ignore_on_load_unexpected = [r"relative_position_index"]


# Class-level documentation shared by the Data2VecVision TF model classes through `add_start_docstrings`.
DATA2VEC_VISION_START_DOCSTRING = r"""
    This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.).

    This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
    as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
    behavior.

    <Tip>

    TensorFlow models and layers in `transformers` accept two formats as input:

    - having all inputs as keyword arguments (like PyTorch models), or
    - having all inputs as a list, tuple or dict in the first positional argument.

    The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
    and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
    pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
    format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
    the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
    positional argument:

    - a single Tensor with `pixel_values` only and nothing else: `model(pixel_values)`
    - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
    `model([pixel_values, attention_mask])` or `model([pixel_values, attention_mask, token_type_ids])`
    - a dictionary with one or several input Tensors associated to the input names given in the docstring:
    `model({"pixel_values": pixel_values, "token_type_ids": token_type_ids})`

    Note that when creating models and layers with
    [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
    about any of this, as you can just pass inputs like you would to any other Python function!

    </Tip>

    Args:
        config ([`Data2VecVisionConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights.
"""

# Argument documentation shared by the `call` methods through `add_start_docstrings_to_model_forward`.
DATA2VEC_VISION_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]` `Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
            [`BeitImageProcessor.__call__`] for details.

        head_mask (`np.ndarray` or `tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.

        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.

        return_dict (`bool`, *optional*):
            Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. This argument can be used
            in eager mode, in graph mode the value will always be set to True.

        training (`bool`, *optional*, defaults to `False`):
            Whether or not to use the model in training mode (some modules like dropout modules have different
            behaviors between training and evaluation).
"""


@add_start_docstrings(
    "The bare Data2VecVision Model transformer outputting raw hidden-states without any specific head on top.",
    DATA2VEC_VISION_START_DOCSTRING,
)
class TFData2VecVisionModel(TFData2VecVisionPreTrainedModel):
    # Thin wrapper: all of the work is delegated to a single TFData2VecVisionMainLayer.

    def __init__(self, config: Data2VecVisionConfig, add_pooling_layer: bool = False, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.config = config

        # The main layer holds the actual model; pooling is optional.
        self.data2vec_vision = TFData2VecVisionMainLayer(
            config, add_pooling_layer=add_pooling_layer, name="data2vec_vision"
        )

    def get_input_embeddings(self):
        # Forward the request to the main layer.
        main_layer = self.data2vec_vision
        return main_layer.get_input_embeddings()

    @unpack_inputs
    @add_start_docstrings_to_model_forward(DATA2VEC_VISION_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TFData2VecVisionModelOutputWithPooling,
        config_class=_CONFIG_FOR_DOC,
        modality="vision",
        expected_output=_EXPECTED_OUTPUT_SHAPE,
    )
    def call(
        self,
        pixel_values: TFModelInputType | None = None,
        bool_masked_pos: tf.Tensor | None = None,
        head_mask: np.ndarray | tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: bool = False,
    ) -> Union[tuple, TFData2VecVisionModelOutputWithPooling]:
        r"""
        bool_masked_pos (`tf.Tensor` of shape `(batch_size, num_patches)`, *optional*):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        """
        # Every argument is passed through untouched; the main layer's result is returned as-is.
        return self.data2vec_vision(
            pixel_values=pixel_values,
            bool_masked_pos=bool_masked_pos,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

    def build(self, input_shape=None):
        # Build only once.
        if self.built:
            return
        self.built = True

        main_layer = getattr(self, "data2vec_vision", None)
        if main_layer is None:
            return
        # Build the main layer under its own name scope.
        with tf.name_scope(main_layer.name):
            main_layer.build(None)
# Image-classification model: the Data2VecVision main layer with pooling, followed by a dense classifier.
@add_start_docstrings(
    """
    Data2VecVision Model transformer with an image classification head on top (a linear layer on top of the average of
    the final hidden states of the patch tokens) e.g. for ImageNet.
    """,
    DATA2VEC_VISION_START_DOCSTRING,
)
class TFData2VecVisionForImageClassification(TFData2VecVisionPreTrainedModel, TFSequenceClassificationLoss):
    def __init__(self, config: Data2VecVisionConfig, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)

        self.num_labels = config.num_labels
        # The pooling layer is required here: the classifier consumes the pooled output.
        self.data2vec_vision = TFData2VecVisionMainLayer(config, add_pooling_layer=True, name="data2vec_vision")

        # Classification head: one dense layer with `config.num_labels` outputs.
        self.classifier = keras.layers.Dense(
            units=config.num_labels,
            kernel_initializer=get_initializer(config.initializer_range),
            name="classifier",
        )
        self.config = config

    @unpack_inputs
    @add_start_docstrings_to_model_forward(DATA2VEC_VISION_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_IMAGE_CLASS_CHECKPOINT,
        output_type=TFSequenceClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
        expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
    )
    def call(
        self,
        pixel_values: TFModelInputType | None = None,
        head_mask: np.ndarray | tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: np.ndarray | tf.Tensor | None = None,
        training: Optional[bool] = False,
    ) -> Union[TFSequenceClassifierOutput, tuple]:
        r"""
        labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        # Fall back to the configuration default when the caller did not choose an output format.
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.data2vec_vision(
            pixel_values=pixel_values,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        # In tuple form the pooled output sits at index 1, right after the sequence output.
        pooled_output = outputs.pooler_output if return_dict else outputs[1]
        logits = self.classifier(pooled_output)

        # The loss is only computed when labels are supplied.
        loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=logits)

        if not return_dict:
            # Drop the sequence and pooled outputs; keep whatever follows (hidden states / attentions).
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TFSequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def build(self, input_shape=None):
        # Build only once.
        if self.built:
            return
        self.built = True

        # Each sub-layer is built under its own name scope.
        if getattr(self, "data2vec_vision", None) is not None:
            with tf.name_scope(self.data2vec_vision.name):
                self.data2vec_vision.build(None)

        if getattr(self, "classifier", None) is not None:
            with tf.name_scope(self.classifier.name):
                self.classifier.build([None, None, self.config.hidden_size])
# Custom Keras layer that chains a convolution, a batch normalization and a ReLU activation.
class TFData2VecVisionConvModule(keras.layers.Layer):
    """
    A convolutional block that bundles conv/norm/activation layers. This block simplifies the usage of convolution
    layers, which are commonly used with a norm layer (e.g., BatchNorm) and activation layer (e.g., ReLU).

    Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: Union[int, Tuple[int, int]],
        padding: str = "valid",
        bias: bool = False,
        dilation: Union[int, Tuple[int, int]] = 1,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        # Channel counts are kept so that `build` can declare the input shapes.
        self.in_channels = in_channels
        self.out_channels = out_channels

        conv_options = {
            "filters": out_channels,
            "kernel_size": kernel_size,
            "padding": padding,
            "use_bias": bias,
            "dilation_rate": dilation,
            "name": "conv",
        }
        self.conv = keras.layers.Conv2D(**conv_options)
        self.bn = keras.layers.BatchNormalization(name="bn", momentum=0.9, epsilon=1e-5)
        self.activation = tf.nn.relu

    def call(self, input: tf.Tensor) -> tf.Tensor:
        """Apply convolution, batch normalization and ReLU, in that order."""
        return self.activation(self.bn(self.conv(input)))

    def build(self, input_shape=None):
        """Build the convolution and normalization sub-layers; `input_shape` is not used."""
        if self.built:
            return
        self.built = True

        conv_layer = getattr(self, "conv", None)
        if conv_layer is not None:
            # The convolution is declared with `in_channels` on the last axis.
            with tf.name_scope(conv_layer.name):
                conv_layer.build([None, None, None, self.in_channels])

        norm_layer = getattr(self, "bn", None)
        if norm_layer is not None:
            # The normalization is declared with `out_channels` on the last axis.
            with tf.name_scope(norm_layer.name):
                norm_layer.build((None, None, None, self.out_channels))


class TFAdaptiveAvgPool2D(keras.layers.Layer):
    """
    Adaptive 2D average pooling: reduces (or expands) the two spatial axes of a 4D tensor to `output_dims`.

    Args:
        output_dims (Tuple[int, int]): Target `(height, width)` of the pooled tensor.
        input_ordering (str): Axis layout of the input, either `"NHWC"` or `"NCHW"`.

    Raises:
        ValueError: If `input_ordering` is neither `"NCHW"` nor `"NHWC"`.
    """

    def __init__(self, output_dims: Tuple[int, int], input_ordering: str = "NHWC", **kwargs):
        super().__init__(**kwargs)
        self.output_dims = output_dims
        self.input_ordering = input_ordering
        if input_ordering not in ("NCHW", "NHWC"):
            raise ValueError("Unrecognized input_ordering, should be 'NCHW' or 'NHWC'!")
        # Positions of the height and width axes inside the layout string.
        self.h_axis = input_ordering.index("H")
        self.w_axis = input_ordering.index("W")

    def pseudo_1d_pool(self, inputs: tf.Tensor, h_pooling: bool):
        """
        Adaptively average-pool `inputs` along a single spatial axis.

        Args:
            inputs: 4D tensor laid out according to `self.input_ordering`; the size of the pooled axis is read from
                the static shape.
            h_pooling: Pool along the height axis when True, along the width axis otherwise.

        Returns:
            The tensor with the selected axis resized to the matching entry of `self.output_dims`.
        """
        # NOTE(review): this helper was absent from the class although `call` depends on it; it follows the
        # upstream implementation as recalled - confirm against the reference implementation.
        if h_pooling:
            axis = self.h_axis
            output_dim = self.output_dims[0]
        else:
            axis = self.w_axis
            output_dim = self.output_dims[1]
        input_dim = inputs.shape[axis]

        # Output position i averages the window [floor(i * in / out), ceil((i + 1) * in / out)). Such windows only
        # ever take two consecutive sizes, so pooling once per size and gathering the right result for each
        # position reproduces the adaptive op.
        small_window = (input_dim + output_dim - 1) // output_dim  # ceil(input_dim / output_dim)
        big_window = small_window + 1
        if h_pooling:
            small_window_shape = (small_window, 1)
            big_window_shape = (big_window, 1)
        else:
            small_window_shape = (1, small_window)
            big_window_shape = (1, big_window)

        # Shortcuts: identity, full reduction, integer-factor downscale, integer-factor upscale.
        if output_dim == input_dim:
            return inputs
        if output_dim == 1:
            return tf.reduce_mean(inputs, axis=axis, keepdims=True)
        if input_dim % output_dim == 0:
            return tf.nn.avg_pool2d(
                inputs,
                ksize=small_window_shape,
                strides=small_window_shape,
                padding="VALID",
                data_format=self.input_ordering,
            )
        if output_dim > input_dim and output_dim % input_dim == 0:
            return tf.repeat(inputs, repeats=output_dim // input_dim, axis=axis)

        # General case: pool with both window sizes at stride 1 and concatenate along the pooled axis.
        if output_dim < input_dim:
            small_pool = tf.nn.avg_pool2d(
                inputs, ksize=small_window_shape, strides=1, padding="VALID", data_format=self.input_ordering
            )
        else:
            # When upscaling, the small window has size 1, so the input itself is the "small pool".
            small_pool = inputs
        big_pool = tf.nn.avg_pool2d(
            inputs, ksize=big_window_shape, strides=1, padding="VALID", data_format=self.input_ordering
        )
        both_pool = tf.concat([small_pool, big_pool], axis=axis)

        # Start and end of the window behind every output position.
        window_starts = tf.math.floor((tf.range(output_dim, dtype=tf.float32) * input_dim) / output_dim)
        window_starts = tf.cast(window_starts, tf.int64)
        window_ends = tf.math.ceil((tf.range(1, output_dim + 1, dtype=tf.float32) * input_dim) / output_dim)
        window_ends = tf.cast(window_ends, tf.int64)

        # True where the window has the big size, False where it has the small size.
        pool_selector = tf.cast(window_ends - window_starts - small_window, tf.bool)

        # Big-pool entries sit after the small-pool entries in the concatenated tensor.
        small_indices = window_starts
        big_indices = window_starts + small_pool.shape[axis]
        gather_indices = tf.where(pool_selector, big_indices, small_indices)

        return tf.gather(both_pool, gather_indices, axis=axis)

    def call(self, inputs: tf.Tensor):
        """Pool `inputs` to `self.output_dims` along its two spatial axes."""
        # Static spatial shape (height, width) of the input.
        if self.input_ordering == "NHWC":
            input_shape = inputs.shape[1:3]
        else:
            input_shape = inputs.shape[2:]

        # Case 1: pooling down to 1x1 is a plain mean over both spatial axes.
        if self.output_dims[0] == self.output_dims[1] == 1:
            if self.input_ordering == "NHWC":
                reduce_dims = [1, 2]
            else:
                reduce_dims = [2, 3]
            return tf.reduce_mean(inputs, axis=reduce_dims, keepdims=True)

        # Case 2: both axes shrink by an integer factor, so a regular strided average pool is enough.
        elif input_shape[0] % self.output_dims[0] == 0 and input_shape[1] % self.output_dims[1] == 0:
            h_resize = int(input_shape[0] // self.output_dims[0])
            w_resize = int(input_shape[1] // self.output_dims[1])
            return tf.nn.avg_pool2d(
                inputs,
                ksize=(h_resize, w_resize),
                strides=(h_resize, w_resize),
                padding="VALID",
                data_format=self.input_ordering,
            )

        # Case 3: no shortcut applies; pool one axis at a time.
        else:
            h_pooled = self.pseudo_1d_pool(inputs, h_pooling=True)
            return self.pseudo_1d_pool(h_pooled, h_pooling=False)
class TFData2VecVisionPyramidPoolingModule(keras.layers.Layer):
    """
    Pyramid Pooling Module (PPM) used in PSPNet.

    Args:
        pool_scales (tuple[int]): Pooling scales used in Pooling Pyramid
            Module.
        in_channels (int): Number of channels declared for the convolution of every branch.
        out_channels (int): Number of channels produced by every branch.

    Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.
    """

    def __init__(self, pool_scales: Tuple[int, ...], in_channels: int, out_channels: int, **kwargs) -> None:
        super().__init__(**kwargs)
        self.pool_scales = pool_scales
        self.in_channels = in_channels
        self.out_channels = out_channels

        # One branch per pooling scale: adaptive average pooling followed by a 1x1 conv module.
        self.layer_list = []
        for idx, pool_scale in enumerate(pool_scales):
            # A scalar scale is used for both spatial dimensions.
            pool_scale = pool_scale if isinstance(pool_scale, collections.abc.Iterable) else (pool_scale, pool_scale)
            self.layer_list.append(
                [
                    TFAdaptiveAvgPool2D(output_dims=pool_scale),
                    TFData2VecVisionConvModule(
                        in_channels=in_channels, out_channels=self.out_channels, kernel_size=1, name=f"{idx}.1"
                    ),
                ]
            )

    def call(self, x: tf.Tensor) -> List[tf.Tensor]:
        """
        Run every pooling branch on `x` and resize each result back to the spatial size of `x`.

        Returns:
            A list with one tensor per pooling scale.
        """
        # Spatial size (all axes between the first and the last) that every branch is resized back to.
        target_size = shape_list(x)[1:-1]

        ppm_outs = []
        for ppm in self.layer_list:
            # Every branch starts from the original input. Feeding a branch the previous branch's output would pool
            # an already-pooled map and would only fit the declared conv input when in_channels == out_channels.
            ppm_out = x
            for layer_module in ppm:
                ppm_out = layer_module(ppm_out)

            ppm_outs.append(tf.image.resize(ppm_out, size=target_size, method="bilinear"))
        return ppm_outs

    def build(self, input_shape=None):
        """Build the layers of every branch, each under its own name scope; `input_shape` is not used."""
        # Same build-once guard as the other layers in this file.
        if self.built:
            return
        self.built = True
        for layer in self.layer_list:
            for layer_module in layer:
                with tf.name_scope(layer_module.name):
                    layer_module.build(None)


class TFData2VecVisionUperHead(keras.layers.Layer):
    """
    Unified Perceptual Parsing for Scene Understanding. This head is the implementation of
    [UPerNet](https://arxiv.org/abs/1807.10221).

    Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.
    """

    def __init__(self, config: Data2VecVisionConfig, **kwargs) -> None:
        super().__init__(**kwargs)

        self.pool_scales = config.pool_scales  # e.g. (1, 2, 3, 6)
        self.in_channels = [config.hidden_size] * 4  # e.g. [768, 768, 768, 768]
        self.channels = config.hidden_size
        # 1x1 convolution that produces the per-label output.
        self.classifier = keras.layers.Conv2D(config.num_labels, kernel_size=1, name="classifier")

        # PSP branch: pyramid pooling on the last feature map, fused by a 3x3 bottleneck.
        top_channels = self.in_channels[-1]
        self.psp_modules = TFData2VecVisionPyramidPoolingModule(
            self.pool_scales, top_channels, self.channels, name="psp_modules"
        )
        self.bottleneck = TFData2VecVisionConvModule(
            top_channels + len(self.pool_scales) * self.channels,
            self.channels,
            kernel_size=3,
            padding="same",
            name="bottleneck",
        )

        # FPN branch: one lateral 1x1 conv and one 3x3 output conv per level, skipping the top level.
        lower_level_channels = self.in_channels[:-1]
        self.lateral_convs = [
            TFData2VecVisionConvModule(
                level_channels, out_channels=self.channels, kernel_size=1, name=f"lateral_convs.{level}"
            )
            for level, level_channels in enumerate(lower_level_channels)
        ]
        self.fpn_convs = [
            TFData2VecVisionConvModule(
                in_channels=self.channels,
                out_channels=self.channels,
                kernel_size=3,
                padding="same",
                name=f"fpn_convs.{level}",
            )
            for level in range(len(lower_level_channels))
        ]

        # Fuses the concatenation of all FPN levels.
        self.fpn_bottleneck = TFData2VecVisionConvModule(
            in_channels=len(self.in_channels) * self.channels,
            out_channels=self.channels,
            kernel_size=3,
            padding="same",
            name="fpn_bottleneck",
        )

    def psp_forward(self, inputs):
        """Run the PSP branch on the last element of `inputs` and fuse it with the bottleneck conv."""
        top = inputs[-1]
        # Concatenate the top feature map with every pyramid-pooling output along the last axis.
        pyramid = tf.concat([top, *self.psp_modules(top)], axis=-1)
        return self.bottleneck(pyramid)

    def call(self, encoder_hidden_states: tf.Tensor) -> tf.Tensor:
        """Compute the head output from the indexable collection of encoder feature maps."""
        # Lateral connections for the lower levels, then the PSP output as the top level.
        laterals = [conv(encoder_hidden_states[level]) for level, conv in enumerate(self.lateral_convs)]
        laterals.append(self.psp_forward(encoder_hidden_states))

        # Top-down path: resize each level to the level below it and add it in.
        num_levels = len(laterals)
        for upper in range(num_levels - 1, 0, -1):
            lower = upper - 1
            lower_size = shape_list(laterals[lower])[1:-1]
            resized_upper = tf.image.resize(laterals[upper], size=lower_size, method="bilinear")
            laterals[lower] = laterals[lower] + resized_upper

        # FPN outputs for the lower levels; the top (PSP) level is appended unchanged.
        fpn_outs = [fpn_conv(lateral) for fpn_conv, lateral in zip(self.fpn_convs, laterals)]
        fpn_outs.append(laterals[-1])

        # Bring every level to the spatial size of the first one before concatenating.
        base_size = shape_list(fpn_outs[0])[1:-1]
        for level in range(num_levels - 1, 0, -1):
            fpn_outs[level] = tf.image.resize(fpn_outs[level], size=base_size, method="bilinear")

        fused = self.fpn_bottleneck(tf.concat(fpn_outs, axis=-1))
        return self.classifier(fused)

    def build(self, input_shape=None):
        """Build all sub-layers, each under its own name scope; `input_shape` is not used."""
        if self.built:
            return
        self.built = True

        classifier = getattr(self, "classifier", None)
        if classifier is not None:
            with tf.name_scope(classifier.name):
                classifier.build([None, None, None, self.channels])

        # These sub-layers define their own build logic and take no shape.
        for attr_name in ("psp_modules", "bottleneck", "fpn_bottleneck"):
            sublayer = getattr(self, attr_name, None)
            if sublayer is not None:
                with tf.name_scope(sublayer.name):
                    sublayer.build(None)

        # Lateral convs first, then FPN convs.
        for conv in (*self.lateral_convs, *self.fpn_convs):
            with tf.name_scope(conv.name):
                conv.build(None)
class TFData2VecVisionFCNHead(keras.layers.Layer):
    """
    Fully Convolution Networks for Semantic Segmentation. This head is implemented from
    [FCNNet](https://arxiv.org/abs/1411.4038).

    Args:
        config (Data2VecVisionConfig): Configuration.
        in_index (int): Index of the encoder feature map consumed by the head. Default: 2.
        kernel_size (int): The kernel size for convs in the head. Default: 3.
        dilation (int): The dilation rate for convs in the head. Default: 1.

    Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.
    """

    def __init__(
        self,
        config: Data2VecVisionConfig,
        in_index: int = 2,
        kernel_size: int = 3,
        dilation: Union[int, Tuple[int, int]] = 1,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        self.in_channels = config.hidden_size
        self.channels = config.auxiliary_channels
        self.num_convs = config.auxiliary_num_convs
        self.concat_input = config.auxiliary_concat_input
        self.in_index = in_index

        # The first conv maps `in_channels` to `channels`; the remaining ones keep `channels`.
        convs = []
        convs.append(
            TFData2VecVisionConvModule(
                in_channels=self.in_channels,
                out_channels=self.channels,
                kernel_size=kernel_size,
                padding="same",
                dilation=dilation,
                name="convs.0",
            )
        )
        for i in range(self.num_convs - 1):
            convs.append(
                TFData2VecVisionConvModule(
                    in_channels=self.channels,
                    out_channels=self.channels,
                    kernel_size=kernel_size,
                    padding="same",
                    dilation=dilation,
                    name=f"conv_module_{i+2}",
                )
            )
        # With zero convs configured, the conv stack is replaced by an identity op.
        if self.num_convs == 0:
            self.convs = [tf.identity]
        else:
            self.convs = convs
        # Optional conv applied to the concatenation of the head input and the conv-stack output.
        if self.concat_input:
            self.conv_cat = TFData2VecVisionConvModule(
                self.in_channels + self.channels,
                out_channels=self.channels,
                kernel_size=kernel_size,
                padding="same",
                name="conv_cat",
            )

        # 1x1 convolution that produces the per-label output.
        self.classifier = keras.layers.Conv2D(config.num_labels, kernel_size=1, name="classifier")

    def call(self, encoder_hidden_states: tf.Tensor) -> tf.Tensor:
        """Compute the head output from the feature map at `self.in_index`."""
        hidden_states = encoder_hidden_states[self.in_index]
        output = hidden_states
        for layer_module in self.convs:
            output = layer_module(output)
        if self.concat_input:
            # Re-inject the head input next to the conv-stack output along the last axis.
            output = self.conv_cat(tf.concat([hidden_states, output], axis=-1))
        output = self.classifier(output)
        return output

    def build(self, input_shape=None):
        """Build the classifier and, when present, the concat conv; `input_shape` is not used."""
        # Build only once.
        if self.built:
            return
        self.built = True

        if getattr(self, "classifier", None) is not None:
            with tf.name_scope(self.classifier.name):
                self.classifier.build([None, None, None, self.channels])

        # `conv_cat` only exists when `concat_input` is enabled.
        if getattr(self, "conv_cat", None) is not None:
            with tf.name_scope(self.conv_cat.name):
                self.conv_cat.build(None)
@add_start_docstrings(
    """
    Data2VecVision Model transformer with a semantic segmentation head on top e.g. for ADE20k, CityScapes.
    """,
    DATA2VEC_VISION_START_DOCSTRING,
)
class TFData2VecVisionForSemanticSegmentation(TFData2VecVisionPreTrainedModel):
    def __init__(self, config: Data2VecVisionConfig, *inputs, **kwargs) -> None:
        super().__init__(config, *inputs, **kwargs)
        self.num_labels = config.num_labels
        self.data2vec_vision = TFData2VecVisionMainLayer(config, add_pooling_layer=False, name="data2vec_vision")

        # FPN-style rescaling ops, one per selected feature map.
        # fpn1: two stride-2 transposed convolutions with batch norm and GELU in between.
        self.fpn1 = [
            keras.layers.Conv2DTranspose(config.hidden_size, kernel_size=2, strides=2, name="fpn1.0"),
            keras.layers.BatchNormalization(name="fpn1.1", momentum=0.9, epsilon=1e-5),
            keras.layers.Activation("gelu"),
            keras.layers.Conv2DTranspose(config.hidden_size, kernel_size=2, strides=2, name="fpn1.3"),
        ]
        # fpn2: a single stride-2 transposed convolution.
        self.fpn2 = [keras.layers.Conv2DTranspose(config.hidden_size, kernel_size=2, strides=2, name="fpn2.0")]
        # fpn3: identity.
        self.fpn3 = tf.identity
        # fpn4: stride-2 max pooling.
        self.fpn4 = keras.layers.MaxPool2D(pool_size=2, strides=2)

        # Semantic segmentation head(s); the auxiliary head is optional.
        self.decode_head = TFData2VecVisionUperHead(config, name="decode_head")
        self.auxiliary_head = (
            TFData2VecVisionFCNHead(config, name="auxiliary_head") if config.use_auxiliary_head else None
        )

    def compute_loss(self, logits, auxiliary_logits, labels):
        """
        Compute the masked cross-entropy loss of the main head, plus the weighted auxiliary loss when available.

        Args:
            logits: Output of the decode head.
            auxiliary_logits: Output of the auxiliary head, or None when the model has no auxiliary head.
            labels: Ground-truth segmentation maps; positions equal to `config.semantic_loss_ignore_index` are
                excluded from the loss.

        Returns:
            A tensor of shape `(1,)` holding the loss.
        """
        # The logits are resized to the labels' spatial size before the loss is computed.
        if len(shape_list(labels)) > 3:
            label_interp_shape = shape_list(labels)[1:-1]
        else:
            label_interp_shape = shape_list(labels)[-2:]

        loss_fct = keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction="none")

        # Copied from https://www.tensorflow.org/text/tutorials/transformer#loss_and_metrics.
        # Utility to mask the index to ignore during computing the loss.
        def masked_loss(real, pred):
            mask = tf.math.logical_not(tf.math.equal(real, self.config.semantic_loss_ignore_index))
            loss_ = loss_fct(real, pred)
            mask = tf.cast(mask, dtype=loss_.dtype)
            loss_ *= mask
            reduced_masked_loss = tf.reduce_sum(loss_) / tf.reduce_sum(mask)
            return tf.reshape(reduced_masked_loss, (1,))

        upsampled_logits = tf.image.resize(logits, size=label_interp_shape, method="bilinear")
        loss = masked_loss(labels, upsampled_logits)

        # The auxiliary term is only added when an auxiliary head produced logits; referencing the
        # upsampled auxiliary logits unconditionally would fail for models without that head.
        if auxiliary_logits is not None:
            upsampled_auxiliary_logits = tf.image.resize(auxiliary_logits, size=label_interp_shape, method="bilinear")
            auxiliary_loss = masked_loss(labels, upsampled_auxiliary_logits)
            loss = loss + self.config.auxiliary_loss_weight * auxiliary_loss

        return loss

    @unpack_inputs
    @add_start_docstrings_to_model_forward(DATA2VEC_VISION_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=TFSemanticSegmenterOutput, config_class=_CONFIG_FOR_DOC)
    def call(
        self,
        pixel_values: tf.Tensor | None = None,
        head_mask: tf.Tensor | None = None,
        labels: tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, TFSemanticSegmenterOutput]:
        r"""
        labels (`tf.Tensor` of shape `(batch_size, height, width)`, *optional*):
            Ground truth semantic segmentation maps for computing the loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1`, a classification loss is computed (Cross-Entropy).

        Returns:

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, TFData2VecVisionForSemanticSegmentation
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("facebook/data2vec-vision-base")
        >>> model = TFData2VecVisionForSemanticSegmentation.from_pretrained("facebook/data2vec-vision-base")

        >>> inputs = image_processor(images=image, return_tensors="tf")
        >>> outputs = model(**inputs)
        >>> # logits are of shape (batch_size, num_labels, height, width)
        >>> logits = outputs.logits
        ```"""
        # NOTE(review): the body of this method was missing; it is restored here following the upstream TF
        # implementation as recalled - confirm against the reference before relying on it.
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        outputs = self.data2vec_vision(
            pixel_values,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=True,  # the intermediate hidden states feed the segmentation heads
            return_dict=return_dict,
        )
        encoder_hidden_states = outputs.hidden_states if return_dict else outputs[1]

        # Keep only the configured feature maps; +1 because the hidden states also include the initial embeddings.
        features = [feature for idx, feature in enumerate(encoder_hidden_states) if idx + 1 in self.config.out_indices]
        patch_resolution = self.config.image_size // self.config.patch_size

        def reshape_features(x):
            # Explicit sizes for every non-batch dimension keep them statically known.
            return tf.reshape(x, (-1, patch_resolution, patch_resolution, self.config.hidden_size))

        # Drop the first token, then fold the patch sequence back into a 2D grid.
        features = [reshape_features(x[:, 1:, :]) for x in features]

        # Apply the FPN ops: fpn1 and fpn2 are layer lists, fpn3 and fpn4 are single callables.
        ops = [self.fpn1, self.fpn2, self.fpn3, self.fpn4]
        for module in ops[0]:
            features[0] = module(features[0])
        features[1] = ops[1][0](features[1])
        for i in range(len(features[2:])):
            features[i + 2] = ops[i + 2](features[i + 2])

        logits = self.decode_head(features)
        # Channels-first logits for the dict output, matching the documented logits shape.
        transposed_logits = tf.transpose(logits, perm=[0, 3, 1, 2])

        auxiliary_logits = None
        if self.auxiliary_head is not None:
            auxiliary_logits = self.auxiliary_head(features)

        loss = None
        if labels is not None:
            if self.config.num_labels == 1:
                raise ValueError("The number of labels should be greater than one")
            else:
                loss = self.compute_loss(logits, auxiliary_logits, labels)

        if not return_dict:
            # Hidden states are only returned when the caller asked for them.
            if output_hidden_states:
                output = (logits,) + outputs[1:]
            else:
                output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TFSemanticSegmenterOutput(
            loss=loss,
            logits=transposed_logits,
            hidden_states=outputs.hidden_states if output_hidden_states else None,
            attentions=outputs.attentions,
        )

    def build(self, input_shape=None):
        # Build only once.
        if self.built:
            return
        self.built = True

        # Main layer and heads are built under their own name scopes.
        if getattr(self, "data2vec_vision", None) is not None:
            with tf.name_scope(self.data2vec_vision.name):
                self.data2vec_vision.build(None)

        if getattr(self, "decode_head", None) is not None:
            with tf.name_scope(self.decode_head.name):
                self.decode_head.build(None)

        if getattr(self, "auxiliary_head", None) is not None:
            with tf.name_scope(self.auxiliary_head.name):
                self.auxiliary_head.build(None)

        # fpn1: indices 0, 1 and 3 are built explicitly; index 2 is the activation layer.
        if getattr(self, "fpn1", None) is not None:
            with tf.name_scope(self.fpn1[0].name):
                self.fpn1[0].build([None, None, None, self.config.hidden_size])
            with tf.name_scope(self.fpn1[1].name):
                self.fpn1[1].build((None, None, None, self.config.hidden_size))
            with tf.name_scope(self.fpn1[3].name):
                self.fpn1[3].build([None, None, None, self.config.hidden_size])

        if getattr(self, "fpn2", None) is not None:
            with tf.name_scope(self.fpn2[0].name):
                self.fpn2[0].build([None, None, None, self.config.hidden_size])

`.\models\data2vec\__init__.py`

# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import TYPE_CHECKING

# 从 HuggingFace 的 utils 模块导入必要的异常和工具函数
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_tf_available, is_torch_available

# Lazy-import table: maps each submodule name to the public names it exports.
# The configuration entries below are registered before any optional-dependency check.
_import_structure = {
    "configuration_data2vec_audio": ["DATA2VEC_AUDIO_PRETRAINED_CONFIG_ARCHIVE_MAP", "Data2VecAudioConfig"],
    "configuration_data2vec_text": [
        "DATA2VEC_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP",
        "Data2VecTextConfig",
        "Data2VecTextOnnxConfig",
    ],
    "configuration_data2vec_vision": [
        "DATA2VEC_VISION_PRETRAINED_CONFIG_ARCHIVE_MAP",
        "Data2VecVisionConfig",
        "Data2VecVisionOnnxConfig",
    ],
}

# Register the PyTorch model classes in the lazy-import table only when torch is installed.
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    # torch is missing: deliberately leave the PyTorch entries out of the table.
    pass
else:
    _import_structure["modeling_data2vec_audio"] = [
        "DATA2VEC_AUDIO_PRETRAINED_MODEL_ARCHIVE_LIST",
        "Data2VecAudioForAudioFrameClassification",
        "Data2VecAudioForCTC",
        "Data2VecAudioForSequenceClassification",
        "Data2VecAudioForXVector",
        "Data2VecAudioModel",
        "Data2VecAudioPreTrainedModel",
    ]
    _import_structure["modeling_data2vec_text"] = [
        "DATA2VEC_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST",
        "Data2VecTextForCausalLM",
        "Data2VecTextForMaskedLM",
        "Data2VecTextForMultipleChoice",
        "Data2VecTextForQuestionAnswering",
        "Data2VecTextForSequenceClassification",
        "Data2VecTextForTokenClassification",
        "Data2VecTextModel",
        "Data2VecTextPreTrainedModel",
    ]
    _import_structure["modeling_data2vec_vision"] = [
        "DATA2VEC_VISION_PRETRAINED_MODEL_ARCHIVE_LIST",
        "Data2VecVisionForImageClassification",
        "Data2VecVisionForMaskedImageModeling",
        "Data2VecVisionForSemanticSegmentation",
        "Data2VecVisionModel",
        "Data2VecVisionPreTrainedModel",
    ]

# Register the TensorFlow vision model classes when TensorFlow is installed, so the
# lazy module exposes the same names that the TYPE_CHECKING branch below imports
# from `.modeling_tf_data2vec_vision`. Without this entry those classes cannot be
# resolved through the lazy module at runtime.
if is_tf_available():
    _import_structure["modeling_tf_data2vec_vision"] = [
        "TFData2VecVisionForImageClassification",
        "TFData2VecVisionForSemanticSegmentation",
        "TFData2VecVisionModel",
        "TFData2VecVisionPreTrainedModel",
    ]

# Under static type checking, import every public name eagerly so analyzers can see it;
# at runtime (the `else` branch) the module is replaced by a lazy proxy instead.
if TYPE_CHECKING:
    from .configuration_data2vec_audio import DATA2VEC_AUDIO_PRETRAINED_CONFIG_ARCHIVE_MAP, Data2VecAudioConfig

    # Text configuration archive map and config classes
    from .configuration_data2vec_text import (
        DATA2VEC_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP,
        Data2VecTextConfig,
        Data2VecTextOnnxConfig,
    )
    # Vision configuration archive map and config classes
    from .configuration_data2vec_vision import (
        DATA2VEC_VISION_PRETRAINED_CONFIG_ARCHIVE_MAP,
        Data2VecVisionConfig,
        Data2VecVisionOnnxConfig,
    )
    
    try:
        # Raise OptionalDependencyNotAvailable when torch is not installed
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        # torch is missing: skip the PyTorch model imports
        pass
    else:
        # PyTorch audio models
        from .modeling_data2vec_audio import (
            DATA2VEC_AUDIO_PRETRAINED_MODEL_ARCHIVE_LIST,
            Data2VecAudioForAudioFrameClassification,
            Data2VecAudioForCTC,
            Data2VecAudioForSequenceClassification,
            Data2VecAudioForXVector,
            Data2VecAudioModel,
            Data2VecAudioPreTrainedModel,
        )
        # PyTorch text models
        from .modeling_data2vec_text import (
            DATA2VEC_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST,
            Data2VecTextForCausalLM,
            Data2VecTextForMaskedLM,
            Data2VecTextForMultipleChoice,
            Data2VecTextForQuestionAnswering,
            Data2VecTextForSequenceClassification,
            Data2VecTextForTokenClassification,
            Data2VecTextModel,
            Data2VecTextPreTrainedModel,
        )
        # PyTorch vision models
        from .modeling_data2vec_vision import (
            DATA2VEC_VISION_PRETRAINED_MODEL_ARCHIVE_LIST,
            Data2VecVisionForImageClassification,
            Data2VecVisionForMaskedImageModeling,
            Data2VecVisionForSemanticSegmentation,
            Data2VecVisionModel,
            Data2VecVisionPreTrainedModel,
        )
    
    # TensorFlow vision models, imported only when TensorFlow is available
    if is_tf_available():
        from .modeling_tf_data2vec_vision import (
            TFData2VecVisionForImageClassification,
            TFData2VecVisionForSemanticSegmentation,
            TFData2VecVisionModel,
            TFData2VecVisionPreTrainedModel,
        )
else:
    # sys is needed to swap this module's entry in sys.modules
    import sys

    # Replace this module with a _LazyModule built from _import_structure
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)

`.\models\deberta\configuration_deberta.py`

# coding=utf-8
# Copyright 2020, Microsoft and the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" DeBERTa model configuration"""

# 导入所需模块
from collections import OrderedDict
from typing import TYPE_CHECKING, Any, Mapping, Optional, Union

# 从 Transformers 库中导入必要的配置和工具函数
from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfig
from ...utils import logging

# 如果是类型检查，导入相关类
if TYPE_CHECKING:
    from ... import FeatureExtractionMixin, PreTrainedTokenizerBase, TensorType

# Module-level logger
logger = logging.get_logger(__name__)

# Maps each pretrained DeBERTa checkpoint name to the URL of its config.json
DEBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "microsoft/deberta-base": "https://huggingface.co/microsoft/deberta-base/resolve/main/config.json",
    "microsoft/deberta-large": "https://huggingface.co/microsoft/deberta-large/resolve/main/config.json",
    "microsoft/deberta-xlarge": "https://huggingface.co/microsoft/deberta-xlarge/resolve/main/config.json",
    "microsoft/deberta-base-mnli": "https://huggingface.co/microsoft/deberta-base-mnli/resolve/main/config.json",
    "microsoft/deberta-large-mnli": "https://huggingface.co/microsoft/deberta-large-mnli/resolve/main/config.json",
    "microsoft/deberta-xlarge-mnli": "https://huggingface.co/microsoft/deberta-xlarge-mnli/resolve/main/config.json",
}

# Configuration class for DeBERTa, derived from PretrainedConfig
class DebertaConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`DebertaModel`] or a [`TFDebertaModel`]. It is
    used to instantiate a DeBERTa model according to the specified arguments, defining the model architecture.
    Instantiating a configuration with the defaults will yield a similar configuration to that of the DeBERTa
    [microsoft/deberta-base](https://huggingface.co/microsoft/deberta-base) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Example:

    ```python
    >>> from transformers import DebertaConfig, DebertaModel

    >>> # Initializing a DeBERTa microsoft/deberta-base style configuration
    >>> configuration = DebertaConfig()

    >>> # Initializing a model (with random weights) from the microsoft/deberta-base style configuration
    >>> model = DebertaModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    """

    # Identifier of this model family
    model_type = "deberta"

    def __init__(
        self,
        vocab_size=50265,                      # vocabulary size
        hidden_size=768,                       # hidden layer size
        num_hidden_layers=12,                  # number of hidden layers
        num_attention_heads=12,                # number of attention heads
        intermediate_size=3072,                # size of the intermediate (feed-forward) layer
        hidden_act="gelu",                     # hidden-layer activation (name or callable)
        hidden_dropout_prob=0.1,               # dropout probability on hidden states
        attention_probs_dropout_prob=0.1,      # dropout probability on attention probabilities
        max_position_embeddings=512,           # maximum number of position embeddings
        type_vocab_size=0,                     # token-type vocabulary size
        initializer_range=0.02,                # weight initializer range
        layer_norm_eps=1e-7,                   # epsilon used by layer normalization
        relative_attention=False,              # whether to use relative attention
        max_relative_positions=-1,             # maximum relative position
        pad_token_id=0,                        # id of the padding token
        position_biased_input=True,            # whether the input is position-biased
        pos_att_type=None,                     # position attention types (list, or "|"-separated string)
        pooler_dropout=0,                      # dropout probability of the pooler
        pooler_hidden_act="gelu",              # activation of the pooler
        **kwargs,
    ):
        """Store the DeBERTa hyper-parameters on the instance.

        Remaining keyword arguments are forwarded to `PretrainedConfig.__init__`.
        `pooler_hidden_size` may be supplied through `kwargs`; it defaults to `hidden_size`.
        """
        super().__init__(**kwargs)

        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.intermediate_size = intermediate_size
        self.hidden_act = hidden_act
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.max_position_embeddings = max_position_embeddings
        self.type_vocab_size = type_vocab_size
        self.initializer_range = initializer_range
        self.relative_attention = relative_attention
        self.max_relative_positions = max_relative_positions
        self.pad_token_id = pad_token_id
        self.position_biased_input = position_biased_input

        # Backwards compatibility: accept a "|"-separated string and normalize it
        # to a list of lower-cased, stripped entries.
        if isinstance(pos_att_type, str):
            pos_att_type = [x.strip() for x in pos_att_type.lower().split("|")]

        self.pos_att_type = pos_att_type
        self.vocab_size = vocab_size
        self.layer_norm_eps = layer_norm_eps

        # The pooler width falls back to hidden_size when not given explicitly
        self.pooler_hidden_size = kwargs.get("pooler_hidden_size", hidden_size)
        self.pooler_dropout = pooler_dropout
        self.pooler_hidden_act = pooler_hidden_act
# Copied from transformers.models.deberta_v2.configuration_deberta_v2.DebertaV2OnnxConfig
class DebertaOnnxConfig(OnnxConfig):
    """ONNX export configuration for DeBERTa."""

    @property
    def inputs(self) -> Mapping[str, Mapping[int, str]]:
        """Ordered mapping from each model input name to its dynamic axes.

        `token_type_ids` is listed only when the model config has a positive `type_vocab_size`.
        """
        axes = (
            {0: "batch", 1: "choice", 2: "sequence"}
            if self.task == "multiple-choice"
            else {0: "batch", 1: "sequence"}
        )

        input_names = ["input_ids", "attention_mask"]
        if self._config.type_vocab_size > 0:
            input_names.append("token_type_ids")

        # Every input shares the same axes mapping object
        return OrderedDict((input_name, axes) for input_name in input_names)

    @property
    def default_onnx_opset(self) -> int:
        """Default ONNX opset version used for export."""
        return 12

    def generate_dummy_inputs(
        self,
        preprocessor: Union["PreTrainedTokenizerBase", "FeatureExtractionMixin"],
        batch_size: int = -1,
        seq_length: int = -1,
        num_choices: int = -1,
        is_pair: bool = False,
        framework: Optional["TensorType"] = None,
        num_channels: int = 3,
        image_width: int = 40,
        image_height: int = 40,
        tokenizer: "PreTrainedTokenizerBase" = None,
    ) -> Mapping[str, Any]:
        """Build dummy inputs via the parent class and drop unused segment ids.

        Only `preprocessor` and `framework` are forwarded to the parent implementation.
        """
        dummy_inputs = super().generate_dummy_inputs(preprocessor=preprocessor, framework=framework)

        # Without a token-type vocabulary the model takes no `token_type_ids`
        no_segment_vocab = self._config.type_vocab_size == 0
        if no_segment_vocab and "token_type_ids" in dummy_inputs:
            del dummy_inputs["token_type_ids"]

        return dummy_inputs

`.\models\deberta\modeling_deberta.py`

# coding=utf-8
# 版权 2020 年 Microsoft 和 Hugging Face Inc. 团队。
#
# 根据 Apache 许可证 2.0 版本（"许可证"）授权;
# 除非符合许可证的规定，否则不得使用此文件。
# 您可以在以下网址获取许可证的副本：
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# 除非适用法律要求或书面同意，否则本软件按"原样"分发，
# 没有任何明示或暗示的保证或条件。
# 有关详细信息，请参阅许可证。

""" PyTorch DeBERTa 模型。"""

from collections.abc import Sequence
from typing import Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...modeling_outputs import (
    BaseModelOutput,
    MaskedLMOutput,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutput,
    TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import softmax_backward_data
from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
from .configuration_deberta import DebertaConfig

# Module-level logger and the names used when generating docstrings
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "DebertaConfig"
_CHECKPOINT_FOR_DOC = "microsoft/deberta-base"

# Docstring constants for the masked-LM code sample
_CHECKPOINT_FOR_MASKED_LM = "lsanochkin/deberta-large-feedback"
_MASKED_LM_EXPECTED_OUTPUT = "' Paris'"
_MASKED_LM_EXPECTED_LOSS = "0.54"

# Docstring constants for the question-answering code sample
_CHECKPOINT_FOR_QA = "Palak/microsoft_deberta-large_squad"
_QA_EXPECTED_OUTPUT = "' a nice puppet'"
_QA_EXPECTED_LOSS = 0.14
_QA_TARGET_START_INDEX = 12
_QA_TARGET_END_INDEX = 14

# Names of the pretrained DeBERTa checkpoints
DEBERTA_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "microsoft/deberta-base",
    "microsoft/deberta-large",
    "microsoft/deberta-xlarge",
    "microsoft/deberta-base-mnli",
    "microsoft/deberta-large-mnli",
    "microsoft/deberta-xlarge-mnli",
]


class ContextPooler(nn.Module):
    """Pool a sequence by transforming the hidden state at its first position."""

    def __init__(self, config):
        super().__init__()
        pooler_width = config.pooler_hidden_size
        # Square projection applied to the first-position hidden state
        self.dense = nn.Linear(pooler_width, pooler_width)
        # Dropout applied before the projection
        self.dropout = StableDropout(config.pooler_dropout)
        self.config = config

    def forward(self, hidden_states):
        """Return the activated projection of the hidden state at index 0 of dim 1."""
        first_token = self.dropout(hidden_states[:, 0])
        projected = self.dense(first_token)
        return ACT2FN[self.config.pooler_hidden_act](projected)

    @property
    def output_dim(self):
        """Output dimension, read from `config.hidden_size`."""
        return self.config.hidden_size


class XSoftmax(torch.autograd.Function):
    """
    Masked softmax implemented as a custom autograd function.

    `forward(input, mask, dim)` applies softmax along `dim`, ignoring the positions where
    `mask` converts to False; those positions are 0 in the output.
    """
    @staticmethod
    def forward(self, input, mask, dim):
        # `self` is the autograd context object; remember the softmax dimension for backward
        self.dim = dim
        # Inverted mask: True at the positions that must be ignored
        rmask = ~(mask.to(torch.bool))

        # Fill the ignored positions with the smallest finite value of the input dtype
        output = input.masked_fill(rmask, torch.tensor(torch.finfo(input.dtype).min))
        # Softmax along the requested dimension
        output = torch.softmax(output, self.dim)
        # Zero out the ignored positions in place
        output.masked_fill_(rmask, 0)
        # Only the output is saved; backward is computed from it
        self.save_for_backward(output)
        return output

    @staticmethod
    def backward(self, grad_output):
        # Retrieve the softmax output saved in forward
        (output,) = self.saved_tensors
        # Gradient with respect to the input, computed by the shared helper
        inputGrad = softmax_backward_data(self, grad_output, output, self.dim, output)
        # No gradients for `mask` and `dim`
        return inputGrad, None, None

    @staticmethod
    def symbolic(g, self, mask, dim):
        # ONNX symbolic: here `g` is the graph and `self` is the input value
        import torch.onnx.symbolic_helper as sym_help
        from torch.onnx.symbolic_opset9 import masked_fill, softmax

        # Cast the mask to int64, compute 1 - mask, and cast to bool to get the inverted mask
        mask_cast_value = g.op("Cast", mask, to_i=sym_help.cast_pytorch_to_onnx["Long"])
        r_mask = g.op(
            "Cast",
            g.op("Sub", g.op("Constant", value_t=torch.tensor(1, dtype=torch.int64)), mask_cast_value),
            to_i=sym_help.cast_pytorch_to_onnx["Bool"],
        )
        # Fill the ignored positions with the smallest finite value of the input dtype
        output = masked_fill(
            g, self, r_mask, g.op("Constant", value_t=torch.tensor(torch.finfo(self.type().dtype()).min))
        )
        # Softmax along `dim`
        output = softmax(g, output, dim)
        # Zero out the ignored positions in the result
        return masked_fill(g, output, r_mask, g.op("Constant", value_t=torch.tensor(0, dtype=torch.bool)))
class DropoutContext:
    """Mutable holder for the state shared between dropout calls."""

    def __init__(self):
        # Dropout probability and the cached mask (none generated yet)
        self.dropout, self.mask = 0, None
        # Multiplier applied to the probability, and whether the cached mask may be reused
        self.scale, self.reuse_mask = 1, True


def get_mask(input, local_context):
    """Resolve the dropout mask and effective probability for `input`.

    Args:
        input: tensor whose shape, dtype and device the generated mask follows.
        local_context: a `DropoutContext`, or otherwise a plain dropout probability.

    Returns:
        `(mask, dropout)`: a boolean mask that is True at the positions to drop
        (or None when no mask was generated or cached), and the effective probability.
    """
    has_context = isinstance(local_context, DropoutContext)

    if has_context:
        # The effective probability is the stored probability times the context scale
        dropout = local_context.dropout * local_context.scale
        mask = local_context.mask if local_context.reuse_mask else None
    else:
        dropout, mask = local_context, None

    if dropout > 0 and mask is None:
        # Sample keep-flags with probability 1 - dropout, then invert them into a drop mask
        keep_flags = torch.empty_like(input).bernoulli_(1 - dropout)
        mask = (1 - keep_flags).to(torch.bool)

    # Cache the mask on the context only when it has none yet
    if has_context and local_context.mask is None:
        local_context.mask = mask

    return mask, dropout


class XDropout(torch.autograd.Function):
    """Optimized dropout function to save computation and memory by using mask operation instead of multiplication."""

    @staticmethod
    def forward(ctx, input, local_ctx):
        """Zero the masked positions of `input` and rescale the rest by 1 / (1 - p)."""
        mask, dropout = get_mask(input, local_ctx)
        ctx.scale = 1.0 / (1 - dropout)
        if not dropout > 0:
            # Nothing to drop: pass the input through untouched
            return input
        # The mask is needed again to route gradients in backward
        ctx.save_for_backward(mask)
        return input.masked_fill(mask, 0) * ctx.scale

    @staticmethod
    def backward(ctx, grad_output):
        """Apply the same mask and scale to the incoming gradient."""
        if not ctx.scale > 1:
            # Forward was the identity, so the gradient is too
            return grad_output, None
        (mask,) = ctx.saved_tensors
        return grad_output.masked_fill(mask, 0) * ctx.scale, None

    @staticmethod
    def symbolic(g: torch._C.Graph, input: torch._C.Value, local_ctx: Union[float, DropoutContext]) -> torch._C.Value:
        """ONNX symbolic: emit the opset-12 dropout op in training mode."""
        from torch.onnx import symbolic_opset12

        # Accept either a DropoutContext or a plain probability
        dropout_p = local_ctx.dropout if isinstance(local_ctx, DropoutContext) else local_ctx
        # StableDropout only calls this function in training mode
        train = True
        # TODO: We should check if the opset_version being used to export is > 12 here, but there's no good way to
        # do that right now; once https://github.com/pytorch/pytorch/issues/78391 is fixed, do something like:
        # if opset_version < 12:
        #   return torch.onnx.symbolic_opset9.dropout(g, input, dropout_p, train)
        return symbolic_opset12.dropout(g, input, dropout_p, train)


class StableDropout(nn.Module):
    """
    Optimized dropout module for stabilizing the training

    Args:
        drop_prob (float): the dropout probabilities
    """

    def __init__(self, drop_prob):
        super().__init__()
        self.drop_prob = drop_prob
        # Index of the next context to hand out, and the optional list of contexts
        self.count = 0
        self.context_stack = None

    def forward(self, x):
        """
        Call the module

        Args:
            x (`torch.tensor`): The input tensor to apply dropout
        """
        # Dropout is active only in training mode with a positive probability
        if not (self.training and self.drop_prob > 0):
            return x
        return XDropout.apply(x, self.get_context())

    def clear_context(self):
        """Reset the counter and discard the context stack."""
        self.count = 0
        self.context_stack = None

    def init_context(self, reuse_mask=True, scale=1):
        """Create the stack if needed, rewind the counter, and update every stored context."""
        if self.context_stack is None:
            self.context_stack = []
        self.count = 0
        for stored in self.context_stack:
            stored.reuse_mask, stored.scale = reuse_mask, scale

    def get_context(self):
        """Return the next `DropoutContext`, or the bare probability when no stack exists."""
        if self.context_stack is None:
            return self.drop_prob

        # Grow the stack when the counter has run past its end
        if self.count >= len(self.context_stack):
            self.context_stack.append(DropoutContext())
        current = self.context_stack[self.count]
        current.dropout = self.drop_prob
        self.count += 1
        return current
# Intermediate (feed-forward expansion) sub-layer of a DeBERTa layer
class DebertaIntermediate(nn.Module):
    def __init__(self, config):
        super().__init__()
        # Projects from hidden_size to intermediate_size
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        # A string names an entry of ACT2FN; anything else is used as the activation directly
        activation = config.hidden_act
        self.intermediate_act_fn = ACT2FN[activation] if isinstance(activation, str) else activation

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Apply the linear projection followed by the activation."""
        return self.intermediate_act_fn(self.dense(hidden_states))
# Output sub-layer of a DeBERTa layer: projection, dropout, residual add, layer norm
class DebertaOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        # Projects from intermediate_size back to hidden_size
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.LayerNorm = DebertaLayerNorm(config.hidden_size, config.layer_norm_eps)
        self.dropout = StableDropout(config.hidden_dropout_prob)
        self.config = config

    def forward(self, hidden_states, input_tensor):
        """Project `hidden_states`, apply dropout, add `input_tensor`, and normalize the sum."""
        projected = self.dropout(self.dense(hidden_states))
        return self.LayerNorm(projected + input_tensor)


# One DeBERTa layer: attention followed by the intermediate and output sub-layers
class DebertaLayer(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.attention = DebertaAttention(config)
        self.intermediate = DebertaIntermediate(config)
        self.output = DebertaOutput(config)

    def forward(
        self,
        hidden_states,
        attention_mask,
        query_states=None,
        relative_pos=None,
        rel_embeddings=None,
        output_attentions=False,
    ):
        """Run the layer.

        Returns the layer output, or the tuple `(layer_output, att_matrix)` when
        `output_attentions` is truthy.
        """
        attention_output = self.attention(
            hidden_states,
            attention_mask,
            output_attentions=output_attentions,
            query_states=query_states,
            relative_pos=relative_pos,
            rel_embeddings=rel_embeddings,
        )

        # With output_attentions the attention module returns (output, attention matrix)
        att_matrix = None
        if output_attentions:
            attention_output, att_matrix = attention_output

        # The attention output is both the feed-forward input and the residual branch
        layer_output = self.output(self.intermediate(attention_output), attention_output)

        return (layer_output, att_matrix) if output_attentions else layer_output


# Stack of DebertaLayer modules with optional relative-position embeddings
class DebertaEncoder(nn.Module):
    """Modified BertEncoder with relative position bias support"""

    def __init__(self, config):
        super().__init__()
        # One DebertaLayer per configured hidden layer
        self.layer = nn.ModuleList([DebertaLayer(config) for _ in range(config.num_hidden_layers)])
        # Relative attention is off unless the config enables it
        self.relative_attention = getattr(config, "relative_attention", False)
        if self.relative_attention:
            # Fall back to max_position_embeddings when max_relative_positions is missing or < 1
            self.max_relative_positions = getattr(config, "max_relative_positions", -1)
            if self.max_relative_positions < 1:
                self.max_relative_positions = config.max_position_embeddings
            # Embedding table with 2 * max_relative_positions rows of width hidden_size
            self.rel_embeddings = nn.Embedding(self.max_relative_positions * 2, config.hidden_size)
        # Gradient checkpointing is disabled by default
        self.gradient_checkpointing = False

    def get_rel_embedding(self):
        # Weight of the relative-position embedding table, or None when relative attention is off
        rel_embeddings = self.rel_embeddings.weight if self.relative_attention else None
        return rel_embeddings

    def get_attention_mask(self, attention_mask):
        # Normalize the mask to 4 dimensions; masks that already have more than 3 dims are returned as-is
        if attention_mask.dim() <= 2:
            # Insert two singleton dims at positions 1 and 2
            # (assuming a (batch, seq) mask this yields (batch, 1, 1, seq) — TODO confirm)
            extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
            # Broadcast-multiply the row form with its column form to get a pairwise mask
            attention_mask = extended_attention_mask * extended_attention_mask.squeeze(-2).unsqueeze(-1)
        elif attention_mask.dim() == 3:
            # Insert a singleton dim at position 1
            attention_mask = attention_mask.unsqueeze(1)

        return attention_mask

    def get_rel_pos(self, hidden_states, query_states=None, relative_pos=None):
        # Build relative positions only when relative attention is on and none were supplied
        if self.relative_attention and relative_pos is None:
            # Query length comes from query_states when given, otherwise from hidden_states
            q = query_states.size(-2) if query_states is not None else hidden_states.size(-2)
            relative_pos = build_relative_position(q, hidden_states.size(-2), hidden_states.device)
        return relative_pos

    def forward(
        self,
        hidden_states,
        attention_mask,
        output_hidden_states=True,
        output_attentions=False,
        query_states=None,
        relative_pos=None,
        return_dict=True,
    ):
        # Run every layer in order. Returns a BaseModelOutput, or, when return_dict is
        # falsy, a tuple holding the non-None values among
        # (last hidden state, all hidden states, all attentions).
        attention_mask = self.get_attention_mask(attention_mask)
        relative_pos = self.get_rel_pos(hidden_states, query_states, relative_pos)

        # Accumulators; None when the corresponding output is not requested
        all_hidden_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        # next_kv is the input fed to the next layer; a Sequence input contributes its first element
        if isinstance(hidden_states, Sequence):
            next_kv = hidden_states[0]
        else:
            next_kv = hidden_states
        
        # Shared relative-position embedding weights (None when relative attention is off)
        rel_embeddings = self.get_rel_embedding()

        for i, layer_module in enumerate(self.layer):
            # Record the hidden states as they are before this layer runs
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            # Under gradient checkpointing (training only) the layer is invoked through the
            # checkpointing function with positional arguments; otherwise it is called directly
            if self.gradient_checkpointing and self.training:
                hidden_states = self._gradient_checkpointing_func(
                    layer_module.__call__,
                    next_kv,
                    attention_mask,
                    query_states,
                    relative_pos,
                    rel_embeddings,
                    output_attentions,
                )
            else:
                hidden_states = layer_module(
                    next_kv,
                    attention_mask,
                    query_states=query_states,
                    relative_pos=relative_pos,
                    rel_embeddings=rel_embeddings,
                    output_attentions=output_attentions,
                )

            # With output_attentions the layer returns (hidden states, attention matrix)
            if output_attentions:
                hidden_states, att_m = hidden_states

            # When query states are in use they track the layer output; next_kv then only
            # changes if the layer output is a Sequence. Otherwise the output feeds the next layer.
            if query_states is not None:
                query_states = hidden_states
                if isinstance(hidden_states, Sequence):
                    next_kv = hidden_states[i + 1] if i + 1 < len(self.layer) else None
            else:
                next_kv = hidden_states

            if output_attentions:
                all_attentions = all_attentions + (att_m,)

        # Append the output of the last layer
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions
        )
def build_relative_position(query_size, key_size, device):
    """
    Build the relative positions between every query position and every key position.

    Query positions are taken to be `0 .. query_size - 1` and key positions
    `0 .. key_size - 1`; the entry for query `q` and key `k` is `q - k`.

    Args:
        query_size (int): the length of the query
        key_size (int): the length of the key
        device: the device on which the index tensors are created

    Return:
        `torch.LongTensor`: a tensor of shape [1, query_size, key_size]
    """
    q_ids = torch.arange(query_size, dtype=torch.long, device=device)
    k_ids = torch.arange(key_size, dtype=torch.long, device=device)

    # [query_size, 1] - [query_size, key_size] -> [query_size, key_size]
    rel_pos_ids = q_ids[:, None] - k_ids.view(1, -1).repeat(query_size, 1)

    # Keep at most the first query_size rows; the shape stays [query_size, key_size]
    rel_pos_ids = rel_pos_ids[:query_size, :]

    # Add a leading singleton dimension: [1, query_size, key_size]
    rel_pos_ids = rel_pos_ids.unsqueeze(0)
    return rel_pos_ids

# Expand c2p_pos to [query_layer.size(0), query_layer.size(1), query_layer.size(2), relative_pos.size(-1)]
@torch.jit.script
def c2p_dynamic_expand(c2p_pos, query_layer, relative_pos):
    # The first three target sizes come from the query, the last from the relative positions
    dim0 = query_layer.size(0)
    dim1 = query_layer.size(1)
    dim2 = query_layer.size(2)
    dim3 = relative_pos.size(-1)
    return c2p_pos.expand([dim0, dim1, dim2, dim3])

# Broadcast c2p_pos to [query_layer.size(0), query_layer.size(1), key_layer.size(-2), key_layer.size(-2)].
@torch.jit.script
def p2c_dynamic_expand(c2p_pos, query_layer, key_layer):
    # The last two target dimensions are both taken from key_layer's second-to-last axis.
    q_dim0 = query_layer.size(0)
    q_dim1 = query_layer.size(1)
    key_len = key_layer.size(-2)
    target_shape = [q_dim0, q_dim1, key_len, key_len]
    return c2p_pos.expand(target_shape)

# Broadcast pos_index to [p2c_att.size(0), p2c_att.size(1), pos_index.size(-2), key_layer.size(-2)].
@torch.jit.script
def pos_dynamic_expand(pos_index, p2c_att, key_layer):
    # Spell out the four target sizes explicitly instead of concatenating a size slice with a tuple.
    att_dim0 = p2c_att.size(0)
    att_dim1 = p2c_att.size(1)
    index_rows = pos_index.size(-2)
    key_len = key_layer.size(-2)
    return pos_index.expand([att_dim0, att_dim1, index_rows, key_len])

class DisentangledSelfAttention(nn.Module):
    """
    Disentangled self-attention module

    Parameters:
        config (`DebertaConfig`):
            A model config class instance with the configuration to build a new model. The schema is similar to
            *BertConfig*, for more details, please refer [`DebertaConfig`]

    """
    def __init__(self, config):
        """
        Build the projections, biases and dropout layers used by disentangled self-attention.

        Raises:
            ValueError: if `config.hidden_size` is not a multiple of `config.num_attention_heads`.
        """
        super().__init__()
        hidden_size = config.hidden_size
        num_heads = config.num_attention_heads

        # The hidden dimension must split evenly across the attention heads.
        if hidden_size % num_heads != 0:
            raise ValueError(
                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
                f"heads ({config.num_attention_heads})"
            )

        head_dim = int(hidden_size / num_heads)
        all_head_size = num_heads * head_dim
        self.num_attention_heads = num_heads
        self.attention_head_size = head_dim
        self.all_head_size = all_head_size

        # One bias-free projection whose output is three times all_head_size wide.
        self.in_proj = nn.Linear(hidden_size, all_head_size * 3, bias=False)

        # Separate learnable biases, initialised to zero.
        self.q_bias = nn.Parameter(torch.zeros(all_head_size, dtype=torch.float))
        self.v_bias = nn.Parameter(torch.zeros(all_head_size, dtype=torch.float))

        # Fall back to an empty list when the config leaves pos_att_type unset.
        configured_types = config.pos_att_type
        self.pos_att_type = [] if configured_types is None else configured_types

        self.relative_attention = getattr(config, "relative_attention", False)
        self.talking_head = getattr(config, "talking_head", False)

        # Optional head-to-head mixing projections.
        if self.talking_head:
            self.head_logits_proj = nn.Linear(num_heads, num_heads, bias=False)
            self.head_weights_proj = nn.Linear(num_heads, num_heads, bias=False)

        if self.relative_attention:
            # A missing or non-positive setting falls back to max_position_embeddings.
            max_rel = getattr(config, "max_relative_positions", -1)
            self.max_relative_positions = config.max_position_embeddings if max_rel < 1 else max_rel
            self.pos_dropout = StableDropout(config.hidden_dropout_prob)

            # Projections for the relative-position embeddings, one per enabled attention type.
            if "c2p" in self.pos_att_type:
                self.pos_proj = nn.Linear(hidden_size, all_head_size, bias=False)
            if "p2c" in self.pos_att_type:
                self.pos_q_proj = nn.Linear(hidden_size, all_head_size)

        # Dropout applied with the attention-probability dropout rate.
        self.dropout = StableDropout(config.attention_probs_dropout_prob)

    def transpose_for_scores(self, x):
        """
        Split the last dimension of `x` into `(num_attention_heads, -1)` and swap axes 1 and 2.

        The view adds one dimension and the permutation names four axes, so `x` must be 3-D.
        """
        leading_dims = x.size()[:-1]
        per_head = x.view(leading_dims + (self.num_attention_heads, -1))
        return per_head.permute(0, 2, 1, 3)

    def forward(
        self,
        hidden_states,
        attention_mask,
        output_attentions=False,
        query_states=None,
        relative_pos=None,
        rel_embeddings=None,
    # 计算解缠注意力偏置
    def disentangled_att_bias(self, query_layer, key_layer, relative_pos, rel_embeddings, scale_factor):
        """
        Compute the relative-position attention bias from the "c2p" and/or "p2c" terms.

        Args:
            query_layer: query tensor; `size(-2)` is used as the query length.
                Presumably shaped (batch, heads, query_len, head_dim) - TODO confirm against the caller.
            key_layer: key tensor; `size(-2)` is used as the key length.
            relative_pos: relative position ids with 2, 3 or 4 dimensions, or `None` to build them here.
            rel_embeddings: relative position embedding table, sliced along its first axis.
            scale_factor: multiplier applied inside the square root that scales the "p2c" projection.

        Returns:
            The sum of the enabled bias terms, or the integer `0` when `self.pos_att_type` contains
            neither "c2p" nor "p2c".

        Raises:
            ValueError: if `relative_pos` does not have 2, 3 or 4 dimensions.
        """
        # Without precomputed relative positions, build them from the query and key lengths.
        if relative_pos is None:
            q = query_layer.size(-2)
            relative_pos = build_relative_position(q, key_layer.size(-2), query_layer.device)

        # Normalise relative_pos to 4 dimensions: a 2-D input gets two leading axes ...
        if relative_pos.dim() == 2:
            relative_pos = relative_pos.unsqueeze(0).unsqueeze(0)
        # ... a 3-D input gets one new axis at position 1 ...
        elif relative_pos.dim() == 3:
            relative_pos = relative_pos.unsqueeze(1)
        # ... and anything that is not already 4-D is rejected.
        elif relative_pos.dim() != 4:
            raise ValueError(f"Relative position ids must be of dim 2 or 3 or 4. {relative_pos.dim()}")

        # Attention span: the LARGER of the query/key lengths, capped at max_relative_positions.
        # NOTE(review): max_relative_positions is only assigned in __init__ when relative_attention
        # is enabled, so this method presumably is only called in that configuration - confirm.
        att_span = min(max(query_layer.size(-2), key_layer.size(-2)), self.max_relative_positions)
        # Use integer indices on the same device as the queries.
        relative_pos = relative_pos.long().to(query_layer.device)
        # Keep the 2 * att_span rows centred on max_relative_positions and add a leading axis.
        rel_embeddings = rel_embeddings[
            self.max_relative_positions - att_span : self.max_relative_positions + att_span, :
        ].unsqueeze(0)

        # Accumulator; stays the integer 0 if no position attention type is enabled.
        score = 0

        # Content-to-position term.
        if "c2p" in self.pos_att_type:
            # Project the relative embeddings and split them per head.
            pos_key_layer = self.pos_proj(rel_embeddings)
            pos_key_layer = self.transpose_for_scores(pos_key_layer)
            # Score every query against every relative-position key.
            c2p_att = torch.matmul(query_layer, pos_key_layer.transpose(-1, -2))
            # Shift relative positions by att_span and clamp them into [0, 2 * att_span - 1].
            c2p_pos = torch.clamp(relative_pos + att_span, 0, att_span * 2 - 1)
            # Pick, along the last axis, the score at each pair's clamped relative position.
            c2p_att = torch.gather(c2p_att, dim=-1, index=c2p_dynamic_expand(c2p_pos, query_layer, relative_pos))
            score += c2p_att

        # Position-to-content term.
        if "p2c" in self.pos_att_type:
            # Project the relative embeddings and split them per head.
            pos_query_layer = self.pos_q_proj(rel_embeddings)
            pos_query_layer = self.transpose_for_scores(pos_query_layer)
            # Scale in place by sqrt(last-dimension size * scale_factor).
            pos_query_layer /= torch.sqrt(torch.tensor(pos_query_layer.size(-1), dtype=torch.float) * scale_factor)
            # When query and key lengths differ, rebuild key-to-key relative positions.
            if query_layer.size(-2) != key_layer.size(-2):
                r_pos = build_relative_position(key_layer.size(-2), key_layer.size(-2), query_layer.device)
            else:
                r_pos = relative_pos
            # Negated relative positions, shifted and clamped into [0, 2 * att_span - 1].
            p2c_pos = torch.clamp(-r_pos + att_span, 0, att_span * 2 - 1)
            # Score every key against every relative-position query, in the key's dtype.
            p2c_att = torch.matmul(key_layer, pos_query_layer.transpose(-1, -2).to(dtype=key_layer.dtype))
            # Gather along the last axis, then swap the last two axes.
            p2c_att = torch.gather(
                p2c_att, dim=-1, index=p2c_dynamic_expand(p2c_pos, query_layer, key_layer)
            ).transpose(-1, -2)

            # With differing lengths, additionally gather along dim -2 using the first
            # column of relative_pos as the index.
            if query_layer.size(-2) != key_layer.size(-2):
                pos_index = relative_pos[:, :, :, 0].unsqueeze(-1)
                p2c_att = torch.gather(p2c_att, dim=-2, index=pos_dynamic_expand(pos_index, p2c_att, key_layer))
            score += p2c_att

        return score
    # DebertaEmbeddings 类定义，用于构建来自单词、位置和标记类型嵌入的嵌入层
    """Construct the embeddings from word, position and token_type embeddings."""
    
    # 初始化方法
    def __init__(self, config):
        """Create the word, position and token-type embedding tables plus normalisation and dropout."""
        super().__init__()

        # Optional config fields with their fallbacks.
        padding_index = getattr(config, "pad_token_id", 0)
        embed_dim = getattr(config, "embedding_size", config.hidden_size)
        self.embedding_size = embed_dim

        self.word_embeddings = nn.Embedding(config.vocab_size, embed_dim, padding_idx=padding_index)

        # The position table only exists when position-biased input is enabled.
        self.position_biased_input = getattr(config, "position_biased_input", True)
        self.position_embeddings = (
            nn.Embedding(config.max_position_embeddings, embed_dim) if self.position_biased_input else None
        )

        # Token-type table only when the config declares at least one token type.
        if config.type_vocab_size > 0:
            self.token_type_embeddings = nn.Embedding(config.type_vocab_size, embed_dim)

        # Bias-free projection when the embedding width differs from the hidden width.
        if embed_dim != config.hidden_size:
            self.embed_proj = nn.Linear(embed_dim, config.hidden_size, bias=False)

        self.LayerNorm = DebertaLayerNorm(config.hidden_size, config.layer_norm_eps)
        self.dropout = StableDropout(config.hidden_dropout_prob)
        self.config = config

        # Non-persistent buffer holding 0..max_position_embeddings-1 with a leading axis of size 1.
        default_position_ids = torch.arange(config.max_position_embeddings).expand((1, -1))
        self.register_buffer("position_ids", default_position_ids, persistent=False)
    # 定义前向传播函数，接受多个输入参数并返回嵌入向量表示
    def forward(self, input_ids=None, token_type_ids=None, position_ids=None, mask=None, inputs_embeds=None):
        """
        Combine word, position and token-type embeddings, then normalise, mask and apply dropout.

        Args:
            input_ids: token ids; looked up in `self.word_embeddings` when `inputs_embeds` is None.
            token_type_ids: token type ids; defaults to zeros shaped like the input.
            position_ids: position ids; defaults to the first `seq_length` entries of `self.position_ids`.
            mask: optional mask multiplied into the embeddings; a 4-D mask has dims 1 and 2 squeezed
                before a trailing axis is added.
            inputs_embeds: precomputed embeddings used instead of `input_ids`. This tensor is never
                modified in place.

        Returns:
            The embedding tensor after LayerNorm, optional masking and dropout.

        Raises:
            ValueError: if both `input_ids` and `inputs_embeds` are None.
        """
        if input_ids is not None:
            input_shape = input_ids.size()
        elif inputs_embeds is not None:
            # Drop the trailing embedding dimension to get the token-grid shape.
            input_shape = inputs_embeds.size()[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        seq_length = input_shape[1]

        if position_ids is None:
            position_ids = self.position_ids[:, :seq_length]

        if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)

        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)

        embeddings = inputs_embeds

        # Out-of-place additions: `+=` here would write into the caller's `inputs_embeds`
        # and fails outright on a leaf tensor that requires grad.
        if self.position_biased_input and self.position_embeddings is not None:
            embeddings = embeddings + self.position_embeddings(position_ids.long())

        if self.config.type_vocab_size > 0:
            embeddings = embeddings + self.token_type_embeddings(token_type_ids)

        # Project to the hidden width when the embedding width differs.
        if self.embedding_size != self.config.hidden_size:
            embeddings = self.embed_proj(embeddings)

        embeddings = self.LayerNorm(embeddings)

        if mask is not None:
            # Bring the mask to the embeddings' rank before multiplying.
            if mask.dim() != embeddings.dim():
                if mask.dim() == 4:
                    mask = mask.squeeze(1).squeeze(1)
                mask = mask.unsqueeze(2)
            mask = mask.to(embeddings.dtype)

            embeddings = embeddings * mask

        embeddings = self.dropout(embeddings)
        return embeddings
# DebertaPreTrainedModel 类定义，继承自 PreTrainedModel，用于处理模型权重初始化、预训练模型下载和加载的抽象类
class DebertaPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    # Configuration class paired with this model family.
    config_class = DebertaConfig
    # Prefix under which the base model is stored.
    base_model_prefix = "deberta"
    # Checkpoint keys that may be present without being expected by the model.
    _keys_to_ignore_on_load_unexpected = ["position_embeddings"]
    # Gradient checkpointing is supported.
    supports_gradient_checkpointing = True

    def _init_weights(self, module):
        """Initialize the weights."""
        # Only linear and embedding layers are handled; every other module is left untouched.
        if not isinstance(module, (nn.Linear, nn.Embedding)):
            return

        # Both layer types draw their weights from N(0, initializer_range).
        module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)

        if isinstance(module, nn.Linear):
            # Linear layers additionally get a zero bias when one exists.
            if module.bias is not None:
                module.bias.data.zero_()
        elif module.padding_idx is not None:
            # Embedding layers zero the row that belongs to the padding index.
            module.weight.data[module.padding_idx].zero_()


这段代码定义了一个抽象类 `DebertaPreTrainedModel`，用于处理权重初始化和预训练模型的下载和加载。注释详细解释了类的各个部分和方法的作用。
    Args:
        input_ids (`torch.LongTensor` of shape `({0})`):
            # 输入序列标记在词汇表中的索引。
            # 可以使用 `AutoTokenizer` 获取这些索引。参见 `PreTrainedTokenizer.encode` 和 `PreTrainedTokenizer.__call__`。
            # 详细信息请参阅 [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
            # 遮罩，用于避免对填充标记索引执行注意力操作。
            # 遮罩值为 0 或 1：
            # - 1 表示 **未遮罩** 的标记，
            # - 0 表示 **遮罩** 的标记。
            # 详细信息请参阅 [What are attention masks?](../glossary#attention-mask)
        token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            # 段标记索引，指示输入的第一部分和第二部分。
            # 索引选取在 `[0, 1]` 范围内：
            # - 0 对应 *句子 A* 的标记，
            # - 1 对应 *句子 B* 的标记。
            # 详细信息请参阅 [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            # 每个输入序列标记在位置嵌入中的位置索引。
            # 索引选取在 `[0, config.max_position_embeddings - 1]` 范围内。
            # 详细信息请参阅 [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
            # 可选项，可以直接传递嵌入表示代替 `input_ids`。
            # 如果需要更多控制如何将 *input_ids* 索引转换为相关向量，则此选项非常有用。
            # 这比模型内部的嵌入查找矩阵更为灵活。
        output_attentions (`bool`, *optional*):
            # 是否返回所有注意力层的注意力张量。
            # 更多详细信息请参阅返回张量中的 `attentions` 部分。
        output_hidden_states (`bool`, *optional*):
            # 是否返回所有层的隐藏状态。
            # 更多详细信息请参阅返回张量中的 `hidden_states` 部分。
        return_dict (`bool`, *optional*):
            # 是否返回 [`~utils.ModelOutput`] 而不是普通的元组。
"""
DeBERTa 模型的基础类，输出原始的隐藏状态，没有特定的输出头部。

Args:
    config (DebertaConfig): 包含模型配置的对象实例

Attributes:
    embeddings (DebertaEmbeddings): DeBERTa 模型的嵌入层
    encoder (DebertaEncoder): DeBERTa 模型的编码器
    z_steps (int): 用于某些特定功能的步骤计数
    config (DebertaConfig): 模型的配置对象

Raises:
    NotImplementedError: 当尝试调用未实现的修剪功能时抛出异常

Methods:
    get_input_embeddings: 返回模型的输入嵌入层
    set_input_embeddings: 设置模型的输入嵌入层
    _prune_heads: 修剪模型的头部，但此功能在 DeBERTa 模型中尚未实现
    forward: DeBERTa 模型的前向传播方法，接受多个输入参数和配置选项

"""
@add_start_docstrings(
    "The bare DeBERTa Model transformer outputting raw hidden-states without any specific head on top.",
    DEBERTA_START_DOCSTRING,
)
class DebertaModel(DebertaPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        # Embedding layer and encoder stack.
        self.embeddings = DebertaEmbeddings(config)
        self.encoder = DebertaEncoder(config)
        # Number of extra passes through the last encoder layer; 0 disables them.
        self.z_steps = 0
        self.config = config
        # Initialize weights and apply final processing.
        self.post_init()

    def get_input_embeddings(self):
        # The input embedding module is the word-embedding table.
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, new_embeddings):
        # Replace the word-embedding table.
        self.embeddings.word_embeddings = new_embeddings

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        # Head pruning is not available for this model.
        raise NotImplementedError("The prune function is not implemented in DeBERTa model.")

    @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=BaseModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutput]:
        # Unset flags fall back to the values stored in the config.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Exactly one of input_ids / inputs_embeds must be supplied.
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
            input_shape = input_ids.size()
        elif inputs_embeds is not None:
            # Drop the trailing embedding dimension.
            input_shape = inputs_embeds.size()[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        device = input_ids.device if input_ids is not None else inputs_embeds.device

        # Default to an all-ones attention mask and all-zero token types on the input's device.
        if attention_mask is None:
            attention_mask = torch.ones(input_shape, device=device)
        if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

        embedding_output = self.embeddings(
            input_ids=input_ids,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            mask=attention_mask,
            inputs_embeds=inputs_embeds,
        )

        # Hidden states are always requested from the encoder because index 1 is read below;
        # whether they are returned to the caller is decided at the end.
        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask,
            output_hidden_states=True,
            output_attentions=output_attentions,
            return_dict=return_dict,
        )
        encoded_layers = encoder_outputs[1]

        if self.z_steps > 1:
            # The encoder hands back its hidden states as a tuple, which has no `append`;
            # work on a list copy so the extra states can be collected.
            encoded_layers = list(encoded_layers)
            hidden_states = encoded_layers[-2]
            # Reuse the last encoder layer once per step.
            layers = [self.encoder.layer[-1] for _ in range(self.z_steps)]
            query_states = encoded_layers[-1]
            rel_embeddings = self.encoder.get_rel_embedding()
            attention_mask = self.encoder.get_attention_mask(attention_mask)
            rel_pos = self.encoder.get_rel_pos(embedding_output)
            # Skip the first entry, then feed each result back in as the next query states.
            for layer in layers[1:]:
                query_states = layer(
                    hidden_states,
                    attention_mask,
                    output_attentions=False,
                    query_states=query_states,
                    relative_pos=rel_pos,
                    rel_embeddings=rel_embeddings,
                )
                encoded_layers.append(query_states)

        # The final sequence output is the last collected hidden state.
        sequence_output = encoded_layers[-1]

        if not return_dict:
            # Index 1 holds the hidden states; start at 2 to leave them out when not requested.
            return (sequence_output,) + encoder_outputs[(1 if output_hidden_states else 2) :]

        return BaseModelOutput(
            last_hidden_state=sequence_output,
            hidden_states=encoder_outputs.hidden_states if output_hidden_states else None,
            attentions=encoder_outputs.attentions,
        )
# 使用装饰器为类添加文档字符串，描述此类为基于DeBERTa模型的语言建模头部模型
@add_start_docstrings("""DeBERTa Model with a `language modeling` head on top.""", DEBERTA_START_DOCSTRING)
class DebertaForMaskedLM(DebertaPreTrainedModel):
    # Parameter names declared as tied weights.
    _tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"]

    def __init__(self, config):
        super().__init__(config)

        # Backbone followed by the masked-language-modeling head.
        self.deberta = DebertaModel(config)
        self.cls = DebertaOnlyMLMHead(config)

        # Initialize weights and apply final processing.
        self.post_init()

    def get_output_embeddings(self):
        # The output embedding module is the decoder inside the MLM head.
        mlm_head = self.cls.predictions
        return mlm_head.decoder

    def set_output_embeddings(self, new_embeddings):
        # Replace the decoder inside the MLM head.
        mlm_head = self.cls.predictions
        mlm_head.decoder = new_embeddings

    @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_MASKED_LM,
        output_type=MaskedLMOutput,
        config_class=_CONFIG_FOR_DOC,
        mask="[MASK]",
        expected_output=_MASKED_LM_EXPECTED_OUTPUT,
        expected_loss=_MASKED_LM_EXPECTED_LOSS,
    )
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, MaskedLMOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        """
        # An unset return_dict falls back to the config.
        if return_dict is None:
            return_dict = self.config.use_return_dict

        # Run the backbone.
        outputs = self.deberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Score every position over the vocabulary.
        prediction_scores = self.cls(outputs[0])

        # The loss is computed only when labels are supplied.
        masked_lm_loss = None
        if labels is not None:
            flat_scores = prediction_scores.view(-1, self.config.vocab_size)
            flat_labels = labels.view(-1)
            criterion = CrossEntropyLoss()  # -100 index = padding token
            masked_lm_loss = criterion(flat_scores, flat_labels)

        if return_dict:
            return MaskedLMOutput(
                loss=masked_lm_loss,
                logits=prediction_scores,
                hidden_states=outputs.hidden_states,
                attentions=outputs.attentions,
            )

        # Tuple form: scores followed by the remaining backbone outputs, with the loss first if present.
        tuple_output = (prediction_scores,) + outputs[1:]
        if masked_lm_loss is None:
            return tuple_output
        return (masked_lm_loss,) + tuple_output
class DebertaPredictionHeadTransform(nn.Module):
    """Dense projection, activation and LayerNorm applied to hidden states before decoding."""

    def __init__(self, config):
        super().__init__()
        # Output width: `embedding_size` when configured, otherwise the hidden size.
        out_width = getattr(config, "embedding_size", config.hidden_size)
        self.embedding_size = out_width

        self.dense = nn.Linear(config.hidden_size, out_width)

        # A string activation is looked up by name; anything else is used as the callable itself.
        activation = config.hidden_act
        self.transform_act_fn = ACT2FN[activation] if isinstance(activation, str) else activation

        self.LayerNorm = nn.LayerNorm(out_width, eps=config.layer_norm_eps)

    def forward(self, hidden_states):
        # Linear projection, then the activation, then normalisation.
        projected = self.dense(hidden_states)
        activated = self.transform_act_fn(projected)
        return self.LayerNorm(activated)


class DebertaLMPredictionHead(nn.Module):
    """Transform hidden states and decode them into one score per vocabulary entry."""

    def __init__(self, config):
        super().__init__()

        # Transform applied to the hidden states before decoding.
        self.transform = DebertaPredictionHeadTransform(config)

        embed_dim = getattr(config, "embedding_size", config.hidden_size)
        vocab_size = config.vocab_size
        self.embedding_size = embed_dim

        # The decoder is created without a bias; a separate zero-initialised bias is attached below.
        self.decoder = nn.Linear(embed_dim, vocab_size, bias=False)
        self.bias = nn.Parameter(torch.zeros(vocab_size))

        # Share one parameter between `self.bias` and `self.decoder.bias` so both stay in sync.
        self.decoder.bias = self.bias

    def forward(self, hidden_states):
        # Transform first, then project to vocabulary scores.
        return self.decoder(self.transform(hidden_states))


# 从 transformers.models.bert.BertOnlyMLMHead 复制并更名为 DebertaOnlyMLMHead
class DebertaOnlyMLMHead(nn.Module):
    """Thin wrapper that exposes the LM prediction head under the `predictions` attribute."""

    def __init__(self, config):
        super().__init__()

        # Head that turns the sequence output into prediction scores.
        self.predictions = DebertaLMPredictionHead(config)

    def forward(self, sequence_output):
        # Delegate straight to the prediction head.
        return self.predictions(sequence_output)


@add_start_docstrings(
    """
    DeBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    """,
    DEBERTA_START_DOCSTRING,
)
class DebertaForSequenceClassification(DebertaPreTrainedModel):
    def __init__(self, config):
        """Build the backbone, context pooler, classifier and dropout for sequence classification."""
        super().__init__(config)

        # Number of labels, defaulting to 2 when the config does not define it.
        self.num_labels = getattr(config, "num_labels", 2)

        # Backbone and the pooler applied to its output.
        self.deberta = DebertaModel(config)
        self.pooler = ContextPooler(config)

        # The classifier maps the pooler's output width to the label count.
        self.classifier = nn.Linear(self.pooler.output_dim, self.num_labels)

        # Prefer `cls_dropout`; fall back to the hidden dropout probability when it is unset.
        cls_dropout = getattr(config, "cls_dropout", None)
        if cls_dropout is None:
            cls_dropout = self.config.hidden_dropout_prob
        self.dropout = StableDropout(cls_dropout)

        # Initialize weights and apply final processing.
        self.post_init()

    def get_input_embeddings(self):
        """Return whatever the wrapped DeBERTa backbone reports as its input embeddings."""
        backbone = self.deberta
        return backbone.get_input_embeddings()
    # 设置新的输入嵌入到DeBERTa模型中
    def set_input_embeddings(self, new_embeddings):
        """Hand the replacement input embeddings to the wrapped DeBERTa backbone."""
        backbone = self.deberta
        backbone.set_input_embeddings(new_embeddings)

    # 为模型的forward方法添加文档字符串，描述输入参数的格式和含义
    # 包括batch_size（批量大小）和sequence_length（序列长度）
    @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    # 添加代码示例的文档字符串，包括模型的checkpoint（检查点）、输出类型、配置类等信息
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=SequenceClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    # 定义模型的forward方法
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,                  # 输入的token IDs（可选）
        attention_mask: Optional[torch.Tensor] = None,            # 注意力掩码（可选）
        token_type_ids: Optional[torch.Tensor] = None,            # token类型IDs（可选）
        position_ids: Optional[torch.Tensor] = None,              # 位置IDs（可选）
        inputs_embeds: Optional[torch.Tensor] = None,             # 嵌入的输入（可选）
        labels: Optional[torch.Tensor] = None,                    # 标签（可选）
        output_attentions: Optional[bool] = None,                 # 是否输出注意力（可选）
        output_hidden_states: Optional[bool] = None,              # 是否输出隐藏状态（可选）
        return_dict: Optional[bool] = None,                       # 是否返回字典格式的输出（可选）
@add_start_docstrings(
    """
    DeBERTa Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
    Named-Entity-Recognition (NER) tasks.
    """,
    DEBERTA_START_DOCSTRING,
)
class DebertaForTokenClassification(DebertaPreTrainedModel):
    # Pipeline implemented below: DeBERTa backbone -> dropout -> linear projection onto `config.num_labels` classes.

    def __init__(self, config):
        super().__init__(config)
        label_count = config.num_labels
        self.num_labels = label_count

        # Sub-modules are created in the order: backbone, dropout, classification head.
        self.deberta = DebertaModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, label_count)

        # Hand weight initialisation and final setup over to the base-class hook.
        self.post_init()

    @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TokenClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, TokenClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        """
        # An explicit `return_dict` argument wins; otherwise fall back to the config default.
        if return_dict is None:
            return_dict = self.config.use_return_dict

        # Every input except `labels` is passed straight through to the backbone.
        encoder_outputs = self.deberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # First backbone output -> dropout -> linear head gives the logits.
        logits = self.classifier(self.dropout(encoder_outputs[0]))

        # The loss is only computed when labels are supplied.
        if labels is None:
            loss = None
        else:
            # Flatten logits to (-1, num_labels) and labels to (-1,) before cross-entropy.
            flat_logits = logits.view(-1, self.num_labels)
            flat_labels = labels.view(-1)
            loss = CrossEntropyLoss()(flat_logits, flat_labels)

        if return_dict:
            return TokenClassifierOutput(
                loss=loss,
                logits=logits,
                hidden_states=encoder_outputs.hidden_states,
                attentions=encoder_outputs.attentions,
            )

        # Tuple form: logits followed by the remaining backbone outputs, with the loss prepended when present.
        tuple_output = (logits,) + encoder_outputs[1:]
        if loss is None:
            return tuple_output
        return (loss,) + tuple_output
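# Note: the text passed to the decorator below describes a span classification head that sits on top of the
# hidden-states output and computes the `span start logits` and `span end logits`.
# DEBERTA_START_DOCSTRING is a predefined constant/string, presumably the shared part of the model docstrings.
@add_start_docstrings(
    """
    DeBERTa Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
    layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
    """,
    DEBERTA_START_DOCSTRING,
)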
class DebertaForQuestionAnswering(DebertaPreTrainedModel):
    # Extractive-QA head: a DeBERTa backbone followed by one linear layer whose output is split into
    # per-token start logits and end logits.

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.deberta = DebertaModel(config)
        # `forward` splits the last dimension of this layer's output into chunks of size 1 and unpacks
        # exactly two of them (start, end), so it only works when `config.num_labels` is 2.
        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_QA,
        output_type=QuestionAnsweringModelOutput,
        config_class=_CONFIG_FOR_DOC,
        expected_output=_QA_EXPECTED_OUTPUT,
        expected_loss=_QA_EXPECTED_LOSS,
        qa_target_start_index=_QA_TARGET_START_INDEX,
        qa_target_end_index=_QA_TARGET_END_INDEX,
    )
    def forward(
        self,
        # The next five arguments are forwarded unchanged to the DeBERTa backbone.
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        # Answer-span labels; the loss is computed only when both are given.
        start_positions: Optional[torch.Tensor] = None,
        end_positions: Optional[torch.Tensor] = None,
        # Output-control flags, also forwarded to the backbone.
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, QuestionAnsweringModelOutput]:
        r"""
        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        """
        # An explicit `return_dict` argument takes precedence over the config default.
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Run the DeBERTa backbone.
        outputs = self.deberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # The first backbone output is the sequence of hidden states fed to the QA head.
        sequence_output = outputs[0]

        # Project to logits, split the last dimension into start/end, and drop that size-1 dimension.
        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        total_loss = None
        # The loss is only computed when both span labels are provided.
        if start_positions is not None and end_positions is not None:
            # Labels with more than one dimension get their last dimension squeezed away
            # (the original note attributes the extra dimension to multi-GPU runs).
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # Positions are clamped into [0, sequence_length]. The value `sequence_length` itself is
            # used as the ignore index, so out-of-range positions contribute nothing to the loss.
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            # Average the cross-entropy of the start and end predictions.
            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        # Tuple form: (loss if present, start_logits, end_logits, *remaining backbone outputs).
        if not return_dict:
            output = (start_logits, end_logits) + outputs[1:]
            return ((total_loss,) + output) if total_loss is not None else output

        # Structured form.
        return QuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

Transformers-源码解析-三十二-

Transformers 源码解析（三十二）

.\models\data2vec\modeling_data2vec_text.py

.\models\data2vec\modeling_data2vec_vision.py

.\models\data2vec\modeling_tf_data2vec_vision.py

.\models\data2vec\__init__.py

.\models\deberta\configuration_deberta.py

.\models\deberta\modeling_deberta.py

`.\models\data2vec\modeling_data2vec_text.py`

`.\models\data2vec\modeling_data2vec_vision.py`

`.\models\data2vec\modeling_tf_data2vec_vision.py`

`.\models\data2vec\__init__.py`

`.\models\deberta\configuration_deberta.py`

`.\models\deberta\modeling_deberta.py`